Multiple requests with Scrapy - python

I am trying to scrape the following URLs, but I need to create two different requests, one for properties for sale and one for rent, as the URLs differ.
When I run the code I have, I am only able to parse the properties for sale ('propbay') and not those for rent ('rentbay'), and I am not sure what I am doing wrong with the second request.
Does anyone have any suggestions? Here is my code:
import scrapy
import re
import requests


class ProbRentBaySpider(scrapy.Spider):
    name = 'prob_rent_bay'

    start_urls = [
        'https://www.propbay.co.za/for-sale-property/residential/za/western_cape/7',
        'https://www.rentbay.co.za/to-rent-property/residential/za/western_cape/7'
    ]

    headers = {
        'authority': 'www.propbay.co.za',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
        'accept': '*/*',
        'x-requested-with': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?1',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
        'content-type': 'application/json',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'cookie': '_ga=GA1.3.1758254184.1598490742; ASP.NET_SessionId=v1muaskigrgnn40m42lsqzct; __RequestVerificationToken=AIEv13vh8ksXZeG6Tf_o-vLCscKt7sYKJjwB0kz0CfqmCe8ZpYRQQdGk2BnN095p2A6wlFf7o_lVYyxe1Jro-I5vHE01; _gid=GA1.3.900892753.1605696808',
    }

    headers2 = {
        'authority': 'www.rentbay.co.za',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
        'accept': '*/*',
        'x-requested-with': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?1',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
        'content-type': 'application/json',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'cookie': 'ASP.NET_SessionId=rexgmrrgju10aw4rirulzrmk; _ga=GA1.3.225479946.1606814269; __RequestVerificationToken=az6ZATA2H0dJfBQ6KuwDwz39XGSiSuIjc4iZwRT8BGSD2surYfA6iOmkIQk2p835G51hYqJd5FFoiSQYsvx-V3Ndx6s1; _gid=GA1.3.1154633027.1607081144; _gat_gtag_UA_7413963_2=1',
    }

    base_url_sale = [
        'https://www.propbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
        'https://www.propbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page='
    ]
    base_url_rent = [
        'https://www.rentbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
        'https://www.rentbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page='
    ]

    def parse(self, response):
        for page in range(2, 8):  # specify page range you would like to scrape data for
            for link in self.base_url_sale:
                next_page = link + str(page)
                response = requests.get(url=next_page, headers=self.headers)
                for product in response.json()['Data']['Suburbs']:
                    area_url = 'https://www.propbay.co.za' + product['FriendlyUrl']
                    yield scrapy.Request(area_url, callback=self.parse_sale)

        for page2 in range(2, 8):  # specify page range you would like to scrape data for
            for link2 in self.base_url_rent:
                next_page2 = link2 + str(page2)
                response2 = requests.get(url=next_page2, headers=self.headers2)
                for product2 in response2.json()['Data']['Suburbs']:
                    area_url_2 = 'https://www.rentbay.co.za' + product2['FriendlyUrl']
                    yield scrapy.Request(area_url_2, callback=self.parse_rent)

    def parse_sale(self, response):
        # follow links to property pages
        for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
            follow_link = 'https://www.propbay.co.za' + href
            yield response.follow(follow_link, self.parse_property)

        # follow pagination links
        for href in response.xpath('//*[@id="btnNext"]/@href'):
            yield response.follow(href, self.parse_sale)

    def parse_rent(self, response):
        # follow links to property pages
        for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
            follow_link = 'https://www.rentbay.co.za' + href
            yield response.follow(follow_link, self.parse_property)

        # follow pagination links
        for href in response.xpath('//*[@id="btnNext"]/@href').getall():
            yield response.follow(href, self.parse_rent)

    def parse_property(self, response):
        title = response.css('span.u-text-capitalize::text').get()
        bedrooms = response.xpath('//span[contains(text(), "Bedrooms")]/following-sibling::span/text()').get()
        bedrooms = bedrooms.split()[0] if bedrooms is not None else None
        ...
Edited code:
I have tried to make separate parsing functions, but now I am only getting rental properties and I am not sure how to also get the properties for sale.
def parse(self, response):
    for page in range(2, 8):  # specify page range you would like to scrape data for
        for link in self.base_url_sale:
            next_page = link + str(page)
            response = requests.get(url=next_page, headers=self.headers)
            for product in response.json()['Data']['Suburbs']:
                area_url = 'https://www.propbay.co.za' + product['FriendlyUrl']
                yield scrapy.Request(area_url, callback=self.parse_rent)

def parse_rent(self, response):
    for page2 in range(2, 8):  # specify page range you would like to scrape data for
        for link2 in self.base_url_rent:
            next_page2 = link2 + str(page2)
            response = requests.get(url=next_page2, headers=self.headers2)
            for product2 in response.json()['Data']['Suburbs']:
                item = dict()
                area_url_2 = 'https://www.rentbay.co.za' + product2['FriendlyUrl']
                yield scrapy.Request(area_url_2, callback=self.parse_all)

def parse_all(self, response):
    # follow links to property pages
    for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
        follow_link = 'https://www.propbay.co.za' + href
        yield response.follow(follow_link, self.parse_property)

    # follow pagination links
    for href in response.xpath('//*[@id="btnNext"]/@href'):
        yield response.follow(href, self.parse_all)

First things first, you need to override start_requests and specify a different callback for each URL, then split your parse() logic between those two callbacks. Alternatively, you could at least add an if check on the response URL before looping over either the propbay or the rentbay part. Right now the responses of both start URLs are treated the same way in your parse(), so the first time around the rentbay requests are probably built from the wrong (propbay) response, and the second time, when they would be correct, they are dropped by the dupefilter.
For an instant fix you can try adding dont_filter=True to the requests yielded in your parse method.
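A minimal sketch of the start_requests approach, assuming the headers, base_url_sale/base_url_rent lists and the parse_sale/parse_rent/parse_property callbacks stay as in the question (the method names parse_sale_listing and parse_rent_listing are made up for this sketch):

def start_requests(self):
    # route each start URL to its own callback instead of a shared parse()
    yield scrapy.Request(
        'https://www.propbay.co.za/for-sale-property/residential/za/western_cape/7',
        headers=self.headers, callback=self.parse_sale_listing)
    yield scrapy.Request(
        'https://www.rentbay.co.za/to-rent-property/residential/za/western_cape/7',
        headers=self.headers2, callback=self.parse_rent_listing)

def parse_sale_listing(self, response):
    # only the propbay half of the old parse() lives here
    for page in range(2, 8):
        for link in self.base_url_sale:
            data = requests.get(link + str(page), headers=self.headers).json()
            for product in data['Data']['Suburbs']:
                area_url = 'https://www.propbay.co.za' + product['FriendlyUrl']
                # dont_filter=True keeps the dupefilter from dropping repeated suburb URLs
                yield scrapy.Request(area_url, callback=self.parse_sale, dont_filter=True)

def parse_rent_listing(self, response):
    # the rentbay half, mirroring the above with headers2 and base_url_rent
    for page in range(2, 8):
        for link in self.base_url_rent:
            data = requests.get(link + str(page), headers=self.headers2).json()
            for product in data['Data']['Suburbs']:
                area_url = 'https://www.rentbay.co.za' + product['FriendlyUrl']
                yield scrapy.Request(area_url, callback=self.parse_rent, dont_filter=True)

The blocking requests.get calls are kept only to stay close to the original code; the same AJAX listing pages could equally be fetched with scrapy.Request.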

Related

scrapy_splash not rendering for list of urls

I created a spider with scrapy_splash and hard-coded 3 URLs in start_requests.
When I run it with any single URL, it works fine for each of them.
But when I put all the URLs in a list and run them one by one, it does not work and Splash does not return the complete rendered HTML in response.body.
Kindly help.
code:
import re
import time
import json
import scrapy
import w3lib
from scrapy_splash import SplashRequest


class SpeSpider(scrapy.Spider):
    name = 'spe'
    # allowed_domains = ['s']
    # start_urls = ['http://s/']

    without_wait_script = """
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        assert(splash:wait(2))
        return {
            html = splash:html(),
        }
    end
    """

    wait_script = """
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(10))
        return {
            html = splash:html(),
        }
    end
    """

    splash_headers = {
        'authority': 'www.avivainvestors.com',
        'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.avivainvestors.com/fr-fr/nos-expertises/nos-fonds/',
        'accept-language': 'en-US,en;q=0.9,lb;q=0.8',
    }

    def start_requests(self):
        url1 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/uk-listed-equity-high-alpha-fund/lu0160960752-gbp/"
        url2 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/japon-isr/fr0013340841-eur/"
        url3 = "https://www.avivainvestors.com/fr-fr/nos-expertises/fixed-income/emerging-markets-corporate-bond-fund/lu1550133976-usd/"
        urls = [url1, url2, url3]
        for url in urls:
            time.sleep(10)
            yield SplashRequest(
                url=url,
                endpoint="execute",
                callback=self.scrape_document_id,
                args={"lua_source": self.wait_script},
                splash_headers=self.splash_headers
            )

    def scrape_document_id(self, response):
        value = response.xpath('//div[@class="ec-table__cell-content ng-binding ng-scope" and text() = "Rapport annuel"]/../..//td/ec-button/@mstar-component-id').get()
        print("VALUE", value)
        v = re.search(r"\[([^]]+)\]", value).group().strip("[]")
        yield {
            "url": response.url,
            "id": v
        }
This is because you are using a yield statement, which makes the method a generator.
My guess is that you are just doing this,

x = SpeSpider()
x.start_requests()

which only creates a generator from your yield statement.
Try this,

x = SpeSpider()
list(x.start_requests())

It will run your function and produce a list, though I am not sure if this is the behaviour you want, because I don't see any code showing how you instantiate the class or what the results should look like.
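If the intent is to actually run the spider rather than call start_requests() by hand, a minimal sketch (assuming SpeSpider is importable from the module where it is defined, and that SPLASH_URL and the scrapy-splash middlewares are configured in the project settings) is to let the Scrapy engine consume the generator:

from scrapy.crawler import CrawlerProcess

# the engine iterates start_requests() itself and schedules each SplashRequest
process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(SpeSpider)
process.start()

Running scrapy crawl spe from the command line inside the project achieves the same thing.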

Trying to scrape a webpage to get all links, but all the links obtained are the webpage's URL itself

I'm trying to fetch all the links on the following webpage: https://sistemaswebb3-listados.b3.com.br/fundsPage/7. Since the page is full of JavaScript, I'm using the requests-html library to execute it.
My goal is to iterate through all 18 pages to get all the funds' names and their corresponding links (which I intend to scrape later on), but I can't get the real links. The only link I get is https://sistemaswebb3-listados.b3.com.br/fundsPage/7, the page's URL itself, common to all elements. If I use a browser to access the page, I get redirected to a different webpage which has a different URL.
Here is my code:
session = AsyncHTMLSession()

headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36',
    'Content-Type': 'text/plain; charset=UTF-8',
    'Accept': 'application/json, text/plain, */*',
    'Origin': 'https://sistemaswebb3-listados.b3.com.br',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://sistemaswebb3-listados.b3.com.br/',
    'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
}

base_site = 'https://sistemaswebb3-listados.b3.com.br/fundsPage/7'
r1 = await session.get(base_site, headers=headers, verify=False)

# Getting all the links right before the rendering, to compare with the rendered result
divs = r1.html.find('div')
links = r1.html.find('a')
urls = r1.html.absolute_links

await r1.html.arender(wait=2)

new_divs = r1.html.find('div')
new_links = r1.html.find('a')
new_urls = r1.html.absolute_links

print('Divs before the rendering: {0} \nDivs after the rendering: {1}'.format(len(divs), len(new_divs)))
print('Links before the rendering: {0} \nLinks after the rendering: {1}'.format(len(links), len(new_links)))
print('URLs before the rendering: {0} \nURLs after the rendering: {1}'.format(len(urls), len(new_urls)))

i = 1
for l in r1.html.find('a'):
    if l.text != '' and '\n' not in l.text:
        print('Link of interest {0}: {1} {2}'.format(i, l.text, l.url))
        i += 1
The output is:
Link of interest 1: AF INVEST CRI FDO. INV. IMOB – RECEBÍVEIS IMOB. https://sistemaswebb3-listados.b3.com.br/fundsPage/7
Link of interest 2: FII AFHI CRI https://sistemaswebb3-listados.b3.com.br/fundsPage/7
Link of interest 3: AF INVEST FDO INV. IMOB. - RECEBÍVEIS IMOB. https://sistemaswebb3-listados.b3.com.br/fundsPage/7
Link of interest 4: FII AFINVCR https://sistemaswebb3-listados.b3.com.br/fundsPage/7
Link of interest 5: ALIANZA FOF FUNDO DE INVESTIMENTO IMOBILIÁRIO https://sistemaswebb3-listados.b3.com.br/fundsPage/7
Any suggestions?

Why am I getting an empty list when trying to get links with a specific class in Python using bs4?

I'm trying to get some links that include a specific class, so I wrote this code:
from bs4 import BeautifulSoup
import requests


def getPages(requestedURLS):
    _buff = []
    for url in requestedURLS:
        try:
            _buff.append(requests.get(url))
        except requests.exceptions.RequestException as err:
            print(err)
    return _buff


def getProductList(pages):
    links = []
    for page in pages:
        content = BeautifulSoup(page.content, 'html.parser')
        links.extend(content.find_all("a", class_="sresult lvresult clearfix li shic"))
        print(links)


def main():
    pageLinks = [
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pppn=r1&scp=ce0",
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pgn=2&_skc=200&rt=nc",
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pgn=3&_skc=400&rt=nc"
    ]
    # Results in : <div id="Results" class="results "> <ul id="ListViewInner">
    pages = getPages(pageLinks)
    productList = getProductList(pages)


if __name__ == '__main__':
    main()
You can check the links; there are lots of links that include this class, but the output is empty, as you can see below:
C:\Users\projects\getMarketData\venv\Scripts\python.exe C:/Users/projects/getMarketData/getData.py
[]
[]
[]
Process finished with exit code 0
What is wrong?
Using appropriate headers should do the trick:
url = 'https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pppn=r1&scp=ce0'
request_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.ebay.co.uk',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
r = requests.get(url, headers=request_headers)
soup = BeautifulSoup(r.content, 'lxml')
items = soup.find_all('li', {'class': 'sresult lvresult clearfix li shic'})
Result:
print(len(items)) # 18
print(items[0]['listingid']) # 333200565336
Use the network tab of the developer tools to inspect the traffic. Check this out if you've never done this before.
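Plugged back into the question's own helpers, a sketch along the same lines (request_headers as defined above; note that the answer searches for li elements rather than a):

def getPages(requestedURLS):
    _buff = []
    for url in requestedURLS:
        try:
            # send the browser-like headers with every request
            _buff.append(requests.get(url, headers=request_headers))
        except requests.exceptions.RequestException as err:
            print(err)
    return _buff


def getProductList(pages):
    links = []
    for page in pages:
        content = BeautifulSoup(page.content, 'lxml')
        # the listing class sits on <li> result rows, so search for li instead of a
        links.extend(content.find_all('li', {'class': 'sresult lvresult clearfix li shic'}))
    return links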

How to get the cookie value to set in requests?

I am accessing the URL https://streeteasy.com/sales/all, which does not show the page unless a Cookie is set. I have no idea how this cookie value is being generated. I highly doubt the cookie value is fixed, so I guess I can't use a hard-coded Cookie value either.
Code below:
import requests
from bs4 import BeautifulSoup

headers = {
    'authority': 'streeteasy.com',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'referer': 'https://streeteasy.com/sales/all',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9,ur;q=0.8',
    'cookie': 'D_SID=103.228.157.1:Bl5GGXCWIxq4AopS1Hkr7nkveq1nlhWXlD3PMrssGpU; _se_t=0944dfa5-bfb4-4085-812e-fa54d44acc54; google_one_tap=0; D_IID=AFB68ACC-B276-36C0-8718-13AB09A55E51; D_UID=23BA0A61-D0DF-383D-88A9-8CF65634135F; D_ZID=C0263FA4-96BF-3071-8318-56839798C38D; D_ZUID=C2322D79-7BDB-3E32-8620-059B1D352789; D_HID=CE522333-8B7B-3D76-B45A-731EB750DF4D; last_search_tab=sales; se%3Asearch%3Asales%3Astate=%7C%7C%7C%7C; streeteasy_site=nyc; se_rs=123%2C1029856%2C123%2C1172313%2C2815; se%3Asearch%3Ashared%3Astate=102%7C%7C%7C%7Cfalse; anon_searcher_stage=initial; se_login_trigger=4; se%3Abig_banner%3Asearch=%7B%22123%22%3A2%7D; se%3Abig_banner%3Ashown=true; se_lsa=2019-07-08+04%3A01%3A30+-0400; _ses=BAh7DEkiD3Nlc3Npb25faWQGOgZFVEkiJWRiODVjZTA1NmYzMzZkMzZiYmU4YTk4Yjk5YmU5ZTBlBjsAVEkiEG5ld192aXNpdG9yBjsARlRJIhFsYXN0X3NlY3Rpb24GOwBGSSIKc2FsZXMGOwBUSSIQX2NzcmZfdG9rZW4GOwBGSSIxbTM5eGRPUVhLeGYrQU1jcjZIdi81ajVFWmYzQWFSQmhxZThNcG92cWxVdz0GOwBGSSIIcGlzBjsARmkUSSIOdXNlcl9kYXRhBjsARnsQOhBzYWxlc19vcmRlckkiD3ByaWNlX2Rlc2MGOwBUOhJyZW50YWxzX29yZGVySSIPcHJpY2VfZGVzYwY7AFQ6EGluX2NvbnRyYWN0RjoNaGlkZV9tYXBGOhJzaG93X2xpc3RpbmdzRjoSbW9ydGdhZ2VfdGVybWkjOhltb3J0Z2FnZV9kb3ducGF5bWVudGkZOiFtb3J0Z2FnZV9kb3ducGF5bWVudF9kb2xsYXJzaQJQwzoSbW9ydGdhZ2VfcmF0ZWYJNC4wNToTbGlzdGluZ3Nfb3JkZXJJIhBsaXN0ZWRfZGVzYwY7AFQ6EHNlYXJjaF92aWV3SSIMZGV0YWlscwY7AFRJIhBsYXN0X3NlYXJjaAY7AEZpAXs%3D--d869dc53b8165c9f9e77233e78c568f610994ba7',
}

session = requests.Session()
response = session.get('https://streeteasy.com/for-sale/downtown', headers=headers, timeout=20)

if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    links = soup.select('h3 > a')
    print(links)

Scrapy - Simulating AJAX requests with headers and request payload

https://www.kralilan.com/liste/kiralik-bina
This is the website I am trying to scrape. When you open it, the listings are generated by an AJAX request, and the same request keeps populating the page whenever you scroll down. This is how they implemented infinite scrolling...
I found out this is the request sent to the server when I scroll down, and I tried to simulate the same request with its headers and request payload. This is my spider.
class MySpider(scrapy.Spider):
    name = 'kralilanspider'
    allowed_domains = ['kralilan.com']
    start_urls = [
        'https://www.kralilan.com/liste/satilik-bina'
    ]

    def parse(self, response):
        headers = {
            'Referer': 'https://www.kralilan.com/liste/kiralik-bina',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            #'Content-Type': 'application/json; charset=utf-8',
            #'X-Requested-With': 'XMLHttpRequest',
            #'Content-Length': 246,
            #'Connection': 'keep-alive',
        }

        yield scrapy.Request(
            url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
            method='POST',
            headers=headers,
            callback=self.parse_ajax
        )

    def parse_ajax(self, response):
        yield {'data': response.text}
If I uncomment the commented-out headers, the request fails with status code 400 or 500.
I tried sending the request payload as the body in the parse method. That didn't work either.
If I try to yield response.body, I get TypeError: Object of type bytes is not JSON serializable.
What am I missing here?
The following implementation will fetch you the response you would like to grab. You missed the most important part: the data to pass as the payload of your POST request.
import json
import scrapy


class MySpider(scrapy.Spider):
    name = 'kralilanspider'
    data = {'incomestr': '["Bina","1",-1,-1,-1,-1,-1,5]', 'intextstr': '{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', 'index': 0, 'count': '10', 'opt': '1', 'type': '3'}

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
            method='POST',
            body=json.dumps(self.data),
            headers={"content-type": "application/json"}
        )

    def parse(self, response):
        items = json.loads(response.text)['d']
        yield {"data": items}
In case you want to parse data from multiple pages (a new page index is recorded as you scroll down), the following will do the trick. The pagination is controlled by the index key in your payload.
import json
import scrapy


class MySpider(scrapy.Spider):
    name = 'kralilanspider'
    data = {'incomestr': '["Bina","1",-1,-1,-1,-1,-1,5]', 'intextstr': '{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', 'index': 0, 'count': '10', 'opt': '1', 'type': '3'}
    headers = {"content-type": "application/json"}
    url = 'https://www.kralilan.com/services/ki_operation.asmx/getFilter'

    def start_requests(self):
        yield scrapy.Request(
            url=self.url,
            method='POST',
            body=json.dumps(self.data),
            headers=self.headers,
            meta={'index': 0}
        )

    def parse(self, response):
        items = json.loads(response.text)['d']
        res = scrapy.Selector(text=items)
        for item in res.css(".list-r-b-div"):
            title = item.css(".add-title strong::text").get()
            price = item.css(".item-price::text").get()
            yield {"title": title, "price": price}

        page = response.meta['index'] + 1
        self.data['index'] = page
        yield scrapy.Request(self.url, headers=self.headers, method='POST', body=json.dumps(self.data), meta={'index': page})
Why do you ignore the POST body? You need to submit it too:
def parse(self, response):
    headers = {
        'Referer': 'https://www.kralilan.com/liste/kiralik-bina',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/json; charset=utf-8',
        'X-Requested-With': 'XMLHttpRequest',
        #'Content-Length': 246,
        #'Connection': 'keep-alive',
    }

    payload = """
    { incomestr:'["Bina","2",-1,-1,-1,-1,-1,5]', intextstr:'{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', index:'0' , count:'10' , opt:'1' , type:'3'}
    """

    yield scrapy.Request(
        url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
        method='POST',
        body=payload,
        headers=headers,
        callback=self.parse_ajax
    )
