This is the website I am trying to scrape. When you open the website, the listings are generated with an ajax request. The same request keeps populating page whenever you scroll down. This is how they implemented infinite scrolling...
I found out this is the request sent to the server when I scroll down and I tried to simulate the same request with headers and request payload. This is my spider.
class MySpider(scrapy.Spider):
name = 'kralilanspider'
allowed_domains = ['']
start_urls = [
def parse(self, response):
headers = {'Referer': '',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
#'Content-Type': 'application/json; charset=utf-8',
#'X-Requested-With': 'XMLHttpRequest',
#'Content-Length': 246,
#'Connection': 'keep-alive',
yield scrapy.Request(
def parse_ajax(self, response):
yield {'data': response.text}
If I uncomment the commented headers, request fails with status code 400 or 500.
I tried to send request payload as a body in the parse method. That didn't work either.
If I try to yield response.body, I get TypeError: Object of type bytes is not JSON serializable.
What am I missing here?
The following implementation will fetch the response you are after. You missed the most important part: the data that has to be sent as the body of your POST request.
import json
import scrapy
class MySpider(scrapy.Spider):
name = 'kralilanspider'
data = {'incomestr':'["Bina","1",-1,-1,-1,-1,-1,5]', 'intextstr':'{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', 'index':0 , 'count':'10' , 'opt':'1' , 'type':'3'}
def start_requests(self):
yield scrapy.Request(
headers={"content-type": "application/json"}
def parse(self, response):
items = json.loads(response.text)['d']
yield {"data":items}
In case you want to parse data from multiple pages (a new page index is sent each time you scroll down), the following will do the trick. Pagination is controlled by the index key in your data.
import json
import scrapy
class MySpider(scrapy.Spider):
    """Spider that pages through listings served by a JSON POST endpoint.

    Every request posts ``data`` (with the wanted ``index``) as a JSON body.
    The reply is a JSON object whose ``d`` member is markup text; listings
    are extracted from that markup and the next index is requested until a
    page comes back without any listing.
    """

    name = 'kralilanspider'
    # Base request payload; only ``index`` changes from page to page.
    data = {'incomestr':'["Bina","1",-1,-1,-1,-1,-1,5]', 'intextstr':'{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', 'index':0 , 'count':'10' , 'opt':'1' , 'type':'3'}
    headers = {"content-type": "application/json"}
    # NOTE(review): the endpoint URL is blank in this copy of the snippet;
    # set it to the AJAX endpoint before running the spider.
    url = ''

    def _page_request(self, index):
        """Build the POST request for the page at *index*."""
        # Copy the payload instead of mutating the class-level dict, which
        # is shared by every instance and every in-flight request.
        payload = dict(self.data, index=index)
        return scrapy.Request(
            self.url,
            headers=self.headers,
            method='POST',
            body=json.dumps(payload),
            meta={'index': index},
        )

    def start_requests(self):
        """Start the crawl at page index 0."""
        yield self._page_request(0)

    def parse(self, response):
        """Yield one ``{"title", "price"}`` item per listing, then the next page.

        The page index of the current response is read from
        ``response.meta['index']``.
        """
        items = json.loads(response.text)['d']
        res = scrapy.Selector(text=items)
        listings = res.css(".list-r-b-div")
        for item in listings:
            title = item.css(".add-title strong::text").get()
            price = item.css(".item-price::text").get()
            yield {"title": title, "price": price}

        # Stop once a page has no listings; otherwise the spider would keep
        # requesting ever-higher indexes forever.
        if not listings:
            return
        yield self._page_request(response.meta['index'] + 1)
Why do you ignore the POST body? You need to submit it too:
def parse(self, response):
headers = {'Referer': '',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Content-Type': 'application/json; charset=utf-8',
'X-Requested-With': 'XMLHttpRequest',
#'Content-Length': 246,
#'Connection': 'keep-alive',
payload = """
{ incomestr:'["Bina","2",-1,-1,-1,-1,-1,5]', intextstr:'{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', index:'0' , count:'10' , opt:'1' , type:'3'}
yield scrapy.Request(
I am trying to scrape the following URLs, however, I need to create two different requests, one for properties for sale and one for rent, as the URL differs.
When I run the code I have I am only able to parse the properties that are for sale ('propbay') and not for rent ('rentbay'). And I am not sure what I am doing wrong with the second request.
Does anyone have any suggestions? Here is my code:
import scrapy
import re
import requests
class ProbRentBaySpider(scrapy.Spider):
name = 'prob_rent_bay'
start_urls = [
headers = {
'authority': '',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
'accept': '*/*',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?1',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
'content-type': 'application/json',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'cookie': '_ga=GA1.3.1758254184.1598490742; ASP.NET_SessionId=v1muaskigrgnn40m42lsqzct; __RequestVerificationToken=AIEv13vh8ksXZeG6Tf_o-vLCscKt7sYKJjwB0kz0CfqmCe8ZpYRQQdGk2BnN095p2A6wlFf7o_lVYyxe1Jro-I5vHE01; _gid=GA1.3.900892753.1605696808',
headers2 = {
'authority': '',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
'accept': '*/*',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?1',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
'content-type': 'application/json',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'cookie': 'ASP.NET_SessionId=rexgmrrgju10aw4rirulzrmk; _ga=GA1.3.225479946.1606814269; __RequestVerificationToken=az6ZATA2H0dJfBQ6KuwDwz39XGSiSuIjc4iZwRT8BGSD2surYfA6iOmkIQk2p835G51hYqJd5FFoiSQYsvx-V3Ndx6s1; _gid=GA1.3.1154633027.1607081144; _gat_gtag_UA_7413963_2=1',
base_url_sale = ['',
'' ]
base_url_rent = ['',
'' ]
def parse(self,response):
for page in range(2, 8):# specify page range you would like to scrape data for
for link in self.base_url_sale:
next_page = link + str(page)
response = requests.get(url=next_page, headers=self.headers)
for product in response.json()['Data']['Suburbs']:
area_url = ''+ product['FriendlyUrl']
yield scrapy.Request(area_url,callback=self.parse_sale)
for page2 in range(2, 8):# specify page range you would like to scrape data for
for link2 in self.base_url_rent:
next_page2 = link2 + str(page2)
response2 = requests.get(url=next_page2, headers=self.headers2)
for product2 in response2.json()['Data']['Suburbs']:
area_url_2 = ''+ product2['FriendlyUrl']
yield scrapy.Request(area_url_2,callback=self.parse_rent)
def parse_sale(self, response):
    """Follow property links and pagination links found on *response*.

    Yields a request handled by ``parse_property`` for every href matched by
    the ``u-text-uppercase`` anchor XPath, then a request handled by
    ``parse_sale`` itself for every href of the ``btnNext`` element.
    """
    # XPath addresses attributes with "@"; the "#class" / "#href" spelling
    # is not valid XPath.
    for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
        # response.follow resolves relative hrefs against the page URL, so
        # the href is passed through as-is.
        yield response.follow(href, self.parse_property)

    # follow pagination links
    for href in response.xpath('//*[@id="btnNext"]/@href').getall():
        yield response.follow(href, self.parse_sale)
def parse_rent(self, response):
    """Follow property links and pagination links found on *response*.

    Yields a request handled by ``parse_property`` for every href matched by
    the ``u-text-uppercase`` anchor XPath, then a request handled by
    ``parse_rent`` itself for every href of the ``btnNext`` element.
    """
    # XPath addresses attributes with "@"; the "#class" / "#href" spelling
    # is not valid XPath.
    for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
        # response.follow resolves relative hrefs against the page URL, so
        # the href is passed through as-is.
        yield response.follow(href, self.parse_property)

    # follow pagination links
    for href in response.xpath('//*[@id="btnNext"]/@href').getall():
        yield response.follow(href, self.parse_rent)
def parse_property(self, response):
title = response.css('span.u-text-capitalize::text').get()
bedrooms = response.xpath('//span[contains(text(), "Bedrooms")]/following-sibling::span/text()').get()
bedrooms = bedrooms.split()[0] if bedrooms is not None else None
Edited code:
I have tried to make separate parsing functions; however, I am only getting rental properties and am not sure how to also get the properties for sale.
def parse(self,response):
for page in range(2, 8):# specify page range you would like to scrape data for
for link in self.base_url_sale:
next_page = link + str(page)
response = requests.get(url=next_page, headers=self.headers)
for product in response.json()['Data']['Suburbs']:
area_url = ''+ product['FriendlyUrl']
yield scrapy.Request(area_url,callback=self.parse_rent)
def parse_rent(self, response):
for page2 in range(2, 8):# specify page range you would like to scrape data for
for link2 in self.base_url_rent:
next_page2 = link2 + str(page2)
response = requests.get(url=next_page2, headers=self.headers2)
for product2 in response.json()['Data']['Suburbs']:
item = dict()
area_url_2 = ''+ product2['FriendlyUrl']
yield scrapy.Request(area_url_2,callback=self.parse_all)
def parse_all(self, response):
    """Follow property links and pagination links found on *response*.

    Yields a request handled by ``parse_property`` for every href matched by
    the ``u-text-uppercase`` anchor XPath, then a request handled by
    ``parse_all`` itself for every href of the ``btnNext`` element.
    """
    # XPath addresses attributes with "@"; the "#class" / "#href" spelling
    # is not valid XPath.
    for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
        # response.follow resolves relative hrefs against the page URL, so
        # the href is passed through as-is.
        yield response.follow(href, self.parse_property)

    # follow pagination links
    for href in response.xpath('//*[@id="btnNext"]/@href').getall():
        yield response.follow(href, self.parse_all)
First things first: you need to override start_requests and specify a different callback for each URL, then split your parse() logic into those two methods. Otherwise, you can at least add an if check on the URL before looping over either the propbay or the rentbay part. For now, the responses of both URLs are treated the same way in your parse. So maybe the first time the request is incorrect because your response is for propbay, but the second time, when it is correct, it is filtered out by the dupefilter.
For an instant fix, you can try adding dont_filter=True to your requests in the parse method.
There is an url:
It returns the coordinates. To get the coordinates, it makes 3 requests (I suppose):
the URL mentioned above
requesting a session_id
getting the coordinates using the previously mentioned session_id.
I am getting session_id in the 2nd step, but it is wrong. I can't get coordinates in step 3 using it. How can I know that the problem is in session_id? When I insert the session_id taken from the browser - my code works fine and coordinates are received.
Here are the requests in browser:
Here is the correct response from browser:
And this is what I'm getting with my code:
Here is my code (it is for Scrapy framework):
import inline_requests
def get_map_data(self, response):
""" Getting map data. """
map_referer = (""
response = yield scrapy.Request(
time_str = str(int(time.time()*1000))
headers = {
'Referer': response.url,
'Accept': 'application/javascript, */*; q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
'Host': '',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-origin',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
response.meta['handle_httpstatus_all'] = True
url = ( ''
f'callback=_jqjsp&_{time_str}=' )
reqest_session_response = yield scrapy.Request(
session_id ='"SessionId":"([^"]+)', reqest_session_response.text)
session_id = if session_id else None
# session_id = '954f04e2-e52c-4dd9-9046-f3f013d3f633'
# pprn = item.get('other', {}).get('PPRN')
pprn = 'ROH9385' # hard coded for the current page
if session_id and pprn:
time_str = str(int(time.time()*1000))
url = (''
coords_response = yield scrapy.Request(
url = url,
dont_filter = True,
Could you please correct my code so that it could get coordinates?
The website creates a sessionId first, then uses that sessionId to create a layer on the server (I guess). Only then can you start requesting coordinates; otherwise it can't find the map layer under that sessionId.
import requests
# Step 1: request a session; the JSON reply carries the id under
# Session -> SessionId.
# NOTE(review): the request URLs are blank or truncated in this copy of the
# snippet; only the query-string tails remain visible.
url = ""
# verify=False turns off TLS certificate verification for the request.
res = requests.get(url, verify=False).json()
sid = res["Session"]["SessionId"]
# Step 2: call the endpoint that takes only the session id. Its response is
# overwritten below without being read.
# (presumably this sets up the map layer for the session - TODO confirm)
url = f"{sid}"
res = requests.get(url, verify=False)
# Step 3: query with the session id plus value=ROH9385 and query=QPPRN, and
# decode the JSON reply.
url = f"{sid}&value=ROH9385&query=QPPRN"
res = requests.get(url, verify=False).json()
Below is my code which basically retrieves data from the database, puts it into a variable in CSV format which I then am trying to append on to a GET request URL. However, the get request results in null as the GET Request URL has an ampersand (&) sign in it.
Question is how do I get rid of it?
This is the URL, note the ampersand (&):,BCHUSD,AT20,
import requests
import json
import time
import datetime
import csv
import pandas as pd
import psycopg2
conn_string = "host=' dbname='' user='' password=''"
conn = psycopg2.connect(conn_string)
# Query to source marketIds
postgreSQL_select_Query = "SELECT DISTINCT () FROM static WHERE TYPE!='' AND marketId!='None'"
#print("Selecting marketId from table using cursor.fetchall")
instrument_static_marketId = cursor.fetchall()
cursor.execute(postgreSQL_select_Query )
#This puts the sql result into nice CSV format
y=','.join([y[0] for y in cursor.fetchall() ])
# closing database connection.
conn.close ()
def main():
headers = {
'Connection': 'keep-alive',
'Origin': 'https://.com',
'X-IG-API-KEY': '',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'Content-Type': 'application/json; charset=UTF-8',
'Accept': 'application/json; charset=UTF-8',
'CST': '',
'Sec-Fetch-Site': 'same-site',
'Sec-Fetch-Mode': 'cors',
'Referer': 'https://',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
response = requests.get('',params=y, headers=headers)
result = response.json()
if __name__ == '__main__':
You've included part of a parameter in your URL, which is incorrect and confuses requests.
Leave that off, and pass a dictionary for params, just like you're already doing with headers:
# Pass the market IDs as a query parameter: requests builds and encodes the
# query string itself, so the URL must not already contain part of it.
params = {
    'marketIDs': y,
}
# NOTE(review): the endpoint URL is blank in this copy of the snippet.
url = ''
response = requests.get(url, params=params, headers=headers)
I am accessing a URL which does not show the page unless a cookie is set. I have no idea how this cookie value is generated. I highly doubt that the cookie value is fixed, so I guess I can't use a hard-coded cookie value either.
Code below:
import requests
from bs4 import BeautifulSoup
headers = {
'authority': '',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'referer': '',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,ur;q=0.8',
'cookie': 'D_SID=; _se_t=0944dfa5-bfb4-4085-812e-fa54d44acc54; google_one_tap=0; D_IID=AFB68ACC-B276-36C0-8718-13AB09A55E51; D_UID=23BA0A61-D0DF-383D-88A9-8CF65634135F; D_ZID=C0263FA4-96BF-3071-8318-56839798C38D; D_ZUID=C2322D79-7BDB-3E32-8620-059B1D352789; D_HID=CE522333-8B7B-3D76-B45A-731EB750DF4D; last_search_tab=sales; se%3Asearch%3Asales%3Astate=%7C%7C%7C%7C; streeteasy_site=nyc; se_rs=123%2C1029856%2C123%2C1172313%2C2815; se%3Asearch%3Ashared%3Astate=102%7C%7C%7C%7Cfalse; anon_searcher_stage=initial; se_login_trigger=4; se%3Abig_banner%3Asearch=%7B%22123%22%3A2%7D; se%3Abig_banner%3Ashown=true; se_lsa=2019-07-08+04%3A01%3A30+-0400; _ses=BAh7DEkiD3Nlc3Npb25faWQGOgZFVEkiJWRiODVjZTA1NmYzMzZkMzZiYmU4YTk4Yjk5YmU5ZTBlBjsAVEkiEG5ld192aXNpdG9yBjsARlRJIhFsYXN0X3NlY3Rpb24GOwBGSSIKc2FsZXMGOwBUSSIQX2NzcmZfdG9rZW4GOwBGSSIxbTM5eGRPUVhLeGYrQU1jcjZIdi81ajVFWmYzQWFSQmhxZThNcG92cWxVdz0GOwBGSSIIcGlzBjsARmkUSSIOdXNlcl9kYXRhBjsARnsQOhBzYWxlc19vcmRlckkiD3ByaWNlX2Rlc2MGOwBUOhJyZW50YWxzX29yZGVySSIPcHJpY2VfZGVzYwY7AFQ6EGluX2NvbnRyYWN0RjoNaGlkZV9tYXBGOhJzaG93X2xpc3RpbmdzRjoSbW9ydGdhZ2VfdGVybWkjOhltb3J0Z2FnZV9kb3ducGF5bWVudGkZOiFtb3J0Z2FnZV9kb3ducGF5bWVudF9kb2xsYXJzaQJQwzoSbW9ydGdhZ2VfcmF0ZWYJNC4wNToTbGlzdGluZ3Nfb3JkZXJJIhBsaXN0ZWRfZGVzYwY7AFQ6EHNlYXJjaF92aWV3SSIMZGV0YWlscwY7AFRJIhBsYXN0X3NlYXJjaAY7AEZpAXs%3D--d869dc53b8165c9f9e77233e78c568f610994ba7',
session = requests.Session()
response = session.get('', headers=headers, timeout=20)
if response.status_code == 200:
html = response.text
soup = BeautifulSoup(html, 'lxml')
links ='h3 > a')
I know there is a built-in middleware to handle downloads, but it only accepts a URL. In my case, however, the download link is a POST request.
When I make that POST request, a PDF file starts downloading.
Now I want to download that file from the POST request in Scrapy.
Website is
You can enter the district Aurangabad and the police station Kasma PS.
In the last column, Status, there is a link for downloading the file.
ps_x = '//*[#id="ctl00_ContentPlaceHolder1_ddlPoliceStation"]//option[.="Kasma PS"]/#value'
police_station_val = response.xpath(ps_x).extract_first()
d_x = '//*[#id="ctl00_ContentPlaceHolder1_ddlDistrict"]//option[.="Aurangabad"]/#value'
district_val = response.xpath(d_x).extract_first()
viewstate = response.xpath(self.viewstate_x).extract_first()
viewstategen = response.xpath(self.viewstategen_x).extract_first()
eventvalidator = response.xpath(self.eventvalidator_x).extract_first()
eventtarget = response.xpath(self.eventtarget_x).extract_first()
eventargs = response.xpath(self.eventargs_x).extract_first()
lastfocus = response.xpath(self.lastfocus_x).extract_first()
payload = {
'__EVENTTARGET': eventtarget,
'__EVENTARGUMENT': eventargs,
'__LASTFOCUS': lastfocus,
'__VIEWSTATE': viewstate,
'__VIEWSTATEGENERATOR': viewstategen,
'__EVENTVALIDATION': eventvalidator,
'ctl00$ContentPlaceHolder1$ddlDistrict': district_val,
'ctl00$ContentPlaceHolder1$ddlPoliceStation': police_station_val,
'ctl00$ContentPlaceHolder1$optionsRadios': 'radioPetioner',
'ctl00$ContentPlaceHolder1$txtSearchBy': '',
'ctl00$ContentPlaceHolder1$rptItem$ctl06$lnkStatus.x': '21',
'ctl00$ContentPlaceHolder1$rptItem$ctl06$lnkStatus.y': '24',
headers = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Origin': '',
'Upgrade-Insecure-Requests': '1',
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Referer': '',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-US,en;q=0.9',
# req =, data=payload, headers=headers)
# with open('pdf/ch.pdf', 'w+b') as f:
# f.write(req.content)
When you click download, the web browser sends a POST request.
So the answer mentioned by El Ruso earlier is applicable in your case:
def parse(self, response):
yield scrapy.FormRequest("",.#your post request configuration, callback=self.save_pdf)
def save_pdf(self, response):
path = response.url.split('/')[-1]'Saving PDF %s', path)
with open(path, 'wb') as f: