I am using Windows 10 with Python 3. I never get the 2nd-page data. Please check.
Thanks in advance!
scrapy shell "https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html"
In my terminal I then entered:
url = 'https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html'
form = {
'lang': 'en',
'beta': 'false',
'action': 'RESULTPAGE_AJAX#getOverview',
'content': 'resultpage',
'subContent': 'result',
'company_id': '0',
'override_id': '0',
'domain_id': '0',
'user_id': '0',
'keyword_id': '19931',
'JSONStr': '{"key":"company","length":9,"keyword_id":null,"index":6,"filter":{},"override":{"key":"company"},"query":"Hydraulic Cylinder"}'}
headers = {
'Content-Type': 'json/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
req = scrapy.FormRequest(url, method='POST', formdata=form, headers=headers)
fetch(req)
view(response)
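For reference, a quick way to check in the shell whether the AJAX endpoint itself (rather than the HTML page URL) returns further companies is a sketch like the one below; the ajax_live.php URL and the result/overview/company_id keys are the ones used by the spider further down, so treat them as assumptions here:
import json

ajax_url = 'https://www.industrystock.com/ajax/ajax_live.php'
req = scrapy.FormRequest(ajax_url, formdata=form,
                         headers={'X-Requested-With': 'XMLHttpRequest'})
fetch(req)
overview = json.loads(response.text)['result']['overview']
print(len(overview), [c['company_id'] for c in overview])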
I expect to crawl the "load more" pages and their data.
I tried to find a way to do it without rendering the page:
from scrapy import Spider
import scrapy
import json
import logging
class IndustrystockSpider(Spider):
name = "industry_stock"
allowed_domains = ['industrystock.com']
start_urls = ["https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html"]
custom_settings = {'ROBOTSTXT_OBEY': False}
ajax_url = 'https://www.industrystock.com/ajax/ajax_live.php'
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Referer': 'https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html',
'Origin': 'https://www.industrystock.com',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}
data = {
'lang': 'en',
'beta': 'false',
'action': 'RESULTPAGE_AJAX#getOverview',
'content': 'resultpage',
'subContent': 'result',
'company_id': '0',
'override_id': '0',
'domain_id': '0',
'user_id': '0',
'keyword_id': '19931',
}
@staticmethod
def construct_json_str(index):
return '{"key":"company","length":9,"keyword_id":null,"index":' + \
str(index) + \
',"filter":{},"override":{"key":"company"},"query":"Hydraulic Cylinder"}'
def parse(self, response):
index = 0
data = self.data
data['JSONStr'] = self.construct_json_str(index)
logging.info(f"data is {data}")
yield scrapy.FormRequest(self.ajax_url,
callback=self.parse_detail,
method='POST',
formdata=data,
headers=self.headers,
meta={'index': index})
def parse_detail(self, response):
company_data = json.loads(response.body)
overview = company_data['result']['overview']
if overview:
for company in overview:
company_id = company['company_id']
logging.info(f"company_id {company_id}")
previous_index = response.meta['index']
index = previous_index + 1
data = self.data
data['JSONStr'] = self.construct_json_str(index)
yield scrapy.FormRequest(self.ajax_url,
callback=self.parse_detail,
method='POST',
formdata=data,
headers=self.headers,
dont_filter=True,
meta={'index': index})
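One thing worth noting about the spider above: data = self.data binds the shared class-level dict, so every scheduled request mutates the same JSONStr. A minimal sketch of building a fresh payload per request (an assumption about a possible fix, reusing the field values shown above, not a confirmed solution):
import json

def build_form(self, index):
    # Copy the shared class-level template so each request gets its own dict,
    # and build JSONStr with json.dumps instead of manual string concatenation.
    form = dict(self.data)
    form['JSONStr'] = json.dumps({
        "key": "company",
        "length": 9,
        "keyword_id": None,
        "index": index,
        "filter": {},
        "override": {"key": "company"},
        "query": "Hydraulic Cylinder",
    })
    return form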
I need two sets of data from this website:
https://www.nasdaq.com/market-activity/stocks/aapl/institutional-holdings
These include both the "Active Positions" and "New and Sold Out Positions" tables. The code I have can only write one of the tables to a JSON file:
import requests
import pandas as pd
url = 'https://api.nasdaq.com/api/company/AAPL/institutional-holdings?limit=15&type=TOTAL&sortColumn=marketValue&sortOrder=DESC'
headers = {
'accept': 'application/json, text/plain, */*',
'origin': 'https://www.nasdaq.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
r = requests.get(url, headers=headers)
df = pd.json_normalize(r.json()['data']['newSoldOutPositions']['rows'])
df.to_json('AAPL_institutional_positions.json')
This gives the following output (JSON):
{
"positions":{
"0":"New Positions",
"1":"Sold Out Positions"
},
"holders":{
"0":"99",
"1":"90"
},
"shares":{
"0":"37,374,118",
"1":"4,637,465"
}
}
For the other table I am scraping, I use this code (all I have done is change "newSoldOutPositions" to "activePositions"):
import requests
import pandas as pd
url = 'https://api.nasdaq.com/api/company/AAPL/institutional-holdings?limit=15&type=TOTAL&sortColumn=marketValue&sortOrder=DESC'
headers = {
'accept': 'application/json, text/plain, */*',
'origin': 'https://www.nasdaq.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
r = requests.get(url, headers=headers)
df = pd.json_normalize(r.json()['data']['activePositions']['rows'])
df.to_json('AAPL_institutional_positions.json')
Which gives this output (JSON):
{
"positions":{
"0":"Increased Positions",
"1":"Decreased Positions",
"2":"Held Positions",
"3":"Total Institutional Shares"
},
"holders":{
"0":"1,780",
"1":"2,339",
"2":"283",
"3":"4,402"
},
"shares":{
"0":"239,170,203",
"1":"209,017,331",
"2":"8,965,339,255",
"3":"9,413,526,789"
}
}
So my question is: how can I combine the scraping to grab both sets of data and output them all in one JSON file?
Thanks
If you only want JSON data, there is no need to use pandas:
import requests
nasdaq_dict = {}
url = 'https://api.nasdaq.com/api/company/AAPL/institutional-holdings?limit=15&type=TOTAL&sortColumn=marketValue&sortOrder=DESC'
headers = {
'accept': 'application/json, text/plain, */*',
'origin': 'https://www.nasdaq.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
r = requests.get(url, headers=headers)
nasdaq_dict['activePositions'] = r.json()['data']['activePositions']['rows']
nasdaq_dict['newSoldOutPositions'] = r.json()['data']['newSoldOutPositions']['rows']
print(nasdaq_dict)
Result in terminal:
{'activePositions': [{'positions': 'Increased Positions', 'holders': '1,795', 'shares': '200,069,709'}, {'positions': 'Decreased Positions', 'holders': '2,314', 'shares': '228,105,026'}, {'positions': 'Held Positions', 'holders': '308', 'shares': '8,976,744,094'}, {'positions': 'Total Institutional Shares', 'holders': '4,417', 'shares': '9,404,918,829'}], 'newSoldOutPositions': [{'positions': 'New Positions', 'holders': '121', 'shares': '55,857,143'}, {'positions': 'Sold Out Positions', 'holders': '73', 'shares': '8,851,038'}]}
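If you do want the combined result written out as a single JSON file (as the question asks), a minimal follow-up using only the standard library, assuming the nasdaq_dict built above:
import json

# Write both tables into one JSON file.
with open('AAPL_institutional_positions.json', 'w') as f:
    json.dump(nasdaq_dict, f, indent=2)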
I am trying to convert my API crawler into a Scrapy API crawler, but I'm getting a 403 error in Scrapy only.
2022-01-29 11:48:14 [scrapy.core.engine] DEBUG: Crawled (403) <POST https://api2.realtor.ca/Listing.svc/PropertySearch_Post/> (referer: https://www.realtor.ca/)
What am I doing wrong on the Scrapy side? I used Insomnia to generate the working POST request, but Scrapy can't seem to handle it and I don't know why.
Working code:
import http.client
import json
conn = http.client.HTTPSConnection("api2.realtor.ca")
payload = "ZoomLevel=11&LatitudeMax=43.98268&LongitudeMax=-78.96028&LatitudeMin=43.43223&LongitudeMin=-79.79249&Sort=6-D&PropertySearchTypeId=1&TransactionTypeId=2&PropertyTypeGroupID=1&Currency=CAD&CurrentPage=1&ApplicationId=1&CultureId=1&Version=7.0&RecordsPerPage=200"
headers = {
'cookie': "gig_bootstrap_3_mrQiIl6ov44s2X3j6NGWVZ9SDDtplqV7WgdcyEpGYnYxl7ygDWPQHqQqtpSiUfko=gigya-pr_ver4; visid_incap_2271082=JLYUNoQeQ8aK7XCukUKVyLTR4GEAAAAAQUIPAAAAAAD5E9h/kC3MKXYFk8j751oC; reese84=3:kSScWfkGFsGrIfH0oClFcA==:7I6p2X8egFPYsN4vhOQSzcG+bY31pkxS/fXEEvP9k7YrM9pKJIRxaMywKm1+U1DhuZilDxW9dT97UvcEqz9H8d6b81yJvbuEOsNCc1hlGdVd3ZjT2uSjne23lT3z3KZ2lj0sxMQRKgafW63RLwfBQWtCgxWHMp9k338xzLwnTwqzOMI+qkBLPMwUolKoJP3IL6gzbljA6PY7Pyaa/R7VMRlGzcnk+M/ILnqZV8kJ+xV8sukeGZAcs4nJqJbvRr6vSE+fsAe/Jct5iAOnYTGoC+IyUdOAIWn2jEXTjwdjndvGdsKGvvkvcnkIjX1/JNTFH922HZPL3q9LzFCEgMhuYgygFRFjCqaz/HJ0zr0DXL9uTjiripD29T9lA0k0fGs+vyMz0ApV2Ni9UJsosr747CQMLROcZCQSIik2p+Taflc3cIAZgituGSiJss6o94xm:40hT11A7LacDXW+pa+D9sBQF5meDdTDqnBCeCZs/jls=; visid_incap_2269415=SnrfkV/xRM2/zMAwUwhS+8xj4GEAAAAAQkIPAAAAAACALOWhAWPmyCDb18oF6pHY12yEiWlUbIgr; nlbi_2269415=Vq3lX/59Tg3xXSVkkG5lugAAAAB+8mCpAlS1F5Timw4irGua; _gid=GA1.2.1583316853.1643415717; ASP.NET_SessionId=ov1iizcd3npobsilupzii5ep; nlbi_2271082=uYrYPyycJwjJZMEUcbDG1QAAAABYkXBfZhGO2s/HVfvOZarc; incap_ses_303_2269415=tkGkDvUztDFYLuufDHk0BPFK9WEAAAAAaFTGABendGhY7YQ6DfHGqg==; incap_ses_303_2271082=R007Dwk7akm7gfCfDHk0BPhS9WEAAAAAEzSDycWG9SwrHnLXWGiOuQ==; reese84=3:xy6Z/sx1YkpUszEJSoRL3A==:lovsunYlSfQpJkP59Bs11wY2+LlgzOMeiL3ObotvVpDwGDJVy4RKfSLLKuvVMLLcTstWzewelf4RKJ3e6v5hAVv9wkqa01hiSd1TIDBTDdyPzcKMI0xlq6r0G2P+8dMx91eZ0jKEx40QdURnU3XwLghg1BALZ+aWt4US7pC1FIftQksLhz7QlnyBw8pl2ucIJ9JIyuM3gBjNaP4hvYyc17UOnBvf37wtLhWeFb+fomUnnLqyTag5dM/vASoIg+Uo+lH9yQI2K9xGm0KveqgF2nUre9Z+UG1gwHWRBEIjygnhZZjnJOR20wxQU7gOZ8YqW+DJdczgSiqbn93I1um+VwOlf8bD6zCq99miEtaVOdVlGZesCvoe9d9JciEryAFMlYcn3RuLvycVNPQVrdCP9REneI+J1AmfXeSveEGpLhnSZs64rniGIf7iT0lRY9c1:f519Rour27xdzG5PjxP0BlHw/5uwjBdnwdY9Zd3AWpU=; nlbi_2269415_2147483646=5DLdcUXrjg0v1GhykG5lugAAAAA0AOEmZsShmR8VQ3d3LJzx; _dc_gtm_UA-12908513-11=1; _ga=GA1.2.2104488426.1642095581; _gac_UA-12908513-11=1.1643468700.Cj0KCQiA6NOPBhCPARIsAHAy2zAmFT3_yol1CanQDHoHW_z8aJ6HgaY2f7iilRt6yGvssuzmDbbh8FoaAkpxEALw_wcB; _ga_Y07J3B53QP=GS1.1.1643467388.20.1.1643468729.19; _4c_=%7B%22_4c_s_%22%3A%22bZNbj5swEIX%2FSuTnhfgKJm%2BrVKpWatWqUp8jgwdiLcHIOKHbKP%2B9Y3JZ7ap%2BAZ85%2BoxnDmcy72EgG1ZIIQtdCiGUfiKv8DaRzZkEZ9PjRDakrUBWitqsVIxmEjjNdNGYzIAxFFdbcUmeyJ%2BFVemSKc2oKi5PxA53hoXWHPt4tymqpZaSCS3Q5sZ486WPYVJrWWquH17kX5XkvRMN%2BW89zA%2FUtaBoKcUH66KgtRlv1jNpvAVksipnMsdLthPS41%2BUMkEpvo%2FB22MTd%2FFtTMYZ6tVkX7Fg4eQa2M3Oxn0icMbf1T24bh8XWVVJHgNueE6vS1UFlYWUGkuzG6yfP3Nu6mdOHfw8QWJt98EfYPXd166HVVVi0eP8yPNgg8cRYkOghRAW8z7Gcdqs1%2FM85533XQ954w9rNE0upmsFMH30IW%2FMTcM0vMvZIn8zQ3c03dKutPVdB3b1MqSgmH4C1H4Gf3JDkyw%2FhmiC8yhu%2FXGIIeG2ZjA2oX7B5CwM0Zneh60%2FHCC4xvTLiY8K%2BmBIAx9Dus7X593vly%2Bpi4zKlBRe5JgaTiuFsSOXew6FUooJLjTDQUZk6kIuTUfH6REiK2lZK6gzJoBnsq55hjNpMuxzyy2tVQv6EW38TSoqmCjlDSmuwMvlHw%3D%3D%22%7D; _gali=lnkNextResultsPage",
'authority': "api2.realtor.ca",
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'accept': "*/*",
'content-type': "application/x-www-form-urlencoded; charset=UTF-8",
'sec-ch-ua-mobile': "?1",
'user-agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Mobile Safari/537.36",
'sec-ch-ua-platform': '"Android"',
'origin': "https://www.realtor.ca",
'sec-fetch-site': "same-site",
'sec-fetch-mode': "cors",
'sec-fetch-dest': "empty",
'referer': "https://www.realtor.ca/",
'accept-language': "en-US,en;q=0.9"
}
conn.request("POST", "/Listing.svc/PropertySearch_Post", payload, headers)
res = conn.getresponse()
data = res.read()
out = json.loads(data.decode("utf-8"))
results = out.get('Results')
for r in results:
print(r.get('Id'), r.get('Property').get('Price'))
print(len(results))
Broken scrapy code with 403 error:
# -*- coding: utf-8 -*-
import scrapy
import json
from scrapy.exceptions import CloseSpider
class RealtorSpider(scrapy.Spider):
name = 'realtor'
allowed_domains = ['api2.realtor.ca']
api = "https://api2.realtor.ca/Listing.svc/PropertySearch_Post/"
payload = "ZoomLevel=11&LatitudeMax=43.98268&LongitudeMax=-78.96028&LatitudeMin=43.43223&LongitudeMin=-79.79249&Sort=6-D&PropertySearchTypeId=1&TransactionTypeId=2&PropertyTypeGroupID=1&Currency=CAD&CurrentPage=1&ApplicationId=1&CultureId=1&Version=7.0&RecordsPerPage=200"
headers = {
'cookie': "gig_bootstrap_3_mrQiIl6ov44s2X3j6NGWVZ9SDDtplqV7WgdcyEpGYnYxl7ygDWPQHqQqtpSiUfko=gigya-pr_ver4; visid_incap_2271082=JLYUNoQeQ8aK7XCukUKVyLTR4GEAAAAAQUIPAAAAAAD5E9h/kC3MKXYFk8j751oC; reese84=3:kSScWfkGFsGrIfH0oClFcA==:7I6p2X8egFPYsN4vhOQSzcG+bY31pkxS/fXEEvP9k7YrM9pKJIRxaMywKm1+U1DhuZilDxW9dT97UvcEqz9H8d6b81yJvbuEOsNCc1hlGdVd3ZjT2uSjne23lT3z3KZ2lj0sxMQRKgafW63RLwfBQWtCgxWHMp9k338xzLwnTwqzOMI+qkBLPMwUolKoJP3IL6gzbljA6PY7Pyaa/R7VMRlGzcnk+M/ILnqZV8kJ+xV8sukeGZAcs4nJqJbvRr6vSE+fsAe/Jct5iAOnYTGoC+IyUdOAIWn2jEXTjwdjndvGdsKGvvkvcnkIjX1/JNTFH922HZPL3q9LzFCEgMhuYgygFRFjCqaz/HJ0zr0DXL9uTjiripD29T9lA0k0fGs+vyMz0ApV2Ni9UJsosr747CQMLROcZCQSIik2p+Taflc3cIAZgituGSiJss6o94xm:40hT11A7LacDXW+pa+D9sBQF5meDdTDqnBCeCZs/jls=; visid_incap_2269415=SnrfkV/xRM2/zMAwUwhS+8xj4GEAAAAAQkIPAAAAAACALOWhAWPmyCDb18oF6pHY12yEiWlUbIgr; nlbi_2269415=Vq3lX/59Tg3xXSVkkG5lugAAAAB+8mCpAlS1F5Timw4irGua; _gid=GA1.2.1583316853.1643415717; ASP.NET_SessionId=ov1iizcd3npobsilupzii5ep; nlbi_2271082=uYrYPyycJwjJZMEUcbDG1QAAAABYkXBfZhGO2s/HVfvOZarc; incap_ses_303_2269415=tkGkDvUztDFYLuufDHk0BPFK9WEAAAAAaFTGABendGhY7YQ6DfHGqg==; incap_ses_303_2271082=R007Dwk7akm7gfCfDHk0BPhS9WEAAAAAEzSDycWG9SwrHnLXWGiOuQ==; reese84=3:xy6Z/sx1YkpUszEJSoRL3A==:lovsunYlSfQpJkP59Bs11wY2+LlgzOMeiL3ObotvVpDwGDJVy4RKfSLLKuvVMLLcTstWzewelf4RKJ3e6v5hAVv9wkqa01hiSd1TIDBTDdyPzcKMI0xlq6r0G2P+8dMx91eZ0jKEx40QdURnU3XwLghg1BALZ+aWt4US7pC1FIftQksLhz7QlnyBw8pl2ucIJ9JIyuM3gBjNaP4hvYyc17UOnBvf37wtLhWeFb+fomUnnLqyTag5dM/vASoIg+Uo+lH9yQI2K9xGm0KveqgF2nUre9Z+UG1gwHWRBEIjygnhZZjnJOR20wxQU7gOZ8YqW+DJdczgSiqbn93I1um+VwOlf8bD6zCq99miEtaVOdVlGZesCvoe9d9JciEryAFMlYcn3RuLvycVNPQVrdCP9REneI+J1AmfXeSveEGpLhnSZs64rniGIf7iT0lRY9c1:f519Rour27xdzG5PjxP0BlHw/5uwjBdnwdY9Zd3AWpU=; nlbi_2269415_2147483646=5DLdcUXrjg0v1GhykG5lugAAAAA0AOEmZsShmR8VQ3d3LJzx; _dc_gtm_UA-12908513-11=1; _ga=GA1.2.2104488426.1642095581; _gac_UA-12908513-11=1.1643468700.Cj0KCQiA6NOPBhCPARIsAHAy2zAmFT3_yol1CanQDHoHW_z8aJ6HgaY2f7iilRt6yGvssuzmDbbh8FoaAkpxEALw_wcB; _ga_Y07J3B53QP=GS1.1.1643467388.20.1.1643468729.19; _4c_=%7B%22_4c_s_%22%3A%22bZNbj5swEIX%2FSuTnhfgKJm%2BrVKpWatWqUp8jgwdiLcHIOKHbKP%2B9Y3JZ7ap%2BAZ85%2BoxnDmcy72EgG1ZIIQtdCiGUfiKv8DaRzZkEZ9PjRDakrUBWitqsVIxmEjjNdNGYzIAxFFdbcUmeyJ%2BFVemSKc2oKi5PxA53hoXWHPt4tymqpZaSCS3Q5sZ486WPYVJrWWquH17kX5XkvRMN%2BW89zA%2FUtaBoKcUH66KgtRlv1jNpvAVksipnMsdLthPS41%2BUMkEpvo%2FB22MTd%2FFtTMYZ6tVkX7Fg4eQa2M3Oxn0icMbf1T24bh8XWVVJHgNueE6vS1UFlYWUGkuzG6yfP3Nu6mdOHfw8QWJt98EfYPXd166HVVVi0eP8yPNgg8cRYkOghRAW8z7Gcdqs1%2FM85533XQ954w9rNE0upmsFMH30IW%2FMTcM0vMvZIn8zQ3c03dKutPVdB3b1MqSgmH4C1H4Gf3JDkyw%2FhmiC8yhu%2FXGIIeG2ZjA2oX7B5CwM0Zneh60%2FHCC4xvTLiY8K%2BmBIAx9Dus7X593vly%2Bpi4zKlBRe5JgaTiuFsSOXew6FUooJLjTDQUZk6kIuTUfH6REiK2lZK6gzJoBnsq55hjNpMuxzyy2tVQv6EW38TSoqmCjlDSmuwMvlHw%3D%3D%22%7D; _gali=lnkNextResultsPage",
'authority': "api2.realtor.ca",
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'accept': "*/*",
'content-type': "application/x-www-form-urlencoded; charset=UTF-8",
'sec-ch-ua-mobile': "?1",
'user-agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Mobile Safari/537.36",
'sec-ch-ua-platform': '"Android"',
'origin': "https://www.realtor.ca",
'sec-fetch-site': "same-site",
'sec-fetch-mode': "cors",
'sec-fetch-dest': "empty",
'referer': "https://www.realtor.ca/",
'accept-language': "en-US,en;q=0.9"
}
def start_requests(self):
yield scrapy.Request(url=self.api,
callback=self.parse,
method='POST',
headers=self.headers,
body=json.dumps(self.payload) )
def parse(self, response):
print('PARSE')
results = json.loads(response.body).get('Results')
print('results')
for r in results:
yield {
'Id': r.get('Id'),
'Price': r.get('Property').get('Price')
}
if __name__ == '__main__':
import os
from scrapy.cmdline import execute
os.chdir(os.path.dirname(os.path.realpath(__file__)))
SPIDER_NAME = RealtorSpider.name
try:
execute(
[
'scrapy',
'crawl',
SPIDER_NAME,
'-s',
'FEED_EXPORT_ENCODING=utf-8',
]
)
except SystemExit:
pass
You have an excess / in the URL. Remove it to avoid a 404 error.
You don't need json.dumps, since the payload is already a URL-encoded string.
Let Scrapy handle the cookies itself.
I suspect your headers are the problem, because after changing them I no longer get the 403 status.
See how to run Scrapy from a script.
spider.py:
import scrapy
class RealtorSpider(scrapy.Spider):
name = 'realtor'
allowed_domains = ['api2.realtor.ca']
custom_settings = {
'DOWNLOAD_DELAY': 0.5,
}
api = "https://api2.realtor.ca/Listing.svc/PropertySearch_Post"
payload = "ZoomLevel=11&LatitudeMax=43.98268&LongitudeMax=-78.96028&LatitudeMin=43.43223&LongitudeMin=-79.79249&Sort=6-D&PropertySearchTypeId=1&TransactionTypeId=2&PropertyTypeGroupID=1&Currency=CAD&CurrentPage=1&ApplicationId=1&CultureId=1&Version=7.0&RecordsPerPage=200"
headers = {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"DNT": "1",
"Host": "api2.realtor.ca",
"Origin": "https://www.realtor.ca",
"Pragma": "no-cache",
"Referer": "https://www.realtor.ca/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-site",
"Sec-GPC": "1",
"TE": "trailers",
'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Mobile Safari/537.36",
}
def start_requests(self):
yield scrapy.Request(url=self.api,
callback=self.parse,
method='POST',
headers=self.headers,
body=self.payload)
def parse(self, response):
print('PARSE')
results = response.json().get('Results')
print('results')
for r in results:
yield {
'Id': r.get('Id'),
'Price': r.get('Property').get('Price')
}
main.py:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
if __name__ == "__main__":
spider = 'realtor'
settings = get_project_settings()
process = CrawlerProcess(settings)
process.crawl(spider)
process.start()
output:
{'Id': '23983739', 'Price': '$799,000'}
{'Id': '23983759', 'Price': '$579,000'}
{'Id': '23983752', 'Price': '$750,000'}
...
...
...
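If more result pages are needed, the CurrentPage value in the payload above could presumably be varied per request; a minimal, untested sketch of that idea, reusing the spider's headers and payload:
def start_requests(self):
    # Fetch the first few result pages by changing CurrentPage in the form body.
    for page in range(1, 4):
        body = self.payload.replace('CurrentPage=1', f'CurrentPage={page}')
        yield scrapy.Request(url=self.api,
                             callback=self.parse,
                             method='POST',
                             headers=self.headers,
                             body=body,
                             dont_filter=True)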
I am trying to scrape the data from https://www.anre.ro/ro/info-consumatori/comparator-oferte-tip-de-furnizare-a-gn, which loads its data via Ajax (the request URL is https://www.anre.ro/ro/ajax/comparator/get_results_gaz).
However, I can see that the form data is sent as tip_client=casnic&modalitate_racordare=sistem_de_distributie&transee_de_consum=b1&tip_pret_unitar=cu_reglementate&id_judet=ALBA&id_siruta=1222&consum_mwh=&pret_furnizare_mwh=&componenta_fixa=&suplimentar_componenta_fixa=&termen_plata=&durata_contractului=&garantii=&frecventa_emitere_factura=&tip_pret= (when viewing the request in Chrome). How do I pass this to Scrapy, or any other module, to retrieve the desired page?
So far I have this (is this the correct format, given the form data?):
class ExSpider(scrapy.Spider):
name = 'ExSpider'
allowed_domains = ['anre.ro']
def start_requests(self):
params = {
"tip_client":"casnic",
"modalitate_racordare":"sistem_de_distributie",
"transee_de_consum":"b1",
"tip_pret_unitar":"cu_reglementate",
"id_judet":"ALBA",
"id_siruta":"1222",
"consum_mwh":"",
"pret_furnizare_mwh":"",
"componenta_fixa":"",
"suplimentar_componenta_fixa":"",
"termen_plata":"",
"durata_contractului":"",
"garantii":"",
"frecventa_emitere_factura":"",
"tip_pret":""
}
r = scrapy.FormRequest('https://www.anre.ro/ro/ajax/comparator/get_results_gaz', method = "POST",formdata=params)
print(r)
The following should produce the required response from that page you wish to grab data from.
class ExSpider(scrapy.Spider):
name = "exspider"
url = 'https://www.anre.ro/ro/ajax/comparator/get_results_gaz'
payload = {
'tip_client': 'casnic',
'modalitate_racordare': 'sistem_de_distributie',
'transee_de_consum': 'b2',
'tip_pret_unitar': 'cu_reglementate',
'id_judet': 'ALBA',
'id_siruta': '1222',
'consum_mwh': '',
'pret_furnizare_mwh': '',
'componenta_fixa': '',
'suplimentar_componenta_fixa': '',
'termen_plata': '',
'durata_contractului': '',
'garantii': '',
'frecventa_emitere_factura': '',
'tip_pret': ''
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'Referer': 'https://www.anre.ro/ro/info-consumatori/comparator-oferte-tip-de-furnizare-a-gn'
}
def start_requests(self):
yield scrapy.FormRequest(
self.url,
formdata=self.payload,
headers=self.headers,
callback=self.parse
)
def parse(self, response):
print(response.text)
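For reference, formdata is URL-encoded by Scrapy into exactly the kind of tip_client=casnic&modalitate_racordare=... body that the question shows in Chrome; a quick way to confirm that with the standard library (a sketch using only a few of the fields above):
from urllib.parse import urlencode

payload = {
    'tip_client': 'casnic',
    'modalitate_racordare': 'sistem_de_distributie',
    'transee_de_consum': 'b1',
}
print(urlencode(payload))
# tip_client=casnic&modalitate_racordare=sistem_de_distributie&transee_de_consum=b1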
My request:
# python 3.7.3
import requests
from requests import Session
session = Session()
session.head('https://www.basspro.com/shop/en/blazer-brass-handgun-ammo')
cookies = requests.utils.cookiejar_from_dict(requests.utils.dict_from_cookiejar(session.cookies))
response = session.post(
url='https://www.basspro.com/shop/BPSGetInventoryStatusByIDView',
data={
'productId': '3074457345616736172',
'itemId': '3074457345616736949',
'isGunFlag': 'false',
},
cookies=cookies,
headers={
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'content-length': '72',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.basspro.com',
'referer': 'https://www.basspro.com/shop/en/blazer-brass-handgun-ammo',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.92 Safari/537.36 Vivaldi/2.9.1705.38',
'x-requested-with': 'XMLHttpRequest',
},
)
print(response.text)
Output:
<input type="hidden" class="relativeToAbsolute" value="true" />
/*
{
"onlineInventory": {
"status": "Status Not Available",
"image": "widget_product_info/outofstock_icon.svg",
"altText": "Status Not Available",
"isDropShip": false,
"availableDate":""
},
"inStoreInventory": {
"stores": [],
"checkStoreText": "Check Store Availability",
"isInStoreInventory": true,
"isPickupInventory": false
}
}
*/
My output when inspecting and running the same AJAX request via browser:
/*
{
"onlineInventory": {
"status": "Backordered",
"image": "widget_product_info/backordered_icon.svg",
"altText": "Backordered",
"isDropShip": false,
"quantity": 0,
"availableDate":"May 1-8"
},
"inStoreInventory": {
"stores": [{
id: '715839555',
name: '83',
gunRestricted: 'false',
dsName: 'TX - Round Rock',
status: 'Unavailable',
statusText: 'Out of Stock',
image: 'widget_product_info/outofstock_icon.svg',
altText: 'Out of Stock',
availableDate: '',
availableQuantity: '',
availableQuantityDisplay: 'false',
cityState: 'Round Rock, TX',
ISPavailableDate: '',
ISPavailableQuantity: '',
pickupTime: 'by 2:00pm',
offerISPOnBPS: 'Yes',
offerISPOnCAB: 'No'}],
"checkStoreText": "Change Store",
"isInStoreInventory": true,
"isPickupInventory": true
}
}
*/
I tried assigning cookies this way as well:
url = "https://www.basspro.com/shop/en/blazer-brass-handgun-ammo"
r = requests.get(url)
cookies = r.cookies
# fails to pass the right cookie
If I instead copy the cookie verbatim from an inspected GET request at https://www.basspro.com/shop/en/blazer-brass-handgun-ammo and put that into the POST headers, it works. How do I get cookies to work properly programmatically?
EDIT:
Here's my attempt at just using Session() for cookies:
# python 3.7.3
import requests
from requests import Session
session = Session()
session.get("https://www.basspro.com/shop/en/blazer-brass-handgun-ammo")
# session.head('https://www.basspro.com/shop/en/blazer-brass-handgun-ammo')
response = session.post(
url='https://www.basspro.com/shop/BPSGetInventoryStatusByIDView',
data={
'productId': '3074457345616736172',
'itemId': '3074457345616736949',
'isGunFlag': 'false',
},
headers={
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'content-length': '72',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.basspro.com',
'referer': 'https://www.basspro.com/shop/en/blazer-brass-handgun-ammo',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.92 Safari/537.36 Vivaldi/2.9.1705.38',
'x-requested-with': 'XMLHttpRequest',
},
)
print(response.text)
I get the same result as before ("status": "Status Not Available", etc.)
Here's my attempt at the second solution:
# python 3.7.3
import requests
from requests import Session
url = "https://www.basspro.com/shop/en/blazer-brass-handgun-ammo"
r = requests.get(url)
cookies = r.cookies # the type is RequestsCookieJar
response = requests.post(
url='https://www.basspro.com/shop/BPSGetInventoryStatusByIDView',
data={
'productId': '3074457345616736172',
'itemId': '3074457345616736949',
'isGunFlag': 'false',
},
cookies=cookies,
headers={
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'content-length': '72',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.basspro.com',
'referer': 'https://www.basspro.com/shop/en/blazer-brass-handgun-ammo',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.92 Safari/537.36 Vivaldi/2.9.1705.38',
'x-requested-with': 'XMLHttpRequest',
},
)
print(response.text)
Again, I get the same result as before. What am I doing wrong?
Can you try it like this:
session = Session()
session.get("https://www.basspro.com/shop/en/blazer-brass-handgun-ammo")
Then make all the following calls through session.xxx and do not pass the cookies parameter at all.
Another way I have tested:
cookies = r.cookies  # the type is RequestsCookieJar
requests.post(..., cookies=cookies, ...)
Finally, I tested that this works (please compare it carefully with your version):
from requests import Session
session = Session()
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
r1 = session.get("https://www.basspro.com/shop/en/blazer-brass-handgun-ammo",headers={'user-agent': agent})
response = session.post(
url='https://www.basspro.com/shop/BPSGetOnlineInventoryStatusByIDView',
data={
'productId': '3074457345616736172',
'catalogId': '3074457345616676768',
'storeId': '715838534',
'langId':-1
},
headers={
'user-agent': agent,
'x-requested-with': 'XMLHttpRequest',
},
cookies=r1.cookies
)
print(response.text)
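The key differences from the attempts in the question appear to be the browser-like User-Agent on the initial GET and reusing the same Session for both calls. Since a Session persists cookies on its own, the explicit cookies=r1.cookies argument is probably redundant; a slightly shorter sketch of the same idea (endpoint and form fields taken from the answer above):
from requests import Session

session = Session()
agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36')
session.headers.update({'user-agent': agent, 'x-requested-with': 'XMLHttpRequest'})

# The initial GET stores the site's cookies on the session.
session.get('https://www.basspro.com/shop/en/blazer-brass-handgun-ammo')

# The session re-sends those cookies automatically on the POST.
response = session.post(
    'https://www.basspro.com/shop/BPSGetOnlineInventoryStatusByIDView',
    data={
        'productId': '3074457345616736172',
        'catalogId': '3074457345616676768',
        'storeId': '715838534',
        'langId': -1,
    },
)
print(response.text)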
I know there is built-in middleware to handle downloads, but it only accepts a URL. In my case the download link is a POST request: when I make that POST request, a PDF file starts downloading. Now I want to download that file from the POST request in Scrapy.
The website is http://scrb.bihar.gov.in/View_FIR.aspx. You can enter the district Aurangabad and the police station Kasma PS. In the last column, Status, there is a link that downloads the file.
ps_x = '//*[@id="ctl00_ContentPlaceHolder1_ddlPoliceStation"]//option[.="Kasma PS"]/@value'
police_station_val = response.xpath(ps_x).extract_first()
d_x = '//*[@id="ctl00_ContentPlaceHolder1_ddlDistrict"]//option[.="Aurangabad"]/@value'
district_val = response.xpath(d_x).extract_first()
viewstate = response.xpath(self.viewstate_x).extract_first()
viewstategen = response.xpath(self.viewstategen_x).extract_first()
eventvalidator = response.xpath(self.eventvalidator_x).extract_first()
eventtarget = response.xpath(self.eventtarget_x).extract_first()
eventargs = response.xpath(self.eventargs_x).extract_first()
lastfocus = response.xpath(self.lastfocus_x).extract_first()
payload = {
'__EVENTTARGET': eventtarget,
'__EVENTARGUMENT': eventargs,
'__LASTFOCUS': lastfocus,
'__VIEWSTATE': viewstate,
'__VIEWSTATEGENERATOR': viewstategen,
'__EVENTVALIDATION': eventvalidator,
'ctl00$ContentPlaceHolder1$ddlDistrict': district_val,
'ctl00$ContentPlaceHolder1$ddlPoliceStation': police_station_val,
'ctl00$ContentPlaceHolder1$optionsRadios': 'radioPetioner',
'ctl00$ContentPlaceHolder1$txtSearchBy': '',
'ctl00$ContentPlaceHolder1$rptItem$ctl06$lnkStatus.x': '21',
'ctl00$ContentPlaceHolder1$rptItem$ctl06$lnkStatus.y': '24',
}
headers = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Origin': 'http://scrb.bihar.gov.in',
'Upgrade-Insecure-Requests': '1',
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Referer': 'http://scrb.bihar.gov.in/View_FIR.aspx',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-US,en;q=0.9',
}
# req = requests.post(response.url, data=payload, headers=headers)
# with open('pdf/ch.pdf', 'w+b') as f:
# f.write(req.content)
When you click download, the web browser sends a POST request, so the answer mentioned by El Ruso earlier is applicable in your case:
.....
def parse(self, response):
......
yield scrapy.FormRequest("http://scrb.bihar.gov.in/View_FIR.aspx",.#your post request configuration, callback=self.save_pdf)
def save_pdf(self, response):
path = response.url.split('/')[-1]
self.logger.info('Saving PDF %s', path)
with open(path, 'wb') as f:
f.write(response.body)
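One caveat with the snippet above: for this site the POST goes back to View_FIR.aspx, so response.url.split('/')[-1] names every saved file View_FIR.aspx. A hedged alternative is to carry an identifier on the request and name the file from that (the fir_id meta key below is hypothetical; pass whatever identifies the record when yielding the FormRequest):
def save_pdf(self, response):
    # Name the file from request metadata instead of the URL,
    # since the URL is the same View_FIR.aspx for every download.
    fir_id = response.meta.get('fir_id', 'fir')
    path = f'{fir_id}.pdf'
    self.logger.info('Saving PDF %s', path)
    with open(path, 'wb') as f:
        f.write(response.body)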