read json from web by scrapy in Python2

read json from web by scrapy in Python2 - python

I want to extract JSON data from a web page, so I've inspected it. Data I need is stored in the below format:
<script type="application/ld+json">
{
'data I want to extract'
}
</script>
I tried to use:
import scrapy
import json
class OpenriceSpider(scrapy.Spider):
name = 'openrice'
allowed_domains = ['www.openrice.com']
def start_requests(self):
headers = {
'accept-encoding': 'gzip, deflate, sdch, br',
'accept-language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'cache-control': 'max-age=0',
}
url = 'https://www.openrice.com/en/hongkong/r-kitchen-one-cafe-sha-tin-western-r483821'
yield scrapy.Request(url=url, headers=headers, callback=self.parse)
def parse(self, response): # response = request url ?
items = []
jsonresponse = json.loads(response)
But it doesn't work, how should I change it?

You need to locate that script element in the HTML source, extract it's text and only then load with json.loads():
script = response.xpath("//script[#type='application/ld+json']/text()").extract_first()
json_data = json.loads(script)
print(json_data)
Here, I am using the not so common application/ld+json to locate the script, but there are many other options as well - like, locate the script using some text you know it is in the script itself:
//script[contains(., 'Restaurant')]

Related

Scrapy - 400 Response with Headers

Here's an example link I'm trying to scrape: https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Professional-7-Burners-4-cu-ft-2-cu-ft-Double-Oven-Convection-Dual-Fuel-Range-Stainless-Steel-Common-48-in-Actual-48-in/1000514227
My scraper was working fine till today so I'm guessing Lowe's added more protection against bots :(
After some research, I found that I would have to add headers to my web scraper so I can emulate a real user.
Opened up Dev Console -> Network -> XHR/Fetch -> Found JSON File.
Here's my scrapy script
# -*- coding: utf-8 -*-
import scrapy
from ..items import LowesItem
import re
import pandas as pd
import requests
import json
from scrapy.http import Request
from datetime import date
class LowesSpider(scrapy.Spider):
name = 'Lowes'
def start_requests(self):
HEADERS = {
'method': 'GET',
'scheme': 'https',
'authority': 'content.syndigo.com',
'Accept': '*/*',
'Content-Type': 'text/plain',
'Origin': 'https://lowes.com',
'Accept-Language': 'en-US,en;q=0.9',
'Host': 'content.syndigo.com',
'User-Agent': ' Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15',
'Referer': 'https://www.lowes.com/',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Cookie': 'sn=0321'
}
start_urls = ['https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Professional-7-Burners-4-cu-ft-2-cu-ft-Double-Oven-Convection-Dual-Fuel-Range-Stainless-Steel-Common-48-in-Actual-48-in/1000514227']
for url in start_urls:
yield Request(url,
headers=HEADERS,
meta={'dont_merge_cookies': True,
'url':url})
def parse(self, response):
for item in self.parseLowes(response):
yield item
pass
def parseLowes(self, response):
item = LowesItem() #items from items.py
script_tag = response.xpath('//script[#type="application/ld+json"]/text()').get() #get js container
productPrice = json.loads(script_tag)[2]["offers"]["price"]
productURL = response.url
url = response.meta['url']
productSKU = url.split("=")[-1]
scrapedDate = date.today()
#item['productName'] = productName #display product name
item['productOMS'] = productSKU
item['productPrice'] = productPrice #display price and assign to variable
item['productURL'] = productURL #displayURL
item['scrapedDate'] = scrapedDate
yield item
When I run scrapy, I get 400 as a response from the command.

From what I can see about the network connection, the issue is related to the their CDN (Akamai) which is blocking the access.
I was able to access your link and see the product from Microsoft Edge (version 107). In my request the user agent is:
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.26
So, try to modify in your code the 'User-Agent' key with that value.

scrapy with payload request

I'm trying to get a POST request, but I don't know what's wrong with my code that the data doesn't come.
The following message is displayed:
HTTP status code is not handled or not allowed
This is the website
A screenshot of the header:
This is my code:
import json
import scrapy
class MySpider(scrapy.Spider):
name = 'pb'
payload = {"version":"1.0.0","queries":[{"Query":{"Commands":[{"SemanticQueryDataShapeCommand":{"Query":{"Version":2,"From":[{"Name":"e","Entity":"Events"},{"Name":"d","Entity":"DAX"}],"Select":[{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Date Start"},"Name":"Events.Date Start"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Event Type"},"Name":"Events.Event Type"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Name"},"Name":"Events.Name"},{"Measure":{"Expression":{"SourceRef":{"Source":"d"}},"Property":"Length"},"Name":"Events.Total Days"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Location"},"Name":"Events.Location"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Link to Event"},"Name":"Events.Link to Event"},{"Measure":{"Expression":{"SourceRef":{"Source":"d"}},"Property":"Days Until Event"},"Name":"DAX.Days Until"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Link to Submit"},"Name":"Events.Link to Submit"},{"Measure":{"Expression":{"SourceRef":{"Source":"d"}},"Property":"Event Type Number"},"Name":"DAX.Event Type Number"}],"OrderBy":[{"Direction":1,"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Date Start"}}}]},"Binding":{"Primary":{"Groupings":[{"Projections":[0,1,2,3,4,5,6,7,8]}]},"DataReduction":{"DataVolume":3,"Primary":{"Window":{"Count":500}}},"Aggregates":[{"Select":3,"Aggregations":[{"Min":{}},{"Max":{}}]}],"SuppressedJoinPredicates":[8],"Version":1}}}]},"CacheKey":"{\"Commands\":[{\"SemanticQueryDataShapeCommand\":{\"Query\":{\"Version\":2,\"From\":[{\"Name\":\"e\",\"Entity\":\"Events\"},{\"Name\":\"d\",\"Entity\":\"DAX\"}],\"Select\":[{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Date Start\"},\"Name\":\"Events.Date Start\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Event Type\"},\"Name\":\"Events.Event Type\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Name\"},\"Name\":\"Events.Name\"},{\"Measure\":{\"Expression\":{\"SourceRef\":{\"Source\":\"d\"}},\"Property\":\"Length\"},\"Name\":\"Events.Total Days\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Location\"},\"Name\":\"Events.Location\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Link to Event\"},\"Name\":\"Events.Link to Event\"},{\"Measure\":{\"Expression\":{\"SourceRef\":{\"Source\":\"d\"}},\"Property\":\"Days Until Event\"},\"Name\":\"DAX.Days Until\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Link to Submit\"},\"Name\":\"Events.Link to Submit\"},{\"Measure\":{\"Expression\":{\"SourceRef\":{\"Source\":\"d\"}},\"Property\":\"Event Type Number\"},\"Name\":\"DAX.Event Type Number\"}],\"OrderBy\":[{\"Direction\":1,\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Date Start\"}}}]},\"Binding\":{\"Primary\":{\"Groupings\":[{\"Projections\":[0,1,2,3,4,5,6,7,8]}]},\"DataReduction\":{\"DataVolume\":3,\"Primary\":{\"Window\":{\"Count\":500}}},\"Aggregates\":[{\"Select\":3,\"Aggregations\":[{\"Min\":{}},{\"Max\":{}}]}],\"SuppressedJoinPredicates\":[8],\"Version\":1}}}]}","QueryId":"","ApplicationContext":{"DatasetId":"6427f3c6-42f6-4287-b061-c31c1d2e7ae0","Sources":[{"ReportId":"6e442642-8594-4894-bc32-0ab7f4620772"}]}}],"cancelQueries":[],"modelId":1226835}
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
def start_requests(self):
yield scrapy.Request(
url='https://wabi-australia-southeast-api.analysis.windows.net/public/reports/querydata?synchronous=true',
method='POST',
body=json.dumps(self.payload),
headers={
'Accept-Language': 'pt-BR,pt;q=0.9,en;q=0.8',
'ActivityId': '1d3ecdc2-5dc0-801e-4140-82a258f127a6',
'Connection': 'keep-alive',
'Content-Length': '3462',
'Content-Type': 'application/json;charset=UTF-8',
'Host': 'wabi-australia-southeast-api.analysis.windows.net',
'Origin': 'https://app.powerbi.com',
'Referer': 'https://app.powerbi.com/view?r=eyJrIjoiMGIwNTY2MjgtMzJhYy00MzEwLTk5MDAtYTI2MGVlMzk1NjM2IiwidCI6IjZmMGU5YzQyLTk2Y2UtNDU1MS05NzAxLWJhMzFkMGQ2ZDE5ZSJ9',
'RequestId': '11c18fe6-00da-7df4-952c-98ba7bdf188e',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'cross-site',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
'X-PowerBI-ResourceKey': '0b056628-32ac-4310-9900-a260ee395636'
}
)
def parse(self, response):
items = json.loads(response.text)
yield {"data":items}

The request in your screenshot is a GET request.
The behaviour of this website is very interesting!
Let's examine it.
By looking at the network panel we can see that GET request is being made to some complex url with many various headers. However It seems that the header X-PowerBI-ResourceKey is the only one that's needed and it controls what content the request will return.
So all we need to replicate this is find the X-PowerBI-ResourceKey value.
If you take a look at the source code of the html page:
https://app.powerbi.com/view?r=eyJrIjoiMGIwNTY2MjgtMzJhYy00MzEwLTk5MDAtYTI2MGVlMzk1NjM2IiwidCI6IjZmMGU5YzQyLTk2Y2UtNDU1MS05NzAxLWJhMzFkMGQ2ZDE5ZSJ9
Here we can see that javascript's atob method is used on url parameter. This is javascripts b64decode function. We can run it in python:
$ ptpython
>>> from base64 import b64decode
>>> b64decode("eyJrIjoiMGIwNTY2MjgtMzJhYy00MzEwLTk5MDAtYTI2MGVlMzk1NjM2IiwidCI6IjZmMGU5YzQyLTk2Y2UtNDU1MS05NzAxLWJhMzF
1 kMGQ2ZDE5ZSJ9")
b'{"k":"0b056628-32ac-4310-9900-a260ee395636","t":"6f0e9c42-96ce-4551-9701-ba31d0d6d19e"}'
We got it figured out! Now lets put everything together in our crawler:
import json
from base64 import b64decode
from w3lib.url import url_query_parameter
def parse(self, response):
url = "https://app.powerbi.com/view?r=eyJrIjoiMGIwNTY2MjgtMzJhYy00MzEwLTk5MDAtYTI2MGVlMzk1NjM2IiwidCI6IjZmMGU5YzQyLTk2Y2UtNDU1MS05NzAxLWJhMzFkMGQ2ZDE5ZSJ9"
# get the "r" paremeter from url
resource_key = url_query_parameter(url, 'r')
# base64 decode it
resource_key = b64decode(resource_key)
# {'k': '0b056628-32ac-4310-9900-a260ee395636', 't': '6f0e9c42-96ce-4551-9701-ba31d0d6d19e'}
# it's a json string - load it and get key "k"
resource_key = json.loads(resource_key)['k']
headers = {
'Accept': "application/json, text/plain, */*",
# 'X-PowerBI-ResourceKey': "0b056628-32ac-4310-9900-a260ee395636",
'X-PowerBI-ResourceKey': resource_key,
'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
'Accept-Encoding': "gzip, deflate, br",
'Accept-Language': "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
}
yield Request(url, headers=headers)

Python script to download file from button on website

I want to download an xls file by clicking the button "Export to excel" from the following url: https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD.
More specifically the button: name = "ctl00$MainContent$btndata". I've already been able to do this using selenium, but, I plan on building a docker image with this script and running as a docker container because this xls is regularly updated and I need the most current data on my local machine and it doesn't make sense to have a browser open that often to fetch this data. I understand there are headless versions of chrome and firefox although I don't believe they support downloads. Also, I understand that web get will not work in this situation because the button is not a static link to the resource. Maybe there's a completely different approach for downloading and updating this data to my computer?
import urllib
import requests
from bs4 import BeautifulSoup
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=.08',
'Origin': 'https://www.tampagov.net',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD',
'Accept-Encoding': 'gzip,deflate,br',
'Accept-Language': 'en-US,en;q=0.5',
}
class MyOpener(urllib.FancyURLopener):
version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
myopener = MyOpener()
url = 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD'
# first HTTP request without form data
f = myopener.open(url)
soup = BeautifulSoup(f, "html.parser")
# parse and retrieve two vital form values
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
formData = (
('__EVENTVALIDATION', eventvalidation),
('__VIEWSTATE', viewstate),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('Accept-Encoding', 'gzip, deflate, br'),
('Accept-Language', 'en-US,en;q=0.5'),
('Host', 'apps,tampagov.net'),
('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'))
payload = urllib.urlencode(formData)
# second HTTP request with form data
r = requests.post("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", params=payload)
print(r.status_code, r.reason)

First: I removed import urllib because 'requests' is enough.
Some issues you have:
You don't need to create one nested tuple then apply urllib.urlencode, uses one dictionary instead that is one reason why requests is so popular.
You'd better populate all parameters for the http post request. like below what I did, otherwise, the request may be rejected by the backend.
I added one simple codes to save the content to the local.
PS: for those form parameters, you can get their values by analysis the html responsed from http get. Also you can customize the parameters as you need, like page size etc.
Below is a working sample:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def downloadExcel():
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=.08',
'Origin': 'https://www.tampagov.net',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD',
'Accept-Encoding': 'gzip,deflate,br',
'Accept-Language': 'en-US,en;q=0.5',
}
r = requests.get("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", headers=headers)
# parse and retrieve two vital form values
if not r.status_code == 200:
print('Error')
return
soup = BeautifulSoup(r.content, "html.parser")
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
print ('__VIEWSTATE:', viewstate)
print ('__EVENTVALIDATION:', eventvalidation)
formData = {
'__EVENTVALIDATION': eventvalidation,
'__VIEWSTATE': viewstate,
'__EVENTTARGET': '',
'__EVENTARGUMENT': '',
'__VIEWSTATEGENERATOR': '49DF2C80',
'MainContent_RadScriptManager1_TSM':""";;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:59e0a739-153b-40bd-883f-4e212fc43305:ea597d4b:b25378d2;Telerik.Web.UI, Version=2015.2.826.40, Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:c2ba43dc-851e-4009-beab-3032480b6a4b:16e4e7cd:f7645509:24ee1bba:c128760b:874f8ea2:19620875:4877f69a:f46195d3:92fe8ea0:fa31b949:490a9d4e:bd8f85e4:58366029:ed16cbdc:2003d0b8:88144a7a:1e771326:aa288e2d:b092aa46:7c926187:8674cba1:ef347303:2e42e72a:b7778d6c:c08e9f8a:e330518b:c8618e41:e4f8f289:1a73651d:16d8629e:59462f1:a51ee93e""",
'search_block_form':'',
'ctl00$MainContent$btndata':'Export to Excel',
'ctl00_MainContent_RadWindow1_C_RadGridVehicles_ClientState':'',
'ctl00_MainContent_RadWindow1_ClientState':'',
'ctl00_MainContent_RadWindowManager1_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl00$PageSizeComboBox':'20',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl00_PageSizeComboBox_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RDIPFdispatch_time':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RDIPFdispatch_time$dateInput':'',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RDIPFdispatch_time_dateInput_ClientState':'{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","minDateStr":"1900-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00","lastSetTextBoxValue":""}',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RDIPFdispatch_time_ClientState':'{"minDateStr":"1900-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00"}',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1address':'',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1address_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1case_description':'',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1case_description_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_grid':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1report_number':'',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1report_number_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_out_max_date':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_out_rowcount':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl03$ctl01$PageSizeComboBox':'20',
'ctl00_MainContent_RadGrid1_ctl00_ctl03_ctl01_PageSizeComboBox_ClientState':'',
'ctl00_MainContent_RadGrid1_rfltMenu_ClientState':'',
'ctl00_MainContent_RadGrid1_gdtcSharedTimeView_ClientState':'',
'ctl00_MainContent_RadGrid1_gdtcSharedCalendar_SD':'[]',
'ctl00_MainContent_RadGrid1_gdtcSharedCalendar_AD':'[[1900,1,1],[2099,12,31],[2018,3,29]]',
'ctl00_MainContent_RadGrid1_ClientState':'',
}
# second HTTP request with form data
r = requests.post("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", data=formData, headers=headers)
print('received:', r.status_code, len(r.content))
with open(r"C:\Users\xxx\Desktop\test\test\apps.xls", "wb") as handle:
for data in tqdm(r.iter_content()):
handle.write(data)
downloadExcel()

Find out the URL you need to fetch as #Sphinx explains, and then simulate it using something similar to:
import urllib.request
import urllib.parse
data = urllib.parse.urlencode({...})
data = data.encode('ascii')
with urllib.request.urlopen("http://...", data) as fd:
print(fd.read().decode('utf-8'))
Take a look at the documentation of urllib.

Python Web crawling using Scrapy for html page which require form filling

I am trying to crawl this site which first requires me to fill the form and then get to the required page:
http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx
I have written following code but don't know whats wrong. Please help:
import scrapy
class SpidyQuotesViewStateSpider(scrapy.Spider):
name = 'spidyquotes-viewstate'
start_urls = ['http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx']
download_delay = 1.5
def parse(self, response):
yield scrapy.FormRequest.from_response(
response,
formdata={
'ctl00_MainContent_ToolkitScriptManager1_HiddenField':response.css('input#ctl00_MainContent_ToolkitScriptManager1_HiddenField::attr(value)').extract_first(),
'__EVENTTARGET':'ctl00$MainContent$Rbl_Rpt_type$0',
#'__EVENTARGUMENT':'',
#'__LASTFOCUS':'',
#'__VIEWSTATE':response.css('input#__VIEWSTATE::attr(value)').extract_first(),
#'__VIEWSTATEGENERATOR':response.css('input#__VIEWSTATEGENERATOR::attr(value)').extract_first(),
#'__VIEWSTATEENCRYPTED':response.css('input#__VIEWSTATEENCRYPTED::attr(value)').extract_first(),
#'__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
'ctl00$MainContent$Ddl_Rpt_type':'Retail',
'ctl00$MainContent$ddl_Language':'English',
'ctl00$MainContent$Rbl_Rpt_type':'Price+report',
},
callback=self.parse_tags,
)
def parse_tags(self, response):
yield scrapy.FormRequest.from_response(
response,
formdata={
'ctl00_MainContent_ToolkitScriptManager1_HiddenField':response.css('input#ctl00_MainContent_ToolkitScriptManager1_HiddenField::attr(value)').extract_first(),
'__EVENTTARGET':'ctl00$MainContent$Ddl_Rpt_Option0',
#'__EVENTARGUMENT':'',
#'__LASTFOCUS':'',
#'__VIEWSTATE':response.css('input#__VIEWSTATE::attr(value)').extract_first(),
#'__VIEWSTATEGENERATOR':response.css('input#__VIEWSTATEGENERATOR::attr(value)').extract_first(),
#'__VIEWSTATEENCRYPTED':response.css('input#__VIEWSTATEENCRYPTED::attr(value)').extract_first(),
#'__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
'ctl00$MainContent$Ddl_Rpt_type':'Retail',
'ctl00$MainContent$ddl_Language':'English',
'ctl00$MainContent$Rbl_Rpt_type':'Price+report',
'ctl00$MainContent$Ddl_Rpt_Option0':'Daily+Prices',
},
callback=self.parse_date,
)
def parse_date(self, response):
yield scrapy.FormRequest(
'http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx',
formdata={
#'ctl00_MainContent_ToolkitScriptManager1_HiddenField':response.css('input#ctl00_MainContent_ToolkitScriptManager1_HiddenField::attr(value)').extract_first(),
'__EVENTTARGET':'',
#'__EVENTARGUMENT':'',
#'__LASTFOCUS':'',
#'__VIEWSTATE':response.css('input#__VIEWSTATE::attr(value)').extract_first(),
#'__VIEWSTATEGENERATOR':response.css('input#__VIEWSTATEGENERATOR::attr(value)').extract_first(),
#'__VIEWSTATEENCRYPTED':response.css('input#__VIEWSTATEENCRYPTED::attr(value)').extract_first(),
#'__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
'ctl00$MainContent$Ddl_Rpt_type':'Retail',
'ctl00$MainContent$ddl_Language':'English',
'ctl00$MainContent$Rbl_Rpt_type':'Price+report',
'ctl00$MainContent$Ddl_Rpt_Option0':'Daily+Prices',
'ctl00$MainContent$Txt_FrmDate':'01/02/2017',
'ctl00$MainContent$btn_getdata1':'Get+Data',
},
callback=self.parse_results,
)
def parse_results(self, response):
response.css('div.Panel1')

You are scraping a .Net website. They use variables like VIEWSTATE and EVENTVALIDATION etc, you must have to send same values along with request.

Possibly you need to specify headers as well.
Check with Google inspector what are current ones or you can quickly grab below.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.8,ru;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
'X-Compress': '0',
}

See below - you need to follow same

How to pass arguments for get method with urllib?

The response web page is as below when to slect title and input wordpress.
Here is my python code to pass arguments for get method with python3.
import urllib.request
import urllib.parse
url = 'http://www.it-ebooks.info/'
values = {'q': 'wordpress','type': 'title'}
data = urllib.parse.urlencode(values).encode(encoding='utf-8',errors='ignore')
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0' }
request = urllib.request.Request(url=url, data=data,headers=headers,method='GET')
response = urllib.request.urlopen(request)
buff = response.read()
html = buff.decode("utf8")
print(html)
I can't get the desired output web page.
How to pass arguments for get method with urllib in my example?

The data kwarg of urllib.request.Request is only used for POST requests as it modifies the request's body.
GET requests simply use URL parameters, so you should append these to the url:
params = '?q=wordpress&type=title'
url = 'http://www.it-ebooks.info/search/{}'.format(params)
You can of course take the time and generalize this into a generic function.

is better if you use the library called requests
import requests
headers = {
'DNT': '1',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'es-ES,es;q=0.8,en;q=0.6',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Referer': 'http://www.it-ebooks.info/',
'Connection': 'keep-alive',
}
r = requests.get('http://www.it-ebooks.info/search/?q=wordpress&type=title', headers=headers)
print r.content

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

read json from web by scrapy in Python2 - python

Related

Scrapy - 400 Response with Headers

scrapy with payload request

Python script to download file from button on website

Python Web crawling using Scrapy for html page which require form filling

How to pass arguments for get method with urllib?

Categories

Resources