How to switch page in a table with scrapy? - python

I am trying to get the table from this site: https://www.burgrieden.de/index.php?id=77
I managed to scrape the first page, but I cannot access the other four pages.
The only explanations and examples I can find rely on a direct link to the next page or on simple URL manipulation.
I tried inspecting the button and watching the network log when pressing it, but nothing worked.
How can I get to the next page with Scrapy?
Here is what I have so far:
from abc import ABC

import scrapy
from scrapy.crawler import CrawlerProcess
import re
from datetime import datetime


class TrashSpider(scrapy.Spider, ABC):
    name = "Trasher"
    start_urls = ['https://www.burgrieden.de/index.php?id=77']

    def parse(self, response, **kwargs):
        # skip the header row of the "contenttable" table
        for row in response.xpath('//*[@class="contenttable"]//tr')[1:]:
            d = row.xpath('td//text()')[0].extract()
            match = re.search(r'\d{2}\.\d{2}\.\d{4}', d)
            date = datetime.strptime(match.group(), '%d.%m.%Y').date()
            entry = {
                'date': date,
                'type': row.xpath('td//text()')[2].extract()
            }


process = CrawlerProcess()
process.crawl(TrashSpider)
process.start()
Thanks for your help in advance.

For everyone who has the same question: I figured it out.
It is a button that triggers a POST request. To reproduce this request with Scrapy you have to define the request headers and the form data.
Both can be found using your browser's network analyzer:
On the left side you can see the method used to request the site. The marked entry says POST.
Now we need the headers, which can be found in the bottom-right panel, and put them in a dictionary in our spider class.
Make sure to leave out Content-Length, or not to add it to your dictionary at all, because Scrapy sends its own Content-Length header, and when more than one is sent the site rejects the request with a 400 instead of a 200.
The form data can be found (at least in Firefox) under the request tab:
Here we need that whole line exactly as it is, stored in a variable so we can use it in the actual request.
This is how it is supposed to look:
# request-header for POST request
headers = {
    'Host': 'www.burgrieden.de',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded',
    # 'Content-Length': '219',
    'Origin': 'https://www.burgrieden.de',
    'Connection': 'keep-alive',
    'Referer': 'https://www.burgrieden.de/index.php?id=77',
    'Cookie': 'style=normal.css',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
}

# POST request form data for the first table page
form_data_p1 = 'publish%5BbtnStart%5D=+%7C%3C+&publish%5Bstart%5D=40&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
To make sure it works, you have to disable Scrapy's cookie handling. Just put this in your custom spider class:
# custom scraper settings
custom_settings = {
    # pass cookies along with headers
    'COOKIES_ENABLED': False
}
For the actual request you have to use the start_requests() method.
# crawler's entry point
def start_requests(self):
    # make HTTP POST request
    # page 1
    yield scrapy.Request(
        url=self.start_url,
        method='POST',
        headers=self.headers,
        body=self.form_data_p1,
        callback=self.parse
    )
Now you can parse the response with your normal parse() method.
If you run into problems, try leaving the Host header blank or deleting it.
Here is the whole class code:
class TrashSpider(scrapy.Spider, ABC):
    name = "Trasher"
    start_url = "https://www.burgrieden.de/index.php?id=77"

    # request-header for POST request
    headers = {
        'Host': 'www.burgrieden.de',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded',
        # 'Content-Length': '219',
        'Origin': 'https://www.burgrieden.de',
        'Connection': 'keep-alive',
        'Referer': 'https://www.burgrieden.de/index.php?id=77',
        'Cookie': 'style=normal.css',
        'Upgrade-Insecure-Requests': '1',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache'
    }

    # POST request form data for every table page
    form_data_p1 = 'publish%5BbtnStart%5D=+%7C%3C+&publish%5Bstart%5D=40&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
    form_data_p2 = 'publish%5BbtnNext%5D=+%3E%3E+&publish%5Bstart%5D=0&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
    form_data_p3 = 'publish%5BbtnNext%5D=+%3E%3E+&publish%5Bstart%5D=10&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
    form_data_p4 = 'publish%5BbtnNext%5D=+%3E%3E+&publish%5Bstart%5D=20&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
    form_data_p5 = 'publish%5BbtnNext%5D=+%3E%3E+&publish%5Bstart%5D=30&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'

    # custom scraper settings
    custom_settings = {
        # pass cookies along with headers
        'COOKIES_ENABLED': False
    }

    entrys_crawled = []

    # crawler's entry point
    def start_requests(self):
        # make HTTP POST request to burgrieden
        # page 1
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p1,
            callback=self.parse
        )
        # page 2
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p2,
            callback=self.parse
        )
        # page 3
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p3,
            callback=self.parse
        )
        # page 4
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p4,
            callback=self.parse
        )
        # page 5
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p5,
            callback=self.parse
        )

    # parse date and description from the table "contenttable",
    # extract the date from the awkwardly formatted text and store it in a dictionary entry as a datetime
    def parse(self, response, **kwargs):
        for row in response.xpath('//*[@class="contenttable"]//tr')[1:]:
            d = row.xpath('td//text()')[0].extract()
            match = re.search(r'\d{2}\.\d{2}\.\d{4}', d)
            entry = {
                'date': datetime.strptime(match.group(), '%d.%m.%Y').date(),
                'type': row.xpath('td//text()')[2].extract()
            }
            self.entrys_crawled.append(entry)
There is probably a better way to process more than one POST request, but it worked for me. If someone wants to improve it and send it to me, please feel free to do so.
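As one idea for such an improvement (my own sketch, not taken from the original post): the five form-data strings differ only in the button field and the publish[start] offset, so a drop-in replacement for start_requests() inside TrashSpider could build them in a loop:

    # possible refactoring: generate the five form bodies instead of hard-coding them
    def start_requests(self):
        # (button name, URL-encoded button value, start offset) copied from the bodies above
        pages = [
            ('btnStart', '+%7C%3C+', 40),   # page 1
            ('btnNext', '+%3E%3E+', 0),     # page 2
            ('btnNext', '+%3E%3E+', 10),    # page 3
            ('btnNext', '+%3E%3E+', 20),    # page 4
            ('btnNext', '+%3E%3E+', 30),    # page 5
        ]
        for button, value, start in pages:
            body = (
                'publish%5B{btn}%5D={val}&publish%5Bstart%5D={start}'
                '&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03'
                '&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D='
                '&publish%5BmonthTo%5D=&publish%5ByearTo%5D='
                '&publish%5Bfulltext%5D=&id=77'
            ).format(btn=button, val=value, start=start)
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                headers=self.headers,
                body=body,
                callback=self.parse,
            )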
For everyone who is wondering about entrys_crawled: I processed it into an .ics file in Scrapy's close() method.
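For illustration, a minimal sketch of what that last step could look like inside TrashSpider (my own code, the original post does not show it), using the spider's closed() hook and writing the calendar lines by hand; the file name trash_dates.ics is made up:

    # minimal sketch: turn entrys_crawled into a simple .ics file when the spider closes
    def closed(self, reason):
        lines = ['BEGIN:VCALENDAR', 'VERSION:2.0']
        for entry in self.entrys_crawled:
            lines += [
                'BEGIN:VEVENT',
                'DTSTART;VALUE=DATE:' + entry['date'].strftime('%Y%m%d'),
                'SUMMARY:' + entry['type'],
                'END:VEVENT',
            ]
        lines.append('END:VCALENDAR')
        with open('trash_dates.ics', 'w', encoding='utf-8') as f:  # hypothetical file name
            f.write('\n'.join(lines))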

Related

request data from api with bearer token and refresh token

I am still a beginner at web scraping. I am trying to extract data from an API, but the problem is that it uses a Bearer token, and this token changes after 5 to 6 hours, so I have to go back to the web page and copy the token again. Is there any way to extract the data without opening the web page and copying the token every time?
I also found the following info in the network request; someone told me that I could use the refresh_token to get access, but I don't know how to do that:
Cache-Control: no-cache,
Connection: keep-alive,
Content-Length: 177,
Content-Type: application/json;charset=UTF-8,
Cookie: dhh_token=; refresh_token=; _hurrier_session=81556f54bf555a952d1a7f780766b028,
dnt: 1
import json

import pandas as pd
import requests
from time import sleep


def make_request():
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'sec-ch-ua': '^\\^',
        'Accept': 'application/json',
        'Authorization': 'Bearer eyJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJMdXRiZlZRUVZhWlpmNTNJbGxhaXFDY3BCVTNyaGtqZiIsInN1YiI6MzEzMTcwLCJleHAiOjE2MjQzMjU2NDcsInJvbCI6ImRpc3BhdGNoZXIiLCJyb2xlcyI6WyJodXJyaWVyLmRpc3BhdGNoZXIiLCJjb2QuY29kX21hbmFnZXIiXSwibmFtIjoiRXNsYW0gWmVmdGF3eSIsImVtYSI6ImV6ZWZ0YXd5QHRhbGFiYXQuY29tIiwidXNlcm5hbWUiOiJlemVmdGF3eUB0YWxhYmF0LmNvbSIsImNvdW50cmllcyI6WyJrdyIsImJoIiwicWEiLCJhZSIsImVnIiwib20iLCJqbyIsInEyIiwiazMiXX0.XYykBij-jaiIS_2tdqKFIfYGfw0uS0rKmcOTSHor8Nk',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'url',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'url',
        'Accept-Language': 'en-US,en;q=0.9,ar-EG;q=0.8,ar;q=0.7',
        'dnt': '1',
    }
    data = {
        'status': 'picked'
    }
    response = requests.post('url/api', headers=headers, json=data)
    print(response.text)
    return json.loads(response.text)


def extract_data(row):
    data_row = {
        'order_id': row['order']['code'],
        'deedline': row['order']['deadline'].split('.')[0],
        'picked_at': row['picked_at'].split('.')[0],
        'picked_by': row['picked_by'],
        'processed_at': row['processed_at'],
        'type': row['type']
    }
    return data_row


def periodique_extract(delay):
    extract_count = 0
    while True:
        extract_count += 1
        data = make_request()
        if extract_count == 1:
            df = pd.DataFrame([extract_data(row) for row in data['data']])
            df.to_csv(r"C:\Users\di\Desktop\New folder\a.csv", mode='a')
        else:
            df = pd.DataFrame([extract_data(row) for row in data['data']])
            df.to_csv(r"C:\Users\di\Desktop\New folder\a.csv", mode='a', header=False)
        print('extracting data {} times'.format(extract_count))
        sleep(delay)


periodique_extract(60)
# note: the website tracks live operations, so I extract the data every 1 min
Sometimes these tokens require JavaScript execution to be set and automatically added to API requests. That means you need to open the page in something that actually runs the JavaScript in order to get the token, i.e. actually open the page in a browser.
One solution could be to use something like Selenium or Puppeteer to open the page whenever the token expires, get a new token, and then feed it to your script. This depends on the specifics of the page; without a link, the correct solution is hard to give. But if opening the page in your browser, copying the token, and then running your script works, then this approach is very likely to work as well.
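A rough sketch of that idea (my own illustration, not part of the answer), assuming the page stores the token in the browser's localStorage under a key such as access_token; both the key name and the login URL are assumptions that depend on the actual site:

# rough sketch: refresh the Bearer token with Selenium
# assumption: the page keeps the token in localStorage under 'access_token'
from selenium import webdriver


def fetch_token(login_url):
    driver = webdriver.Chrome()
    try:
        driver.get(login_url)  # perform/complete the login here if needed
        # 'access_token' is a hypothetical key name, adjust it to the real page
        return driver.execute_script(
            "return window.localStorage.getItem('access_token');")
    finally:
        driver.quit()


# then, before each round of requests:
# headers['Authorization'] = 'Bearer ' + fetch_token('https://example.com/login')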

Scrapy Trying to get Json Response

I am using a scraper to scrape the Steam gaming platform, and I am having trouble with pagination. The comments at this link: https://steamcommunity.com/sharedfiles/filedetails/comments/2460661464
are paginated, and I believe the page makes a POST request to some server. I would like to simulate this request using Scrapy's FormRequest and get all of the comments at once, but I don't know how to do this. What should my headers and formdata look like? Currently they look like this:
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Host': 'steamcommunity.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
}
data = {
    "start": "0",
    "totalcount": comment_number,
    "count": comment_number,
    "sessionid": "d880ab2338b70926db0a9591",
    f"extended_data": "{\"contributors\":[\"{contributor_id}\",{}],\"appid\":289070,\"sharedfile\":{\"m_parentsDetails\":null,\"m_parentBundlesDetails\":null,\"m_bundledChildren\":[],\"m_ownedBundledItems\":[]},\"parent_item_reported\":false}",
    "feature2": "-1"
}
yield FormRequest(url, formdata=data, headers=headers, callback=self.parse_paginated_comments, dont_filter=True, meta={'app_id': app_id, 'game': game, 'workshop_id': workshop_id, 'workshop_name': workshop_name})
What are the correct headers/data and how do I set up my FormRequest to get all of the comments (in this case 1-134)?
I don't know anything about Scrapy, but here's how you could do it using just basic requests and BeautifulSoup.
The API doesn't seem to be very strict about the payload that's POSTed. Even if some parameters are omitted, the API doesn't seem to mind. I've found that you can assign an impossibly large number to the count parameter to have the API return all comments (assuming there will never be more than 99999999 comments in a thread, in this case). I haven't played around with the request headers that much - you could probably trim them down even further.
def get_comments(thread_id):
    import requests
    from bs4 import BeautifulSoup as Soup

    url = "https://steamcommunity.com/comment/PublishedFile_Public/render/76561198401810552/{}/".format(thread_id)
    headers = {
        "Accept": "text/javascript, text/html, application/xml, text/xml, */*",
        "Accept-Encoding": "gzip, deflate",
        "Content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    payload = {
        "start": "0",
        "count": "99999999",
    }

    def to_clean_comment(element):
        return element.text.strip()

    response = requests.post(url, headers=headers, data=payload)
    response.raise_for_status()
    soup = Soup(response.json()["comments_html"], "html.parser")
    yield from map(to_clean_comment, soup.select("div.commentthread_comment_text"))


def main():
    for comment in get_comments("2460661464"):
        print(comment)
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
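For completeness, since the question asked about Scrapy's FormRequest: a rough translation of the same request into a spider might look like the sketch below (my own code, not part of the answer; it reuses the render URL and payload shown above and assumes a Scrapy version where response.json() is available).

import scrapy
from scrapy import FormRequest
from bs4 import BeautifulSoup


class SteamCommentsSpider(scrapy.Spider):
    name = 'steam_comments'  # hypothetical spider name

    def start_requests(self):
        thread_id = '2460661464'
        url = ('https://steamcommunity.com/comment/PublishedFile_Public/'
               'render/76561198401810552/{}/'.format(thread_id))
        # same trick as above: an oversized count returns every comment at once
        yield FormRequest(
            url,
            formdata={'start': '0', 'count': '99999999'},
            callback=self.parse_comments,
        )

    def parse_comments(self, response):
        # the endpoint answers with JSON; the rendered HTML sits under "comments_html"
        soup = BeautifulSoup(response.json()['comments_html'], 'html.parser')
        for element in soup.select('div.commentthread_comment_text'):
            yield {'comment': element.text.strip()}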

Scrapy-Splash doesn't set custom request headers

I am trying to scrape a website using Scrapy + Splash in Python 2.7.
The website uses JavaScript to generate most of the HTML, which is why I need Splash.
First, I make a FormRequest with Scrapy to log in to the website. It is successful.
I then extract "access_token" from the JSON response, because it should be used in the next request as an "Authorization" header, to confirm to the website that I am logged in.
jsonresp = json.loads(response.body_as_unicode())
self.token = 'Bearer ' + jsonresp['access_token']
self.my_headers['Authorization'] = self.token
Before proceeding with SplashRequest, I decided to test the session with scrapy.Request. I passed cookies and the new headers:
yield scrapy.Request('https://www.example.com/products', cookies=self.cookies, dont_filter=True, callback=self.parse_pages, headers=self.my_headers)
The HTML from result.body confirmed that I was logged in. Great!
Calling response.request.headers showed that 'Authorization' header was also sent.
{'Accept-Language': ['en-US,en;q=0.5'],
'Accept-Encoding': ['gzip,deflate'],
'Accept': ['application/json, text/plain, */*'],
'User-Agent': ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'],
'Connection': ['keep-alive'],
'Referer': ['https://www.example.com/Web'],
'Cookie': ["___cookies___"],
'Content-Type': ['application/x-www-form-urlencoded'],
'Authorization': ['Bearer Zyb9c20JW0LLJCTA-GmLtEeL9A48se_AviN9xajP8NZVE8r6TddoPHC6dJnmbQ4RCddM8QVJ2v23ey-kq5f8S12uLMXlLF_WzInNI9eaI29WAcIwNK-FixBpDm4Ws3SqXdwBIXfkqYhd6gJs4BP7sNpAKc93t-A4ws9ckpTyih2cHeC8KGQmTnQXLOYch2XIyT5r9verzRMMGHEiu6kgJWK9yRL19PVqCWDjapYbtutKiTRKD1Q35EHjruBJgJD-Fg_iyMovgYkfy9XtHpAEuUvL_ascWHWvrFQqV-19p-6HQPocEuri0Vu0NsAqutfIbi420_zhD8sDFortDmacltNOw-3f6H1imdGstXE_2GQ']}
Cookie DEBUG showed that all cookies were sent without issues.
After that I substituted scrapy.Request with SplashRequest:
yield SplashRequest('https://www.example.com/products', cookies=self.cookies, callback=self.parse_pages, args={"lua_source": lua_script, 'headers':self.my_headers}, endpoint='execute', errback=self.errors)
lua_script:
lua_script = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    assert(splash:go{
        splash.args.url,
        headers=splash.args.headers,
        http_method=splash.args.http_method,
        body=splash.args.body,
    })
    assert(splash:wait(2))

    local entries = splash:history()
    local last_response = entries[#entries].response
    return {
        url = splash:url(),
        headers = last_response.headers,
        http_status = last_response.status,
        html = splash:html(),
    }
end
"""
However, the HTML that I got from Splash response showed that I was not logged in.
Cookie DEBUG didn't show any issues - the same cookies were sent as before.
But here is what I got from calling response.request.headers:
{'Accept-Language': ['en'],
'Accept-Encoding': ['gzip,deflate'],
'Accept': ['text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'],
'User-Agent': ['Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'],
'Cookie': ["___cokies___"],
'Content-Type': ['application/json']}
As you can see, Splash didn't set my custom headers; instead it just combined my cookies with its default headers.
I tried setting my own headers both as SplashRequest function arguments and inside lua_script, but none of the approaches worked.
My question is, how to set my own request headers in Splash?

read json from web by scrapy in Python2

I want to extract JSON data from a web page, so I've inspected it. The data I need is stored in the following format:
<script type="application/ld+json">
{
'data I want to extract'
}
</script>
I tried to use:
import scrapy
import json


class OpenriceSpider(scrapy.Spider):
    name = 'openrice'
    allowed_domains = ['www.openrice.com']

    def start_requests(self):
        headers = {
            'accept-encoding': 'gzip, deflate, sdch, br',
            'accept-language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'cache-control': 'max-age=0',
        }
        url = 'https://www.openrice.com/en/hongkong/r-kitchen-one-cafe-sha-tin-western-r483821'
        yield scrapy.Request(url=url, headers=headers, callback=self.parse)

    def parse(self, response):  # response = request url ?
        items = []
        jsonresponse = json.loads(response)
But it doesn't work. How should I change it?
You need to locate that script element in the HTML source, extract its text, and only then load it with json.loads():
script = response.xpath("//script[#type='application/ld+json']/text()").extract_first()
json_data = json.loads(script)
print(json_data)
Here, I am using the not-so-common application/ld+json type to locate the script, but there are many other options as well, like locating the script using some text you know is inside the script itself:
//script[contains(., 'Restaurant')]
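For instance, a quick sketch of that alternative inside the same parse() method (assuming the word 'Restaurant' actually occurs in the target script on that page):

script = response.xpath("//script[contains(., 'Restaurant')]/text()").extract_first()
if script:
    json_data = json.loads(script)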

Python Web crawling using Scrapy for html page which require form filling

I am trying to crawl this site, which first requires me to fill in a form before I can get to the required page:
http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx
I have written the following code but don't know what's wrong. Please help:
import scrapy


class SpidyQuotesViewStateSpider(scrapy.Spider):
    name = 'spidyquotes-viewstate'
    start_urls = ['http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx']
    download_delay = 1.5

    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                'ctl00_MainContent_ToolkitScriptManager1_HiddenField': response.css('input#ctl00_MainContent_ToolkitScriptManager1_HiddenField::attr(value)').extract_first(),
                '__EVENTTARGET': 'ctl00$MainContent$Rbl_Rpt_type$0',
                #'__EVENTARGUMENT': '',
                #'__LASTFOCUS': '',
                #'__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                #'__VIEWSTATEGENERATOR': response.css('input#__VIEWSTATEGENERATOR::attr(value)').extract_first(),
                #'__VIEWSTATEENCRYPTED': response.css('input#__VIEWSTATEENCRYPTED::attr(value)').extract_first(),
                #'__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
                'ctl00$MainContent$Ddl_Rpt_type': 'Retail',
                'ctl00$MainContent$ddl_Language': 'English',
                'ctl00$MainContent$Rbl_Rpt_type': 'Price+report',
            },
            callback=self.parse_tags,
        )

    def parse_tags(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                'ctl00_MainContent_ToolkitScriptManager1_HiddenField': response.css('input#ctl00_MainContent_ToolkitScriptManager1_HiddenField::attr(value)').extract_first(),
                '__EVENTTARGET': 'ctl00$MainContent$Ddl_Rpt_Option0',
                #'__EVENTARGUMENT': '',
                #'__LASTFOCUS': '',
                #'__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                #'__VIEWSTATEGENERATOR': response.css('input#__VIEWSTATEGENERATOR::attr(value)').extract_first(),
                #'__VIEWSTATEENCRYPTED': response.css('input#__VIEWSTATEENCRYPTED::attr(value)').extract_first(),
                #'__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
                'ctl00$MainContent$Ddl_Rpt_type': 'Retail',
                'ctl00$MainContent$ddl_Language': 'English',
                'ctl00$MainContent$Rbl_Rpt_type': 'Price+report',
                'ctl00$MainContent$Ddl_Rpt_Option0': 'Daily+Prices',
            },
            callback=self.parse_date,
        )

    def parse_date(self, response):
        yield scrapy.FormRequest(
            'http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx',
            formdata={
                #'ctl00_MainContent_ToolkitScriptManager1_HiddenField': response.css('input#ctl00_MainContent_ToolkitScriptManager1_HiddenField::attr(value)').extract_first(),
                '__EVENTTARGET': '',
                #'__EVENTARGUMENT': '',
                #'__LASTFOCUS': '',
                #'__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                #'__VIEWSTATEGENERATOR': response.css('input#__VIEWSTATEGENERATOR::attr(value)').extract_first(),
                #'__VIEWSTATEENCRYPTED': response.css('input#__VIEWSTATEENCRYPTED::attr(value)').extract_first(),
                #'__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
                'ctl00$MainContent$Ddl_Rpt_type': 'Retail',
                'ctl00$MainContent$ddl_Language': 'English',
                'ctl00$MainContent$Rbl_Rpt_type': 'Price+report',
                'ctl00$MainContent$Ddl_Rpt_Option0': 'Daily+Prices',
                'ctl00$MainContent$Txt_FrmDate': '01/02/2017',
                'ctl00$MainContent$btn_getdata1': 'Get+Data',
            },
            callback=self.parse_results,
        )

    def parse_results(self, response):
        response.css('div.Panel1')
You are scraping a .NET website. These sites use hidden variables like __VIEWSTATE, __EVENTVALIDATION etc., and you must send the same values along with your request.
Possibly you need to specify headers as well.
Check in your browser's inspector what the current ones are, or you can quickly grab the ones below.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.8,ru;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
    'X-Compress': '0',
}
See below - you need to follow the same pattern.
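As an illustration of that pattern (my own sketch, not taken from the original answer): FormRequest.from_response() already copies __VIEWSTATE, __EVENTVALIDATION and the other hidden inputs from the page's form, so combining it with headers like the ones above could look roughly like this. The field values are copied from the question; the spider and callback names are made up.

import scrapy


class FcaReportSpider(scrapy.Spider):
    name = 'fca_report'  # hypothetical spider name
    start_urls = ['http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx']

    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
    }

    def parse(self, response):
        # from_response() fills in __VIEWSTATE, __EVENTVALIDATION and the other
        # hidden ASP.NET inputs automatically; we only override the visible fields
        yield scrapy.FormRequest.from_response(
            response,
            headers=self.headers,
            formdata={
                'ctl00$MainContent$Ddl_Rpt_type': 'Retail',
                'ctl00$MainContent$ddl_Language': 'English',
                'ctl00$MainContent$Rbl_Rpt_type': 'Price+report',
            },
            callback=self.after_report_type,  # hypothetical next step
        )

    def after_report_type(self, response):
        # continue with the remaining form steps here
        self.logger.info('received %s', response.url)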
