Here's an example link I'm trying to scrape: https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Professional-7-Burners-4-cu-ft-2-cu-ft-Double-Oven-Convection-Dual-Fuel-Range-Stainless-Steel-Common-48-in-Actual-48-in/1000514227
My scraper was working fine till today so I'm guessing Lowe's added more protection against bots :(
After some research, I found that I would have to add headers to my web scraper so I can emulate a real user.
Opened up Dev Console -> Network -> XHR/Fetch -> Found JSON File.
Here's my Scrapy script:
# -*- coding: utf-8 -*-
import scrapy
from ..items import LowesItem
import re
import pandas as pd
import requests
import json
from scrapy.http import Request
from datetime import date


class LowesSpider(scrapy.Spider):
    name = 'Lowes'

    def start_requests(self):
        HEADERS = {
            'method': 'GET',
            'scheme': 'https',
            'authority': 'content.syndigo.com',
            'Accept': '*/*',
            'Content-Type': 'text/plain',
            'Origin': 'https://lowes.com',
            'Accept-Language': 'en-US,en;q=0.9',
            'Host': 'content.syndigo.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15',
            'Referer': 'https://www.lowes.com/',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Cookie': 'sn=0321'
        }
        start_urls = ['https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Professional-7-Burners-4-cu-ft-2-cu-ft-Double-Oven-Convection-Dual-Fuel-Range-Stainless-Steel-Common-48-in-Actual-48-in/1000514227']
        for url in start_urls:
            yield Request(url,
                          headers=HEADERS,
                          meta={'dont_merge_cookies': True,
                                'url': url})

    def parse(self, response):
        for item in self.parseLowes(response):
            yield item

    def parseLowes(self, response):
        item = LowesItem()  # item defined in items.py
        script_tag = response.xpath('//script[@type="application/ld+json"]/text()').get()  # the JSON-LD <script> block
        productPrice = json.loads(script_tag)[2]["offers"]["price"]
        productURL = response.url
        url = response.meta['url']
        productSKU = url.split("/")[-1]  # the trailing item number of the product URL
        scrapedDate = date.today()
        #item['productName'] = productName  # display product name
        item['productOMS'] = productSKU
        item['productPrice'] = productPrice  # display price and assign to variable
        item['productURL'] = productURL  # display URL
        item['scrapedDate'] = scrapedDate
        yield item
When I run scrapy, I get a 400 response from the command.
From what I can see of the network connection, the issue is related to their CDN (Akamai), which is blocking the access.
I was able to access your link and see the product from Microsoft Edge (version 107). In my request the user agent is:
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.26
So, try replacing the 'User-Agent' value in your code with that string.
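For example, in the spider above only the header value needs to change; everything else stays as in your code:

HEADERS = {
    # ... keep the rest of your headers as they are ...
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.26',
}

Alternatively, setting USER_AGENT in settings.py to the same string applies it to every request the crawler makes (an explicit per-request User-Agent header still takes precedence).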
Related
I'm trying to make a POST request, but I don't know what's wrong with my code; the data doesn't come back.
The following message is displayed:
HTTP status code is not handled or not allowed
This is the website
A screenshot of the header:
This is my code:
import json

import scrapy

# The large Power BI query payload below was captured from the network panel;
# it is kept verbatim on a single line and referenced from the spider.
payload = {"version":"1.0.0","queries":[{"Query":{"Commands":[{"SemanticQueryDataShapeCommand":{"Query":{"Version":2,"From":[{"Name":"e","Entity":"Events"},{"Name":"d","Entity":"DAX"}],"Select":[{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Date Start"},"Name":"Events.Date Start"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Event Type"},"Name":"Events.Event Type"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Name"},"Name":"Events.Name"},{"Measure":{"Expression":{"SourceRef":{"Source":"d"}},"Property":"Length"},"Name":"Events.Total Days"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Location"},"Name":"Events.Location"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Link to Event"},"Name":"Events.Link to Event"},{"Measure":{"Expression":{"SourceRef":{"Source":"d"}},"Property":"Days Until Event"},"Name":"DAX.Days Until"},{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Link to Submit"},"Name":"Events.Link to Submit"},{"Measure":{"Expression":{"SourceRef":{"Source":"d"}},"Property":"Event Type Number"},"Name":"DAX.Event Type Number"}],"OrderBy":[{"Direction":1,"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"e"}},"Property":"Date Start"}}}]},"Binding":{"Primary":{"Groupings":[{"Projections":[0,1,2,3,4,5,6,7,8]}]},"DataReduction":{"DataVolume":3,"Primary":{"Window":{"Count":500}}},"Aggregates":[{"Select":3,"Aggregations":[{"Min":{}},{"Max":{}}]}],"SuppressedJoinPredicates":[8],"Version":1}}}]},"CacheKey":"{\"Commands\":[{\"SemanticQueryDataShapeCommand\":{\"Query\":{\"Version\":2,\"From\":[{\"Name\":\"e\",\"Entity\":\"Events\"},{\"Name\":\"d\",\"Entity\":\"DAX\"}],\"Select\":[{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Date Start\"},\"Name\":\"Events.Date Start\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Event Type\"},\"Name\":\"Events.Event Type\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Name\"},\"Name\":\"Events.Name\"},{\"Measure\":{\"Expression\":{\"SourceRef\":{\"Source\":\"d\"}},\"Property\":\"Length\"},\"Name\":\"Events.Total Days\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Location\"},\"Name\":\"Events.Location\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Link to Event\"},\"Name\":\"Events.Link to Event\"},{\"Measure\":{\"Expression\":{\"SourceRef\":{\"Source\":\"d\"}},\"Property\":\"Days Until Event\"},\"Name\":\"DAX.Days Until\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Link to Submit\"},\"Name\":\"Events.Link to Submit\"},{\"Measure\":{\"Expression\":{\"SourceRef\":{\"Source\":\"d\"}},\"Property\":\"Event Type Number\"},\"Name\":\"DAX.Event Type Number\"}],\"OrderBy\":[{\"Direction\":1,\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"e\"}},\"Property\":\"Date Start\"}}}]},\"Binding\":{\"Primary\":{\"Groupings\":[{\"Projections\":[0,1,2,3,4,5,6,7,8]}]},\"DataReduction\":{\"DataVolume\":3,\"Primary\":{\"Window\":{\"Count\":500}}},\"Aggregates\":[{\"Select\":3,\"Aggregations\":[{\"Min\":{}},{\"Max\":{}}]}],\"SuppressedJoinPredicates\":[8],\"Version\":1}}}]}","QueryId":"","ApplicationContext":{"DatasetId":"6427f3c6-42f6-4287-b061-c31c1d2e7ae0","Sources":[{"ReportId":"6e442642-8594-4894-bc32-0ab7f4620772"}]}}],"cancelQueries":[],"modelId":1226835}
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'


class MySpider(scrapy.Spider):
    name = 'pb'

    def start_requests(self):
        yield scrapy.Request(
            url='https://wabi-australia-southeast-api.analysis.windows.net/public/reports/querydata?synchronous=true',
            method='POST',
            body=json.dumps(payload),
            headers={
                'Accept-Language': 'pt-BR,pt;q=0.9,en;q=0.8',
                'ActivityId': '1d3ecdc2-5dc0-801e-4140-82a258f127a6',
                'Connection': 'keep-alive',
                'Content-Length': '3462',
                'Content-Type': 'application/json;charset=UTF-8',
                'Host': 'wabi-australia-southeast-api.analysis.windows.net',
                'Origin': 'https://app.powerbi.com',
                'Referer': 'https://app.powerbi.com/view?r=eyJrIjoiMGIwNTY2MjgtMzJhYy00MzEwLTk5MDAtYTI2MGVlMzk1NjM2IiwidCI6IjZmMGU5YzQyLTk2Y2UtNDU1MS05NzAxLWJhMzFkMGQ2ZDE5ZSJ9',
                'RequestId': '11c18fe6-00da-7df4-952c-98ba7bdf188e',
                'Sec-Fetch-Mode': 'cors',
                'Sec-Fetch-Site': 'cross-site',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
                'X-PowerBI-ResourceKey': '0b056628-32ac-4310-9900-a260ee395636'
            }
        )

    def parse(self, response):
        items = json.loads(response.text)
        yield {"data": items}
The request in your screenshot is a GET request.
The behaviour of this website is very interesting!
Let's examine it.
By looking at the network panel we can see that a GET request is being made to some complex URL with many different headers. However, it seems that the header X-PowerBI-ResourceKey is the only one that's needed, and it controls what content the request will return.
So all we need to do to replicate this is find the X-PowerBI-ResourceKey value.
If you take a look at the source code of the html page:
https://app.powerbi.com/view?r=eyJrIjoiMGIwNTY2MjgtMzJhYy00MzEwLTk5MDAtYTI2MGVlMzk1NjM2IiwidCI6IjZmMGU5YzQyLTk2Y2UtNDU1MS05NzAxLWJhMzFkMGQ2ZDE5ZSJ9
Here we can see that JavaScript's atob method is used on the url parameter. This is JavaScript's base64-decode function. We can replicate it in Python:
$ ptpython
>>> from base64 import b64decode
>>> b64decode("eyJrIjoiMGIwNTY2MjgtMzJhYy00MzEwLTk5MDAtYTI2MGVlMzk1NjM2IiwidCI6IjZmMGU5YzQyLTk2Y2UtNDU1MS05NzAxLWJhMzF
1 kMGQ2ZDE5ZSJ9")
b'{"k":"0b056628-32ac-4310-9900-a260ee395636","t":"6f0e9c42-96ce-4551-9701-ba31d0d6d19e"}'
We got it figured out! Now let's put everything together in our crawler:
import json
from base64 import b64decode

from scrapy import Request
from w3lib.url import url_query_parameter


def parse(self, response):
    url = "https://app.powerbi.com/view?r=eyJrIjoiMGIwNTY2MjgtMzJhYy00MzEwLTk5MDAtYTI2MGVlMzk1NjM2IiwidCI6IjZmMGU5YzQyLTk2Y2UtNDU1MS05NzAxLWJhMzFkMGQ2ZDE5ZSJ9"
    # get the "r" parameter from the url
    resource_key = url_query_parameter(url, 'r')
    # base64 decode it
    resource_key = b64decode(resource_key)
    # {'k': '0b056628-32ac-4310-9900-a260ee395636', 't': '6f0e9c42-96ce-4551-9701-ba31d0d6d19e'}
    # it's a json string - load it and get key "k"
    resource_key = json.loads(resource_key)['k']
    headers = {
        'Accept': "application/json, text/plain, */*",
        # 'X-PowerBI-ResourceKey': "0b056628-32ac-4310-9900-a260ee395636",
        'X-PowerBI-ResourceKey': resource_key,
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    }
    yield Request(url, headers=headers)
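If what you actually need is the report data itself (what the POST in the question returns) rather than the HTML view, the decoded key can be reused for the querydata call. A minimal sketch, assuming the payload from the question is available on the spider as self.payload:

def parse(self, response):
    # ... decode resource_key exactly as above ...
    yield Request(
        'https://wabi-australia-southeast-api.analysis.windows.net/public/reports/querydata?synchronous=true',
        method='POST',
        body=json.dumps(self.payload),  # the query payload captured in the question
        headers={
            'Content-Type': 'application/json;charset=UTF-8',
            'X-PowerBI-ResourceKey': resource_key,
        },
        callback=self.parse_data,
    )

def parse_data(self, response):
    yield {"data": json.loads(response.text)}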
I want to log in to a website and then scrape a few details, but I can't log in using Scrapy.
Here's my code:
from scrapy import Spider
from scrapy.http import FormRequest
from lxml import html
from scrapy.utils.response import open_in_browser


class QuotesSpider(Spider):
    name = 'quotes'
    start_urls = ('https://app.thecoachingmanual.com/login',)

    def parse(self, response):
        return FormRequest.from_response(response,
                                         formdata={'emailAddress': 'email#gmail.com', 'password': 'MyPassword'},
                                         callback=self.scrape_pages)

    def scrape_pages(self, response):
        open_in_browser(response)
        # Complete your code here to scrape the pages that you are redirected to after logging in
        # ....
        # ....
Your code is totally wrong.
Open https://app.thecoachingmanual.com/login, open DevTools, go to the Network tab, and check "Preserve log".
See which URL the request is being sent to, and analyze the Form Data to see which fields are being sent.
import logging

from scrapy import Spider
from scrapy.http import FormRequest


class QuotesSpider(Spider):
    name = 'quotes'

    headers = {
        'origin': 'https://app.thecoachingmanual.com',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',  # NOTICE THIS
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'referer': 'https://app.thecoachingmanual.com/',
        'authority': 'api.thecoachingmanual.com',
        'dnt': '1',
    }

    def start_requests(self):
        data = {'emailAddress': 'test#gmail.com', 'password': 'test'}
        yield FormRequest('https://api.thecoachingmanual.com/commandapi/auth/login',
                          callback=self.login,
                          headers=self.headers,
                          formdata=data,
                          )

    def login(self, response):
        logging.info(response.status)
        logging.info(response.text)
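Once the login call succeeds, chain whatever page you actually want to scrape from the callback. A minimal sketch (the URL below is a placeholder, not a real endpoint of the site, and from scrapy import Request is assumed at the top):

    def login(self, response):
        logging.info(response.status)
        logging.info(response.text)
        # assuming the login set session cookies (which Scrapy keeps automatically),
        # continue to the page you want to scrape; this URL is only a placeholder
        yield Request('https://app.thecoachingmanual.com/some-protected-page',
                      headers=self.headers,
                      callback=self.scrape_pages)

    def scrape_pages(self, response):
        # parse the details you need here
        pass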
I want to extract JSON data from a web page, so I've inspected it. The data I need is stored in the format below:
<script type="application/ld+json">
{
'data I want to extract'
}
</script>
I tried to use:
import scrapy
import json


class OpenriceSpider(scrapy.Spider):
    name = 'openrice'
    allowed_domains = ['www.openrice.com']

    def start_requests(self):
        headers = {
            'accept-encoding': 'gzip, deflate, sdch, br',
            'accept-language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'cache-control': 'max-age=0',
        }
        url = 'https://www.openrice.com/en/hongkong/r-kitchen-one-cafe-sha-tin-western-r483821'
        yield scrapy.Request(url=url, headers=headers, callback=self.parse)

    def parse(self, response):  # response = request url ?
        items = []
        jsonresponse = json.loads(response)
But it doesn't work. How should I change it?
You need to locate that script element in the HTML source, extract its text, and only then load it with json.loads():
script = response.xpath("//script[@type='application/ld+json']/text()").extract_first()
json_data = json.loads(script)
print(json_data)
Here I am using the not-so-common application/ld+json type to locate the script, but there are many other options as well, like locating the script via some text you know is in the script itself:
//script[contains(., 'Restaurant')]
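For example, the same extraction with that locator (assuming the word 'Restaurant' really does appear inside the JSON-LD block on this page):

script = response.xpath("//script[contains(., 'Restaurant')]/text()").extract_first()
json_data = json.loads(script)
print(json_data)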
I am new to the web-scraping game. I am trying to scrape the following website:
http://www.foodemissions.com/foodemissions/Calculator.aspx
Using resources found on the Internet, I put together the following HTTP POST request:
import urllib

from bs4 import BeautifulSoup

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'
}


class MyOpener(urllib.FancyURLopener):
    version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'


myopener = MyOpener()
url = 'http://www.foodemissions.com/foodemissions/Calculator.aspx'

# first HTTP request without form data
f = myopener.open(url)
soup_dummy = BeautifulSoup(f, "html5lib")

# parse and retrieve two vital form values
viewstate = soup_dummy.select("#__VIEWSTATE")[0]['value']
viewstategen = soup_dummy.select("#__VIEWSTATEGENERATOR")[0]['value']
soup_dummy.find(id="ctl00_MainContent_category")

# search for the string 'input' to find the form data
formData = (
    ('__VIEWSTATE', viewstate),
    ('__VIEWSTATEGENERATOR', viewstategen),
    ('ctl00$MainContent$transport', '200'),
    ('ctl00$MainContent$quantity', '1'),
    ('ctl00$MainContent$wastepct', '100')
)
encodedFields = urllib.urlencode(formData)

# second HTTP request with form data
f = myopener.open(url, encodedFields)
soup = BeautifulSoup(f, "html5lib")
trans_emissions = soup.find("span", id="ctl00_MainContent_transEmissions")
print(trans_emissions.text)
The output from my final print command doesn't seem to change even when I change the ctl00$MainContent$transport element. Any pointers on why this is the case?
Thanks!
You need to make the ASP.NET App "think" that you clicked the calculate button by adding the button name to the __EVENTTARGET hidden input.
formData = (
    ('__VIEWSTATE', viewstate),
    ('__VIEWSTATEGENERATOR', viewstategen),
    ('ctl00$MainContent$transport', '100'),
    ('ctl00$MainContent$quantity', '150'),
    ('ctl00$MainContent$wastepct', '200'),
    ('__EVENTTARGET', 'ctl00$MainContent$calculate')
)
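With that extra field in place, the rest of your flow stays exactly the same (repeated from your own code only for completeness):

encodedFields = urllib.urlencode(formData)
f = myopener.open(url, encodedFields)
soup = BeautifulSoup(f, "html5lib")
print(soup.find("span", id="ctl00_MainContent_transEmissions").text)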
I changed the default request headers in settings.py as below:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
}
However, it doesn't work in my HotSpider. I can see that scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware was enabled, but the connection was closed cleanly, as if the headers were not set.
Here is the HotSpider:
# -*- coding: utf-8 -*-
import scrapy


class HotSpider(scrapy.Spider):
    name = "hot"
    allowed_domains = ["qiushibaike.com"]
    start_urls = (
        'http://www.qiushibaike.com/hot',
    )

    def parse(self, response):
        print '\n', response.status, '\n'
If I change the code to override make_requests_from_url and set the headers there, everything works well.
# -*- coding: utf-8 -*-
import scrapy


class HotSpider(scrapy.Spider):
    name = "hot"
    allowed_domains = ["qiushibaike.com"]
    start_urls = (
        'http://www.qiushibaike.com/hot',
    )
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
    }

    def make_requests_from_url(self, url):
        return scrapy.http.Request(url, headers=self.headers)

    def parse(self, response):
        print '\n', response.status, '\n'
This problem will be fixed in Scrapy 1.2, according to "prioritize default headers over user agent middlewares" (#2091).
I see the User-Agent header is indeed not set properly when using the default headers middleware, and this particular site refuses connections without an expected user-agent header.
The recommended way to set the user-agent for your crawler is via the USER_AGENT setting key.
e.g.
# settings.py
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"
Not setting the user-agent when using default headers might be a bug in Scrapy, or maybe this is expected and documented somewhere. You would need to do more research on this; if it is indeed a bug, it's worth posting a bug report in the Scrapy GitHub repo.
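If you prefer to keep the override local to a single spider rather than in settings.py, the same key can also go into the spider's custom_settings (a minimal sketch based on the spider above):

# -*- coding: utf-8 -*-
import scrapy


class HotSpider(scrapy.Spider):
    name = "hot"
    allowed_domains = ["qiushibaike.com"]
    start_urls = (
        'http://www.qiushibaike.com/hot',
    )

    # per-spider settings override the project-wide defaults
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    }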