I'm getting an error when processing a URL with Scrapy 1.5.0 and Python 2.7.14.
class FootLockerSpider(Spider):
    """Scrape product names and links from footlocker.it and re-poll the listing."""
    name = "FootLockerSpider"
    # Fixed typo: "allowded_domains" was silently ignored by Scrapy, so the
    # offsite middleware was never configured with this domain.
    allowed_domains = ["footlocker.it"]
    start_urls = [FootLockerURL]

    def __init__(self, *args, **kwargs):
        # Forward arguments to the base Spider so Scrapy's own
        # initialization still runs before our logging side effect.
        super(FootLockerSpider, self).__init__(*args, **kwargs)
        logging.critical("FootLockerSpider STARTED.")

    def parse(self, response):
        """Yield one item per product tile, then re-request the listing page."""
        # XPath attribute tests use '@'; the '#' in the original was a
        # transcription artifact and made the selector invalid.
        products = Selector(response).xpath('//div[@class="fl-category--productlist"]')
        for product in products:
            item = FootLockerItem()
            # NOTE(review): extract()[0] raises IndexError on an empty match;
            # extract_first() would be the safer choice.
            item['name'] = product.xpath('.//a/span[@class="fl-product-tile--name"]/span').extract()[0]
            item['link'] = product.xpath('.//a/@href').extract()[0]
            yield item
        # Re-crawl the same listing page (monitor-style spider).
        yield Request(FootLockerURL, callback=self.parse, dont_filter=True, priority=14)
This is my class FootLockerSpider, and this is the error I get:
[scrapy.core.scraper] ERROR: Spider error processing <GET
https://www.footlocker.it/it/uomo/scarpe/> (referer: None)
File "C:\Users\Traian\Downloads\Sneaker-Notify\main\main.py", line 484, in
parse item['name'] = product.xpath('.//a/span[#class="fl-product-tile--
name"]/span').extract()[0]
IndexError: list index out of range
How can I solve this problem?
You always need to check the page's source HTML:
<div class="fl-category--productlist--item" data-category-item><div class="fl-load-animation fl-product-tile--container"
data-lazyloading
data-lazyloading-success-handler="lazyloadingInit"
data-lazyloading-context="product-tile"
data-lazyloading-content-handler="lazyloadingJSONContentHandler"
data-request="https://www.footlocker.it/INTERSHOP/web/WFS/Footlocker-Footlocker_IT-Site/it_IT/-/EUR/ViewProductTile-ProductTileJSON?BaseSKU=314213410104&ShowRating=true&ShowQuickBuy=true&ShowOverlay=true&ShowBadge=true"
data-scroll-to-target="fl-product-tile-314213410104"
>
<noscript>
<span itemprop="name">Nike Air Max 97 Ultra '17 - Uomo Scarpe</span>
</noscript>
</div>
</div>
This will work:
products = response.xpath('//div[#class="fl-category--productlist--item"]')
for product in products:
item = FootLockerItem()
item['name'] = product.xpath('.//a/span/text()').extract_first()
item['link'] = product.xpath('.//a/#href').extract_first()
yield item
Related
Before I was getting the error "HTTP status code is not handled or not allowed", I modified the USER_AGENT that was in default mode and now I am getting this error:
import scrapy
class OlxSpider(scrapy.Spider):
    """Crawl OLX (pe.olx.com.br) rental listings and their detail pages."""
    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        """Extract listing links from the index page and follow pagination."""
        # XPath attribute tests use '@'; the '#' characters in the original
        # were a transcription artifact and made every query invalid.
        items = response.xpath(
            '//div[contains(@class,"section_OLXad-list")]//li[contains'
            '(@class,"item")]'
        )
        for item in items:
            url = item.xpath(
                ".//a[contains(@class,'OLXad-list-link')]/@href"
            ).extract_first()
            yield scrapy.Request(url=url, callback=self.parse_detail)
        next_page = response.xpath(
            '//li[contains(@class,"item next")]//a/@href'
        ).extract_first()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        """Scrape a single listing page into a plain dict item."""
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath(
            '//div[contains(@class,"photos")]//a/@href'
        ).extract()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath(
            'normalize-space(//h1[contains(@id,"ad_title")]//.)'
        ).extract_first()
        item['price'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-price")]'
            '//span[contains(@class,"actual-price")]//.)'
        ).extract_first()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        item['date'] = (date and date[0]) or ''
        yield item
Trying to execute the .py file in the terminal, I get the following message:
2022-01-13 12:36:36 [scrapy.core.engine] INFO: Spider opened
2022-01-13 12:36:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-01-13 12:36:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/robots.txt> from <GET http://pe.olx.com.br/robots.txt>
2022-01-13 12:36:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://pe.olx.com.br/robots.txt> (referer: None)
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/imoveis/aluguel> from <GET http://pe.olx.com.br/imoveis/aluguel>
Does anyone know what might be causing this problem?
P.s.: I have tried these solutions Python Scrapy 301 redirects
It's just redirected from http to https so there's no problem there.
Your xpath is completely wrong. I fixed it in parse, and I fixed 3 xpaths in parse_detail as an example, but you need to fix the rest of them.
import scrapy
class OlxSpider(scrapy.Spider):
    """Fixed OLX spider: corrected index XPaths plus three detail XPaths."""
    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        """Walk the <ul id="ad-list"> items and follow each ad plus pagination."""
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        # '@' is the XPath attribute axis (the '#' was a transcription artifact).
        items = response.xpath('//ul[@id="ad-list"]/li')
        for item in items:
            url = item.xpath('.//a/@href').get()
            if url:
                yield scrapy.Request(url=url, callback=self.parse_detail)
        next_page = response.xpath('//a[@data-lurker-detail="next_page"]/@href').get()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        """Scrape one ad page; photos/title/price use the corrected selectors."""
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath('//img[@class="image "]/@src').get()
        item['url'] = response.url
        # The remaining selectors still use the old page structure and need fixing.
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath('//h1/text()').get()
        item['price'] = response.xpath('//h2/text()').get()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        item['date'] = (date and date[0]) or ''
        yield item
import scrapy
class AdmissionsSpider(scrapy.Spider):
    """Follow every country link on the worldometers Asia-population page."""
    name = 'Admissions'
    allowed_domains = ["www.worldometers.info"]
    start_urls = ['https://www.worldometers.info/population/countries-in-asia-by-population/']

    def parse(self, response):
        countries = response.xpath("//td/a")
        for country in countries:
            name = country.xpath(".//text()").get()
            # '@' is the XPath attribute axis (the '#' was a transcription artifact).
            links = country.xpath(".//@href").get()
            # NOTE(review): "worldomets.info" is misspelled and is not in
            # allowed_domains, so every request is filtered as offsite —
            # this is the bug being asked about; response.urljoin(links)
            # would build the correct absolute URL.
            absolute_url = f"https://www.worldomets.info{links}"
            yield scrapy.Request(url=absolute_url)
I am trying to print the country names, but it shows me the error Filtered offsite request to 'www.worldomets.info': <GET https://www.worldomets.info/world-population/china-population/>
You can try it like this:
import scrapy
class AdmissionsSpider(scrapy.Spider):
    """Fixed spider: bare domain in allowed_domains and urljoin for links."""
    name = 'Admissions'
    # Use the registrable domain (no "www.") so subdomains are allowed too.
    allowed_domains = ["worldometers.info"]
    start_urls = ['https://www.worldometers.info/population/countries-in-asia-by-population/']

    def parse(self, response):
        countries = response.xpath("//td/a")
        for country in countries:
            name = country.xpath(".//text()").get()
            # '@' is the XPath attribute axis (the '#' was a transcription artifact).
            link = country.xpath(".//@href").get()
            # urljoin resolves relative hrefs against the response URL,
            # avoiding the hand-built (and previously misspelled) base URL.
            link = response.urljoin(link)
            yield scrapy.Request(url=link, callback=self.parse_absoluteurl)

    def parse_absoluteurl(self, response):
        print('\n', response.url, '\n')
For details, see Following hyperlink and "Filtered offsite request".
I'm trying to learn how to use scrapy and python but I'm not an expert at all...
I have an empty file after crawling this page :
so.news.com
and I don't understand why...
Here is my code :
import scrapy
class XinhuaSpider(scrapy.Spider):
    """Scrape titles and dates from the xinhuanet search-results page."""
    name = 'xinhua'
    allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/']

    def parse(self, response):
        # '@' is the XPath attribute axis (the '#' was a transcription artifact).
        # NOTE(review): these match nothing because the result list is loaded
        # via XHR, not present in the initial HTML — see the accepted answer.
        titles = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/h2/a/text()").extract()
        date = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/div[@class='easynews']/p[@class='newstime']/span/text()").extract()
        for item in zip(titles, date):
            scraped_info = {
                "title": item[0],
                "date": item[1],
            }
            yield scraped_info
        nextPg = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='pagination']/a[@class='next']/@href").extract()
        # extract() returns a list, which is never None; test truthiness
        # instead so an empty match does not print an empty list.
        if nextPg:
            print(nextPg)
This is the message in the console:
2020-05-11 00:09:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/> (referer: None)
[]
You need always check page's source code (Ctrl+U) in your browser. Content you see in your browser maybe loaded using XHR Javascript call. Here is code that works for me (I found correct start url using Chrome Developer Console):
import scrapy
import json
import re
class XinhuaSpider(scrapy.Spider):
    """Query the xinhuanet search JSON endpoint directly and page through it."""
    name = 'xinhua'
    # allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/getNews?keyWordAll=&keyWordOne=%25E6%2596%25B0%25E5%2586%25A0%2B%25E8%2582%25BA%25E7%2582%258E%2B%25E6%25AD%25A6%25E6%25B1%2589%2B%25E7%2597%2585%25E6%25AF%2592&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn&keyword=%E6%96%B0%E5%86%A0&curPage=1']

    def parse(self, response):
        """Yield a title/date dict per result, then request the next page."""
        payload = json.loads(response.body)
        results = payload["content"]["results"]
        for entry in results:
            yield {
                "title": entry['title'],
                "date": entry['pubtime'],
            }
        page_now = payload['content']['curPage']
        page_total = payload['content']['pageCount']
        if page_now < page_total:
            # Bump the curPage query parameter in the current URL.
            following = re.sub(r'curPage=\d+', f"curPage={page_now + 1}", response.url)
            yield scrapy.Request(
                url=following,
                callback=self.parse,
            )
In the parse() method the spider crawls 4 URLs and then sends them to the parse_dir_contents() method to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json
class linkedin(scrapy.Spider):
name = "linkedin"
allowed_domains = ["linkedin.com"]
start_urls = [
"https://in.linkedin.com/directory/people-s-1-2-4/",
]
def parse(self, response):
for href in response.xpath('//*[#id="seo-dir"]/div/div/div/ul/li/a/#href'):
url = response.urljoin(href.extract())
print "________________"+url
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
for sel in response.xpath('//*[#id="profile"]'):
url = response.url
print "____________"+url
item = VOneItem()
item['name'] = sel.xpath('//*[#id="name"]/text()').extract()
item['headline'] = sel.xpath('//*[#id="topcard"]/div/div/div/p/span/text()').extract()
item['current'] = sel.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
item['education'] = sel.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
item['link'] = url
yield item
By inspecting the pages I think that there is no need of the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
    """Scrape one profile page; no loop needed since there is a single #profile node."""
    # '@' is the XPath attribute axis (the '#' was a transcription artifact).
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.
I am getting results such as:
[crawler] DEBUG: Crawled (200) <GET http://www.hormelfoods.com/About/Legal/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/link.aspx?_id=EFFFBF3348524C6ABCD1C2775E7FDA93&_z=z> (referer: http://www.hormelfoods.com/About/Legal/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/link.aspx?_id=3FC7ECFD861B4F1AAF7BFD218236F983&_z=z)
I saw the page source of the referer:
It shows this <a href="~/link.aspx?_id=EFFFBF3348524C6ABCD1C2775E7FDA93&_z=z">
How to rectify this?
I have added a counter which keeps track of the number of pdfs parsed.
My parse_item function:
def parse_item(self, response):
    """Collect document links of each configured type and yield one item per link.

    Closes the spider once more than 60 links have been seen in total.
    """
    sel = HtmlXPathSelector(response)
    for utype in self.url_types:
        # '@' is the XPath attribute axis (the '#' was a transcription artifact).
        links = []
        if sel.xpath('//a[contains(@href, "{0}")]/@href'.format(utype)).extract():
            links += sel.xpath('//a[contains(@href, "{0}")]/@href'.format(utype)).extract()
        if sel.xpath('/html/head/link[@type="application/{0}"]/@href'.format(utype)).extract():
            links += sel.xpath('/html/head/link[@type="application/{0}"]/@href'.format(utype)).extract()
        self.cntr += len(links)
        if self.cntr > 60:
            # Hard cap on the number of parsed document links.
            raise CloseSpider('links exceeded')
        for link in links:
            item = CrawlerItem()
            item['main'] = response.url
            base_url = get_base_url(response)
            # Resolve relative hrefs against the page's base URL.
            item['url'] = urljoin(base_url, link)
            company = tldextract.extract(base_url)
            item['source'] = company.domain
            item['type'] = utype.upper()
            yield item
def process_links(self, links):
    """Normalize extracted links by stripping '../' path segments in place."""
    for index, link in enumerate(links):
        link.url = link.url.replace("../", "")
        links[index] = link
    return links