Incomplete URLs in Scrapy - Python

I am getting results such as:
[crawler] DEBUG: Crawled (200) <GET http://www.hormelfoods.com/About/Legal/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/link.aspx?_id=EFFFBF3348524C6ABCD1C2775E7FDA93&_z=z> (referer: http://www.hormelfoods.com/About/Legal/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/link.aspx?_id=3FC7ECFD861B4F1AAF7BFD218236F983&_z=z)
I looked at the page source of the referer; it contains this link:
<a href="~/link.aspx?_id=EFFFBF3348524C6ABCD1C2775E7FDA93&_z=z">
How can I rectify this?
I have added a counter which keeps track of the number of PDFs parsed.
My parse_item function:
def parse_item(self, response):
    sel = HtmlXPathSelector(response)
    for utype in self.url_types:
        links = []
        # if sel.select('/html/head/link[@type="application/pdf"]/@href').extract():
        #     links += sel.select('/html/head/link[@type="application/pdf"]/@href').extract()
        if sel.xpath('//a[contains(@href, "{0}")]/@href'.format(utype)).extract():
            links += sel.xpath('//a[contains(@href, "{0}")]/@href'.format(utype)).extract()
        if sel.xpath('/html/head/link[@type="application/{0}"]/@href'.format(utype)).extract():
            links += sel.xpath('/html/head/link[@type="application/{0}"]/@href'.format(utype)).extract()
        # if sel.select('/html/head/link[@type="application/x-pdf"]/@href').extract():
        #     links += sel.select('/html/head/link[@type="application/x-pdf"]/@href').extract()
        items = []
        self.cntr += len(links)
        if self.cntr > 60:
            raise CloseSpider('links exceeded')
        for link in links:
            item = CrawlerItem()
            item['main'] = response.url
            base_url = get_base_url(response)
            item['url'] = urljoin(base_url, link)
            company = tldextract.extract(base_url)
            item['source'] = company.domain
            item['type'] = utype.upper()
            yield item

def process_links(self, links):
    for i, w in enumerate(links):
        w.url = w.url.replace("../", "")
        links[i] = w
    return links
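The ever-growing /~/~/.../link.aspx URLs come from the site emitting ~/link.aspx?... hrefs, which keep being resolved relative to pages whose paths already contain ~/ segments. One way to stop the accumulation, as a minimal sketch (assuming the canonical form the site serves is a single ~/link.aspx path under the site root, which is worth verifying in a browser first), is to normalize the links in the rule's process_links hook before they are requested:

from urllib.parse import urlsplit, urlunsplit  # use the urlparse module on Python 2


def process_links(self, links):
    normalized = []
    for link in links:
        parts = urlsplit(link.url)
        if '~/' in parts.path:
            # Keep only the final "~/link.aspx" part and re-root it, so the
            # path no longer grows by one "~/" on every hop.
            path = '/~/' + parts.path.rsplit('~/', 1)[-1]
            link.url = urlunsplit((parts.scheme, parts.netloc, path,
                                   parts.query, parts.fragment))
        normalized.append(link)
    return normalized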

Related

Scrapy: Debug Redirecting (301)

Previously I was getting the error "HTTP status code is not handled or not allowed". I changed the USER_AGENT from its default value, and now I am getting the problem below. This is my spider:
import scrapy


class OlxSpider(scrapy.Spider):
    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        items = response.xpath(
            '//div[contains(@class,"section_OLXad-list")]//li[contains'
            '(@class,"item")]'
        )
        for item in items:
            url = item.xpath(
                ".//a[contains(@class,'OLXad-list-link')]/@href"
            ).extract_first()
            yield scrapy.Request(url=url, callback=self.parse_detail)

        next_page = response.xpath(
            '//li[contains(@class,"item next")]//a/@href'
        ).extract_first()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath(
            '//div[contains(@class,"photos")]//a/@href'
        ).extract()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath(
            'normalize-space(//h1[contains(@id,"ad_title")]//.)'
        ).extract_first()
        item['price'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-price")]'
            '//span[contains(@class,"actual-price")]//.)'
        ).extract_first()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        item['date'] = (date and date[0]) or ''
        yield item
When I execute the .py file in the terminal, I get the following output:
2022-01-13 12:36:36 [scrapy.core.engine] INFO: Spider opened
2022-01-13 12:36:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-01-13 12:36:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/robots.txt> from <GET http://pe.olx.com.br/robots.txt>
2022-01-13 12:36:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://pe.olx.com.br/robots.txt> (referer: None)
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/imoveis/aluguel> from <GET http://pe.olx.com.br/imoveis/aluguel>
Does anyone know what might be causing this problem?
P.S.: I have already tried the solutions from "Python Scrapy 301 redirects".
The request is just being redirected from http to https, so there is no problem there.
Your XPath expressions are the real problem, though. I fixed the one in parse, and I fixed three XPaths in parse_detail as an example, but you need to fix the rest of them.
import scrapy


class OlxSpider(scrapy.Spider):
    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        items = response.xpath('//ul[@id="ad-list"]/li')
        for item in items:
            url = item.xpath('.//a/@href').get()
            if url:
                yield scrapy.Request(url=url, callback=self.parse_detail)

        next_page = response.xpath('//a[@data-lurker-detail="next_page"]/@href').get()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath('//img[@class="image "]/@src').get()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath('//h1/text()').get()
        item['price'] = response.xpath('//h2/text()').get()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        item['date'] = (date and date[0]) or ''
        yield item
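For working out the remaining XPaths, one convenient workflow (a suggestion only; the selectors below are just the ones from the answer and have not been re-checked against the current site) is Scrapy's interactive shell, where you can try expressions against the live page before putting them into parse_detail:

scrapy shell "https://pe.olx.com.br/imoveis/aluguel"
>>> response.xpath('//ul[@id="ad-list"]/li//a/@href').get()          # listing link, as used in parse
>>> fetch(response.xpath('//ul[@id="ad-list"]/li//a/@href').get())   # load one detail page
>>> response.xpath('//h1/text()').get()                              # then experiment with the detail selectors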

Scrapy-crawled-200 Referer-None

I'm trying to learn how to use Scrapy and Python, but I'm not an expert at all...
I get an empty output file after crawling this page:
so.news.cn
and I don't understand why...
Here is my code:
import scrapy


class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/']

    def parse(self, response):
        #titles = response.css('#newsCon > div.newsList > div.news > h2 > a::text').extract()
        #date = response.css('#newsCon > div.newsList > div.news> div > p.newstime > span::text').extract()
        titles = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/h2/a/text()").extract()
        date = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/div[@class='easynews']/p[@class='newstime']/span/text()").extract()
        for item in zip(titles, date):
            scraped_info = {
                "title": item[0],
                "date": item[1],
            }
            yield scraped_info
        nextPg = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='pagination']/a[@class='next']/@href").extract()
        if nextPg is not None:
            print(nextPg)
This is the message in the console:
2020-05-11 00:09:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/> (referer: None)
[]
You always need to check the page's source code (Ctrl+U) in your browser. The content you see in the browser may be loaded with an XHR JavaScript call. Here is code that works for me (I found the correct start URL using the Chrome Developer Console):
import scrapy
import json
import re


class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    # allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/getNews?keyWordAll=&keyWordOne=%25E6%2596%25B0%25E5%2586%25A0%2B%25E8%2582%25BA%25E7%2582%258E%2B%25E6%25AD%25A6%25E6%25B1%2589%2B%25E7%2597%2585%25E6%25AF%2592&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn&keyword=%E6%96%B0%E5%86%A0&curPage=1']

    def parse(self, response):
        data = json.loads(response.body)
        for item in data["content"]["results"]:
            scraped_info = {
                "title": item['title'],
                "date": item['pubtime'],
            }
            yield scraped_info

        current_page = data['content']['curPage']
        total_pages = data['content']['pageCount']
        if current_page < total_pages:
            next_page = re.sub(r'curPage=\d+', f"curPage={current_page + 1}", response.url)
            yield scrapy.Request(
                url=next_page,
                callback=self.parse,
            )
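As a side note, if you are on Scrapy 2.2 or newer (an assumption about your environment, not a requirement of the answer above), the manual json.loads call can be replaced with the built-in response.json() helper:

def parse(self, response):
    data = response.json()  # equivalent to json.loads(response.body) on Scrapy 2.2+
    for item in data["content"]["results"]:
        yield {"title": item["title"], "date": item["pubtime"]}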

301 redirection in Scrapy

I have a Scrapy project. I scrape items from pages (I have 10,000 URLs in total). When a page contains items, it works. The problem occurs when a page doesn't contain any items.
I have a 301 redirection (DEBUG: Redirecting (301) to <GET https://www.reezocar.com/search/Alfa+Romeo+166+3.0+V6.html?energy=petrol&gearbox=manual&yearMin=1999&yearMax=2003&doors=45&withPicture=off&size=120> from <GET https://reezocar.com/search/Alfa+Romeo+166+3.0+V6.html?energy=petrol&gearbox=manual&yearMin=1999&yearMax=2003&doors=45&withPicture=off&size=120>)
I guess the problem comes from the fact that there is no item, but I have another spider for another website, based on the same code, that works fine when there is no item on the page.
The spider code:
class AnnonceSpider(scrapy.Spider):
    name = 'rzc_results'

    def __init__(self, *args, **kwargs):
        data_file = pkgutil.get_data(
            "rzc_spider", "json/input/complete_rzc_scrape_rectif.json")
        self.data = json.loads(data_file)

    def start_requests(self):
        for item in self.data:
            request = scrapy.Request(item['rzc_url'], callback=self.parse)
            request.meta['item'] = item
            yield request

    def parse(self, response):
        item = response.meta['item']
        item['results'] = []
        #item["car_number"] = []
        item["car_number"] = response.css(
            "h2.subTitle_1guol7j::text").extract_first()
        #if len(item["car_number"]) == 0:
        #    item["car_number"] = None
        for caritem in response.css("div.adCardOuter_d2sn17 > div[itemprop='item']"):
            data = AnnonceItem()
            #model
            data["model"] = []
            data["model"] = caritem.css("h2.title_16j3u81 > div::text").extract_first()
            if len(data["model"]) == 0:
                data["model"] = None
            #price
            data["price_str"] = []
            data["price_str"] = caritem.css(
                "div.price_1anxiw > span::text").extract_first()
            if len(data["price_str"]) == 0:
                data["price_str"] = None
            item['results'].append(data)
        yield item

        next_page = response.css(
            'a.link_huvdae-o_O-linkPrevNext_1v3fox8::attr(href)').extract_first()
        if next_page is not None:
            url_pagination = 'https://www.reezocar.com' + next_page
            meta = {'item': response.meta['item']}
            yield scrapy.Request(url=url_pagination, callback=self.parse, meta=meta)
Include these options in the request:
yield Request('url',
              meta={
                  'dont_redirect': True,
                  'handle_httpstatus_list': [301, 302]
              },
              callback=self.func)
Also, to apply the change globally, you can disable the redirect middleware in the settings file (for example with REDIRECT_ENABLED) and allow the relevant status codes to reach your callbacks.
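A minimal sketch of what the project-wide equivalent might look like in settings.py (assuming you really want every request to behave this way, which may be too broad for a crawl of 10,000 URLs):

# settings.py -- project-wide counterpart of the per-request meta flags above

# Stop RedirectMiddleware from following 301/302 responses automatically.
REDIRECT_ENABLED = False

# Let 301/302 responses reach spider callbacks instead of being dropped
# by HttpErrorMiddleware.
HTTPERROR_ALLOWED_CODES = [301, 302]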

spider error processing URL

I'm getting an error while processing a URL with Scrapy 1.5.0 and Python 2.7.14.
class FootLockerSpider(Spider):
    name = "FootLockerSpider"
    allowed_domains = ["footlocker.it"]
    start_urls = [FootLockerURL]

    def __init__(self):
        logging.critical("FootLockerSpider STARTED.")

    def parse(self, response):
        products = Selector(response).xpath('//div[@class="fl-category--productlist"]')
        for product in products:
            item = FootLockerItem()
            item['name'] = product.xpath('.//a/span[@class="fl-product-tile--name"]/span').extract()[0]
            item['link'] = product.xpath('.//a/@href').extract()[0]
            # item['image'] = product.xpath('.//div/a/div/img/@data-original').extract()[0]
            # item['size'] = '**NOT SUPPORTED YET**'
            yield item
        yield Request(FootLockerURL, callback=self.parse, dont_filter=True, priority=14)
This is my class FootLockerSpider, and this is the error I get:
[scrapy.core.scraper] ERROR: Spider error processing <GET https://www.footlocker.it/it/uomo/scarpe/> (referer: None)
  File "C:\Users\Traian\Downloads\Sneaker-Notify\main\main.py", line 484, in parse
    item['name'] = product.xpath('.//a/span[@class="fl-product-tile--name"]/span').extract()[0]
IndexError: list index out of range
How can I solve this problem?
You always need to check the source HTML:
<div class="fl-category--productlist--item" data-category-item>
  <div class="fl-load-animation fl-product-tile--container"
       data-lazyloading
       data-lazyloading-success-handler="lazyloadingInit"
       data-lazyloading-context="product-tile"
       data-lazyloading-content-handler="lazyloadingJSONContentHandler"
       data-request="https://www.footlocker.it/INTERSHOP/web/WFS/Footlocker-Footlocker_IT-Site/it_IT/-/EUR/ViewProductTile-ProductTileJSON?BaseSKU=314213410104&ShowRating=true&ShowQuickBuy=true&ShowOverlay=true&ShowBadge=true"
       data-scroll-to-target="fl-product-tile-314213410104"
  >
    <noscript>
      <span itemprop="name">Nike Air Max 97 Ultra '17 - Uomo Scarpe</span>
    </noscript>
  </div>
</div>
This will work:
products = response.xpath('//div[#class="fl-category--productlist--item"]')
for product in products:
item = FootLockerItem()
item['name'] = product.xpath('.//a/span/text()').extract_first()
item['link'] = product.xpath('.//a/#href').extract_first()
yield item
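Note also the switch from extract()[0] to extract_first(): extract()[0] raises the IndexError from your traceback whenever a selector matches nothing, while extract_first() returns None in that case (or a fallback via extract_first(default='')), so a product tile with missing fields no longer breaks the whole callback.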

Scrapy crawl multiple domains with reoccurring urls per domain

I am trying to crawl some selected domains and take only the essential pages from those websites. My approach is to crawl one page of a domain and take a limited set of URLs; these URLs are then crawled to find the URLs that reoccur from the first page. This way I try to eliminate all URLs that don't reoccur (content URLs, such as products etc.). The reason I am asking for help is that scrapy.Request is not being executed more than once.
This is what I have so far:
class Finder(scrapy.Spider):
    name = "finder"
    start_urls = ['http://www.nu.nl/']
    uniqueDomainUrl = dict()
    maximumReoccurringPages = 5

    rules = (
        Rule(
            LinkExtractor(
                allow=('.nl', '.nu', '.info', '.net', '.com', '.org', '.info'),
                deny=('facebook', 'amazon', 'wordpress', 'blogspot', 'free', 'reddit',
                      'videos', 'youtube', 'google', 'doubleclick', 'microsoft', 'yahoo',
                      'bing', 'znet', 'stackexchang', 'twitter', 'wikipedia', 'creativecommons',
                      'mediawiki', 'wikidata'),
            ),
            process_request='parse',
            follow=True
        ),
    )

    def parse(self, response):
        self.logger.info('Entering URL: %s', response.url)
        currentUrlParse = urlparse.urlparse(response.url)
        currentDomain = currentUrlParse.hostname
        if currentDomain in self.uniqueDomainUrl:
            yield

        self.uniqueDomainUrl[currentDomain] = currentDomain

        item = ImportUrlList()
        response.meta['item'] = item

        # Reoccurring URLs
        item = self.findReoccurringUrls(response)
        list = item['list']
        self.logger.info('Output: %s', list)

        # Crawl reoccurring urls
        #for href in list:
        #    yield scrapy.Request(response.urljoin(href), callback=self.parse)

    def findReoccurringUrls(self, response):
        self.logger.info('Finding reoccurring URLs in: %s', response.url)
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        item['list'] = urls
        response.meta['item'] = item

        # Get all URLs on each web page (limit 5 pages)
        i = 0
        for value in urls:
            i += 1
            if i > self.maximumReoccurringPages:
                break
            self.logger.info('Parse: %s', value)
            request = Request(value, callback=self.test, meta={'item': item})
            item = request.meta['item']
        return item

    def test(self, response):
        self.logger.info('Page title: %s', response.css('title').extract())
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        item['list'] = set(item['list']) & set(urls)
        return item

    def findUrlsOnCurrentPage(self, response):
        newUrls = []
        currentUrlParse = urlparse.urlparse(response.url)
        currentDomain = currentUrlParse.hostname
        currentUrl = currentUrlParse.scheme + '://' + currentUrlParse.hostname
        for href in response.css('a::attr(href)').extract():
            newUrl = urlparse.urljoin(currentUrl, href)
            urlParse = urlparse.urlparse(newUrl)
            domain = urlParse.hostname
            if href.startswith('#'):
                continue
            if domain != currentDomain:
                continue
            if newUrl not in newUrls:
                newUrls.append(newUrl)
        return newUrls
It seems to only execute the first page; the other Request() calls are never made, as far as I can tell from the callbacks.
What does ImportUrlList() do? Did you implement it yourself?
You also forgot to use scrapy.Request in findReoccurringUrls:
request = scrapy.Request(value, callback=self.test, meta={'item':item})
def findReoccurringUrls(self, response):
    self.logger.info('Finding reoccurring URLs in: %s', response.url)
    item = response.meta['item']
    urls = self.findUrlsOnCurrentPage(response)
    item['list'] = urls
    response.meta['item'] = item

    # Get all URLs on each web page (limit 5 pages)
    i = 0
    for value in urls:
        i += 1
        if i > self.maximumReoccurringPages:
            break
        self.logger.info('Parse: %s', value)
        request = scrapy.Request(value, callback=self.test, meta={'item': item})
        item = request.meta['item']
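Note that constructing a scrapy.Request object is not enough on its own: a request only runs if it is handed back to the engine, i.e. yielded from a callback. A minimal sketch of one way to restructure this (keeping the names from the question; the helper becomes a generator and parse yields whatever it produces):

def parse(self, response):
    # ... existing logic ...
    # hand the requests produced by the helper back to the Scrapy engine
    for request in self.findReoccurringUrls(response):
        yield request

def findReoccurringUrls(self, response):
    item = response.meta['item']
    urls = self.findUrlsOnCurrentPage(response)
    item['list'] = urls
    for i, value in enumerate(urls, start=1):
        if i > self.maximumReoccurringPages:
            break
        # each request is yielded so Scrapy actually schedules it
        yield scrapy.Request(value, callback=self.test, meta={'item': item})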
