What should I change in my code so that Scrapy does not retrieve the same items during a deep crawl into multiple pages?
Right now, Scrapy performs crawling and scraping like this:
Visit Page-A >> Scrape Item1 & extract link to Page-B >> Visit Page-B >> Scrape Item2 & extract links to Pages C-D-E >> Scrape Items 2-3-4-5 from Pages C-D-E
The code looks like this:
def category_page(self, response):
    next_page = response.xpath('')   # keep the selector so extract_first() below works
    for item in self.parse_attr(response):
        yield item
    if next_page:
        path = next_page.extract_first()
        nextpage = response.urljoin(path)
        yield scrapy.Request(nextpage, callback=self.category_page)

def parse_attr(self, response):
    item = TradeItem()
    item['NameOfCompany'] = response.xpath('').extract_first().strip()
    item['Country'] = response.xpath('').extract_first().strip()
    item['TrustPt'] = response.xpath('').extract_first().strip()
    company_page = response.xpath('').extract_first()
    if company_page:
        company_page = response.urljoin(company_page)
        request = scrapy.Request(company_page, callback=self.company_data)
        request.meta['item'] = item
        yield request
    else:
        yield item
def company_data(self, response):
    item = response.meta['item']
    item['Address'] = response.xpath('').extract()[1]
    product_page = response.xpath('').extract()[1]
    sell_page = response.xpath('').extract()[2]
    trust_page = response.xpath('').extract()[4]
    if sell_page:
        sell_page = response.urljoin(sell_page)
        request = scrapy.Request(sell_page, callback=self.sell_data)
        request.meta['item3'] = item
        yield request
    if product_page:
        product_page = response.urljoin(product_page)
        request = scrapy.Request(product_page, callback=self.product_data)
        request.meta['item2'] = item
        yield request
    if trust_page:
        trust_page = response.urljoin(trust_page)
        request = scrapy.Request(trust_page, callback=self.trust_data)
        request.meta['item4'] = item
        yield request
    yield item

def product_data(self, response):
    item = response.meta['item2']
    item['SoldProducts'] = response.xpath('').extract()
    yield item

def sell_data(self, response):
    item = response.meta['item3']
    item['SellOffers'] = response.xpath('').extract()
    yield item

def trust_data(self, response):
    item = response.meta['item4']
    item['TrustData'] = response.xpath('').extract()
    yield item
The problem is that items are repeated, because Scrapy performs PARTIAL scraping in each function/meta item. So I get entries like this:
Step 1:
{'Address': u'',
 'Country': u'',
 'NameOfCompany': u'',
 'TrustPoints': u''}
Step 2:
{'Address': u'',
 'Country': u'',
 'NameOfCompany': u'',
 'SellOffers': [...],
 'TrustPoints': u''}
Step 3:
{'Address': u'',
 'Country': u'',
 'NameOfCompany': u'',
 'SellOffers': [],
 'SoldProducts': [u' '],
 'TrustData': [u''],
 'TrustPoints': u''}
Each step repeats the values from the previous one. I know this is caused by Scrapy visiting the URLs multiple times. There is some error in my logic which I cannot fully grasp.
Problem solved.
Corresponding answer:
https://stackoverflow.com/a/16177544/11008259
The code, corrected for my case:
def parse_attr(self, response):
    company_page = response.xpath('').extract_first()
    company_page = response.urljoin(company_page)
    request = scrapy.Request(company_page, callback=self.company_data)
    yield request

def company_data(self, response):
    item = TradekeyItem()
    item['Address'] = response.xpath('').extract()[1]
    item['NameOfCompany'] = response.xpath('').extract()[1]
    product_page = response.xpath('').extract()[1]
    product_page = response.urljoin(product_page)
    request = scrapy.Request(product_page, callback=self.product_data, meta={'item': item})
    return request

def product_data(self, response):
    item = response.meta['item']
    item['SoldProducts'] = response.xpath('').extract()
    sell_page = response.xpath('').extract()[2]
    sell_page = response.urljoin(sell_page)
    request = scrapy.Request(sell_page, callback=self.sell_data, meta={'item': item})
    return request

def sell_data(self, response):
    item = response.meta['item']
    item['SellOffers'] = response.xpath('').extract()
    trust_page = response.xpath('').extract()[4]
    trust_page = response.urljoin(trust_page)
    request = scrapy.Request(trust_page, callback=self.trust_data, meta={'item': item})
    return request

def trust_data(self, response):
    item = response.meta['item']
    item['TrustData'] = response.xpath('').extract()
    yield item
We establish a chain between the pages by not yielding the item at every step, but only at the last step. Each callback returns a request for the next page, so the item is printed only once, after all of the callbacks have completed.
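For reference, here is the same chaining pattern reduced to a minimal, hypothetical sketch: the spider name, URLs, selectors and field names are placeholders and not the original site's. Each callback fills in part of the item, forwards it via meta, and only the last callback yields it.

def parse(self, response):
    # First page: start the item, do not yield it yet.
    item = {}
    item['first'] = response.xpath('//title/text()').extract_first()
    yield scrapy.Request(response.urljoin('/second'),
                         callback=self.parse_second,
                         meta={'item': item})

def parse_second(self, response):
    # Second page: add to the same item and pass it on.
    item = response.meta['item']
    item['second'] = response.xpath('//title/text()').extract_first()
    yield scrapy.Request(response.urljoin('/third'),
                         callback=self.parse_third,
                         meta={'item': item})

def parse_third(self, response):
    # Last page: the only place the item is yielded.
    item = response.meta['item']
    item['third'] = response.xpath('//title/text()').extract_first()
    yield item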
Related
I am building a very simple scraper, but there is a very silly mistake somewhere that I am not able to find.
In the response method, I am getting the same response for every URL passed from the loop over all the products on the product-list page.
I am adding my code below; please help.
def parse(self, response):
    item = {}
    count = 0
    for single in response.xpath('//div[@class="_3O0U0u"]/div'):
        count += 1
        # print(count)
        item['data_id'] = single.xpath('.//@data-id').extract_first()
        item['price'] = single.xpath('.//div[@class="_1vC4OE"]/text()').extract_first()
        item['url'] = single.xpath('.//div[@class="_1UoZlX"]/a[@class="_31qSD5"]/@href').extract_first()
        if not item['url']:
            item['url'] = single.xpath('.//div[@class="_3liAhj _1R0K0g"]/a[@class="Zhf2z-"]/@href').extract_first()
        # print(item)
        if item['url']:
            yield scrapy.Request('https://www.somewebsite.com' + item['url'], callback=self.get_product_detail, priority=1, meta={'item': item})
        # break
    next_page = response.xpath('//div[@class="_2zg3yZ"]/nav/a[@class="_3fVaIS"]/span[contains(text(),"Next")]/parent::a/@href').extract_first()
    if next_page:
        next_page = 'https://www.somewebsite.com' + next_page
        yield scrapy.Request(next_page, callback=self.parse, priority=1)

def get_product_detail(self, response):
    dict_item = response.meta['item']
    sku = dict_item['data_id']
    print('dict SKU ======== ', sku)
I have a Scrapy project. I scrape items from a page (I have 10,000 URLs in total). When there is an item, it works. The problem is when there isn't any item.
I have a 301 redirection (DEBUG: Redirecting (301) to <GET https://www.reezocar.com/search/Alfa+Romeo+166+3.0+V6.html?energy=petrol&gearbox=manual&yearMin=1999&yearMax=2003&doors=45&withPicture=off&size=120> from <GET https://reezocar.com/search/Alfa+Romeo+166+3.0+V6.html?energy=petrol&gearbox=manual&yearMin=1999&yearMax=2003&doors=45&withPicture=off&size=120>)
I guess the problem comes from the fact that there is no item, but I have another spider for another website, based on the same code, that works fine when there is no item on the page.
The spider code:
class AnnonceSpider(scrapy.Spider):
    name = 'rzc_results'

    def __init__(self, *args, **kwargs):
        data_file = pkgutil.get_data(
            "rzc_spider", "json/input/complete_rzc_scrape_rectif.json")
        self.data = json.loads(data_file)

    def start_requests(self):
        for item in self.data:
            request = scrapy.Request(item['rzc_url'], callback=self.parse)
            request.meta['item'] = item
            yield request

    def parse(self, response):
        item = response.meta['item']
        item['results'] = []
        # item["car_number"] = []
        item["car_number"] = response.css(
            "h2.subTitle_1guol7j::text").extract_first()
        # if len(item["car_number"]) == 0:
        #     item["car_number"] = None
        for caritem in response.css("div.adCardOuter_d2sn17 > div[itemprop='item']"):
            data = AnnonceItem()
            # model
            data["model"] = []
            data["model"] = caritem.css("h2.title_16j3u81 > div::text").extract_first()
            if len(data["model"]) == 0:
                data["model"] = None
            # price
            data["price_str"] = []
            data["price_str"] = caritem.css(
                "div.price_1anxiw > span::text").extract_first()
            if len(data["price_str"]) == 0:
                data["price_str"] = None
            item['results'].append(data)
            yield item
        next_page = response.css(
            'a.link_huvdae-o_O-linkPrevNext_1v3fox8::attr(href)').extract_first()
        if next_page is not None:
            url_pagination = 'https://www.reezocar.com' + next_page
            meta = {'item': response.meta['item']}
            yield scrapy.Request(url=url_pagination, callback=self.parse, meta=meta)
Include these options in the request:
yield Request('url',
              meta={
                  'dont_redirect': True,
                  'handle_httpstatus_list': [301, 302]
              },
              callback=self.func)
Also, to affect the change globally instead of setting the dont_redirect flag on every request, you can disable redirect handling in the settings file (it is controlled project-wide by the REDIRECT_ENABLED setting).
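As a settings-level sketch of that idea (REDIRECT_ENABLED and HTTPERROR_ALLOWED_CODES are standard Scrapy settings; whether you want both depends on how you intend to handle the 301/302 responses in your callbacks):

# settings.py (project-wide alternative to the per-request meta flags)

# Do not follow 301/302 redirects (disables the RedirectMiddleware).
REDIRECT_ENABLED = False

# Let 301/302 responses reach the spider callbacks instead of being
# dropped by the HttpErrorMiddleware.
HTTPERROR_ALLOWED_CODES = [301, 302]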
I am trying to crawl some selected domains and take only the essential pages from those websites. My approach is to crawl one page of a domain and take a limited set of URLs; those URLs are then crawled to find URLs that reoccur from the first page. This way I try to eliminate all the URLs that don't reoccur (content URLs, such as products, etc.). The reason I am asking for help is that scrapy.Request is not being executed more than once.
This is what I have so far:
class Finder(scrapy.Spider):
    name = "finder"
    start_urls = ['http://www.nu.nl/']
    uniqueDomainUrl = dict()
    maximumReoccurringPages = 5

    rules = (
        Rule(
            LinkExtractor(
                allow=('.nl', '.nu', '.info', '.net', '.com', '.org', '.info'),
                deny=('facebook', 'amazon', 'wordpress', 'blogspot', 'free', 'reddit',
                      'videos', 'youtube', 'google', 'doubleclick', 'microsoft', 'yahoo',
                      'bing', 'znet', 'stackexchang', 'twitter', 'wikipedia', 'creativecommons',
                      'mediawiki', 'wikidata'),
            ),
            process_request='parse',
            follow=True
        ),
    )

    def parse(self, response):
        self.logger.info('Entering URL: %s', response.url)
        currentUrlParse = urlparse.urlparse(response.url)
        currentDomain = currentUrlParse.hostname
        if currentDomain in self.uniqueDomainUrl:
            yield

        self.uniqueDomainUrl[currentDomain] = currentDomain

        item = ImportUrlList()
        response.meta['item'] = item

        # Reoccurring URLs
        item = self.findReoccurringUrls(response)
        list = item['list']
        self.logger.info('Output: %s', list)

        # Crawl reoccurring urls
        # for href in list:
        #     yield scrapy.Request(response.urljoin(href), callback=self.parse)

    def findReoccurringUrls(self, response):
        self.logger.info('Finding reoccurring URLs in: %s', response.url)
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        item['list'] = urls
        response.meta['item'] = item

        # Get all URLs on each web page (limit 5 pages)
        i = 0
        for value in urls:
            i += 1
            if i > self.maximumReoccurringPages:
                break
            self.logger.info('Parse: %s', value)
            request = Request(value, callback=self.test, meta={'item': item})
            item = request.meta['item']
        return item

    def test(self, response):
        self.logger.info('Page title: %s', response.css('title').extract())
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        item['list'] = set(item['list']) & set(urls)
        return item

    def findUrlsOnCurrentPage(self, response):
        newUrls = []
        currentUrlParse = urlparse.urlparse(response.url)
        currentDomain = currentUrlParse.hostname
        currentUrl = currentUrlParse.scheme + '://' + currentUrlParse.hostname
        for href in response.css('a::attr(href)').extract():
            newUrl = urlparse.urljoin(currentUrl, href)
            urlParse = urlparse.urlparse(newUrl)
            domain = urlParse.hostname
            if href.startswith('#'):
                continue
            if domain != currentDomain:
                continue
            if newUrl not in newUrls:
                newUrls.append(newUrl)
        return newUrls
It seems to execute only the first page; the other Request()s are never called, as far as I can see from the callbacks.
What does ImportUrlList() do? Did you implement it yourself?
You also forgot to use scrapy.Request in findReoccurringUrls:
request = scrapy.Request(value, callback=self.test, meta={'item': item})
def findReoccurringUrls(self, response):
    self.logger.info('Finding reoccurring URLs in: %s', response.url)
    item = response.meta['item']
    urls = self.findUrlsOnCurrentPage(response)
    item['list'] = urls
    response.meta['item'] = item

    # Get all URLs on each web page (limit 5 pages)
    i = 0
    for value in urls:
        i += 1
        if i > self.maximumReoccurringPages:
            break
        self.logger.info('Parse: %s', value)
        request = scrapy.Request(value, callback=self.test, meta={'item': item})
        item = request.meta['item']
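One more note, as a hedged addition beyond the original answer: a Request object is only downloaded once it is yielded (or returned) back to the engine, so building it inside a helper that never hands it back will not schedule it. A minimal sketch of one way to wire that up (turning the helper into a generator and passing the item in explicitly are my own assumptions, not part of the answer above):

def parse(self, response):
    # ... existing domain checks from the question ...
    item = ImportUrlList()
    # Hand every request built by the helper back to the engine,
    # otherwise it is never downloaded.
    yield from self.findReoccurringUrls(response, item)

def findReoccurringUrls(self, response, item):
    urls = self.findUrlsOnCurrentPage(response)
    item['list'] = urls
    # Limit to the first few URLs, as in the original code.
    for value in urls[:self.maximumReoccurringPages]:
        yield scrapy.Request(value, callback=self.test, meta={'item': item})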
This is my code
def parse(self, response):
    soup = BeautifulSoup(response.body)
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="row"]')
    items = []
    for site in sites[:5]:
        item = TestItem()
        item['username'] = "test5"
        request = Request("http://www.example.org/profile.php", callback=self.parseUserProfile)
        request.meta['item'] = item
        yield item

    mylinks = soup.find_all("a", text="Next")
    if mylinks:
        nextlink = mylinks[0].get('href')
        yield Request(urljoin(response.url, nextlink), callback=self.parse)

def parseUserProfile(self, response):
    item = response.meta['item']
    item['image_urls'] = "test3"
    return item
Now the above works, but I am not getting the value of item['image_urls'] = "test3"; it comes back as null.
If I use return request instead of yield item, I get an error saying I cannot use return with a generator.
If I remove this line:
yield Request(urljoin(response.url, nextlink), callback=self.parse)
then my code works fine and I can get image_urls, but then I cannot follow the links.
So is there any way to use return request and yield together so that I get the image_urls?
I don't really understand your issue, but I see one problem in your code:
def parseUserProfile(self, response):
    item = response.meta['item']
    item['image_urls'] = "test3"
    return item
Parse callback return values should be sequences, so you should either do return [item] or convert your callback into a generator:
def parseUserProfile(self, response):
    item = response.meta['item']
    item['image_urls'] = "test3"
    yield item
Looks like you have a mechanical error. Instead of:
for site in sites[:5]:
    item = TestItem()
    item['username'] = "test5"
    request = Request("http://www.example.org/profile.php", callback=self.parseUserProfile)
    request.meta['item'] = item
    yield item   # <-- the problem: the item is yielded, not the request
You need:
for site in sites[:5]:
    item = TestItem()
    item['username'] = "test5"
    request = Request("http://www.example.org/profile.php", callback=self.parseUserProfile)
    request.meta['item'] = item
    yield request
I have an item object and I need to pass it along many pages to store data in a single item.
My item looks like this:
class DmozItem(Item):
    title = Field()
    description1 = Field()
    description2 = Field()
    description3 = Field()
Now those three descriptions are on three separate pages. I want to do something like the following.
This works fine for parseDescription1:
def page_parser(self, response):
    sites = hxs.select('//div[@class="row"]')
    items = []
    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription1)
    request.meta['item'] = item
    return request

def parseDescription1(self, response):
    item = response.meta['item']
    item['desc1'] = "test"
    return item
But I want something like this:
def page_parser(self, response):
    sites = hxs.select('//div[@class="row"]')
    items = []
    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription1)
    request.meta['item'] = item
    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription2)
    request.meta['item'] = item
    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription3)
    request.meta['item'] = item
    return request

def parseDescription1(self, response):
    item = response.meta['item']
    item['desc1'] = "test"
    return item

def parseDescription2(self, response):
    item = response.meta['item']
    item['desc2'] = "test2"
    return item

def parseDescription3(self, response):
    item = response.meta['item']
    item['desc3'] = "test3"
    return item
No problem. The following is a corrected version of your code:
def page_parser(self, response):
    sites = hxs.select('//div[@class="row"]')
    items = []

    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription1)
    request.meta['item'] = item
    yield request

    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription2, meta={'item': item})
    yield request

    yield Request("http://www.example.com/lin1.cpp", callback=self.parseDescription3, meta={'item': item})

def parseDescription1(self, response):
    item = response.meta['item']
    item['desc1'] = "test"
    return item

def parseDescription2(self, response):
    item = response.meta['item']
    item['desc2'] = "test2"
    return item

def parseDescription3(self, response):
    item = response.meta['item']
    item['desc3'] = "test3"
    return item
In order to guarantee an ordering of the requests/callbacks and that only one item is ultimately returned you need to chain your requests using a form like:
def page_parser(self, response):
    sites = hxs.select('//div[@class="row"]')
    items = []
    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription1)
    request.meta['item'] = Item()
    return [request]

def parseDescription1(self, response):
    item = response.meta['item']
    item['desc1'] = "test"
    return [Request("http://www.example.com/lin2.cpp", callback=self.parseDescription2, meta={'item': item})]

def parseDescription2(self, response):
    item = response.meta['item']
    item['desc2'] = "test2"
    return [Request("http://www.example.com/lin3.cpp", callback=self.parseDescription3, meta={'item': item})]

def parseDescription3(self, response):
    item = response.meta['item']
    item['desc3'] = "test3"
    return [item]
Each callback function returns an iterable of items or requests; requests are scheduled and items are run through your item pipeline.
If you return an item from each of the callbacks, you'll end up with four items in various states of completeness in your pipeline, but if you return the next request instead, you can guarantee the order of the requests and that you will have exactly one item at the end of execution.
The accepted answer returns a total of three items [with desc(i) set for i=1,2,3].
If you want to return a single item, Dave McLain's answer does work; however, it requires parseDescription1, parseDescription2, and parseDescription3 all to succeed and run without errors in order to return the item.
For my use case, some of the subrequests MAY return HTTP 403/404 errors at random, thus I lost some of the items, even though I could have scraped them partially.
Workaround
Thus, I currently employ the following workaround: instead of only passing the item around in the request.meta dict, pass around a call stack that knows which request to call next. It calls the next target on the stack (as long as the stack isn't empty), and yields the item once the stack is empty.
The errback request parameter is used to return to the dispatcher method upon errors and simply continue with the next stack entry.
def callnext(self, response):
    ''' Call the next target for the item loader, or yield the item if completed. '''
    # Get the meta object from the request, as the response
    # does not contain it.
    meta = response.request.meta

    # Items remaining in the stack? Execute them.
    if len(meta['callstack']) > 0:
        target = meta['callstack'].pop(0)
        yield Request(target['url'], meta=meta, callback=target['callback'], errback=self.callnext)
    else:
        yield meta['loader'].load_item()

def parseDescription1(self, response):
    # Recover item(loader)
    l = response.meta['loader']

    # Use just as before
    l.add_css(...)

    # Build the call stack
    callstack = [
        {'url': "http://www.example.com/lin2.cpp",
         'callback': self.parseDescription2},
        {'url': "http://www.example.com/lin3.cpp",
         'callback': self.parseDescription3}
    ]
    # Store the stack in the request meta so callnext can pick it up.
    response.meta['callstack'] = callstack

    return self.callnext(response)

def parseDescription2(self, response):
    # Recover item(loader)
    l = response.meta['loader']

    # Use just as before
    l.add_css(...)

    return self.callnext(response)

def parseDescription3(self, response):
    # ...
    return self.callnext(response)
Warning
This solution is still synchronous, and will still fail if you have any exceptions within the callbacks.
For more information, check the blog post I wrote about that solution.
All of the answers provided do have their pros and cons. I'm just adding an extra one to demonstrate how this has been simplified due to changes in the codebase (both Python & Scrapy). We no longer need to use meta and can instead use cb_kwargs (i.e. keyword arguments to pass to the callback function).
So instead of doing this:
def page_parser(self, response):
    sites = hxs.select('//div[@class="row"]')
    items = []

    request = Request("http://www.example.com/lin1.cpp",
                      callback=self.parseDescription1)
    request.meta['item'] = Item()
    return [request]

def parseDescription1(self, response):
    item = response.meta['item']
    item['desc1'] = "test"
    return [Request("http://www.example.com/lin2.cpp",
                    callback=self.parseDescription2, meta={'item': item})]
...
We can do this:
def page_parser(self, response):
    sites = hxs.select('//div[@class="row"]')
    items = []

    yield response.follow("http://www.example.com/lin1.cpp",
                          callback=self.parseDescription1,
                          cb_kwargs={"item": Item()})

def parseDescription1(self, response, item):
    item['desc1'] = "More data from this new response"
    yield response.follow("http://www.example.com/lin2.cpp",
                          callback=self.parseDescription2,
                          cb_kwargs={'item': item})
...
and if for some reason you have multiple links you want to process with the same function, we can swap
yield response.follow(a_single_url,
callback=some_function,
cb_kwargs={"data": to_pass_to_callback})
with
yield from response.follow_all([many, urls, to, parse],
callback=some_function,
cb_kwargs={"data": to_pass_to_callback})