Using response.follow in XMLFeedSpider - python

I am writing an XMLFeedSpider (https://doc.scrapy.org/en/1.4/topics/spiders.html#xmlfeedspider) which needs to parse additional information from a url found in the items. But when I yield a response.follow in parse_node, the next node is not parsed; only the first node and one response.follow request get parsed. The response.follow somehow disrupts the iteration in parse_nodes. Any ideas how to solve this?
The code in general (shortened and cleaned):
from scrapy.spiders import XMLFeedSpider

class example_domainSpider(XMLFeedSpider):
    name = 'example_domain'
    allowed_domains = ['example_domain.org']
    start_urls = ['https://items.example_domain.org/some-api']
    iterator = 'iternodes'
    itertag = 'Some_Item'
    Item = None

    def parse_node(self, response, selector):
        xml_data = dict()
        # a lot of selectors to fill xml_data
        # pass data to next method, so it can be combined in one item
        return response.follow(link, self.parse_item, meta={'xml_data': xml_data})

    def parse_item(self, response):
        self.Item = item()
        xml_data = response.request.meta['xml_data']
        # fill item with the data from xml
        for key, value in xml_data.items():
            self.Item[key] = value
        # a lot of selectors to fill self.Item
        yield self.Item
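As a side note on the data hand-off itself (not a confirmed fix for the iteration issue): on Scrapy 1.7+ the same xml_data can be passed with cb_kwargs instead of meta, and building the item locally avoids sharing self.Item between concurrent responses. A minimal sketch; the link XPath is a made-up placeholder:

def parse_node(self, response, selector):
    xml_data = {}
    # ... selectors to fill xml_data ...
    link = selector.xpath('.//Detail_Url/text()').get()  # hypothetical: however the detail link is obtained
    yield response.follow(link, self.parse_item, cb_kwargs={'xml_data': xml_data})

def parse_item(self, response, xml_data):
    item = dict(xml_data)  # start from the XML data, then add fields from this page
    # ... selectors to fill the remaining fields ...
    yield item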


Scrapy parse value into self.template

What am I doing wrong?
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.template = {'name': '', 'title': '', 'tab_1_value': {}, 'tab_1_description': '',
                     'tab_2_value': {}, 'tab_2_description': ''}

def parse(self, response):
    links = response.xpath('//table/tbody/tr/td/div/a/@href').extract()
    page_no = response.request.url.split('=')[-1]
    with tqdm(total=len(links)) as pbar:
        for link_index, link in enumerate(links):
            pbar.set_description("Processing page {}".format(page_no))
            pbar.update(1)
            page = response.urljoin(link)
            yield scrapy.Request(page, callback=self.parse_company)

def parse_company(self, response):
    # Successfully inscribed in self.template['name']
    self.template['name'] = response.xpath('//div/div[@class="name"]/text()').get()
    # Successfully inscribed in self.template['title']
    self.template['title'] = response.xpath('//div/div[@class="title"]/text()').get()
    content_links = response.xpath('//ul/li/a/@href').extract()
    for content_link in content_links:
        content_page = response.urljoin(content_link)
        if response.request.url.split('/')[-2] == 'tab_1':
            yield scrapy.Request(content_page, callback=self.tab_1)
        if response.request.url.split('/')[-2] == 'tab_2':
            yield scrapy.Request(content_page, callback=self.tab_2)

# It does not enter values here at all in the self.template
def tab_1(self, response):
    self.template['tab_1_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    self.template['tab_1_description'] = response.xpath('//div/div/p/text()').get()

# It does not enter values here at all in the self.template
def tab_2(self, response):
    self.template['tab_2_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    self.template['tab_2_description'] = response.xpath('//div/div/p/text()').get()
When I check whether it goes into tab_1 and/or tab_2, it does. When I check the values of the variables within those functions, everything is fine. It is not clear to me why these values are not written to self.template.
Does anyone know why the values are not assigned to the appropriate elements of self.template, or whether there is a better way to do the same thing?
Maybe there is a better solution, but I managed to solve the problem by passing self.template through the meta argument of the yielded requests.
Maybe this is helpful to somebody...
if response.request.url.split('/')[-2] == 'tab_1':
    yield scrapy.Request(content_page, callback=self.tab_1, meta={'template': self.template})
if response.request.url.split('/')[-2] == 'tab_2':
    yield scrapy.Request(content_page, callback=self.tab_2, meta={'template': self.template})
And the values are assigned this way:
def tab_1(self, response):
    response.meta['template']['tab_1_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    response.meta['template']['tab_1_description'] = response.xpath('//div/div/p/text()').get()

def tab_2(self, response):
    response.meta['template']['tab_2_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    response.meta['template']['tab_2_description'] = response.xpath('//div/div/p/text()').get()
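One caveat with this workaround: every company still shares the same self.template dict, so concurrent responses can overwrite each other's values even when the dict travels through meta. A hedged variant of the code above (my rewording, not tested against the real site): give each company its own copy and yield the dict once a tab callback has filled it.

def parse_company(self, response):
    template = dict(self.template)  # fresh copy per company instead of the shared attribute
    template['name'] = response.xpath('//div/div[@class="name"]/text()').get()
    template['title'] = response.xpath('//div/div[@class="title"]/text()').get()
    for content_link in response.xpath('//ul/li/a/@href').extract():
        content_page = response.urljoin(content_link)
        if response.request.url.split('/')[-2] == 'tab_1':
            yield scrapy.Request(content_page, callback=self.tab_1, meta={'template': template})

def tab_1(self, response):
    template = response.meta['template']
    template['tab_1_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    template['tab_1_description'] = response.xpath('//div/div/p/text()').get()
    yield template  # or build a scrapy.Item from it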

How to omit processing of an Item() if it has already been seen by spider in Python Scrapy

I'm trying to remove duplicate business_names during a spider crawl. However, I still see duplicate business_names.
I tried if x != item['business_name'] and then continuing with parsing.
What I want is: if a business_name has not been seen yet, parse it; otherwise delete it from the list or skip the query result.
Instead, the code below ignores my if statement. Here is what I have so far:
class Item(scrapy.Item):
    business_name = scrapy.Field()
    website = scrapy.Field()
    phone_number = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    def parse(self, response):
        for business in response.css('div.info'):
            item = Item()
            item['business_name'] = business.css('span[itemprop="name"]::text').extract()
            for x in item['business_name']:
                if (x != item['business_name']):
                    if item['business_name']:
                        item['website'] = business.css('div.links a::attr(href)').extract_first()
                        if item['website']:
                            item['phone_number'] = business.css('div.phones.phone.primary::text').extract()
                            yield item
The reason you are seeing this behaviour is a problem of scope. You set item['business_name'] to the result of .extract(), which is always a list (even if only a single element matches the CSS selector).
The code then iterates over item['business_name'] and checks whether each element of the list is != item['business_name'].
It turns out that this will always be True.
It is equivalent to doing the following:
numbers = [1, 2, 3, 4]
for x in numbers:
    if x != numbers:
        print(x)
# output:
# 1
# 2
# 3
# 4
Instead, initialize a list outside of the for loop and check whether each value is in that list. For instance, something to the effect of:
def parse(self, response):
    for business in response.css('div.info'):
        seen_business_names = []
        item = Item()
        item['business_name'] = business.css('span[itemprop="name"]::text').extract()
        for x in item['business_name']:
            if (x not in seen_business_names):
                if item['business_name']:  # not sure why this is here unless it is possible you are extracting empty strings
                    item['website'] = business.css('div.links a::attr(href)').extract_first()
                    if item['website']:
                        item['phone_number'] = business.css('div.phones.phone.primary::text').extract()
                        seen_business_names.append(x)
                        yield item
I don't have access to your HTML, so I can't guarantee that the above code will work, but the behaviour you are facing is expected given the code in your original post.
Side note: the list in the solution above is only preserved for each call to parse, in other words for each start_url passed to parse. If you want to ensure that each business_name is extracted once and only once GLOBALLY, for any page passed to parse during the life of the Spider class, you can maintain a list on the class and check against it the same way we did locally in parse. Consider:
class Item(scrapy.Item):
    business_name = scrapy.Field()
    website = scrapy.Field()
    phone_number = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    # new code here
    def __init__(self):
        self.seen_business_names = []

    def parse(self, response):
        for business in response.css('div.info'):
            item = Item()
            item['business_name'] = business.css('span[itemprop="name"]::text').extract()
            for x in item['business_name']:
                # new code here, call to self.seen_business_names
                if (x not in self.seen_business_names):
                    if item['business_name']:
                        item['website'] = business.css('div.links a::attr(href)').extract_first()
                        if item['website']:
                            item['phone_number'] = business.css('div.phones.phone.primary::text').extract()
                            # new code here, call to self.seen_business_names
                            self.seen_business_names.append(x)
                            yield item
Cheers!
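An alternative worth mentioning (my addition, assuming a standard Scrapy project layout): move the de-duplication into an item pipeline so parse() stays focused on extraction. A rough sketch using scrapy.exceptions.DropItem; the pipeline still has to be registered under ITEM_PIPELINES in settings.py.

from scrapy.exceptions import DropItem

class DuplicateBusinessNamePipeline:
    def __init__(self):
        self.seen_business_names = set()

    def process_item(self, item, spider):
        # extract() returns a list, so turn it into a hashable key
        name = tuple(item.get('business_name', []))
        if name in self.seen_business_names:
            raise DropItem("Duplicate business_name: {}".format(name))
        self.seen_business_names.add(name)
        return item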

Scrapy: how to populate hierarchic items with multiple requests

This one is an extension of Multiple nested request with scrapy. Asking because the presented solution has flaws:
1. It eliminates asynchrony, thus heavily reducing scraping efficiency.
2. Should an exception appear while processing the link "stack", no item will be yielded.
3. What if there is a huge amount of child items?
To deal with (1) I considered this:
import threading

from scrapy.loader import ItemLoader

class CatLoader(ItemLoader):
    def __init__(self, item=None, selector=None, response=None, parent=None, **context):
        super(self.__class__, self).__init__(item, selector, response, parent, **context)
        self.lock = threading.Lock()
        self.counter = 0

    def dec_counter(self):
        self.lock.acquire()
        self.counter -= 1
        self.lock.release()
Then in the parser:
if len(urls) == 0:
    self.logger.warning('Cat without items, url: ' + response.url)
    item = cl.load_item()
    yield item
cl.counter = len(urls)
for url in urls:
    rq = Request(url, self.parse_item)
    rq.meta['loader'] = cl
    yield rq
And in parse_item() I can do:
def parse_item(self, response):
    l = response.meta['loader']
    l.dec_counter()
    if l.counter == 0:
        yield l.load_item()
BUT! To deal with (2), I need to do this in each function:
def parse_item(self, response):
    try:
        l = response.meta['loader']
    finally:
        l.dec_counter()
        if l.counter == 0:
            yield l.load_item()
Which I consider an inelegant solution. So could anyone help with a better one? Also, I want to insert the items into a DB rather than produce JSON output, so maybe it is better to create the item with a promise and write a pipeline that parses children to check whether the promise is fulfilled (i.e. when the item has been inserted into the DB), or something like that?
UPD: hierarchic items: category -> article -> images. All are to be saved in different tables with proper relations. So:
1) Articles must be inserted into their table AFTER the category.
2) An article must know the ID of its category to form the relation.
The same applies to the image records.
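No answer is recorded here, but one hedged idea for flaw (2): instead of repeating try/finally in every callback, attach an errback to each child Request so the counter is still decremented when a request fails. This reuses the CatLoader from the question; everything else is an untested sketch with placeholder selectors, not a proven design.

from scrapy import Request

def parse(self, response):
    cl = CatLoader(response=response)
    urls = response.xpath('//a/@href').extract()  # placeholder selector
    if not urls:
        yield cl.load_item()
        return
    cl.counter = len(urls)
    for url in urls:
        yield Request(url, callback=self.parse_item, errback=self.child_failed,
                      meta={'loader': cl})

def parse_item(self, response):
    l = response.meta['loader']
    # ... add child data to the loader here ...
    l.dec_counter()
    if l.counter == 0:
        yield l.load_item()

def child_failed(self, failure):
    # the errback receives a twisted Failure; the original request is failure.request
    l = failure.request.meta['loader']
    l.dec_counter()
    if l.counter == 0:
        return l.load_item()  # errback output is processed like callback output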

Scrapy: How do I prevent a yield request with a conditional item value?

I'm parsing a list of urls, and I want to avoid saving the item produced by a url when one of its values meets a condition. My code is something like this:
start_urls = ['www.rootpage.com']

def parse(self, response):
    item = CreatedItem()
    url_list = response.xpath('somepath').extract()
    for url in url_list:
        request = scrapy.Request(url, callback=self.parse_article)
        request.meta['item'] = item
        yield request

def parse_article(self, response):
    item = response.meta['item']
    item['parameterA'] = response.xpath('somepath').extract()
    yield item
Now I want that, in case item['parameterA'] meets a certain condition, the request is not yielded (so that no saving for this url occurs). I tried adding a conditional like:
if item['parameterA'] == 0:
    continue
else:
    yield item
but as expected it does not work, because scrapy continues the loop even before the request is performed.
From what I understand, you should make the decision inside the parse_article method:
def parse_article(self, response):
    item = response.meta['item']
    item['parameterA'] = response.xpath('somepath').extract_first()
    if item['parameterA'] != "0":
        yield item
Note the use of extract_first() and the quotes around 0.
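To illustrate that last point with a throwaway snippet (not from the original answer): .extract() always returns a list of strings, while .extract_first() returns a single string or None, which is why the comparison is against the string "0" rather than the integer 0.

from scrapy import Selector

sel = Selector(text="<p>0</p>")
print(sel.xpath("//p/text()").extract())        # ['0']  (a list, never equal to 0)
print(sel.xpath("//p/text()").extract_first())  # '0'    (a string, so compare against "0")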

Passing an argument to a callback function [duplicate]

def parse(self, response):
    for sel in response.xpath('//tbody/tr'):
        item = HeroItem()
        item['hclass'] = response.request.url.split("/")[8].split('-')[-1]
        item['server'] = response.request.url.split('/')[2].split('.')[0]
        item['hardcore'] = len(response.request.url.split("/")[8].split('-')) == 3
        item['seasonal'] = response.request.url.split("/")[6] == 'season'
        item['rank'] = sel.xpath('td[@class="cell-Rank"]/text()').extract()[0].strip()
        item['battle_tag'] = sel.xpath('td[@class="cell-BattleTag"]//a/text()').extract()[1].strip()
        item['grift'] = sel.xpath('td[@class="cell-RiftLevel"]/text()').extract()[0].strip()
        item['time'] = sel.xpath('td[@class="cell-RiftTime"]/text()').extract()[0].strip()
        item['date'] = sel.xpath('td[@class="cell-RiftTime"]/text()').extract()[0].strip()
        url = 'https://' + item['server'] + '.battle.net/' + sel.xpath('td[@class="cell-BattleTag"]//a/@href').extract()[0].strip()
        yield Request(url, callback=self.parse_profile)

def parse_profile(self, response):
    sel = Selector(response)
    item = HeroItem()
    item['weapon'] = sel.xpath('//li[@class="slot-mainHand"]/a[@class="slot-link"]/@href').extract()[0].split('/')[4]
    return item
Well, I'm scraping a whole table in the main parse method and I have taken several fields from that table. One of these fields is a url, and I want to explore it to get a whole new bunch of fields. How can I pass my already created Item object to the callback function so that the final item keeps all the fields?
As shown in the code above, I can save either the fields found at that url (the code as it stands) or only the ones from the table (by simply writing yield item), but I can't yield a single object with all the fields together.
I have tried this, but obviously it doesn't work:
yield Request(url, callback=self.parse_profile(item))

def parse_profile(self, response, item):
    sel = Selector(response)
    item['weapon'] = sel.xpath('//li[@class="slot-mainHand"]/a[@class="slot-link"]/@href').extract()[0].split('/')[4]
    return item
This is what you'd use the meta keyword for.
def parse(self, response):
    for sel in response.xpath('//tbody/tr'):
        item = HeroItem()
        # Item assignment here
        url = 'https://' + item['server'] + '.battle.net/' + sel.xpath('td[@class="cell-BattleTag"]//a/@href').extract()[0].strip()
        yield Request(url, callback=self.parse_profile, meta={'hero_item': item})

def parse_profile(self, response):
    item = response.meta.get('hero_item')
    item['weapon'] = response.xpath('//li[@class="slot-mainHand"]/a[@class="slot-link"]/@href').extract()[0].split('/')[4]
    yield item
Also note, doing sel = Selector(response) is a waste of resources and differs from what you did earlier, so I changed it. It's automatically mapped in the response as response.selector, which also has the convenience shortcut of response.xpath.
Here's a better way to pass args to callback function:
def parse(self, response):
    request = scrapy.Request('http://www.example.com/index.html',
                             callback=self.parse_page2,
                             cb_kwargs=dict(main_url=response.url))
    request.cb_kwargs['foo'] = 'bar'  # add more arguments for the callback
    yield request

def parse_page2(self, response, main_url, foo):
    yield dict(
        main_url=main_url,
        other_url=response.url,
        foo=foo,
    )
source: https://docs.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-request-callback-arguments
I had a similar issue with Tkinter's extra argument passing, and found this solution to work (here: http://infohost.nmt.edu/tcc/help/pubs/tkinter/web/extra-args.html), converted to your problem:
def parse(self, response):
    item = HeroItem()
    [...]

    def handler(self=self, response=response, item=item):
        """ passing as default argument values """
        return self.parse_profile(response, item)

    yield Request(url, callback=handler)
@peduDev
I tried your approach, but something failed due to an unexpected keyword argument.
scrapy_req = scrapy.Request(url=url,
                            callback=self.parseDetailPage,
                            cb_kwargs=dict(participant_id=nParticipantId))

def parseDetailPage(self, response, participant_id):
    # .. some code here ..
    yield MyParseResult(
        # .. some code here ..
        participant_id=participant_id
    )
Error reported:
, cb_kwargs=dict(participant_id=nParticipantId)
TypeError: __init__() got an unexpected keyword argument 'cb_kwargs'
Any idea what caused the unexpected keyword argument, other than perhaps a too-old scrapy version?
Yep. I verified my own suggestion, and after an upgrade it all worked as expected.
sudo pip install --upgrade scrapy
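For what it's worth (my note, not the answerer's): cb_kwargs was added in Scrapy 1.7, so a quick version check before and after the upgrade shows whether it is supported.

import scrapy

# cb_kwargs on Request / response.follow requires Scrapy >= 1.7
print(scrapy.__version__)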
