Scrapy parse value into self.template - python

What am I doing wrong?
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.template = {'name': '', 'title': '', 'tab_1_value': {}, 'tab_1_description': '',
                     'tab_2_value': {}, 'tab_2_description': ''}

def parse(self, response):
    links = response.xpath('//table/tbody/tr/td/div/a/@href').extract()
    page_no = response.request.url.split('=')[-1]
    with tqdm(total=len(links)) as pbar:
        for link_index, link in enumerate(links):
            pbar.set_description("Processing page {}".format(page_no))
            pbar.update(1)
            page = response.urljoin(link)
            yield scrapy.Request(page, callback=self.parse_company)
def parse_company(self, response):
    # successfully written to self.template['name']
    self.template['name'] = response.xpath('//div/div[@class="name"]/text()').get()
    # successfully written to self.template['title']
    self.template['title'] = response.xpath('//div/div[@class="title"]/text()').get()
    content_links = response.xpath('//ul/li/a/@href').extract()
    for content_link in content_links:
        content_page = response.urljoin(content_link)
        if response.request.url.split('/')[-2] == 'tab_1':
            yield scrapy.Request(content_page, callback=self.tab_1)
        if response.request.url.split('/')[-2] == 'tab_2':
            yield scrapy.Request(content_page, callback=self.tab_2)

# the values below never show up in self.template
def tab_1(self, response):
    self.template['tab_1_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    self.template['tab_1_description'] = response.xpath('//div/div/p/text()').get()

# the values below never show up in self.template
def tab_2(self, response):
    self.template['tab_2_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    self.template['tab_2_description'] = response.xpath('//div/div/p/text()').get()
When I check whether tab_1 and tab_2 are actually called, they are. When I inspect the local variables inside those functions, everything looks fine. What is not clear to me is why the values are never written into self.template.
Does anyone know why the values are not assigned to the corresponding keys in self.template, or whether there is a better way to do the same thing?

Maybe there is a better solution, but I managed to solve the problem by passing self.template through the meta argument of the yielded requests.
Maybe this is helpful to somebody...
if response.request.url.split('/')[-2] == 'tab_1':
    yield scrapy.Request(content_page, callback=self.tab_1, meta={'template': self.template})
if response.request.url.split('/')[-2] == 'tab_2':
    yield scrapy.Request(content_page, callback=self.tab_2, meta={'template': self.template})
And the values are assigned this way:
def tab_1(self, response):
    response.meta['template']['tab_1_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    response.meta['template']['tab_1_description'] = response.xpath('//div/div/p/text()').get()

def tab_2(self, response):
    response.meta['template']['tab_2_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    response.meta['template']['tab_2_description'] = response.xpath('//div/div/p/text()').get()
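Why the original version fails: self.template is one dict shared by every company the spider visits, and Scrapy schedules callbacks asynchronously, so concurrent parse_company / tab_1 / tab_2 calls overwrite each other's values in no predictable order, and anything reading the dict may see it before a given callback has run. Passing the dict along with the request, as above, keeps each chain's data together. On Scrapy 1.7+ the documented alternative to meta for this is cb_kwargs; here is a minimal sketch under that assumption, reusing the question's XPaths (valueSeparation is the question's own helper):

def parse_company(self, response):
    # a fresh dict per company, so concurrent responses cannot clobber each other
    template = {
        'name': response.xpath('//div/div[@class="name"]/text()').get(),
        'title': response.xpath('//div/div[@class="title"]/text()').get(),
    }
    for content_link in response.xpath('//ul/li/a/@href').extract():
        content_page = response.urljoin(content_link)
        if response.request.url.split('/')[-2] == 'tab_1':
            yield scrapy.Request(content_page, callback=self.tab_1,
                                 cb_kwargs={'template': template})

def tab_1(self, response, template):
    # cb_kwargs entries arrive as ordinary keyword arguments
    template['tab_1_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
    template['tab_1_description'] = response.xpath('//div/div/p/text()').get()
    yield template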

Related

Using response.follow in XMLFeedSpider

I am writing an XMLFeedSpider (https://doc.scrapy.org/en/1.4/topics/spiders.html#xmlfeedspider) that needs to parse additional information from a URL found in each item. But when I yield a response.follow in parse_node, the next node is not parsed; only the first node and one response.follow request get processed. The response.follow somehow disrupts the iteration in parse_nodes. Any ideas how to solve this?
The code in general (shortened and cleaned):
class example_domainSpider(XMLFeedSpider):
    name = 'example_domain'
    allowed_domains = ['example_domain.org']
    start_urls = ['https://items.example_domain.org/some-api']
    iterator = 'iternodes'
    itertag = 'Some_Item'
    Item = None

    def parse_node(self, response, selector):
        xml_data = dict()
        # a lot of selectors to fill xml_data
        # pass data to next method, so it can be combined in one item
        return response.follow(link, self.parse_item, meta={'xml_data': xml_data})

    def parse_item(self, response):
        self.Item = item()
        xml_data = response.request.meta['xml_data']
        # fill item with the data from xml
        for key, value in xml_data.items():
            self.Item[key] = value
        # a lot of selectors to fill self.Item
        yield self.Item
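No accepted fix is shown here, but one detail is worth correcting regardless of the iteration problem: parse_item stores the item on self.Item, a single attribute shared by all concurrent callbacks, so parallel responses can overwrite each other's half-filled items. Building the item locally avoids that. A minimal sketch, with SomeItem standing in as a hypothetical name for the real item class:

def parse_item(self, response):
    item = SomeItem()  # hypothetical item class; build it locally, not on self
    xml_data = response.request.meta['xml_data']
    # combine the XML fields with data scraped from this page
    for key, value in xml_data.items():
        item[key] = value
    # ... selectors to fill the remaining fields ...
    yield item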

Scrapy: how to populate hierarchic items with multiple requests

This one is an extension of Multiple nested request with scrapy.
Asking because the presented solution has flaws:
1. It eliminates asynchrony, thus heavily reducing scraping efficiency
2. Should an exception appear while processing the links "stack", no item will be yielded at all
3. What if there is a huge amount of child items?
To deal with (1) I considered this:
import threading

from scrapy.loader import ItemLoader

class CatLoader(ItemLoader):
    def __init__(self, item=None, selector=None, response=None, parent=None, **context):
        super(self.__class__, self).__init__(item, selector, response, parent, **context)
        self.lock = threading.Lock()
        self.counter = 0

    def dec_counter(self):
        self.lock.acquire()
        self.counter -= 1
        self.lock.release()
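As an aside, the lock is unnecessary: Scrapy runs all callbacks in the single Twisted reactor thread, so the counter is never touched from two threads at once and a plain decrement is already safe:

def dec_counter(self):
    # all Scrapy callbacks run in one reactor thread, so no Lock is needed
    self.counter -= 1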
Then in the parser:
if len(urls) == 0:
    self.logger.warning('Cat without items, url: ' + response.url)
    item = cl.load_item()
    yield item
cl.counter = len(urls)
for url in urls:
    rq = Request(url, self.parse_item)
    rq.meta['loader'] = cl
    yield rq
And in parse_item() I can do:
def parse_item(self, response):
    l = response.meta['loader']
    l.dec_counter()
    if l.counter == 0:
        yield l.load_item()
BUT! To deal with (2) I need to do this in each function:
def parse_item(self, response):
    try:
        l = response.meta['loader']
        # ... the actual parsing, which might raise ...
    finally:
        l.dec_counter()
        if l.counter == 0:
            yield l.load_item()
Which I consider an inelegant solution. So could anyone help with something better? Also, I intend to insert the items into a DB rather than produce JSON output, so maybe it is better to create the item with a promise and write a pipeline that parses the children to check whether the promise is fulfilled (i.e. when the item has been inserted into the DB), or something like that?
UPD: Hierarchic items: category -> article -> images. All to be saved in different tables with proper relations. So:
1) Articles must be inserted into their table AFTER the category.
2) An article must know the ID of its category to form the relation.
The same applies to the image records.
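One way to address flaw (2) without a try/finally in every callback is the errback argument of Request, which Scrapy calls when a request fails at download time (timeouts, DNS errors, and, with the HttpError middleware, bad statuses). A sketch under the question's setup, with on_child_failure as a hypothetical helper; note that exceptions raised inside the callback itself are not routed to the errback, so those would still need the try/finally or a spider_error signal handler:

for url in urls:
    rq = Request(url, self.parse_item, errback=self.on_child_failure)
    rq.meta['loader'] = cl
    yield rq

def on_child_failure(self, failure):
    # the errback receives a twisted Failure; the original request is failure.request
    l = failure.request.meta['loader']
    l.dec_counter()
    if l.counter == 0:
        yield l.load_item()  # in current Scrapy, errback output is processed like callback output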

scrapy run order, prints before it's assigned

So the problem is the run order: it basically runs the functions last.
import scrapy

class uppspider(scrapy.Spider):
    start_urls = ['something.com']

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'login': '', 'Password': ''},
            callback=self.after_login
        )

    def after_login(self, response):
        # check login succeeded before going on
        return Request(url="", callback=self.ret)

    def ret(self, response):
        # scraping
        yield scrapy.Request(callback=self.parse_tastypage)

    def parse_tastypage(self, response):
        item = uppItem()
        er = response.status
        self = list()
        self.append(er)
        # scraping
        yield item

print "whatever i print here, prints before the spider"
mylist = list()
parse_tastypage(mylist, 0)
print (mylist)
So if I want to print a variable assigned in one of these functions, it doesn't work, because the print runs before the function has assigned it.
import logging

class uppspider(scrapy.Spider):
    mylist = list()

    def parse_tastypage(self, response):
        # store the scraped data on the spider instance
        self.mylist = ['some data']
        logging.info(self.mylist)  # this will print ['some data']
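The underlying point is that Scrapy is asynchronous: module-level code runs when the file is imported, long before the engine has scheduled a single request, which is why the prints appear "before the spider". If the goal is to print data collected during the crawl, the spider's closed() hook runs once the crawl has finished. A minimal sketch (the URL and collected data are placeholders):

import scrapy

class uppspider(scrapy.Spider):
    name = 'upp'
    start_urls = ['http://something.com']

    def parse(self, response):
        # collect whatever is needed during the crawl
        self.mylist = [response.status]

    def closed(self, reason):
        # Scrapy calls this after the spider finishes, so the data is complete here
        self.logger.info('collected: %s', self.mylist)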

Passing an argument to a callback function [duplicate]

This question already has answers here:
Python Argument Binders
(7 answers)
Closed last month.
def parse(self, response):
    for sel in response.xpath('//tbody/tr'):
        item = HeroItem()
        item['hclass'] = response.request.url.split("/")[8].split('-')[-1]
        item['server'] = response.request.url.split('/')[2].split('.')[0]
        item['hardcore'] = len(response.request.url.split("/")[8].split('-')) == 3
        item['seasonal'] = response.request.url.split("/")[6] == 'season'
        item['rank'] = sel.xpath('td[@class="cell-Rank"]/text()').extract()[0].strip()
        item['battle_tag'] = sel.xpath('td[@class="cell-BattleTag"]//a/text()').extract()[1].strip()
        item['grift'] = sel.xpath('td[@class="cell-RiftLevel"]/text()').extract()[0].strip()
        item['time'] = sel.xpath('td[@class="cell-RiftTime"]/text()').extract()[0].strip()
        item['date'] = sel.xpath('td[@class="cell-RiftTime"]/text()').extract()[0].strip()
        url = 'https://' + item['server'] + '.battle.net/' + sel.xpath('td[@class="cell-BattleTag"]//a/@href').extract()[0].strip()
        yield Request(url, callback=self.parse_profile)

def parse_profile(self, response):
    sel = Selector(response)
    item = HeroItem()
    item['weapon'] = sel.xpath('//li[@class="slot-mainHand"]/a[@class="slot-link"]/@href').extract()[0].split('/')[4]
    return item
Well, I'm scraping a whole table in the main parse method, and I have taken several fields from that table. One of these fields is a URL, and I want to explore it to get a whole new bunch of fields. How can I pass my already created item object to the callback function so the final item keeps all the fields?
As shown in the code above, I'm able to save either the fields inside the URL (code at the moment) or only the fields in the table (by simply writing yield item),
but I can't yield a single object with all the fields together.
I have tried this, but obviously, it doesn't work.
yield Request(url, callback=self.parse_profile(item))

def parse_profile(self, response, item):
    sel = Selector(response)
    item['weapon'] = sel.xpath('//li[@class="slot-mainHand"]/a[@class="slot-link"]/@href').extract()[0].split('/')[4]
    return item
This is what you'd use the meta keyword for.
def parse(self, response):
    for sel in response.xpath('//tbody/tr'):
        item = HeroItem()
        # item assignment here
        url = 'https://' + item['server'] + '.battle.net/' + sel.xpath('td[@class="cell-BattleTag"]//a/@href').extract()[0].strip()
        yield Request(url, callback=self.parse_profile, meta={'hero_item': item})

def parse_profile(self, response):
    item = response.meta.get('hero_item')
    item['weapon'] = response.xpath('//li[@class="slot-mainHand"]/a[@class="slot-link"]/@href').extract()[0].split('/')[4]
    yield item
Also note, doing sel = Selector(response) is a waste of resources and differs from what you did earlier, so I changed it. It's automatically mapped in the response as response.selector, which also has the convenience shortcut of response.xpath.
Here's a better way to pass args to callback function:
def parse(self, response):
    request = scrapy.Request('http://www.example.com/index.html',
                             callback=self.parse_page2,
                             cb_kwargs=dict(main_url=response.url))
    request.cb_kwargs['foo'] = 'bar'  # add more arguments for the callback
    yield request

def parse_page2(self, response, main_url, foo):
    yield dict(
        main_url=main_url,
        other_url=response.url,
        foo=foo,
    )
source: https://docs.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-request-callback-arguments
I had a similar issue with Tkinter's extra argument passing and found this solution to work (see http://infohost.nmt.edu/tcc/help/pubs/tkinter/web/extra-args.html); converted to your problem:
def parse(self, response):
    item = HeroItem()
    [...]
    def handler(response, self=self, item=item):
        """ pass the extra values as default argument values;
        Scrapy supplies the response as the first positional argument """
        return self.parse_profile(response, item)
    yield Request(url, callback=handler)
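This works because default argument values are evaluated once, when handler is defined, so each request gets its own bound item; a plain closure would instead see whatever the enclosing variables hold when the callback finally runs. A standalone illustration of the difference:

funcs_late = [lambda: i for i in range(3)]       # all three close over the same i
funcs_bound = [lambda i=i: i for i in range(3)]  # i is bound at definition time

print([f() for f in funcs_late])   # [2, 2, 2]
print([f() for f in funcs_bound])  # [0, 1, 2]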
@peduDev
Tried your approach, but something failed due to an unexpected keyword.
scrapy_req = scrapy.Request(url=url,
                            callback=self.parseDetailPage,
                            cb_kwargs=dict(participant_id=nParticipantId))

def parseDetailPage(self, response, participant_id):
    # .. some code here ..
    yield MyParseResult(
        # .. some code here ..
        participant_id=participant_id
    )
Error reported:
    , cb_kwargs=dict(participant_id=nParticipantId)
TypeError: __init__() got an unexpected keyword argument 'cb_kwargs'
Any idea what caused the unexpected keyword argument, other than perhaps a too-old Scrapy version?
Yep. I verified my own suggestion, and after an upgrade it all worked as expected.
sudo pip install --upgrade scrapy
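For reference, cb_kwargs was added in Scrapy 1.7, so on older versions Request.__init__ rejects it with exactly this TypeError. A quick way to check the installed version:

import scrapy
print(scrapy.__version__)  # needs to be >= 1.7 for cb_kwargs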

Scrapy recursive parse: what am I doing wrong here?

I am trying to scrape ASPX websites with a list view; the structure of each page is the same, hence the recursive spider calls.
Error: ERROR: Spider must return Request, BaseItem or None, got 'list'
I am not sure what this error means.
I am doing something wrong, probably something very basic, but I can't identify it. Point me in the right direction. Thanks.
My code:
name = "XYZscraper"
allowed_domains = ["xyz.com"]
def __init__(self):
self.start_urls = [
"xyz.com with aspx list viwe",
]
def parse(self, response):
sel = Selector(response)
if sel.xpath('//table/tr/td/form/table/tr'):
print "xpath is present"
elements = sel.xpath('//table/tr/td/form/table/tr')
else:
print "xpath not present "
print " going in with fallback xpath"
elements = sel.xpath('///table/tr')
counter = 1
nextPageAvailable = False # flat if netx page link is available or not
base_url = "xyz.com/"
try:
items = response.meta['item']
except Exception as e:
items = []
pass
no_of_row = len(elements)
for each_row in elements:
#first two row and last two row does not have data
#first and last row have link to previous and next page ...using first row for navigation
if counter == 1:
if each_row.xpath('td/a[1]/text()').extract()[0] == "Previous":
if each_row.xpath('td/a[2]/text()'):
if each_row.xpath('td/a[2]/text()').extract()[0] == "Next":
nextPageAvailable = True
elif each_row.xpath('td/a[1]/text()').extract()[0] == "Next":
nextPageAvailable = True
if counter > 2:
if counter < (no_of_row - 1):
item = myItem()
item['title'] = each_row.xpath('td/div/a/span/text()').extract()[0].encode('ascii', 'ignore') # Title
items.append(item)
counter += 1
if nextPageAvailable:
yield FormRequest.from_response(
response,
meta={'item': items},
formnumber=1,
formdata={
'__EVENTTARGET': 'ctl00$ctl10$EventsDG$ctl01$ctl01', #for request to navigate to next page in table
},
callback=self.parse # calling recursive function since signature of page will remain same just data is refreshed
)
else:
# when end of the list is arrived ...calling next functin to pop item ..may be !! does not work !!
self.popItems(response)
# does not work
# Error: python < 3.3 does not allow return with argument inside the generator
# return item
def popItems(self, response):
print "i am here"
items = ()
baseitem = response.meta['item']
items = baseitem
return items
Maybe you mean something like this:

else:
    for item in self.popItems(response):
        yield item

Or the shorter version:

else:
    yield from self.popItems(response)
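For context, the error means a callback handed Scrapy a whole Python list; callbacks must yield (or return) Request and item objects individually. An equivalent option is to make popItems itself a generator and delegate to it as shown above:

def popItems(self, response):
    # a generator: each accumulated item is yielded individually
    for item in response.meta['item']:
        yield item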
