so the problem is the run order, it basically runs the fuctions last
import scrapy
class uppspider(scrapy.Spider):
start_urls = ['something.com']
def parse(self, response):
return scrapy.FormRequest.from_response(
response,
formdata={'login': '', 'Password': ''},
callback=self.after_login
)
def after_login(self, response):
#check login succeed before going on
return Request(url="", callback=self.ret)
def ret(self, response):
#scraping
yield scrapy.Request(callback=self.parse_tastypage)
def parse_tastypage(self, response):
item = uppItem()
er = response.status
self = list()
self.append(er)
#scraping
yield item
print "whatever i print here, prints before the spider"
mylist = list()
parse_tastypage(mylist, 0)
print (mylist)
so if i want to print a variable assigned in a function. It doesn't work because it gets printed before assigned in function.
import logging
class uppspider(scrapy.Spider):
mylist = list()
def parse_tastypage(self):
# access the above declared list like this
self.mylist = ['some data']
parse_tastypage()
logging.info(mylist) # this will print ['some data']
Related
What am I doing wrong?
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.template = {'name':'','title':'','tab_1_value':{},'tab_1_description':'','tab_2_value':{},'tab_2_description':''}
def parse(self, response):
links = response.xpath('//table/tbody/tr/td/div/a/#href').extract()
page_no = response.request.url.split('=')[-1]
with tqdm(total=len(links)) as pbar:
for link_index, link in enumerate(links):
pbar.set_description("Processing page {}".format(page_no))
pbar.update(1)
page = response.urljoin(link)
yield scrapy.Request(page, callback=self.parse_company)
def parse_company(self, response):
#Successfully inscribed in self.template['name']
self.template['name'] = response.xpath('//div/div[#class="name"]/text()').get()
#Successfully inscribed in self.template['title']
self.template['title'] = response.xpath('//div/div[#class="title"]/text()').get()
content_links = response.xpath('//ul/li/a/#href').extract()
for content_link in content_links:
content_page = response.urljoin(content_link)
if response.request.url.split('/')[-2] == 'tab_1':
yield scrapy.Request(content_page, callback=self.tab_1)
if response.request.url.split('/')[-2] == 'tab_2':
yield scrapy.Request(content_page, callback=self.tab_2)
#It does not enter values here at all in the self.template
def tab_1(self, response):
self.template['tab_1_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
self.template['tab_1_description'] = response.xpath('//div/div/p/text()').get()
#It does not enter values here at all in the self.template
def tab_2(self, response):
self.template['tab_2_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
self.template['tab_2_description'] = response.xpath('//div/div/p/text()').get()
When I check if it goes into tab_1 and/or tab_2... it goes into. When I check if there are values of variables in within functions... everything is fine. It is not clear to me why these values will not be written in the self.template.
Does anyone know why the values will not be assigned to the appropriate elements in the self.template or if there is a better way to do the same?
Maybe there is a better solution, but I managed to solve the problem by sending a self.template through the meta argument through yield.
Maybe this is helpful to somebody...
if response.request.url.split('/')[-2] == 'tab_1':
yield scrapy.Request(content_page, callback=self.tab_1, meta={'template':self.template})
if response.request.url.split('/')[-2] == 'tab_2':
yield scrapy.Request(content_page, callback=self.tab_2, meta={'template':self.template})
And the values are assigned this way:
def tab_1(self, response):
response.meta.get['template']['tab_1_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
response.meta.get['template']['tab_1_description'] = response.xpath('//div/div/p/text()').get()
def tab_2(self, response):
response.meta.get['template']['tab_2_value'] = self.valueSeparation(response.xpath('//div/h2/strong/span/text()').get())
response.meta.get['template']['tab_2_description'] = response.xpath('//div/div/p/text()').get()
How to get results of scrapy request in a usable variable.
def parse_node(self,response,node):
yield Request('LINK',callback=self.parse_listing)
def parse_listing(self,response):
for agent in string.split(response.xpath('//node[#id="Agent"]/text()').extract_first() or "",'^'):
HERE=Request('LINK',callback=self.parse_agent)
print HERE
def parse_agent(self,response):
yield response.xpath('//node[#id="Email"]/text()').extract_first()
I am trying to get results from my HERE=Request('LINK',callback=self.parse_agent) and print them. The parse_agent should pick up an email but I would like to get it and use it inside parse_listing.
Based on your comments under the first answer, I think what you really need is using scrapy-inline-requests for the purpose (see the example there). Your code would look something like this:
def parse_node(self, response, node):
yield Request('LINK', callback=self.parse_listing)
#inline_requests
def parse_listing(self, response):
for agent in string.split(response.xpath('//node[#id="Agent"]/text()').extract_first() or "",'^'):
agent_response = yield Request('LINK')
email = agent_response.xpath('//node[#id="Email"]/text()').extract_first()
def parse_listing(self, response):
for agent in string.split(response.xpath('//node[#id="Agent"]/text()').extract_first() or "", '^'):
HERE = scrapy.Request('LINK', callback=self.parse_agent)
# call this req or something calls parse_agent(link)
yield HERE # this will yield to callback which will print or log
def parse_agent(self, response):
print response #response is the parsed page from HERE)
email = response.xpath('//node[#id="Email"]/text()').extract_first() #something
print email # logging is better
#import logging
#logging.log(logging.INFO, "info from page")
yield email #yield to whatever function
This question already has answers here:
Python Argument Binders
(7 answers)
Closed last month.
def parse(self, response):
for sel in response.xpath('//tbody/tr'):
item = HeroItem()
item['hclass'] = response.request.url.split("/")[8].split('-')[-1]
item['server'] = response.request.url.split('/')[2].split('.')[0]
item['hardcore'] = len(response.request.url.split("/")[8].split('-')) == 3
item['seasonal'] = response.request.url.split("/")[6] == 'season'
item['rank'] = sel.xpath('td[#class="cell-Rank"]/text()').extract()[0].strip()
item['battle_tag'] = sel.xpath('td[#class="cell-BattleTag"]//a/text()').extract()[1].strip()
item['grift'] = sel.xpath('td[#class="cell-RiftLevel"]/text()').extract()[0].strip()
item['time'] = sel.xpath('td[#class="cell-RiftTime"]/text()').extract()[0].strip()
item['date'] = sel.xpath('td[#class="cell-RiftTime"]/text()').extract()[0].strip()
url = 'https://' + item['server'] + '.battle.net/' + sel.xpath('td[#class="cell-BattleTag"]//a/#href').extract()[0].strip()
yield Request(url, callback=self.parse_profile)
def parse_profile(self, response):
sel = Selector(response)
item = HeroItem()
item['weapon'] = sel.xpath('//li[#class="slot-mainHand"]/a[#class="slot-link"]/#href').extract()[0].split('/')[4]
return item
Well, I'm scraping a whole table in the main parse method and I have taken several fields from that table. One of these fields is an url and I want to explore it to get a whole new bunch of fields. How can I pass my already created ITEM object to the callback function so the final item keeps all the fields?
As it is shown in the code above, I'm able to save the fields inside the url (code at the moment) or only the ones in the table (simply write yield item)
but I can't yield only one object with all the fields together.
I have tried this, but obviously, it doesn't work.
yield Request(url, callback=self.parse_profile(item))
def parse_profile(self, response, item):
sel = Selector(response)
item['weapon'] = sel.xpath('//li[#class="slot-mainHand"]/a[#class="slot-link"]/#href').extract()[0].split('/')[4]
return item
This is what you'd use the meta Keyword for.
def parse(self, response):
for sel in response.xpath('//tbody/tr'):
item = HeroItem()
# Item assignment here
url = 'https://' + item['server'] + '.battle.net/' + sel.xpath('td[#class="cell-BattleTag"]//a/#href').extract()[0].strip()
yield Request(url, callback=self.parse_profile, meta={'hero_item': item})
def parse_profile(self, response):
item = response.meta.get('hero_item')
item['weapon'] = response.xpath('//li[#class="slot-mainHand"]/a[#class="slot-link"]/#href').extract()[0].split('/')[4]
yield item
Also note, doing sel = Selector(response) is a waste of resources and differs from what you did earlier, so I changed it. It's automatically mapped in the response as response.selector, which also has the convenience shortcut of response.xpath.
Here's a better way to pass args to callback function:
def parse(self, response):
request = scrapy.Request('http://www.example.com/index.html',
callback=self.parse_page2,
cb_kwargs=dict(main_url=response.url))
request.cb_kwargs['foo'] = 'bar' # add more arguments for the callback
yield request
def parse_page2(self, response, main_url, foo):
yield dict(
main_url=main_url,
other_url=response.url,
foo=foo,
)
source: https://docs.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-request-callback-arguments
I had a similar issue with Tkinter's extra argument passing, and found this solution to work (here: http://infohost.nmt.edu/tcc/help/pubs/tkinter/web/extra-args.html), converted to your problem:
def parse(self, response):
item = HeroItem()
[...]
def handler(self = self, response = response, item = item):
""" passing as default argument values """
return self.parse_profile(response, item)
yield Request(url, callback=handler)
#peduDev
Tried your approach but something failed due to an unexpected keyword.
scrapy_req = scrapy.Request(url=url,
callback=self.parseDetailPage,
cb_kwargs=dict(participant_id=nParticipantId))
def parseDetailPage(self, response, participant_id ):
.. Some code here..
yield MyParseResult (
.. some code here ..
participant_id = participant_id
)
Error reported
, cb_kwargs=dict(participant_id=nParticipantId)
TypeError: _init_() got an unexpected keyword argument 'cb_kwargs'
Any idea what caused the unexpected keyword argument other than perhaps an to old scrapy version?
Yep. I verified my own suggestion and after an upgrade it all worked as suspected.
sudo pip install --upgrade scrapy
I am trying to scrape aspx websites list view , hence structure of each page will be same and ( hence i am using recursive spider call's)
Error: ERROR: Spider must return Request, BaseItem or None, got 'list'
not sure what this error means ..
I am doing something wrong , very basic but can't identify ...point me in the right direction..Thanks
My Code:
name = "XYZscraper"
allowed_domains = ["xyz.com"]
def __init__(self):
self.start_urls = [
"xyz.com with aspx list viwe",
]
def parse(self, response):
sel = Selector(response)
if sel.xpath('//table/tr/td/form/table/tr'):
print "xpath is present"
elements = sel.xpath('//table/tr/td/form/table/tr')
else:
print "xpath not present "
print " going in with fallback xpath"
elements = sel.xpath('///table/tr')
counter = 1
nextPageAvailable = False # flat if netx page link is available or not
base_url = "xyz.com/"
try:
items = response.meta['item']
except Exception as e:
items = []
pass
no_of_row = len(elements)
for each_row in elements:
#first two row and last two row does not have data
#first and last row have link to previous and next page ...using first row for navigation
if counter == 1:
if each_row.xpath('td/a[1]/text()').extract()[0] == "Previous":
if each_row.xpath('td/a[2]/text()'):
if each_row.xpath('td/a[2]/text()').extract()[0] == "Next":
nextPageAvailable = True
elif each_row.xpath('td/a[1]/text()').extract()[0] == "Next":
nextPageAvailable = True
if counter > 2:
if counter < (no_of_row - 1):
item = myItem()
item['title'] = each_row.xpath('td/div/a/span/text()').extract()[0].encode('ascii', 'ignore') # Title
items.append(item)
counter += 1
if nextPageAvailable:
yield FormRequest.from_response(
response,
meta={'item': items},
formnumber=1,
formdata={
'__EVENTTARGET': 'ctl00$ctl10$EventsDG$ctl01$ctl01', #for request to navigate to next page in table
},
callback=self.parse # calling recursive function since signature of page will remain same just data is refreshed
)
else:
# when end of the list is arrived ...calling next functin to pop item ..may be !! does not work !!
self.popItems(response)
# does not work
# Error: python < 3.3 does not allow return with argument inside the generator
# return item
def popItems(self, response):
print "i am here"
items = ()
baseitem = response.meta['item']
items = baseitem
return items
Maybe you mean something like this:
else:
for item in self.popItems(response):
yield item
Or the shorter version:
else:
yield from self.popItems(response)
If the spider gets redirect, then it should do request again, but with different parameters.
The callback in second Request is not performed.
If I use different urls in start and checker methods, it's works fine. I think requests are using lazy loads and this is why my code isn't working, but not sure.
from scrapy.http import Request
from scrapy.spider import BaseSpider
class TestSpider(BaseSpider):
def start(self, response):
return Request(url = 'http://localhost/', callback=self.checker, meta={'dont_redirect': True})
def checker(self, response):
if response.status == 301:
return Request(url = "http://localhost/", callback=self.results, meta={'dont_merge_cookies': True})
else:
return self.results(response)
def results(self, response):
# here I work with response
Not sure if you still need this but I have put together an example. If you have a specific website in mind, we can all definitely take a look at it.
from scrapy.http import Request
from scrapy.spider import BaseSpider
class TestSpider(BaseSpider):
name = "TEST"
allowed_domains = ["example.com", "example.iana.org"]
def __init__(self, **kwargs):
super( TestSpider, self ).__init__(**kwargs)\
self.url = "http://www.example.com"
self.max_loop = 3
self.loop = 0 # We want it to loop 3 times so keep a class var
def start_requests(self):
# I'll write it out more explicitly here
print "OPEN"
checkRequest = Request(
url = self.url,
meta = {"test":"first"},
callback = self.checker
)
return [ checkRequest ]
def checker(self, response):
# I wasn't sure about a specific website that gives 302
# so I just used 200. We need the loop counter or it will keep going
if(self.loop<self.max_loop and response.status==200):
print "RELOOPING", response.status, self.loop, response.meta['test']
self.loop += 1
checkRequest = Request(
url = self.url,
callback = self.checker
).replace(meta = {"test":"not first"})
return [checkRequest]
else:
print "END LOOPING"
self.results(response) # No need to return, just call method
def results(self, response):
print "DONE" # Do stuff here
In settings.py, set this option
DUPEFILTER_CLASS = 'scrapy.dupefilter.BaseDupeFilter'
This is actually what turns off the filter for duplicate site requests. It's confusing because the BaseDupeFilter is not actually the default since it doesn't really filter anything. This means we will submit 3 different requests that will loop through the checker method. Also, I am using scrapy 0.16:
>scrapy crawl TEST
>OPEN
>RELOOPING 200 0 first
>RELOOPING 200 1 not first
>RELOOPING 200 2 not first
>END LOOPING
>DONE