I am using Scrapy to crawl a website.
In code I am using more than one call back function, where data related to one search result will get retrived in two call back functions. Like,
class PubmedProjSpider(CrawlSpider):
name = str(CONFIG.get('project_name', 'project_name'))
start_urls = ['https://pubmed.ncbi.nlm.nih.gov/?term=(((((((((((((((((((((((((sodium%20oxybate%5BText%20Word%5D)%20OR%20(Xyrem%5BText%20Word%5D))%20OR%20(certolizumab%20pegol%5BText%20Word%5D))%20OR%20(Cimzia%5BText%20Word%5D))%20OR%20(vancomycin%20hydrochloride%5BText%20Word%5D))%20OR%20(Vancomycin%5BText%20Word%5D))%20OR%20(Vancocin%5BText%20Word%5D))%20OR%20(atorvastatin%20calcium%20trihydrate%5BText%20Word%5D))%20OR%20(atorvastatin%5BText%20Word%5D))%20OR%20(Lipitor))%20OR%20(alprostadil%5BText%20Word%5D))%20OR%20(Caverject%5BText%20Word%5D))%20OR%20(atenolol%5BText%20Word%5D))%20OR%20(Tenormin%5BText%20Word%5D))%20OR%20(tramadol%20hydrochloride%5BText%20Word%5D))%20OR%20(tramadol%5BText%20Word%5D))%20OR%20(Maneo%5BText%20Word%5D))%20OR%20(temazepam%5BText%20Word%5D))%20OR%20(citalopram%20hydrobromide%5BText%20Word%5D))%20OR%20(citalopram%5BText%20Word%5D))%20OR%20(Cipramil%5BText%20Word%5D))%20OR%20(fluticasone%20propionate%5BText%20Word%5D))%20OR%20(fluticasone%5BText%20Word%5D))%20OR%20(Cutivate%5BText%20Word%5D)))%20AND%20((%222020%2F03%2F03%22%5BDate%20-%20Create%5D%20%3A%20%222020%2F03%2F05%22%5BDate%20-%20Create%5D))&filter=simsearch2.ffrft&pos=6']
path = r"C:\Users\vighnesh.paramasivam\Documents\pubmed_organised_copy\pubmed_organised\pubmed\pubmed\output_pdf_files"
def __init__(self):
self.file_storage_location = CONFIG.get('storage_location', 'text_storage_destination')
def parse(self, response):
try:
hxs = Selector(response)
items = []
titles = hxs.xpath("//div[#class='docsum-wrap']//div[#class='docsum-content']")
items.append(titles)
for title in items:
for href in title.xpath("a/#href").extract():
yield Request(
url=response.urljoin(href),
callback=self.parse_article
)
if response.xpath("//button[#class='load-button next-page']"):
temp_url = response.xpath("//div[#data-next-page-url]/#data-next-page-url").getall()[0]
next_page_url = response.urljoin(temp_url)
next_page_url = next_page_url.replace('/more','')
yield Request(
url = next_page_url,
callback=self.parse)
except Exception as message:
#print("###### exception from parse method")
raise CloseSpider(message)
def parse_article(self, response):
try:
w={}
w['title'] = str(' '.join(response.xpath('.//h1[#class="heading-title"]')[0].xpath(".//text()").getall()).encode('utf-8').lstrip().rstrip())
w['url'] = str(response).split(' ')[-1].strip('>')
w['pmcid'] = str(response.xpath(".//ul/li/span[#class='identifier pubmed']/strong[#title='PubMed ID']/text()").getall()[0])
w['authors'] = response.xpath('//div[#class="inline-authors"]/div[#class="authors"]/div[#class="authors-list"]/span/a/text()').getall()
abstract = {'Free-Text':[]}
w['pdf_downloaded'] = 'No'
w['pdf_links'] = ''
q = response.xpath("//div[#class='abstract'][#id='abstract']").getall()
if response.xpath("//div[#class='full-text-links-list']/a/#href"):
w['pdf_links'] = list(set(response.xpath("//div[#class='full-text-links-list']/a/#href").getall()))
if q:
for i in response.xpath("//div[#class='abstract'][#id='abstract']/div[#class='abstract-content selected']/p"):
strong_format = i.xpath("./strong//text()").getall()
bold_format = i.xpath("./b/text()").getall()
if strong_format:
abstract[i.xpath("./strong//text()").getall()[0].strip().strip(':').lstrip()] = ' '.join(i.xpath("./text()").getall()).lstrip().rstrip()
elif bold_format:
headings = response.xpath("//div[#class='abstract'][#id='abstract']/div[#class='abstract-content selected']/p/b/text()").getall()
if headings:
if response.xpath('normalize-space(substring-before(string(//div[#class="abstract"][#id="abstract"]/div[#class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[0])).getall():
abstract['Free-Text'] = response.xpath('normalize-space(substring-before(string(//div[#class="abstract"][#id="abstract"]/div[#class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[0])).getall()[0]
for num, header in enumerate(headings):
if num != len(headings)-1:
abstract[header] = response.xpath('normalize-space(substring-before(substring-after(string(//div[#class="abstract"][#id="abstract"]/div[#class="abstract-content selected"]/p),//b[.="{}"]),//b[.="{}"]))'.format(headings[num], headings[num+1])).getall()[0]
else:
abstract[header] = response.xpath('normalize-space(substring-after(string(//div[#class="abstract"][#id="abstract"]/div[#class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[num])).getall()[0]
else:
abstract['Free-Text'].append((' '.join(i.xpath(".//text()").getall()).lstrip().rstrip()))
if response.xpath("//div[#class='abstract'][#id='abstract']/p/strong[contains(text(), 'Keywords:')]"):
abstract['Keywords']=' '.join(response.xpath("//div[#class='abstract'][#id='abstract']/p/text()").getall()).strip()
w['abstract'] = abstract
path = os.path.join(self.file_storage_location,'PMCID_'+w['pmcid']+'.txt')
with open(path, 'w') as e:
for p in w.items():
e.write("%s:%s\n\n" % p)
if 'PMC' in response.xpath(".//div[#class='full-text-links-list']/a/#data-ga-action").getall():
pdf_url = response.xpath(".//div[#class='full-text-links-list']/a[#data-ga-action='PMC']/#href").getall()[0]
#for href in response.css('a[href$=".pdf"]::attr(href)').extract():
yield Request(
url=response.urljoin(pdf_url),
callback=self.link, meta={'hero_item': w['pmcid']}
)
yield(w)
except Exception as message:
#print("###############Exception from parse_article")
raise CloseSpider(message)
def link(self, response):
print("################# entering link function")
try:
if response.xpath('.//div[#class="format-menu"]/ul/li/a[contains(text(), "PDF")]/#href'):
link1 = response.xpath('.//div[#class="format-menu"]/ul/li/a[contains(text(), "PDF")]/#href').getall()[0]
item = response.meta.get('hero_item')
yield Request(
url=response.urljoin(link1),
callback=self.save_pdf, meta={'hero_item': item}
)
except Exception as message:
#print("###############Exception from link")
pass
def save_pdf(self, response):
try:
print("################# entering pdf function")
item = response.meta.get('hero_item')
path = self.path + "\\"+ "PMCID_" + item + '.pdf'
self.logger.info('Saving PDF %s', path)
with open(path, 'wb') as f:
f.write(response.body)
except Exception as message:
pass
As in the above code all the details are getting extracted in "parse_article , but one information as of whether "pdf_downloaded" will be decided from save_pdf function which is a call back function.
Now the data is there in two call back functions, how can I append then before stroing them.
Any help is appreciated!!
Related
I'm getting the following traceback but unsure how to refactor.
ValueError: Missing scheme in request url: #mw-head
Full code:
class MissleSpiderBio(scrapy.Spider):
name = 'missle_spider_bio'
allowed_domains = ['en.wikipedia.org']
start_urls = ['https://en.wikipedia.org/wiki/...']
this is the part giving me issues (I believe)
def parse(self, response):
filename = response.url.split('/')[-1]
table = response.xpath('///div/table[2]/tbody')
rows = table.xpath('//tr')
row = rows[2]
row.xpath('td//text()')[0].extract()
wdata = {}
for row in response.xpath('//* \
[#class="wikitable"]//tbody//tr'):
for link in response.xpath('//a/#href'):
link = link.extract()
if((link.strip() != '')):
yield Request(link, callback=self.parse)
#wdata.append(link)
else:
yield None
#wdata = {}
#wdata['link'] = BASE_URL +
#row.xpath('a/#href').extract() #[0]
wdata['link'] = BASE_URL + link
request = scrapy.Request(wdata['link'],\
callback=self.get_mini_bio, dont_filter=True)
request.meta['item'] = MissleItem(**wdata)
yield request
here is the second part of the code:
def get_mini_bio(self, response):
BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
item = response.meta['item']
item['image_urls'] = []
img_src = response.xpath('//table[contains(#class, \
"infobox")]//img/#src')
if img_src:
item['image_urls'] = ['http:' + img_src[0].extract()]
mini_bio = ''
paras = response.xpath('//*[#id="mw-content-text"]/p[text()\
or normalize-space(.)=""]').extract()
for p in paras:
if p =='<p></p>':
break
mini_bio += p
mini_bio = mini_bio.replace('href="/wiki', 'href="' + \
BASE_URL + '/wiki')
mini_bio = mini_bio.replace('href="#', item['link'] + '#')
item['mini_bio'] = mini_bio
yield item
I tried refactoring but am now getting a:
ValueError: Missing scheme in request url: #mw-head
any help would be immensely appreciated
Looks like you were on the right track with the commented out [0].
xpath().extract() #returns a list of strings
You need to select the string with [0]
row.xpath('a/#href').extract()
That expression evaluates to a list NOT a string. When you pass the URL to the request object, scrapy expects a string, not a list
To fix this, you have a few options:
You can use LinkExtractors which will allow you to search a page for links and automatically create scrapy request objects for those links:
https://doc.scrapy.org/en/latest/topics/link-extractors.html
OR
You could run a for loop and go through each of the links:
from scrapy.spiders import Request
for link in response.xpath('//a/#href'):
link = link.extract()
if((link.strip() != '')):
yield Request(link, callback=self.parse)
else:
yield None
You can add whatever string filters you want to that code
OR
If you just want the first link, you can use .extract_first() instead of .extract()
I have a scrapy spider that uses XMLFeedSpider. As well as the data returned for each node in parse_node(), I also need to make an additional request to get more data. The only issue, is if I yield an additional request from parse_node() nothing gets returned at all:
class MySpidersSpider(XMLFeedSpider):
name = "myspiders"
namespaces = [('g', 'http://base.google.com/ns/1.0')]
allowed_domains = {"www.myspiders.com"}
start_urls = [
"https://www.myspiders.com/productMap.xml"
]
iterator = 'iternodes'
itertag = 'item'
def parse_node(self, response, node):
if(self.settings['CLOSESPIDER_ITEMCOUNT'] and int(self.settings['CLOSESPIDER_ITEMCOUNT']) == self.item_count):
raise CloseSpider('CLOSESPIDER_ITEMCOUNT limit reached - ' + str(self.settings['CLOSESPIDER_ITEMCOUNT']))
else:
self.item_count += 1
id = node.xpath('id/text()').extract()
title = node.xpath('title/text()').extract()
link = node.xpath('link/text()').extract()
image_link = node.xpath('g:image_link/text()').extract()
gtin = node.xpath('g:gtin/text()').extract()
product_type = node.xpath('g:product_type/text()').extract()
price = node.xpath('g:price/text()').extract()
sale_price = node.xpath('g:sale_price/text()').extract()
availability = node.xpath('g:availability/text()').extract()
item = MySpidersItem()
item['id'] = id[0]
item['title'] = title[0]
item['link'] = link[0]
item['image_link'] = image_link[0]
item['gtin'] = gtin[0]
item['product_type'] = product_type[0]
item['price'] = price[0]
item['sale_price'] = '' if len(sale_price) == 0 else sale_price[0]
item['availability'] = availability[0]
yield Request(item['link'], callback=self.parse_details, meta={'item': item})
def parse_details(self, response):
item = response.meta['item']
item['price_per'] = 'test'
return item
If I change the last line of parse_node() to return item it works fine (without setting price_per in the item, naturally).
Any idea what I'm doing wrong?
Have you tried checking the contents of item['link']? If it is a relative link (example: /products?id=5), the URL won't return anything and the request will fail. You need to make sure it's a resolvable link (example: https://www.myspiders.com/products?id=5).
I discovered the issue - I was limiting the number of items processed in my parse_node() function. However, because of the limit, my spider was terminating prior to the request being made. Moving the code to limit the item processed to my parse_details() function resolves the issue:
def parse_details(self, response):
if(self.settings['CLOSESPIDER_ITEMCOUNT'] and int(self.settings['CLOSESPIDER_ITEMCOUNT']) == self.item_count):
raise CloseSpider('CLOSESPIDER_ITEMCOUNT limit reached - ' + str(self.settings['CLOSESPIDER_ITEMCOUNT']))
else:
self.item_count += 1
item = response.meta['item']
item['price_per'] = 'test'
return item
So I was just wondering what my getURLs function's issue might be. I'm trying to get all urls from within the containing body's string.
My crawler isn't crawling anything because my input urls are invalid.
# Get all URLs contained within the body string
def getURLs(body):
urls = []
tempArr = body.split("a href=")
index = 1
for part in tempArr:
if part[0] == '"':
while (part[index] != '"' and index < len(part)):
index += 1
if index < len(part):
urls.append(part[1:index-1])
index = 1
return urls
# Open file which contains input urls
with open("test_urls.txt","rU") as infile:
urls = [row.strip("\n") for row in infile]
class BackpageSpider(CrawlSpider):
name = 'backpage'
allowed_domains = ['backpage.com']
start_urls = urls
def parse(self,response):
#print response.url
if response.status < 600:
# all_links = response.xpath("//div[contains(#class,'cat')]/a/#href").extract()
#all the links FOR THE ESCORTS on whatever page we're on
todays_links = []
#all the links for today's date
backpage_date = backpage_date_today()
yesterday_date = backpage_date_yesterday()
if backpage_date in response.body:
todays_section = response.body.split(backpage_date)[1].split(yesterday_date)[0].decode('utf-8')
# todays_links = todays_section.xpath("//div[contains(#class,'cat')]/a/#href").extract
todays_links = getURLs(todays_section)
# for url in todays_links:
# todays_links.append(url)
# for url in all_links:
# if url in todays_section:
# todays_links.append(url)
for url in todays_links:
yield scrapy.Request(url,callback=self.parse_ad_into_content)####HERE
for url in set(response.xpath('//a[#class="pagination next"]/#href').extract()):
yield scrapy.Request(url,callback=self.parse)
else:
time.sleep(600)
yield scrapy.Request(response.url,callback=self.parse)
def parse_ad_into_content(self,response):
#ipdb.set_trace()
item = items.BackpageScrapeItem(
url=response.url,
backpage_id=response.url.split('.')[0].split('/')[2].encode('utf-8'),
text = response.body,
posting_body= response.xpath("//div[#class='postingBody']").extract()[0].encode('utf-8'),
date = datetime.utcnow()-timedelta(hours=5),
posted_date = response.xpath("//div[#class='adInfo']/text()").extract()[0].encode('utf-8'),
posted_age = response.xpath("//p[#class='metaInfoDisplay']/text()").extract()[0].encode('utf-8'),
posted_title = response.xpath("//div[#id='postingTitle']//h1/text()").extract()[0].encode('utf-8')
)
return item
The web page is: http://grandisland.backpage.com/FemaleEscorts/?layout=date
I have a main page to crawl name and url.Again need to go to that url and crawl further details like fullname,age,and link. Finally need to return the items with (name,url,age,sex,link) in a single item.
Want to define first level of crawl in one method crawl_page and the second level of crawl in another method crawl_item.
class CrawlLink(CrawlSpider):
name = "crawllink"
allowed_domains = ['www.xyz.org']
start_urls = ["www.xyz.org/profile?page=0"]
rules = [Rule(SgmlLinkExtractor(allow = ('/profile\?page=\d+'),restrict_xpaths = ('//li[#class="pager-next"]',),canonicalize=False ),
callback = 'parse_page',
follow=True)
]
def parse_page(self, response):
self.log ('Started Crawling List %s' %response.url)
items = response.xpath("//div[#id='profile']/div")
ulists = []
for temp in items:
usritem = PostUsers()
usrlink = temp.xpath("./div[#class='name']/a/#href").extract()[0]
usritem ["url"] = 'www.xyz.org'+usrlink
usritem ["namel"] = temp.xpath("//div[#id='user_profile_main']/dl/dd[1]/text()").extract()
for urltemp in usrlink:
yield Request(url=usritem["url"], callback=self.parse_user)
# ulists.append( usritem)
return ulists
def parse_user(self, response):
self.log ('Started Crawling Profile %s' %response.url)
usr = PostUsers()
relative_url = response.xpath("//div[#id='nav-content']/ul/li[2]/a/#href").extract()[0]
usr["link"] = 'www.xyz.org'+relative_url
usr ["age"] = response.xpath("//div[#id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
usr ["fullname"] = response.xpath("//h1[#id='page-title']/text()").extract()
self.log ('Finished Crawling Profile %s' %response.url)
return usr
I'm trying to call a parse function from the main parse function, but it isn't working.
Here is the code:
class CodechefSpider(CrawlSpider):
name = "codechef_crawler"
allowed_domains = ["codechef.com"]
start_urls = ["http://www.codechef.com/problems/easy/","http://www.codechef.com/problems/medium/","http://www.codechef.com/problems/hard/","http://www.codechef.com/problems/challenege/"]
rules = (Rule(SgmlLinkExtractor(allow=('/problems/[A-Z,0-9,-]+')), callback='parse_item'),)
def parse_solution(self,response):
hxs = HtmlXPathSelector(response)
x = hxs.select("//tr[#class='kol']//td[8]").exctract()
f = open('test/'+response.url.split('/')[-1]+'.txt','wb')
f.write(x.encode("utf-8"))
f.close()
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item = Problem()
item['title'] = hxs.select("//table[#class='pagetitle-prob']/tr/td/h1/text()").extract()
item['content'] = hxs.select("//div[#class='node clear-block']//div[#class='content']").extract()
filename = str(item['title'][0])
solutions_url = 'http://www.codechef.com/status/' + response.url.split('/')[-1] + '?language=All&status=15&handle=&sort_by=Time&sorting_order=asc'
Request(solutions_url, callback = self.parse_solution)
f = open('problems/'+filename+'.html','wb')
f.write("<div style='width:800px;margin:50px'>")
for i in item['content']:
f.write(i.encode("utf-8"))
f.write("</div>")
f.close()
parse solution method is not being called. The spider runs without any errors.
You should put yield Request(solutions_url, callback = self.parse_solution) and not just Request(solutions_url, callback = self.parse_solution).