scrapy's Request function is not being called - python

I'm trying to call a parse function from the main parse function, but it isn't working.
Here is the code:
class CodechefSpider(CrawlSpider):
    name = "codechef_crawler"
    allowed_domains = ["codechef.com"]
    start_urls = ["http://www.codechef.com/problems/easy/",
                  "http://www.codechef.com/problems/medium/",
                  "http://www.codechef.com/problems/hard/",
                  "http://www.codechef.com/problems/challenege/"]

    rules = (Rule(SgmlLinkExtractor(allow=('/problems/[A-Z,0-9,-]+')), callback='parse_item'),)

    def parse_solution(self, response):
        hxs = HtmlXPathSelector(response)
        x = hxs.select("//tr[@class='kol']//td[8]").exctract()
        f = open('test/' + response.url.split('/')[-1] + '.txt', 'wb')
        f.write(x.encode("utf-8"))
        f.close()

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = Problem()
        item['title'] = hxs.select("//table[@class='pagetitle-prob']/tr/td/h1/text()").extract()
        item['content'] = hxs.select("//div[@class='node clear-block']//div[@class='content']").extract()
        filename = str(item['title'][0])
        solutions_url = 'http://www.codechef.com/status/' + response.url.split('/')[-1] + '?language=All&status=15&handle=&sort_by=Time&sorting_order=asc'
        Request(solutions_url, callback=self.parse_solution)
        f = open('problems/' + filename + '.html', 'wb')
        f.write("<div style='width:800px;margin:50px'>")
        for i in item['content']:
            f.write(i.encode("utf-8"))
        f.write("</div>")
        f.close()
The parse_solution method is not being called. The spider runs without any errors.

You should yield the request, i.e. yield Request(solutions_url, callback=self.parse_solution), and not just Request(solutions_url, callback=self.parse_solution). Calling Request() only creates a request object; unless the callback yields (or returns) it, Scrapy never schedules it, so parse_solution is never reached.
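A minimal sketch of what the corrected callback could look like (only the request-related lines change; the item fields and the file-writing code from the question are unchanged and omitted here):

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = Problem()
        item['title'] = hxs.select("//table[@class='pagetitle-prob']/tr/td/h1/text()").extract()
        solutions_url = ('http://www.codechef.com/status/' + response.url.split('/')[-1] +
                         '?language=All&status=15&handle=&sort_by=Time&sorting_order=asc')
        # yielding hands the request to the Scrapy engine so parse_solution actually runs
        yield Request(solutions_url, callback=self.parse_solution)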

Related

How to get data back from a function in scrapy

I am using Scrapy to crawl a website.
In the code I am using more than one callback function, and the data related to a single search result is retrieved across two callback functions, like this:
class PubmedProjSpider(CrawlSpider):
    name = str(CONFIG.get('project_name', 'project_name'))
    start_urls = ['https://pubmed.ncbi.nlm.nih.gov/?term=(((((((((((((((((((((((((sodium%20oxybate%5BText%20Word%5D)%20OR%20(Xyrem%5BText%20Word%5D))%20OR%20(certolizumab%20pegol%5BText%20Word%5D))%20OR%20(Cimzia%5BText%20Word%5D))%20OR%20(vancomycin%20hydrochloride%5BText%20Word%5D))%20OR%20(Vancomycin%5BText%20Word%5D))%20OR%20(Vancocin%5BText%20Word%5D))%20OR%20(atorvastatin%20calcium%20trihydrate%5BText%20Word%5D))%20OR%20(atorvastatin%5BText%20Word%5D))%20OR%20(Lipitor))%20OR%20(alprostadil%5BText%20Word%5D))%20OR%20(Caverject%5BText%20Word%5D))%20OR%20(atenolol%5BText%20Word%5D))%20OR%20(Tenormin%5BText%20Word%5D))%20OR%20(tramadol%20hydrochloride%5BText%20Word%5D))%20OR%20(tramadol%5BText%20Word%5D))%20OR%20(Maneo%5BText%20Word%5D))%20OR%20(temazepam%5BText%20Word%5D))%20OR%20(citalopram%20hydrobromide%5BText%20Word%5D))%20OR%20(citalopram%5BText%20Word%5D))%20OR%20(Cipramil%5BText%20Word%5D))%20OR%20(fluticasone%20propionate%5BText%20Word%5D))%20OR%20(fluticasone%5BText%20Word%5D))%20OR%20(Cutivate%5BText%20Word%5D)))%20AND%20((%222020%2F03%2F03%22%5BDate%20-%20Create%5D%20%3A%20%222020%2F03%2F05%22%5BDate%20-%20Create%5D))&filter=simsearch2.ffrft&pos=6']
    path = r"C:\Users\vighnesh.paramasivam\Documents\pubmed_organised_copy\pubmed_organised\pubmed\pubmed\output_pdf_files"

    def __init__(self):
        self.file_storage_location = CONFIG.get('storage_location', 'text_storage_destination')
    def parse(self, response):
        try:
            hxs = Selector(response)
            items = []
            titles = hxs.xpath("//div[@class='docsum-wrap']//div[@class='docsum-content']")
            items.append(titles)
            for title in items:
                for href in title.xpath("a/@href").extract():
                    yield Request(
                        url=response.urljoin(href),
                        callback=self.parse_article
                    )
            if response.xpath("//button[@class='load-button next-page']"):
                temp_url = response.xpath("//div[@data-next-page-url]/@data-next-page-url").getall()[0]
                next_page_url = response.urljoin(temp_url)
                next_page_url = next_page_url.replace('/more', '')
                yield Request(
                    url=next_page_url,
                    callback=self.parse)
        except Exception as message:
            #print("###### exception from parse method")
            raise CloseSpider(message)
    def parse_article(self, response):
        try:
            w = {}
            w['title'] = str(' '.join(response.xpath('.//h1[@class="heading-title"]')[0].xpath(".//text()").getall()).encode('utf-8').lstrip().rstrip())
            w['url'] = str(response).split(' ')[-1].strip('>')
            w['pmcid'] = str(response.xpath(".//ul/li/span[@class='identifier pubmed']/strong[@title='PubMed ID']/text()").getall()[0])
            w['authors'] = response.xpath('//div[@class="inline-authors"]/div[@class="authors"]/div[@class="authors-list"]/span/a/text()').getall()
            abstract = {'Free-Text': []}
            w['pdf_downloaded'] = 'No'
            w['pdf_links'] = ''
            q = response.xpath("//div[@class='abstract'][@id='abstract']").getall()
            if response.xpath("//div[@class='full-text-links-list']/a/@href"):
                w['pdf_links'] = list(set(response.xpath("//div[@class='full-text-links-list']/a/@href").getall()))
            if q:
                for i in response.xpath("//div[@class='abstract'][@id='abstract']/div[@class='abstract-content selected']/p"):
                    strong_format = i.xpath("./strong//text()").getall()
                    bold_format = i.xpath("./b/text()").getall()
                    if strong_format:
                        abstract[i.xpath("./strong//text()").getall()[0].strip().strip(':').lstrip()] = ' '.join(i.xpath("./text()").getall()).lstrip().rstrip()
                    elif bold_format:
                        headings = response.xpath("//div[@class='abstract'][@id='abstract']/div[@class='abstract-content selected']/p/b/text()").getall()
                        if headings:
                            if response.xpath('normalize-space(substring-before(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[0])).getall():
                                abstract['Free-Text'] = response.xpath('normalize-space(substring-before(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[0])).getall()[0]
                            for num, header in enumerate(headings):
                                if num != len(headings) - 1:
                                    abstract[header] = response.xpath('normalize-space(substring-before(substring-after(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]),//b[.="{}"]))'.format(headings[num], headings[num+1])).getall()[0]
                                else:
                                    abstract[header] = response.xpath('normalize-space(substring-after(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[num])).getall()[0]
                    else:
                        abstract['Free-Text'].append((' '.join(i.xpath(".//text()").getall()).lstrip().rstrip()))
            if response.xpath("//div[@class='abstract'][@id='abstract']/p/strong[contains(text(), 'Keywords:')]"):
                abstract['Keywords'] = ' '.join(response.xpath("//div[@class='abstract'][@id='abstract']/p/text()").getall()).strip()
            w['abstract'] = abstract
            path = os.path.join(self.file_storage_location, 'PMCID_' + w['pmcid'] + '.txt')
            with open(path, 'w') as e:
                for p in w.items():
                    e.write("%s:%s\n\n" % p)
            if 'PMC' in response.xpath(".//div[@class='full-text-links-list']/a/@data-ga-action").getall():
                pdf_url = response.xpath(".//div[@class='full-text-links-list']/a[@data-ga-action='PMC']/@href").getall()[0]
                #for href in response.css('a[href$=".pdf"]::attr(href)').extract():
                yield Request(
                    url=response.urljoin(pdf_url),
                    callback=self.link, meta={'hero_item': w['pmcid']}
                )
            yield(w)
        except Exception as message:
            #print("###############Exception from parse_article")
            raise CloseSpider(message)
    def link(self, response):
        print("################# entering link function")
        try:
            if response.xpath('.//div[@class="format-menu"]/ul/li/a[contains(text(), "PDF")]/@href'):
                link1 = response.xpath('.//div[@class="format-menu"]/ul/li/a[contains(text(), "PDF")]/@href').getall()[0]
                item = response.meta.get('hero_item')
                yield Request(
                    url=response.urljoin(link1),
                    callback=self.save_pdf, meta={'hero_item': item}
                )
        except Exception as message:
            #print("###############Exception from link")
            pass
    def save_pdf(self, response):
        try:
            print("################# entering pdf function")
            item = response.meta.get('hero_item')
            path = self.path + "\\" + "PMCID_" + item + '.pdf'
            self.logger.info('Saving PDF %s', path)
            with open(path, 'wb') as f:
                f.write(response.body)
        except Exception as message:
            pass
As the above code shows, all the details are extracted in parse_article, but one piece of information, whether "pdf_downloaded" should be set, is only known inside the save_pdf callback.
Since the data ends up in two callback functions, how can I combine it before storing it?
Any help is appreciated!
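A common pattern for this (a minimal sketch only, reusing the question's w dict and hero_item key with placeholder values, not taken from an answer here) is to pass the whole partially built record through Request.meta and yield it from the last callback, so pdf_downloaded is set before anything is stored:

    import scrapy

    class PdfFlagSpider(scrapy.Spider):        # hypothetical spider, for illustration only
        name = 'pdf_flag_sketch'
        start_urls = ['https://pubmed.ncbi.nlm.nih.gov/']   # placeholder

        def parse(self, response):
            # build the record here, as parse_article does in the question
            w = {'pmcid': 'PMC-placeholder', 'pdf_downloaded': 'No'}
            pdf_href = response.xpath("//div[@class='full-text-links-list']/a/@href").get()
            if pdf_href:
                # hand the whole dict to the next callback instead of yielding it now
                yield scrapy.Request(response.urljoin(pdf_href),
                                     callback=self.save_pdf,
                                     meta={'hero_item': w})
            else:
                yield w                          # no PDF link: store the record as-is

        def save_pdf(self, response):
            w = response.meta['hero_item']
            with open('PMCID_' + w['pmcid'] + '.pdf', 'wb') as f:
                f.write(response.body)
            w['pdf_downloaded'] = 'Yes'          # the flag is only known here
            yield w                              # yield the completed record from the last callback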

Scrapy - ValueError: Missing scheme in request url: #mw-head

I'm getting the following traceback but am unsure how to refactor.
ValueError: Missing scheme in request url: #mw-head
Full code:
class MissleSpiderBio(scrapy.Spider):
    name = 'missle_spider_bio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/...']
This is the part giving me issues (I believe):
    def parse(self, response):
        filename = response.url.split('/')[-1]
        table = response.xpath('///div/table[2]/tbody')
        rows = table.xpath('//tr')
        row = rows[2]
        row.xpath('td//text()')[0].extract()
        wdata = {}
        for row in response.xpath('//*[@class="wikitable"]//tbody//tr'):
            for link in response.xpath('//a/@href'):
                link = link.extract()
                if((link.strip() != '')):
                    yield Request(link, callback=self.parse)
                    #wdata.append(link)
                else:
                    yield None
                #wdata = {}
                #wdata['link'] = BASE_URL +
                #row.xpath('a/@href').extract() #[0]
                wdata['link'] = BASE_URL + link
                request = scrapy.Request(wdata['link'],
                                         callback=self.get_mini_bio, dont_filter=True)
                request.meta['item'] = MissleItem(**wdata)
                yield request
Here is the second part of the code:
    def get_mini_bio(self, response):
        BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
        item = response.meta['item']
        item['image_urls'] = []
        img_src = response.xpath('//table[contains(@class, "infobox")]//img/@src')
        if img_src:
            item['image_urls'] = ['http:' + img_src[0].extract()]
        mini_bio = ''
        paras = response.xpath('//*[@id="mw-content-text"]/p[text() or normalize-space(.)=""]').extract()
        for p in paras:
            if p == '<p></p>':
                break
            mini_bio += p
        mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
        mini_bio = mini_bio.replace('href="#', item['link'] + '#')
        item['mini_bio'] = mini_bio
        yield item
I tried refactoring but am now getting:
ValueError: Missing scheme in request url: #mw-head
Any help would be immensely appreciated.
Looks like you were on the right track with the commented-out [0].
xpath().extract() returns a list of strings, so you need to select a string from it with [0]:
row.xpath('a/@href').extract()
That expression evaluates to a list, NOT a string. When you pass the URL to the Request object, Scrapy expects a string, not a list.
To fix this, you have a few options:
You can use LinkExtractors which will allow you to search a page for links and automatically create scrapy request objects for those links:
https://doc.scrapy.org/en/latest/topics/link-extractors.html
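For instance, a minimal sketch (hypothetical spider name; the start URL placeholder is from the question) that lets a CrawlSpider rule do the link extraction:

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    class WikiLinkSpider(CrawlSpider):
        name = 'wiki_link_sketch'
        allowed_domains = ['en.wikipedia.org']
        start_urls = ['https://en.wikipedia.org/wiki/...']  # placeholder URL from the question

        # LinkExtractor resolves relative hrefs against the page URL, so every
        # generated Request gets an absolute URL; allow= limits which links are followed
        rules = (
            Rule(LinkExtractor(allow=r'/wiki/'), callback='parse_page', follow=False),
        )

        def parse_page(self, response):
            yield {'url': response.url}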
OR
You could run a for loop and go through each of the links:
from scrapy.spiders import Request

for link in response.xpath('//a/@href'):
    link = link.extract()
    if((link.strip() != '')):
        yield Request(link, callback=self.parse)
    else:
        yield None
You can add whatever string filters you want to that code
OR
If you just want the first link, you can use .extract_first() instead of .extract()
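A minimal sketch along those lines (keeping the question's spider and table XPath; response.urljoin() is an extra step, not mentioned above, that turns relative hrefs like /wiki/Foo into absolute URLs, and fragment-only links such as #mw-head are skipped):

    import scrapy

    class MissleSpiderBio(scrapy.Spider):
        name = 'missle_spider_bio'
        allowed_domains = ['en.wikipedia.org']
        start_urls = ['https://en.wikipedia.org/wiki/...']  # placeholder URL from the question

        def parse(self, response):
            for row in response.xpath('//*[@class="wikitable"]//tr'):
                # extract_first() returns a single string (or None), not a list
                href = row.xpath('.//a/@href').extract_first()
                if href and not href.startswith('#'):
                    # urljoin() resolves relative hrefs into absolute URLs with a scheme
                    yield scrapy.Request(response.urljoin(href),
                                         callback=self.get_mini_bio,
                                         dont_filter=True)

        def get_mini_bio(self, response):
            # stub second-level callback for this sketch; the question builds its item here
            yield {'link': response.url}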

Scrapy - unable to make additional request in XMLFeedSpider

I have a Scrapy spider that uses XMLFeedSpider. As well as the data returned for each node in parse_node(), I also need to make an additional request to get more data. The only issue is that if I yield an additional request from parse_node(), nothing gets returned at all:
class MySpidersSpider(XMLFeedSpider):
    name = "myspiders"
    namespaces = [('g', 'http://base.google.com/ns/1.0')]
    allowed_domains = {"www.myspiders.com"}
    start_urls = [
        "https://www.myspiders.com/productMap.xml"
    ]
    iterator = 'iternodes'
    itertag = 'item'

    def parse_node(self, response, node):
        if(self.settings['CLOSESPIDER_ITEMCOUNT'] and int(self.settings['CLOSESPIDER_ITEMCOUNT']) == self.item_count):
            raise CloseSpider('CLOSESPIDER_ITEMCOUNT limit reached - ' + str(self.settings['CLOSESPIDER_ITEMCOUNT']))
        else:
            self.item_count += 1
        id = node.xpath('id/text()').extract()
        title = node.xpath('title/text()').extract()
        link = node.xpath('link/text()').extract()
        image_link = node.xpath('g:image_link/text()').extract()
        gtin = node.xpath('g:gtin/text()').extract()
        product_type = node.xpath('g:product_type/text()').extract()
        price = node.xpath('g:price/text()').extract()
        sale_price = node.xpath('g:sale_price/text()').extract()
        availability = node.xpath('g:availability/text()').extract()
        item = MySpidersItem()
        item['id'] = id[0]
        item['title'] = title[0]
        item['link'] = link[0]
        item['image_link'] = image_link[0]
        item['gtin'] = gtin[0]
        item['product_type'] = product_type[0]
        item['price'] = price[0]
        item['sale_price'] = '' if len(sale_price) == 0 else sale_price[0]
        item['availability'] = availability[0]
        yield Request(item['link'], callback=self.parse_details, meta={'item': item})

    def parse_details(self, response):
        item = response.meta['item']
        item['price_per'] = 'test'
        return item
If I change the last line of parse_node() to return item, it works fine (without setting price_per in the item, naturally).
Any idea what I'm doing wrong?
Have you tried checking the contents of item['link']? If it is a relative link (example: /products?id=5), the URL won't return anything and the request will fail. You need to make sure it's a resolvable link (example: https://www.myspiders.com/products?id=5).
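If the links in the feed can be relative, one way to guard against that (a sketch of just the request line, assuming item is the MySpidersItem built in parse_node()) is to resolve them with response.urljoin(), which leaves absolute URLs untouched:

    # inside parse_node(), after the item has been populated
    yield Request(response.urljoin(item['link']),
                  callback=self.parse_details, meta={'item': item})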
I discovered the issue: I was limiting the number of items processed in my parse_node() function, and because of the limit my spider was terminating before the additional request was made. Moving the item-limiting code to my parse_details() function resolves the issue:
def parse_details(self, response):
    if(self.settings['CLOSESPIDER_ITEMCOUNT'] and int(self.settings['CLOSESPIDER_ITEMCOUNT']) == self.item_count):
        raise CloseSpider('CLOSESPIDER_ITEMCOUNT limit reached - ' + str(self.settings['CLOSESPIDER_ITEMCOUNT']))
    else:
        self.item_count += 1
    item = response.meta['item']
    item['price_per'] = 'test'
    return item

Scrapy crawler does not scrape or print results in CSV

It seems like this Scrapy spider locates the links it is supposed to follow to collect additional information, but it either doesn't go to the next page or is unable to collect the information on the other page. I checked the XPath expressions; they all appear to be correct.
Terminal output:
2017-01-10 10:31:16 [scrapy.extensions.logstats] INFO: Crawled 213 pages (at 23 pages/min), scraped 0 items (at 0 items/min)
Code:
#!/usr/bin/env python
import types
import time
from datetime import date, datetime, timedelta
import requests
import msgpack
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector, Selector
from resume_data.items import ResumeDataItem, ResultListItem, WorkItem, SchoolItem, ItemList
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import NavigableString
class ResumeIndeedSpider(CrawlSpider):
    name = "indeed_resume"
    allowed_domains = ["indeed.com"]
    start_urls = ['http://www.indeed.com/resumes/mechanical-engineer',
                  'http://www.indeed.com/resumes/mechanical-engineering',
                  'http://www.indeed.com/resumes/piping-engineer',
                  'http://www.indeed.com/resumes/design-engineer',
                  'http://www.indeed.com/resumes/project-engineer']

    #def __init__(self, filename=None):
    #self.unis = list()

    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[contains(@class,"app_link")]')), callback="parse_item", follow=True),)
    def parse_item(self, response):
        hxs = Selector(response)
        digest = hxs.xpath('//ol[@class="resultsList"]')
        records = ResumeDataItem()
        url_prefix = 'http://www.indeed.com'
        resume_links = digest.xpath('//li[@class="sre"]//div[@class="sre-entry"]')
        names = digest.xpath('//a[@target="_blank"]/text()').extract()
        links = digest.xpath('//a[@target="_blank"]/@href').extract()
        for name, link in zip(names, links):
            if name not in 'Feedback':
                records['name'] = name
                records['link'] = url_prefix + link
                yield Request(records['link'], meta={'item': records}, callback=self.parse_node)
    def parse_node(self, response):
        hxs = Selector(response)
        records = ResumeDataItem()
        # name = hxs.xpath('/text()').extract()
        name = hxs.xpath('//h1[@id="resume-contact"]/text()').extract()
        headline = hxs.xpath('//h2[@id="headline"]/text()').extract()
        # locale = hxs.xpath('//div[@class="addr" and @itemprop="address"]//p//text()').extract()
        rlocale = hxs.xpath('//p[@id="headline_location" and @class="locality"]//text()').extract()
        summary = hxs.xpath('//p[@id="res_summary" and @class="summary"]/text()').extract()
        skills = list()
        skill = hxs.xpath('//div[@id="skills-items" and @class="items-container"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        skill = hxs.xpath('//div[@id="additionalinfo-section" and @class="last"]//div[@class="data_display"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        resume_links = list()
        links = hxs.xpath('//div[@id="link-items" and @class="items-container"]//p//text()').extract()
        for link in links:
            resume_links.append(''.join(link).encode('utf-8'))
        workHistory = ItemList()
        experience = hxs.xpath('//div[@id="work-experience-items"]/div')
        for elem in experience:
            item = elem.xpath('div')
            for entry in item:
                workEntry = WorkItem()
                title = entry.xpath('p[@class="work_title title"]//text()').extract()
                workEntry['title'] = ''.join(title).encode('utf-8')
                company = entry.xpath('div[@class="work_company"]/span/text()').extract()
                workEntry['company'] = ''.join(company).encode('utf-8')
                location = entry.xpath('div[@class="work_company"]/div[@class="inline-block"]/span/text()').extract()
                workEntry['work_location'] = ''.join(company).encode('utf-8')
                dates = entry.xpath('p[@class="work_dates"]//text()').extract()
                dates_str = ''.join(dates).encode('utf-8').split(' to ')
                if len(dates) > 0:
                    if dates_str[0]:
                        workEntry['start_date'] = dates_str[0]
                    if dates_str[1]:
                        workEntry['end_date'] = dates_str[1]
                else:
                    workEntry['start_date'] = 'NULL'
                    workEntry['end_date'] = 'NULL'
                description = entry.xpath('p[@class="work_description"]//text()').extract()
                workEntry['description'] = ''.join(description).encode('utf-8')
                workHistory.container.append(workEntry)
        eduHistory = ItemList()
        education = hxs.xpath('//div[@id="education-items" and @class="items-container"]/div')
        for elem in education:
            item = elem.xpath('div')
            for entry in item:
                eduEntry = SchoolItem()
                degree = entry.xpath('p[@class="edu_title"]/text()').extract()
                degree = ''.join(degree).encode('utf-8')
                eduEntry['degree'] = degree
                school = entry.xpath('div[@class="edu_school"]/span//text()').extract()
                school = ''.join(school).encode('utf-8')
                eduEntry['school'] = school
                locale = entry.xpath('span[@itemprop="addressLocality"]/text()').extract()
                locale = ''.join(locale).encode('utf-8')
                eduEntry['locale'] = locale
                grad_date = entry.xpath('p[@class="edu_dates"]/text()').extract()
                dates_str = ''.join(grad_date).encode('utf-8').split(' to ')
                if len(grad_date) > 0:
                    if len(dates_str) == 2:
                        if dates_str[0]:
                            eduEntry['admit_date'] = dates_str[0]
                        try:
                            if dates_str[1]:
                                eduEntry['grad_date'] = dates_str[1]
                        except:
                            pass
                    elif len(dates_str) == 1:
                        if dates_str[0]:
                            eduEntry['grad_date'] = dates_str[0]
                            eduEntry['admit_date'] = 'NULL'
                else:
                    eduEntry['admit_date'] = 'NULL'
                    eduEntry['grad_date'] = 'NULL'
                eduHistory.container.append(eduEntry)
        records['url'] = response.url
        records['name'] = ''.join(name).encode('utf-8')
        records['headline'] = msgpack.packb(''.join(headline).encode('utf-8'))
        records['locale'] = ''.join(rlocale).encode('utf-8')
        records['summary'] = msgpack.packb(''.join(summary).encode('utf-8'))
        records['skills'] = msgpack.packb(skills)
        records['links'] = resume_links
        #records['experience'] = msgpack.packb(workHistory, default=workHistory.encode)
        records['experience'] = workHistory
        records['education'] = msgpack.packb(eduHistory, default=eduHistory.encode)
        #records['experience'] = workHistory
        #records['education'] = eduHistory
        return records
Obviously this part of the code
for name, link in zip(names, links):
    if name not in 'Feedback':
        records['name'] = name
        records['link'] = url_prefix + link
        yield Request(records['link'], meta={'item': records}, callback=self.parse_node)
doesn't emit any link. The test if name not in 'Feedback' checks whether name is a substring of the string 'Feedback'; perhaps you meant if 'Feedback' not in name.
Also note that the XPath here, digest.xpath('//a[@target="_blank"]/text()'), is applied to the whole DOM, not only to the part previously selected for digest. If you want the XPath applied relative to the digest selector, you should use a leading dot, like this: digest.xpath('.//a[@target="_blank"]/text()').
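Putting both suggestions together, a sketch of how parse_item might look (the surrounding spider, imports and ResumeDataItem are as in the question; a fresh item is also created per result here, which the original code reuses):

    def parse_item(self, response):
        hxs = Selector(response)
        digest = hxs.xpath('//ol[@class="resultsList"]')
        url_prefix = 'http://www.indeed.com'
        # the leading dots keep the search inside the digest selection
        names = digest.xpath('.//a[@target="_blank"]/text()').extract()
        links = digest.xpath('.//a[@target="_blank"]/@href').extract()
        for name, link in zip(names, links):
            if 'Feedback' not in name:          # substring test the intended way round
                records = ResumeDataItem()      # one fresh item per result
                records['name'] = name
                records['link'] = url_prefix + link
                yield Request(records['link'], meta={'item': records}, callback=self.parse_node)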

Crawling website at two levels and return item

I have a main page from which to crawl a name and URL. Then I need to go to that URL and crawl further details such as fullname, age, and link. Finally I need to return the items with (name, url, age, sex, link) combined in a single item.
I want to define the first level of the crawl in one method (crawl_page) and the second level in another method (crawl_item).
class CrawlLink(CrawlSpider):
    name = "crawllink"
    allowed_domains = ['www.xyz.org']
    start_urls = ["www.xyz.org/profile?page=0"]
    rules = [Rule(SgmlLinkExtractor(allow=('/profile\?page=\d+'), restrict_xpaths=('//li[@class="pager-next"]',), canonicalize=False),
                  callback='parse_page',
                  follow=True)
             ]
    def parse_page(self, response):
        self.log('Started Crawling List %s' % response.url)
        items = response.xpath("//div[@id='profile']/div")
        ulists = []
        for temp in items:
            usritem = PostUsers()
            usrlink = temp.xpath("./div[@class='name']/a/@href").extract()[0]
            usritem["url"] = 'www.xyz.org' + usrlink
            usritem["namel"] = temp.xpath("//div[@id='user_profile_main']/dl/dd[1]/text()").extract()
            for urltemp in usrlink:
                yield Request(url=usritem["url"], callback=self.parse_user)
            # ulists.append(usritem)
        return ulists
    def parse_user(self, response):
        self.log('Started Crawling Profile %s' % response.url)
        usr = PostUsers()
        relative_url = response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract()[0]
        usr["link"] = 'www.xyz.org' + relative_url
        usr["age"] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
        usr["fullname"] = response.xpath("//h1[@id='page-title']/text()").extract()
        self.log('Finished Crawling Profile %s' % response.url)
        return usr
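A common pattern for this kind of two-level crawl (a minimal sketch only; the surrounding CrawlSpider, rules and PostUsers item are as in the question, and the namel XPath is illustrative) is to carry the half-filled item through Request.meta and yield the combined item from the second-level callback:

    def parse_page(self, response):
        for temp in response.xpath("//div[@id='profile']/div"):
            usritem = PostUsers()                                   # one fresh item per profile
            usrlink = temp.xpath("./div[@class='name']/a/@href").extract_first()
            usritem['url'] = response.urljoin(usrlink)              # absolute URL instead of 'www.xyz.org' + path
            usritem['namel'] = temp.xpath(".//div[@class='name']/a/text()").extract_first()
            # carry the half-filled item along with the request
            yield Request(usritem['url'], callback=self.parse_user, meta={'item': usritem})

    def parse_user(self, response):
        usr = response.meta['item']                                 # the same item started in parse_page
        usr['link'] = response.urljoin(response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract_first())
        usr['age'] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract_first()
        usr['fullname'] = response.xpath("//h1[@id='page-title']/text()").extract_first()
        return usr                                                  # single combined item: namel, url, age, fullname, link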
