Problems with passing global variables in a Python Scrapy project

In a Scrapy project I am working on, I am having difficulty sending a variable containing a list from one function to another. I need to do this because I have to combine the values from one page with those from another page at the end of the script. The code is as follows:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider,Rule
from six import string_types
from datetime import datetime
from decimal import Decimal
import itertools
import numpy
import urlparse
import scrapy
class DmozSpider(Spider):
    name = "dnot"
    allowed_domains = ["ca.finance.yahoo.com", "http://eoddata.com/"]
    start_urls = [
        "http://eoddata.com/stocklist/TSX.htm"
    ]

    def parse(self, response):
        companyList = response.xpath('//tr[@class="ro"]/td/a/text()').extract()
        for company in companyList:
            go = 'https://ca.finance.yahoo.com/q/hp?s={0}.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m'.format(company)
            for link in go:
                yield Request(go, self.stocks1)

    def stocks1(self, response):
        # EAFP = Easier to ask for forgiveness than permission
        # Gathers ONLY the adjusted closing stock price
        global returns_page1
        returns_page1 = []
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                datetime.strptime(cells[0], "%b %d, %Y")
                values = cells[-1]
                returns_page1.append(values)
            except ValueError:
                continue
        current_page = response.url
        next_page = current_page + "&z=66&y=66"
        yield Request(next_page, self.stocks2)

    def stocks2(self, response):
        global returns_page1
        returns_page2 = []
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                datetime.strptime(cells[0], "%b %d, %Y")
                values = cells[-1]
                returns_page2.append(values)
            except ValueError:
                continue
        returns_tot = returns_page1 + returns_page2
        returns_dec = [Decimal(float(i)) for i in returns_tot]
        returns = [float(n) for n in returns_dec]
        items = []
        item = Website()
        item['url'] = response.url
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        item['avgreturns'] = numpy.mean(returns)
        item['varreturns'] = numpy.var(returns)
        item['sdreturns'] = numpy.std(returns)
        item['returns'] = returns
        items.append(item)
        yield item
I am trying to combine returns_page1 from the stocks1 function with returns_page2, which is gathered in the stocks2 function. However, my output only gives me the values from the returns_page2 variable.
I know I can't put a return in the stocks1 function because it already contains a yield. That's why I tried using global variables.
What am I doing wrong here?

The best way to pass values from one callback to another is to use meta on the Request.
In the first function:
yield Request(next_page, self.stocks2, meta={'returns_page1': returns_page1})
In the second function:
returns_page1 = response.meta.get('returns_page1')
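Putting that together, here is a minimal sketch of the two callbacks with the globals removed and the list handed forward through meta (the XPaths and field names are taken from the question and may need adjusting):

def stocks1(self, response):
    returns_page1 = []
    rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
    for row in rows:
        cells = row.xpath('.//td/text()').extract()
        try:
            datetime.strptime(cells[0], "%b %d, %Y")  # keep only rows that start with a date
            returns_page1.append(cells[-1])
        except ValueError:
            continue
    next_page = response.url + "&z=66&y=66"
    # Hand the list to the next callback instead of storing it in a global
    yield Request(next_page, self.stocks2, meta={'returns_page1': returns_page1})

def stocks2(self, response):
    # Pick the list back up; default to an empty list if it was never set
    returns_page1 = response.meta.get('returns_page1', [])
    returns_page2 = []
    # ... gather returns_page2 exactly as before ...
    returns_tot = returns_page1 + returns_page2

This keeps the data attached to the request chain, which matters because Scrapy usually has several requests in flight at once and a module-level global can be overwritten between callbacks.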

Related

How to yield a parsed item from one link with other parsed items from other links in the same item list

The problem is that I am iterating over a list of places to scrape their latitude, longitude, and elevation. When I get the scraped data back, I have no way to link it to my current DataFrame, since the names I iterated over may have been modified or skipped.
I've managed to capture the name of what I searched for, but since it is parsed from a different link than the rest of the items, it doesn't line up properly.
import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def parse(self, response):
        items = latlonglocItem()
        items['base_name'] = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                yield response.follow(href, self.parse_distancesto)
            else:
                pass
        yield items

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
#output
appellation base_name elevation latitude longitude
Chalone, USA
Santa Cruz, USA 56.81 35 9.23
What happens is that I parse the name I searched for, then the spider goes inside a link and parses the rest of the information. However, in my DataFrame the name I searched for ends up completely detached from the rest of the items, and even then it is hard to find the match. I would like to pass that info to the other function so it yields all the items together.
This may work. I will comment both on what I am doing and on a little bit of your code, so you have an understanding of what I am doing.
import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):  # latlonglocSpider is a child class of scrapy.Spider
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def __init__(self):  # Constructor for our class
        # Since we wrote our own constructor we need to call the parent's constructor
        scrapy.Spider.__init__(self)
        self.base_name = None  # Here is the base_name we can now use class-wide

    def parse(self, response):
        items = latlonglocItem()
        items['base_name'] = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        self.base_name = items['base_name']  # Let's store the base_name in the class
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                yield response.follow(href, self.parse_distancesto)
            else:
                pass
        yield items

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            # If for some reason self.base_name is never assigned in parse(),
            # we want to use an empty string instead of self.base_name.
            # The following syntax means: use self.base_name unless it is None or empty,
            # in which case just use an empty string.
            base_name = self.base_name or ""
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
A second version, which skips yielding an item from parse() and only stores base_name for the links that are actually followed:
import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):  # latlonglocSpider is a child class of scrapy.Spider
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def __init__(self):  # Constructor for our class
        # Since we wrote our own constructor we need to call the parent's constructor
        scrapy.Spider.__init__(self)
        self.base_name = None  # Here is the base_name we can now use class-wide

    def parse(self, response):
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                self.base_name = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
                yield response.follow(href, self.parse_distancesto)
            else:
                pass

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            # If for some reason self.base_name is never assigned in parse(),
            # use an empty string instead of self.base_name.
            items['base_name'] = self.base_name or ""
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
Thanks to Error - Syntactical Remorse. Concurrent requests had to be set to 1 for this to work, and base_name had to be placed inside the loop.
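For reference, a minimal sketch of how that concurrency limit could be applied per spider, assuming Scrapy's standard custom_settings mechanism:

class latlonglocSpider(scrapy.Spider):
    name = 'latlonglocs'
    # One request at a time, so the self.base_name set in parse() is still
    # the matching value when parse_distancesto() runs for that link.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
    }

Passing the value through Request meta (as in the answer to the first question above) would avoid the shared instance attribute and the need to throttle the crawl.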

Reducing data usage of Scrapy Spider/How to be a better internet citizen?

I recently resurrected an old Scrapy spider (0.18.4) that crawls Craigslist. The spider grabs links from the first page of apartment listings every 15 minutes and scrapes the links of new postings. (See code below.)
This spider was downloading >1 GB of data every 15 minutes! I was able to reduce it by half, but this is still a lot of load on the Craigslist site. Please note, this is from the spider alone, as I tested without pipelines enabled. I have also tried limiting through MAX_DEPTH and DOWNLOAD_MAXSIZE in settings.py, spider class settings, and the meta settings of individual follow-on requests. I have spent several hours in the docs, messed around with Rules, updated to the latest version of Scrapy, etc., all to no avail. Granted, I wrote this script several years ago when I was new to Python, but perhaps my folly can be the community's gain...
Given my code below, what can I, or anyone else using Scrapy, do to reduce the amount of data I'm downloading when I only care about a couple of kB of text? Which calls are data-greedy?
Spider:
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider
#from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from craigslist_2br.items import CraigslistItem
from craigslist_2br import craigslistTools
from scrapy.http import Request
from urlparse import urljoin
import types
import exceptions
import datetime
import re  # needed for re.findall in anchor_page
import ipdb as pdb
#from ghost import Ghost

class Suppressor:
    def __init__(self, exception_type, localList):
        self._exception_type = exception_type
        self.safe_dict = dict([(k, locals().get(k, None)) for k in localList])

    def __call__(self, expression):
        try:
            exec(expression, {"__builtins__": None}, self.safe_dict)
        except self._exception_type as e:
            print 'Suppressor: suppressed exception %s with content \'%s\'' % (type(self._exception_type), e)

class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/apa/"]
    #g=Ghost()

    def parse(self, response):
        hxsM = HtmlXPathSelector(response)
        ##titles = hxs.select('//a[@class="result-title hdrlnk"]')
        postings = hxsM.select('//p[@class="result-info"]')
        resultRows = hxsM.select('//li[@class="result-row"]')
        try:
            IDf = open("/home/dapper/python/scrapy/scrapy_craig2br/craigslist_2br/craigslist_2br/postingIDs.txt", "r")
            IDvec = [int(sl.strip("\n")) for sl in IDf.readlines()]
        except:
            IDvec = []
        finally:
            try:
                IDf.close()
            except:
                pass
        for posting, resultRow in zip(postings, resultRows):
            item = CraigslistItem()
            try:
                item["ID"] = posting.select("a/@data-id").extract()[0]
                if (int(item["ID"]) == int(resultRow.select("@data-pid").extract()[0])):
                    try:
                        item["repostID"] = resultRow.select("@data-repost-of").extract()[0]
                    except:
                        item["repostID"] = ''
                else:
                    item["repostID"] = ''
            except:
                item["ID"] = ''
                item["repostID"] = ''
            try:
                item["price"] = posting.select("span[@class='result-meta']/span[@class='result-price']/text()").extract()[0]
            except:
                item["price"] = 0
            try:
                item["neighborhood"] = posting.select("span[@class='result-meta']/span[@class='result-hood']/text()").extract()[0]
            except:
                item["neighborhood"] = ''
            if ((not any(item["ID"]) or (int(item["ID"]) not in IDvec)) and
                    (craigslistTools.FixPriceStr(item['price']) < 3000.0) and
                    (not((any(item["repostID"]) and (int(item["repostID"]) in IDvec)) and (any(item['neighborhood']) and craigslistTools.outDaHoods(item['neighborhood']))))):
                #s = Suppressor(exception_type=exceptions.IndexError, localList=[item,titles,postings])  # TODO: put your exception type here
                item["title"] = posting.select("a/text()").extract()[0]
                item["link"] = posting.select("a/@href").extract()[0]
                item["dateAdded"] = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
                try:
                    item["size"] = posting.select("span[@class='result-meta']/span[@class='housing']/text()").extract()[0]
                except:
                    item["size"] = ''
                if item['link']:
                    if 'http://' not in item['link']:
                        if isinstance(item['link'], types.StringTypes):
                            item['link'] = urljoin(response.url, item['link'])
                        else:
                            item['link'] = urljoin(response.url, item['link'][0])
                    yield Request(item['link'],
                                  meta={'item': item, 'download_maxsize': 8**8, 'depth_limit': 1},
                                  callback=self.anchor_page)

    def anchor_page(self, response):
        hxs = HtmlXPathSelector(response)
        old_item = response.request.meta['item']  # Receiving the parse method item that was in the Request meta
        # parse some more values
        # place them in old_item
        # e.g.
        old_item['postingbody'] = hxs.select("//section [@id='postingbody']").extract()
        try:
            latit = float(hxs.select("//div [@id='map']//@data-latitude").extract()[0])
            longi = float(hxs.select("//div [@id='map']//@data-longitude").extract()[0])
            #print '%f,%f'%(latit,longi)
        except:
            latit = 0
            longi = 0
        try:
            old_item["address"] = hxs.select(".//div [@class='mapaddress']/text()").extract()[0]
        except:
            old_item["address"] = []
        try:
            if any(latit) & (not any([char.isdigit() for char in old_item["address"][0]])):
                try:
                    old_item["address"] = craigslistTools.coord2addr(latit, longi)
                except:
                    pass
            elif (not any(old_item["address"])):
                try:
                    zips = [94611, 94610, 94609, 94606, 94618, 94705]
                    for z in zips:
                        for line in old_item['postingbody'].split("\n"):
                            if str(z) in line:
                                old_item["address"] = line
                except:
                    pass
        except:
            pass
        if (latit == 0) & (any(old_item["address"])):
            try:
                cities = ['Oakland', 'oakland', 'Piedmont', 'piedmont', 'Berkeley', 'berkeley', 'montclair', 'Montclair']
                if not any([c in old_item["address"] for c in cities]):
                    old_item["address"] += ', Oakland'
                geoRes = Geocoder.geocode(old_item["address"])
                latit = geoRes[0].latitude
                longi = geoRes[0].longitude
            except:
                pass
        old_item["latit"] = latit
        old_item["longi"] = longi
        try:
            (bartDist, bartCoord) = craigslistTools.bartDist((latit, longi))
        except:
            bartDist = 0
        try:
            if ((bartDist != 0) and (bartDist < 2)):
                bartWalkingTime = craigslistTools.bartWalking((latit, longi), bartCoord)
            else:
                bartWalkingTime = 0
        except:
            bartWalkingTime = 0
        old_item["bartDist"] = bartDist
        old_item["BartWalkingTime"] = bartWalkingTime
        try:
            if ((bartDist < 1) and (bartDist != 0)):
                old_item['LLNLDrivingTime'] = craigslistTools.LLNLdriving((latit, longi))
            else:
                old_item['LLNLDrivingTime'] = 0
        except:
            old_item['LLNLDrivingTime'] = 0
        try:
            old_item["Br"] = int(hxs.select("//p [@class='attrgroup']/span/b/text()").extract()[0].strip('BR'))
            old_item["baths"] = int(hxs.select("//p [@class='attrgroup']/span/b/text()").extract()[1].strip('Ba'))
        except:
            try:
                old_item["Br"] = int(re.findall(r"(\d+)br", old_item['size'])[0])
                old_item["baths"] = 1
            except:
                old_item["Br"] = -1
                old_item["baths"] = -1
        try:
            old_item['amenities'] = hxs.select("//p [@class='attrgroup']/span/text()").extract()
        except:
            old_item['amenities'] = []
        yield old_item
settings.py:
# Scrapy settings for craigslist_2br project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'craigslist_2br'
SPIDER_MODULES = ['craigslist_2br.spiders']
NEWSPIDER_MODULE = 'craigslist_2br.spiders'
ITEM_PIPELINES = {'craigslist_2br.pipelines.Craigslist2BrPipeline':0}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'craigslist_2br (+http://www.craigslist.org)'
DOWNLOAD_DELAY = 5
RANDOMIZE_DOWNLOAD_DELAY = False
CONCURRENT_REQUESTS_PER_DOMAIN = 1 # Default: 8
#SCHEDULER = 'scrapy.core.scheduler.Scheduler'
#HTTPCACHE_ENABLED = True
DEPTH_LIMIT = 1
Good news! The problem I experienced will likely not affect you (just what you wanted to read when searching for answers ;)). Using the deprecated HtmlXPathSelector from v0.18 can cause major issues (surprise, surprise), so don't do it.
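As an illustration only (not part of the original answer), the deprecated selector calls could be swapped for the selectors that newer Scrapy responses expose directly; the XPaths below are taken from the spider above:

# Sketch: replacing HtmlXPathSelector(...).select(...) with response.xpath(...)
def parse(self, response):
    postings = response.xpath('//p[@class="result-info"]')
    for posting in postings:
        # extract_first() returns None instead of raising IndexError on a miss
        price = posting.xpath(
            "span[@class='result-meta']/span[@class='result-price']/text()"
        ).extract_first()

The same pattern applies to anchor_page(): drop the HtmlXPathSelector wrapper and call response.xpath() on the response itself.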

Scrapy crawler does not scrape or print results in CSV

It seems like this Scrapy spider locates the links it is supposed to follow in order to collect additional information, but it either doesn't go to the next page or is unable to collect the information on the other page. I checked the XPath expressions; they all appear to be correct.
Terminal output:
2017-01-10 10:31:16 [scrapy.extensions.logstats] INFO: Crawled 213 pages (at 23 pages/min), scraped 0 items (at 0 items/min)
Code:
#!/usr/bin/env python
import types
import time
from datetime import date, datetime, timedelta
import requests
import msgpack
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector, Selector
from resume_data.items import ResumeDataItem, ResultListItem, WorkItem, SchoolItem, ItemList
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import NavigableString

class ResumeIndeedSpider(CrawlSpider):
    name = "indeed_resume"
    allowed_domains = ["indeed.com"]
    start_urls = ['http://www.indeed.com/resumes/mechanical-engineer',
                  'http://www.indeed.com/resumes/mechanical-engineering',
                  'http://www.indeed.com/resumes/piping-engineer',
                  'http://www.indeed.com/resumes/design-engineer',
                  'http://www.indeed.com/resumes/project-engineer']

    #def __init__(self, filename=None):
    #    self.unis = list()

    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[contains(@class,"app_link")]')), callback="parse_item", follow=True),)

    def parse_item(self, response):
        hxs = Selector(response)
        digest = hxs.xpath('//ol[@class="resultsList"]')
        records = ResumeDataItem()
        url_prefix = 'http://www.indeed.com'
        resume_links = digest.xpath('//li[@class="sre"]//div[@class="sre-entry"]')
        names = digest.xpath('//a[@target="_blank"]/text()').extract()
        links = digest.xpath('//a[@target="_blank"]/@href').extract()
        for name, link in zip(names, links):
            if name not in 'Feedback':
                records['name'] = name
                records['link'] = url_prefix + link
                yield Request(records['link'], meta={'item': records}, callback=self.parse_node)

    def parse_node(self, response):
        hxs = Selector(response)
        records = ResumeDataItem()
        # name = hxs.xpath('/text()').extract()
        name = hxs.xpath('//h1[@id="resume-contact"]/text()').extract()
        headline = hxs.xpath('//h2[@id="headline"]/text()').extract()
        # locale = hxs.xpath('//div[@class="addr" and @itemprop="address"]//p//text()').extract()
        rlocale = hxs.xpath('//p[@id="headline_location" and @class="locality"]//text()').extract()
        summary = hxs.xpath('//p[@id="res_summary" and @class="summary"]/text()').extract()
        skills = list()
        skill = hxs.xpath('//div[@id="skills-items" and @class="items-container"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        skill = hxs.xpath('//div[@id="additionalinfo-section" and @class="last"]//div[@class="data_display"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        resume_links = list()
        links = hxs.xpath('//div[@id="link-items" and @class="items-container"]//p//text()').extract()
        for link in links:
            resume_links.append(''.join(link).encode('utf-8'))
        workHistory = ItemList()
        experience = hxs.xpath('//div[@id="work-experience-items"]/div')
        for elem in experience:
            item = elem.xpath('div')
            for entry in item:
                workEntry = WorkItem()
                title = entry.xpath('p[@class="work_title title"]//text()').extract()
                workEntry['title'] = ''.join(title).encode('utf-8')
                company = entry.xpath('div[@class="work_company"]/span/text()').extract()
                workEntry['company'] = ''.join(company).encode('utf-8')
                location = entry.xpath('div[@class="work_company"]/div[@class="inline-block"]/span/text()').extract()
                workEntry['work_location'] = ''.join(company).encode('utf-8')
                dates = entry.xpath('p[@class="work_dates"]//text()').extract()
                dates_str = ''.join(dates).encode('utf-8').split(' to ')
                if len(dates) > 0:
                    if dates_str[0]:
                        workEntry['start_date'] = dates_str[0]
                    if dates_str[1]:
                        workEntry['end_date'] = dates_str[1]
                else:
                    workEntry['start_date'] = 'NULL'
                    workEntry['end_date'] = 'NULL'
                description = entry.xpath('p[@class="work_description"]//text()').extract()
                workEntry['description'] = ''.join(description).encode('utf-8')
                workHistory.container.append(workEntry)
        eduHistory = ItemList()
        education = hxs.xpath('//div[@id="education-items" and @class="items-container"]/div')
        for elem in education:
            item = elem.xpath('div')
            for entry in item:
                eduEntry = SchoolItem()
                degree = entry.xpath('p[@class="edu_title"]/text()').extract()
                degree = ''.join(degree).encode('utf-8')
                eduEntry['degree'] = degree
                school = entry.xpath('div[@class="edu_school"]/span//text()').extract()
                school = ''.join(school).encode('utf-8')
                eduEntry['school'] = school
                locale = entry.xpath('span[@itemprop="addressLocality"]/text()').extract()
                locale = ''.join(locale).encode('utf-8')
                eduEntry['locale'] = locale
                grad_date = entry.xpath('p[@class="edu_dates"]/text()').extract()
                dates_str = ''.join(grad_date).encode('utf-8').split(' to ')
                if len(grad_date) > 0:
                    if len(dates_str) == 2:
                        if dates_str[0]:
                            eduEntry['admit_date'] = dates_str[0]
                        try:
                            if dates_str[1]:
                                eduEntry['grad_date'] = dates_str[1]
                        except:
                            pass
                    elif len(dates_str) == 1:
                        if dates_str[0]:
                            eduEntry['grad_date'] = dates_str[0]
                            eduEntry['admit_date'] = 'NULL'
                else:
                    eduEntry['admit_date'] = 'NULL'
                    eduEntry['grad_date'] = 'NULL'
                eduHistory.container.append(eduEntry)
        records['url'] = response.url
        records['name'] = ''.join(name).encode('utf-8')
        records['headline'] = msgpack.packb(''.join(headline).encode('utf-8'))
        records['locale'] = ''.join(rlocale).encode('utf-8')
        records['summary'] = msgpack.packb(''.join(summary).encode('utf-8'))
        records['skills'] = msgpack.packb(skills)
        records['links'] = resume_links
        #records['experience'] = msgpack.packb(workHistory, default=workHistory.encode)
        records['experience'] = workHistory
        records['education'] = msgpack.packb(eduHistory, default=eduHistory.encode)
        #records['experience'] = workHistory
        #records['education'] = eduHistory
        return records
Obviously this part of the code
for name, link in zip(names, links):
    if name not in 'Feedback':
        records['name'] = name
        records['link'] = url_prefix + link
        yield Request(records['link'], meta={'item': records}, callback=self.parse_node)
doesn't emit any link. name not in 'Feedback' performs a substring test against the string 'Feedback' rather than a membership test against a list; perhaps you meant if 'Feedback' not in name.
Also note that the XPath in digest.xpath('//a[@target="_blank"]/text()') is applied to the whole DOM, not only to the part previously extracted for digest. If you want the XPath to be relative to the digest selector, you should use a leading dot, like this: digest.xpath('.//a[@target="_blank"]/text()').
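Putting both suggestions together, a possible rewrite of that loop (a sketch built from the selectors in the question; it also creates a fresh item per link, which avoids sharing one mutable item across requests):

def parse_item(self, response):
    digest = response.xpath('//ol[@class="resultsList"]')
    url_prefix = 'http://www.indeed.com'
    # Leading dots keep the queries relative to the digest selector
    names = digest.xpath('.//a[@target="_blank"]/text()').extract()
    links = digest.xpath('.//a[@target="_blank"]/@href').extract()
    for name, link in zip(names, links):
        if 'Feedback' not in name:      # substring test the right way round
            records = ResumeDataItem()  # fresh item for every resume link
            records['name'] = name
            records['link'] = url_prefix + link
            yield Request(records['link'], meta={'item': records}, callback=self.parse_node)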

Added iterating over page id in Scrapy, responses in parse method no longer run

I have a few print statements in my spider for debugging. In the start_requests function, I generate URLs by appending numbers in the range [0, 4] to a base URL; each URL is then parsed by the parse_grant function. In that function, the first print statement gets called, but the second does not.
Still learning here, so I may have made a stupid mistake and don't quite understand what's happening with Twisted in the background.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider, Rule
from scrapy.http import Request
from scraper_app.items import NSERCGrant
from scrapy.selector import Selector

class NSERC_Spider(Spider):
    name = 'NSERCSpider'
    allowed_domains = ["http://www.nserc-crsng.gc.ca"]
    # Maximum page id to use.
    max_id = 5

    def start_requests(self):
        for i in range(self.max_id):
            if i == 0:
                continue
            yield Request("http://www.nserc-crsng.gc.ca/ase-oro/Details-Detailles_eng.asp?id=%d" % i,
                          callback=self.parse_grant)

    def parse_grant(self, response):
        print("Being called")
        sel = Selector(response)
        grants = sel.xpath('.//html//body')
        items = []
        for response in grants:
            print("Responses being called")
            item = NSERCGrant()
            # Row one
            item['Competition_Year'] = response.xpath('.//tr[1]//td[2]//text()').extract()
            item['Fiscal_Year'] = response.xpath('.//tr[1]//td[4]//text()').extract()
            # Row two
            item['Project_Lead_Name'] = response.xpath('.//tr[2]//td[2]//text()').extract()
            item['Institution'] = response.xpath('.//tr[2]//td[4]//text()').extract()
            # Row three
            item['Department'] = response.xpath('.//tr[3]//td[2]//text()').extract()
            item['Province'] = response.xpath('.//tr[3]//td[4]//text()').extract()
            # Row four
            item['Award_Amount'] = response.xpath('.//tr[4]//td[2]//text()').extract()
            item['Installment'] = response.xpath('.//tr[4]//td[4]//text()').extract()
            # Row five
            item['Program'] = response.xpath('.//tr[5]//td[2]//text()').extract()
            item['Selection_Committee'] = response.xpath('.//tr[5]//td[4]//text()').extract()
            # Row six
            item['Research_Subject'] = response.xpath('.//tr[6]//td[2]//text()').extract()
            item['Area_of_Application'] = response.xpath('.//tr[6]//td[4]//text()').extract()
            # Row seven
            item['Co_Researchers'] = response.xpath(".//tr[7]//td[2]//text()").extract()
            item['Partners'] = response.xpath('.//tr[7]//td[4]//text()').extract()
            # Award Summary
            item['Award_Summary'] = response.xpath('.//p//text()').extract()
            items.append(item)
        return items
The information you are looking for only occurs once on each page, and the body tag is on every page, so the loop and the line
grants = sel.xpath('.//html//body')
are redundant. Also, response.xpath('... your xpath here ...') saves some code. Try this:
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.http import Request
from scraper_app.items import NSERCGrant

class NSERC_Spider(Spider):
    name = 'NSERCSpider'
    allowed_domains = ["http://www.nserc-crsng.gc.ca"]
    # Maximum page id to use.
    max_id = 5

    def start_requests(self):
        for i in range(1, self.max_id):
            yield Request("http://www.nserc-crsng.gc.ca/ase-oro/Details-Detailles_eng.asp?id=%d" % i,
                          callback=self.parse_grant)

    def parse_grant(self, response):
        print("Being called")
        item = NSERCGrant()
        # Row one
        item['Competition_Year'] = response.xpath('//tr[1]//td[2]//text()').extract()
        item['Fiscal_Year'] = response.xpath('//tr[1]//td[4]//text()').extract()
        # Row two
        item['Project_Lead_Name'] = response.xpath('//tr[2]//td[2]//text()').extract()
        item['Institution'] = response.xpath('//tr[2]//td[4]//text()').extract()
        # Row three
        item['Department'] = response.xpath('//tr[3]//td[2]//text()').extract()
        item['Province'] = response.xpath('//tr[3]//td[4]//text()').extract()
        # Row four
        item['Award_Amount'] = response.xpath('//tr[4]//td[2]//text()').extract()
        item['Installment'] = response.xpath('//tr[4]//td[4]//text()').extract()
        # Row five
        item['Program'] = response.xpath('//tr[5]//td[2]//text()').extract()
        item['Selection_Committee'] = response.xpath('//tr[5]//td[4]//text()').extract()
        # Row six
        item['Research_Subject'] = response.xpath('//tr[6]//td[2]//text()').extract()
        item['Area_of_Application'] = response.xpath('//tr[6]//td[4]//text()').extract()
        # Row seven
        item['Co_Researchers'] = response.xpath("//tr[7]//td[2]//text()").extract()
        item['Partners'] = response.xpath('//tr[7]//td[4]//text()').extract()
        # Award Summary
        item['Award_Summary'] = response.xpath('//p//text()').extract()
        yield item
I've also tweaked your start_requests routine to remove the if i == 0 check by starting the range at 1.
Take a look at the Scrapy shell, which allows you to try out your XPaths and see the results interactively.
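For example, the shell can be pointed at one of the grant pages used by the spider above (the id=1 value here is just an arbitrary page for testing):
scrapy shell "http://www.nserc-crsng.gc.ca/ase-oro/Details-Detailles_eng.asp?id=1"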
When I try
grants = sel.xpath('.//html//body')
from my scrapy shell, this is what I get
In [10]: grants = sel.xpath('.//html//body')
In [11]: grants
Out[11]: []
When I change it to the following code,
In [12]: grants = sel.xpath('/html/body')
In [13]: grants
Out[13]: [<Selector xpath='/html/body' data=u'<body>\r\n<div id="cn-body-inner-1col">\r\n<'>]

Scrapy (Python): Iterating over 'next' page without multiple functions

I am using Scrapy to grab stock data from Yahoo! Finance.
Sometimes I need to loop over several pages (19 in this example) in order to get all of the stock data.
Previously (when I knew there would only be two pages), I would use one function for each page, like so:
def stocks_page_1(self, response):
    returns_page1 = []
    # Grabs data here...
    current_page = response.url
    next_page = current_page + "&z=66&y=66"
    yield Request(next_page, self.stocks_page_2, meta={'returns_page1': returns_page1})

def stocks_page_2(self, response):
    # Grab data again...
Now, instead of writing 19 or more functions, I was wondering whether there is a way to loop within a single function and grab all of the data from every page available for a given stock.
Something like this:
for x in range(30):  # 30 was randomly selected
    current_page = response.url
    # Grabs data
    # Check if there is a 'next' page:
    if response.xpath('//td[@align="right"]/a[@rel="next"]').extract() != ' ':
        u = x * 66
        next_page = current_page + "&z=66&y={0}".format(u)
        # Go to the next page somehow within the function???
Updated Code:
Works, but only returns one page of data.
class DmozSpider(CrawlSpider):
    name = "dnot"
    allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
    start_urls = ['http://finance.yahoo.com/q?s=CAT']
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//td[@align="right"]/a[@rel="next"]'),
             callback='stocks1',
             follow=True),
    ]

    def stocks1(self, response):
        returns = []
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    returns.append(values)
                except ValueError:
                    continue
            except ValueError:
                continue
        unformatted_returns = response.meta.get('returns_pages')
        returns = [float(i) for i in returns]
        global required_amount_of_returns, counter
        if counter == 1 and "CAT" in response.url:
            required_amount_of_returns = len(returns)
        elif required_amount_of_returns == 0:
            raise CloseSpider("'Error with initiating required amount of returns'")
        counter += 1
        print counter
        # Iterator to calculate rate of return
        # ====================================
        if data_intervals == "m":
            k = 12
        elif data_intervals == "w":
            k = 4
        else:
            k = 30
        sub_returns_amount = required_amount_of_returns - k
        sub_returns = returns[:sub_returns_amount]
        rate_of_return = []
        if len(returns) == required_amount_of_returns or "CAT" in response.url:
            for number in sub_returns:
                numerator = number - returns[k]
                rate = numerator / returns[k]
                if rate == '':
                    rate = 0
                rate_of_return.append(rate)
                k += 1
        item = Website()
        items = []
        item['url'] = response.url
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        item['avg_returns'] = numpy.average(rate_of_return)
        item['var_returns'] = numpy.cov(rate_of_return)
        item['sd_returns'] = numpy.std(rate_of_return)
        item['returns'] = returns
        item['rate_of_returns'] = rate_of_return
        item['exchange'] = response.xpath('//span[@class="rtq_exch"]/text()').extract()
        item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
        items.append(item)
        yield item
You see, a parse callback is just a function that takes the response and returns or yields Items, Requests, or both. There is no issue at all with reusing these callbacks, so you can just pass the same callback for every request.
Now, you could pass the current page info using the Request meta, but instead I'd leverage the CrawlSpider to crawl across every page. It's really easy; start by generating the spider from the command line:
scrapy genspider --template crawl finance finance.yahoo.com
Then write it like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
Scrapy 1.0 has deprecated the scrapy.contrib namespace for the modules above, but if you're stuck with 0.24, use scrapy.contrib.linkextractors and scrapy.contrib.spiders.
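On 0.24 the equivalent imports would presumably look like this (following the note above):

from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule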
from yfinance.items import YfinanceItem

class FinanceSpider(CrawlSpider):
    name = 'finance'
    allowed_domains = ['finance.yahoo.com']
    start_urls = ['http://finance.yahoo.com/q/hp?s=PWF.TO&a=04&b=19&c=2005&d=04&e=19&f=2010&g=d&z=66&y=132']

    rules = (
        Rule(LinkExtractor(restrict_css='[rel="next"]'),
             callback='parse_items',
             follow=True),
    )
LinkExtractor will pick up the links in the response to follow, but it can be limited with XPath (or CSS) and regular expressions. See the documentation for more.
Rules will follow the links and call the callback on every response. follow=True will keep extracting links on every new response, but it can be limited by depth (see the sketch after the callback below). See the documentation again.
    def parse_items(self, response):
        for line in response.css('.yfnc_datamodoutline1 table tr')[1:-1]:
            yield YfinanceItem(date=line.css('td:first-child::text').extract()[0])
Just yield the Items, since Requests for the next pages will be handled by the CrawlSpider Rules.
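If the crawl needs a hard cap on how many "next" pages it follows, one option (an illustration, not from the original answer) is Scrapy's DEPTH_LIMIT setting, applied per spider through custom_settings:

class FinanceSpider(CrawlSpider):
    name = 'finance'
    # Stop following rel="next" links after roughly 20 hops; the exact number
    # here is arbitrary and depends on how much history is needed.
    custom_settings = {'DEPTH_LIMIT': 20}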
