Scraping Project Euler site with scrapy [closed] - python

I'm trying to scrape projecteuler.net with Python's scrapy library, just to get some practice with it. I've seen more than one existing implementation of such a scraper online, but they seem overly elaborate for my purpose. I simply want to save the problems (titles, ids, contents) to JSON and then load them with ajax in a local webpage on my PC.
I'm implementing my own solution, which I will finish anyway, but since I want to discover the smartest way to use the library, I'm asking you to propose the most sensible scrapy program for doing this job (if you want to skip the JSON step and save directly to HTML, that may be even better for me).
This is my first approach (it doesn't work):
# -*- coding: utf-8 -*-
import httplib2
import requests
import scrapy
from eulerscraper.items import Problem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule


def start_urls_detection():
    # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
    # i = 1
    #
    # while True:
    #     request = requests.get(su[i])
    #
    #     if request.status_code != 200:
    #         break
    #
    #     i += 1
    #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))
    return ["https://projecteuler.net/"]


class EulerSpider(CrawlSpider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = start_urls_detection()

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(LinkExtractor(allow=('category\.php',), deny=('subsection\.php',))),
        Rule(LinkExtractor(allow=('problem=\d*',)), callback="parse_problems"),
        Rule(LinkExtractor(allow=('archives;page=\d*',), unique=True), follow=True)
    )

    def start_requests(self):
        # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
        # i = 1
        #
        # while True:
        #     request = requests.get(su[i])
        #
        #     if request.status_code != 200:
        #         break
        #
        #     i += 1
        #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))
        return [scrapy.Request("https://projecteuler.net/archives", self.parse)]

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")
        yield l.load_item()

    # def parse_content(self, response):
    #     # return response.css("div.problem_content::text").extract()
    #     next_page = "https://projecteuler.net/archives;page=2"
    #     n = 3
    #
    #     while n < 14:
    #         next_page = response.urljoin(next_page)
    #         yield scrapy.Request(next_page, callback=self.parse)
    #         next_page = next_page[0:len(next_page) - 1] + str(n)
    #         n += 1
For now I will try some LinkExtractor + manual requests combo. In the meantime, I'm hopefully waiting for your solutions...

I think I have found a simple yet fitting solution (at least for my purpose), compared to the existing code written to scrape projecteuler:
# -*- coding: utf-8 -*-
import scrapy
from eulerscraper.items import Problem
from scrapy.loader import ItemLoader


class EulerSpider(scrapy.Spider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = ["https://projecteuler.net/archives"]

    def parse(self, response):
        numpag = response.css("div.pagination a[href]::text").extract()
        maxpag = int(numpag[len(numpag) - 1])

        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

        for i in range(2, maxpag + 1):
            next_page = "https://projecteuler.net/archives;page=" + str(i)
            yield response.follow(next_page, self.parse_next)

    def parse_next(self, response):
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")
        yield l.load_item()
From the start page (archives) I follow every link to a problem, scraping the data I need with parse_problems. Then I launch the scraper on the other list pages of the site, with the same procedure for every list of links.
The Item definition, with its pre- and post-processors, is also very clean:
import re

import scrapy
from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags


def extract_first_number(text):
    i = re.search('\d+', text)
    return int(text[i.start():i.end()])


def array_to_value(element):
    return element[0]


class Problem(scrapy.Item):
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags, extract_first_number),
        output_processor=Compose(array_to_value)
    )
    title = scrapy.Field(input_processor=MapCompose(remove_tags))
    content = scrapy.Field()
I launch this with the command scrapy crawl euler -o euler.json and it outputs an array of unordered JSON objects, each corresponding to a single problem: this is fine for me because I'm going to process it with JavaScript, even though I think resolving the ordering problem via scrapy can be very simple.
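A quick post-processing script would also do the trick; a minimal sketch (it assumes the numeric id field produced by the Problem item above, and the file names are only illustrative):
import json

with open('euler.json') as f:
    problems = json.load(f)

# sort by the numeric problem id extracted by the item processors
problems.sort(key=lambda p: p['id'])

with open('euler_sorted.json', 'w') as f:
    json.dump(problems, f, indent=2)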
EDIT: in fact, doing it within scrapy is simple, using this pipeline:
import json


class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.list_items = []
        self.file = open('euler.json', 'w')

    def close_spider(self, spider):
        ordered_list = [None for i in range(len(self.list_items))]

        for i in self.list_items:
            ordered_list[int(i['id']) - 1] = json.dumps(dict(i))

        # join with commas so there is no trailing comma (which would make the JSON invalid)
        self.file.write("[\n" + ",\n".join(ordered_list) + "\n]\n")
        self.file.close()

    def process_item(self, item, spider):
        self.list_items.append(item)
        return item
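For the pipeline to actually run, it also has to be enabled in settings.py; a minimal sketch, assuming the class lives in eulerscraper/pipelines.py (adjust the dotted path to your project layout):
# settings.py
ITEM_PIPELINES = {
    'eulerscraper.pipelines.JsonWriterPipeline': 300,
}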
though the best solution may be to create a custom exporter:
from scrapy.exporters import JsonItemExporter
from scrapy.utils.python import to_bytes


class OrderedJsonItemExporter(JsonItemExporter):

    def __init__(self, file, **kwargs):
        # To initialize the object we use JsonItemExporter's constructor
        super().__init__(file)
        self.list_items = []

    def export_item(self, item):
        self.list_items.append(item)

    def finish_exporting(self):
        ordered_list = [None for i in range(len(self.list_items))]

        for i in self.list_items:
            ordered_list[int(i['id'] - 1)] = i

        for i in ordered_list:
            if self.first_item:
                self.first_item = False
            else:
                self.file.write(b',')
                self._beautify_newline()
            itemdict = dict(self._get_serialized_fields(i))
            data = self.encoder.encode(itemdict)
            self.file.write(to_bytes(data, self.encoding))
            self._beautify_newline()
        self.file.write(b"]")
and configure it in settings to call it for json:
FEED_EXPORTERS = {
    'json': 'eulerscraper.exporters.OrderedJsonItemExporter',
}

Related

Result is not saved in json

I am using scrapy and running this script:
import scrapy
from ..items import SizeerItem
from scrapy.http.request import Request


class SizeerSpiderSpider(scrapy.Spider):
    name = 'sizeer'
    pg = 0
    currentPg = 2
    start_urls = [
        'https://sizeer.lt/moterims'
    ]

    def parse(self, response):
        items = SizeerItem()

        pages = response.xpath("//nav[@class='m-pagination']//span[3]/text()").extract()
        pages = list(dict.fromkeys(pages))
        if self.pg == 0:
            pages = list(int(s) for s in pages[0].split() if s.isdigit())
            self.pg = pages[0]

        name = response.xpath("//div[@class='b-productList_content']//a/@href").extract()
        items['name'] = list(dict.fromkeys(name))

        while self.currentPg <= self.pg:
            url = response.request.url + "?sort=default&limit=60&page=" + str(self.currentPg)
            self.currentPg += 1
            yield Request(url, callback=self.parse)
This way:
scrapy crawl sizeer -s FEED_URI='mydata.json' -s FEED_FORMAT=json
But after that my mydata.json is empty. This is my first time trying to 'play' with it, and I can't really understand where the issue is.
You also need to yield the items you scrape so the Scrapy engine will run them through the pipelines and through the feed export (which is what you need to export to the file).
Since yield is non-blocking, you can add it just after populating the item, and the function will still yield your requests afterwards:
    ...
    name = response.xpath("//div[@class='b-productList_content']//a/@href").extract()
    items['name'] = list(dict.fromkeys(name))
    yield items  # <<< here, for example
    while self.currentPg <= self.pg:
    ...
As @yordan pointed out, you can simplify the way you are executing the spider like this (however, it's not the solution to the problem):
scrapy crawl sizeer -o mydata.json
Try this one:
Scrapy use item and save data in a json file
Pay attention to the yielding and the calling of the spider.
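Putting the hints together, a minimal sketch of the corrected parse method could look like this (the selectors and field names are taken from the question as-is, not verified against the site):
    def parse(self, response):
        items = SizeerItem()

        pages = response.xpath("//nav[@class='m-pagination']//span[3]/text()").extract()
        pages = list(dict.fromkeys(pages))
        if self.pg == 0:
            pages = list(int(s) for s in pages[0].split() if s.isdigit())
            self.pg = pages[0]

        name = response.xpath("//div[@class='b-productList_content']//a/@href").extract()
        items['name'] = list(dict.fromkeys(name))
        yield items  # hand the populated item to the feed export

        # yielding the item above does not stop the pagination requests below
        while self.currentPg <= self.pg:
            url = response.request.url + "?sort=default&limit=60&page=" + str(self.currentPg)
            self.currentPg += 1
            yield Request(url, callback=self.parse)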

How to write python scrapy code for extracting url's present in sitemap of a site and export it to csv

I've found a working solution for writing Python scrapy code to extract the URLs present in the sitemap of a site from here, but I don't know how to export the data to a CSV file!
When I try to run scrapy crawl myspider -o mydata.csv, it returns an empty CSV file, but the list of URLs gets printed on screen!
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests


class GetpagesfromsitemapSpider(SitemapSpider):
    name = "myspider"
    handle_httpstatus_list = [404]

    def parse(self, response):
        print(response.url)

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('Ignoring invalid sitemap: %s', response.url)
                return

            s = Sitemap(body)
            sites = []
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            sites.append(loc)
                            break
            print(sites)

    def __init__(self, spider=None, *a, **kw):
        super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
        self.spider = spider
        l = []
        url = "http://www.example.com/"
        resp = requests.head(url + "/sitemap.xml")
        if (resp.status_code != 404):
            l.append(resp.url)
        else:
            resp = requests.head(url + "/robots.txt")
            if (resp.status_code == 200):
                l.append(resp.url)
        self.sitemap_urls = l
        print(self.sitemap_urls)


def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for l in d['alternate']:
                yield l
First, you aren't making any request with Scrapy, and you're also combining Scrapy with requests, which I think is not the best idea. Try changing __init__ to:
def start_requests(self):
    l = []
    url = "http://www.example.com"
    l.append(url + '/sitemap.xml')
    l.append(url + '/robots.txt')
    for link in l:
        yield Request(link, callback=self._parse_sitemap)
Also, your self._parse_sitemap SHOULD return dict-like items or Requests (and not only self._parse_sitemap: every callback function in your Scrapy spider should, see the docs):
def _parse_sitemap(self, response):
    # handle status responses here (200, 401, etc.)
    body = self._get_sitemap_body(response)
    if body is None:
        self.logger.info('Ignoring invalid sitemap: %s', response.url)
        return

    s = Sitemap(body)
    sites = {'urls': []}  # You should return a dict-like item! (the 'urls' key name is arbitrary)
    if s.type == 'sitemapindex':
        for loc in iterloc(s, self.sitemap_alternate_links):
            if any(x.search(loc) for x in self._follow):
                yield Request(loc, callback=self._parse_sitemap)
    elif s.type == 'urlset':
        for loc in iterloc(s):
            for r, c in self._cbs:
                if r.search(loc):
                    sites['urls'].append(loc)
                    break
    yield sites  # Change print to yield! This is how you populate your .csv file
The whole file (it probably doesn't work as-is, but it explains the idea):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests


class GetpagesfromsitemapSpider(SitemapSpider):
    name = "myspider"
    handle_httpstatus_list = [404]

    def parse(self, response):
        print(response.url)

    def _parse_sitemap(self, response):
        # handle status responses here (200, 401, etc.)
        body = self._get_sitemap_body(response)
        if body is None:
            self.logger.info('Ignoring invalid sitemap: %s', response.url)
            return

        s = Sitemap(body)
        sites = {'urls': []}  # You should return a dict-like item! (the 'urls' key name is arbitrary)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        sites['urls'].append(loc)
                        break
        yield sites  # Change print to yield! This is how you populate your .csv file

    def start_requests(self):
        l = []
        url = "http://www.example.com"
        l.append(url + '/sitemap.xml')
        l.append(url + '/robots.txt')
        for link in l:
            yield Request(link, callback=self._parse_sitemap)


def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for l in d['alternate']:
                yield l

Reducing data usage of Scrapy Spider/How to be a better internet citizen?

I recently resurrected an old scrapy spider (0.18.4) that crawls craigslist. The spider grabs links from the first page of apartment listings every 15 minutes and scrapes the links of new postings. (see code below)
This spider was downloading >1GB of data every 15 minutes! I was able to reduce it by half, but this is still a lot of load on the craigslist site. Please note, this is from the spider alone, as I tested without pipelines enabled. I have also tried limiting through MAX_DEPTH and DOWNLOAD_MAXSIZE in settings.py, spider class settings, and individual follow-on request meta settings. I have spent several hours in the docs, messed around with Rules, updated to the latest version of scrapy, etc.; all to no avail. Granted, I wrote this script several years ago when I was new to Python, but perhaps my folly can be the community's gain...
Given my code below, what can I, or anyone else using scrapy, do to reduce the amount of data I'm downloading when I only care about a couple of kB of text? Which calls are data-greedy?
Spider:
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider
#from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from craigslist_2br.items import CraigslistItem
from craigslist_2br import craigslistTools
from scrapy.http import Request
from urlparse import urljoin
import types
import exceptions
import datetime
import ipdb as pdb
#from ghost import Ghost


class Suppressor:

    def __init__(self, exception_type, localList):
        self._exception_type = exception_type
        self.safe_dict = dict([(k, locals().get(k, None)) for k in localList])

    def __call__(self, expression):
        try:
            exec(expression, {"__builtins__": None}, self.safe_dict)
        except self._exception_type as e:
            print 'Suppressor: suppressed exception %s with content \'%s\'' % (type(self._exception_type), e)


class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/apa/"]
    #g=Ghost()

    def parse(self, response):
        hxsM = HtmlXPathSelector(response)
        ##titles = hxs.select('//a[@class="result-title hdrlnk"]')
        postings = hxsM.select('//p[@class="result-info"]')
        resultRows = hxsM.select('//li[@class="result-row"]')
        try:
            IDf = open("/home/dapper/python/scrapy/scrapy_craig2br/craigslist_2br/craigslist_2br/postingIDs.txt", "r")
            IDvec = [int(sl.strip("\n")) for sl in IDf.readlines()]
        except:
            IDvec = []
        finally:
            try:
                IDf.close()
            except:
                pass
        for posting, resultRow in zip(postings, resultRows):
            item = CraigslistItem()
            try:
                item["ID"] = posting.select("a/@data-id").extract()[0]
                if (int(item["ID"]) == int(resultRow.select("@data-pid").extract()[0])):
                    try:
                        item["repostID"] = resultRow.select("@data-repost-of").extract()[0]
                    except:
                        item["repostID"] = ''
                else:
                    item["repostID"] = ''
            except:
                item["ID"] = ''
                item["repostID"] = ''
            try:
                item["price"] = posting.select("span[@class='result-meta']/span[@class='result-price']/text()").extract()[0]
            except:
                item["price"] = 0
            try:
                item["neighborhood"] = posting.select("span[@class='result-meta']/span[@class='result-hood']/text()").extract()[0]
            except:
                item["neighborhood"] = ''
            if ((not any(item["ID"]) or (int(item["ID"]) not in IDvec)) and
                    (craigslistTools.FixPriceStr(item['price']) < 3000.0) and
                    (not((any(item["repostID"]) and (int(item["repostID"]) in IDvec)) and (any(item['neighborhood']) and craigslistTools.outDaHoods(item['neighborhood']))))):
                #s = Suppressor(exception_type=exceptions.IndexError, localList=[item,titles,postings])  # TODO: put your exception type here
                item["title"] = posting.select("a/text()").extract()[0]
                item["link"] = posting.select("a/@href").extract()[0]
                item["dateAdded"] = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
                try:
                    item["size"] = posting.select("span[@class='result-meta']/span[@class='housing']/text()").extract()[0]
                except:
                    item["size"] = ''
                if item['link']:
                    if 'http://' not in item['link']:
                        if isinstance(item['link'], types.StringTypes):
                            item['link'] = urljoin(response.url, item['link'])
                        else:
                            item['link'] = urljoin(response.url, item['link'][0])
                    yield Request(item['link'],
                                  meta={'item': item, 'download_maxsize': 8**8, 'depth_limit': 1},
                                  callback=self.anchor_page)

    def anchor_page(self, response):
        hxs = HtmlXPathSelector(response)
        old_item = response.request.meta['item']  # Receiving parse method item that was in Request meta
        # parse some more values
        # place them in old_item
        # e.g.
        old_item['postingbody'] = hxs.select("//section [@id='postingbody']").extract()
        try:
            latit = float(hxs.select("//div [@id='map']//@data-latitude").extract()[0])
            longi = float(hxs.select("//div [@id='map']//@data-longitude").extract()[0])
            #print '%f,%f' % (latit, longi)
        except:
            latit = 0
            longi = 0
        try:
            old_item["address"] = hxs.select(".//div [@class='mapaddress']/text()").extract()[0]
        except:
            old_item["address"] = []
        try:
            if any(latit) & (not any([char.isdigit() for char in old_item["address"][0]])):
                try:
                    old_item["address"] = craigslistTools.coord2addr(latit, longi)
                except:
                    pass
            elif (not any(old_item["address"])):
                try:
                    zips = [94611, 94610, 94609, 94606, 94618, 94705]
                    for z in zips:
                        for line in old_item['postingbody'].split("\n"):
                            if str(z) in line:
                                old_item["address"] = line
                except:
                    pass
        except:
            pass
        if (latit == 0) & (any(old_item["address"])):
            try:
                cities = ['Oakland', 'oakland', 'Piedmont', 'piedmont', 'Berkeley', 'berkeley', 'montclair', 'Montclair']
                if not any([c in old_item["address"] for c in cities]):
                    old_item["address"] += ', Oakland'
                geoRes = Geocoder.geocode(old_item["address"])
                latit = geoRes[0].latitude
                longi = geoRes[0].longitude
            except:
                pass
        old_item["latit"] = latit
        old_item["longi"] = longi
        try:
            (bartDist, bartCoord) = craigslistTools.bartDist((latit, longi))
        except:
            bartDist = 0
        try:
            if ((bartDist != 0) and (bartDist < 2)):
                bartWalkingTime = craigslistTools.bartWalking((latit, longi), bartCoord)
            else:
                bartWalkingTime = 0
        except:
            bartWalkingTime = 0
        old_item["bartDist"] = bartDist
        old_item["BartWalkingTime"] = bartWalkingTime
        try:
            if ((bartDist < 1) and (bartDist != 0)):
                old_item['LLNLDrivingTime'] = craigslistTools.LLNLdriving((latit, longi))
            else:
                old_item['LLNLDrivingTime'] = 0
        except:
            old_item['LLNLDrivingTime'] = 0
        try:
            old_item["Br"] = int(hxs.select("//p [@class='attrgroup']/span/b/text()").extract()[0].strip('BR'))
            old_item["baths"] = int(hxs.select("//p [@class='attrgroup']/span/b/text()").extract()[1].strip('Ba'))
        except:
            try:
                old_item["Br"] = int(re.findall(r"(\d+)br", old_item['size'])[0])
                old_item["baths"] = 1
            except:
                old_item["Br"] = -1
                old_item["baths"] = -1
        try:
            old_item['amenities'] = hxs.select("//p [@class='attrgroup']/span/text()").extract()
        except:
            old_item['amenities'] = []
        yield old_item
settings.py:
# Scrapy settings for craigslist_2br project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'craigslist_2br'
SPIDER_MODULES = ['craigslist_2br.spiders']
NEWSPIDER_MODULE = 'craigslist_2br.spiders'
ITEM_PIPELINES = {'craigslist_2br.pipelines.Craigslist2BrPipeline':0}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'craigslist_2br (+http://www.craigslist.org)'
DOWNLOAD_DELAY = 5
RANDOMIZE_DOWNLOAD_DELAY = False
CONCURRENT_REQUESTS_PER_DOMAIN = 1 # Default: 8
#SCHEDULER = 'scrapy.core.scheduler.Scheduler'
#HTTPCACHE_ENABLED = True
DEPTH_LIMIT = 1
Good news! The problem I experienced will likely not affect you (just what you wanted to read when searching for answers ;) ). Using the deprecated HtmlXPathSelector from v0.18 can cause major issues (surprise, surprise), so don't do it.
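Beyond that, if you're on a recent Scrapy version, a few standard settings help cut bandwidth and load on the target site; a minimal sketch (the values are only illustrative, adjust them to your own crawl):
# settings.py
HTTPCACHE_ENABLED = True           # serve unchanged pages from a local cache
HTTPCACHE_EXPIRATION_SECS = 900    # 15 minutes, matching the crawl interval
AUTOTHROTTLE_ENABLED = True        # back off automatically based on server latency
DOWNLOAD_MAXSIZE = 1024 * 1024     # drop responses bigger than 1 MB
DOWNLOAD_DELAY = 5
CONCURRENT_REQUESTS_PER_DOMAIN = 1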

Why does 'return' in scrapy print so many times?

When I run the spider from the command line, it returns the same result repeated many times. When I add the line items['i is '] = i before items.append(item), the result is the last number, repeated the same number of times.
So maybe the problem is with the use of return. What's the right way to do it?
# -*- coding: utf-8 -*-
from a.items import Item
from scrapy.selector import Selector
import scrapy
import json


class ASpider(scrapy.Spider):
    name = "A"
    allowed_domains = ["a.com"]
    start_urls = []
    start_urls.append("a.com")

    def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        items = []
        item = Item()
        item['a_id'] = response.url[120:-51]
        item['a_nights'] = jsonresponse['calendar_months'][0]['condition_ranges'][0]['conditions']['min_nights']
        for i in range(0, len(jsonresponse['calendar_months'][0]['days'])):
            item['{}'.format(jsonresponse['calendar_months'][0]['days'][i]['date'])] = jsonresponse['calendar_months'][0]['days'][i]['available']
            items.append(item)
        return items
You need to create a new item for each day.
If you don't, you are simply updating the same item with each day's values, so only the values from the last day remain, and the list ends up containing multiple references to that one item. Which is exactly what you describe happening.
def parse(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    items = []
    for day in jsonresponse['calendar_months'][0]['days']:
        item = Item()
        item['a_id'] = response.url[120:-51]
        item['a_nights'] = jsonresponse['calendar_months'][0]['condition_ranges'][0]['conditions']['min_nights']
        item['{}'.format(day['date'])] = day['available']
        items.append(item)
    return items
Consider using the for loop above and not indexing when possible.
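For illustration, with some made-up day dicts shaped like the JSON above:
days = [{'date': '2020-01-01', 'available': True},
        {'date': '2020-01-02', 'available': False}]

# indexing: verbose and easy to get wrong
for i in range(0, len(days)):
    print(days[i]['date'], days[i]['available'])

# direct iteration: the same loop, easier to read
for day in days:
    print(day['date'], day['available'])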

Added iterating over page id in Scrapy, responses in parse method no longer run

I have a few print calls in my spider for debugging. In the start_requests function, I'm generating URLs by adding numbers in the range [0,4] to a base URL, which then get parsed by the parse_grant function. In that function, the first print gets called, but the second does not.
Still learning here, so I may have made a stupid mistake and don't quite understand what's happening with Twisted in the background.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider, Rule
from scrapy.http import Request
from scraper_app.items import NSERCGrant
from scrapy.selector import Selector


class NSERC_Spider(Spider):
    name = 'NSERCSpider'
    allowed_domains = ["http://www.nserc-crsng.gc.ca"]
    # Maximum page id to use.
    max_id = 5

    def start_requests(self):
        for i in range(self.max_id):
            if i == 0:
                continue
            yield Request("http://www.nserc-crsng.gc.ca/ase-oro/Details-Detailles_eng.asp?id=%d" % i,
                          callback=self.parse_grant)

    def parse_grant(self, response):
        print("Being called")
        sel = Selector(response)
        grants = sel.xpath('.//html//body')
        items = []
        for response in grants:
            print("Responses being called")
            item = NSERCGrant()
            # Row one
            item['Competition_Year'] = response.xpath('.//tr[1]//td[2]//text()').extract()
            item['Fiscal_Year'] = response.xpath('.//tr[1]//td[4]//text()').extract()
            # Row two
            item['Project_Lead_Name'] = response.xpath('.//tr[2]//td[2]//text()').extract()
            item['Institution'] = response.xpath('.//tr[2]//td[4]//text()').extract()
            # Row three
            item['Department'] = response.xpath('.//tr[3]//td[2]//text()').extract()
            item['Province'] = response.xpath('.//tr[3]//td[4]//text()').extract()
            # Row four
            item['Award_Amount'] = response.xpath('.//tr[4]//td[2]//text()').extract()
            item['Installment'] = response.xpath('.//tr[4]//td[4]//text()').extract()
            # Row five
            item['Program'] = response.xpath('.//tr[5]//td[2]//text()').extract()
            item['Selection_Committee'] = response.xpath('.//tr[5]//td[4]//text()').extract()
            # Row six
            item['Research_Subject'] = response.xpath('.//tr[6]//td[2]//text()').extract()
            item['Area_of_Application'] = response.xpath('.//tr[6]//td[4]//text()').extract()
            # Row seven
            item['Co_Researchers'] = response.xpath(".//tr[7]//td[2]//text()").extract()
            item['Partners'] = response.xpath('.//tr[7]//td[4]//text()').extract()
            # Award Summary
            item['Award_Summary'] = response.xpath('.//p//text()').extract()
            items.append(item)
        return items
The information you are looking for only occurs once on each page, and the body tag is on every page, so the loop and the line
grants = sel.xpath('.//html//body')
are redundant. Also, response.xpath('... your xpath here ...') saves some code. Try this:
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.http import Request
from scraper_app.items import NSERCGrant


class NSERC_Spider(Spider):
    name = 'NSERCSpider'
    allowed_domains = ["http://www.nserc-crsng.gc.ca"]
    # Maximum page id to use.
    max_id = 5

    def start_requests(self):
        for i in range(1, self.max_id):
            yield Request("http://www.nserc-crsng.gc.ca/ase-oro/Details-Detailles_eng.asp?id=%d" % i,
                          callback=self.parse_grant)

    def parse_grant(self, response):
        print("Being called")
        item = NSERCGrant()
        # Row one
        item['Competition_Year'] = response.xpath('//tr[1]//td[2]//text()').extract()
        item['Fiscal_Year'] = response.xpath('//tr[1]//td[4]//text()').extract()
        # Row two
        item['Project_Lead_Name'] = response.xpath('//tr[2]//td[2]//text()').extract()
        item['Institution'] = response.xpath('//tr[2]//td[4]//text()').extract()
        # Row three
        item['Department'] = response.xpath('//tr[3]//td[2]//text()').extract()
        item['Province'] = response.xpath('//tr[3]//td[4]//text()').extract()
        # Row four
        item['Award_Amount'] = response.xpath('//tr[4]//td[2]//text()').extract()
        item['Installment'] = response.xpath('//tr[4]//td[4]//text()').extract()
        # Row five
        item['Program'] = response.xpath('//tr[5]//td[2]//text()').extract()
        item['Selection_Committee'] = response.xpath('//tr[5]//td[4]//text()').extract()
        # Row six
        item['Research_Subject'] = response.xpath('//tr[6]//td[2]//text()').extract()
        item['Area_of_Application'] = response.xpath('//tr[6]//td[4]//text()').extract()
        # Row seven
        item['Co_Researchers'] = response.xpath("//tr[7]//td[2]//text()").extract()
        item['Partners'] = response.xpath('//tr[7]//td[4]//text()').extract()
        # Award Summary
        item['Award_Summary'] = response.xpath('//p//text()').extract()
        yield item
I've also tweaked your start_requests routine to remove the if i == 0 check.
Take a look at scrapy shell which allows you to try out your xpaths and see the results interactively.
When I try
grants = sel.xpath('.//html//body')
from my scrapy shell, this is what I get
In [10]: grants = sel.xpath('.//html//body')
In [11]: grants
Out[11]: []
When I change it to the following code,
In [12]: grants = sel.xpath('/html/body')
In [13]: grants
Out[13]: [<Selector xpath='/html/body' data=u'<body>\r\n<div id="cn-body-inner-1col">\r\n<'>]
