I use the XMLFeedSpider in Scrapy to scrape a real estate website.
Each URL request generated by my spider (via start_urls) returns an XML page with a batch of ads and a link to the next page (search results are limited to 50 ads).
I was therefore wondering how I could add that additional page as a new request in my spider.
I've been searching through Stack Overflow for a while but I just can't find a simple answer to my problem!
Below is the code I have in my spider. I have updated it with the parse_nodes() method mentioned by Paul, but the next URL is not picked up for some reason.
Could I yield additional requests in the adapt_response method?
from scrapy.spider import log
from scrapy.selector import XmlXPathSelector
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import RefItem, PicItem
from crawler.seloger_helper import urlbuilder
from scrapy.http import Request


class Seloger_spider_XML(XMLFeedSpider):
    name = 'Seloger_spider_XML'
    allowed_domains = ['seloger.com']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'annonce'

    '''Spider initialized with department as argument'''
    def __init__(self, departement=None, *args, **kwargs):
        super(Seloger_spider_XML, self).__init__(*args, **kwargs)
        #self.start_urls = urlbuilder(departement)  # helper function which generates start_urls
        self.start_urls = ['http://ws.seloger.com/search.xml?cp=72&idtt=2&tri=d_dt_crea&SEARCHpg=1']

    def parse_node(self, response, node):
        items = []
        item = RefItem()
        item['ref'] = int(''.join(node.select('//annonce/idAnnonce/text()').extract()))
        item['desc'] = ''.join(node.select('//annonce/descriptif/text()').extract()).encode('utf-8')
        item['libelle'] = ''.join(node.select('//annonce/libelle/text()').extract()).encode('utf-8')
        item['titre'] = ''.join(node.select('//annonce/titre/text()').extract()).encode('utf-8')
        item['ville'] = ''.join(node.select('//annonce/ville/text()').extract()).encode('utf-8')
        item['url'] = ''.join(node.select('//annonce/permaLien/text()').extract()).encode('utf-8')
        item['prix'] = ''.join(node.select('//annonce/prix/text()').extract())
        item['prixunite'] = ''.join(node.select('//annonce/prixUnite/text()').extract())
        item['datemaj'] = ''.join(node.select('//annonce/dtFraicheur/text()').extract())[:10]
        item['datecrea'] = ''.join(node.select('//annonce/dtCreation/text()').extract())[:10]
        item['lati'] = ''.join(node.select('//annonce/latitude/text()').extract())
        item['longi'] = ''.join(node.select('//annonce/longitude/text()').extract())
        item['surface'] = ''.join(node.select('//annonce/surface/text()').extract())
        item['surfaceunite'] = ''.join(node.select('//annonce/surfaceUnite/text()').extract())
        item['piece'] = ''.join(node.select('//annonce/nbPiece/text()').extract()).encode('utf-8')
        item['ce'] = ''.join(node.select('//annonce/dbilanEmissionGES/text()').extract()).encode('utf-8')
        items.append(item)

        for photos in node.select('//annonce/photos'):
            for link in photos.select('photo/thbUrl/text()').extract():
                pic = PicItem()
                pic['pic'] = link.encode('utf-8')
                pic['refpic'] = item['ref']
                items.append(pic)
        return items

    def parse_nodes(self, response, nodes):
        for n in super(Seloger_spider_XML, self).parse_nodes(response, nodes):
            yield n

        # once you're done with items/nodes,
        # look for the next page link using XPath
        # these lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pageSuivante/text()').extract():
            yield Request(link_url)
Thank you
Gilles
You can override the parse_nodes() method to hook in your "next page" URL extraction.
The example below is based on the XMLFeedSpider example from the Scrapy docs:
from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request


class MySpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))

        item = TestItem()
        item['id'] = node.select('@id').extract()
        item['name'] = node.select('name').extract()
        item['description'] = node.select('description').extract()
        return item

    def parse_nodes(self, response, nodes):
        # call the built-in method that itself calls parse_node()
        # and yield whatever it returns
        for n in super(MySpider, self).parse_nodes(response, nodes):
            yield n

        # once you're done with items/nodes,
        # look for the next page link using XPath
        # these lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pageSuivante/text()').extract():
            print "link_url", link_url
            yield Request(link_url)
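(Side note, not part of the original answer: scrapy.contrib and XmlXPathSelector have since been deprecated. Under current Scrapy versions the same next-page hook can be written roughly as the sketch below, which assumes the feed still exposes a pageSuivante element; the spider name and the parse_node body are only placeholders mirroring the question above.)

from scrapy.spiders import XMLFeedSpider


class NextPageFeedSpider(XMLFeedSpider):
    # hypothetical spider; name/start_urls/itertag mirror the question above
    name = 'nextpage_feed'
    start_urls = ['http://ws.seloger.com/search.xml?cp=72&idtt=2&tri=d_dt_crea&SEARCHpg=1']
    itertag = 'annonce'

    def parse_node(self, response, node):
        # item extraction would go here, as in the original spider
        return {'ref': node.xpath('idAnnonce/text()').get()}

    def parse_nodes(self, response, nodes):
        # yield whatever the stock implementation produces first
        for n in super(NextPageFeedSpider, self).parse_nodes(response, nodes):
            yield n
        # response.xpath() replaces XmlXPathSelector; response.follow()
        # resolves relative URLs and defaults to the spider's standard parsing
        for link_url in response.xpath('//pageSuivante/text()').getall():
            yield response.follow(link_url)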
I believe I have written my XPaths incorrectly, because I only get a single result per URL, whereas there are 25 job posts in total on each URL (not counting those on the next page). How can I correct my XPaths to get all the results?
Here's my scraper:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict


class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())


class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
There was a slight mistake in the requests, which I updated for those of you who checked within the first 15 minutes after I posted it.
The problem was in the container's XPath. You were selecting only the container, without the items inside it, so you looped once over the container itself rather than over the actual items you want to scrape.
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict


class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())


class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
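(As a usage note, not shown in the original post: the CrawlerProcess import suggests the spider is run as a standalone script. A minimal way to do that might look like the sketch below, assuming a recent Scrapy version with the FEEDS setting; the cv_jobs.csv output name is just an assumption.)

process = CrawlerProcess(settings={
    'FEEDS': {'cv_jobs.csv': {'format': 'csv'}},  # assumed output target
})
process.crawl(CvSpider)
process.start()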
I'd like to collect an item's data from 2 different websites.
It should work as follows:
parse_website_1 fetches a person's name from website_1 and populates the item
parse_website_1 yields a request for parse_website_2
parse_website_2 parses website_2 and collects the person's hair color, based on the name scraped from website_1, then populates the item
parse_website_2 loads the item
Would this be going in the right direction, given that the item is defined in items.py:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem


class MySpider(scrapy.Spider):
    name = "myspider"

    def __init__(self):
        self.item = ItemLoader(item=MyItem(), response=response)

    def start_requests(self):
        scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        self.item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2)

    def parse_website_2(self, response):
        self.item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield self.item.load_item()
The idea is right, but the implementation is not correct in that you are trying to pass data between consecutive requests using an instance attribute (self.item). Scrapy requests are asynchronous, so it would not work as expected.
The correct way to do it is outlined in the Scrapy FAQ: pass the partial item data to the consecutive request using the Request's meta attribute, retrieve it in the next callback from the Response's meta attribute, add some more data, and finally yield the item. Here's the adapted code:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem


class MySpider(scrapy.Spider):
    name = "myspider"

    def start_requests(self):
        yield scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        item = ItemLoader(item=MyItem(), response=response)
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2, meta={'item': item})

    def parse_website_2(self, response):
        item = response.meta['item']
        item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield item.load_item()
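(Not part of the original answer: newer Scrapy versions, 1.7 and later, also provide cb_kwargs for passing data between callbacks, which arrives as a plain keyword argument instead of going through meta. A rough sketch under the same assumed page structure and item fields:)

import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem


class MySpider(scrapy.Spider):
    name = "myspider"

    def start_requests(self):
        yield scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        name = response.xpath('//div[@class="name"]/text()').get()
        # pass the scraped name straight into the next callback as a keyword argument
        yield scrapy.Request(url="http://website_2.com/" + name,
                             callback=self.parse_website_2,
                             cb_kwargs={'name': name})

    def parse_website_2(self, response, name):
        loader = ItemLoader(item=MyItem(), response=response)
        loader.add_value("name", name)
        loader.add_xpath("hair_color", '//div[@class="hair_color"]/text()')
        yield loader.load_item()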
I am trying to extract information from Listing and Detail pages.
The code below correctly scrapes the reviewer information from the Listing page and all linked pages (those where an a element contains "Next").
The detail_pages URLs are also captured, e.g. http://www.screwfix.com/p/prysmian-6242y-twin-earth-cable-2-5mm-x-100m-grey/20967
However, I cannot see how to navigate to the Detail pages and scrape the information from them.
Is there anyone here who has used Scrapy successfully and can help me finish this spider?
Thank you for the help.
I include the code for the spider below:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from hn_scraper.items import HnArticleItem


class ScrewfixSpider(Spider):
    name = "Screwfix"
    allowed_domains = ["www.screwfix.com"]
    start_urls = ('http://www.screwfix.com/', )

    link_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//a[contains(., "Next")]', ))

    detail_page_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//tr[@id[contains(., "reviewer")]]/td[3]/a', ))

    def extract_one(self, selector, xpath, default=None):
        extracted = selector.xpath(xpath).extract()
        if extracted:
            return extracted[0]
        return default

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(link_text=link.text)
            yield request

        for item in self.parse_item(response):
            yield item

    def parse_item(self, response):
        selector = Selector(response)

        rows = selector.xpath('//table[contains(.,"crDataGrid")]//tr[@id[contains(., "reviewer")]]')
        for row in rows:
            item = HnArticleItem()

            reviewer = row.xpath('td[3]/a')
            reviewer_url = self.extract_one(reviewer, './@href', '')
            reviewer_name = self.extract_one(reviewer, 'b/text()', '')
            total_reviews = row.xpath('td[4]/text()').extract()

            item['url'] = reviewer_url
            item['name'] = reviewer_name
            item['total_reviews'] = total_reviews
            yield item

        detail_pages = self.detail_page_extractor.extract_links(response)
        if detail_pages:
            print 'detail_pages'
            print detail_pages[0].url
            yield Request(detail_pages[0].url)
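(For what it's worth, and not part of the original post: the usual missing piece is simply to give each detail-page request its own callback. A hedged sketch, reusing the spider's own helpers, with a hypothetical parse_detail method and placeholder XPaths that would need adjusting to the real detail-page markup:)

    def parse_item(self, response):
        # ... reviewer rows as above ...
        for link in self.detail_page_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse_detail)

    def parse_detail(self, response):
        selector = Selector(response)
        item = HnArticleItem()
        # placeholder field/XPath; adjust to whatever the detail page actually holds
        item['name'] = self.extract_one(selector, '//h1/text()', '')
        item['url'] = response.url
        yield item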
My goal is to extract all 25 rows (6 items per row) per page, then iterate over each of the 40 pages.
Currently, my spider only extracts the first row from pages 1-3 (see CSV output image).
I assumed the list_iterator() function would iterate over each row; however, there appears to be an error in either my rules or my list_iterator() function that is preventing all rows per page from being scraped.
Any assistance or advice is greatly appreciated!
propub_spider.py:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from propub.items import PropubItem
from scrapy.http import Request


class propubSpider(CrawlSpider):
    name = 'prop$'
    allowed_domains = ['https://projects.propublica.org']
    max_pages = 40
    start_urls = [
        'https://projects.propublica.org/docdollars/search?state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=2&state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=3&state%5Bid%5D=33']

    rules = (Rule(SgmlLinkExtractor(allow=('\\search?page=\\d')), 'parse_start_url', follow=True),)

    def list_iterator(self):
        for i in range(self.max_pages):
            yield Request('https://projects.propublica.org/docdollars/search?page=d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]/tbody'):
            item = PropubItem()
            item['payee'] = sel.xpath('tr[1]/td[1]/a[2]/text()').extract()
            item['link'] = sel.xpath('tr[1]/td[1]/a[1]/@href').extract()
            item['city'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['state'] = sel.xpath('tr[1]/td[3]/text()').extract()
            item['company'] = sel.xpath('tr[1]/td[4]').extract()
            item['amount'] = sel.xpath('tr[1]/td[7]/span/text()').extract()
            yield item
pipelines.py:
import csv


class PropubPipeline(object):

    def __init__(self):
        self.myCSV = csv.writer(open('C:\Users\Desktop\propub.csv', 'wb'))
        self.myCSV.writerow(['payee', 'link', 'city', 'state', 'company', 'amount'])

    def process_item(self, item, spider):
        self.myCSV.writerow([item['payee'][0].encode('utf-8'),
                             item['link'][0].encode('utf-8'),
                             item['city'][0].encode('utf-8'),
                             item['state'][0].encode('utf-8'),
                             item['company'][0].encode('utf-8'),
                             item['amount'][0].encode('utf-8')])
        return item
items.py:
import scrapy
from scrapy.item import Item, Field


class PropubItem(scrapy.Item):
    payee = scrapy.Field()
    link = scrapy.Field()
    city = scrapy.Field()
    state = scrapy.Field()
    company = scrapy.Field()
    amount = scrapy.Field()
    pass
CSV output: (screenshot omitted)
Multiple things need to be fixed:
use start_requests() method instead of list_iterator()
there is a missing % here:
yield Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)
# HERE^
you don't need CrawlSpider since you are providing the pagination links via start_requests() - use a regular scrapy.Spider
it would be more reliable if the XPath expressions matched the cells by their class attributes
Fixed version:
import scrapy

from propub.items import PropubItem


class propubSpider(scrapy.Spider):
    name = 'prop$'
    allowed_domains = ['projects.propublica.org']
    max_pages = 40

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]//tr[@data-payment-id]'):
            item = PropubItem()
            item['payee'] = sel.xpath('td[@class="name_and_payee"]/a[last()]/text()').extract()
            item['link'] = sel.xpath('td[@class="name_and_payee"]/a[1]/@href').extract()
            item['city'] = sel.xpath('td[@class="city"]/text()').extract()
            item['state'] = sel.xpath('td[@class="state"]/text()').extract()
            item['company'] = sel.xpath('td[@class="company"]/text()').extract()
            item['amount'] = sel.xpath('td[@class="amount"]/text()').extract()
            yield item
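(Side note, not from the original answer: for a flat CSV like this, the custom pipeline in pipelines.py can usually be replaced by Scrapy's built-in feed exports, either from the command line with "scrapy crawl 'prop$' -o propub.csv -t csv" or via settings.py, roughly as below for Scrapy versions of this era.)

# settings.py -- built-in CSV feed export instead of the custom pipeline
# (FEED_URI / FEED_FORMAT are the pre-2.1 setting names)
FEED_URI = 'propub.csv'
FEED_FORMAT = 'csv'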
I have a problem like this:
how to filter duplicate requests based on url in scrapy
So, I do not want a website to be crawled more than once. I adapted the middleware and added a print statement to test whether it correctly classifies already-seen websites. It does.
Nonetheless, the parsing seems to be executed multiple times, because the JSON file I receive contains duplicate entries.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from crawlspider.items import KickstarterItem
from HTMLParser import HTMLParser


### code for stripping off HTML tags:
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return str(''.join(self.fed))


def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
###

items = []


class MySpider(CrawlSpider):
    name = 'kickstarter'
    allowed_domains = ['kickstarter.com']
    start_urls = ['http://www.kickstarter.com']

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(SgmlLinkExtractor(allow=('discover/categories/comics', ))),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(SgmlLinkExtractor(allow=('projects/', )), callback='parse_item'),
    )

    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)

        hxs = HtmlXPathSelector(response)
        item = KickstarterItem()

        item['date'] = hxs.select('//*[@id="about"]/div[2]/ul/li[1]/text()').extract()

        item['projname'] = hxs.select('//*[@id="title"]/a').extract()
        item['projname'] = strip_tags(str(item['projname']))

        item['projauthor'] = hxs.select('//*[@id="name"]')
        item['projauthor'] = item['projauthor'].select('string()').extract()[0]

        item['backers'] = hxs.select('//*[@id="backers_count"]/data').extract()
        item['backers'] = strip_tags(str(item['backers']))

        item['collmoney'] = hxs.select('//*[@id="pledged"]/data').extract()
        item['collmoney'] = strip_tags(str(item['collmoney']))

        item['goalmoney'] = hxs.select('//*[@id="stats"]/h5[2]/text()').extract()

        items.append(item)
        return items
My items.py looks like this:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field


class KickstarterItem(Item):
    # define the fields for your item here like:
    date = Field()
    projname = Field()
    projauthor = Field()
    backers = Field()
    collmoney = Field()
    goalmoney = Field()
    pass
My middleware looks like this:
import os

from scrapy.dupefilter import RFPDupeFilter
from scrapy.utils.request import request_fingerprint


class CustomFilter(RFPDupeFilter):

    def __getid(self, url):
        mm = url.split("/")[4]  # extracts project-id (is a number) from project-URL
        print "_____________", mm
        return mm

    def request_seen(self, request):
        fp = self.__getid(request.url)
        self.fingerprints.add(fp)
        if fp in self.fingerprints and fp.isdigit():  # .isdigit() checks whether fp comes from a project ID
            print "______fp is a number (therefore a project-id) and has been encountered before______"
            return True
        if self.file:
            self.file.write(fp + os.linesep)
I added this line to settings.py:
DUPEFILTER_CLASS = 'crawlspider.duplicate_filter.CustomFilter'
I call the script using "scrapy crawl kickstarter -o items.json -t json". Then I see the correct print statements from the middleware code.
Any comments on why the JSON contains multiple entries with the same data?
So now these are the three modifications that removed the duplicates:
I added this to settings.py:
ITEM_PIPELINES = ['crawlspider.pipelines.DuplicatesPipeline',]
to let Scrapy know that I added a DuplicatesPipeline class in pipelines.py:
from scrapy import signals
from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['projname'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['projname'])
            return item
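(A small aside, not from the original answer: in newer Scrapy versions ITEM_PIPELINES is expected to be a dict mapping the pipeline path to an order number rather than a list, e.g.:)

# settings.py, newer Scrapy versions
ITEM_PIPELINES = {
    'crawlspider.pipelines.DuplicatesPipeline': 300,
}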
You do not need to adjust the spider, and you should not use the dupefilter/middleware stuff I posted before.
But I have the feeling that my solution doesn't reduce the communication, since the Item object has to be created before it is evaluated and possibly dropped. But I am okay with that.
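(If that item churn ever matters, one possible alternative, not part of the original solution, is to filter at the link level via the Rule's process_links hook, so duplicate project pages are never requested in the first place. A rough sketch, reusing the same project-id extraction as the dupefilter above:)

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class MySpider(CrawlSpider):
    # name, allowed_domains, start_urls as in the spider above
    seen_project_ids = set()

    rules = (
        Rule(SgmlLinkExtractor(allow=('discover/categories/comics', ))),
        Rule(SgmlLinkExtractor(allow=('projects/', )),
             callback='parse_item', process_links='dedupe_project_links'),
    )

    def dedupe_project_links(self, links):
        unique = []
        for link in links:
            project_id = link.url.split("/")[4]  # same id extraction as the dupefilter
            if project_id not in self.seen_project_ids:
                self.seen_project_ids.add(project_id)
                unique.append(link)
        return unique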
(Solution found by asker, moved into an answer)