Scrapy: How to populate an item with data from two websites

I'd like to collect an item's data from 2 different websites.
It should work as follows:
parse_website_1 fetches a person's name from website_1 and populates the item
parse_website_1 yields a request for parse_website_2
parse_website_2 parses website_2, collects the person's hair color based on the name scraped from website_1, and populates the item
parse_website_2 loads the item
Would this be in the right direction, given that the item is defined in items.py:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem

class MySpider(scrapy.Spider):
    name = "myspider"

    def __init__(self):
        self.item = ItemLoader(item=MyItem(), response=response)

    def start_requests(self):
        scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        self.item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2)

    def parse_website_2(self, response):
        self.item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield self.item.load_item()

The idea is right, but the implementation is not correct in that you are trying to pass data between consecutive requests using an instance attribute (self.item). Scrapy requests are asynchronous, so this would not work as expected.
The correct way to do it is outlined in the Scrapy FAQ: pass the partial item data to the consecutive request using the Request's meta attribute, retrieve it in the next callback from the Response's meta attribute, add the remaining data, and finally yield the item. Here's the adapted code:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem

class MySpider(scrapy.Spider):
    name = "myspider"

    def start_requests(self):
        yield scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        item = ItemLoader(item=MyItem(), response=response)
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2, meta={'item': item})

    def parse_website_2(self, response):
        item = response.meta['item']
        item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield item.load_item()


Scrapy pipeline only save one page of results

I have a spider to crawl CourseTalk which has a pipeline to save two types of items:
moocs.csv, which contains the course data.
moocs_review.csv, which contains the reviews data.
This is the spider code I have:
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from urlparse import urljoin
from moocs.items import MoocsItem, MoocsReviewItem

class MoocsSpiderSpider(scrapy.Spider):
    name = "moocs_spider"
    #allowed_domains = ["https://www.coursetalk.com/subjects/data-science/courses"]
    start_urls = (
        'https://www.coursetalk.com/subjects/data-science/courses',
    )

    def parse(self, response):
        courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
        courses_url = [urljoin(response.url, relative_url) for relative_url in response.xpath(courses_xpath).extract()]
        for course_url in courses_url[0:3]:
            print course_url
            yield Request(url=course_url, callback=self.parse_reviews)
        next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
        yield Request(url=next_page_url, callback=self.parse)

    def parse_reviews(self, response):
        #print response.body
        l = ItemLoader(item=MoocsItem(), response=response)
        l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
        l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
        l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
        l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
        l.add_value('course_link', response.url)
        l.add_value('course_provider', response.url)
        l.add_xpath('course_cost', '//*[@class="course-details-panel__course-cost"]//text()')
        l.add_xpath('university', '//*[@class="course-info__school__name"]//text()[2]')
        #'//*[@class="course-info__school__name"]'
        item = l.load_item()

        for review in response.xpath('//*[@class="review-body"]'):
            r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
            r.add_value('course_title', item['course_title'])
            r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
            r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
            r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
            r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
            r.add_xpath('score', './/*[@class="sr-only"]//text()')
            yield r.load_item()
        yield item
This goes to each course page and saves the details into the corresponding item. I'm getting the pagination here:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
The spider goes to the next pages, but the results are not saved in the output file.
I'm guessing the problem is in the pipeline, where the files are created:
class MultiCSVItemPipeline(object):
    CSVDir = '/moocs/scripts/moocs/moocs/'
    SaveTypes = ['moocs', 'moocsreview']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVDir + name + '.csv', 'w+b')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item
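item_type() is not shown in the question; presumably it is a small helper along these lines (an assumption) that maps each item class to one of the SaveTypes names:

def item_type(item):
    # MoocsItem -> 'moocs', MoocsReviewItem -> 'moocsreview'
    return type(item).__name__.replace('Item', '').lower()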
Are you sure the spider is doing the pagination properly?
When you do this:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
extract() returns a list of results that you are then passing into the url parameter of the Request:
yield Request(url=next_page_url, callback=self.parse)
But url must be a string or unicode value, so doing that will generate the following error:
TypeError: Request url must be str or unicode, got list:
It can be solved by using the extract_first() method, and I would also check that the value is not None:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract_first()
if next_page_url:
    yield Request(url=next_page_url)  # parse is the callback by default
Please try this and tell me if it solves your problem.
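Putting the fix into the full parse() callback, here is a minimal sketch of a drop-in replacement for the method above (it reuses the Request and urljoin imports from the question, drops the [0:3] debugging slice, and the urljoin on the next-page link is an assumption in case the href is relative):

def parse(self, response):
    courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
    for relative_url in response.xpath(courses_xpath).extract():
        yield Request(url=urljoin(response.url, relative_url), callback=self.parse_reviews)

    # extract_first() returns a single string (or None) instead of a list
    next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract_first()
    if next_page_url:
        yield Request(url=urljoin(response.url, next_page_url))  # parse is the callback by default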
If you use -t csv, this will also work instead of a pipeline:
scrapy crawl moocs -t csv -o moocs.csv --loglevel=INFO
This will automatically create a file in the spider folder.

using regex on scrapy item loader

I'm trying to figure out how to use regex with Scrapy item loaders.
I tried to use a lambda function with split() and got the following error:
Split cannot be defined. You can see the function commented out in the item loader class.
What I'm trying to do is remove all the text before the date, including the "/", from the date item. The date item is the URL that I've just parsed:
"https://www.sofascore.com/tennis/2018-02-07"
How do I use regex with Scrapy item loaders?
Can I pass the regex to the item loader, or do I have to process it in the spider?
spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import json
from scrapy.http import Request, FormRequest

class MySpider(scrapy.Spider):
    name = "jsscraper"
    start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 1.5})

    def parse(self, response):
        for row in response.css('.event-team'):
            il = SofascoreItemLoader(selector=row)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            il.add_value('date', response.url)
            yield il.load_item()
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Split
from operator import methodcaller
from scrapy import Spider, Request, Selector

class SofascoreItem(scrapy.Item):
    loser = scrapy.Field()
    winner = scrapy.Field()
    date = scrapy.Field()

class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
    #review_in = MapCompose(lambda x: x.split("/" , [-1]))
You can pass a regular expression to the item loader via the re argument of add_value (or add_css/add_xpath):
il.add_value('date', response.url, re='([^/]+)$')
See https://doc.scrapy.org/en/latest/topics/loaders.html for more details.
Here is what is wrong with the code.
Apparently you do not have to 'feed' the item loader with add_value, but then you won't get the field populated in the end.
class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
    date_in = MapCompose(lambda x: x.split("/")[-1])
You have to split and then select the last item in the list generated by splitting. split(SEPARATOR, [-1]) is not what you want: the second argument (maxsplit) controls how many times the string is split.
Second, you want to add the URL value to the date field, right?
This is not an answer on how to use regex in a Scrapy ItemLoader, but you do not need regex here; you just need to use the split method properly.
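Putting the two answers together, a revised scrapejs/items.py could look like the sketch below (the date_in processor and the re argument are alternatives; only one of them is needed):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller

class SofascoreItem(scrapy.Item):
    loser = scrapy.Field()
    winner = scrapy.Field()
    date = scrapy.Field()

class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
    # input processor for the 'date' field: keep only the text after the last "/"
    date_in = MapCompose(lambda x: x.split("/")[-1])

# in the spider's parse() callback, either of these populates 'date' with "2018-02-07":
# il.add_value('date', response.url)                  # processed by date_in
# il.add_value('date', response.url, re=r'([^/]+)$')  # regex applied by the loader instead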

Scrapy spider not saving to csv

I have a spider which reads a list of URLs from a text file and saves the title and body text from each. The crawl works, but the data does not get saved to CSV. I set up a pipeline to save to CSV because the normal -o option did not work for me, and I changed settings.py for the pipeline. Any help with this would be greatly appreciated.
The code is as follows:
Items.py
from scrapy.item import Item, Field

class PrivacyItem(Item):
    # define the fields for your item here like:
    # name = Field()
    title = Field()
    desc = Field()
PrivacySpider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from privacy.items import PrivacyItem

class PrivacySpider(CrawlSpider):
    name = "privacy"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for url in start_urls:
            item = PrivacyItem()
            item['desc'] = hxs.select('//body//p/text()').extract()
            item['title'] = hxs.select('//title/text()').extract()
            items.append(item)
        return items
Pipelines.py
import csv

class CSVWriterPipeline(object):

    def __init__(self):
        self.csvwriter = csv.writer(open('CONTENT.csv', 'wb'))

    def process_item(self, item, spider):
        self.csvwriter.writerow([item['title'][0], item['desc'][0]])
        return item
You don't have to loop on start_urls; Scrapy does something like this for you:
for url in spider.start_urls:
    # request url and call spider.parse() with its response
so your parse function should look something like:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = PrivacyItem()
    item['desc'] = hxs.select('//body//p/text()').extract()
    item['title'] = hxs.select('//title/text()').extract()
    return item
Also, try to avoid returning lists as item fields; store a single value instead, e.g. hxs.select('..').extract()[0]. For example:
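Here is a minimal sketch of the same callback storing single strings (the empty-string fallback is an assumption about pages where nothing matches):

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = PrivacyItem()
    # take the first match, or an empty string if nothing matched
    desc = hxs.select('//body//p/text()').extract()
    title = hxs.select('//title/text()').extract()
    item['desc'] = desc[0] if desc else ''
    item['title'] = title[0] if title else ''
    return item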

How to add new requests for my Scrapy Spider during crawling

I use the XMLFeedSpider in Scrapy to scrape a real estate website.
Each URL request generated by my spider (via start_urls) returns an XML page with a bunch of ads and a link to the next page (search results are limited to 50 ads).
I was therefore wondering how I could add this additional page as a new request in my spider.
I've been searching through Stack Overflow for a while, but I just can't find a simple answer to my problem!
Below is the code I have in my spider. I have updated it with the parse_nodes() method mentioned by Paul, but the next URL is not picked up for some reason.
Could I yield additional requests in the adapt_response method?
from scrapy.spider import log
from scrapy.selector import XmlXPathSelector
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import RefItem, PicItem
from crawler.seloger_helper import urlbuilder
from scrapy.http import Request

class Seloger_spider_XML(XMLFeedSpider):
    name = 'Seloger_spider_XML'
    allowed_domains = ['seloger.com']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'annonce'

    '''Spider initialized with departement as argument'''
    def __init__(self, departement=None, *args, **kwargs):
        super(Seloger_spider_XML, self).__init__(*args, **kwargs)
        #self.start_urls = urlbuilder(departement)  # helper function which generates start_urls
        self.start_urls = ['http://ws.seloger.com/search.xml?cp=72&idtt=2&tri=d_dt_crea&SEARCHpg=1']

    def parse_node(self, response, node):
        items = []
        item = RefItem()
        item['ref'] = int(''.join(node.select('//annonce/idAnnonce/text()').extract()))
        item['desc'] = ''.join(node.select('//annonce/descriptif/text()').extract()).encode('utf-8')
        item['libelle'] = ''.join(node.select('//annonce/libelle/text()').extract()).encode('utf-8')
        item['titre'] = ''.join(node.select('//annonce/titre/text()').extract()).encode('utf-8')
        item['ville'] = ''.join(node.select('//annonce/ville/text()').extract()).encode('utf-8')
        item['url'] = ''.join(node.select('//annonce/permaLien/text()').extract()).encode('utf-8')
        item['prix'] = ''.join(node.select('//annonce/prix/text()').extract())
        item['prixunite'] = ''.join(node.select('//annonce/prixUnite/text()').extract())
        item['datemaj'] = ''.join(node.select('//annonce/dtFraicheur/text()').extract())[:10]
        item['datecrea'] = ''.join(node.select('//annonce/dtCreation/text()').extract())[:10]
        item['lati'] = ''.join(node.select('//annonce/latitude/text()').extract())
        item['longi'] = ''.join(node.select('//annonce/longitude/text()').extract())
        item['surface'] = ''.join(node.select('//annonce/surface/text()').extract())
        item['surfaceunite'] = ''.join(node.select('//annonce/surfaceUnite/text()').extract())
        item['piece'] = ''.join(node.select('//annonce/nbPiece/text()').extract()).encode('utf-8')
        item['ce'] = ''.join(node.select('//annonce/dbilanEmissionGES/text()').extract()).encode('utf-8')
        items.append(item)

        for photos in node.select('//annonce/photos'):
            for link in photos.select('photo/thbUrl/text()').extract():
                pic = PicItem()
                pic['pic'] = link.encode('utf-8')
                pic['refpic'] = item['ref']
                items.append(pic)
        return items

    def parse_nodes(self, response, nodes):
        for n in super(Seloger_spider_XML, self).parse_nodes(response, nodes):
            yield n

        # once you're done with items/nodes,
        # look for the next page link using XPath.
        # These lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pageSuivante/text()').extract():
            yield Request(link_url)
Thank you
Gilles
You can override the parse_nodes() method to hook in your "next page" URL extraction.
The example below is based on the XMLFeedSpider example from the Scrapy docs:
from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request

class MySpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))
        item = TestItem()
        item['id'] = node.select('@id').extract()
        item['name'] = node.select('name').extract()
        item['description'] = node.select('description').extract()
        return item

    def parse_nodes(self, response, nodes):
        # call the built-in method that itself calls parse_node()
        # and yield whatever it returns
        for n in super(MySpider, self).parse_nodes(response, nodes):
            yield n

        # once you're done with items/nodes,
        # look for the next page link using XPath.
        # These lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pageSuivante/text()').extract():
            print "link_url", link_url
            yield Request(link_url)

Using middleware to prevent scrapy from double-visiting websites

I have a problem like this:
how to filter duplicate requests based on url in scrapy
So, I do not want a website to be crawled more than once. I adapted the middleware and wrote a print statement to test whether it correctly classifies already-seen websites. It does.
Nonetheless, the parsing seems to be executed multiple times, because the JSON file I receive contains duplicate entries.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from crawlspider.items import KickstarterItem
from HTMLParser import HTMLParser

### code for stripping off HTML tags:
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return str(''.join(self.fed))

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
###

items = []

class MySpider(CrawlSpider):
    name = 'kickstarter'
    allowed_domains = ['kickstarter.com']
    start_urls = ['http://www.kickstarter.com']

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(SgmlLinkExtractor(allow=('discover/categories/comics', ))),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(SgmlLinkExtractor(allow=('projects/', )), callback='parse_item'),
    )

    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        item = KickstarterItem()
        item['date'] = hxs.select('//*[@id="about"]/div[2]/ul/li[1]/text()').extract()
        item['projname'] = hxs.select('//*[@id="title"]/a').extract()
        item['projname'] = strip_tags(str(item['projname']))
        item['projauthor'] = hxs.select('//*[@id="name"]')
        item['projauthor'] = item['projauthor'].select('string()').extract()[0]
        item['backers'] = hxs.select('//*[@id="backers_count"]/data').extract()
        item['backers'] = strip_tags(str(item['backers']))
        item['collmoney'] = hxs.select('//*[@id="pledged"]/data').extract()
        item['collmoney'] = strip_tags(str(item['collmoney']))
        item['goalmoney'] = hxs.select('//*[@id="stats"]/h5[2]/text()').extract()
        items.append(item)
        return items
My items.py looks like this:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field

class KickstarterItem(Item):
    # define the fields for your item here like:
    date = Field()
    projname = Field()
    projauthor = Field()
    backers = Field()
    collmoney = Field()
    goalmoney = Field()
    pass
My middleware looks like this:
import os
from scrapy.dupefilter import RFPDupeFilter
from scrapy.utils.request import request_fingerprint

class CustomFilter(RFPDupeFilter):

    def __getid(self, url):
        mm = url.split("/")[4]  # extracts the project id (a number) from the project URL
        print "_____________", mm
        return mm

    def request_seen(self, request):
        fp = self.__getid(request.url)
        self.fingerprints.add(fp)
        if fp in self.fingerprints and fp.isdigit():  # .isdigit() checks whether fp comes from a project id
            print "______fp is a number (therefore a project-id) and has been encountered before______"
            return True
        if self.file:
            self.file.write(fp + os.linesep)
I added this line to settings.py:
DUPEFILTER_CLASS = 'crawlspider.duplicate_filter.CustomFilter'
I call the spider using "scrapy crawl kickstarter -o items.json -t json". Then I see the correct print statements from the middleware code.
Any comments on why the JSON contains multiple entries with the same data?
So now these are the three modifications that removed the duplicates:
I added this to settings.py:
ITEM_PIPELINES = ['crawlspider.pipelines.DuplicatesPipeline',]
to let Scrapy know about the DuplicatesPipeline class I added in pipelines.py:
from scrapy import signals
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['projname'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['projname'])
            return item
You do not need to adjust the spider, and you do not need the dupefilter/middleware code I posted before.
But I have the feeling that my solution doesn't reduce the communication, as the Item object has to be created first before it is evaluated and possibly dropped. But I am okay with that.
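One more side note on the settings above: newer Scrapy versions expect ITEM_PIPELINES to be a dict mapping the pipeline path to an order number rather than a list, for example:

# settings.py on newer Scrapy versions
ITEM_PIPELINES = {
    'crawlspider.pipelines.DuplicatesPipeline': 300,
}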
(Solution found by asker, moved into an answer)
