Using middleware to prevent scrapy from double-visiting websites - python

I have a problem like this:
how to filter duplicate requests based on url in scrapy
So, I do not want a website to be crawled more than once. I adapted the middleware and wrote a print statement to test whether it correctly classifies already seen websites. It does.
Nonetheless the parsing seems to be executed multiple times because the json-File I receive contains double entries.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from crawlspider.items import KickstarterItem
from HTMLParser import HTMLParser
### code for stripping off HTML tags:
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return str(''.join(self.fed))
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
###
items = []
class MySpider(CrawlSpider):
name = 'kickstarter'
allowed_domains = ['kickstarter.com']
start_urls = ['http://www.kickstarter.com']
rules = (
# Extract links matching 'category.php' (but not matching 'subsection.php')
# and follow links from them (since no callback means follow=True by default).
Rule(SgmlLinkExtractor(allow=('discover/categories/comics', ))),
# Extract links matching 'item.php' and parse them with the spider's method parse_item
Rule(SgmlLinkExtractor(allow=('projects/', )), callback='parse_item'),
)
def parse_item(self, response):
self.log('Hi, this is an item page! %s' % response.url)
hxs = HtmlXPathSelector(response)
item = KickstarterItem()
item['date'] = hxs.select('//*[#id="about"]/div[2]/ul/li[1]/text()').extract()
item['projname'] = hxs.select('//*[#id="title"]/a').extract()
item['projname'] = strip_tags(str(item['projname']))
item['projauthor'] = hxs.select('//*[#id="name"]')
item['projauthor'] = item['projauthor'].select('string()').extract()[0]
item['backers'] = hxs.select('//*[#id="backers_count"]/data').extract()
item['backers'] = strip_tags(str(item['backers']))
item['collmoney'] = hxs.select('//*[#id="pledged"]/data').extract()
item['collmoney'] = strip_tags(str(item['collmoney']))
item['goalmoney'] = hxs.select('//*[#id="stats"]/h5[2]/text()').extract()
items.append(item)
return items
My items.py looks like that:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
class KickstarterItem(Item):
# define the fields for your item here like:
date = Field()
projname = Field()
projauthor = Field()
backers = Field()
collmoney = Field()
goalmoney = Field()
pass
My middleware looks like this:
import os
from scrapy.dupefilter import RFPDupeFilter
from scrapy.utils.request import request_fingerprint
class CustomFilter(RFPDupeFilter):
def __getid(self, url):
mm = url.split("/")[4] #extracts project-id (is a number) from project-URL
print "_____________", mm
return mm
def request_seen(self, request):
fp = self.__getid(request.url)
self.fingerprints.add(fp)
if fp in self.fingerprints and fp.isdigit(): # .isdigit() checks wether fp comes from a project ID
print "______fp is a number (therefore a project-id) and has been encountered before______"
return True
if self.file:
self.file.write(fp + os.linesep)
I added this line to settings.py:
DUPEFILTER_CLASS = 'crawlspider.duplicate_filter.CustomFilter'
I call the script using "scrapy crawl kickstarter -o items.json -t json". Then I see the correct print statements from the middleware code.
Any comments on why the json contains multiple entries containing the same data?

So now these are the three modifications that removed the duplicates:
I added this to settings.py:
ITEM_PIPELINES = ['crawlspider.pipelines.DuplicatesPipeline',]
to let scrapy know that I added a function DuplicatesPipeline in pipelines.py:
from scrapy import signals
from scrapy.exceptions import DropItem
class DuplicatesPipeline(object):
def __init__(self):
self.ids_seen = set()
def process_item(self, item, spider):
if item['projname'] in self.ids_seen:
raise DropItem("Duplicate item found: %s" % item)
else:
self.ids_seen.add(item['projname'])
return item
You do not need to adjust the spider and do not use the dupefilter/middleware stuff I posted before.
But I got the feeling that my solution doesn't reduce the communication as the Item-object has to be created first before it is evaluated and possibly dropped. But I am okay with that.
(Solution found by asker, moved into an answer)

Related

Scrapy response uniform blank rows making it impossible to format response output

I want to remove the [ ] brackets scrapy adds to all it's output, to do this you simply add [0] at the end of an xpath statement as follows:
'a[#class="question-hyperlink"]/text()').extract()[0]
this solves the [ ] problem in some cases but in other cases scrapy returns every second row of output as blank and as such the moment it gets to the second row when using [0] i'm given the error:
Index error: list index out of range
How can I prevent scrapy from creating blank rows ? It seems like this is a common problem, but everyone faces this problem when exporting to CSV while for me it's with the scrapy response before exporting as CSV.
Items.py:
import scrapy
from scrapy.item import Item, Field
class QuestionItem(Item):
title = Field()
url = Field()
class PopularityItem(Item):
votes = Field()
answers = Field()
views = Field()
class ModifiedItem(Item):
lastModified = Field()
modName = Field()
The spider that doesn't output every second row as blank and thus works with [0]:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import QuestionItem
class QuestionSpider(Spider):
name = "questions"
allowed_domains = ["stackoverflow.com"]
start_urls = [
"http://stackoverflow.com/questions?pagesize=50&sort=newest",
]
def parse(self, response):
questions = Selector(response).xpath('//div[#class="summary"]/h3')
for question in questions:
item = QuestionItem()
item['title'] = question.xpath(
'a[#class="question-hyperlink"]/text()').extract()[0]
item['url'] = question.xpath(
'a[#class="question-hyperlink"]/#href').extract()[0]
yield item
The spider that gives every second row of output as blank:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import PopularityItem
class PopularitySpider(Spider):
name = "popularity"
allowed_domains = ["stackoverflow.com"]
start_urls = [
"https://stackoverflow.com/",
]
def parse(self, response):
popularity = response.xpath('//div[contains(#class, "question-summary narrow")]/div')
for poppart in popularity:
item = PopularityItem()
item['votes'] = poppart.xpath(
'div[contains(#class, "votes")]//span/text()').extract()#[0]
item['answers'] = poppart.xpath(
'div[contains(#class, "answered")]//span/text()').extract()#[0]
item['views'] = poppart.xpath(
'div[contains(#class, "views")]//span/text()').extract()#[0]
yield item
Pipelines.py
import pymongo
import logging
class StackPipeline(object):
def process_item(self, item, spider):
return item
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
self.db = connection[settings['MONGODB_DB']]
def process_item(self, item, spider):
collection = self.db[type(item).__name__.lower()]
logging.info(collection.insert(dict(item)))
return item
The easiest way to handle an error like this is to catch it and deal with it then (in this case, by just moving on past the blank lines).
class PopularitySpider(Spider):
name = "popularity"
allowed_domains = ["stackoverflow.com"]
start_urls = ["https://stackoverflow.com/"]
def parse(self, response):
popularity = response.xpath('//div[contains(#class, "question-summary narrow")]/div')
for poppart in popularity:
try:
item = PopularityItem()
item['votes'] = poppart.xpath('div[contains(#class, "votes")]//span/text()').extract()[0]
item['answers'] = poppart.xpath('div[contains(#class, "answered")]//span/text()').extract()[0]
item['views'] = poppart.xpath('div[contains(#class, "views")]//span/text()').extract()[0]
except IndexError:
continue
yield item

Scrapy: How to populate an item with data from two websites

I'd like to collect an item's data from 2 different websites.
It should work as follows:
parse_website_1 fetches a persons name from website_1 and populates
the item
parse_website_1 yields a request for parse_website_2
parse_website_2 parses website_2, collects the persons hair-color, based on the person name which was scraped from website_1 and populates the item
parse_website_2 loads the item
Would this be in the right direction, given that the item is defined in items.py:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem
class MySpider(scrapy.Spider):
name = "myspider"
def __init__(self):
self.item = ItemLoader(item=MyItem(), response=response)
def start_requests(self):
scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)
def parse_website_1(self, response):
name = response.xpath('//div[#class="name"]/text()').extract_first()
self.item.add_value("name", name)
website_2_path = "http://website_2.com/" + name
yield scrapy.Request(url=website_2_path, callback=self.parse_website_2)
def parse_website_2(self, response):
self.item.add_xpath("hair_color", '//div[#class="hair_color"]')
yield self.item.load_item()
The idea is right, but the implementation is not correct in that you are trying to pass data between consecutive requests using an instance attribute (self.item). Scrapy requests are asynchronous so it would not work as expected.
The correct way how to do it is outlined in Scrapy FAQ. Pass the partial item data to consecutive request using Request's meta attribute where you obtain it using Response's meta attribute, add some more data and finally yield the item. Here's the adapted code:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem
class MySpider(scrapy.Spider):
name = "myspider"
def start_requests(self):
scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)
def parse_website_1(self, response):
item = ItemLoader(item=MyItem(), response=response)
name = response.xpath('//div[#class="name"]/text()').extract_first()
item.add_value("name", name)
website_2_path = "http://website_1.com/" + name
yield scrapy.Request(url=website_2_path, callback=self.parse_website_2, meta={'item': item})
def parse_website_2(self, response):
item = response.meta['item']
item.add_xpath("hair_color", '//div[#class="hair_color"]')
yield item.load_item()

How to add new requests for my Scrapy Spider during crawling

I use the XMLFeedSpider in Scrapy to scrap a real estate website.
Each url request generated by my spider (via start_urls) return a page in XML with a bunch of ads and a link to the next page (search results is limited to 50 ads).
I was therefore wondering how i could add this additional page as new request in my spider ?
I've been searching through stackoverflow for a while but i just can't find a simple answer to my problem !
Below is the code i have in my spider. I have updated it with the parse_nodes() method mentioned by Paul but the next url is not picked up for some reasons.
Could i yield additional requests in the adapt_response method ?
from scrapy.spider import log
from scrapy.selector import XmlXPathSelector
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import RefItem, PicItem
from crawler.seloger_helper import urlbuilder
from scrapy.http import Request
class Seloger_spider_XML(XMLFeedSpider):
name = 'Seloger_spider_XML'
allowed_domains = ['seloger.com']
iterator = 'iternodes' # This is actually unnecessary, since it's the default value
itertag = 'annonce'
'''Spider Initialized with department as argument'''
def __init__(self, departement=None, *args, **kwargs):
super(Seloger_spider_XML, self).__init__(*args, **kwargs)
#self.start_urls = urlbuilder(departement) #helper function which generate start_urls
self.start_urls = ['http://ws.seloger.com/search.xml?cp=72&idtt=2&tri=d_dt_crea&SEARCHpg=1']
def parse_node(self, response, node):
items = []
item = RefItem()
item['ref'] = int(''.join(node.select('//annonce/idAnnonce/text()').extract()))
item['desc'] = ''.join(node.select('//annonce/descriptif/text()').extract()).encode('utf-8')
item['libelle'] = ''.join(node.select('//annonce/libelle/text()').extract()).encode('utf-8')
item['titre'] = ''.join(node.select('//annonce/titre/text()').extract()).encode('utf-8')
item['ville'] = ''.join(node.select('//annonce/ville/text()').extract()).encode('utf-8')
item['url'] =''.join(node.select('//annonce/permaLien/text()').extract()).encode('utf-8')
item['prix'] = ''.join(node.select('//annonce/prix/text()').extract())
item['prixunite'] = ''.join(node.select('//annonce/prixUnite/text()').extract())
item['datemaj'] = ''.join(node.select('//annonce/dtFraicheur/text()').extract())[:10]
item['datecrea'] = ''.join(node.select('//annonce/dtCreation/text()').extract())[:10]
item['lati'] = (''.join(node.select('//annonce/latitude/text()').extract()))
item['longi'] = (''.join(node.select('//annonce/longitude/text()').extract()))
item['surface'] = (''.join(node.select('//annonce/surface/text()').extract()))
item['surfaceunite'] = (''.join(node.select('//annonce/surfaceUnite/text()').extract()))
item['piece'] = (''.join(node.select('//annonce/nbPiece/text()').extract())).encode('utf-8')
item['ce'] = (''.join(node.select('//annonce/dbilanEmissionGES/text()').extract())).encode('utf-8')
items.append(item)
for photos in node.select('//annonce/photos'):
for link in photos.select('photo/thbUrl/text()').extract():
pic = PicItem()
pic['pic'] = link.encode('utf-8')
pic['refpic'] = item['ref']
items.append(pic)
return items
def parse_nodes(self, response, nodes):
for n in super(Seloger_spider_XML, self).parse_nodes(response, nodes):
yield n
# once you're done with item/nodes
# look for the next page link using XPath
# these lines are borrowed form
# https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
selector = XmlXPathSelector(response)
self._register_namespaces(selector)
for link_url in selector.select('//pageSuivante/text()').extract():
yield Request(link_url)
Thank you
Gilles
You can override the parse_nodes() method to hook in your "next page" URL extraction.
The example below is based on Scrapy docs XMLFeedExample:
from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
class MySpider(XMLFeedSpider):
name = 'example.com'
allowed_domains = ['example.com']
start_urls = ['http://www.example.com/feed.xml']
iterator = 'iternodes' # This is actually unnecessary, since it's the default value
itertag = 'item'
def parse_node(self, response, node):
log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))
item = Item()
item['id'] = node.select('#id').extract()
item['name'] = node.select('name').extract()
item['description'] = node.select('description').extract()
return item
def parse_nodes(self, response, nodes):
# call built-in method that itself calls parse_node()
# and yield whatever it returns
for n in super(MySpider, self).parse_nodes(response, nodes):
yield n
# once you're done with item/nodes
# look for the next page link using XPath
# these lines are borrowed form
# https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
selector = XmlXPathSelector(response)
self._register_namespaces(selector)
for link_url in selector.select('//pageSuivante/text()').extract():
print "link_url", link_url
yield Request(link_url)

Scrapy TypeError: Request url must be str or unicode [duplicate]

I am new to python programming and using scrapy. I have setup my crawler and so far it was working until I got to the point where I wanted to figure out how to download images. The error I am getting is cannot import name NsiscrapePipeline. I dont know what I am doing wrong and I dont understand some of the documentation as I am new. Please help
Items File
from scrapy.item import Item, Field
class NsiscrapeItem(Item):
# define the fields for your item here like:
# name = Field()
location = Field()
stock_number = Field()
year = Field()
manufacturer = Field()
model = Field()
length = Field()
price = Field()
status = Field()
url = Field()
pass
Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image
class NsiscrapeSpider(BaseSpider):
name = "Nsiscrape"
allowed_domain = ["yachtauctions.com"]
start_urls = [
"http://www.yachtauctions.com/inventory/"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//tr')
items = []
for site in sites:
item = NsiscrapeItem()
item['location'] = site.select('td[2]/text()').extract()
item['stock_number'] = site.select('td[3]/a/text()').extract()
item['year'] = site.select('td[4]/text()').extract()
item['manufacturer'] = site.select('td[5]/text()').extract()
item['model'] = site.select('td[6]/text()').extract()
item['length'] = site.select('td[7]/text()').extract()
item['price'] = site.select('td[8]/text()').extract()
item['status'] = site.select('td[10]/img/#src').extract()
item['url'] = site.select('td[1]/a/#href').extract()
item['image_urls'] = site.select('td/a[3]/img/#data-original').extract()
item['images'] = item['image_urls']
yield Request(item['url'][0], meta={'item':item}, callback=self.product_detail_page)
def product_detail_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.request.meta['item']
#add all images url in the item['image_urls']
yield item
settings
ITEM_PIPELINES = ['scrapy.contrib.pipeline.image.NsiscrapePipeline']
IMAGES_STORE = 'c:\Python27\NSIscrape\IMG'
IMAGES_EXPIRES = 90
Pipelines This is where I am unsure if I am missing something
from scrapy.item import Item
class NsiscrapePipeline(Item):
image_urls = Field()
images = Field()
def process_item(self, item, spider):
return item
error
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
You tried to pass list, but this function accepts only string. Pass only one element from list (for example list[0]).
Heres my final code thats working. There was two issues
1: I was missing the second backslash that needede to be in the request --> //td[1]/a[3]/img/#data-original
2: I had to check the full URL in which the image would be displayed and join them together which was the main URL or the allowed URL and the image URL.
def parse(self, response):
hxs = HtmlXPathSelector(response)
images = hxs.select('//tr')
url = []
for image in images:
urls = NsiscrapeItem()
urls['image_urls'] = ["http://www.yachtauctions.com" + x for x in image.select('//td[1]/a[3]/img/#data-original').extract()]
url.append(urls)
return url
That isn't part of the library :) - at least by looking at their current master branch
I think you're looking for ImagesPipeline
Their example may help! example
p.s. I don't think you custom name the class - at least not by how scapy is designed; i'm reasonably sure you use their class ;)

scrapy: newbie attempting to debug code

Total newbie, trying to get scrapy to read a list of urls from csv and return the items in a csv.
Need some help to figure out where I'm going wrong here:
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import random
class incyspider(BaseSpider):
name = "incyspider"
def __init__(self):
super(incyspider, self).__init__()
domain_name = "incyspider.co.uk"
f = open("urls.csv")
start_urls = [url.strip() for url in f.readlines()]
f.close
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="Product"]')
items = []
for site in sites:
item['title'] = hxs.select('//div[#class="Name"]/node()').extract()
item['hlink'] = hxs.select('//div[#class="Price"]/node()').extract()
item['price'] = hxs.select('//div[#class="Codes"]/node()').extract()
items.append(item)
return items
SPIDER = incyspider()
Here's the items.py code:
from scrapy.item import Item, Field
class incyspider(Item):
# define the fields for your item here like:
# name = Field()
title = Field()
hlink = Field()
price = Field()
pass
To run, I'm using
scrapy crawl incyspider -o items.csv -t csv
I would seriously appreciate any pointers.
I'm not exactly sure but after a quick look at your code I would say that at least you need to replace this line
sites = hxs.select('//div[#class="Product"]')
by this line
sites = hxs.select('//div[#class="Product"]').extract()
As a first punt at answering this, your spider code is missing an import for your incyspider item class. Also you're not creating an instance of any kind of item to store the title/hlink/price info, so the items.append(item) line might complain.
Since your spider is also called incyspider, you should rename the item to be something like incyspiderItem and then add the following line to your spider code
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import random
from incyspider.items import incyspiderItem
class incyspider(BaseSpider):
name = "incyspider"
def __init__(self):
super(incyspider, self).__init__()
domain_name = "incyspider.co.uk"
f = open("urls.csv")
start_urls = [url.strip() for url in f.readlines()]
f.close
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="Product"]')
items = []
for site in sites:
item = incyspiderItem()
item['title'] = hxs.select('//div[#class="Name"]/node()').extract()
item['hlink'] = hxs.select('//div[#class="Price"]/node()').extract()
item['price'] = hxs.select('//div[#class="Codes"]/node()').extract()
items.append(item)
return items
If I'm wrong, then please edit the question to explain how you know there is a problem with the code eg: is the expected output different to the actual output? If so, how?

Categories

Resources