Iterate over all links/sub-links with Scrapy run from script

Iterate over all links/sub-links with Scrapy run from script - python

I want to run Scrapy Spider from my script, but it works only for 1 request. I cannot execute the procedure self.parse_product from scrapy.http.Request(product_url, callback=self.parse_product).
I guess it's being due the command crawler.signals.connect(callback, signal=signals.spider_closed). Please advise how correctly go over all links and sub-links.
Whole script is shown below.
import json
import scrapy
from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor
# https://gist.github.com/alecxe/fc1527d6d9492b59c610
# define an item class
class WebStoreItem(Item):
name = Field()
price = Field()
developer = Field()
date_added = Field()
date_modified = Field()
votes = Field()
views = Field()
sales = Field()
avg_rating = Field()
comments = Field()
# define an item loader with input and output processors
class WebStoreItemLoader(ItemLoader):
default_input_processor = MapCompose(unicode.strip)
default_output_processor = TakeFirst()
desc_out = Join()
# define a pipeline
class JsonWriterPipeline(object):
def __init__(self):
self.file = open('items.json', 'wb')
def __del__(self):
self.file.close()
def process_item(self, item, spider):
line = json.dumps(dict(item)) + "\n"
self.file.write(line)
return item
# define a spider
class WebStoreSpider(Spider):
name = "WebStore"
allowed_domains = ["http://www.WebStore.com"]
start_urls = [
"http://www.WebStore.com/index.php"
]
def parse(self, response):
for meta in response.xpath('//div[#class="extension-grid"]'):
for product_block in meta.xpath('//div[#class="image-holder image"]'):
item = WebStoreItem()
avg_rating = meta.xpath('//div[#class="rating"]/text()').extract()[0]
item['avg_rating'] = avg_rating[avg_rating.find(': ') + 1:].strip()
comment = meta.xpath('//div[#class="comment"]/text()').extract()[0]
item['comments'] = comment[comment.find(': ') + 1:].strip()
print 'product_block: ', product_block
product_url = product_block.xpath('a[1]/#href').extract()[0]
print 'product_url: ', product_url
request = scrapy.http.Request(product_url, callback=self.parse_product)
request.meta['item'] = item
yield request
def parse_product(self, response):
item = response.meta['item']
product_meta_block = response.xpath('//div[#class="name"]')
print 'product_meta_block: ', product_meta_block
product_rows = product_meta_block.xpath('//tr)')
print 'product_rows: ', product_rows
i = 0
for row in product_rows:
if i == 1:
item['name'] = row.select('td/text()').extract()
elif i == 3:
item['votes'] = row.select('td/text()').extract()
i += 1
return item
# callback fired when the spider is closed
def callback(spider, reason):
stats = spider.crawler.stats.get_stats() # collect/log stats?
# stop the reactor
reactor.stop()
def stop_reactor():
reactor.stop()
if __name__ == '__main__':
# instantiate settings and provide a custom configuration
settings = Settings()
settings.set('ITEM_PIPELINES', {
'__main__.JsonWriterPipeline': 100
})
# instantiate a crawler passing in settings
crawler = Crawler(settings)
# instantiate a spider
spider = WebStoreSpider()
# configure signals
crawler.signals.connect(callback, signal=signals.spider_closed)
# configure and start the crawler
crawler.configure()
crawler.crawl(spider)
crawler.start()
# start logging
log.start()
# start the reactor (blocks execution)
reactor.run()

Your spider is being blocked from visiting pages after the start page by your allowed_domains specification. The value should include just the domain, not the protocol. Try
allowed_domains = ["www.WebStore.com"]
Also the line desc_out = Join() in your WebStoreItemLoader definition may give an error as you have no desc field.

Related

Scrapy Pipeline with MongoDB is not working

I'm trying to put all the data that I'm scraping in MongoDB to monitor the properties prices.
I've already done a lot of tests, but it's not working.
I will put the code here, if anyone could help me. Please.
init.py
import scrapy
from realstatedata.items import RealstatedataItem
class RsdataSpider(scrapy.Spider):
name = 'realstatedata'
allowed_domains = ['vivareal.com.br']
start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']
def parse(self, response):
nextpageurl = response.xpath('//a[#title="Próxima página"]/#href')
yield from self.scrape(response)
if nextpageurl:
path = nextpageurl.extract_first()
#print(path)
# Got #pagina=2 => Replace with ?pagina=2
path = '?' + path[1:]
#print(path)
nextpage = response.urljoin(path)
print("Found url: {}".format(nextpage))
yield scrapy.Request(nextpage)
def scrape(self, response):
for resource in response.xpath('//article[#class="property-card__container js-property-card"]/..'):
item = RealstatedataItem()
item['description'] = resource.xpath('.//h2/span[#class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
item['address'] = resource.xpath('.//span[#class="property-card__address"]/text()').extract_first()
item['prop_area'] = resource.xpath('.//span[#class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
item['prop_rooms'] = resource.xpath('.//span[#class="property-card__detail-value js-property-card-value"]/text()').extract_first()
item['prop_bath'] = resource.xpath('.//span[#class="property-card__detail-value js-property-card-value"]/text()').extract_first()
item['prop_parking'] = resource.xpath('.//ul/li[4]/span[#class="property-card__detail-value js-property-card-value"]/text()').extract_first()
item['price_rent'] = resource.xpath('.//p[#style="display: block;"]/text()').extract_first()
item['price_cond'] = resource.xpath('.//strong[#class="js-condo-price"]/text()').extract_first()
item['realstate_name'] = resource.xpath('.//picture/img/#alt').extract_first()
yield item
settings.py
BOT_NAME = 'realstatedata'
SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'
ITEM_PIPELINES = ['realstatedata.pipelines.MongoPipeline', ]
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"
items.py
import scrapy
class RealstatedataItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
description = scrapy.Field()
address = scrapy.Field()
prop_area = scrapy.Field()
prop_rooms = scrapy.Field()
prop_bath = scrapy.Field()
prop_parking = scrapy.Field()
price_rent = scrapy.Field()
price_cond = scrapy.Field()
realstate_name = scrapy.Field()
pass
pipeline.py
in this part of the code, i've tried two differents forms to do, but none works.
import pymongo
import logging
class MongoPipeline(object):
collection_name = 'rent_properties'
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
#classmethod
def from_crawler(cls, crawler):
## pull in information from settings.py
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE')
)
def open_spider(self, spider):
## initializing spider
## opening db connection
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
## clean up when spider is closed
self.client.close()
def process_item(self, item, spider):
## how to handle each post
self.db[self.collection_name].insert(dict(item))
logging.debug("Properties added to MongoDB")
return item

Obviously your settings for enabling pipeline is wrong. ITEM_PIPELINES should be defined as a dict but not a list. In your code, the pipeline is not loaded at all.
ITEM_PIPELINES = {
"realstatedata.pipelines.MongoPipeline": 100,
}
The value in the dict represents priority when more than 1 pipeline are enabled.

Scrapy response uniform blank rows making it impossible to format response output

I want to remove the [ ] brackets scrapy adds to all it's output, to do this you simply add [0] at the end of an xpath statement as follows:
'a[#class="question-hyperlink"]/text()').extract()[0]
this solves the [ ] problem in some cases but in other cases scrapy returns every second row of output as blank and as such the moment it gets to the second row when using [0] i'm given the error:
Index error: list index out of range
How can I prevent scrapy from creating blank rows ? It seems like this is a common problem, but everyone faces this problem when exporting to CSV while for me it's with the scrapy response before exporting as CSV.
Items.py:
import scrapy
from scrapy.item import Item, Field
class QuestionItem(Item):
title = Field()
url = Field()
class PopularityItem(Item):
votes = Field()
answers = Field()
views = Field()
class ModifiedItem(Item):
lastModified = Field()
modName = Field()
The spider that doesn't output every second row as blank and thus works with [0]:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import QuestionItem
class QuestionSpider(Spider):
name = "questions"
allowed_domains = ["stackoverflow.com"]
start_urls = [
"http://stackoverflow.com/questions?pagesize=50&sort=newest",
]
def parse(self, response):
questions = Selector(response).xpath('//div[#class="summary"]/h3')
for question in questions:
item = QuestionItem()
item['title'] = question.xpath(
'a[#class="question-hyperlink"]/text()').extract()[0]
item['url'] = question.xpath(
'a[#class="question-hyperlink"]/#href').extract()[0]
yield item
The spider that gives every second row of output as blank:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import PopularityItem
class PopularitySpider(Spider):
name = "popularity"
allowed_domains = ["stackoverflow.com"]
start_urls = [
"https://stackoverflow.com/",
]
def parse(self, response):
popularity = response.xpath('//div[contains(#class, "question-summary narrow")]/div')
for poppart in popularity:
item = PopularityItem()
item['votes'] = poppart.xpath(
'div[contains(#class, "votes")]//span/text()').extract()#[0]
item['answers'] = poppart.xpath(
'div[contains(#class, "answered")]//span/text()').extract()#[0]
item['views'] = poppart.xpath(
'div[contains(#class, "views")]//span/text()').extract()#[0]
yield item
Pipelines.py
import pymongo
import logging
class StackPipeline(object):
def process_item(self, item, spider):
return item
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
self.db = connection[settings['MONGODB_DB']]
def process_item(self, item, spider):
collection = self.db[type(item).__name__.lower()]
logging.info(collection.insert(dict(item)))
return item

The easiest way to handle an error like this is to catch it and deal with it then (in this case, by just moving on past the blank lines).
class PopularitySpider(Spider):
name = "popularity"
allowed_domains = ["stackoverflow.com"]
start_urls = ["https://stackoverflow.com/"]
def parse(self, response):
popularity = response.xpath('//div[contains(#class, "question-summary narrow")]/div')
for poppart in popularity:
try:
item = PopularityItem()
item['votes'] = poppart.xpath('div[contains(#class, "votes")]//span/text()').extract()[0]
item['answers'] = poppart.xpath('div[contains(#class, "answered")]//span/text()').extract()[0]
item['views'] = poppart.xpath('div[contains(#class, "views")]//span/text()').extract()[0]
except IndexError:
continue
yield item

How to stop scrapy spider after certain number of requests?

I am developing an simple scraper to get 9 gag posts and its images but due to some technical difficulties iam unable to stop the scraper and it keeps on scraping which i dont want.I want to increase the counter value and stop after 100 posts.
But the 9gag page was designed in a fashion in each response it gives only 10 posts and after each iteration my counter value resets to 10 in this case my loop runs infintely long and never stops.
# -*- coding: utf-8 -*-
import scrapy
from _9gag.items import GagItem
class FirstSpider(scrapy.Spider):
name = "first"
allowed_domains = ["9gag.com"]
start_urls = (
'http://www.9gag.com/',
)
last_gag_id = None
def parse(self, response):
count = 0
for article in response.xpath('//article'):
gag_id = article.xpath('#data-entry-id').extract()
count +=1
if gag_id:
if (count != 100):
last_gag_id = gag_id[0]
ninegag_item = GagItem()
ninegag_item['entry_id'] = gag_id[0]
ninegag_item['url'] = article.xpath('#data-entry-url').extract()[0]
ninegag_item['votes'] = article.xpath('#data-entry-votes').extract()[0]
ninegag_item['comments'] = article.xpath('#data-entry-comments').extract()[0]
ninegag_item['title'] = article.xpath('.//h2/a/text()').extract()[0].strip()
ninegag_item['img_url'] = article.xpath('.//div[1]/a/img/#src').extract()
yield ninegag_item
else:
break
next_url = 'http://9gag.com/?id=%s&c=200' % last_gag_id
yield scrapy.Request(url=next_url, callback=self.parse)
print count
Code for items.py is here
from scrapy.item import Item, Field
class GagItem(Item):
entry_id = Field()
url = Field()
votes = Field()
comments = Field()
title = Field()
img_url = Field()
So i want to increase a global count value and tried this by passing 3 arguments to parse function it gives error
TypeError: parse() takes exactly 3 arguments (2 given)
So is there a way to pass a global count value and return it after each iteration and stop after 100 posts(suppose).
Entire project is available here Github
Even if i set POST_LIMIT =100 the infinite loop happens,see here command i executed
scrapy crawl first -s POST_LIMIT=10 --output=output.json

There's a built-in setting CLOSESPIDER_PAGECOUNT that can be passed via command-line -s argument or changed in settings: scrapy crawl <spider> -s CLOSESPIDER_PAGECOUNT=100
One small caveat is that if you've enabled caching, it will count cache hits as page counts as well.

First: Use self.count and initialize outside of parse. Then don't prevent the parsing of the items, but generating new requests. See the following code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Item, Field
class GagItem(Item):
entry_id = Field()
url = Field()
votes = Field()
comments = Field()
title = Field()
img_url = Field()
class FirstSpider(scrapy.Spider):
name = "first"
allowed_domains = ["9gag.com"]
start_urls = ('http://www.9gag.com/', )
last_gag_id = None
COUNT_MAX = 30
count = 0
def parse(self, response):
for article in response.xpath('//article'):
gag_id = article.xpath('#data-entry-id').extract()
ninegag_item = GagItem()
ninegag_item['entry_id'] = gag_id[0]
ninegag_item['url'] = article.xpath('#data-entry-url').extract()[0]
ninegag_item['votes'] = article.xpath('#data-entry-votes').extract()[0]
ninegag_item['comments'] = article.xpath('#data-entry-comments').extract()[0]
ninegag_item['title'] = article.xpath('.//h2/a/text()').extract()[0].strip()
ninegag_item['img_url'] = article.xpath('.//div[1]/a/img/#src').extract()
self.last_gag_id = gag_id[0]
self.count = self.count + 1
yield ninegag_item
if (self.count < self.COUNT_MAX):
next_url = 'http://9gag.com/?id=%s&c=10' % self.last_gag_id
yield scrapy.Request(url=next_url, callback=self.parse)

One can use custom_settings
with CLOSESPIDER_PAGECOUNT as shown below.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Item, Field
class GagItem(Item):
entry_id = Field()
url = Field()
votes = Field()
comments = Field()
title = Field()
img_url = Field()
class FirstSpider(scrapy.Spider):
name = "first"
allowed_domains = ["9gag.com"]
start_urls = ('http://www.9gag.com/', )
last_gag_id = None
COUNT_MAX = 30
custom_settings = {
'CLOSESPIDER_PAGECOUNT': COUNT_MAX
}
def parse(self, response):
for article in response.xpath('//article'):
gag_id = article.xpath('#data-entry-id').extract()
ninegag_item = GagItem()
ninegag_item['entry_id'] = gag_id[0]
ninegag_item['url'] = article.xpath('#data-entry-url').extract()[0]
ninegag_item['votes'] = article.xpath('#data-entry-votes').extract()[0]
ninegag_item['img_url'] = article.xpath('.//div[1]/a/img/#src').extract()
self.last_gag_id = gag_id[0]
yield ninegag_item
next_url = 'http://9gag.com/?id=%s&c=10' % self.last_gag_id
yield scrapy.Request(url=next_url, callback=self.parse)

count is local to the parse() method so it's not preserved between pages. Change all occurences of count to self.count to make it an instance variable of the class and it will persist betwen pages.

Spider arguments are passed through the crawl command using the -a option.check link

Why scrapy not storing data into mongodb?

My main File:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
class Product(scrapy.Item):
brand = scrapy.Field()
title = scrapy.Field()
link = scrapy.Field()
name = scrapy.Field()
title = scrapy.Field()
date = scrapy.Field()
heading = scrapy.Field()
data = scrapy.Field()
Model_name = scrapy.Field()
class aqaqspider(CrawlSpider):
name = "mouth_shut_new"
allowed_domains = ["mouthshut.com"]
start_urls = ["http://www.mouthshut.com/mobile-phones/Yu-Yureka-reviews-925723476"
]
rules = (
Rule(
SgmlLinkExtractor(allow=('.*\-page-.*',)),
callback="parse_start_url",
follow=True),
)
def parse_start_url(self, response):
products = response.xpath('//div[#id="allreviews"]/ul/li')
items = []
if not products:
raise CloseSpider("No more products!")
for product in products:
item = Product()
#item['Model_name'] = product.xpath('/html/body/form/div[12]/div/div[5]/div/div[1]/div[3]/ul/li[1]/h1/a/span/text()').extract()
item['name'] = product.xpath('.//li[#class="profile"]/div/a/span/text()').extract()[0]
item['title'] = product.xpath('.//div[#class="reviewtitle fl"]/strong/a/text()').extract()[0]
item['date'] = product.xpath('.//div[#class="reviewrate"]//span[#class="datetime"]/span/span/span/text()').extract()[0]
item['link'] = product.xpath('.//div[#class="reviewtitle fl"]/strong/a/#href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
old_item['data'] = response.xpath('.//div[#itemprop="description"]/p/text()').extract()
yield old_item
# yield Request(url="http://www.mouthshut.com/Product/mobileListing.aspx?cid=925602729&f1=1&view=list&nsort1=0&nsort2=2015-06-01%2016:12:23.000&ntype=3&mpad=1&ran=0.3691624044781373&dcal=Intex%20Aqua%20Xtreme" ,
# headers={"Referer": "http://www.mouthshut.com/mobile-phones.php", "X-Requested-With": "XMLHttpRequest"},
# callback=self.parse,
# dont_filter=True)
My settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for mouth project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'mouth'
SPIDER_MODULES = ['mouth.spiders']
NEWSPIDER_MODULE = 'mouth.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mouth (+http://www.yourdomain.com)'
ITEM_PIPELINES = {'mouth.pipelines.MongoDBPipeline':300}
MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "mobiles_complaints" # Change in prod
MONGODB_COLLECTION = "Yu_Yureka"
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'consumer (+http://www.yourdomain.com)'
My pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings
from scrapy import log
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DATABASE']]
self.collection = db[settings['MONGODB_COLLECTION']]
def process_item(self, item, spider):
self.collection.insert(dict(item))
log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
settings['MONGODB_DATABASE'],
settings['MONGODB_COLLECTION'],
settings['MONGODB_HOST'],
settings['MONGODB_PORT']))
return item
I ran scrapy scrapy crawl mouth_shut_new. But my data didn't store in the database. In the output it should show that the data is stored in mongo and the collection name. What I am missing?

process_item() method is not indented properly, should be:
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DATABASE']]
self.collection = db[settings['MONGODB_COLLECTION']]
def process_item(self, item, spider):
self.collection.insert(dict(item))
log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
settings['MONGODB_DATABASE'],
settings['MONGODB_COLLECTION'],
settings['MONGODB_HOST'],
settings['MONGODB_PORT']))
return item

You didn't yield the item in callback function: callback="parse_start_url", You should do it like this:
def parse_start_ul(self, response):
...
for product in products:
item = Product()
....
yield item

DropItem if parsed url contains key words (pipeline)

I am trying to build a spider for a school project where I am scraping recipes from allrecipes.com. Everything is working really well, however I seem to be unable to remove duplicate recipes where one url contains the actual recipe, and the other contains the same url with "video=true" appended.
Here is my attempt to dealing with this in pipelines.py:
from scrapy.exceptions import DropItem
from scrapy import log
class DuplicatesPipeline(object):
# minCal = 50
def __init__(self):
self.urls_seen = set()
def process_vids(self, item, spider):
video = "video=true"
url = str(item.get('url'))
if video in url:
raise DropItem("Contains video")
else:
return item
def process_item(self, item, spider):
unique_id = item.get('url')
if unique_id in self.urls_seen:
raise DropItem("Duplicate Item found (%s)" % unique_id)
else:
self.urls_seen.add('url')
return item
settings.py:
# Scrapy settings for dirbot project
BOT_NAME = 'dirbot'
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
ITEM_PIPELINES = {'dirbot.pipelines.DuplicatesPipeline': 300,}
items.py:
from scrapy.item import Item, Field
class Website(Item):
name = Field()
url = Field()
description = Field()
kcal = Field()
carbs = Field()
fat = Field()
protein = Field()
main = Field()
sugar = Field()
fibre = Field()
author = Field()
rating = Field()
img = Field()
dnot.py:
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider,Rule
import urlparse
import scrapy
page = "http://allrecipes.com/recipes/main.aspx?Page=%d#recipes"
class DmozSpider(Spider):
name = "dnot"
allowed_domains = ["allrecipes.com"]
start_urls = [page % 1]
rules = [Rule(SgmlLinkExtractor(allow=('allrecipes.com'), restrict_xpaths = '//a[contains(.,"NEXT")]'),
callback="parse", follow= True),
]
def __init__(self):
self.page_number = 1
def parse(self, response):
print "-------------------------------------------------"
print self.page_number
print "-------------------------------------------------"
sel = Selector(response)
sites = response.xpath('//div[#id="divGridItemWrapper"]')
items = []
for site in sites:
item = Website()
recipe = response.xpath('//a[contains(#href, "/Recipe/")]/#href').extract()
url = "http://www.allrecipes.com"
for nth in recipe:
go = urlparse.urljoin(url, str(nth))
items.append(item)
for link in go:
yield Request(go, self.recipes)
if self.page_number <= 3:
self.page_number += 1
yield Request(page % self.page_number)
else:
pass
def recipes(self,response):
item = Website()
sel = Selector(response)
recipe = response.xpath('//div[#id="content-wrapper"]')
items = []
print "second page - %s" % response.url
for i in recipe:
item['url'] = response.url
item['description'] = i.xpath('//span[#itemprop="description"]/text()').extract()
item['name'] = i.xpath('//h1[#itemprop="name"]/text()').extract()
item['kcal'] = i.xpath('//ul/li[contains(.,"kcal")]/span/text()').extract()
item['carbs'] = i.xpath('//ul/li[contains(.,"Carbohydrates")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['fat'] = i.xpath('//ul/li[contains(.,"Fat")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['protein'] = i.xpath('//ul/li[contains(.,"Protein")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['main'] = "allrecipes.com"
item['sugar'] = i.xpath('//li/span[#itemprop="sugarContent"]/text()').extract()
item['fibre'] = i.xpath('//li/span[#itemprop="proteinContent"]/text()').extract()
item['author'] = i.xpath('//span[#id="lblUser0"]/text()').extract()
item['rating'] = i.xpath('//div[#class="rating-stars-img"][1]/meta[1][#itemprop="ratingValue"]/#content').extract()
item['img'] = i.xpath('//img[#id="imgPhoto"]/#src').extract()
items.append(item)
yield item
I am a little new with Python, and I'm not sure if I need to convert the item['url'] into a string or not; however I have tried with the "str" and without. I have also tried a few other methods that others have used for doing something similar, but nothing has worked for me so far.
Hoping someone can point me in the right direction. Thanks in advance!
Example:
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true

You need to create a class that implements the process_item method on the pipelines.py file, something like:
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs
class DuplicatesPipeline(object):
def __init__(self):
self.ids_seen = set()
def process_item(self, item, spider):
url = item['url']
u = urlparse(url)
query = parse_qs(u.query)
query.pop('video', None)
u = u._replace(query=urlencode(query, True))
unique_id = urlunparse(u)
if unique_id and unique_id in self.ids_seen:
raise DropItem("Duplicate Item found (%s)" % unique_id)
else:
self.ids_seen.add(unique_id)
return item
Then you need to add that class, to settings.py
ITEM_PIPELINES = {
'yourproject.pipelines.DuplicatesPipeline': 300,
}
Also, your process_vids method isn't being used.
let me know if it helps you.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Iterate over all links/sub-links with Scrapy run from script - python

Related

Scrapy Pipeline with MongoDB is not working

Scrapy response uniform blank rows making it impossible to format response output

How to stop scrapy spider after certain number of requests?

Why scrapy not storing data into mongodb?

DropItem if parsed url contains key words (pipeline)

Categories

Resources