Scrapy - What to do when no downloadable file is found? - python

I am currently working on a Scrapy program that can download files from the pages it scrapes. The issue I'm running into is that some pages have a datasheet, like this one - https://www.tyconsystems.com/rpms24-720-720 - while others, like this one, do not - https://www.tyconsystems.com/tpdin-cable-232.
What is the proper way of passing data when there is no file found on the page?
Additional question: is there any way to fix the issue of the CSV file having multiple lines per item when the item data is too long? Example item: rpms24-720-720.
Below is the code that I am using.
productInfo.py
from copyreg import clear_extension_cache
import scrapy
from ..items import tyconItem


class ProductInfoSpider(scrapy.Spider):
    name = "productInfo"
    allowed_domains = ['tyconsystems.com']
    start_urls = [
        'https://www.tyconsystems.com/rpms24-720-720',
        'https://www.tyconsystems.com/tpdin-cable-232',
    ]

    def parse(self, response):
        for product in response.css('section#listing'):
            items = tyconItem()  # Unique item for each iteration
            name_dirty = product.css('div.product-id span#product_id::text').get()
            product_sku = name_dirty.strip()
            product_sub_title_dirty = product.css('div.product-details h1.page_headers::text').get()
            product_sub_title = product_sub_title_dirty.strip()
            # product_store_description = product.css('p.series-card__intro').get()
            if product.xpath('//p[contains(@class, "MsoNormal")]'):
                summary = product.css('div.item > div p.MsoNormal').getall()
            elif product.xpath('//div[contains(@class, "item")]/div'):
                summary = product.css('div.item > div').getall()
            else:
                summary = product.css('div.item').getall()
            category_list = product.xpath('//div[@class="container"]//ol//li//a/span//text()').getall()
            category = category_list[-2].strip()
            description = product.css('div.item > p.MsoNormal::text').getall()
            if product.css('div.extrafieldsBlock span.info a::attr(href)').get() == '':
                datasheet = 'no-file'
            else:
                datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
            file_urls = datasheet
            specification = product.css('div#tab-6 div.info > table').getall()
            price = product.css('span#price::text').get()
            products_zoom_image = name_dirty.strip() + '.jpg'
            main_image = product.css('div#addl-images a::attr(href)').getall()
            image_urls = [response.urljoin(i) for i in main_image]

            items['category'] = category,
            items['datasheet'] = datasheet,
            items['description'] = description,
            items['main_image'] = main_image,
            items['price'] = price,
            items['product_link'] = response.url,  # get the product link from response
            items['product_sku'] = product_sku,
            items['product_sub_title'] = product_sub_title,
            items['products_zoom_image'] = products_zoom_image
            items['specification'] = specification,
            items['summary'] = summary,
            items['file_urls'] = [file_urls]
            items["name"] = product_sku
            items["image_urls"] = image_urls
            yield items
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Field, Item


class tyconItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    category = scrapy.Field()
    datasheet = scrapy.Field()
    description = scrapy.Field()
    file_urls = scrapy.Field()
    files = scrapy.Field()
    name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    main_image = scrapy.Field()
    price = scrapy.Field()
    product_link = scrapy.Field()
    product_sku = scrapy.Field()
    product_sub_title = scrapy.Field()
    products_zoom_image = scrapy.Field()
    specification = scrapy.Field()
    summary = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
# from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
from io import BytesIO
from PIL import Image


class tyconPipeline:
    def process_item(self, item, spider):
        return item


class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[-1]
        return file_name


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *args, item=None):
        filename = request.meta["filename"].strip()
        number = request.meta["file_num"]
        return filename + "_" + str(number) + ".jpg"

    def thumb_path(self, request, thumb_id, response=None, info=None):
        filename = request.meta["filename"]
        number = request.meta["file_num"]
        return f'thumbs/{thumb_id}/{filename}_{str(number)}.jpg'

    def get_media_requests(self, item, info):
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)

    def convert_image(self, image, size=None):
        if size is not None:
            # If the size is not None then it is a thumbnail,
            # so we resize it according to the parameter
            image = image.resize(size, Image.ANTIALIAS)
        else:
            # otherwise we give the image back to the superclass version of
            # this method for it to process.
            return super().convert_image(image, size=size)
        buf = BytesIO()  # These next 3 lines are from the scrapy source code.
        image.save(buf, 'JPEG', quality=72)
        return image, buf
Scrapy Error in Log
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
    current.result = callback(  # type: ignore[misc]
  File "/usr/lib/python3/dist-packages/scrapy/utils/defer.py", line 162, in f
    return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/media.py", line 87, in process_item
    requests = arg_to_iter(self.get_media_requests(item, info))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in get_media_requests
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in <listcomp>
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 60, in __init__
    self._set_url(url)
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 98, in _set_url
    raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got NoneType
Thanks everyone!

Two approaches are possible:
1. Override get_media_requests
Override get_media_requests in your pipelines to check for the existence of URLs as follows:
from itemadapter import ItemAdapter


class DownfilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        if not urls or not all(urls):
            return []  # THIS - don't build Requests if there is no URL
        return [Request(u) for u in urls]
    # Rest of the code


class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        urls = item.get("image_urls")
        if not urls or not all(urls):
            return  # THIS - don't yield Requests if there is no URL
        name = item["name"]
        for i, url in enumerate(urls):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)
2. Return different items
You can have different item types returned from the spider, based on whether you have a file or an image to download or not. For ease, I prefer using anonymous dictionaries, as follows:
def parse(self, response):
    item = {}
    item['category'] = category
    item['datasheet'] = datasheet
    ...
    if file_to_download:
        item['file_urls'] = [file_urls]
    if image_to_download:
        item['image_urls'] = image_urls
    item["name"] = product_sku
    yield item
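Applied to the spider in the question, the same idea means only setting file_urls when the datasheet selector actually matched. A small sketch of the relevant lines in parse (note that .get() returns None, not '', when no link is present, which is what caused the TypeError in the log):

    datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
    if datasheet:
        items['datasheet'] = datasheet
        items['file_urls'] = [datasheet]
    else:
        items['datasheet'] = 'no-file'
        # leave file_urls unset (or set it to []) so the FilesPipeline has nothing to download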
Hope it helps!

Related

Scrapy Pipeline with MongoDB is not working

I'm trying to put all the data that I'm scraping into MongoDB to monitor property prices.
I've already run a lot of tests, but it's not working.
I'll put the code here; if anyone could help me, please do.
init.py
import scrapy
from realstatedata.items import RealstatedataItem


class RsdataSpider(scrapy.Spider):
    name = 'realstatedata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href')
        yield from self.scrape(response)
        if nextpageurl:
            path = nextpageurl.extract_first()
            #print(path)
            # Got #pagina=2 => Replace with ?pagina=2
            path = '?' + path[1:]
            #print(path)
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):
            item = RealstatedataItem()
            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            item['price_cond'] = resource.xpath('.//strong[@class="js-condo-price"]/text()').extract_first()
            item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()
            yield item
settings.py
BOT_NAME = 'realstatedata'
SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'
ITEM_PIPELINES = ['realstatedata.pipelines.MongoPipeline', ]
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"
items.py
import scrapy


class RealstatedataItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    prop_area = scrapy.Field()
    prop_rooms = scrapy.Field()
    prop_bath = scrapy.Field()
    prop_parking = scrapy.Field()
    price_rent = scrapy.Field()
    price_cond = scrapy.Field()
    realstate_name = scrapy.Field()
    pass
pipeline.py
In this part of the code I've tried two different approaches, but neither works.
import pymongo
import logging


class MongoPipeline(object):
    collection_name = 'rent_properties'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        self.db[self.collection_name].insert(dict(item))
        logging.debug("Properties added to MongoDB")
        return item
Obviously, your setting for enabling the pipeline is wrong: ITEM_PIPELINES should be defined as a dict, not a list. In your code, the pipeline is not loaded at all.
ITEM_PIPELINES = {
"realstatedata.pipelines.MongoPipeline": 100,
}
The value in the dict represents the priority when more than one pipeline is enabled.
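Once the pipeline is actually loaded, one more thing worth knowing: newer pymongo releases deprecate and eventually remove Collection.insert(), so process_item may need insert_one() instead. A minimal sketch of the adjusted method, assuming the rest of MongoPipeline stays exactly as above:

    def process_item(self, item, spider):
        ## insert_one() replaces the old insert() in pymongo 3+
        self.db[self.collection_name].insert_one(dict(item))
        logging.debug("Properties added to MongoDB")
        return item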

Activating a Pipeline Component in Scrapy to write JSON

I am trying to save scraped items in separate JSON files, but I don't see any output files. The pipeline and the item are defined in the pipelines.py and items.py files in the Scrapy project folder. Do I have to call process_item() explicitly, or will it be called automatically when I return an item in scrape()? I enabled the pipeline in CrawlerProcess(settings={'ITEM_PIPELINES'}). Thanks.
The pipeline
import json, datetime


class JsonWriterPipeline(object):
    def process_item(self, item, spider):
        # return item
        fileName = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.json'
        try:
            with open(fileName, 'w') as fp:
                json.dump(dict(item), fp)
            return item
        except:
            return item
class ProjectItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()


class mySpider(CrawlSpider):
    name = 'mySPider'
    allowed_domains = ['allowedDOmain.org']
    start_urls = ['https://url.org']

    def parse(self, response):
        monthSelector = '//div[@class="archives-column"]/ul/li/a[contains(text(),"November 2019")]/@href'
        monthLink = response.xpath(monthSelector).extract_first()
        yield response.follow(monthLink, callback=self.scrape)

    def scrape(self, response):
        # get the links to all individual articles
        linkSelector = '.entry-title a::attr(href)'
        allLinks = response.css(linkSelector).extract()
        for link in allLinks:
            # item = articleItem()
            item = ProjectItem()
            item['url'] = link
            request = response.follow(link, callback=self.getContent)
            request.meta['item'] = item
            item = request.meta['item']
            yield item
        nextPageSelector = 'span.page-link a::attr(href)'
        nextPageLink = response.css(nextPageSelector).extract_first()
        yield response.follow(nextPageLink, callback=self.scrape)

    def getContent(self, response):
        item = response.meta['item']
        TITLE_SELECTOR = '.entry-title ::text'
        item['title'] = response.css(TITLE_SELECTOR).extract_first()
        yield item
To settings.py, add:
ITEM_PIPELINES = {
    'myproject.pipelines.JsonWriterPipeline': 100
}
where myproject is the name of your project/folder.
See the very last heading on this page : https://docs.scrapy.org/en/latest/topics/item-pipeline.html
When running a spider inside a script, the settings need to be imported using the method described in the following question: Running scrapy from script not including pipeline.
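For reference, a minimal sketch of that approach, assuming the script runs inside a normal Scrapy project so that get_project_settings() can find settings.py (and therefore the ITEM_PIPELINES entry):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # Load the project settings, including ITEM_PIPELINES, instead of passing an ad hoc dict
    process = CrawlerProcess(get_project_settings())
    process.crawl(mySpider)   # the spider class defined above
    process.start()           # blocks until the crawl is finished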

Trouble with downloading images using Scrapy

I'm getting the following error when attempting to download images using a spider with Scrapy.
File "C:\Python27\lib\site-packages\scrapy\http\request\__init__.py",
line 61, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h
As best as I can understand it, it looks like I'm missing an "h" in a url somewhere? But I can't for the life of me see where. Everything works fine if I'm not trying to download images. But once I add the appropriate code to the four files below, I can't get anything to work properly. Could anyone help me make sense of this error?
items.py
import scrapy


class ProductItem(scrapy.Item):
    model = scrapy.Field()
    shortdesc = scrapy.Field()
    desc = scrapy.Field()
    series = scrapy.Field()
    imageorig = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
settings.py
BOT_NAME = 'allenheath'
SPIDER_MODULES = ['allenheath.spiders']
NEWSPIDER_MODULE = 'allenheath.spiders'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = 'c:/allenheath/images'
pipelines.py
class AllenheathPipeline(object):
    def process_item(self, item, spider):
        return item


import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem


class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
products.py (my spider)
import scrapy
from allenheath.items import ProductItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse


class productsSpider(scrapy.Spider):
    name = "products"
    allowed_domains = ["http://www.allen-heath.com/"]
    start_urls = [
        "http://www.allen-heath.com/ahproducts/ilive-80/",
        "http://www.allen-heath.com/ahproducts/ilive-112/"
    ]

    def parse(self, response):
        for sel in response.xpath('/html'):
            item = ProductItem()
            item['model'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
            item['shortdesc'] = sel.css('#prodsingleouter > div > div > h3::text').extract()
            item['desc'] = sel.css('#tab1 #productcontent').extract()
            item['series'] = sel.css('#pagestrip > div > div > a:nth-child(3)::text').extract()
            item['imageorig'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
            item['image_urls'] = sel.css('#tab1 #productcontent img').extract()[0]
            item['image_urls'] = 'http://www.allen-heath.com' + item['image_urls']
            yield item
Any help would be greatly appreciated.
The issue is here:
def get_media_requests(self, item, info):
    for image_url in item['image_urls']:
        yield scrapy.Request(image_url)
and here:
item['image_urls'] = sel.css('#tab1 #productcontent img').extract()[0]
You are extracting this field and taking the first element, which means that when you iterate over it in the pipeline you are actually iterating over the characters of the URL string, which begins with http. That explains the error message you see as soon as the first character is processed:
Missing scheme in request url: h
Remove the [0] from the line. While you're at it, fetch the src of the image, instead of the entire element:
item['image_urls'] = sel.css('#tab1 #productcontent img').xpath('./@src').extract()
After that, you should update the next line also, in case the image url is relative, to convert it to absolute:
import urlparse  # put this at the top of the script

item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
But you don't need this last part if the image URL in src is actually absolute, so just remove it.
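Putting both changes together, the two image_urls lines in parse would end up roughly like this (just a sketch of those two assignments; the rest of the item population stays as it is):

    import urlparse  # at the top of products.py (Python 2)

    # inside parse(), replacing the original two image_urls lines:
    item['image_urls'] = sel.css('#tab1 #productcontent img').xpath('./@src').extract()
    item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]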

Renaming downloaded images in Scrapy 0.24 with content from an item field while avoiding filename conflicts?

I'm attempting to rename the images that are downloaded by my Scrapy 0.24 spider. Right now the downloaded images are stored with a SHA1 hash of their URLs as the file names. I'd like to instead name them the value I extract with item['model']. This question from 2011 outlines what I want, but the answers are for previous versions of Scrapy and don't work with the latest version.
Once I manage to get this working I'll also need to make sure I account for different images being downloaded with the same filename. So I'll need to download each image to its own uniquely named folder, presumably based on the original URL.
Here is a copy of the code I am using in my pipeline. I got it from a more recent answer in the link above, but it's not working for me. Nothing errors out and the images are downloaded as normal, but my extra code doesn't seem to have any effect on the filenames: they still appear as SHA1 hashes.
pipelines.py
class AllenheathPipeline(object):
    def process_item(self, item, spider):
        return item


import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem


class MyImagesPipeline(ImagesPipeline):
    # Name download version
    def file_path(self, request, response=None, info=None):
        item = request.meta['item']  # Like this you can use all from item, not just url.
        image_guid = request.url.split('/')[-1]
        return 'full/%s' % (image_guid)

    # Name thumbnail version
    def thumb_path(self, request, thumb_id, response=None, info=None):
        image_guid = thumb_id + request.url.split('/')[-1]
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        # yield Request(item['images'])  # Adding meta. I don't know, how to put it in one line :-)
        for image in item['images']:
            yield Request(image)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py
BOT_NAME = 'allenheath'
SPIDER_MODULES = ['allenheath.spiders']
NEWSPIDER_MODULE = 'allenheath.spiders'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = 'c:/allenheath/images'
products.py (my spider)
import scrapy
import urlparse
from allenheath.items import ProductItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse


class productsSpider(scrapy.Spider):
    name = "products"
    allowed_domains = ["http://www.allen-heath.com/"]
    start_urls = [
        "http://www.allen-heath.com/ahproducts/ilive-80/",
        "http://www.allen-heath.com/ahproducts/ilive-112/"
    ]

    def parse(self, response):
        for sel in response.xpath('/html'):
            item = ProductItem()
            item['model'] = sel.css('#prodsingleouter > div > div > h2::text').extract()  # The value I'd like to use to name my images.
            item['shortdesc'] = sel.css('#prodsingleouter > div > div > h3::text').extract()
            item['desc'] = sel.css('#tab1 #productcontent').extract()
            item['series'] = sel.css('#pagestrip > div > div > a:nth-child(3)::text').extract()
            item['imageorig'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
            item['image_urls'] = sel.css('#tab1 #productcontent .col-sm-9 img').xpath('./@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
            yield item
items.py
import scrapy


class ProductItem(scrapy.Item):
    model = scrapy.Field()
    itemcode = scrapy.Field()
    shortdesc = scrapy.Field()
    desc = scrapy.Field()
    series = scrapy.Field()
    imageorig = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
Here's a pastebin of the output I get from the command prompt when I run the spider: http://pastebin.com/ir7YZFqf
Any help would be greatly appreciated!
The pipelines.py:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy import log


class MyImagesPipeline(ImagesPipeline):
    # Name download version
    def file_path(self, request, response=None, info=None):
        image_guid = request.meta['model'][0]
        log.msg(image_guid, level=log.DEBUG)
        return 'full/%s' % (image_guid)

    # Name thumbnail version
    def thumb_path(self, request, thumb_id, response=None, info=None):
        image_guid = thumb_id + request.url.split('/')[-1]
        log.msg(image_guid, level=log.DEBUG)
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        yield Request(item['image_urls'][0], meta=item)
You're using the settings.py wrong. You should use this:
ITEM_PIPELINES = {'allenheath.pipelines.MyImagesPipeline': 1}
For thumbnails to work, add this to settings.py:
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (100, 100),
}
Since the URL hash guarantees a unique identifier, you could simply write the item's value and the URL hash to a separate file.
Once the crawl is done, you can loop over that file and do the renaming (using a Counter dictionary to append a number whenever several items share the same value), as sketched below.
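A minimal sketch of that post-processing step, assuming a mapping file with one model<TAB>hash pair per line (written out separately during the crawl) and images stored under IMAGES_STORE/full/<hash>.jpg, which is the pipeline's default layout; the file name name_map.txt is made up for the example:

    import os
    from collections import Counter

    IMAGES_STORE = 'c:/allenheath/images'
    seen = Counter()

    with open('name_map.txt') as fp:               # hypothetical "model<TAB>hash" mapping file
        for line in fp:
            model, sha1 = line.strip().split('\t')
            seen[model] += 1                       # count items that share the same model value
            old = os.path.join(IMAGES_STORE, 'full', sha1 + '.jpg')
            new = os.path.join(IMAGES_STORE, 'full', '%s_%d.jpg' % (model, seen[model]))
            os.rename(old, new)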

DropItem if parsed url contains key words (pipeline)

I am trying to build a spider for a school project where I am scraping recipes from allrecipes.com. Everything is working really well, however I seem to be unable to remove duplicate recipes where one url contains the actual recipe, and the other contains the same url with "video=true" appended.
Here is my attempt at dealing with this in pipelines.py:
from scrapy.exceptions import DropItem
from scrapy import log


class DuplicatesPipeline(object):
    # minCal = 50
    def __init__(self):
        self.urls_seen = set()

    def process_vids(self, item, spider):
        video = "video=true"
        url = str(item.get('url'))
        if video in url:
            raise DropItem("Contains video")
        else:
            return item

    def process_item(self, item, spider):
        unique_id = item.get('url')
        if unique_id in self.urls_seen:
            raise DropItem("Duplicate Item found (%s)" % unique_id)
        else:
            self.urls_seen.add('url')
            return item
settings.py:
# Scrapy settings for dirbot project
BOT_NAME = 'dirbot'
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
ITEM_PIPELINES = {'dirbot.pipelines.DuplicatesPipeline': 300,}
items.py:
from scrapy.item import Item, Field


class Website(Item):
    name = Field()
    url = Field()
    description = Field()
    kcal = Field()
    carbs = Field()
    fat = Field()
    protein = Field()
    main = Field()
    sugar = Field()
    fibre = Field()
    author = Field()
    rating = Field()
    img = Field()
dnot.py:
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider, Rule
import urlparse
import scrapy

page = "http://allrecipes.com/recipes/main.aspx?Page=%d#recipes"


class DmozSpider(Spider):
    name = "dnot"
    allowed_domains = ["allrecipes.com"]
    start_urls = [page % 1]
    rules = [Rule(SgmlLinkExtractor(allow=('allrecipes.com'), restrict_xpaths='//a[contains(.,"NEXT")]'),
                  callback="parse", follow=True),
             ]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        print "-------------------------------------------------"
        print self.page_number
        print "-------------------------------------------------"
        sel = Selector(response)
        sites = response.xpath('//div[@id="divGridItemWrapper"]')
        items = []
        for site in sites:
            item = Website()
            recipe = response.xpath('//a[contains(@href, "/Recipe/")]/@href').extract()
            url = "http://www.allrecipes.com"
            for nth in recipe:
                go = urlparse.urljoin(url, str(nth))
                items.append(item)
                for link in go:
                    yield Request(go, self.recipes)
        if self.page_number <= 3:
            self.page_number += 1
            yield Request(page % self.page_number)
        else:
            pass

    def recipes(self, response):
        item = Website()
        sel = Selector(response)
        recipe = response.xpath('//div[@id="content-wrapper"]')
        items = []
        print "second page - %s" % response.url
        for i in recipe:
            item['url'] = response.url
            item['description'] = i.xpath('//span[@itemprop="description"]/text()').extract()
            item['name'] = i.xpath('//h1[@itemprop="name"]/text()').extract()
            item['kcal'] = i.xpath('//ul/li[contains(.,"kcal")]/span/text()').extract()
            item['carbs'] = i.xpath('//ul/li[contains(.,"Carbohydrates")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['fat'] = i.xpath('//ul/li[contains(.,"Fat")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['protein'] = i.xpath('//ul/li[contains(.,"Protein")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['main'] = "allrecipes.com"
            item['sugar'] = i.xpath('//li/span[@itemprop="sugarContent"]/text()').extract()
            item['fibre'] = i.xpath('//li/span[@itemprop="proteinContent"]/text()').extract()
            item['author'] = i.xpath('//span[@id="lblUser0"]/text()').extract()
            item['rating'] = i.xpath('//div[@class="rating-stars-img"][1]/meta[1][@itemprop="ratingValue"]/@content').extract()
            item['img'] = i.xpath('//img[@id="imgPhoto"]/@src').extract()
            items.append(item)
            yield item
I am a little new to Python, and I'm not sure whether I need to convert item['url'] into a string or not; I have tried it with str and without. I have also tried a few other methods that others have used for something similar, but nothing has worked for me so far.
Hoping someone can point me in the right direction. Thanks in advance!
Example:
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true
You need to create a class that implements the process_item method in the pipelines.py file, something like:
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs

from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        url = item['url']
        u = urlparse(url)
        query = parse_qs(u.query)
        query.pop('video', None)
        u = u._replace(query=urlencode(query, True))
        unique_id = urlunparse(u)
        if unique_id and unique_id in self.ids_seen:
            raise DropItem("Duplicate Item found (%s)" % unique_id)
        else:
            self.ids_seen.add(unique_id)
            return item
Then you need to add that class to settings.py:
ITEM_PIPELINES = {
    'yourproject.pipelines.DuplicatesPipeline': 300,
}
Also, your process_vids method isn't being used.
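If you do still want that video check, the simplest option is to give it its own small pipeline and register it in ITEM_PIPELINES as well. A sketch under those assumptions (VideoFilterPipeline is a made-up name):

    from scrapy.exceptions import DropItem


    class VideoFilterPipeline(object):
        # Drops items whose URL points at the "video=true" variant of a recipe.
        def process_item(self, item, spider):
            if 'video=true' in str(item.get('url', '')):
                raise DropItem("Contains video")
            return item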
Let me know if it helps you.
