I am writing an image scraper using Scrapy with default ImagePipeline.
Generally, everything has been working fine now.
However I cannot get the saved path of scraped image.
items.py:
class MyItem(scrapy.Item):
name = scrapy.Field()
type = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
pipelines.py:
class MyPipeline(object):
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield scrapy.Request(image_url)
def item_completed(self, results, item, info):
mage_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
item['image_paths'] = image_paths
return item
myspider.py:
import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from mycrawler.items import MyItem
class VscrawlerSpider(CrawlSpider):
"""docstring for VscrawlerSpider"""
name = "myspider"
allowed_domains = ["vesselfinder.com"]
start_urls = [
"https://www.vesselfinder.com/vessels?page=1"
]
rules = [
Rule(LinkExtractor(allow=r'vesselfinder.com/vessels\?page=[1-4]'),
callback='parse_item', follow=True)
]
def parse_item(self, response):
ships = response.xpath('//div[#class="items"]/article')
for ship in ships:
item = MyItem()
item['name'] = ship.xpath('div[2]/header/h1/a/text()').extract()[1].strip()
item['image_urls'] = [ship.xpath('div[1]/a/picture/img/#src').extract()[0]]
item['type'] = ship.xpath('div[2]/div[2]/div[2]/text()').extract()[0]
str = item['image_paths'][0] + item['type'] + item['name']
yield item
I got the error:
exceptions.KeyError: 'image_paths'.
I tried to use item['images'][0].path but the some error still occurs. I don't know where does this error come from?
You have not defined image_paths field, define it:
class MyItem(scrapy.Item):
# ...
image_paths = scrapy.Field()
You probably meant to use images field for it instead
Related
I want to remove the [ ] brackets scrapy adds to all it's output, to do this you simply add [0] at the end of an xpath statement as follows:
'a[#class="question-hyperlink"]/text()').extract()[0]
this solves the [ ] problem in some cases but in other cases scrapy returns every second row of output as blank and as such the moment it gets to the second row when using [0] i'm given the error:
Index error: list index out of range
How can I prevent scrapy from creating blank rows ? It seems like this is a common problem, but everyone faces this problem when exporting to CSV while for me it's with the scrapy response before exporting as CSV.
Items.py:
import scrapy
from scrapy.item import Item, Field
class QuestionItem(Item):
title = Field()
url = Field()
class PopularityItem(Item):
votes = Field()
answers = Field()
views = Field()
class ModifiedItem(Item):
lastModified = Field()
modName = Field()
The spider that doesn't output every second row as blank and thus works with [0]:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import QuestionItem
class QuestionSpider(Spider):
name = "questions"
allowed_domains = ["stackoverflow.com"]
start_urls = [
"http://stackoverflow.com/questions?pagesize=50&sort=newest",
]
def parse(self, response):
questions = Selector(response).xpath('//div[#class="summary"]/h3')
for question in questions:
item = QuestionItem()
item['title'] = question.xpath(
'a[#class="question-hyperlink"]/text()').extract()[0]
item['url'] = question.xpath(
'a[#class="question-hyperlink"]/#href').extract()[0]
yield item
The spider that gives every second row of output as blank:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import PopularityItem
class PopularitySpider(Spider):
name = "popularity"
allowed_domains = ["stackoverflow.com"]
start_urls = [
"https://stackoverflow.com/",
]
def parse(self, response):
popularity = response.xpath('//div[contains(#class, "question-summary narrow")]/div')
for poppart in popularity:
item = PopularityItem()
item['votes'] = poppart.xpath(
'div[contains(#class, "votes")]//span/text()').extract()#[0]
item['answers'] = poppart.xpath(
'div[contains(#class, "answered")]//span/text()').extract()#[0]
item['views'] = poppart.xpath(
'div[contains(#class, "views")]//span/text()').extract()#[0]
yield item
Pipelines.py
import pymongo
import logging
class StackPipeline(object):
def process_item(self, item, spider):
return item
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
self.db = connection[settings['MONGODB_DB']]
def process_item(self, item, spider):
collection = self.db[type(item).__name__.lower()]
logging.info(collection.insert(dict(item)))
return item
The easiest way to handle an error like this is to catch it and deal with it then (in this case, by just moving on past the blank lines).
class PopularitySpider(Spider):
name = "popularity"
allowed_domains = ["stackoverflow.com"]
start_urls = ["https://stackoverflow.com/"]
def parse(self, response):
popularity = response.xpath('//div[contains(#class, "question-summary narrow")]/div')
for poppart in popularity:
try:
item = PopularityItem()
item['votes'] = poppart.xpath('div[contains(#class, "votes")]//span/text()').extract()[0]
item['answers'] = poppart.xpath('div[contains(#class, "answered")]//span/text()').extract()[0]
item['views'] = poppart.xpath('div[contains(#class, "views")]//span/text()').extract()[0]
except IndexError:
continue
yield item
I am trying to build a spider for a school project where I am scraping recipes from allrecipes.com. Everything is working really well, however I seem to be unable to remove duplicate recipes where one url contains the actual recipe, and the other contains the same url with "video=true" appended.
Here is my attempt to dealing with this in pipelines.py:
from scrapy.exceptions import DropItem
from scrapy import log
class DuplicatesPipeline(object):
# minCal = 50
def __init__(self):
self.urls_seen = set()
def process_vids(self, item, spider):
video = "video=true"
url = str(item.get('url'))
if video in url:
raise DropItem("Contains video")
else:
return item
def process_item(self, item, spider):
unique_id = item.get('url')
if unique_id in self.urls_seen:
raise DropItem("Duplicate Item found (%s)" % unique_id)
else:
self.urls_seen.add('url')
return item
settings.py:
# Scrapy settings for dirbot project
BOT_NAME = 'dirbot'
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
ITEM_PIPELINES = {'dirbot.pipelines.DuplicatesPipeline': 300,}
items.py:
from scrapy.item import Item, Field
class Website(Item):
name = Field()
url = Field()
description = Field()
kcal = Field()
carbs = Field()
fat = Field()
protein = Field()
main = Field()
sugar = Field()
fibre = Field()
author = Field()
rating = Field()
img = Field()
dnot.py:
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider,Rule
import urlparse
import scrapy
page = "http://allrecipes.com/recipes/main.aspx?Page=%d#recipes"
class DmozSpider(Spider):
name = "dnot"
allowed_domains = ["allrecipes.com"]
start_urls = [page % 1]
rules = [Rule(SgmlLinkExtractor(allow=('allrecipes.com'), restrict_xpaths = '//a[contains(.,"NEXT")]'),
callback="parse", follow= True),
]
def __init__(self):
self.page_number = 1
def parse(self, response):
print "-------------------------------------------------"
print self.page_number
print "-------------------------------------------------"
sel = Selector(response)
sites = response.xpath('//div[#id="divGridItemWrapper"]')
items = []
for site in sites:
item = Website()
recipe = response.xpath('//a[contains(#href, "/Recipe/")]/#href').extract()
url = "http://www.allrecipes.com"
for nth in recipe:
go = urlparse.urljoin(url, str(nth))
items.append(item)
for link in go:
yield Request(go, self.recipes)
if self.page_number <= 3:
self.page_number += 1
yield Request(page % self.page_number)
else:
pass
def recipes(self,response):
item = Website()
sel = Selector(response)
recipe = response.xpath('//div[#id="content-wrapper"]')
items = []
print "second page - %s" % response.url
for i in recipe:
item['url'] = response.url
item['description'] = i.xpath('//span[#itemprop="description"]/text()').extract()
item['name'] = i.xpath('//h1[#itemprop="name"]/text()').extract()
item['kcal'] = i.xpath('//ul/li[contains(.,"kcal")]/span/text()').extract()
item['carbs'] = i.xpath('//ul/li[contains(.,"Carbohydrates")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['fat'] = i.xpath('//ul/li[contains(.,"Fat")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['protein'] = i.xpath('//ul/li[contains(.,"Protein")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['main'] = "allrecipes.com"
item['sugar'] = i.xpath('//li/span[#itemprop="sugarContent"]/text()').extract()
item['fibre'] = i.xpath('//li/span[#itemprop="proteinContent"]/text()').extract()
item['author'] = i.xpath('//span[#id="lblUser0"]/text()').extract()
item['rating'] = i.xpath('//div[#class="rating-stars-img"][1]/meta[1][#itemprop="ratingValue"]/#content').extract()
item['img'] = i.xpath('//img[#id="imgPhoto"]/#src').extract()
items.append(item)
yield item
I am a little new with Python, and I'm not sure if I need to convert the item['url'] into a string or not; however I have tried with the "str" and without. I have also tried a few other methods that others have used for doing something similar, but nothing has worked for me so far.
Hoping someone can point me in the right direction. Thanks in advance!
Example:
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true
You need to create a class that implements the process_item method on the pipelines.py file, something like:
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs
class DuplicatesPipeline(object):
def __init__(self):
self.ids_seen = set()
def process_item(self, item, spider):
url = item['url']
u = urlparse(url)
query = parse_qs(u.query)
query.pop('video', None)
u = u._replace(query=urlencode(query, True))
unique_id = urlunparse(u)
if unique_id and unique_id in self.ids_seen:
raise DropItem("Duplicate Item found (%s)" % unique_id)
else:
self.ids_seen.add(unique_id)
return item
Then you need to add that class, to settings.py
ITEM_PIPELINES = {
'yourproject.pipelines.DuplicatesPipeline': 300,
}
Also, your process_vids method isn't being used.
let me know if it helps you.
I am new to python programming and using scrapy. I have setup my crawler and so far it was working until I got to the point where I wanted to figure out how to download images. The error I am getting is cannot import name NsiscrapePipeline. I dont know what I am doing wrong and I dont understand some of the documentation as I am new. Please help
Items File
from scrapy.item import Item, Field
class NsiscrapeItem(Item):
# define the fields for your item here like:
# name = Field()
location = Field()
stock_number = Field()
year = Field()
manufacturer = Field()
model = Field()
length = Field()
price = Field()
status = Field()
url = Field()
pass
Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image
class NsiscrapeSpider(BaseSpider):
name = "Nsiscrape"
allowed_domain = ["yachtauctions.com"]
start_urls = [
"http://www.yachtauctions.com/inventory/"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//tr')
items = []
for site in sites:
item = NsiscrapeItem()
item['location'] = site.select('td[2]/text()').extract()
item['stock_number'] = site.select('td[3]/a/text()').extract()
item['year'] = site.select('td[4]/text()').extract()
item['manufacturer'] = site.select('td[5]/text()').extract()
item['model'] = site.select('td[6]/text()').extract()
item['length'] = site.select('td[7]/text()').extract()
item['price'] = site.select('td[8]/text()').extract()
item['status'] = site.select('td[10]/img/#src').extract()
item['url'] = site.select('td[1]/a/#href').extract()
item['image_urls'] = site.select('td/a[3]/img/#data-original').extract()
item['images'] = item['image_urls']
yield Request(item['url'][0], meta={'item':item}, callback=self.product_detail_page)
def product_detail_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.request.meta['item']
#add all images url in the item['image_urls']
yield item
settings
ITEM_PIPELINES = ['scrapy.contrib.pipeline.image.NsiscrapePipeline']
IMAGES_STORE = 'c:\Python27\NSIscrape\IMG'
IMAGES_EXPIRES = 90
Pipelines This is where I am unsure if I am missing something
from scrapy.item import Item
class NsiscrapePipeline(Item):
image_urls = Field()
images = Field()
def process_item(self, item, spider):
return item
error
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
You tried to pass list, but this function accepts only string. Pass only one element from list (for example list[0]).
Heres my final code thats working. There was two issues
1: I was missing the second backslash that needede to be in the request --> //td[1]/a[3]/img/#data-original
2: I had to check the full URL in which the image would be displayed and join them together which was the main URL or the allowed URL and the image URL.
def parse(self, response):
hxs = HtmlXPathSelector(response)
images = hxs.select('//tr')
url = []
for image in images:
urls = NsiscrapeItem()
urls['image_urls'] = ["http://www.yachtauctions.com" + x for x in image.select('//td[1]/a[3]/img/#data-original').extract()]
url.append(urls)
return url
That isn't part of the library :) - at least by looking at their current master branch
I think you're looking for ImagesPipeline
Their example may help! example
p.s. I don't think you custom name the class - at least not by how scapy is designed; i'm reasonably sure you use their class ;)
I am new to python programming and using scrapy. I have setup my crawler and so far it was working until I got to the point where I wanted to figure out how to download images. The error I am getting is cannot import name NsiscrapePipeline. I dont know what I am doing wrong and I dont understand some of the documentation as I am new. Please help
Items File
from scrapy.item import Item, Field
class NsiscrapeItem(Item):
# define the fields for your item here like:
# name = Field()
location = Field()
stock_number = Field()
year = Field()
manufacturer = Field()
model = Field()
length = Field()
price = Field()
status = Field()
url = Field()
pass
Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image
class NsiscrapeSpider(BaseSpider):
name = "Nsiscrape"
allowed_domain = ["yachtauctions.com"]
start_urls = [
"http://www.yachtauctions.com/inventory/"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//tr')
items = []
for site in sites:
item = NsiscrapeItem()
item['location'] = site.select('td[2]/text()').extract()
item['stock_number'] = site.select('td[3]/a/text()').extract()
item['year'] = site.select('td[4]/text()').extract()
item['manufacturer'] = site.select('td[5]/text()').extract()
item['model'] = site.select('td[6]/text()').extract()
item['length'] = site.select('td[7]/text()').extract()
item['price'] = site.select('td[8]/text()').extract()
item['status'] = site.select('td[10]/img/#src').extract()
item['url'] = site.select('td[1]/a/#href').extract()
item['image_urls'] = site.select('td/a[3]/img/#data-original').extract()
item['images'] = item['image_urls']
yield Request(item['url'][0], meta={'item':item}, callback=self.product_detail_page)
def product_detail_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.request.meta['item']
#add all images url in the item['image_urls']
yield item
settings
ITEM_PIPELINES = ['scrapy.contrib.pipeline.image.NsiscrapePipeline']
IMAGES_STORE = 'c:\Python27\NSIscrape\IMG'
IMAGES_EXPIRES = 90
Pipelines This is where I am unsure if I am missing something
from scrapy.item import Item
class NsiscrapePipeline(Item):
image_urls = Field()
images = Field()
def process_item(self, item, spider):
return item
error
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
You tried to pass list, but this function accepts only string. Pass only one element from list (for example list[0]).
Heres my final code thats working. There was two issues
1: I was missing the second backslash that needede to be in the request --> //td[1]/a[3]/img/#data-original
2: I had to check the full URL in which the image would be displayed and join them together which was the main URL or the allowed URL and the image URL.
def parse(self, response):
hxs = HtmlXPathSelector(response)
images = hxs.select('//tr')
url = []
for image in images:
urls = NsiscrapeItem()
urls['image_urls'] = ["http://www.yachtauctions.com" + x for x in image.select('//td[1]/a[3]/img/#data-original').extract()]
url.append(urls)
return url
That isn't part of the library :) - at least by looking at their current master branch
I think you're looking for ImagesPipeline
Their example may help! example
p.s. I don't think you custom name the class - at least not by how scapy is designed; i'm reasonably sure you use their class ;)
I am trying to crawl images from a website with the following scrapy code:
import urlparse
from PIL import Image
from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.contrib.pipeline.images import ImagesPipeline
from mobile.items import Website
class MobileSpider(CrawlSpider):
name = "mobile"
allowed_domains = ["mobile-store.ro"]
start_urls = ["http://www.mobile-store.ro/produse/"]
rules = (
Rule(SgmlLinkExtractor(allow=r"/produs/d+"), follow=True),
Rule(SgmlLinkExtractor(allow=r"/produse/d+"), callback='parse_item')
)
def parse(self, response, response2):
hxs = HtmlXPathSelector(response)
next_page = hxs.select("//ul[#class='products']/li/a/#href").extract()
if not not next_page:
yield Request(next_page[0], self.parse)
sites = hxs.select('//div[#id="wrapper"]/div[#id="content"]')
items = []
for site in sites:
item = Website()
item['nume'] = site.select('//div[#class="summary"]/h1[#class="product_title entry-title"]/text()').extract()
item['categorie'] = site.select('//div[#class="summary"]/div[#class="product_meta"]/span[#class="posted_in"]/a/text()').extract()
item['brand'] = site.select('//div[#class="summary"]/div[#class="product_meta"]/span[#class="tagged_as"]/a/text()').extract()
item['descriere'] = site.select('//div[#class="woocommerce_tabs"]/div[#id="tab-description"]/p/text()').extract()
image_relative_url = site.select('//div[#class="ad-image-wrapper"]/div[#class="ad-image"]/img[#class="lightbox"]/#src').extract()
item['image_urls'] = [urlparse.urljoin(response.url,image_relative_url)]
#item['image_urls'] = site.select('//div[#class="ad-image-wrapper"]/div[#class="ad-image"]/img[#class="lightbox"]/#src').extract()
item['pret'] = site.select('//div[#class="summary"]/div[1]/p[#class="price"]/span[#class="amount"]/text()').extract()
item['url'] = response.url
items.append(item)
for item in items:
yield item
settings.py:
SPIDER_MODULES = ['mobile.spiders']
NEWSPIDER_MODULE = 'mobile.spiders'
DEFAULT_ITEM_CLASS = 'mobile.items.Website'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
items.py:
from scrapy.item import Item, Field
class Website(Item):
nume = Field()
descriere = Field()
categorie = Field()
brand = Field()
pret = Field()
url = Field()
image_urls = Field()
images = Field()
image_paths = Field()
pipelines.py:
from mobile.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class MyImagesPipeline(ImagesPipeline):
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield Request(image_url)
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
item['image_paths'] = image_paths
return item
The issues comes when I try to get the images url, by using the following code:
for site in sites:
item = Website()
item['nume'] = site.select('//div[#class="summary"]/h1[#class="product_title entry-title"]/text()').extract()
item['categorie'] = site.select('//div[#class="summary"]/div[#class="product_meta"]/span[#class="posted_in"]/a/text()').extract()
item['brand'] = site.select('//div[#class="summary"]/div[#class="product_meta"]/span[#class="tagged_as"]/a/text()').extract()
item['descriere'] = site.select('//div[#class="woocommerce_tabs"]/div[#id="tab-description"]/p/text()').extract()
image_relative_url = site.select('//div[#class="ad-image-wrapper"]/div[#class="ad-image"]/img[#class="lightbox"]/#src').extract()
item['image_urls'] = [urlparse.urljoin(response.url2,image_relative_url)]
#item['image_urls'] = site.select('//div[#class="ad-image-wrapper"]/div[#class="ad-image"]/img[#class="lightbox"]/#src').extract()
item['pret'] = site.select('//div[#class="summary"]/div[1]/p[#class="price"]/span[#class="amount"]/text()').extract()
item['url'] = response.url
items.append(item)
for item in items:
yield item
Which returns me the page url instead of the image url. All other fields are crawled correctly. Any clues on how to fix this issue and get the image url properly?
This is because the image (and the whole content of ad-image-wrapper div) is filled dynamically via javascript.
Dumping response.body in parse method helped me to figure out that the actual image link is originally kept in the ad-thumb-list list. So, try use the following for getting the image url:
image_relative_url = site.select('//ul[#class="ad-thumb-list"]/li[#class="first_item"]/a/#href').extract()
if image_relative_url:
image_relative_url = image_relative_url[0]
Hope that is what you needed.