I'm getting the following error when attempting to download images using a spider with Scrapy.
File "C:\Python27\lib\site-packages\scrapy\http\request\__init__.py",
line 61, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h
As best I can tell, it looks like I'm missing an "h" in a URL somewhere? But I can't for the life of me see where. Everything works fine if I'm not trying to download images, but once I add the image-handling code to the four files below, I can't get anything to work properly. Could anyone help me make sense of this error?
items.py
import scrapy

class ProductItem(scrapy.Item):
    model = scrapy.Field()
    shortdesc = scrapy.Field()
    desc = scrapy.Field()
    series = scrapy.Field()
    imageorig = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
settings.py
BOT_NAME = 'allenheath'
SPIDER_MODULES = ['allenheath.spiders']
NEWSPIDER_MODULE = 'allenheath.spiders'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = 'c:/allenheath/images'
pipelines.py
import scrapy

from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem


class AllenheathPipeline(object):
    def process_item(self, item, spider):
        return item


class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
products.py (my spider)
import scrapy
from allenheath.items import ProductItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse


class productsSpider(scrapy.Spider):
    name = "products"
    allowed_domains = ["http://www.allen-heath.com/"]
    start_urls = [
        "http://www.allen-heath.com/ahproducts/ilive-80/",
        "http://www.allen-heath.com/ahproducts/ilive-112/"
    ]

    def parse(self, response):
        for sel in response.xpath('/html'):
            item = ProductItem()
            item['model'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
            item['shortdesc'] = sel.css('#prodsingleouter > div > div > h3::text').extract()
            item['desc'] = sel.css('#tab1 #productcontent').extract()
            item['series'] = sel.css('#pagestrip > div > div > a:nth-child(3)::text').extract()
            item['imageorig'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
            item['image_urls'] = sel.css('#tab1 #productcontent img').extract()[0]
            item['image_urls'] = 'http://www.allen-heath.com' + item['image_urls']
            yield item
Any help would be greatly appreciated.
The issue is here:
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield scrapy.Request(image_url)
and here:
item['image_urls'] = sel.css('#tab1 #productcontent img').extract()[0]
You are extracting this field and taking only its first element, so item['image_urls'] ends up as a single string rather than a list. When the pipeline then iterates over it, it is in fact iterating over the characters of the URL, which begins with http - explaining the error message you are seeing as soon as the first character, h, is treated as a URL:
Missing scheme in request url: h
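A quick illustration in a Python 2 shell (with a made-up URL) shows why the first character is what blows up:

>>> url = 'http://www.allen-heath.com/example.jpg'  # hypothetical URL
>>> for image_url in url:  # iterating over a string yields characters
...     print image_url
...     break
h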
Remove the [0] from the line. While you're at it, fetch the src attribute of the image instead of the entire element:
item['image_urls'] = sel.css('#tab1 #productcontent img').xpath('./@src').extract()
After that, you should also update the next line to convert each URL to an absolute one, in case the image URLs are relative:
import urlparse # put this at the top of the script
item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
If the URLs in src are already absolute, you don't need this last step and can simply remove it.
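Putting the fixes together, the relevant lines of parse would look something like this (a sketch of the corrected spider; urlparse is the Python 2 standard-library module):

import urlparse  # at the top of the spider file

    def parse(self, response):
        for sel in response.xpath('/html'):
            item = ProductItem()
            # ... other fields as before ...
            # take the src attribute of every image, keeping the result a list
            item['image_urls'] = sel.css('#tab1 #productcontent img').xpath('./@src').extract()
            # resolve any relative URLs against the page URL
            item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
            yield item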
Related
My spider code has been working well so far, but now, when I try to run a batch of these spiders, everything works except that for some spiders Scrapy downloads the images, and for the rest it downloads nothing. All the spiders are the same except for their start_urls. Any help is appreciated!
Here's my pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request


class DmozPipeline(object):
    def process_item(self, item, spider):
        return item


class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)
        for nlabel in item['nlabel']:
            yield Request(nlabel)
        print item['image_urls']

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py:
BOT_NAME = 'dmoz2'
BOT_VERSION = '1.0'
SPIDER_MODULES = ['dmoz2.spiders']
NEWSPIDER_MODULE = 'dmoz2.spiders'
DEFAULT_ITEM_CLASS = 'dmoz2.items.DmozItem'
ITEM_PIPELINES = ['dmoz2.pipelines.MyImagesPipeline']
IMAGES_STORE = '/ps/dmoz2/images'
IMAGES_THUMBS = {
    # letting height be variable
    # 'small': ('', 120),
    'small': (120, ''),
    # 'big': ('', 240),
    'big': (300, ''),
}
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
items.py:
from scrapy.item import Item, Field
from scrapy.utils.python import unicode_to_str


def u_to_str(text):
    return unicode_to_str(text, 'latin-1', 'ignore')


class DmozItem(Item):
    category_ids = Field()
    ....
    image_urls = Field()
    image_paths = Field()
    pass
myspider.py:
from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from scrapy import Selector
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
from dmoz2.items import DmozItem


class DmozSpider(Spider):
    name = "fritos_jun2015"
    allowed_domains = ["walmart.com"]
    start_urls = [
        "http://www.walmart.com/ip/Fritos-Bar-B-Q-Flavored-Corn-Chips-9.75-oz/36915853",
        "http://www.walmart.com/ip/Fritos-Corn-Chips-1-oz-6-count/10900088",
    ]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.xpath('/html/body/div[1]/section/section[4]/div[2]')
        items = []
        for site in sites:
            item = DmozItem()
            item['category_ids'] = ''
            .....
            item['image_urls'] = site.xpath('div[1]/div[3]/div[1]/div/div/div[2]/div/div/div[1]/div/div/img[2]/@src').extract()
            items.append(item)
        return items
I would really like to know why the same spider fetches images at some times and not at others. All the spiders are the same except for the start_urls, which are from the same allowed domain. Also, the image URLs are all absolute, and the paths are correct.
Thanks in advance.
-TM
When screen scraping, one common problem is that the server cuts the connection because you are accessing it too often (both to stop screen scrapers from inadvertently DDoSing the website and to keep costs down when someone pings the site every millisecond, etc.).
Try adding a sleep() call between every request to the Walmart page. This way you won't get blocked from accessing the server.
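If you would rather not scatter sleep() calls through the spider, Scrapy also has built-in throttling that achieves the same thing; a minimal sketch for settings.py (DOWNLOAD_DELAY and RANDOMIZE_DOWNLOAD_DELAY are standard Scrapy settings):

# settings.py -- throttle requests so the server doesn't cut the connection
DOWNLOAD_DELAY = 2  # seconds to wait between consecutive requests
RANDOMIZE_DOWNLOAD_DELAY = True  # vary the delay (0.5x-1.5x) so requests look less mechanical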
I am writing an image scraper using Scrapy with the default ImagesPipeline.
Generally, everything has been working fine.
However, I cannot get the saved path of a scraped image.
items.py:
class MyItem(scrapy.Item):
    name = scrapy.Field()
    type = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py:
class MyPipeline(object):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
myspider.py:
import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from mycrawler.items import MyItem


class VscrawlerSpider(CrawlSpider):
    """docstring for VscrawlerSpider"""
    name = "myspider"
    allowed_domains = ["vesselfinder.com"]
    start_urls = [
        "https://www.vesselfinder.com/vessels?page=1"
    ]
    rules = [
        Rule(LinkExtractor(allow=r'vesselfinder.com/vessels\?page=[1-4]'),
             callback='parse_item', follow=True)
    ]

    def parse_item(self, response):
        ships = response.xpath('//div[@class="items"]/article')
        for ship in ships:
            item = MyItem()
            item['name'] = ship.xpath('div[2]/header/h1/a/text()').extract()[1].strip()
            item['image_urls'] = [ship.xpath('div[1]/a/picture/img/@src').extract()[0]]
            item['type'] = ship.xpath('div[2]/div[2]/div[2]/text()').extract()[0]
            str = item['image_paths'][0] + item['type'] + item['name']
            yield item
I got the error:
exceptions.KeyError: 'image_paths'.
I tried to use item['images'][0].path, but the same error still occurs. I don't know where this error comes from.
You have not defined the image_paths field; define it:

class MyItem(scrapy.Item):
    # ...
    image_paths = scrapy.Field()
You probably meant to use the images field for this instead.
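Also note that the pipeline only fills these fields in after the spider has yielded the item, so the saved path is not available inside parse_item. And each entry the stock ImagesPipeline stores in images is a dict, not an object, so index it rather than using attribute access:

# after the pipeline has run, item['images'] holds dicts like
# {'url': ..., 'path': ..., 'checksum': ...}
saved_path = item['images'][0]['path']  # not item['images'][0].path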
I'm attempting to rename the images that are downloaded by my Scrapy 0.24 spider. Right now the downloaded images are stored with a SHA1 hash of their URLs as the file names. I'd like to instead name them the value I extract with item['model']. This question from 2011 outlines what I want, but the answers are for previous versions of Scrapy and don't work with the latest version.
Once I manage to get this working I'll also need to make sure I account for different images being downloaded with the same filename. So I'll need to download each image to its own uniquely named folder, presumably based on the original URL.
Here is a copy of the code I am using in my pipeline. I got this code from a more recent answer in the link above, but it's not working for me. Nothing errors out, and the images are downloaded as normal. My extra code doesn't seem to have any effect on the filenames, as they still appear as SHA1 hashes.
pipelines.py
import scrapy

from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem


class AllenheathPipeline(object):
    def process_item(self, item, spider):
        return item


class MyImagesPipeline(ImagesPipeline):
    # Name the full-size download
    def file_path(self, request, response=None, info=None):
        item = request.meta['item']  # like this you can use anything from the item, not just the url
        image_guid = request.url.split('/')[-1]
        return 'full/%s' % (image_guid)

    # Name the thumbnail version
    def thumb_path(self, request, thumb_id, response=None, info=None):
        image_guid = thumb_id + request.url.split('/')[-1]
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        # yield Request(item['images'])  # Adding meta. I don't know how to put it in one line :-)
        for image in item['images']:
            yield Request(image)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py
BOT_NAME = 'allenheath'
SPIDER_MODULES = ['allenheath.spiders']
NEWSPIDER_MODULE = 'allenheath.spiders'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = 'c:/allenheath/images'
products.py (my spider)
import scrapy
import urlparse
from allenheath.items import ProductItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse


class productsSpider(scrapy.Spider):
    name = "products"
    allowed_domains = ["http://www.allen-heath.com/"]
    start_urls = [
        "http://www.allen-heath.com/ahproducts/ilive-80/",
        "http://www.allen-heath.com/ahproducts/ilive-112/"
    ]

    def parse(self, response):
        for sel in response.xpath('/html'):
            item = ProductItem()
            item['model'] = sel.css('#prodsingleouter > div > div > h2::text').extract()  # The value I'd like to use to name my images.
            item['shortdesc'] = sel.css('#prodsingleouter > div > div > h3::text').extract()
            item['desc'] = sel.css('#tab1 #productcontent').extract()
            item['series'] = sel.css('#pagestrip > div > div > a:nth-child(3)::text').extract()
            item['imageorig'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
            item['image_urls'] = sel.css('#tab1 #productcontent .col-sm-9 img').xpath('./@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
            yield item
items.py
import scrapy

class ProductItem(scrapy.Item):
    model = scrapy.Field()
    itemcode = scrapy.Field()
    shortdesc = scrapy.Field()
    desc = scrapy.Field()
    series = scrapy.Field()
    imageorig = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
Here's a pastebin of the output I get from the command prompt when I run the spider: http://pastebin.com/ir7YZFqf
Any help would be greatly appreciated!
The pipelines.py:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy import log


class MyImagesPipeline(ImagesPipeline):
    # Name the full-size download
    def file_path(self, request, response=None, info=None):
        image_guid = request.meta['model'][0]
        log.msg(image_guid, level=log.DEBUG)
        return 'full/%s' % (image_guid)

    # Name the thumbnail version
    def thumb_path(self, request, thumb_id, response=None, info=None):
        image_guid = thumb_id + request.url.split('/')[-1]
        log.msg(image_guid, level=log.DEBUG)
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        yield Request(item['image_urls'][0], meta=item)
You're using settings.py wrong. You should use this:
ITEM_PIPELINES = {'allenheath.pipelines.MyImagesPipeline': 1}
For thumbnails to work, add this to settings.py:
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (100, 100),
}
Since the URL hash guarantees a unique identifier, you could instead write the item's value and the URL hash to a separate file during the crawl.
After the crawl is done, you can loop over this file and do the renaming (using a Counter dictionary to append a number to items that share the same value, so they don't collide).
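A minimal sketch of that post-processing idea (the mapping file, its tab-separated format, and the directory names here are hypothetical; adjust them to whatever you write out during the crawl):

import os
from collections import Counter

counts = Counter()
# hypothetical mapping file: one "<sha1 name>.jpg<TAB><model>" pair per line
with open('mapping.txt') as f:
    for line in f:
        hashed_name, model = line.rstrip('\n').split('\t')
        counts[model] += 1
        # append a counter so two items with the same model value don't collide
        new_name = '%s_%d.jpg' % (model, counts[model])
        os.rename(os.path.join('full', hashed_name),
                  os.path.join('full', new_name))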
I am new to Python programming and to Scrapy. I have set up my crawler, and so far it was working until I got to the point where I wanted to figure out how to download images. The error I am getting is cannot import name NsiscrapePipeline. I don't know what I am doing wrong, and I don't understand some of the documentation because I am new. Please help.
Items File
from scrapy.item import Item, Field


class NsiscrapeItem(Item):
    # define the fields for your item here like:
    # name = Field()
    location = Field()
    stock_number = Field()
    year = Field()
    manufacturer = Field()
    model = Field()
    length = Field()
    price = Field()
    status = Field()
    url = Field()
    pass
Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image


class NsiscrapeSpider(BaseSpider):
    name = "Nsiscrape"
    allowed_domain = ["yachtauctions.com"]
    start_urls = [
        "http://www.yachtauctions.com/inventory/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//tr')
        items = []
        for site in sites:
            item = NsiscrapeItem()
            item['location'] = site.select('td[2]/text()').extract()
            item['stock_number'] = site.select('td[3]/a/text()').extract()
            item['year'] = site.select('td[4]/text()').extract()
            item['manufacturer'] = site.select('td[5]/text()').extract()
            item['model'] = site.select('td[6]/text()').extract()
            item['length'] = site.select('td[7]/text()').extract()
            item['price'] = site.select('td[8]/text()').extract()
            item['status'] = site.select('td[10]/img/@src').extract()
            item['url'] = site.select('td[1]/a/@href').extract()
            item['image_urls'] = site.select('td/a[3]/img/@data-original').extract()
            item['images'] = item['image_urls']
            yield Request(item['url'][0], meta={'item': item}, callback=self.product_detail_page)

    def product_detail_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.request.meta['item']
        # add all image urls in item['image_urls']
        yield item
settings
ITEM_PIPELINES = ['scrapy.contrib.pipeline.image.NsiscrapePipeline']
IMAGES_STORE = 'c:\Python27\NSIscrape\IMG'
IMAGES_EXPIRES = 90
Pipelines - this is where I am unsure whether I am missing something:
from scrapy.item import Item


class NsiscrapePipeline(Item):
    image_urls = Field()
    images = Field()

    def process_item(self, item, spider):
        return item
error
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
You tried to pass a list, but this function accepts only a string. Pass just one element of the list (for example, list[0]).
Here's my final code that's working. There were two issues:
1: I was missing the second slash that needed to be in the XPath --> //td[1]/a[3]/img/@data-original
2: I had to check the full URL at which the image would be displayed and join the pieces together, i.e. the main (allowed) URL plus the image URL.
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    images = hxs.select('//tr')
    url = []
    for image in images:
        urls = NsiscrapeItem()
        urls['image_urls'] = ["http://www.yachtauctions.com" + x for x in image.select('//td[1]/a[3]/img/@data-original').extract()]
        url.append(urls)
    return url
That isn't part of the library :) - at least judging by their current master branch.
I think you're looking for ImagesPipeline.
Their example may help! example
p.s. I don't think you get to custom-name the class - at least not by how Scrapy is designed; I'm reasonably sure you use their class ;)
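For reference, a minimal sketch of what pipelines.py and the settings entry could look like when using Scrapy's own class (assuming the scrapy.contrib import paths used elsewhere in this question; the stock ImagesPipeline already reads item['image_urls'] and fills in item['images']):

# pipelines.py -- subclass the built-in pipeline instead of Item
from scrapy.contrib.pipeline.images import ImagesPipeline

class NsiscrapePipeline(ImagesPipeline):
    pass  # the default behavior already downloads everything in item['image_urls']

and in settings.py, point at your own module rather than scrapy.contrib:

ITEM_PIPELINES = ['NSIscrape.pipelines.NsiscrapePipeline']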