Trouble downloading images with Scrapy - works sometimes - python

My spider code has been working well so far, but now that I am trying to run a batch of these spiders, everything works except that Scrapy downloads the images for some spiders and nothing for the rest. All the spiders are identical except for their start_urls. Any help is appreciated!
Here's my pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class DmozPipeline(object):
    def process_item(self, item, spider):
        return item

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)
        for nlabel in item['nlabel']:
            yield Request(nlabel)
        print item['image_urls']

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py:
BOT_NAME = 'dmoz2'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['dmoz2.spiders']
NEWSPIDER_MODULE = 'dmoz2.spiders'
DEFAULT_ITEM_CLASS = 'dmoz2.items.DmozItem'
ITEM_PIPELINES = ['dmoz2.pipelines.MyImagesPipeline']
IMAGES_STORE = '/ps/dmoz2/images'
IMAGES_THUMBS = {
    # letting height be variable
    #'small': ('', 120),
    'small': (120, ''),
    #'big': ('', 240),
    'big': (300, ''),
}
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
items.py:
from scrapy.item import Item, Field
from scrapy.utils.python import unicode_to_str

def u_to_str(text):
    return unicode_to_str(text, 'latin-1', 'ignore')

class DmozItem(Item):
    category_ids = Field()
    ....
    image_urls = Field()
    image_paths = Field()
myspider.py:
from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from scrapy import Selector
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
from dmoz2.items import DmozItem

class DmozSpider(Spider):
    name = "fritos_jun2015"
    allowed_domains = ["walmart.com"]
    start_urls = [
        "http://www.walmart.com/ip/Fritos-Bar-B-Q-Flavored-Corn-Chips-9.75-oz/36915853",
        "http://www.walmart.com/ip/Fritos-Corn-Chips-1-oz-6-count/10900088",
    ]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.xpath('/html/body/div[1]/section/section[4]/div[2]')
        items = []
        for site in sites:
            item = DmozItem()
            item['category_ids'] = ''
            .....
            item['image_urls'] = site.xpath('div[1]/div[3]/div[1]/div/div/div[2]/div/div/div[1]/div/div/img[2]/@src').extract()
            items.append(item)
        return items
I would really like to know why the same spider fetches images sometimes and not at other times. All the spiders are identical except for the start_urls, which point at the same allowed domain. Also, the image URLs are all absolute and correct.
Thanks in advance.
-TM

When screen scraping, one common problem is that the server cuts the connection because you are accessing it too often (this protects sites from being inadvertently DDoSed by scrapers, and keeps hosting costs from climbing when someone pings the site every millisecond).
Try adding a
sleep()
call between every request to the Walmart page. This way you won't get blocked from the server.
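Scrapy also has a built-in, non-blocking way to achieve the same spacing between requests, so a raw sleep() is rarely needed. A minimal sketch in settings.py, with illustrative values rather than tuned ones:
# settings.py - throttle requests instead of calling sleep() in the spider
DOWNLOAD_DELAY = 2                # wait ~2 seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True   # jitter the delay (0.5x-1.5x) so requests look less robotic
AUTOTHROTTLE_ENABLED = True       # adapt the delay to the server's response times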

Related

I implemented an ItemLoader in my Scrapy project to format the data and it is no longer adding anything to the CSV file

I created a Scrapy project to scrape some information off this classifieds website, but the data I was getting needed to be formatted. After doing some research I figured out how to implement an ItemLoader, but now it does not write any scraped data to the CSV file.
Here's my spider.py:
import scrapy
from ..items import TestItem
from scrapy.loader import ItemLoader

class TestSpiderSpider(scrapy.Spider):
    name = 'test'
    page_number = 2
    start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']

    def parse(self, response):
        for car in response.css('.col.l3.s12.m6'):
            items = TestItem()
            product_title = car.css('.jco-card-title::text').extract()
            product_imagelink = car.css('.card-image img::attr(data-src)').getall()
            urls = car.css('.card-image a::attr(href)').getall()
            for url in urls:
                url = response.urljoin(url)
                yield scrapy.Request(url=url, callback=self.parse_details)
            if product_title and product_imagelink:
                items['urls'] = urls

    def parse_details(self, response):
        l = ItemLoader(item=TestItem(), selector=response)
        l.add_css('product_title', '#title::text')
        yield l.load_item()
Here's my items.py
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags

class TestItem(scrapy.Item):
    # ItemLoader reads these from the singular metadata keys
    # 'input_processor' / 'output_processor'
    product_title = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    urls = scrapy.Field()  # assigned in parse()
Here's my settings.py:
BOT_NAME = 'test'

SPIDER_MODULES = ['test.spiders']
NEWSPIDER_MODULE = 'test.spiders'

ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}
Here's my pipelines.py:
class TestPipeline:
    def process_item(self, item, spider):
        return item
You don't need pipelines enabled to use an ItemLoader; try it without them.
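As a concrete sketch of that suggestion: leave ITEM_PIPELINES unset and let the feed exporter write the CSV directly from whatever the spider yields. The feed name here is an illustrative assumption, and FEEDS requires Scrapy 2.1+; on older versions run scrapy crawl test -o items.csv instead:
# settings.py - sketch with no ITEM_PIPELINES entry; items yielded by
# parse_details() flow straight to the feed exporter
BOT_NAME = 'test'
SPIDER_MODULES = ['test.spiders']
NEWSPIDER_MODULE = 'test.spiders'
ROBOTSTXT_OBEY = True

FEEDS = {
    'items.csv': {'format': 'csv'},   # hypothetical output file
}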

Get the path of scraped image in Scrapy

I am writing an image scraper using Scrapy with the default ImagesPipeline.
Generally, everything has been working fine so far.
However, I cannot get the saved path of a scraped image.
items.py:
class MyItem(scrapy.Item):
    name = scrapy.Field()
    type = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py:
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem

class MyPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
myspider.py:
import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from mycrawler.items import MyItem

class VscrawlerSpider(CrawlSpider):
    """docstring for VscrawlerSpider"""
    name = "myspider"
    allowed_domains = ["vesselfinder.com"]
    start_urls = [
        "https://www.vesselfinder.com/vessels?page=1"
    ]
    rules = [
        Rule(LinkExtractor(allow=r'vesselfinder.com/vessels\?page=[1-4]'),
             callback='parse_item', follow=True)
    ]

    def parse_item(self, response):
        ships = response.xpath('//div[@class="items"]/article')
        for ship in ships:
            item = MyItem()
            item['name'] = ship.xpath('div[2]/header/h1/a/text()').extract()[1].strip()
            item['image_urls'] = [ship.xpath('div[1]/a/picture/img/@src').extract()[0]]
            item['type'] = ship.xpath('div[2]/div[2]/div[2]/text()').extract()[0]
            str = item['image_paths'][0] + item['type'] + item['name']
            yield item
I got the error:
exceptions.KeyError: 'image_paths'
I tried to use item['images'][0].path, but the same error still occurs. I don't know where this error comes from.
You have not defined an image_paths field; define it:
class MyItem(scrapy.Item):
    # ...
    image_paths = scrapy.Field()
You probably meant to use the images field for this instead.
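For what it's worth, the stock ImagesPipeline already records where each file was saved: after the item has passed through the pipeline, its images field holds one dict per downloaded image. A small sketch, assuming the default pipeline is enabled; note the dict access (not attribute access), and that the field is only populated after the pipeline runs, so it is not available inside parse_item itself:
import os

IMAGES_STORE = '/path/to/images'  # illustrative; must match settings.py

def saved_image_path(item):
    # the default ImagesPipeline fills item['images'] with dicts like
    # {'url': ..., 'path': ..., 'checksum': ...}
    first = item['images'][0]
    return os.path.join(IMAGES_STORE, first['path'])  # dict access, not first.path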

Trouble with downloading images using Scrapy

I'm getting the following error when attempting to download images using a spider with Scrapy.
File "C:\Python27\lib\site-packages\scrapy\http\request\__init__.py",
line 61, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h
As best as I can understand it, it looks like I'm missing an "h" in a url somewhere? But I can't for the life of me see where. Everything works fine if I'm not trying to download images. But once I add the appropriate code to the four files below, I can't get anything to work properly. Could anyone help me make sense of this error?
items.py
import scrapy

class ProductItem(scrapy.Item):
    model = scrapy.Field()
    shortdesc = scrapy.Field()
    desc = scrapy.Field()
    series = scrapy.Field()
    imageorig = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
settings.py
BOT_NAME = 'allenheath'
SPIDER_MODULES = ['allenheath.spiders']
NEWSPIDER_MODULE = 'allenheath.spiders'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = 'c:/allenheath/images'
pipelines.py
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem

class AllenheathPipeline(object):
    def process_item(self, item, spider):
        return item

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
products.py (my spider)
import scrapy
from allenheath.items import ProductItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

class productsSpider(scrapy.Spider):
    name = "products"
    allowed_domains = ["http://www.allen-heath.com/"]
    start_urls = [
        "http://www.allen-heath.com/ahproducts/ilive-80/",
        "http://www.allen-heath.com/ahproducts/ilive-112/"
    ]

    def parse(self, response):
        for sel in response.xpath('/html'):
            item = ProductItem()
            item['model'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
            item['shortdesc'] = sel.css('#prodsingleouter > div > div > h3::text').extract()
            item['desc'] = sel.css('#tab1 #productcontent').extract()
            item['series'] = sel.css('#pagestrip > div > div > a:nth-child(3)::text').extract()
            item['imageorig'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
            item['image_urls'] = sel.css('#tab1 #productcontent img').extract()[0]
            item['image_urls'] = 'http://www.allen-heath.com' + item['image_urls']
            yield item
Any help would be greatly appreciated.
The issue is here:
def get_media_requests(self, item, info):
    for image_url in item['image_urls']:
        yield scrapy.Request(image_url)
and here:
item['image_urls'] = sel.css('#tab1 #productcontent img').extract()[0]
You are extracting this field and taking the first element, which means that when the pipeline iterates over it, it is actually iterating over the characters of the URL string - which begins with http, explaining the error message you see as soon as the first character is processed:
Missing scheme in request url: h
Remove the [0] from the line. While you're at it, fetch the src of the image, instead of the entire element:
item['image_urls'] = sel.css('#tab1 #productcontent img').xpath('./@src').extract()
After that, you should also update the next line to convert the image URL to absolute, in case it is relative:
import urlparse  # put this at the top of the script
item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
You don't need this last part if the image URL in src is already absolute; in that case just leave it out.
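Putting both fixes together, the image handling in parse() would read roughly like this (a sketch; the selector is unchanged from the question, and urlparse matches the asker's Python 2 environment):
import urlparse  # Python 2 stdlib; the traceback shows C:\Python27
from allenheath.items import ProductItem

def parse(self, response):
    for sel in response.xpath('/html'):
        item = ProductItem()
        # a list of src attributes - no [0] indexing into the string
        item['image_urls'] = sel.css('#tab1 #productcontent img').xpath('./@src').extract()
        # resolve any relative URLs against the page URL
        item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
        yield item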

Scrapy pipeline error cannot import name

I am new to Python programming and to Scrapy. I had set up my crawler and so far it was working, until I got to the point where I wanted to figure out how to download images. The error I am getting is cannot import name NsiscrapePipeline. I don't know what I am doing wrong, and I don't understand some of the documentation because I am new. Please help.
Items File
from scrapy.item import Item, Field

class NsiscrapeItem(Item):
    # define the fields for your item here like:
    # name = Field()
    location = Field()
    stock_number = Field()
    year = Field()
    manufacturer = Field()
    model = Field()
    length = Field()
    price = Field()
    status = Field()
    url = Field()
Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image

class NsiscrapeSpider(BaseSpider):
    name = "Nsiscrape"
    allowed_domain = ["yachtauctions.com"]
    start_urls = [
        "http://www.yachtauctions.com/inventory/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//tr')
        items = []
        for site in sites:
            item = NsiscrapeItem()
            item['location'] = site.select('td[2]/text()').extract()
            item['stock_number'] = site.select('td[3]/a/text()').extract()
            item['year'] = site.select('td[4]/text()').extract()
            item['manufacturer'] = site.select('td[5]/text()').extract()
            item['model'] = site.select('td[6]/text()').extract()
            item['length'] = site.select('td[7]/text()').extract()
            item['price'] = site.select('td[8]/text()').extract()
            item['status'] = site.select('td[10]/img/@src').extract()
            item['url'] = site.select('td[1]/a/@href').extract()
            item['image_urls'] = site.select('td/a[3]/img/@data-original').extract()
            item['images'] = item['image_urls']
            yield Request(item['url'][0], meta={'item': item}, callback=self.product_detail_page)

    def product_detail_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.request.meta['item']
        # add all image urls in item['image_urls']
        yield item
settings
ITEM_PIPELINES = ['scrapy.contrib.pipeline.image.NsiscrapePipeline']
IMAGES_STORE = 'c:\Python27\NSIscrape\IMG'
IMAGES_EXPIRES = 90
pipelines.py - this is where I am unsure if I am missing something:
from scrapy.item import Item

class NsiscrapePipeline(Item):
    image_urls = Field()
    images = Field()

    def process_item(self, item, spider):
        return item
error
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
You tried to pass a list, but this function accepts only a string. Pass a single element of the list instead (for example list[0]).
Here's my final code that's working. There were two issues:
1: I was missing the second slash that needed to be in the XPath --> //td[1]/a[3]/img/@data-original
2: I had to check the full URL at which the image is displayed and join the pieces together: the main (allowed) URL and the image URL.
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    images = hxs.select('//tr')
    url = []
    for image in images:
        urls = NsiscrapeItem()
        urls['image_urls'] = ["http://www.yachtauctions.com" + x for x in image.select('//td[1]/a[3]/img/@data-original').extract()]
        url.append(urls)
    return url
That isn't part of the library :) - at least judging by their current master branch
I think you're looking for ImagesPipeline
Their example may help!
p.s. I don't think you can custom-name the class - at least not by how Scrapy is designed; I'm reasonably sure you use their class ;)
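A sketch of the usual wiring for that suggestion, using the scrapy.contrib paths from the asker's Scrapy version: either enable the stock ImagesPipeline directly, or subclass it in your own pipelines.py, in which case the class name is yours but the ITEM_PIPELINES entry must point at your project, not at scrapy.contrib:
# settings.py - option 1: use the stock pipeline as-is
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']

# NSIscrape/pipelines.py - option 2: a custom subclass under your own name
from scrapy.contrib.pipeline.images import ImagesPipeline

class NsiscrapePipeline(ImagesPipeline):
    pass  # override get_media_requests / item_completed here if needed

# settings.py for option 2:
# ITEM_PIPELINES = ['NSIscrape.pipelines.NsiscrapePipeline']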

Crawling images fails

I am trying to crawl images from a website with the following Scrapy code:
import urlparse
from PIL import Image
from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.contrib.pipeline.images import ImagesPipeline
from mobile.items import Website

class MobileSpider(CrawlSpider):
    name = "mobile"
    allowed_domains = ["mobile-store.ro"]
    start_urls = ["http://www.mobile-store.ro/produse/"]
    rules = (
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item')
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//ul[@class='products']/li/a/@href").extract()
        if next_page:
            yield Request(next_page[0], self.parse)
        sites = hxs.select('//div[@id="wrapper"]/div[@id="content"]')
        items = []
        for site in sites:
            item = Website()
            item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
            image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url, image_relative_url)]
            #item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
            item['url'] = response.url
            items.append(item)
        for item in items:
            yield item
settings.py:
SPIDER_MODULES = ['mobile.spiders']
NEWSPIDER_MODULE = 'mobile.spiders'
DEFAULT_ITEM_CLASS = 'mobile.items.Website'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
items.py:
from scrapy.item import Item, Field

class Website(Item):
    nume = Field()
    descriere = Field()
    categorie = Field()
    brand = Field()
    pret = Field()
    url = Field()
    image_urls = Field()
    images = Field()
    image_paths = Field()
pipelines.py:
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
The issue comes when I try to get the image URL, using these lines from the loop above:
image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
item['image_urls'] = [urlparse.urljoin(response.url, image_relative_url)]
This returns the page URL instead of the image URL. All other fields are crawled correctly. Any clues on how to fix this and get the image URL properly?
This is because the image (and the whole content of the ad-image-wrapper div) is filled dynamically via JavaScript.
Dumping response.body in the parse method helped me figure out that the actual image link is originally kept in the ad-thumb-list list. So, try the following for getting the image URL:
image_relative_url = site.select('//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href').extract()
if image_relative_url:
    image_relative_url = image_relative_url[0]
Hope that is what you needed.
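To fold that into the spider's loop, note that extract() returns a list while urljoin expects a single string (the original code passed the whole list); a sketch:
image_relative_url = site.select('//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href').extract()
if image_relative_url:
    # join the first matched href (a string) against the page URL
    item['image_urls'] = [urlparse.urljoin(response.url, image_relative_url[0])]
else:
    item['image_urls'] = []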
