Scrapy: extract the URL of an image - Python

How do I fetch image URLs from a website using Scrapy in Python? Please help me. This is my code:
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = 'crawltest'
    allowed_domains = ['bambeeq.com']
    start_urls = ['http://www.bambeeq.com/']
    rules = (Rule(LinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        item = MyItem()
        item['url'] = []
        for link in LinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response):
            item['url'].append(link.url)
            #item['image'].append(link.img)
        return item

You are extracting the links ('a' element), not the images ('img' element). Try this:
# iterate over the list of images
for image in response.xpath('//img/@src').extract():
    # make each one into a full URL and add to item['url']
    item['url'].append(response.urljoin(image))
yield item
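For reference, here is what the whole callback might look like with that change in place (a minimal sketch using the item class from the question; untested against the live site):

def parse_obj(self, response):
    item = MyItem()
    item['url'] = []
    # collect every image source on the page, resolved to an absolute URL
    for image in response.xpath('//img/@src').extract():
        item['url'].append(response.urljoin(image))
    yield item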

Related

How to scrape items from 2 different sections?

I'm new to Scrapy and web crawling, and I've been working on the page www.mercadolibre.com.mx. I have to get (from the start page) some data (descriptions and prices) about the products displayed there. Here is my items.py:
from scrapy.item import Item, Field

class PruebaMercadolibreItem(Item):
    producto = Field()
    precio = Field()
And here is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from prueba_mercadolibre.items import PruebaMercadolibreItem

class MLSpider(BaseSpider):
    name = "mlspider"
    allowed_domains = ["mercadolibre.com"]
    start_urls = ["http://www.mercadolibre.com.mx"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[@class='item-data']")
        items = []
        for titles in titles:
            item = PruebaMercadolibreItem()
            item["producto"] = titles.select("p[@class='tit le']/@title").extract()
            item["precio"] = titles.select("span[@class='ch-price']/text()").extract()
            items.append(item)
        return items
The problem is that I get the same results when I change this line:
titles = hxs.select("//div[@class='item-data']")
To this:
titles = hxs.select("//div[@class='item-data'] | //div[@class='item-data item-data-mp']")
And I'm not getting the same data as when I use the first line.
Can anyone help me? Do I have an error in my XPath selection?
Also, I can't find a good tutorial for using MySQL with Scrapy; I would appreciate any help. Thanks.
Better to use contains() if you want to get all div tags containing the item-data class:
titles = hxs.select("//div[contains(@class, 'item-data')]")
Also, you have other problems in the spider:
in the loop, you are overriding titles
the class name in the producto XPath should be 'title', not 'tit le'
you probably don't want lists in Field values; get the first items out of the extracted lists
HtmlXPathSelector is deprecated, use Selector instead
select() is deprecated, use xpath() instead
BaseSpider has been renamed to Spider
Here's the code with modifications:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field
from prueba_mercadolibre.items import PruebaMercadolibreItem

class MLSpider(Spider):
    name = "mlspider"
    allowed_domains = ["mercadolibre.com"]
    start_urls = ["http://www.mercadolibre.com.mx"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[contains(@class, 'item-data')]")
        for title in titles:
            item = PruebaMercadolibreItem()
            item["producto"] = title.xpath("p[@class='title']/@title").extract()[0]
            item["precio"] = title.xpath("span[@class='ch-price']/text()").extract()[0]
            yield item
Example items from the output:
{'precio': u'$ 35,000', 'producto': u'Cuatrimoto, Utv De 500cc 4x4 ,moto , Motos, Atv ,'}
{'precio': u'$ 695', 'producto': u'Reloj Esp\xeda Camara Oculta Video Hd 16 Gb! Sony Compara.'}
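As for the MySQL part of the question: Scrapy persists items through item pipelines. Below is a minimal sketch of such a pipeline, assuming the MySQLdb driver and a pre-created productos table; the connection details and table name are placeholders, adjust them to your setup:

import MySQLdb

class MySQLStorePipeline(object):
    def open_spider(self, spider):
        # placeholder credentials; adjust to your database
        self.conn = MySQLdb.connect(host='localhost', user='user',
                                    passwd='secret', db='scrapy_db',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # one row per scraped product
        self.cursor.execute(
            "INSERT INTO productos (producto, precio) VALUES (%s, %s)",
            (item['producto'], item['precio']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

Register it in ITEM_PIPELINES in settings.py like any other pipeline.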

Scrapy only scraping first result of each page

I'm currently trying to run the following code but it keeps scraping only the first result of each page. Any idea what the issue may be?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123Item
import urlparse
from scrapy.http.request import Request

class MySpider(CrawlSpider):
    name = "xyz123"
    allowed_domains = ["www.xyz123.com.au"]
    start_urls = ["http://www.xyz123.com.au/",]
    rules = (
        Rule(SgmlLinkExtractor(allow=("",), restrict_xpaths=('//*[@id="1234headerPagination_hlNextLink"]',)),
             callback="parse_xyz", follow=True),
    )

    def parse_xyz(self, response):
        hxs = HtmlXPathSelector(response)
        xyz = hxs.select('//div[@id="1234SearchResults"]//div/h2')
        items = []
        for xyz in xyz:
            item = xyz123Item()
            item["title"] = xyz.select('a/text()').extract()[0]
            item["link"] = xyz.select('a/@href').extract()[0]
            items.append(item)
            return items
The BaseSpider version works well, scraping ALL the required data on the first page:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123

class MySpider(BaseSpider):
    name = "xyz123test"
    allowed_domains = ["xyz123.com.au"]
    start_urls = ["http://www.xyz123.com.au/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//div[@id="1234SearchResults"]//div/h2')
        items = []
        for titles in titles:
            item = xyz123Item()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
Sorry for the censoring. I had to censor the website for privacy reasons.
The first code crawls through the pages the way I'd like it to crawl, however it only pulls the first item's title and link. NOTE: The XPath of the first title, using "inspect element" in Google Chrome, is:
//*[@id="xyz123SearchResults"]/div[1]/h2/a,
the second is //*[@id="xyz123SearchResults"]/div[2]/h2/a,
the third is //*[@id="xyz123SearchResults"]/div[3]/h2/a, etc.
I'm not sure if the div[n] bit is what's killing it. I'm hoping it's an easy fix.
Thanks
for xyz in xyz:
    item = xyz123Item()
    item["title"] = xyz.select('a/text()').extract()[0]
    item["link"] = xyz.select('a/@href').extract()[0]
    items.append(item)
    return items
Are you sure about the indentation of the return items? It should be one level less.
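In other words: return items sits inside the for loop, so the method returns right after appending the first result. Dedented one level, the loop would look like this (a sketch of the correction):

for xyz in xyz:
    item = xyz123Item()
    item["title"] = xyz.select('a/text()').extract()[0]
    item["link"] = xyz.select('a/@href').extract()[0]
    items.append(item)
return items  # runs only after every result has been appended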

Scrape using multiple POST data from the same URL

I have already created one spider that collects a list of company names with matching phone numbers. This is then saved to a CSV file.
I then want to scrape data from another site, using the phone numbers in the CSV file as POST data. I want it to loop through the same start URL, scraping the data that each phone number produces, until there are no more numbers left in the CSV file.
This is what I have got so far:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy import log
import sys
from scrapy.shell import inspect_response
from btw.items import BtwItem
import csv

class BtwSpider(BaseSpider):
    name = "btw"
    allowed_domains = ["siteToScrape.com"]
    start_urls = ["http://www.siteToScrape.com/broadband/broadband_checker"]

    def parse(self, response):
        phoneNumbers = ['01253873647', '01253776535', '01142726749']
        return [FormRequest.from_response(response,
                                          formdata={'broadband_checker[phone]': phoneNumbers[1]},
                                          callback=self.after_post)]

    def after_post(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="results"]')
        items = []
        for site in sites:
            item = BtwItem()
            fttcText = site.select("div[@class='content']/div[@id='btfttc']/ul/li/text()").extract()
            # Now we will change the text to be a boolean value
            if fttcText[0].count('not') > 0:
                fttcEnabled = 0
            else:
                fttcEnabled = 1
            item['fttcAvailable'] = fttcEnabled
            items.append(item)
        return items
At the minute I have just been trying to get it to loop through the list (phoneNumbers), but I have not even managed to get that to work so far. Once I know how to do that, I will be able to get it to pull from a CSV file by myself. In its current state it just uses the phone number at index 1 in the list.
Assuming you have a phones.csv file with phones in it:
01253873647
01253776535
01142726749
Here's your spider:
import csv

from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector

class BtwItem(Item):
    fttcAvailable = Field()
    phoneNumber = Field()

class BtwSpider(BaseSpider):
    name = "btw"
    allowed_domains = ["samknows.com"]

    def start_requests(self):
        yield Request("http://www.samknows.com/broadband/broadband_checker", self.parse_main_page)

    def parse_main_page(self, response):
        with open('phones.csv', 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                phone_number = row[0]
                yield FormRequest.from_response(response,
                                                formdata={'broadband_checker[phone]': phone_number},
                                                callback=self.after_post,
                                                meta={'phone_number': phone_number})

    def after_post(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="results"]')
        phone_number = response.meta['phone_number']
        for site in sites:
            item = BtwItem()
            fttc = site.select("div[@class='content']/div[@id='btfttc']/ul/li/text()").extract()
            item['phoneNumber'] = phone_number
            # the checker's text contains 'not' when FTTC is unavailable,
            # so availability means 'not' is absent from the text
            item['fttcAvailable'] = 'not' not in fttc[0]
            yield item
Here's what was scraped after running it:
{'fttcAvailable': True, 'phoneNumber': '01253873647'}
{'fttcAvailable': True, 'phoneNumber': '01253776535'}
{'fttcAvailable': False, 'phoneNumber': '01142726749'}
The idea is to scrape the main page using start_requests, then read the csv file line-by-line in the callback and yield new Requests for each phone number (csv row). Additionally, pass phone_number to the callback through the meta dictionary in order to write it to the Item field (I think you need this to distinguish items/results).
Hope that helps.

Crawling images fails

I am trying to crawl images from a website with the following scrapy code:
import urlparse
from PIL import Image
from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.contrib.pipeline.images import ImagesPipeline
from mobile.items import Website

class MobileSpider(CrawlSpider):
    name = "mobile"
    allowed_domains = ["mobile-store.ro"]
    start_urls = ["http://www.mobile-store.ro/produse/"]
    rules = (
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item')
    )

    def parse(self, response, response2):
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//ul[@class='products']/li/a/@href").extract()
        if not not next_page:
            yield Request(next_page[0], self.parse)
        sites = hxs.select('//div[@id="wrapper"]/div[@id="content"]')
        items = []
        for site in sites:
            item = Website()
            item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
            image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url, image_relative_url)]
            #item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
            item['url'] = response.url
            items.append(item)
        for item in items:
            yield item
settings.py:
SPIDER_MODULES = ['mobile.spiders']
NEWSPIDER_MODULE = 'mobile.spiders'
DEFAULT_ITEM_CLASS = 'mobile.items.Website'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
items.py:
from scrapy.item import Item, Field

class Website(Item):
    nume = Field()
    descriere = Field()
    categorie = Field()
    brand = Field()
    pret = Field()
    url = Field()
    image_urls = Field()
    images = Field()
    image_paths = Field()
pipelines.py:
from mobile.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
The issue comes when I try to get the image URL, using the following code:
for site in sites:
    item = Website()
    item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
    item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
    item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
    item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
    image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
    item['image_urls'] = [urlparse.urljoin(response.url2, image_relative_url)]
    #item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
    item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
    item['url'] = response.url
    items.append(item)
for item in items:
    yield item
This returns the page URL instead of the image URL. All other fields are crawled correctly. Any clues on how to fix this issue and get the image URL properly?
This is because the image (and the whole content of the ad-image-wrapper div) is filled dynamically via JavaScript.
Dumping response.body in the parse method helped me figure out that the actual image link is originally kept in the ad-thumb-list list. So, try using the following to get the image URL:
image_relative_url = site.select('//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href').extract()
if image_relative_url:
    image_relative_url = image_relative_url[0]
Hope that is what you needed.
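Wired into the loop from the question, the assignment might then read roughly like this (a sketch; it uses response.url, since response.url2 does not exist on a Scrapy response):

image_relative_url = site.select('//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href').extract()
if image_relative_url:
    # resolve the relative link against the page URL to get an absolute image URL
    item['image_urls'] = [urlparse.urljoin(response.url, image_relative_url[0])]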

Scrapy: one row per item

I'm using Scrapy to scrape category pages for products from architonic*com. However, I would like to display these products in a CSV, one per row. In the current situation, all the brand names from a given category page are listed under 'brand', while I would like an output like this:
{'brand': [u'Elisabeth Ellefsen'],
'title': [u'Up chair I 907'],
'img_url': [u'http://image.architonic.com/img_pro1-1/117/4373/t-up-06f-sq.jpg'],
'link': [u'http://www.architonic.com/pmsht/up-chair-tonon/1174373']
}
I tried playing with the Item Loaders (added default_output_processor = TakeFirst()), adding yield item (see the commented code), and searched for two days to find a solution, without luck. Hoping someone is willing to help me. Any help is really appreciated.
My output looks like this:
2013-01-14 11:53:23+0100 [archi] DEBUG: Scraped from <200 http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/3>
{'brand': [u'Softline',
u'Elisabeth Ellefsen',
u'Sellex',
u'Lievore Altherr Molina',
u'Poliform',
.....
u'Hans Thyge & Co.'],
'img_url': [u'http://image.architonic.com/img_pro1-1/117/3661/terra-h-sq.jpg',
u'http://image.architonic.com/img_pro1-1/117/0852/fly-01-sq.jpg',
u'http://image.architonic.com/img_pro1-1/116/9870/ley-0004-sq.jpg',
u'http://image.architonic.com/img_pro1-1/117/1023/arflex-hollywood-03-sq.jpg',
...
u'http://image.architonic.com/img_pro1-1/118/5357/reef-002-sq.jpg'],
'link': [u'http://www.architonic.com/pmsht/terra-softline/1173661',
u'http://www.architonic.com/pmsht/fly-sellex/1170852',
u'http://www.architonic.com/pmsht/ley-poliform/1169870',
.....
u'http://www.architonic.com/pmsht/reef-collection-labofa/1185357'],
'title': [u'Terra',
u'Fly',
u'Ley chair',
.....
u'Hollywood Sofa',
u'Pouff Round']}
I'm using this in spider/archi_spider.py
import string
import re

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.markup import remove_entities
from archiscraper.items import ArchiItemFields, ArchiLoader

class ArchiScraper(BaseSpider):
    name = "archi"
    allowed_domains = ["architonic.com"]
    start_urls = ['http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/%s' % page for page in xrange(1, 4)]
    # rules = (Rule(SgmlLinkExtractor(allow=('.', ), restrict_xpaths=('//*[@id="right_arrow"]',)),
    #               callback="parse_items", follow=True),
    # )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//li[contains(@class, "nav_pro_item")]')
        items = []
        for site in sites:
            item = ArchiLoader(ArchiItemFields(), site)
            item.add_xpath('brand', '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
            item.add_xpath('designer', '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
            item.add_xpath('title', '//*[contains(@class, "nav_pro_text")]/a/strong/text()')
            item.add_xpath('img_url', '//li[contains(@class, "nav_pro_item")]/div/a/img/@src[1]')
            item.add_xpath('link', '//*[contains(@class, "nav_pro_text")]/a/@href')
            items.append(item.load_item())
        return items
        # for item in items:
        #     yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
import string

from scrapy.item import Item, Field
from scrapy.contrib.loader.processor import MapCompose, Join, TakeFirst
from scrapy.utils.markup import remove_entities
from scrapy.contrib.loader import XPathItemLoader

class ArchiItem():
    pass

class ArchiItemFields(Item):
    brand = Field()
    title = Field()
    designer = Field()
    img_url = Field()
    img = Field()
    link = Field()

class ArchiLoader(XPathItemLoader):
    # default_input_processor = MapCompose(unicode.strip)
    # default_output_processor = TakeFirst()
    brand_out = MapCompose(unicode.strip)
    # title_out = Join()
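For what it's worth, the likely culprit is that every add_xpath expression starts with //, which searches the whole page rather than the site node the loader was built around, so each item accumulates the fields of every product on the page. A sketch of the parse loop with node-relative expressions (note the leading dot) and TakeFirst() uncommented as the default output processor, so each field comes out as a single value:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for site in hxs.select('//li[contains(@class, "nav_pro_item")]'):
        loader = ArchiLoader(ArchiItemFields(), site)
        # "." anchors each expression to the current <li> node
        loader.add_xpath('brand', './/*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
        loader.add_xpath('designer', './/*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
        loader.add_xpath('title', './/*[contains(@class, "nav_pro_text")]/a/strong/text()')
        loader.add_xpath('img_url', './/div/a/img/@src')
        loader.add_xpath('link', './/*[contains(@class, "nav_pro_text")]/a/@href')
        yield loader.load_item()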
