I'm using scrapy to scrape category pages for products from architonic*com. However, I would like to display these products in a csv one per row. In the current situation all the brand names from a given category page are listed under 'brand' while I would like to have an output like this:
{'brand': [u'Elisabeth Ellefsen'],
'title': [u'Up chair I 907'],
'img_url': [u'http://image.architonic.com/img_pro1-1/117/4373/t-up-06f-sq.jpg'],
'link': [u'http://www.architonic.com/pmsht/up-chair-tonon/1174373']
}
I tried playing with the Item Loaders (added default_output_processor= TakeFirst()), adding 'yield item' (see commented code) and searched two days to find a solution without luck. Hoping someone is willing to help me. Any help is really appreciated.
My output looks like this:
2013-01-14 11:53:23+0100 [archi] DEBUG: Scraped from <200 http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/3>
{'brand': [u'Softline',
u'Elisabeth Ellefsen',
u'Sellex',
u'Lievore Altherr Molina',
u'Poliform',
.....
u'Hans Thyge & Co.'],
'img_url': [u'http://image.architonic.com/img_pro1-1/117/3661/terra-h-sq.jpg',
u'http://image.architonic.com/img_pro1-1/117/0852/fly-01-sq.jpg',
u'http://image.architonic.com/img_pro1-1/116/9870/ley-0004-sq.jpg',
u'http://image.architonic.com/img_pro1-1/117/1023/arflex-hollywood-03-sq.jpg',
...
u'http://image.architonic.com/img_pro1-1/118/5357/reef-002-sq.jpg'],
'link': [u'http://www.architonic.com/pmsht/terra-softline/1173661',
u'http://www.architonic.com/pmsht/fly-sellex/1170852',
u'http://www.architonic.com/pmsht/ley-poliform/1169870',
.....
u'http://www.architonic.com/pmsht/reef-collection-labofa/1185357'],
'title': [u'Terra',
u'Fly',
u'Ley chair',
.....
u'Hollywood Sofa',
u'Pouff Round']}
I'm using this in spider/archi_spider.py
import string
import re
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.markup import remove_entities
from archiscraper.items import ArchiItemFields, ArchiLoader
class ArchiScraper(BaseSpider):
name = "archi"
allowed_domains = ["architonic.com"]
start_urls = ['http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/%s' % page for page in xrange(1, 4)]
# rules = (Rule(SgmlLinkExtractor(allow=('.', ),restrict_xpaths=('//*[#id="right_arrow"]',))
# , callback="parse_items", follow= True),
# )
#
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//li[contains(#class, "nav_pro_item")]')
items = []
for site in sites:
item = ArchiLoader(ArchiItemFields(), site)
item.add_xpath('brand', '//*[contains(#class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
item.add_xpath('designer', '//*[contains(#class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
item.add_xpath('title', '//*[contains(#class, "nav_pro_text")]/a/strong/text()')
item.add_xpath('img_url', '//li[contains(#class, "nav_pro_item")]/div/a/img/#src[1]')
item.add_xpath('link', '//*[contains(#class, "nav_pro_text")]/a/#href')
items.append(item.load_item())
return items
# for item in items:
# yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
import string
from scrapy.item import Item, Field
from scrapy.contrib.loader.processor import MapCompose, Join, TakeFirst
from scrapy.utils.markup import remove_entities
from scrapy.contrib.loader import XPathItemLoader
class ArchiItem():
pass
class ArchiItemFields(Item):
brand = Field()
title = Field()
designer = Field()
img_url = Field()
img = Field()
link = Field()
pass
class ArchiLoader(XPathItemLoader):
# default_input_processor = MapCompose(unicode.strip)
# default_output_processor= TakeFirst()
brand_out = MapCompose(unicode.strip)
# title_out = Join()
Related
Hey so I have some experience scraping html but never json and so I need to scrape the following web page using scrapy, http://www.starcitygames.com/buylist/search?search-type=category&id=5061, and I found a tutorial online that uses scrapy along with jmspath to scrape json data from the web. And I got the tutorial to work but I am trying to alter it to work with my website to no luck. No errors but it does not return any data. Any help would be greatly appreciated!
items.py
import scrapy
class NameItem(scrapy.Item):
"""User item definition for jsonplaceholder /LoginSpider endpoint."""
name = scrapy.Field()
condition = scrapy.Field()
price = scrapy.Field()
rarity = scrapy.Field()
LoginSpider.py
import scrapy
import json
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import NameItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, SelectJmes
class UserSpider(scrapy.Spider):
"""Spider to scrape `http://www.starcitygames.com/buylist/search?search-type=category&id=5061`."""
name = 'LoginSpider'
allowed_domains = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']
start_urls = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']
# dictionary to map UserItem fields to Jmes query paths
jmes_paths = {
'name': 'name',
'condition': 'condition',
'price': 'price',
'rarity': 'rarity',
}
def parse(self, response):
jsonresponse = json.loads(response.body_as_unicode())
for user in jsonresponse:
loader = ItemLoader(item=NameItem()) # create an ItemLoader to populate a NameItem
loader.default_input_processor = MapCompose(str) # apply str conversion on each value
loader.default_output_processor = Join(' ')
for (field, path) in self.jmes_paths.items():
loader.add_value(field, SelectJmes(path)(user))
yield loader.load_item()
The response of this url http://www.starcitygames.com/buylist/search?search-type=category&id=5061has 3 levels:
'Ok'
'search'
'results' ## this contain the data
And results key has multiple values what you should iterate.
Inside the values are the data.
Try this code, I hope you can help.
This is the module items.py
class SoResponseItem(scrapy.Item):
name = scrapy.Field()
condition = scrapy.Field()
price = scrapy.Field()
rarity = scrapy.Field()
This is the spider
import scrapy
import json
from SO_response.items import SoResponseItem
class LoginspiderSpider(scrapy.Spider):
name = 'LoginSpider'
allowed_domains = ['www.starcitygames.com']
url = 'http://www.starcitygames.com/'
def start_requests(self):
yield scrapy.Request(url=self.url, callback=self.parse)
def parse(self, response):
url = response.urljoin('buylist/search?search-type=category&id=5061')
yield scrapy.Request(url=url, callback=self.parse_data)
def parse_data(self, response):
jsonreponse = json.loads(response.body)
for result in jsonreponse['results']:
for index in range(len(result)):
items = SoResponseItem()
items['name'] = result[index]['name']
items['condition'] = result[index]['condition']
items['price'] = result[index]['price']
items['rarity'] = result[index]['rarity']
yield items
Try in your shell:
scrapy crawl -o jmes.json
how to fetch image url from website using scrapy in python.please help me.this is my code
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.item import Item, Field
class MyItem(Item):
url= Field()
class someSpider(CrawlSpider):
name = 'crawltest'
allowed_domains = ['bambeeq.com']
start_urls = ['http://www.bambeeq.com/']
rules = (Rule(LinkExtractor(allow=()), callback='parse_obj', follow=True),)
def parse_obj(self,response):
item = MyItem()
item['url'] = []
for link in LinkExtractor(allow=(),deny = self.allowed_domains).extract_links(response):
item['url'].append(link.url)
#item['image'].append(link.img)
return item
You are extracting the links ('a' element), not the images ('img' element). Try this:
# iterate over the list of images
for image in response.xpath('//img/#src').extract():
# make each one into a full URL and add to item[]
item['url'].append(response.urljoin(image))
yield item
im new with Scrapy and web crawling and I've been working on the page www.mercadolibre.com.mx I have to get (from the startpage) some data (descripton and prices) about the produtcs displayed in there. Here is my items.py:
from scrapy.item import Item, Field
class PruebaMercadolibreItem(Item):
producto = Field()
precio = Field()
And here is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from prueba_mercadolibre.items import PruebaMercadolibreItem
class MLSpider(BaseSpider):
name = "mlspider"
allowed_domains = ["mercadolibre.com"]
start_urls = ["http://www.mercadolibre.com.mx"]
def parse (self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//div[#class='item-data']")
items = []
for titles in titles:
item = PruebaMercadolibreItem()
item["producto"] = titles.select("p[#class='tit le']/#title").extract()
item["precio"] = titles.select("span[#class='ch-price']/text()").extract()
items.append(item)
return items
The problem is that I get the same results in when I change this line:
titles = hxs.select("//div[#class='item-data']")
To this:
titles = hxs.select("//div[#class='item-data'] | //div[#class='item-data item-data-mp']")
And Im not getting the same data as when I use the first line.
Can anyone help me? do I have any errorin my xPath selection?
Also I cant find a good tutorial for using MySQL with scrapy, I would appreciate any help. Thx
Better use contains if you want to get all div tags containing item-data class:
titles = hxs.select("//div[contains(#class, 'item-data')]")
Also, you have other problems in the spider:
the loop, you are overriding the titles
class name in producto xpath should be title, not tit le
you probably don't want to have lists in Field values, get the first items out of the extracted lists
HtmlXPathSelector is deprecated, use Selector instead
select() is deprecated, use xpath() instead
BaseSpider has been renamed to Spider
Here's the code with modifications:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field
from prueba_mercadolibre.items import PruebaMercadolibreItem
class MLSpider(Spider):
name = "mlspider"
allowed_domains = ["mercadolibre.com"]
start_urls = ["http://www.mercadolibre.com.mx"]
def parse (self, response):
hxs = Selector(response)
titles = hxs.xpath("//div[contains(#class, 'item-data')]")
for title in titles:
item = PruebaMercadolibreItem()
item["producto"] = title.xpath("p[#class='title']/#title").extract()[0]
item["precio"] = title.xpath("span[#class='ch-price']/text()").extract()[0]
yield item
Example items from the output:
{'precio': u'$ 35,000', 'producto': u'Cuatrimoto, Utv De 500cc 4x4 ,moto , Motos, Atv ,'}
{'precio': u'$ 695', 'producto': u'Reloj Esp\xeda Camara Oculta Video Hd 16 Gb! Sony Compara.'}
So I'm trying to scrape the schedule at this page.. http://stats.swehockey.se/ScheduleAndResults/Schedule/3940
..with this code.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaSpider(BaseSpider):
name = "schema"
allowed_domains = ["http://stats.swehockey.se/"]
start_urls = [
"http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
rows = hxs.select('//table[#class="tblContent"]/tbody/tr')
for row in rows:
date = row.select('/td[1]/div/span/text()').extract()
teams = row.select('/td[2]/text()').extract()
print date, teams
But I can't get it to work. What am I doing wrong? I've been trying to figure out myself for a couple of hours now but I have no idea why my XPath doesn't work properly.
Two problems:
tbody is a tag that is added by modern browsers. Scrapy simply doesn't see it in the html.
xpaths for data and teams weren't right: you should use relative xpath (.//), also td indexes was wrong, should be 2 and 3 instead of 1 and 2
Here's the whole code with some mofidications (working):
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaItem(Item):
date = Field()
teams = Field()
class SchemaSpider(BaseSpider):
name = "schema"
allowed_domains = ["http://stats.swehockey.se/"]
start_urls = [
"http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
rows = hxs.select('//table[#class="tblContent"]/tr')
for row in rows:
item = SchemaItem()
item['date'] = row.select('.//td[2]/div/span/text()').extract()
item['teams'] = row.select('.//td[3]/text()').extract()
yield item
Hope that helps.
After banging my head several time, I am finally coming here.
Problem : I am trying to download the content of each of the craiglist posting. By content I mean the "posting body" like description of the cell phone. Looking for a new old phone since iPhone is done with all excitement.
The code is an awesome work by Michael Herman.
My Spider Class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import *
from craig.items import CraiglistSampleItem
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["craigslist.org"]
start_urls = ["http://minneapolis.craigslist.org/moa/"]
rules = (Rule (SgmlLinkExtractor(allow=("index\d00\.html", ),restrict_xpaths=('//p[#class="nextpage"]',))
, callback="parse_items", follow= True),
)
def parse_items(self,response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//span[#class='pl']")
items = []
for titles in titles:
item = CraiglistSampleItem()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
return items
And the Item class
from scrapy.item import Item, Field
class CraiglistSampleItem(Item):
title = Field()
link = Field()
Since the code will traverse many links , hence I wanted to save the description of each cell phone in sepearte csv but one more column in csv will be fine also.
Any lead !!!
Instead of returning items in parse_items method you should return/yield scrapy Request instance in order to get the description from the item page, link and title you can pass inside of an Item, and Item inside of the meta dictionary:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
from scrapy.item import Item, Field
class CraiglistSampleItem(Item):
title = Field()
link = Field()
description = Field()
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["craigslist.org"]
start_urls = ["http://minneapolis.craigslist.org/moa/"]
rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//p[#class="nextpage"]',))
, callback="parse_items", follow=True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//span[#class='pl']")
for title in titles:
item = CraiglistSampleItem()
item["title"] = title.select("a/text()").extract()[0]
item["link"] = title.select("a/#href").extract()[0]
url = "http://minneapolis.craigslist.org%s" % item["link"]
yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)
def parse_item_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
item['description'] = hxs.select('//section[#id="postingbody"]/text()').extract()
return item
Run it and see additional description column in your output csv file.
Hope that helps.