How to scrape items from 2 different sections? - python

I'm new to Scrapy and web crawling, and I've been working on the page www.mercadolibre.com.mx. I have to get (from the start page) some data (description and prices) about the products displayed there. Here is my items.py:
from scrapy.item import Item, Field
class PruebaMercadolibreItem(Item):
    """Item holding the data scraped for one product."""

    producto = Field()  # product description / title
    precio = Field()  # product price text
And here is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from prueba_mercadolibre.items import PruebaMercadolibreItem
class MLSpider(BaseSpider):
    """Spider from the question: scrape product titles and prices from the start page."""

    name = "mlspider"
    allowed_domains = ["mercadolibre.com"]
    start_urls = ["http://www.mercadolibre.com.mx"]

    def parse(self, response):
        """Build one item per ``div`` with class ``item-data`` and return them as a list."""
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[@class='item-data']")
        items = []
        # NOTE: the loop variable reuses the name of the list being iterated.
        for titles in titles:
            item = PruebaMercadolibreItem()
            # extract() returns a list, so each field ends up holding a list.
            item["producto"] = titles.select("p[@class='tit le']/@title").extract()
            item["precio"] = titles.select("span[@class='ch-price']/text()").extract()
            items.append(item)
        return items
The problem is that I get the same results when I change this line:
titles = hxs.select("//div[@class='item-data']")
To this:
titles = hxs.select("//div[@class='item-data'] | //div[@class='item-data item-data-mp']")
And Im not getting the same data as when I use the first line.
Can anyone help me? Do I have an error in my XPath selection?
Also, I can't find a good tutorial for using MySQL with Scrapy; I would appreciate any help. Thanks.

Better use contains if you want to get all div tags containing item-data class:
titles = hxs.select("//div[contains(@class, 'item-data')]")
Also, you have other problems in the spider:
the loop, you are overriding the titles
class name in producto xpath should be title, not tit le
you probably don't want to have lists in Field values, get the first items out of the extracted lists
HtmlXPathSelector is deprecated, use Selector instead
select() is deprecated, use xpath() instead
BaseSpider has been renamed to Spider
Here's the code with modifications:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field
from prueba_mercadolibre.items import PruebaMercadolibreItem
class MLSpider(Spider):
    """Spider that yields one item per product box found on the start page."""

    name = "mlspider"
    allowed_domains = ["mercadolibre.com"]
    start_urls = ["http://www.mercadolibre.com.mx"]

    def parse(self, response):
        """Yield a ``PruebaMercadolibreItem`` for every div whose class contains ``item-data``."""
        hxs = Selector(response)
        titles = hxs.xpath("//div[contains(@class, 'item-data')]")
        for title in titles:
            item = PruebaMercadolibreItem()
            # Take the first match so each field holds a single string, not a list.
            item["producto"] = title.xpath("p[@class='title']/@title").extract()[0]
            item["precio"] = title.xpath("span[@class='ch-price']/text()").extract()[0]
            yield item
Example items from the output:
{'precio': u'$ 35,000', 'producto': u'Cuatrimoto, Utv De 500cc 4x4 ,moto , Motos, Atv ,'}
{'precio': u'$ 695', 'producto': u'Reloj Esp\xeda Camara Oculta Video Hd 16 Gb! Sony Compara.'}

Related

Scrapy returns all value in a single cell

I'm trying to scrape this site using Scrapy, but it returns all the values in a
single cell; I expect each value in a different row.
example:
milage: 25
milage: 377
milage: 247433
milage: 464130
but i'm getting the data like this
example:
milage:[u'25',
u'377',
u'247433',
u'399109',
u'464130',
u'399631',
u'435238',
u'285000',
u'287470',
u'280000']
here is my code
import scrapy
from ..items import ExampleItem
from scrapy.selector import HtmlXPathSelector
url = 'https://example.com'


class Example(scrapy.Spider):
    """Spider from the question: one item per listing card, with its mileage."""

    name = 'example'
    allowed_domains = ['www.example.com']
    start_urls = [url]

    def parse(self, response):
        """Yield an ``ExampleItem`` for each listing card on the page."""
        hxs = HtmlXPathSelector(response)
        item_selector = hxs.select('//div[@class="listing_format card5 relative"]')
        for fields in item_selector:
            item = ExampleItem()
            # NOTE: the XPath starts with "//", so it is evaluated against the
            # whole document rather than the current card, and extract()
            # returns the full list of matches.
            item['Mileage'] = fields.select('//li[strong="Mileage"]/span/text()').extract()
            yield item
You didn't show your site, but maybe you need a relative XPath:
# The leading "." makes the XPath relative to the current `fields` node;
# extract_first() is used here in place of extract().
item ['Mileage'] = fields.select('.//li[strong="Mileage"]/span/text()').extract_first()
It sounds like you need to iterate over your mileages.
for fields in item_selector:
milages = fields.select('//li[strong="Mileage"]/span/text()').extract()
for milage in milages:
item = CommercialtrucktraderItem()
item ['Mileage'] = milage
yield item
Also consider making your fields.select('//li[strong="Mileage"]/span/text()').extract() more specific?

Scrapy merging to 1 list

I've built my first Scrapy project but can't figure out the last hurdle.
With my script below I get one long list in the CSV: first all the product prices and then all the product names.
What I would like to achieve is that for every product the price is next to it.
For example:
Product Name, Product Price
Product Name, Product Price
My scrapy project:
Items.py
from scrapy.item import Item, Field
class PrijsvergelijkingItem(Item):
    """Item pairing a product reference with its price."""

    Product_ref = Field()  # product name / reference text
    Product_price = Field()  # product price text
My Spider called nvdb.py:
from scrapy.spider import BaseSpider
import scrapy.selector
from Prijsvergelijking.items import PrijsvergelijkingItem
class MySpider(BaseSpider):
    """Spider from the question: collect product names and prices from the listing page."""

    name = "nvdb"
    allowed_domains = ["vandenborre.be"]
    start_urls = ["http://www.vandenborre.be/tv-lcd-led/lcd-led-tv-80-cm-alle-producten"]

    def parse(self, response):
        """Return a list of items built from the product list element."""
        hxs = scrapy.Selector(response)
        titles = hxs.xpath("//ul[@id='prodlist_ul']")
        items = []
        # NOTE: this iterates over the <ul> itself, and the XPaths below start
        # with "//", so they are evaluated against the whole document.
        for titles in titles:
            item = PrijsvergelijkingItem()
            item["Product_ref"] = titles.xpath("//div[@class='prod_naam']//text()[2]").extract()
            item["Product_price"] = titles.xpath("//div[@class='prijs']//text()[2]").extract()
            items.append(item)
        return items
You need to switch your XPath expressions to work in the context of every "product". In order to do this, you need to prepend a dot to the expressions:
def parse(self, response):
    """Yield one ``PrijsvergelijkingItem`` per ``<li>`` of the product list."""
    products = response.xpath("//ul[@id='prodlist_ul']/li")
    for product in products:
        item = PrijsvergelijkingItem()
        # The leading "." keeps each lookup inside the current product node.
        item["Product_ref"] = product.xpath(".//div[@class='prod_naam']//text()[2]").extract_first()
        item["Product_price"] = product.xpath(".//div[@class='prijs']//text()[2]").extract_first()
        yield item
I've also improved the code a little bit:
I assume you meant to iterate over list items ul->li and not just ul - fixed the expression
used the response.xpath() shortcut method
used extract_first() instead of extract()
improved the variable naming
used yield instead of collecting items in a list and then returning
I am not sure if this can help you, but you can use OrderedDict from collections for your need.
from scrapy.spider import BaseSpider
import scrapy.selector
from collections import OrderedDict
from Prijsvergelijking.items import PrijsvergelijkingItem
class MySpider(BaseSpider):
    """Spider that returns each product as an ``OrderedDict`` with a fixed key order."""

    name = "nvdb"
    allowed_domains = ["vandenborre.be"]
    start_urls = ["http://www.vandenborre.be/tv-lcd-led/lcd-led-tv-80-cm-alle-producten"]

    def parse(self, response):
        """Return a list with one ``OrderedDict`` (reference first, then price) per product."""
        hxs = scrapy.Selector(response)
        # Iterate over the <li> entries rather than the <ul>, so that each
        # product gets its own record.
        products = hxs.xpath("//ul[@id='prodlist_ul']/li")
        items = []
        for product in products:
            item = OrderedDict(PrijsvergelijkingItem())
            # Relative XPaths (leading ".") restrict the match to this product.
            item["Product_ref"] = product.xpath(".//div[@class='prod_naam']//text()[2]").extract()
            item["Product_price"] = product.xpath(".//div[@class='prijs']//text()[2]").extract()
            items.append(item)
        return items
Also you might have to change the way you iterate dict,
for od in items:
    for key, value in od.items():
        # %-formatting prints the same text under Python 2 and Python 3,
        # unlike the Python 2-only ``print key, value`` statement.
        print("%s %s" % (key, value))

Scrapy only scraping first result of each page

I'm currently trying to run the following code but it keeps scraping only the first result of each page. Any idea what the issue may be?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123Item
import urlparse
from scrapy.http.request import Request
class MySpider(CrawlSpider):
    """Spider from the question: follow the "next" pagination link and scrape each page."""

    name = "xyz123"
    allowed_domains = ["www.xyz123.com.au"]
    start_urls = ["http://www.xyz123.com.au/"]
    # Follow only the pagination "next" link and parse every page it leads to.
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=("",),
                restrict_xpaths=('//*[@id="1234headerPagination_hlNextLink"]',),
            ),
            callback="parse_xyz",
            follow=True,
        ),
    )

    def parse_xyz(self, response):
        """Collect the title and link of the search results on one page."""
        hxs = HtmlXPathSelector(response)
        xyz = hxs.select('//div[@id="1234SearchResults"]//div/h2')
        items = []
        for xyz in xyz:
            item = xyz123Item()
            item["title"] = xyz.select('a/text()').extract()[0]
            item["link"] = xyz.select('a/@href').extract()[0]
            items.append(item)
            # NOTE(review): indentation was lost in the post; the reply asks
            # whether this return is indented one level too deep, so it is
            # presumably inside the loop, which ends parsing after one item.
            return items
The Basespider version works well scraping ALL the required data on the first page:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123
class MySpider(BaseSpider):
    """Single-page spider from the question that scrapes every result on the first page."""

    name = "xyz123test"
    allowed_domains = ["xyz123.com.au"]
    start_urls = ["http://www.xyz123.com.au/"]

    def parse(self, response):
        """Return one item (title and link lists) per search-result heading."""
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//div[@id="1234SearchResults"]//div/h2')
        items = []
        for titles in titles:
            item = xyz123Item()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
Sorry for the censoring. I had to censor the website for privacy reasons.
The first code crawls through the pages the way I'd like it to; however, it only pulls the first item's title and link. NOTE: The XPath of the first title using "inspect element" in Google is:
//*[@id="xyz123SearchResults"]/div[1]/h2/a,
the second is //*[@id="xyz123SearchResults"]/div[2]/h2/a
and the third is //*[@id="xyz123SearchResults"]/div[3]/h2/a etc.
I'm not sure if the div[n] bit is what's killing it. I'm hoping it's an easy fix.
Thanks
for xyz in xyz:
item = xyz123Item()
item ["title"] = xyz.select('a/text()').extract()[0]
item ["link"] = xyz.select('a/#href').extract()[0]
items.append(item)
return items
Are you sure about the indentation of the `return items`? It should be indented one level less, so that it runs after the for loop instead of inside it.

Scrapy project, scraping a schedule

So I'm trying to scrape the schedule at this page.. http://stats.swehockey.se/ScheduleAndResults/Schedule/3940
..with this code.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaSpider(BaseSpider):
    """Spider from the question: print the date and teams of each schedule row."""

    name = "schema"
    allowed_domains = ["http://stats.swehockey.se/"]
    start_urls = [
        "http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
    ]

    def parse(self, response):
        """Print the extracted date and teams for every table row."""
        hxs = HtmlXPathSelector(response)
        # NOTE: this path relies on a <tbody> element being present.
        rows = hxs.select('//table[@class="tblContent"]/tbody/tr')
        for row in rows:
            # NOTE: these XPaths start with "/", i.e. at the document root
            # rather than at the current row.
            date = row.select('/td[1]/div/span/text()').extract()
            teams = row.select('/td[2]/text()').extract()
            # Same output as ``print date, teams`` but valid on Python 2 and 3.
            print("%s %s" % (date, teams))
But I can't get it to work. What am I doing wrong? I've been trying to figure out myself for a couple of hours now but I have no idea why my XPath doesn't work properly.
Two problems:
tbody is a tag that is added by modern browsers. Scrapy simply doesn't see it in the html.
The XPaths for date and teams weren't right: you should use a relative XPath (.//). Also, the td indexes were wrong: they should be 2 and 3 instead of 1 and 2.
Here's the whole code with some modifications (working):
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaItem(Item):
    """Item for one schedule row."""

    date = Field()  # text extracted from the row's date cell
    teams = Field()  # text extracted from the row's teams cell
class SchemaSpider(BaseSpider):
    """Spider that yields one ``SchemaItem`` per row of the schedule table."""

    name = "schema"
    # allowed_domains takes bare domain names, not URLs.
    allowed_domains = ["stats.swehockey.se"]
    start_urls = [
        "http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
    ]

    def parse(self, response):
        """Yield the date and teams found in each row of the ``tblContent`` table."""
        hxs = HtmlXPathSelector(response)
        # No /tbody step: browsers add that element, the raw HTML lacks it.
        rows = hxs.select('//table[@class="tblContent"]/tr')
        for row in rows:
            item = SchemaItem()
            # Relative XPaths (leading ".") are evaluated from the current row.
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()
            yield item
Hope that helps.

scrapy: newbie attempting to debug code

Total newbie, trying to get scrapy to read a list of urls from csv and return the items in a csv.
Need some help to figure out where I'm going wrong here:
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import random
class incyspider(BaseSpider):
    """Spider from the question: read URLs from ``urls.csv`` and scrape product data."""

    name = "incyspider"

    def __init__(self):
        super(incyspider, self).__init__()
        # NOTE(review): indentation was lost in the post; the statements below
        # are assumed to belong to __init__, where they only bind local names.
        domain_name = "incyspider.co.uk"
        f = open("urls.csv")
        start_urls = [url.strip() for url in f.readlines()]
        f.close  # NOTE: no parentheses, so close() is not actually called

    def parse(self, response):
        """Collect title, link and price data for the products on the page."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="Product"]')
        items = []
        for site in sites:
            # NOTE: `item` is assigned to without ever being created, and the
            # queries run on `hxs` (the whole page) rather than on `site`.
            item['title'] = hxs.select('//div[@class="Name"]/node()').extract()
            item['hlink'] = hxs.select('//div[@class="Price"]/node()').extract()
            item['price'] = hxs.select('//div[@class="Codes"]/node()').extract()
            items.append(item)
        return items


SPIDER = incyspider()
Here's the items.py code:
from scrapy.item import Item, Field
class incyspider(Item):
    """Item holding the fields scraped for one product."""

    # define the fields for your item here like:
    # name = Field()
    title = Field()
    hlink = Field()
    price = Field()
To run, I'm using
scrapy crawl incyspider -o items.csv -t csv
I would seriously appreciate any pointers.
I'm not exactly sure but after a quick look at your code I would say that at least you need to replace this line
sites = hxs.select('//div[@class="Product"]')
by this line
sites = hxs.select('//div[@class="Product"]').extract()
As a first punt at answering this, your spider code is missing an import for your incyspider item class. Also you're not creating an instance of any kind of item to store the title/hlink/price info, so the items.append(item) line might complain.
Since your spider is also called incyspider, you should rename the item to be something like incyspiderItem and then add the following line to your spider code
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import random
from incyspider.items import incyspiderItem
class incyspider(BaseSpider):
    """Spider that reads its start URLs from ``urls.csv`` and scrapes one item per product."""

    name = "incyspider"
    domain_name = "incyspider.co.uk"

    def __init__(self, *args, **kwargs):
        """Load the start URLs from ``urls.csv`` (one URL per line)."""
        super(incyspider, self).__init__(*args, **kwargs)
        # ``with`` guarantees the file is closed; store the URLs on the
        # instance so they are visible outside __init__. Blank lines are skipped.
        with open("urls.csv") as f:
            self.start_urls = [url.strip() for url in f if url.strip()]

    def parse(self, response):
        """Return a list with one ``incyspiderItem`` per ``div.Product`` node."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="Product"]')
        items = []
        for site in sites:
            item = incyspiderItem()
            # Query relative to the current product node instead of the whole
            # page. Assumes the Name/Price/Codes divs are nested inside the
            # Product div -- TODO confirm against the real markup.
            item['title'] = site.select('.//div[@class="Name"]/node()').extract()
            item['hlink'] = site.select('.//div[@class="Price"]/node()').extract()
            item['price'] = site.select('.//div[@class="Codes"]/node()').extract()
            items.append(item)
        return items
If I'm wrong, then please edit the question to explain how you know there is a problem with the code eg: is the expected output different to the actual output? If so, how?

Categories

Resources