Scrapy project, scraping a schedule - python

So I'm trying to scrape the schedule at this page: http://stats.swehockey.se/ScheduleAndResults/Schedule/3940, with this code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class SchemaSpider(BaseSpider):
    name = "schema"
    allowed_domains = ["http://stats.swehockey.se/"]
    start_urls = [
        "http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="tblContent"]/tbody/tr')
        for row in rows:
            date = row.select('/td[1]/div/span/text()').extract()
            teams = row.select('/td[2]/text()').extract()
            print date, teams
But I can't get it to work. What am I doing wrong? I've been trying to figure it out myself for a couple of hours now, but I have no idea why my XPath doesn't work properly.

Two problems:
tbody is a tag that is added by modern browsers; Scrapy simply doesn't see it in the html.
The xpaths for date and teams weren't right: you should use relative xpaths (.//), and the td indexes were wrong; they should be 2 and 3 instead of 1 and 2 (see the sketch after the code below).
Here's the whole code with some modifications (working):
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class SchemaItem(Item):
    date = Field()
    teams = Field()

class SchemaSpider(BaseSpider):
    name = "schema"
    allowed_domains = ["http://stats.swehockey.se/"]
    start_urls = [
        "http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="tblContent"]/tr')
        for row in rows:
            item = SchemaItem()
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()
            yield item
Hope that helps.
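To make the relative-xpath point concrete, here is a minimal sketch (over hypothetical rows, not tied to this site's exact markup): a path starting with / or // is evaluated against the whole document even when called on a row selector, while .// is evaluated against the current row only.

rows = hxs.select('//table[@class="tblContent"]/tr')
for row in rows:
    # absolute path: searched from the document root, so every
    # iteration returns the matching cells of ALL rows
    wrong = row.select('//td[2]/div/span/text()').extract()
    # relative path: searched from the current row only
    right = row.select('.//td[2]/div/span/text()').extract()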

Related

Scrapy returns all values in a single cell

I'm trying to scrape this site using Scrapy, but it returns all the values in a single cell; I expect each value in a different row. For example:
mileage: 25
mileage: 377
mileage: 247433
mileage: 464130
But I'm getting the data like this:
mileage: [u'25',
          u'377',
          u'247433',
          u'399109',
          u'464130',
          u'399631',
          u'435238',
          u'285000',
          u'287470',
          u'280000']
Here is my code:
import scrapy
from ..items import ExampleItem
from scrapy.selector import HtmlXPathSelector

url = 'https://example.com'

class Example(scrapy.Spider):
    name = 'example'
    allowed_domains = ['www.example.com']
    start_urls = [url]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item_selector = hxs.select('//div[@class="listing_format card5 relative"]')
        for fields in item_selector:
            item = ExampleItem()
            item['Mileage'] = fields.select('//li[strong="Mileage"]/span/text()').extract()
            yield item
You didn't show your site, but maybe you need a relative XPath:

item['Mileage'] = fields.select('.//li[strong="Mileage"]/span/text()').extract_first()
It sounds like you need to iterate over your mileages:

for fields in item_selector:
    mileages = fields.select('//li[strong="Mileage"]/span/text()').extract()
    for mileage in mileages:
        item = CommercialtrucktraderItem()
        item['Mileage'] = mileage
        yield item

Also consider making your fields.select('//li[strong="Mileage"]/span/text()').extract() selector more specific.
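For background (a minimal sketch, independent of the site's markup): extract() always returns a list of every match, while extract_first() returns only the first match (or None), which is why an absolute XPath combined with extract() packs every mileage on the page into a single field.

cells = fields.select('.//li[strong="Mileage"]/span/text()')
cells.extract()        # [u'25', u'377', ...] -- a list of ALL matches
cells.extract_first()  # u'25' -- the first match only, or None if nothing matched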

No output while scraping using scrapy

I was trying to scrape the commentary from the espncricinfo website using Scrapy, and I got a blank output (items.csv). These are my files.
cricinfo.py (Spider File)
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from crictest.items import CrictestItem

class MySpider(BaseSpider):
    name = "cricinfo"
    allowed_domains = ["espncricinfo.com/"]
    start_urls = ["http://www.espncricinfo.com/champions-league-twenty20-2014/engine/match/763595.html?innings=1;view=commentary/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//td[@class="battingComms" and b]')
        for row in rows:
            item = CrictestItem()
            item['overnum'] = row.select('b/text()').extract()[0]
            item['overnumtext'] = row.select('b/following-sibling::text()').extract()[0]
            yield item
items.py
import scrapy

class CrictestItem(scrapy.Item):
    overnum = scrapy.Field()
    overnumtext = scrapy.Field()
The problem is your XPath. You can try testing paths in Chrome's console:

$x('//*[@id="commInnings"]/div[2]/div/div')

With the selector currently in your code,

rows = hxs.select('//td[@class="battingComms" and b]')

I can't get any output in the console.
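A general way to debug this kind of problem (not part of the original answer) is Scrapy's interactive shell, which lets you test selectors against the page exactly as Scrapy downloaded it; if a selector that works in Chrome finds nothing there, the content is probably added by JavaScript and missing from the raw HTML.

# From the command line:
#   scrapy shell "http://www.espncricinfo.com/champions-league-twenty20-2014/engine/match/763595.html?innings=1;view=commentary/"
# Then, inside the shell:
hxs.select('//td[@class="battingComms" and b]')      # empty list -> this selector matches nothing
hxs.select('//*[@id="commInnings"]//div')            # inspect what actually exists under commInnings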

How to scrape items from 2 different sections?

I'm new to Scrapy and web crawling, and I've been working on the page www.mercadolibre.com.mx. I have to get (from the start page) some data (description and prices) about the products displayed there. Here is my items.py:
from scrapy.item import Item, Field

class PruebaMercadolibreItem(Item):
    producto = Field()
    precio = Field()
And here is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from prueba_mercadolibre.items import PruebaMercadolibreItem

class MLSpider(BaseSpider):
    name = "mlspider"
    allowed_domains = ["mercadolibre.com"]
    start_urls = ["http://www.mercadolibre.com.mx"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[@class='item-data']")
        items = []
        for titles in titles:
            item = PruebaMercadolibreItem()
            item["producto"] = titles.select("p[@class='tit le']/@title").extract()
            item["precio"] = titles.select("span[@class='ch-price']/text()").extract()
            items.append(item)
        return items
The problem is that I get the same results when I change this line:

titles = hxs.select("//div[@class='item-data']")

to this:

titles = hxs.select("//div[@class='item-data'] | //div[@class='item-data item-data-mp']")

I'm not getting any additional data when I use the second line.
Can anyone help me? Do I have an error in my XPath selection?
Also, I can't find a good tutorial for using MySQL with Scrapy; I would appreciate any help. Thanks.
Better to use contains() if you want to get all div tags containing the item-data class:

titles = hxs.select("//div[contains(@class, 'item-data')]")
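To see why contains() matters here (a small sketch): @class='...' is an exact string comparison against the entire class attribute, so it misses elements that carry an extra class:

hxs.select("//div[@class='item-data']")             # misses <div class="item-data item-data-mp">
hxs.select("//div[contains(@class, 'item-data')]")  # matches both variants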
Also, you have other problems in the spider:
in the loop, you are overriding titles (it should be for title in titles, not for titles in titles)
the class name in the producto xpath should be title, not tit le
you probably don't want to have lists in Field values; get the first items out of the extracted lists
HtmlXPathSelector is deprecated, use Selector instead
select() is deprecated, use xpath() instead
BaseSpider has been renamed to Spider
Here's the code with modifications:

from scrapy.spider import Spider
from scrapy.selector import Selector
from prueba_mercadolibre.items import PruebaMercadolibreItem

class MLSpider(Spider):
    name = "mlspider"
    allowed_domains = ["mercadolibre.com"]
    start_urls = ["http://www.mercadolibre.com.mx"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[contains(@class, 'item-data')]")
        for title in titles:
            item = PruebaMercadolibreItem()
            item["producto"] = title.xpath("p[@class='title']/@title").extract()[0]
            item["precio"] = title.xpath("span[@class='ch-price']/text()").extract()[0]
            yield item
Example items from the output:
{'precio': u'$ 35,000', 'producto': u'Cuatrimoto, Utv De 500cc 4x4 ,moto , Motos, Atv ,'}
{'precio': u'$ 695', 'producto': u'Reloj Esp\xeda Camara Oculta Video Hd 16 Gb! Sony Compara.'}
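On the MySQL question: that part isn't covered by the answer above, but the usual approach is an item pipeline. Here is a minimal sketch assuming the MySQLdb driver and a hypothetical productos table; the connection parameters and column names are placeholders to adapt:

import MySQLdb

class MySQLStorePipeline(object):

    def open_spider(self, spider):
        # placeholder credentials -- adjust to your setup
        self.conn = MySQLdb.connect(host='localhost', user='user',
                                    passwd='secret', db='scrapydb',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # hypothetical table/columns matching the item fields above
        self.cursor.execute(
            "INSERT INTO productos (producto, precio) VALUES (%s, %s)",
            (item['producto'], item['precio']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

Enable it by adding the pipeline class to ITEM_PIPELINES in settings.py.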

Scrapy only scraping first result of each page

I'm currently trying to run the following code but it keeps scraping only the first result of each page. Any idea what the issue may be?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123Item
import urlparse
from scrapy.http.request import Request

class MySpider(CrawlSpider):
    name = "xyz123"
    allowed_domains = ["www.xyz123.com.au"]
    start_urls = ["http://www.xyz123.com.au/",]

    rules = (Rule(SgmlLinkExtractor(allow=("",), restrict_xpaths=('//*[@id="1234headerPagination_hlNextLink"]',)),
                  callback="parse_xyz", follow=True),
    )

    def parse_xyz(self, response):
        hxs = HtmlXPathSelector(response)
        xyz = hxs.select('//div[@id="1234SearchResults"]//div/h2')
        items = []
        for xyz in xyz:
            item = xyz123Item()
            item["title"] = xyz.select('a/text()').extract()[0]
            item["link"] = xyz.select('a/@href').extract()[0]
            items.append(item)
            return items
The BaseSpider version works well, scraping ALL the required data on the first page:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123Item

class MySpider(BaseSpider):
    name = "xyz123test"
    allowed_domains = ["xyz123.com.au"]
    start_urls = ["http://www.xyz123.com.au/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//div[@id="1234SearchResults"]//div/h2')
        items = []
        for titles in titles:
            item = xyz123Item()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
Sorry for the censoring. I had to censor the website for privacy reasons.
The first code crawls through the pages well, the way I'd like it to crawl; however, it only pulls the first item's title and link. NOTE: the XPath of the first title using "inspect element" in Google Chrome is
//*[@id="xyz123SearchResults"]/div[1]/h2/a,
the second is //*[@id="xyz123SearchResults"]/div[2]/h2/a,
the third is //*[@id="xyz123SearchResults"]/div[3]/h2/a, etc.
I'm not sure if the div[n] bit is what's killing it. I'm hoping it's an easy fix.
Thanks
for xyz in xyz:
    item = xyz123Item()
    item["title"] = xyz.select('a/text()').extract()[0]
    item["link"] = xyz.select('a/@href').extract()[0]
    items.append(item)
    return items
Are you sure about the indentation of the return items? It should be indented one level less.
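In other words, the return should be dedented so it runs after the loop (a corrected sketch of the snippet above):

for xyz in xyz:
    item = xyz123Item()
    item["title"] = xyz.select('a/text()').extract()[0]
    item["link"] = xyz.select('a/@href').extract()[0]
    items.append(item)
return items  # now outside the loop: every appended item is returned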

Scrapy Recursive download of Content

After banging my head several times, I am finally coming here.
Problem: I am trying to download the content of each of the craigslist postings. By content I mean the "posting body", like the description of the cell phone. (Looking for a new old phone, since the iPhone is done with all its excitement.)
The code is an awesome work by Michael Herman.
My Spider Class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import *
from craig.items import CraiglistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://minneapolis.craigslist.org/moa/"]

    rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html",), restrict_xpaths=('//p[@class="nextpage"]',)),
                  callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        items = []
        for titles in titles:
            item = CraiglistSampleItem()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
And the Item class
from scrapy.item import Item, Field

class CraiglistSampleItem(Item):
    title = Field()
    link = Field()
Since the code will traverse many links, I wanted to save the description of each cell phone in a separate csv, but one more column in the csv would be fine too.
Any lead!!!
Instead of returning items in the parse_items method, you should return/yield a scrapy Request instance in order to get the description from the item page; the link and title you can pass inside an Item, and the Item inside the meta dictionary:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
from scrapy.item import Item, Field

class CraiglistSampleItem(Item):
    title = Field()
    link = Field()
    description = Field()

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://minneapolis.craigslist.org/moa/"]

    rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html",), restrict_xpaths=('//p[@class="nextpage"]',)),
                  callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        for title in titles:
            item = CraiglistSampleItem()
            item["title"] = title.select("a/text()").extract()[0]
            item["link"] = title.select("a/@href").extract()[0]
            url = "http://minneapolis.craigslist.org%s" % item["link"]
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item['description'] = hxs.select('//section[@id="postingbody"]/text()').extract()
        return item
Run it and you will see an additional description column in your output csv file.
Hope that helps.
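To produce that csv, the standard invocation for the Scrapy version used throughout this code (the spider name craigs comes from the spider above) is:

scrapy crawl craigs -o items.csv -t csv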

Categories

Resources