verify scrapy project code - python

I try to extract job offers informations from this website and this is my code
from scrapy.spider import Spider
from scrapy.selector import Selector
from tutorial.items import DmozItem
class DmozSpider(Spider):
name = "myspider"
allowed_domains =["tanitjobs.com/"]
start_urls =["http://tanitjobs.com/search-results-jobs/"]
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//div[#class="offre"]/div[#class="detail"]')
items = []
item = DmozItem()
for site in sites:
item['title'] = site.xpath('a/text()').extract()
item['link'] = site.xpath('a/#href').extract()
item['desc'] = site.xpath('div[#class="descriptionjob"]/text()').extract()
items.append(item)
return items
but the result is incorrect (empty item list):
{'desc': [],
'link': [u'lien'],
'title': []}
and many blocks like this ...

item = DmozItem() should be called for each loop iteration, otherwise you are rewriting the same item, appending the same item to the items list
It should look like:
from scrapy.spider import Spider
from scrapy.selector import Selector
from tutorial.items import DmozItem
class DmozSpider(Spider):
name = "myspider"
allowed_domains =["tanitjobs.com/"]
start_urls =["http://tanitjobs.com/search-results-jobs/"]
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//div[#class="offre"]/div[#class="detail"]')
items = []
for site in sites:
item = DmozItem()
item['title'] = site.xpath('a/text()').extract()
item['link'] = site.xpath('a/#href').extract()
item['desc'] = site.xpath('div[#class="descriptionjob"]/text()').extract()
items.append(item)
return items

Your title xpath didn't take into account the <strong> tags on either side of the text, and your desc xpath needs to go down another div to retrieve the required information.
I just noticed that the xpath for job description varies. The xpath in the code below returns job descriptions for the first three results but not subsequent ones. You would need to examine subsequent results to determine how the xpath changes to retrieve descriptions for those jobs.
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//div[#class="offre"]/div[#class="detail"]')
items = []
for site in sites:
item = DmozItem()
item['title'] = site.xpath('normalize-space(a/strong/text())').extract()
item['link'] = site.xpath('a/#href').extract()
item['desc'] = site.xpath('normalize-space(./div/div[#class="descriptionjob"]/text())').extract()
items.append(item)
return items

Related

Scrapy crawler will not crawl any webpages

I have been trying to get this crawler working but I keep getting errors.
Can anyone suggest any ways to get it to run?
The main spider code is
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
class gameSpider(scrapy.Spider):
name = "game_spider.py"
allowed_domains = ["*"]
start_urls = [
"http://www.game.co.uk/en/grand-theft-auto-v-with-gta-online-3-500-000-1085837?categoryIdentifier=706209&catGroupId="
]
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//ul[#class="directory-url"]/li')
items = []
for site in sites:
item = Website()
item['name'] = site.xpath('//*[#id="details301149"]/div/div/h2/text()').extract()
"""item['link'] = site.xpath('//a/#href').extract()
item['description'] = site.xpath('//*[#id="overview"]/div[3]()').re('-\s[^\n]*\\r')"""
items.append(item)
print items
return items
The item code is
import scrapy
class GameItem(Item):
name = Field()
pass
Your start_urls link returns erorr 500.
There's no items.
In [7]: sites = response.xpath('//ul[#class="directory-url"]/li')
In [8]: sites
Out[8]: []

scraping url and title from nested anchor tag

This is my first scraper using scrapy.
I am trying to scrap video url, title from https://www.google.co.in/trends/hotvideos#hvsm=0 site.
import scrapy
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class CraigslistItem(Item):
title = Field()
link = Field()
class DmozSpider(scrapy.Spider):
name = "google"
allowed_domains = ["google.co.in"]
start_urls = [
"https://www.google.co.in/trends/hotvideos#hvsm=0"
]
def parse(self, response):
#for sel in response.xpath('//body/div'):
hxs = HtmlXPathSelector(response)
sites = hxs.xpath("//span[#class='single-video-image-container']")
items = []
for sel in response.xpath("//span[#class='single-video-image-container']"):
item = CraigslistItem()
item['title'] = sel.xpath('a/text()').extract()
item['link'] = sel.xpath('a/#href').extract()
items.append(item)
print items
General walk through of what I am doing wrong would be much appreciable.
Use the help Scrapy FormRequest to get it done.
from scrapy.http import FormRequest
import json
class DmozSpider(scrapy.Spider):
name = "google"
allowed_domains = ["google.co.in"]
start_urls = [
"https://www.google.co.in/trends/hotvideos#hvsm=0"
]
def parse(self, response):
url = 'https://www.google.co.in/trends/hotvideos/hotItems'
formdata = {'hvd':'','geo': 'IN','mob': '0','hvsm': '0'}
yield FormRequest(url=url, formdata=formdata, callback=self.parse_data)
def parse_data(self, response):
json_response = json.loads(response.body)
videos = json_response.get('videoList')
for video in videos:
item = CraigslistItem()
item['title'] = video.get('title')
item['link'] = video.get('url')
yield item

using scrapy extracting data inside links

I have been trying to extract data from consumercomplaints.in the title and the data inside those title links.I wrote the following code and unable to parse through the links and extract the data and also I am unable to extract all the links related.plz guide
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem
class criticspider(CrawlSpider):
name ="comp"
allowed_domains =["consumercomplaints.in"]
#start_urls =["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
start_urls=["http://www.consumercomplaints.in/?search=delhivery"]
rules=(
Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
#Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
)
def parse(self,response):
hxs = Selector(response)
sites = hxs.select('//table[#width="100%"]')
items = []
for site in sites:
item = CompItem()
item['title'] = site.select('.//td[#class="complaint"]/a/span/text()').extract()
item['link'] = site.select('.//td[#class="complaint"]/a/#href').extract()
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
# item['intro'] = site.select('.//td[#class="small"]//a[2]/text()').extract()
# item['heading'] = site.select('.//td[#class="compl-text"]/div/b[1]/text()').extract()
# item['date'] = site.select('.//td[#class="small"]/text()[2]').extract()
# item['complaint'] = site.select('.//td[#class="compl-text"]/div/text()').extract()
items.append(item)
def anchor_page(self, response):
hxs = Selector(response)
old_item = response.request.meta['item'] # Receiving parse Method item that was in Request meta
# parse some more values
#place them in old_item
#e.g
old_item['data']=hxs.select('.//td[#class="compl-text"]/div/text()').extract()
yield old_item
Are you using an old version of Scrapy?
In the latest stable version you don't need to do hxs = Selector(response) nor using the hxs.select() method. You can do the same thing just with response.xpath().
I think the problem in your code is that the result of select() (or response.xpath) is actually a Python list, so you need to do:
link = site.select('.//td[#class="complaint"]/a/#href').extract()
if link:
item['link'] = link[0]
You probably want to do a similar thing for title too.
EDIT: I got it working with a few changes:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
data = scrapy.Field()
class criticspider(CrawlSpider):
name = "comp"
allowed_domains = ["consumercomplaints.in"]
start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
rules = (
Rule(
SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
callback="parse",
follow=True),
)
def parse(self, response):
sites = response.xpath('//table[#width="100%"]')
items = []
for site in sites:
item = CompItem()
item['title'] = site.xpath('.//td[#class="complaint"]/a/span/text()').extract()[0]
item['link'] = site.xpath('.//td[#class="complaint"]/a/#href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
old_item['data'] = response.xpath('.//td[#class="compl-text"]/div/text()').extract()
yield old_item

Scrapy Recursive download of Content

After banging my head several time, I am finally coming here.
Problem : I am trying to download the content of each of the craiglist posting. By content I mean the "posting body" like description of the cell phone. Looking for a new old phone since iPhone is done with all excitement.
The code is an awesome work by Michael Herman.
My Spider Class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import *
from craig.items import CraiglistSampleItem
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["craigslist.org"]
start_urls = ["http://minneapolis.craigslist.org/moa/"]
rules = (Rule (SgmlLinkExtractor(allow=("index\d00\.html", ),restrict_xpaths=('//p[#class="nextpage"]',))
, callback="parse_items", follow= True),
)
def parse_items(self,response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//span[#class='pl']")
items = []
for titles in titles:
item = CraiglistSampleItem()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
return items
And the Item class
from scrapy.item import Item, Field
class CraiglistSampleItem(Item):
title = Field()
link = Field()
Since the code will traverse many links , hence I wanted to save the description of each cell phone in sepearte csv but one more column in csv will be fine also.
Any lead !!!
Instead of returning items in parse_items method you should return/yield scrapy Request instance in order to get the description from the item page, link and title you can pass inside of an Item, and Item inside of the meta dictionary:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
from scrapy.item import Item, Field
class CraiglistSampleItem(Item):
title = Field()
link = Field()
description = Field()
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["craigslist.org"]
start_urls = ["http://minneapolis.craigslist.org/moa/"]
rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//p[#class="nextpage"]',))
, callback="parse_items", follow=True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//span[#class='pl']")
for title in titles:
item = CraiglistSampleItem()
item["title"] = title.select("a/text()").extract()[0]
item["link"] = title.select("a/#href").extract()[0]
url = "http://minneapolis.craigslist.org%s" % item["link"]
yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)
def parse_item_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
item['description'] = hxs.select('//section[#id="postingbody"]/text()').extract()
return item
Run it and see additional description column in your output csv file.
Hope that helps.

How to use Request function in a Scrapy Spider?

from string import join
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.http.request import Request
from scrapy.selector import HtmlXPathSelector
from Gfire.items import GfireItem
class GuideSpider(CrawlSpider):
name = "Gfire"
allowed_domains = ['www.example.com']
start_urls = [
"http://www.example.com/gfire/guides"
]
rules = (
Rule(SgmlLinkExtractor(allow=("gfire/guides.*page=")), callback='parse_item', follow=True),
)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
items = []
sites = hxs.select('//div[#class="title"]')
for site in sites:
item = GFireItem()
item['title'] = site.select('./a/text()').extract()
item['guide_url'] = site.select('./a/#href').extract()
item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
items.append(item)
return Request(items[1], callback=self.parse_item2)
def parse_item2(self, response):
hxs = HtmlXPathSelector(response)
hero = hxs.select("//h3/a/text()").extract()
return hero
Can't get this spider to work. The request function contains items[1] that should be item['guide_url'] but it says me that the parameter has to be str or unicode.
How can I corret this error? And how can I pass to the callback function the items list? Via request.meta?
Your item[1] is actually an instance of GFireItem.
I'm not certain why you are creating these as you only use one (the second site in your list of sites), discarding the rest of the list.
That aside, you need to extract the items[1]['guide_url'] url when creating the Request:
return Request(items[1]['guide_url'], callback=self.parse_item2)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
items = []
sites = hxs.select('//div[#class="title"]')
for site in sites:
item = GFireItem()
item['title'] = site.select('./a/text()').extract()
item['guide_url'] = site.select('./a/#href').extract()
item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
items.append(item)
return Request(items[1]['guide_url'], request.meta={'items':items}, callback=self.parse_item2)
def parse_item2(self, response):
items = response.meta["items"]
hxs = HtmlXPathSelector(response)
hero = hxs.select("//h3/a/text()").extract()
return hero

Categories

Resources