Scraper not finding pages - python

I have a spider written as below, but it doesn't seem to be getting to the function parse. Could someone take a quick look and let me know if I'm missing something. Am I implementing the SgmlLinkExtractor properly?
The spider should pick out all the links from the left sidebar, create a request from them, then parse the next page for a facebook link. It should also do this for other pages as specified in the SgmlLinkExtractor. At the moment, the spider is running, but not parsing any pages.
class PrinzSpider(CrawlSpider):
    name = "prinz"
    allowed_domains = ["prinzwilly.de"]
    start_urls = ["http://www.prinzwilly.de/"]

    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r'veranstaltungen-(.*)', ),
            ),
            callback='parse'
        ),
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        startlinks = hxs.select("//ul[@id='mainNav2']/li/a")
        print startlinks
        for link in startlinks:
            giglink = link.select('@href').extract()
            item = GigItem()
            item['gig_link'] = giglink
            request = Request(item['gig_link'], callback='parse_gig_page')
            item.meta['item'] = item
            yield request

    def parse_gig_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        gig_content = hxs.select("//div[@class='n']/table/tbody").extract()
        fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
        print '********** FB LINK ********', fb_link
        return item
EDIT:
settings.py
BOT_NAME = 'gigscraper'
SPIDER_MODULES = ['gigscraper.spiders']
NEWSPIDER_MODULE = 'gigscraper.spiders'
ITEM_PIPLINES = ['gigscraper.pipelines.GigscraperPipeline']
items.py
from scrapy.item import Item, Field
class GigItem(Item):
    gig_link = Field()
pipelines.py
class GigscraperPipeline(object):
    def process_item(self, item, spider):
        print 'here I am in the pipeline'
        return item

Two problems:
1. extract() returns a list, you are missing [0].
2. Request's callback should not be a string, use self.parse_gig_page.
Here's the modified code (working):
import re

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector


class GigItem(Item):
    gig_link = Field()


class PrinzSpider(CrawlSpider):
    name = "prinz"
    allowed_domains = ["prinzwilly.de"]
    start_urls = ["http://www.prinzwilly.de/"]

    rules = (Rule(SgmlLinkExtractor(allow=(r'veranstaltungen-(.*)',)), callback='parse'),)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        startlinks = hxs.select("//ul[@id='mainNav2']/li/a")
        for link in startlinks:
            item = GigItem()
            item['gig_link'] = link.select('@href').extract()[0]
            yield Request(item['gig_link'], callback=self.parse_gig_page, meta={'item': item})

    def parse_gig_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        gig_content = hxs.select("//div[@class='n']/table/tbody").extract()[0]
        fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
        print '********** FB LINK ********', fb_link
        return item
Hope that helps.

Related

using scrapy extracting data inside links

I have been trying to extract data from consumercomplaints.in: the titles and the data inside those title links. I wrote the following code, but I am unable to follow the links and extract the data, and I am also unable to extract all the related links. Please guide me.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin

from comp.items import CompItem


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    #start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
        #Rule(SgmlLinkExtractor(allow=("startrow=\d",)), callback="parse_health", follow=True),
    )

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.select('.//td[@class="complaint"]/a/span/text()').extract()
            item['link'] = site.select('.//td[@class="complaint"]/a/@href').extract()
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield Request(item['link'],
                              meta={'item': item},
                              callback=self.anchor_page)
            # item['intro'] = site.select('.//td[@class="small"]//a[2]/text()').extract()
            # item['heading'] = site.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            # item['date'] = site.select('.//td[@class="small"]/text()[2]').extract()
            # item['complaint'] = site.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)

    def anchor_page(self, response):
        hxs = Selector(response)
        old_item = response.request.meta['item']  # Receiving parse method item that was in Request meta
        # parse some more values,
        # place them in old_item, e.g.:
        old_item['data'] = hxs.select('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
Are you using an old version of Scrapy?
In the latest stable version you don't need to do hxs = Selector(response) or use the hxs.select() method; you can do the same thing with response.xpath().
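For illustration, a minimal sketch of the same selection written both ways (the XPath is taken from the code above):

# older style: wrap the response in a Selector and call .select()
hxs = Selector(response)
links = hxs.select('//td[@class="complaint"]/a/@href').extract()

# newer style: call .xpath() on the response directly
links = response.xpath('//td[@class="complaint"]/a/@href').extract()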
I think the problem in your code is that the result of .extract() is actually a Python list, so you need to do:
link = site.select('.//td[@class="complaint"]/a/@href').extract()
if link:
    item['link'] = link[0]
You probably want to do a similar thing for title too.
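For example, a minimal sketch of the same guard applied to the title field (the selector is taken from the question's code):

title = site.select('.//td[@class="complaint"]/a/span/text()').extract()
if title:
    item['title'] = title[0]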
EDIT: I got it working with a few changes:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]

    rules = (
        Rule(
            SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
            callback="parse",
            follow=True),
    )

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item

Scrapy Recursive download of Content

After banging my head several times, I am finally coming here.
Problem: I am trying to download the content of each craigslist posting. By content I mean the "posting body", i.e. the description of the cell phone. I'm looking for a new old phone, since the iPhone is done with all the excitement.
The code is an awesome work by Michael Herman.
My Spider Class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import *

from craig.items import CraiglistSampleItem


class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://minneapolis.craigslist.org/moa/"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("index\d00\.html",), restrict_xpaths=('//p[@class="nextpage"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        items = []
        for titles in titles:
            item = CraiglistSampleItem()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
And the Item class
from scrapy.item import Item, Field


class CraiglistSampleItem(Item):
    title = Field()
    link = Field()
Since the code will traverse many links, I wanted to save the description of each cell phone in a separate CSV, but one more column in the CSV would be fine as well.
Any leads?
Instead of returning items in the parse_items method, you should return/yield a scrapy Request instance in order to get the description from the item page; the link and title can be passed inside an Item, and the Item inside the meta dictionary:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
from scrapy.item import Item, Field


class CraiglistSampleItem(Item):
    title = Field()
    link = Field()
    description = Field()


class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://minneapolis.craigslist.org/moa/"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("index\d00\.html",), restrict_xpaths=('//p[@class="nextpage"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        for title in titles:
            item = CraiglistSampleItem()
            item["title"] = title.select("a/text()").extract()[0]
            item["link"] = title.select("a/@href").extract()[0]
            url = "http://minneapolis.craigslist.org%s" % item["link"]
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item['description'] = hxs.select('//section[@id="postingbody"]/text()').extract()
        return item
Run it and see additional description column in your output csv file.
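If you would rather configure the CSV export in the project instead of on the command line, a minimal sketch of the feed-export settings (the file name items.csv is just an example):

# settings.py -- write scraped items to a CSV feed
# roughly equivalent to running: scrapy crawl craigs -o items.csv -t csv
FEED_FORMAT = 'csv'
FEED_URI = 'items.csv'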
Hope that helps.

scrapy scrape extra field in linked page

I am trying to scrape some posts on the primary page, where almost everything I need is available. But on the linked page there is a date field I additionally need. I tried a callback with the following:
from scrapy.spider import BaseSpider
from macnn_com.items import MacnnComItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import MapCompose, Join
from scrapy.http.request import Request


class MacnnSpider(BaseSpider):
    name = 'macnn_com'
    allowed_domains = ['macnn.com']
    start_urls = ['http://www.macnn.com']

    posts_list_xpath = '//div[@class="post"]'
    item_fields = {'title': './/h1/a/text()',
                   'link': './/h1/a/@href',
                   'summary': './/p/text()',
                   'image': './/div[@class="post_img"]/div[@class="post_img_border"]/a/img/@original'}

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # iterate over posts
        for qxs in hxs.select(self.posts_list_xpath):
            loader = XPathItemLoader(MacnnComItem(), selector=qxs)
            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # skip posts with empty titles
            if loader.get_xpath('.//h1/a/text()') == []:
                continue
            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            request = Request(loader.get_xpath('.//h1/a/@href')[0], callback=self.parse_link, meta={'loader': loader})
            yield request
            #loader.add_value('datums', request)
            yield loader.load_item()

    def parse_link(self, response):
        loader = response.meta["loader"]
        hxs = HtmlXPathSelector(response)
        hero = hxs.select("//div[@class='post_header']/h2/text()").extract()
        loader.add_value('datums', hero)
        return loader
But I get errors like
ERROR: Spider must return Request, BaseItem or None, got 'XPathItemLoader' in <GET http://www.macnn.com/articles/13/06/14/sidebar.makes.it.easier.to.jump.between.columns/>
What am I doing wrong here?
parse_link needs to return an item, not the loader.
def parse_link(self, response):
    loader = response.meta["loader"]
    hxs = HtmlXPathSelector(response)
    hero = hxs.select("//div[@class='post-header']/h2/text()").extract()
    loader.add_value('datums', hero)
    return loader.load_item()

How to mention link extractor rules when using BaseSpider in scrapy

Suppose this is my code
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from dmoz.items import DmozItem


class DmozSpider(BaseSpider):
    domain_name = "dmoz.org"
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[2]/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items

SPIDER = DmozSpider()
If I had used CrawlSpider I could use Rules to implement the link extractor, but how can I specify rules in a BaseSpider, like in the above example? Rules are only available in CrawlSpider, not in BaseSpider.
Perhaps you could parse the response for your rule criteria and then pass the successful responses on to a second callback? Pseudo-code below:
def parse(self, response):
    # check response for rule criteria
    ...
    if rule:
        # create new request to pass to second callback
        req = Request("http://www.example.com/follow", callback=self.parse2)
        return req

def parse2(self, response):
    hxs = HtmlXPathSelector(response)
    # do stuff with the successful response
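A minimal concrete sketch of that idea, staying with BaseSpider; the spider name, URL pattern, and XPaths below are illustrative assumptions rather than something taken from the question:

import re
from urlparse import urljoin

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request


class DmozFollowSpider(BaseSpider):
    # illustrative spider, not the original DmozSpider
    name = "dmoz_follow"
    allowed_domains = ["dmoz.org"]
    start_urls = ["http://www.dmoz.org/Computers/Programming/Languages/Python/Books/"]

    # hand-rolled equivalent of SgmlLinkExtractor(allow=...): follow links matching this pattern
    follow_pattern = re.compile(r'/Computers/Programming/Languages/Python/')

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # emulate the rule: only matching hrefs are turned into Requests
        for href in hxs.select('//a/@href').extract():
            if self.follow_pattern.search(href):
                yield Request(urljoin(response.url, href), callback=self.parse2)

    def parse2(self, response):
        hxs = HtmlXPathSelector(response)
        # do stuff with the successful response (extract items here)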

How to use Request function in a Scrapy Spider?

from string import join

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.http.request import Request
from scrapy.selector import HtmlXPathSelector

from Gfire.items import GfireItem


class GuideSpider(CrawlSpider):
    name = "Gfire"
    allowed_domains = ['www.example.com']
    start_urls = [
        "http://www.example.com/gfire/guides"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=("gfire/guides.*page=")), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        sites = hxs.select('//div[@class="title"]')
        for site in sites:
            item = GFireItem()
            item['title'] = site.select('./a/text()').extract()
            item['guide_url'] = site.select('./a/@href').extract()
            item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
            items.append(item)
        return Request(items[1], callback=self.parse_item2)

    def parse_item2(self, response):
        hxs = HtmlXPathSelector(response)
        hero = hxs.select("//h3/a/text()").extract()
        return hero
I can't get this spider to work. The Request call contains items[1], which should be item['guide_url'], but it tells me that the parameter has to be str or unicode.
How can I correct this error? And how can I pass the items list to the callback function? Via request.meta?
Your items[1] is actually an instance of GFireItem, not a URL string.
I'm not certain why you are creating these as you only use one (the second site in your list of sites), discarding the rest of the list.
That aside, you need to extract the items[1]['guide_url'] url when creating the Request:
return Request(items[1]['guide_url'], callback=self.parse_item2)
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    sites = hxs.select('//div[@class="title"]')
    for site in sites:
        item = GFireItem()
        item['title'] = site.select('./a/text()').extract()
        item['guide_url'] = site.select('./a/@href').extract()
        item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
        items.append(item)
    return Request(items[1]['guide_url'], meta={'items': items}, callback=self.parse_item2)

def parse_item2(self, response):
    items = response.meta["items"]
    hxs = HtmlXPathSelector(response)
    hero = hxs.select("//h3/a/text()").extract()
    return hero
