using scrapy extracting data inside links

using scrapy extracting data inside links - python

I have been trying to extract data from consumercomplaints.in the title and the data inside those title links.I wrote the following code and unable to parse through the links and extract the data and also I am unable to extract all the links related.plz guide
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem
class criticspider(CrawlSpider):
name ="comp"
allowed_domains =["consumercomplaints.in"]
#start_urls =["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
start_urls=["http://www.consumercomplaints.in/?search=delhivery"]
rules=(
Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
#Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
)
def parse(self,response):
hxs = Selector(response)
sites = hxs.select('//table[#width="100%"]')
items = []
for site in sites:
item = CompItem()
item['title'] = site.select('.//td[#class="complaint"]/a/span/text()').extract()
item['link'] = site.select('.//td[#class="complaint"]/a/#href').extract()
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
# item['intro'] = site.select('.//td[#class="small"]//a[2]/text()').extract()
# item['heading'] = site.select('.//td[#class="compl-text"]/div/b[1]/text()').extract()
# item['date'] = site.select('.//td[#class="small"]/text()[2]').extract()
# item['complaint'] = site.select('.//td[#class="compl-text"]/div/text()').extract()
items.append(item)
def anchor_page(self, response):
hxs = Selector(response)
old_item = response.request.meta['item'] # Receiving parse Method item that was in Request meta
# parse some more values
#place them in old_item
#e.g
old_item['data']=hxs.select('.//td[#class="compl-text"]/div/text()').extract()
yield old_item

Are you using an old version of Scrapy?
In the latest stable version you don't need to do hxs = Selector(response) nor using the hxs.select() method. You can do the same thing just with response.xpath().
I think the problem in your code is that the result of select() (or response.xpath) is actually a Python list, so you need to do:
link = site.select('.//td[#class="complaint"]/a/#href').extract()
if link:
item['link'] = link[0]
You probably want to do a similar thing for title too.
EDIT: I got it working with a few changes:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
data = scrapy.Field()
class criticspider(CrawlSpider):
name = "comp"
allowed_domains = ["consumercomplaints.in"]
start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
rules = (
Rule(
SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
callback="parse",
follow=True),
)
def parse(self, response):
sites = response.xpath('//table[#width="100%"]')
items = []
for site in sites:
item = CompItem()
item['title'] = site.xpath('.//td[#class="complaint"]/a/span/text()').extract()[0]
item['link'] = site.xpath('.//td[#class="complaint"]/a/#href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
old_item['data'] = response.xpath('.//td[#class="compl-text"]/div/text()').extract()
yield old_item

Related

scraping url and title from nested anchor tag

This is my first scraper using scrapy.
I am trying to scrap video url, title from https://www.google.co.in/trends/hotvideos#hvsm=0 site.
import scrapy
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class CraigslistItem(Item):
title = Field()
link = Field()
class DmozSpider(scrapy.Spider):
name = "google"
allowed_domains = ["google.co.in"]
start_urls = [
"https://www.google.co.in/trends/hotvideos#hvsm=0"
]
def parse(self, response):
#for sel in response.xpath('//body/div'):
hxs = HtmlXPathSelector(response)
sites = hxs.xpath("//span[#class='single-video-image-container']")
items = []
for sel in response.xpath("//span[#class='single-video-image-container']"):
item = CraigslistItem()
item['title'] = sel.xpath('a/text()').extract()
item['link'] = sel.xpath('a/#href').extract()
items.append(item)
print items
General walk through of what I am doing wrong would be much appreciable.

Use the help Scrapy FormRequest to get it done.
from scrapy.http import FormRequest
import json
class DmozSpider(scrapy.Spider):
name = "google"
allowed_domains = ["google.co.in"]
start_urls = [
"https://www.google.co.in/trends/hotvideos#hvsm=0"
]
def parse(self, response):
url = 'https://www.google.co.in/trends/hotvideos/hotItems'
formdata = {'hvd':'','geo': 'IN','mob': '0','hvsm': '0'}
yield FormRequest(url=url, formdata=formdata, callback=self.parse_data)
def parse_data(self, response):
json_response = json.loads(response.body)
videos = json_response.get('videoList')
for video in videos:
item = CraigslistItem()
item['title'] = video.get('title')
item['link'] = video.get('url')
yield item

Scrapy: Rule SgmlLinkExtractor concept

Please guide me how to write Rule SgmlLinkExtractor
I am confused and can't figure out the english documents
I want to crawl the web with many pages
And the rule is :
http://abctest.com/list.php?c=&&page=1
http://abctest.com/list.php?c=&&page=2
http://abctest.com/list.php?c=&&page=3 ...
Here is my code:
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re
class Spider(CrawlSpider):
name = "find"
start_urls = ["http://abctest.com/list.php?c=&&page=1",]
#crawl 2 pages to test if the data is normal allow=('?c=&&page=/d+')
rules = [Rule(SgmlLinkExtractor(allow=('?c=&&page=2')),callback='parse_item',follow=True)]
#get the page1 item
def parse(self, response):
sel = Selector(response)
sites = sel.css("div#list table tr ")
for site in sites:
item = LAItem()
item['day'] = site.css(" td.date::text ").extract()
item['URL'] = site.css(" td.subject a::attr(href) ").extract()
yield item
#get the page2 item
def parse_item(self, response):
sel = Selector(response)
sites = sel.css("div#list table tr ")
for site in sites:
item = LAItem()
item['day'] = site.css(" td.date::text ").extract()
item['URL'] = site.css(" td.subject a::attr(href) ").extract()
yield item

You don't really need a LinkExtractor and CrawlSpider here - just regular Spider. What you need is to define start_requests() method and yield requests from it:
from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
URL = 'http://abctest.com/list.php?c=&&page={page}'
class Spider(Spider):
handle_httpstatus_list = [404]
name = "find"
def start_requests(self):
index = 1
while True:
yield Request(URL.format(page=index))
index +=1
def parse(self, response):
if response.status == 404:
raise CloseSpider("Met the page which doesn't exist")
sel = Selector(response)
sites = sel.css("div#list table tr ")
for site in sites:
item = LAItem()
item['day'] = site.css(" td.date::text ").extract()
item['URL'] = site.css(" td.subject a::attr(href) ").extract()
yield item
Note that the trick here is to continue getting the pages until we meet the first response with 404 - Page not found. This should make it work for any number of pages.

Scraper not finding pages

I have a spider written as below, but it doesn't seem to be getting to the function parse. Could someone take a quick look and let me know if I'm missing something. Am I implementing the SgmlLinkExtractor properly?
The spider should pick out all the links from the left sidebar, create a request from them, then parse the next page for a facebook link. It should also do this for other pages as specified in the SgmlLinkExtractor. At the moment, the spider is running, but not parsing any pages.
class PrinzSpider(CrawlSpider):
name = "prinz"
allowed_domains = ["prinzwilly.de"]
start_urls = ["http://www.prinzwilly.de/"]
rules = (
Rule(
SgmlLinkExtractor(
allow=(r'veranstaltungen-(.*)', ),
),
callback='parse'
),
)
def parse(self, response):
hxs = HtmlXPathSelector(response)
startlinks = hxs.select("//ul[#id='mainNav2']/li/a")
print startlinks
for link in startlinks:
giglink = link.select('#href').extract()
item = GigItem()
item['gig_link'] = giglink
request = Request(item['gig_link'], callback='parse_gig_page')
item.meta['item'] = item
yield request
def parse_gig_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
gig_content = hxs.select("//div[#class='n']/table/tbody").extract()
fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
print '********** FB LINK ********', fb_link
return item
EDIT **
settings.py
BOT_NAME = 'gigscraper'
SPIDER_MODULES = ['gigscraper.spiders']
NEWSPIDER_MODULE = 'gigscraper.spiders'
ITEM_PIPLINES = ['gigscraper.pipelines.GigscraperPipeline']
items.py
from scrapy.item import Item, Field
class GigItem(Item):
gig_link = Field()
pipelines.py
class GigscraperPipeline(object):
def process_item(self, item, spider):
print 'here I am in the pipeline'
return item

Two problems:
extract() returns a list, you are missing [0]
Request's callback should not be a string, use self.parse_gig_page
Here's the modified code (working):
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
class GigItem(Item):
gig_link = Field()
class PrinzSpider(CrawlSpider):
name = "prinz"
allowed_domains = ["prinzwilly.de"]
start_urls = ["http://www.prinzwilly.de/"]
rules = (Rule(SgmlLinkExtractor(allow=(r'veranstaltungen-(.*)',)), callback='parse'),)
def parse(self, response):
hxs = HtmlXPathSelector(response)
startlinks = hxs.select("//ul[#id='mainNav2']/li/a")
for link in startlinks:
item = GigItem()
item['gig_link'] = link.select('#href').extract()[0]
yield Request(item['gig_link'], callback=self.parse_gig_page, meta={'item': item})
def parse_gig_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
gig_content = hxs.select("//div[#class='n']/table/tbody").extract()[0]
fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
print '********** FB LINK ********', fb_link
return item
Hope that helps.

How to mention link extractor rules when using BaseSpider in scrapy

Suppose this is my code
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from dmoz.items import DmozItem
class DmozSpider(BaseSpider):
domain_name = "dmoz.org"
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//ul[2]/li')
items = []
for site in sites:
item = DmozItem()
item['title'] = site.select('a/text()').extract()
item['link'] = site.select('a/#href').extract()
item['desc'] = site.select('text()').extract()
items.append(item)
return items
SPIDER = DmozSpider()
If i have used crawlSpider then i could uses Rules to implement thelink extractor but how can i mention rules in base spider. Like in above example. Because rules is only avaialble in crawlspider not base spider

Perhaps you could parse the response for your rule criteria and then pass the successful responses on to a second callback? Pseudo-code below:
def parse(self, response):
# check response for rule criteria
...
if rule:
# create new request to pass to second callback
req = Request("http://www.example.com/follow", callback=self.parse2)
return req
def parse2(self, response):
hxs = HtmlXPathSelector(response)
# do stuff with the successful response

How to use Request function in a Scrapy Spider?

from string import join
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.http.request import Request
from scrapy.selector import HtmlXPathSelector
from Gfire.items import GfireItem
class GuideSpider(CrawlSpider):
name = "Gfire"
allowed_domains = ['www.example.com']
start_urls = [
"http://www.example.com/gfire/guides"
]
rules = (
Rule(SgmlLinkExtractor(allow=("gfire/guides.*page=")), callback='parse_item', follow=True),
)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
items = []
sites = hxs.select('//div[#class="title"]')
for site in sites:
item = GFireItem()
item['title'] = site.select('./a/text()').extract()
item['guide_url'] = site.select('./a/#href').extract()
item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
items.append(item)
return Request(items[1], callback=self.parse_item2)
def parse_item2(self, response):
hxs = HtmlXPathSelector(response)
hero = hxs.select("//h3/a/text()").extract()
return hero
Can't get this spider to work. The request function contains items[1] that should be item['guide_url'] but it says me that the parameter has to be str or unicode.
How can I corret this error? And how can I pass to the callback function the items list? Via request.meta?

Your item[1] is actually an instance of GFireItem.
I'm not certain why you are creating these as you only use one (the second site in your list of sites), discarding the rest of the list.
That aside, you need to extract the items[1]['guide_url'] url when creating the Request:
return Request(items[1]['guide_url'], callback=self.parse_item2)

def parse_item(self, response):
hxs = HtmlXPathSelector(response)
items = []
sites = hxs.select('//div[#class="title"]')
for site in sites:
item = GFireItem()
item['title'] = site.select('./a/text()').extract()
item['guide_url'] = site.select('./a/#href').extract()
item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
items.append(item)
return Request(items[1]['guide_url'], request.meta={'items':items}, callback=self.parse_item2)
def parse_item2(self, response):
items = response.meta["items"]
hxs = HtmlXPathSelector(response)
hero = hxs.select("//h3/a/text()").extract()
return hero

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

using scrapy extracting data inside links - python

Related

scraping url and title from nested anchor tag

Scrapy: Rule SgmlLinkExtractor concept

Scraper not finding pages

How to mention link extractor rules when using BaseSpider in scrapy

How to use Request function in a Scrapy Spider?

Categories

Resources