Python web recursive scraping error

I am trying to scrape multiple pages. It's structured like this:
--> Page 1 - Scrape links
-------> Page 2 - Scrape more links (some pages contain pagination) and data
------------> Page 3 - Scrape the data
It returns only 18 items, even though there are 127 pages (in the 2nd step) with 18 items per page, and it does not return author and author_link in the item.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as lext
from scrapy.selector import Selector
from scrapy.http import Request
from ror.items import RorItem


class RorSpiderSpider(CrawlSpider):
    name = "ror_spider"
    allowed_domains = ["example.com"]
    start_urls = (
        'http://www.example.com/',
    )
    rules = [
        Rule(lext(allow=("http://www.example.com/$"), restrict_xpaths=('//a[@class="nextpostslink"]',)), callback='parse', follow=True),
    ]

    def parse(self, response):
        links = Selector(response).xpath('//ul[@id="nav"]/li')
        for link in links:
            item = RorItem()
            item['menu_link'] = link.xpath('a/@href').extract()[0]
            item['menu_title'] = link.xpath('a/text()').extract()[0]
            if "http" not in item['menu_link']:
                item['menu_link'] = "http://www.reviewofreligions.org" + ''.join(item['menu_link'])
                yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)
            else:
                yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)

    def parse_articles(self, response):
        sel = Selector(response)
        item = response.meta['item']
        if "articles" in item['menu_link']:
            item['link_cat'] = item['menu_title']
            pg = 1
            maxPgs = 124
            while pg <= 124:
                item['article_pg_link'] = item['menu_link'] + "page/" + str(pg) + "/"
                article_links = sel.xpath('//div[@id="rightcol"]/div[@class="articlebox"]')
                for art_link in article_links:
                    item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract()[0]
                    item['article_title'] = art_link.xpath('a[@class="title "]/text()').extract()[0].replace('\n\t\t\t\t', '').replace('\t\t\t\t', '')
                    # article_txt_1 = art_link.xpath('text()').extract()[1].replace('\n \n\t\t\t\t', '').replace('\t\t\t\t', '').replace('\n \n', '')
                    # article_txt_2 = art_link.xpath('text()').extract()[2].replace('\n \n\t\t\t\t', '') if art_link.xpath('text()').extract()[2] else ''
                    # item['article_txt'] = article_txt_1 + '\n'.join(article_txt_2).replace('\n\n\n \n\n\n \n \n \n \n\n\n\t\n\t\n\t', '')
                    yield Request(url=item['article_link'], meta={'item': item}, callback=self.article_page)
                pg += 1

    def article_page(self, response):
        select = Selector(response)
        item = response.meta['item']
        item['author'] = select.xpath('//div[@id="author"]/a/text()').extract()
        item['author_link'] = select.xpath('//div[@id="author"]/a/@href').extract()
        return item
What is wrong in the code?
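As a minimal sketch of the pagination step described above, parse_articles could issue one Request per listing page instead of looping over a single response (parse_article_list is a hypothetical helper name; the selectors and the 124-page count are taken from the code above):

    # Sketch: request each listing page explicitly and copy the item per
    # article, so later pages do not overwrite fields on a shared item.
    def parse_articles(self, response):
        item = response.meta['item']
        if "articles" in item['menu_link']:
            item['link_cat'] = item['menu_title']
            for pg in range(1, 125):  # 124 listing pages, as in the question
                page_url = item['menu_link'] + "page/" + str(pg) + "/"
                yield Request(url=page_url, meta={'item': item.copy()},
                              callback=self.parse_article_list)

    def parse_article_list(self, response):
        base_item = response.meta['item']
        for art_link in response.xpath('//div[@id="rightcol"]/div[@class="articlebox"]'):
            item = base_item.copy()
            item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract_first()
            item['article_title'] = art_link.xpath('a[@class="title "]/text()').extract_first()
            yield Request(url=item['article_link'], meta={'item': item},
                          callback=self.article_page)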

Related

Scraping multiple pages with multiple start_urls

I want to scrape details returned in JSON form using Scrapy. There are multiple start_urls, and each start_url has multiple pages to scrape. I just can't work out the logic of how to do this.
import scrapy
from scrapy.http import Request

BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
            ]


class ChangeSpider(scrapy.Spider):
    name = 'change'

    def start_requests(self):
        for i in range(len(BASE_URL)):
            yield Request(BASE_URL[i], callback=self.parse)

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
Try like this:
import scrapy
from scrapy.http import Request


class ChangeSpider(scrapy.Spider):
    name = 'change'
    start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
                  ]

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
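Note that both snippets build next_page from the hard-coded animals-19 URL and share one class-level pageNumber across every tag, so the other tags all paginate the animals feed. A minimal sketch of per-tag pagination that derives the next offset from the current response URL (stepping the offset by 8 to match limit=8, and starting each start_url at offset=0, are assumptions):

import scrapy
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse


def bump_offset(url, step=8):
    # Rebuild the URL with its offset query parameter increased by `step`.
    parts = urlparse(url)
    query = parse_qs(parts.query)
    offset = int(query.get('offset', ['0'])[0]) + step
    query['offset'] = [str(offset)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))


class ChangeSpider(scrapy.Spider):
    name = 'change'
    # same tag URLs as above, each starting at offset=0
    start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=0&limit=8&show_promoted_cards=true"]

    def parse(self, response):
        data = response.json()
        for entry in data['items']:
            yield {"petition_id": entry['petition']['id']}
        if not data.get('last_page'):
            # paginate this tag's own URL rather than a shared class counter
            yield response.follow(bump_offset(response.url), callback=self.parse)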

Using Scrapy to extract data inside links

I have been trying to extract data from consumercomplaints.in: the titles and the data inside those title links. I wrote the following code, but I am unable to follow the links and extract the data, and I am also unable to extract all the related links. Please guide me.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    #start_urls =["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
        #Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
    )

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.select('.//td[@class="complaint"]/a/span/text()').extract()
            item['link'] = site.select('.//td[@class="complaint"]/a/@href').extract()
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield Request(item['link'],
                              meta={'item': item},
                              callback=self.anchor_page)
            # item['intro'] = site.select('.//td[@class="small"]//a[2]/text()').extract()
            # item['heading'] = site.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            # item['date'] = site.select('.//td[@class="small"]/text()[2]').extract()
            # item['complaint'] = site.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)

    def anchor_page(self, response):
        hxs = Selector(response)
        old_item = response.request.meta['item']  # receiving the parse() item that was passed in Request meta
        # parse some more values
        # place them in old_item
        # e.g.
        old_item['data'] = hxs.select('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
Are you using an old version of Scrapy?
In the latest stable version you don't need to do hxs = Selector(response) nor use the hxs.select() method; you can do the same thing with just response.xpath().
I think the problem in your code is that the result of select() (or response.xpath()) is actually a Python list, so you need to do:
link = site.select('.//td[@class="complaint"]/a/@href').extract()
if link:
    item['link'] = link[0]
You probably want to do a similar thing for title too.
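For instance, a similar guard for title might look like this (same selector as in your parse() method):

title = site.select('.//td[@class="complaint"]/a/span/text()').extract()
if title:
    item['title'] = title[0]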
EDIT: I got it working with a few changes:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        Rule(
            SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
            callback="parse",
            follow=True),
    )

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item

Scrapy Webcrawler and Data Extractor

I am trying to create a web crawler with Scrapy, using a template I have used before, but I can't seem to get it to parse the URLs. I can see it go to YouTube and then to the watch page, but from there it won't pull the title or description or anything, because it always fails to parse.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log
from krakenkrawler.items import KrakenItem


class AttractionSpider(CrawlSpider):
    name = "thekraken"
    allowed_domains = ["youtube.com"]
    start_urls = [
        "http://www.youtube.com/?gl=GB&hl=en-GB"
    ]
    rules = ()

    def __init__(self, name=None, **kwargs):
        super(AttractionSpider, self).__init__(name, **kwargs)
        self.items_buffer = {}
        self.base_url = "http://www.youtube.com"
        from scrapy.conf import settings
        settings.overrides['DOWNLOAD_TIMEOUT'] = 360

    def parse(self, response):
        print "Start scrapping Attractions...."
        try:
            hxs = HtmlXPathSelector(response)
            links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
            if not links:
                return
                log.msg("No Data to scrap")
            for link in links:
                v_url = ''.join(link.extract())
                if not v_url:
                    continue
                else:
                    _url = self.base_url + v_url
                    yield Request(url=_url, callback=self.parse_details)
        except Exception as e:
            log.msg("Parsing failed for URL {%s}" % format(response.request.url))
            raise

    def parse_details(self, response):
        print "Start scrapping Detailed Info...."
        try:
            hxs = HtmlXPathSelector(response)
            l_venue = KrakenItem()
            v_name = hxs.select("//*[@id='eow-title'].text").extract()
            if not v_name:
                v_name = hxs.select("//*[@id='eow-title'].text").extract()
            l_venue["name"] = v_name[0].strip()
            base = hxs.select("//*[@id='content']/div[7]")
            if base.extract()[0].strip() == "<div style=\"clear:both\"></div>":
                base = hxs.select("//*[@id='content']/div[8]")
            elif base.extract()[0].strip() == "<div style=\"padding-top:10px;margin-top:10px;border-top:1px dotted #DDD;\">\n You must be logged in to add a tip\n </div>":
                base = hxs.select("//*[@id='content']/div[6]")
            x_datas = base.select("div[1]/b").extract()
            v_datas = base.select("div[1]/text()").extract()
            i_d = 0
            if x_datas:
                for x_data in x_datas:
                    print "data is:" + x_data.strip()
                    if x_data.strip() == "<b>Address:</b>":
                        l_venue["address"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Contact:</b>":
                        l_venue["contact"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Operating Hours:</b>":
                        l_venue["hours"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Website:</b>":
                        l_venue["website"] = (base.select("//*[@id='watch-actions-share-panel']/div/div[2]/div[2]/span[1]/input/text()").extract())[0].strip()
                    i_d += 1
            v_photo = base.select("img/@src").extract()
            if v_photo:
                l_venue["photo"] = v_photo[0].strip()
            v_desc = base.select("div[3]/text()").extract()
            if v_desc:
                desc = ""
                for dsc in v_desc:
                    desc += dsc
                l_venue["desc"] = desc.strip()
            v_video = hxs.select("//*[@id='content']/iframe/@src").extract()
            if v_video:
                l_venue["video"] = v_video[0].strip()
            yield l_venue
        except Exception as e:
            log.msg("Parsing failed for URL {%s}" % format(response.request.url))
            raise
Thanks a ton in advance.
The problem is that the structure you are looking for, "//h3[@class='yt-lockup-title']//a/@href", is not present on all pages.
I modified your code to verify what pages are opened and what data are extracted:
class AttractionSpider(CrawlSpider):
    name = "thekraken"
    bot_name = 'kraken'
    allowed_domains = ["youtube.com"]
    start_urls = ["http://www.youtube.com/?gl=GB&hl=en-GB"]
    rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        print "Start scrapping Attractions...."
        print response.url
        try:
            hxs = HtmlXPathSelector(response)
            links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
            for link in links:
                v_url = ''.join(link.extract())
                print v_url
            if not links:
                log.msg("No Data to scrap")
        except:
            pass
Result is something like this:
Start scrapping Attractions....http://www.youtube.com/watch?v=GBdCbciGLK0
Start scrapping Attractions....http://www.youtube.com/watch?v=BxUjDpnSHyc&list=TL4PEfm95Wz3k
Start scrapping Attractions.... http://www.youtube.com/watch?v=T-CZW4YjAig
Start scrapping Attractions....
https://www.youtube.com/user/ComedyShortsGamer
/watch?v=TdICODRvAhc&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=CDGzm5edrlw&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F2oR5KS54JM&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=LHRzOIvqmQI&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F4iqiM6h-2U&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=ug3UPIvWlvU&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=msiZs6lIZ9w&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=Jh6A3DoOLBg&list=UUrqsNpKuDQZreGaxBL_a5Jg
On the inner pages where no results are scraped, there are no "yt-lockup-title" classes.
In brief, you have to improve your spider.
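One way to tighten it up (a sketch, not the original answer): restrict the rule to watch pages and skip responses where the selector matches nothing. The /watch\?v= pattern is an assumption; the rest reuses names from the spider above.

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    # inside AttractionSpider: only follow video watch pages
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/watch\?v=',)), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
        if not links:
            return  # this page has no video listings to scrape
        for link in links:
            yield Request(url=self.base_url + ''.join(link.extract()),
                          callback=self.parse_details)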

Scrapy: Rule SgmlLinkExtractor concept

Please guide me on how to write a Rule with SgmlLinkExtractor.
I am confused and can't figure it out from the English documentation.
I want to crawl a site with many pages, and the URL pattern is:
http://abctest.com/list.php?c=&&page=1
http://abctest.com/list.php?c=&&page=2
http://abctest.com/list.php?c=&&page=3 ...
Here is my code:
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re


class Spider(CrawlSpider):
    name = "find"
    start_urls = ["http://abctest.com/list.php?c=&&page=1",]

    #crawl 2 pages to test if the data is normal  allow=('?c=&&page=/d+')
    rules = [Rule(SgmlLinkExtractor(allow=('?c=&&page=2')), callback='parse_item', follow=True)]

    #get the page1 item
    def parse(self, response):
        sel = Selector(response)
        sites = sel.css("div#list table tr ")
        for site in sites:
            item = LAItem()
            item['day'] = site.css(" td.date::text ").extract()
            item['URL'] = site.css(" td.subject a::attr(href) ").extract()
            yield item

    #get the page2 item
    def parse_item(self, response):
        sel = Selector(response)
        sites = sel.css("div#list table tr ")
        for site in sites:
            item = LAItem()
            item['day'] = site.css(" td.date::text ").extract()
            item['URL'] = site.css(" td.subject a::attr(href) ").extract()
            yield item
You don't really need a LinkExtractor and a CrawlSpider here - a regular Spider is enough. What you need is to define a start_requests() method and yield requests from it:
from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

URL = 'http://abctest.com/list.php?c=&&page={page}'


class Spider(Spider):
    handle_httpstatus_list = [404]
    name = "find"

    def start_requests(self):
        index = 1
        while True:
            yield Request(URL.format(page=index))
            index += 1

    def parse(self, response):
        if response.status == 404:
            raise CloseSpider("Met the page which doesn't exist")

        sel = Selector(response)
        sites = sel.css("div#list table tr ")
        for site in sites:
            item = LAItem()
            item['day'] = site.css(" td.date::text ").extract()
            item['URL'] = site.css(" td.subject a::attr(href) ").extract()
            yield item
Note that the trick here is to keep requesting pages until we meet the first response with 404 - Page not found. This should make it work for any number of pages.
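Both snippets reference LAItem without defining it; assuming day and URL are its only fields, a matching item definition would be:

import scrapy


class LAItem(scrapy.Item):
    # fields matching the selectors used in parse()
    day = scrapy.Field()
    URL = scrapy.Field()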

Scraping multiple pages with Scrapy

I am trying to use Scrapy to scrape a website that has several pages of information.
My code is:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from tcgplayer1.items import Tcgplayer1Item


class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["http://www.tcgplayer.com/"]
    start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?PageNumber=1"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[@class='magicCard']")
        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]

            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item
I am trying to scrape all the pages until it reaches the end... sometimes there will be more pages than others, so it's hard to say exactly where the page numbers end.
The idea is to increment pageNumber until no titles are found. If there are no titles on the page, raise a CloseSpider exception to stop the spider:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from tcgplayer1.items import Tcgplayer1Item

URL = "http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=%d"


class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [URL % 1]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        print self.page_number
        print "----------"

        sel = Selector(response)
        titles = sel.xpath("//div[@class='magicCard']")
        if not titles:
            raise CloseSpider('No more pages')

        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]

            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item

        self.page_number += 1
        yield Request(URL % self.page_number)
This particular spider would go through all 8 pages of the data, then stop.
Hope that helps.
