Scrapy Recursive download of Content

Scrapy Recursive download of Content - python

After banging my head several time, I am finally coming here.
Problem : I am trying to download the content of each of the craiglist posting. By content I mean the "posting body" like description of the cell phone. Looking for a new old phone since iPhone is done with all excitement.
The code is an awesome work by Michael Herman.
My Spider Class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import *
from craig.items import CraiglistSampleItem
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["craigslist.org"]
start_urls = ["http://minneapolis.craigslist.org/moa/"]
rules = (Rule (SgmlLinkExtractor(allow=("index\d00\.html", ),restrict_xpaths=('//p[#class="nextpage"]',))
, callback="parse_items", follow= True),
)
def parse_items(self,response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//span[#class='pl']")
items = []
for titles in titles:
item = CraiglistSampleItem()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
return items
And the Item class
from scrapy.item import Item, Field
class CraiglistSampleItem(Item):
title = Field()
link = Field()
Since the code will traverse many links , hence I wanted to save the description of each cell phone in sepearte csv but one more column in csv will be fine also.
Any lead !!!

Instead of returning items in parse_items method you should return/yield scrapy Request instance in order to get the description from the item page, link and title you can pass inside of an Item, and Item inside of the meta dictionary:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
from scrapy.item import Item, Field
class CraiglistSampleItem(Item):
title = Field()
link = Field()
description = Field()
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["craigslist.org"]
start_urls = ["http://minneapolis.craigslist.org/moa/"]
rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//p[#class="nextpage"]',))
, callback="parse_items", follow=True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//span[#class='pl']")
for title in titles:
item = CraiglistSampleItem()
item["title"] = title.select("a/text()").extract()[0]
item["link"] = title.select("a/#href").extract()[0]
url = "http://minneapolis.craigslist.org%s" % item["link"]
yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)
def parse_item_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
item['description'] = hxs.select('//section[#id="postingbody"]/text()').extract()
return item
Run it and see additional description column in your output csv file.
Hope that helps.

Related

scraping url and title from nested anchor tag

This is my first scraper using scrapy.
I am trying to scrap video url, title from https://www.google.co.in/trends/hotvideos#hvsm=0 site.
import scrapy
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class CraigslistItem(Item):
title = Field()
link = Field()
class DmozSpider(scrapy.Spider):
name = "google"
allowed_domains = ["google.co.in"]
start_urls = [
"https://www.google.co.in/trends/hotvideos#hvsm=0"
]
def parse(self, response):
#for sel in response.xpath('//body/div'):
hxs = HtmlXPathSelector(response)
sites = hxs.xpath("//span[#class='single-video-image-container']")
items = []
for sel in response.xpath("//span[#class='single-video-image-container']"):
item = CraigslistItem()
item['title'] = sel.xpath('a/text()').extract()
item['link'] = sel.xpath('a/#href').extract()
items.append(item)
print items
General walk through of what I am doing wrong would be much appreciable.

Use the help Scrapy FormRequest to get it done.
from scrapy.http import FormRequest
import json
class DmozSpider(scrapy.Spider):
name = "google"
allowed_domains = ["google.co.in"]
start_urls = [
"https://www.google.co.in/trends/hotvideos#hvsm=0"
]
def parse(self, response):
url = 'https://www.google.co.in/trends/hotvideos/hotItems'
formdata = {'hvd':'','geo': 'IN','mob': '0','hvsm': '0'}
yield FormRequest(url=url, formdata=formdata, callback=self.parse_data)
def parse_data(self, response):
json_response = json.loads(response.body)
videos = json_response.get('videoList')
for video in videos:
item = CraigslistItem()
item['title'] = video.get('title')
item['link'] = video.get('url')
yield item

using scrapy extracting data inside links

I have been trying to extract data from consumercomplaints.in the title and the data inside those title links.I wrote the following code and unable to parse through the links and extract the data and also I am unable to extract all the links related.plz guide
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem
class criticspider(CrawlSpider):
name ="comp"
allowed_domains =["consumercomplaints.in"]
#start_urls =["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
start_urls=["http://www.consumercomplaints.in/?search=delhivery"]
rules=(
Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
#Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
)
def parse(self,response):
hxs = Selector(response)
sites = hxs.select('//table[#width="100%"]')
items = []
for site in sites:
item = CompItem()
item['title'] = site.select('.//td[#class="complaint"]/a/span/text()').extract()
item['link'] = site.select('.//td[#class="complaint"]/a/#href').extract()
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
# item['intro'] = site.select('.//td[#class="small"]//a[2]/text()').extract()
# item['heading'] = site.select('.//td[#class="compl-text"]/div/b[1]/text()').extract()
# item['date'] = site.select('.//td[#class="small"]/text()[2]').extract()
# item['complaint'] = site.select('.//td[#class="compl-text"]/div/text()').extract()
items.append(item)
def anchor_page(self, response):
hxs = Selector(response)
old_item = response.request.meta['item'] # Receiving parse Method item that was in Request meta
# parse some more values
#place them in old_item
#e.g
old_item['data']=hxs.select('.//td[#class="compl-text"]/div/text()').extract()
yield old_item

Are you using an old version of Scrapy?
In the latest stable version you don't need to do hxs = Selector(response) nor using the hxs.select() method. You can do the same thing just with response.xpath().
I think the problem in your code is that the result of select() (or response.xpath) is actually a Python list, so you need to do:
link = site.select('.//td[#class="complaint"]/a/#href').extract()
if link:
item['link'] = link[0]
You probably want to do a similar thing for title too.
EDIT: I got it working with a few changes:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
data = scrapy.Field()
class criticspider(CrawlSpider):
name = "comp"
allowed_domains = ["consumercomplaints.in"]
start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
rules = (
Rule(
SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
callback="parse",
follow=True),
)
def parse(self, response):
sites = response.xpath('//table[#width="100%"]')
items = []
for site in sites:
item = CompItem()
item['title'] = site.xpath('.//td[#class="complaint"]/a/span/text()').extract()[0]
item['link'] = site.xpath('.//td[#class="complaint"]/a/#href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
old_item['data'] = response.xpath('.//td[#class="compl-text"]/div/text()').extract()
yield old_item

Pass variable to test.py in spider folder using scrapy

I'm using Scrapy. The following is the code for test.py in spider folder.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(BaseSpider):
name = "craig"
allowed_domains = ["craigslist.org"]
start_urls = ["http://seattle.craigslist.org/npo/"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//span[#class='pl']")
items = []
for titles in titles:
item = CraigslistSampleItem()
item["title"] = titles.select("a/text()").extract()
item["link"] = titles.select("a/#href").extract()
items.append(item)
return items
Essentially, I want to iterate my url list and pass url into MySpider class for start_ulrs. Could you anyone give me suggestion on how to make this?

Instead of having "statically defined" start_urls you need to override start_requests() method:
from scrapy.http import Request
class MySpider(BaseSpider):
name = "craig"
allowed_domains = ["craigslist.org"]
def start_requests(self)
list_of_urls = [...] # reading urls from a text file, for example
for url in list_of_urls:
yield Request(url)
def parse(self, response):
...

Scrapy only scraping first result of each page

I'm currently trying to run the following code but it keeps scraping only the first result of each page. Any idea what the issue may be?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123Item
import urlparse
from scrapy.http.request import Request
class MySpider(CrawlSpider):
name = "xyz123"
allowed_domains = ["www.xyz123.com.au"]
start_urls = ["http://www.xyz123.com.au/",]
rules = (Rule (SgmlLinkExtractor(allow=("",),restrict_xpaths=('//*[#id="1234headerPagination_hlNextLink"]',))
, callback="parse_xyz", follow=True),
)
def parse_xyz(self, response):
hxs = HtmlXPathSelector(response)
xyz = hxs.select('//div[#id="1234SearchResults"]//div/h2')
items = []
for xyz in xyz:
item = xyz123Item()
item ["title"] = xyz.select('a/text()').extract()[0]
item ["link"] = xyz.select('a/#href').extract()[0]
items.append(item)
return items
The Basespider version works well scraping ALL the required data on the first page:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123
class MySpider(BaseSpider):
name = "xyz123test"
allowed_domains = ["xyz123.com.au"]
start_urls = ["http://www.xyz123.com.au/"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select('//div[#id="1234SearchResults"]//div/h2')
items = []
for titles in titles:
item = xyz123Item()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
return items
Sorry for the censoring. I had to censor the website for privacy reasons.
The first code crawls through the pages well the way I'd like it to crawl, however it only pulls the first item title and link. NOTE: The XPath of the first title using "inspect element" in google is:
//*[#id="xyz123SearchResults"]/div[1]/h2/a,
second is //*[#id="xyz123SearchResults"]/div[2]/h2/a
third is //*[#id="xyz123SearchResults"]/div[3]/h2/a etc.
I'm not sure if the div[n] bit is what's killing it. I'm hoping it's an easy fix.
Thanks

for xyz in xyz:
item = xyz123Item()
item ["title"] = xyz.select('a/text()').extract()[0]
item ["link"] = xyz.select('a/#href').extract()[0]
items.append(item)
return items
Are you sure about the indentation of the return items ? It should be one less.

Scraper not finding pages

I have a spider written as below, but it doesn't seem to be getting to the function parse. Could someone take a quick look and let me know if I'm missing something. Am I implementing the SgmlLinkExtractor properly?
The spider should pick out all the links from the left sidebar, create a request from them, then parse the next page for a facebook link. It should also do this for other pages as specified in the SgmlLinkExtractor. At the moment, the spider is running, but not parsing any pages.
class PrinzSpider(CrawlSpider):
name = "prinz"
allowed_domains = ["prinzwilly.de"]
start_urls = ["http://www.prinzwilly.de/"]
rules = (
Rule(
SgmlLinkExtractor(
allow=(r'veranstaltungen-(.*)', ),
),
callback='parse'
),
)
def parse(self, response):
hxs = HtmlXPathSelector(response)
startlinks = hxs.select("//ul[#id='mainNav2']/li/a")
print startlinks
for link in startlinks:
giglink = link.select('#href').extract()
item = GigItem()
item['gig_link'] = giglink
request = Request(item['gig_link'], callback='parse_gig_page')
item.meta['item'] = item
yield request
def parse_gig_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
gig_content = hxs.select("//div[#class='n']/table/tbody").extract()
fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
print '********** FB LINK ********', fb_link
return item
EDIT **
settings.py
BOT_NAME = 'gigscraper'
SPIDER_MODULES = ['gigscraper.spiders']
NEWSPIDER_MODULE = 'gigscraper.spiders'
ITEM_PIPLINES = ['gigscraper.pipelines.GigscraperPipeline']
items.py
from scrapy.item import Item, Field
class GigItem(Item):
gig_link = Field()
pipelines.py
class GigscraperPipeline(object):
def process_item(self, item, spider):
print 'here I am in the pipeline'
return item

Two problems:
extract() returns a list, you are missing [0]
Request's callback should not be a string, use self.parse_gig_page
Here's the modified code (working):
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
class GigItem(Item):
gig_link = Field()
class PrinzSpider(CrawlSpider):
name = "prinz"
allowed_domains = ["prinzwilly.de"]
start_urls = ["http://www.prinzwilly.de/"]
rules = (Rule(SgmlLinkExtractor(allow=(r'veranstaltungen-(.*)',)), callback='parse'),)
def parse(self, response):
hxs = HtmlXPathSelector(response)
startlinks = hxs.select("//ul[#id='mainNav2']/li/a")
for link in startlinks:
item = GigItem()
item['gig_link'] = link.select('#href').extract()[0]
yield Request(item['gig_link'], callback=self.parse_gig_page, meta={'item': item})
def parse_gig_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
gig_content = hxs.select("//div[#class='n']/table/tbody").extract()[0]
fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
print '********** FB LINK ********', fb_link
return item
Hope that helps.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy Recursive download of Content - python

Related

scraping url and title from nested anchor tag

using scrapy extracting data inside links

Pass variable to test.py in spider folder using scrapy

Scrapy only scraping first result of each page

Scraper not finding pages

Categories

Resources