Scrapy crawl only part of a website - python

Hello there, I have the following code to scan all links in a given site.
from scrapy.item import Field, Item
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
# Item with a single field used to store each crawled URL.
class SampleItem(Item):
link = Field()
# CrawlSpider that follows every link on domain.com and yields one
# SampleItem (containing the page URL) per page visited.
class SampleSpider(CrawlSpider):
name = "sample_spider"
allowed_domains = ["domain.com"]
start_urls = ["http://domain.com"]
# Follow all extracted links and call parse_page on every fetched page.
rules = (
Rule(LinkExtractor(), callback='parse_page', follow=True),
)
def parse_page(self, response):
# Record the URL of the page that was just crawled.
item = SampleItem()
item['link'] = response.url
return item
If I'd like to check only part of a global site, how could I do it? I have tried, for example, to scan only the French part of an international site whose domain is structured as: domain.com/fr/fr. So I have tried doing:
from scrapy.item import Field, Item
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
# Item with a single field used to store each crawled URL.
class SampleItem(Item):
link = Field()
class SampleSpider(CrawlSpider):
name = "sample_spider"
# NOTE(review): allowed_domains takes bare domain names only -- a value
# containing a path ("domain.com/fr/fr") makes the offsite middleware
# filter out almost every followed request, which is why only 3 results
# come back. Keep "domain.com" here and restrict the path with
# LinkExtractor(allow=...) instead.
allowed_domains = ["domain.com/fr/fr"]
start_urls = ["http://domain.com/fr/fr"]
rules = (
Rule(LinkExtractor(), callback='parse_page', follow=True),
)
def parse_page(self, response):
# Emit one item per crawled page containing its URL.
item = SampleItem()
item['link'] = response.url
return item
But the spider only returns 3 results instead of thousands. What am I doing wrong?

To crawl only part of a website, you have to use the LinkExtractor. You can get a sample by issuing scrapy genspider -t crawl domain domain.com.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from test.items import testItem
class DomainSpider(CrawlSpider):
    """Crawl only the French section of domain.com.

    allowed_domains keeps the crawl on domain.com, while the rule's
    ``allow=r'fr/'`` pattern restricts link extraction to URLs whose
    path matches ``fr/`` -- i.e. the French part of the site.
    """

    name = 'domain'
    allowed_domains = ['domain.com']
    start_urls = ['http://www.domain.com/fr/fr']

    # Only follow (and parse) links whose URL matches 'fr/'.
    rules = (
        Rule(LinkExtractor(allow=r'fr/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build one testItem per matched page (fields left to fill in)."""
        item = testItem()
        # Placeholders generated by `scrapy genspider -t crawl`:
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # item['name'] = response.xpath('//div[@id="name"]').extract()
        # item['description'] = response.xpath('//div[@id="description"]').extract()
        return item

Related

How do I scrape to csv in scrapy

How do I scrape a page to csv? My csv does not appear or appears blank
I have run: scrapy crawl jobs -o output.csv. While the csv appears, nothing appears in it.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline
# NOTE(review): this runs the "jobs" spider at import time, before the
# spider class below is even defined -- it should be removed (or moved
# to a separate run script) so `scrapy crawl jobs -o output.csv` works.
cmdline.execute("scrapy crawl jobs".split())
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(CrawlSpider):
name = "jobs"
allowed_domains = ["sfbay.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/search/npo"]
# NOTE(review): '#class' is not valid XPath -- attribute tests use '@'
# (e.g. //a[@class="button next"]).
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[#class="button next"]',)), callback="parse_items", follow= True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.xpath('//span[#class="pl"]')
items = []
# NOTE(review): "for titles in titles" rebinds the loop variable over
# the very sequence being iterated -- it works but shadows the list.
for titles in titles:
item = CraigslistSampleItem()
item["title"] = titles.xpath("a/text()").extract()
item["link"] = titles.xpath("a/#href").extract()
items.append(item)
# NOTE(review): returning the whole list from a rule callback means the
# individual items never go through the item pipelines one by one.
return(items)
# NOTE(review): second class also named MySpider -- it redefines the one
# above. 'delimiter' and 'headers' are CSVFeedSpider attributes, so this
# fragment looks like a CSVFeedSpider example, not a CrawlSpider.
class MySpider(CrawlSpider):
name = 'csvexample'
start_urls = ['C:/example.csv']
delimiter = ','
headers = ['Address', 'Website']
Try this -- I think you have to export each item individually. You are creating a new instance of the item class each time but never actually returning the item: you append items to your list and then return the list, so nothing ever goes through the item pipelines. Also, in your loop over titles you wrote for titles in titles -- both plural.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline
# cmdline.execute("scrapy crawl jobs".split()) -- Not sure what this line achieves?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(CrawlSpider):
    """Crawl Craigslist SF-bay non-profit job listings, yielding one item
    per listing so every item passes through the item pipelines / CSV
    exporter individually.
    """

    name = "jobs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    # Follow the pagination "next" button. XPath attribute tests use '@',
    # not '#' -- '//a[#class=...]' is invalid XPath.
    rules = (
        Rule(SgmlLinkExtractor(allow=(),
                               restrict_xpaths=('//a[@class="button next"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        """Yield one CraigslistSampleItem per listing title on the page."""
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.xpath("a/text()").extract_first()
            item["link"] = title.xpath("a/@href").extract_first()
            # Yielding (rather than appending to a returned list) sends
            # each item through the pipelines as soon as it is built.
            yield item

Crawlspider rule not working

I am trying to build a spider to scrape the data for courses at the NY Institute of Technology using the scrapy framework in Python... following is my spider (nyitspider.py). Can someone please tell me where I am going wrong?
from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from nyit_sample.items import NyitSampleItem
class nyitspider(CrawlSpider):
name = 'nyitspider'
allowed_domains = ['nyit.edu']
start_urls = ['http://www.nyit.edu/academics/courses/']
rules = (
# NOTE(review): this first rule has no callback, so matched pages are
# only followed, never parsed.
Rule(LxmlLinkExtractor(
allow=('.*/academics/courses', ),
)),
Rule(LxmlLinkExtractor(
allow=('.*/academics/courses/[a-z][a-z][a-z]-[a-z][a-z]-[0-9][0-9] [0-9]/', ),
), callback='parse_item'),
)
def parse_item(self, response):
# NOTE(review): Course is never imported or defined -- this raises
# NameError; the item class imported above is NyitSampleItem.
item = Course()
item["institute"] = 'New York Institute of Technology'
item['site'] = 'www.nyit.edu'
# NOTE(review): '#id' is not valid XPath -- attribute tests use '@id'.
item['title'] = response.xpath('//*[#id="course_catalog_table"]/tbody/tr[1]/td[2]/a').extract()[0]
item['id'] = response.xpath('//*[#id="course_catalog_table"]/tbody/tr[1]/td[1]/a').extract()[0]
item['credits'] = response.xpath('//*[#id="course_catalog_table"]/tbody/tr[1]/td[3]').extract()[0]
item['description'] = response.xpath('//*[#id="course_catalog_table"]/tbody/tr[2]/td/text()[1]').extract()[0]
yield item
You have to correctly declare the item in the parse_item method, and the method should return something. Here's a suggestion, but you have to refine it:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from nyit_sample.items import NyitSampleItem
class nyitspider(CrawlSpider):
    """Crawl NYIT course-catalog pages and extract one course per page."""

    name = 'nyitspider'
    allowed_domains = ['nyit.edu']
    start_urls = ['http://www.nyit.edu/academics/courses/']

    rules = (
        # Course index pages: parse them and keep following links.
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses', ),
        ), callback='parse_item'),
        # Individual course pages (pattern like .../abc-de-123/).
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses/[a-z][a-z][a-z]-[a-z][a-z]-[0-9][0-9] [0-9]/', ),
        ), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build a NyitSampleItem from the course-catalog table."""
        item = NyitSampleItem()
        item['institute'] = 'New York Institute of Technology'
        item['site'] = 'www.nyit.edu'
        # XPath attribute tests use '@id', not '#id'; string(...) collapses
        # the selected node to its text content.
        item['title'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[2]/a)').extract()[0]
        item['id'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[1]/a)').extract()[0]
        item['credits'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[3])').extract()[0]
        item['description'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[2]/td/text()[1]').extract()[0]
        return item

Scrapy python Rules not working

I am able to scrape the first page of craigslist. But the LinkExtractor is not fetching data from other pages. Am I doing something wrong in defining the rules?
import scrapy
from craiglist.items import craiglistItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class ExampleSpider(CrawlSpider):
name = "craiglist"
# NOTE(review): domain is misspelled -- the site is craigslist.org, so
# the offsite middleware drops every followed link.
allowed_domains = ["craiglist.org"]
start_urls = (
'http://sfbay.craigslist.org/search/npo',
)
rules = [
# NOTE(review): '#class' should be '@class' in XPath, and using
# callback='parse' overrides CrawlSpider's own parse(), which breaks
# rule processing -- use a differently named callback.
Rule(LinkExtractor(restrict_xpaths='//a[#class="button next"]'), callback='parse', follow= True)
]
def parse(self, response):
titles = response.selector.xpath('//*[#id="sortable-results"]/ul/li/p')
items = []
for title in titles:
item = craiglistItem()
item["title"] = title.select("a/text()").extract()
item["link"] = title.select("a/#href").extract()
items.append(item)
return items
I have modified the code and now its working fine. Below is the working code.
import scrapy
from craiglist.items import craiglistItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
class ExampleSpider(CrawlSpider):
    """Crawl Craigslist non-profit listings, following pagination links.

    Fixes over the original attempt: allowed_domains is spelled correctly
    ("craigslist.org") and the rule callback is parse_items rather than
    parse -- CrawlSpider needs its own parse() for rule processing.
    """

    name = "craiglist"
    allowed_domains = ["craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    # Follow the "next page" button; XPath attribute tests use '@class',
    # not '#class'.
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//a[@class="button next"]'),
             callback="parse_items", follow=True),
    ]

    def parse_start_url(self, response):
        # Also scrape the very first page, which the rules alone skip.
        request = Request("http://sfbay.craigslist.org/search/npo",
                          callback=self.parse_items)
        return request

    def parse_items(self, response):
        """Collect a title/link item for every listing on the page."""
        titles = response.selector.xpath('//*[@id="sortable-results"]/ul/li/p')
        items = []
        for title in titles:
            item = craiglistItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            #item["link"] = response.url
            items.append(item)
        return items

Scraper not finding pages

I have a spider written as below, but it doesn't seem to be getting to the function parse. Could someone take a quick look and let me know if I'm missing something. Am I implementing the SgmlLinkExtractor properly?
The spider should pick out all the links from the left sidebar, create a request from them, then parse the next page for a facebook link. It should also do this for other pages as specified in the SgmlLinkExtractor. At the moment, the spider is running, but not parsing any pages.
class PrinzSpider(CrawlSpider):
name = "prinz"
allowed_domains = ["prinzwilly.de"]
start_urls = ["http://www.prinzwilly.de/"]
rules = (
Rule(
SgmlLinkExtractor(
allow=(r'veranstaltungen-(.*)', ),
),
# NOTE(review): naming the callback 'parse' overrides the method
# CrawlSpider itself uses to process its rules.
callback='parse'
),
)
def parse(self, response):
hxs = HtmlXPathSelector(response)
# NOTE(review): '#id'/'#href'/'#class' should be '@id'/'@href'/'@class'
# in XPath throughout this spider.
startlinks = hxs.select("//ul[#id='mainNav2']/li/a")
print startlinks
for link in startlinks:
# NOTE(review): extract() returns a list -- Request needs a single
# URL string (missing [0]).
giglink = link.select('#href').extract()
item = GigItem()
item['gig_link'] = giglink
# NOTE(review): callback must be a callable (self.parse_gig_page),
# not a string.
request = Request(item['gig_link'], callback='parse_gig_page')
# NOTE(review): items have no usable .meta for this -- pass
# meta={'item': item} to the Request instead.
item.meta['item'] = item
yield request
def parse_gig_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
# NOTE(review): extract() returns a list, but re.findall needs a
# string -- missing [0].
gig_content = hxs.select("//div[#class='n']/table/tbody").extract()
fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
print '********** FB LINK ********', fb_link
return item
EDIT **
settings.py
BOT_NAME = 'gigscraper'
SPIDER_MODULES = ['gigscraper.spiders']
NEWSPIDER_MODULE = 'gigscraper.spiders'
# NOTE(review): setting name is misspelled -- Scrapy reads ITEM_PIPELINES,
# so this pipeline is never actually enabled.
ITEM_PIPLINES = ['gigscraper.pipelines.GigscraperPipeline']
items.py
from scrapy.item import Item, Field
# Item holding the link of a single gig page.
class GigItem(Item):
gig_link = Field()
pipelines.py
# Minimal pipeline that just logs that an item passed through unchanged.
class GigscraperPipeline(object):
def process_item(self, item, spider):
print 'here I am in the pipeline'
return item
Two problems:
extract() returns a list, you are missing [0]
Request's callback should not be a string, use self.parse_gig_page
Here's the modified code (working):
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
# Item holding the link of a single gig page.
class GigItem(Item):
gig_link = Field()
class PrinzSpider(CrawlSpider):
name = "prinz"
allowed_domains = ["prinzwilly.de"]
start_urls = ["http://www.prinzwilly.de/"]
rules = (Rule(SgmlLinkExtractor(allow=(r'veranstaltungen-(.*)',)), callback='parse'),)
def parse(self, response):
hxs = HtmlXPathSelector(response)
startlinks = hxs.select("//ul[#id='mainNav2']/li/a")
for link in startlinks:
item = GigItem()
item['gig_link'] = link.select('#href').extract()[0]
yield Request(item['gig_link'], callback=self.parse_gig_page, meta={'item': item})
def parse_gig_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
gig_content = hxs.select("//div[#class='n']/table/tbody").extract()[0]
fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
print '********** FB LINK ********', fb_link
return item
Hope that helps.

Scrapy Recursive download of Content

After banging my head several time, I am finally coming here.
Problem : I am trying to download the content of each of the craiglist posting. By content I mean the "posting body" like description of the cell phone. Looking for a new old phone since iPhone is done with all excitement.
The code is an awesome work by Michael Herman.
My Spider Class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import *
from craig.items import CraiglistSampleItem
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["craigslist.org"]
start_urls = ["http://minneapolis.craigslist.org/moa/"]
# NOTE(review): '#class' should be '@class' in XPath.
rules = (Rule (SgmlLinkExtractor(allow=("index\d00\.html", ),restrict_xpaths=('//p[#class="nextpage"]',))
, callback="parse_items", follow= True),
)
def parse_items(self,response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//span[#class='pl']")
items = []
# NOTE(review): "for titles in titles" shadows the sequence being
# iterated; it works but is confusing.
for titles in titles:
item = CraiglistSampleItem()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
return items
And the Item class
from scrapy.item import Item, Field
# Item for one craigslist listing: its title and (relative) link.
class CraiglistSampleItem(Item):
title = Field()
link = Field()
Since the code will traverse many links, I wanted to save the description of each cell phone in a separate csv, but one more column in the csv will be fine also.
Any lead !!!
Instead of returning items in the parse_items method, you should return/yield a scrapy Request instance in order to get the description from the item page; the link and title you can pass inside of an Item, and the Item inside of the meta dictionary:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
from scrapy.item import Item, Field
# Item for one craigslist listing: title, relative link, and the posting
# body fetched from the listing's own page.
class CraiglistSampleItem(Item):
title = Field()
link = Field()
description = Field()
class MySpider(CrawlSpider):
    """Crawl minneapolis craigslist listing pages and fetch each posting
    page to capture its description alongside the title and link.
    """

    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://minneapolis.craigslist.org/moa/"]

    # Paginate via the "next page" link; XPath attribute tests use '@',
    # not '#'.
    rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//p[@class="nextpage"]',))
        , callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        """For each listing, start an item and request its posting page."""
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        for title in titles:
            item = CraiglistSampleItem()
            item["title"] = title.select("a/text()").extract()[0]
            item["link"] = title.select("a/@href").extract()[0]
            # Listing links are relative -- build the absolute URL and
            # carry the partially-filled item in the request meta.
            url = "http://minneapolis.craigslist.org%s" % item["link"]
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        """Fill in the posting body and emit the completed item."""
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item['description'] = hxs.select('//section[@id="postingbody"]/text()').extract()
        return item
Run it and see additional description column in your output csv file.
Hope that helps.

Categories

Resources