I am trying to build a spider to scrape course data from the New York Institute of Technology using the Scrapy framework in Python. The following is my spider (nyitspider.py). Can someone please tell me where I am going wrong?
from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from nyit_sample.items import NyitSampleItem
class nyitspider(CrawlSpider):
    """Crawl the NYIT course catalogue and yield one item per parsed page."""

    name = 'nyitspider'
    allowed_domains = ['nyit.edu']
    start_urls = ['http://www.nyit.edu/academics/courses/']

    rules = (
        # No callback here: links matching this pattern are only followed.
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses', ),
        )),
        # Links matching the course-code pattern are handed to parse_item.
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses/[a-z][a-z][a-z]-[a-z][a-z]-[0-9][0-9] [0-9]/', ),
        ), callback='parse_item'),
    )

    def parse_item(self, response):
        """Yield a course item built from the course catalogue table.

        Each ``extract()[0]`` raises IndexError when its XPath matches nothing.
        """
        # NOTE(review): `Course` is never imported in this file (only
        # NyitSampleItem is). Left as posted because it is part of what the
        # question is asking about.
        item = Course()
        item["institute"] = 'New York Institute of Technology'
        item['site'] = 'www.nyit.edu'
        # XPath attribute tests use '@' (e.g. [@id="..."]); '#' is not valid XPath.
        item['title'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[2]/a').extract()[0]
        item['id'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[1]/a').extract()[0]
        item['credits'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[3]').extract()[0]
        item['description'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[2]/td/text()[1]').extract()[0]
        yield item
You have to correctly declare the item in the parse_item method, and the method should return something. Here's a suggestion, but you have to refine it:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from nyit_sample.items import NyitSampleItem
class nyitspider(CrawlSpider):
    """Crawl the NYIT course catalogue and return one item per parsed page."""

    name = 'nyitspider'
    allowed_domains = ['nyit.edu']
    start_urls = ['http://www.nyit.edu/academics/courses/']

    # NOTE(review): when several rules match the same link, Scrapy uses the
    # first one in this tuple, so links matching both patterns are handled by
    # the first rule. Confirm the second rule is still needed.
    rules = (
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses', ),
        ), callback='parse_item'),
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses/[a-z][a-z][a-z]-[a-z][a-z]-[0-9][0-9] [0-9]/', ),
        ), callback='parse_item'),
    )

    def parse_item(self, response):
        """Return a NyitSampleItem built from the course catalogue table.

        title, id and credits are read with XPath ``string()``, which gives
        the text content of the first matching node (an empty string if
        nothing matches). The description lookup uses ``extract()[0]`` and
        raises IndexError when nothing matches.
        """
        item = NyitSampleItem()
        item['institute'] = 'New York Institute of Technology'
        item['site'] = 'www.nyit.edu'
        # XPath attribute tests use '@' (e.g. [@id="..."]); '#' is not valid XPath.
        item['title'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[2]/a)').extract()[0]
        item['id'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[1]/a)').extract()[0]
        item['credits'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[3])').extract()[0]
        item['description'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[2]/td/text()[1]').extract()[0]
        return item
Related
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class Jptimes3Spider(CrawlSpider):
    """Crawl japantimes.co.jp and yield category/title/summary text per page."""

    name = 'jptimes3'
    allowed_domains = ['japantimes.co.jp']
    start_urls = ['https://www.japantimes.co.jp/']
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
    }

    rules = (
        # Only links found inside the element with id="page" are extracted.
        # XPath attribute tests use '@'; '#' is not valid XPath.
        Rule(LinkExtractor(restrict_xpaths='//*[@id="page"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield one dict of text lists selected from the response."""
        # NOTE(review): the selectors are left as posted (apart from the '@'
        # repair); whether they match the site's markup is what the question
        # is asking about.
        yield {
            'category': response.css('h3 > span.category-column::text').getall(),
            'category2': response.css('h3.category-column::text').getall(),
            'article title': response.css('p.article-title::text').getall(),
            'summary': response.xpath('//*[@id="wrapper"]/section[2]/div[1]/section[4]/div/ul/li[4]/a/article/header/hgroup/p/text()').getall(),
        }
I'm new to Scrapy and this is my first crawl spider. I'm having two issues. The first is that it gets the links but does not scrape any items; I just get the column headers in my CSV. Also, I was wondering whether there is a way to put the same kind of data (categories, for instance) in the same column if they have different CSS/XPath selectors?
The XPath selection in the rules and in the parse_item method was incorrect. Here is an example of a working solution.
script:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class Jptimes3Spider(CrawlSpider):
    """Follow the "Top News" links and yield category and title per article."""

    name = 'jptimes3'
    allowed_domains = ['japantimes.co.jp']
    start_urls = ['https://www.japantimes.co.jp']
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
    }

    # Restrict link extraction to anchors directly under the "Top News" div.
    # XPath attribute tests use '@'; '#' is not valid XPath.
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths='//div[@data-tb-region="Top News"]/a'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict with the first category link text and the joined h1 text."""
        yield {
            'category': response.xpath('//h3[@class="single-post-categories"]/a/text()').get(),
            # The h1 may contain several text nodes; join them into one string.
            'article title': ''.join(response.xpath('//h1/text()').getall()),
        }
How do I scrape a page to CSV? My CSV does not appear or appears blank.
I have run: scrapy crawl jobs -o output.csv. While the CSV file is created, nothing is written to it.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline
cmdline.execute("scrapy crawl jobs".split())
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(CrawlSpider):
    """Crawl Craigslist non-profit job listings by following the "next" button."""

    name = "jobs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    rules = (
        # XPath attribute tests use '@'; '#' is not valid XPath.
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_items", follow= True),
    )

    def parse_items(self, response):
        """Return a list of items holding the title and link of each listing."""
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        # NOTE(review): the loop variable reuses the name `titles`; left as
        # posted because the answer to this question comments on it.
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.xpath("a/text()").extract()
            item["link"] = titles.xpath("a/@href").extract()
            items.append(item)
        return(items)
class MySpider(CrawlSpider):
    """Attribute-only spider declaration for the CSV example."""

    name = 'csvexample'
    start_urls = ['C:/example.csv']

    # NOTE(review): `delimiter` and `headers` look like CSV-feed settings;
    # confirm that the CrawlSpider base class actually reads them.
    delimiter = ','
    headers = ['Address', 'Website']
Try this -- I think you have to export each item individually. You are creating a new instance of the item class each time and never actually returning the item. You are appending items to your list and then returning the list, so it never goes through the item pipelines. Also, in your loop over the titles you wrote "for titles in titles", with both names plural:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline
# cmdline.execute("scrapy crawl jobs".split()) -- Not sure what this line achieves?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(CrawlSpider):
    """Crawl Craigslist non-profit job listings by following the "next" button."""

    name = "jobs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    rules = (
        # XPath attribute tests use '@'; '#' is not valid XPath.
        Rule(
            SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Yield one item per listing with its first title text and first href."""
        hxs = HtmlXPathSelector(response)
        for title in hxs.xpath('//span[@class="pl"]'):
            item = CraigslistSampleItem()
            # extract_first() gives a single value (None when nothing matches)
            # rather than a list.
            item["title"] = title.xpath("a/text()").extract_first()
            item["link"] = title.xpath("a/@href").extract_first()
            yield item
I am able to scrape the first page of Craigslist, but the LinkExtractor is not fetching data from the other pages. Am I doing something wrong in defining the rules?
import scrapy
from craiglist.items import craiglistItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class ExampleSpider(CrawlSpider):
    """Scrape Craigslist non-profit listings, following the "next" button."""

    name = "craiglist"
    # NOTE(review): this domain is spelled "craiglist.org" while start_urls
    # uses "craigslist.org". Left as posted; it is part of the question.
    allowed_domains = ["craiglist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    # NOTE(review): the rule's callback is `parse`, which this class also
    # overrides. Left as posted; the accepted fix uses a different callback.
    # XPath attribute tests use '@'; '#' is not valid XPath.
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//a[@class="button next"]'), callback='parse', follow= True)
    ]

    def parse(self, response):
        """Return a list of items holding the title and link of each listing."""
        titles = response.selector.xpath('//*[@id="sortable-results"]/ul/li/p')
        items = []
        for title in titles:
            item = craiglistItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            items.append(item)
        return items
I have modified the code and now it's working fine. Below is the working code.
import scrapy
from craiglist.items import craiglistItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
class ExampleSpider(CrawlSpider):
    """Scrape Craigslist non-profit listings, following the "next" button."""

    name = "craiglist"
    allowed_domains = ["craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    # XPath attribute tests use '@'; '#' is not valid XPath.
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//a[@class="button next"]'), callback="parse_items", follow=True),
    ]

    def parse_start_url(self, response):
        """Scrape the start page itself with the same logic as the other pages.

        The response for the start URL is already available here, so it is
        parsed directly instead of issuing a second request for the same URL.
        """
        return self.parse_items(response)

    def parse_items(self, response):
        """Return a list of items holding the title and link of each listing."""
        items = []
        for title in response.selector.xpath('//*[@id="sortable-results"]/ul/li/p'):
            item = craiglistItem()
            item["title"] = title.xpath("a/text()").extract()
            item["link"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
Hello there, I have the following code to scan all the links in a given site.
from scrapy.item import Field, Item
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class SampleItem(Item):
    """Item with a single field: the URL of a crawled page."""

    link = Field()


class SampleSpider(CrawlSpider):
    """Follow every extracted link and record the URL of each response."""

    name = "sample_spider"
    allowed_domains = ["domain.com"]
    start_urls = ["http://domain.com"]

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        """Return a SampleItem whose ``link`` is the URL of the response."""
        return SampleItem(link=response.url)
If I'd like to check only part of a global site, how could I do it? For example, I have tried to scan only the French part of an international site whose URLs are structured as domain.com/fr/fr. So I have tried doing:
from scrapy.item import Field, Item
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class SampleItem(Item):
    """Item with a single field: the URL of a crawled page."""

    link = Field()


class SampleSpider(CrawlSpider):
    """Follow every extracted link and record the URL of each response."""

    name = "sample_spider"
    # NOTE(review): this entry includes a URL path, whereas Scrapy documents
    # allowed_domains as a list of domain names. Left as posted because it is
    # what the question is asking about.
    allowed_domains = ["domain.com/fr/fr"]
    start_urls = ["http://domain.com/fr/fr"]

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        """Return a SampleItem whose ``link`` is the URL of the response."""
        return SampleItem(link=response.url)
But the spider only returns 3 results instead of thousands. What am I doing wrong?
To crawl only part of a website, you have to use the LinkExtractor. You can get a sample by issuing scrapy genspider -t crawl domain domain.com.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from test.items import testItem
class DomainSpider(CrawlSpider):
    """Follow links whose URL matches ``fr/`` and return a testItem per page."""

    name = 'domain'
    allowed_domains = ['domain.com']
    start_urls = ['http://www.domain.com/fr/fr']

    rules = (
        Rule(LinkExtractor(allow=r'fr/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Return an unpopulated testItem; the field lookups are template stubs."""
        item = testItem()
        # Template stubs (disabled); fill in the fields you need:
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # item['name'] = response.xpath('//div[@id="name"]').extract()
        # item['description'] = response.xpath('//div[@id="description"]').extract()
        return item
This is the BaseSpider example from the Scrapy tutorial:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from dmoz.items import DmozItem
class DmozSpider(BaseSpider):
    """Tutorial spider: collect title, link and description from two dmoz pages."""

    domain_name = "dmoz.org"
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """Return a list of DmozItem, one per ``<li>`` of the second ``<ul>``."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[2]/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            # XPath selects attributes with '@' (a/@href); '#' is not valid XPath.
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items


SPIDER = DmozSpider()
I copied it with changes for my project:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from firm.items import FirmItem
class Spider1(CrawlSpider):
    """Scrape school information from the White & Case attorney list page."""

    domain_name = 'wc2'
    start_urls = ['http://www.whitecase.com/Attorneys/List.aspx?LastName=A']

    def parse(self, response):
        """Return a list with one entry per attorney link found on the page."""
        hxs = HtmlXPathSelector(response)
        # XPath attribute tests use '@'; '#' is not valid XPath.
        sites = hxs.select('//td[@class="altRow"][1]/a/@href').re(r'/.a\w+')
        items = []
        for site in sites:
            # NOTE(review): this binds the FirmItem class itself, not an
            # instance, which is what raises "'ItemMeta' object does not
            # support item assignment". Left as posted because it is the
            # bug the question asks about.
            item = FirmItem
            item['school'] = hxs.select('//td[@class="mainColumnTDa"]').re(r'(JD)(.*?)(\d+)')
            items.append(item)
        return items


SPIDER = Spider1()
and I get the error
[wc2] ERROR: Spider exception caught while processing
<http://www.whitecase.com/Attorneys/List.aspx?LastName=A> (referer: <None>):
[Failure instance: Traceback: <type 'exceptions.TypeError'>:
'ItemMeta' object does not support item assignment
I would greatly appreciate it if experts here take a look at the code and give me a clue about where I am going wrong.
Thank you
Probably you meant item = FirmItem() instead of item = FirmItem?