Trouble with scrapy crawl spider - python

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class Jptimes3Spider(CrawlSpider):
    name = 'jptimes3'
    allowed_domains = ['japantimes.co.jp']
    start_urls = ['https://www.japantimes.co.jp/']
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
    }

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[@id="page"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'category': response.css('h3 > span.category-column::text').getall(),
            'category2': response.css('h3.category-column::text').getall(),
            'article title': response.css('p.article-title::text').getall(),
            'summary': response.xpath('//*[@id="wrapper"]/section[2]/div[1]/section[4]/div/ul/li[4]/a/article/header/hgroup/p/text()').getall()
        }
I'm new to scrapy and this is my first crawl spider. I'm having two issues. The first is that the spider gets the links but doesn't scrape any items; I just get the column headers in my csv. Also, I was wondering if there is a way to grab the same data, categories for instance, into the same column when they have different css/xpaths?

The xpath selection in the rules and in the parse_item method was incorrect. Here is an example of a working solution.
script:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class Jptimes3Spider(CrawlSpider):
    name = 'jptimes3'
    allowed_domains = ['japantimes.co.jp']
    start_urls = ['https://www.japantimes.co.jp']
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
    }

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@data-tb-region="Top News"]/a'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'category': response.xpath('//h3[@class="single-post-categories"]/a/text()').get(),
            'article title': ''.join(response.xpath('//h1/text()').getall())
        }
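As for the second part of the question, collecting the same kind of data, such as categories, into one column when it sits under different css/xpaths: both selector languages support unions, so a single field can draw from several selectors at once. A minimal sketch reusing the two category selectors from the question (whether they match the live page is taken on faith from the original post):

def parse_item(self, response):
    yield {
        # a comma unions CSS selectors, so both variants land in one field
        'category': response.css(
            'h3 > span.category-column::text, h3.category-column::text'
        ).getall(),
        # the XPath equivalent of a union is the | operator:
        # response.xpath('//h3/span[@class="category-column"]/text()'
        #                ' | //h3[@class="category-column"]/text()').getall()
    }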

Related

How do I scrape to csv in scrapy

How do I scrape a page to csv? My csv does not appear or appears blank.
I ran scrapy crawl jobs -o output.csv. The csv file appears, but nothing is written to it.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline
cmdline.execute("scrapy crawl jobs".split())

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem


class MySpider(CrawlSpider):
    name = "jobs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.xpath("a/text()").extract()
            item["link"] = titles.xpath("a/@href").extract()
            items.append(item)
        return(items)
class MySpider(CrawlSpider):
    name = 'csvexample'
    start_urls = ['C:/example.csv']
    delimiter = ','
    headers = ['Address', 'Website']
Try this -- I think you have to export each item individually. You are creating a new instance of the item class each time but never actually returning the item itself; you append the items to a list and then return the list, so they never go through the item pipelines. Also, in your loop over the titles you wrote for titles in titles, both plural:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline
# cmdline.execute("scrapy crawl jobs".split()) -- not sure what this line achieves?

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem


class MySpider(CrawlSpider):
    name = "jobs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.xpath("a/text()").extract_first()
            item["link"] = title.xpath("a/@href").extract_first()
            yield item
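With the items yielded one at a time they pass through the item pipelines and the feed exporter as they are scraped, so the command from the question, scrapy crawl jobs -o output.csv, should now write a populated file.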

Scrapy doesn't want to go to next url

I have a problem with forcing scrapy to go to another page. I am trying to get all of the Opera schedules for different months.
Each of the addresses that I need looks like this: http://www.opera.krakow.pl/pl/repertuar/na-afiszu/ + name of the month.
That's why I've made a list of the months and tried to iterate over them, but somehow Scrapy just ignores it. I tried to print all the URLs collected by next_page and they are all correct.
import scrapy
from ..items import ShowItem, ShowItemLoader
from scrapy.selector import HtmlXPathSelector


class OperaSpider(scrapy.Spider):
    name = "opera"
    allowed_domains = ["http://www.opera.krakow.pl"]
    start_urls = [
        "http://www.opera.krakow.pl/pl/repertuar/na-afiszu/listopad"
    ]

    shows_list_xpath = '//div[@class="row-fluid row-performance "]'
    item_fields = {
        'month': './/ul[@class="nav nav-pills nav-repertuar"]/li[@class="active"]/a/text()',
        'title': './/h2[@class="item-title"]/a/text()',
        'time': './/div[@class="item-time vertical-center"]/div[@class="vcentered"]/text()',
        'date': './/div[@class="item-date vertical-center"]/div[@class="vcentered"]/text()',
    }

    def parse(self, response):
        selector = HtmlXPathSelector(response)
        for show in selector.select(self.shows_list_xpath):
            loader = ShowItemLoader(ShowItem(), selector=show)
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

        list = ["styczen", "luty", "marzec", "kwiecien",
                "maj", "czerwiec", "lipiec", "sierpien",
                "wrzesien", "pazdziernik", "listopad", "grudzien"]
        for i in list:
            next_page = ("http://www.opera.krakow.pl/pl/repertuar/na-afiszu/%s" % i)
            yield scrapy.Request(next_page, callback=self.parse)
Scrapy checks allowed_domains against only the netloc of a request's URL; you need to change http://www.opera.krakow.pl to opera.krakow.pl.
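In other words, the setting should hold only the bare domain, for example:

allowed_domains = ["opera.krakow.pl"]  # no scheme; subdomains such as www. still match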

Crawlspider rule not working

I am trying to build a spider to scrape the data for courses at the New York Institute of Technology using the Scrapy framework in Python. Following is my spider (nyitspider.py). Can someone please tell me where I am going wrong?
from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from nyit_sample.items import NyitSampleItem


class nyitspider(CrawlSpider):
    name = 'nyitspider'
    allowed_domains = ['nyit.edu']
    start_urls = ['http://www.nyit.edu/academics/courses/']

    rules = (
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses', ),
        )),
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses/[a-z][a-z][a-z]-[a-z][a-z]-[0-9][0-9] [0-9]/', ),
        ), callback='parse_item'),
    )

    def parse_item(self, response):
        item = Course()
        item["institute"] = 'New York Institute of Technology'
        item['site'] = 'www.nyit.edu'
        item['title'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[2]/a').extract()[0]
        item['id'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[1]/a').extract()[0]
        item['credits'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[3]').extract()[0]
        item['description'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[2]/td/text()[1]').extract()[0]
        yield item
You have to correctly declare the item in the parse_item method, and the method should return something. Here's a suggestion, but you have to refine it:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from nyit_sample.items import NyitSampleItem


class nyitspider(CrawlSpider):
    name = 'nyitspider'
    allowed_domains = ['nyit.edu']
    start_urls = ['http://www.nyit.edu/academics/courses/']

    rules = (
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses', ),
        ), callback='parse_item'),
        Rule(LxmlLinkExtractor(
            allow=('.*/academics/courses/[a-z][a-z][a-z]-[a-z][a-z]-[0-9][0-9] [0-9]/', ),
        ), callback='parse_item'),
    )

    def parse_item(self, response):
        item = NyitSampleItem()
        item['institute'] = 'New York Institute of Technology'
        item['site'] = 'www.nyit.edu'
        item['title'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[2]/a)').extract()[0]
        item['id'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[1]/a)').extract()[0]
        item['credits'] = response.xpath('string(//*[@id="course_catalog_table"]/tbody/tr[1]/td[3])').extract()[0]
        item['description'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[2]/td/text()[1]').extract()[0]
        return item
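One note on the change above: wrapping a path in XPath's string() function returns the concatenated text content of the first matched node, so extract()[0] yields the plain cell text instead of the full element markup.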

Scrapy python Rules not working

I am able to scrape the first page of craigslist, but the LinkExtractor is not fetching data from other pages. Am I doing something wrong in defining the rules?
import scrapy
from craiglist.items import craiglistItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = "craiglist"
    allowed_domains = ["craiglist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    rules = [
        Rule(LinkExtractor(restrict_xpaths='//a[@class="button next"]'), callback='parse', follow=True)
    ]

    def parse(self, response):
        titles = response.selector.xpath('//*[@id="sortable-results"]/ul/li/p')
        items = []
        for title in titles:
            item = craiglistItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            items.append(item)
        return items
I have modified the code and now it's working fine. Two changes matter: allowed_domains had a typo (craiglist.org instead of craigslist.org), and CrawlSpider uses the parse method internally to implement its rules, so a rule's callback must not be named parse; it is renamed to parse_items, with parse_start_url added so the first page gets scraped too. Below is the working code.
import scrapy
from craiglist.items import craiglistItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request


class ExampleSpider(CrawlSpider):
    name = "craiglist"
    allowed_domains = ["craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    rules = [
        Rule(LinkExtractor(restrict_xpaths='//a[@class="button next"]'), callback="parse_items", follow=True),
    ]

    def parse_start_url(self, response):
        request = Request("http://sfbay.craigslist.org/search/npo", callback=self.parse_items)
        return request

    def parse_items(self, response):
        titles = response.selector.xpath('//*[@id="sortable-results"]/ul/li/p')
        items = []
        for title in titles:
            item = craiglistItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            # item["link"] = response.url
            items.append(item)
        return items

Scrapy crawl only part of a website

Hello there, I have the following code to scan all links in a given site.
from scrapy.item import Field, Item
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class SampleItem(Item):
    link = Field()


class SampleSpider(CrawlSpider):
    name = "sample_spider"
    allowed_domains = ["domain.com"]
    start_urls = ["http://domain.com"]

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        item = SampleItem()
        item['link'] = response.url
        return item
If I'd like to check only part of a global site, how could I do it? For example, I have tried to scan only the French part of an international site whose domain is structured as domain.com/fr/fr. So I have tried doing:
from scrapy.item import Field, Item
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class SampleItem(Item):
    link = Field()


class SampleSpider(CrawlSpider):
    name = "sample_spider"
    allowed_domains = ["domain.com/fr/fr"]
    start_urls = ["http://domain.com/fr/fr"]

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        item = SampleItem()
        item['link'] = response.url
        return item
But the spider only returns 3 results instead of thousands. What am I doing wrong?
To crawl only part of a website, you have to use the LinkExtractor. You can get a sample by issuing scrapy genspider -t crawl domain domain.com. Also note that allowed_domains must contain bare domain names, not paths; a value like domain.com/fr/fr never matches a request's netloc, so the offsite middleware filters out nearly everything, which is likely why you only got 3 results.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from test.items import testItem


class DomainSpider(CrawlSpider):
    name = 'domain'
    allowed_domains = ['domain.com']
    start_urls = ['http://www.domain.com/fr/fr']

    rules = (
        Rule(LinkExtractor(allow=r'fr/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = testItem()
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i
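Note the division of work: allowed_domains stays a bare registered domain, while the LinkExtractor's allow=r'fr/' pattern is what actually confines the crawl to the French section of the site.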
