I have written a Scrapy crawler, but I need to add the ability to read some arguments from the command line and then populate some static fields in my spider class. I also need to override the initialiser so that it populates some of the spider fields.
import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request
import re
class TutsplusItem(scrapy.Item):
    """Scraped item with a single ``title`` field."""

    title = scrapy.Field()
class MySpider(Spider):
    """Spider that starts at bbc.com, follows page links and yields titles."""

    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]

    def parse(self, response):
        """Request every link on the page once, then yield its titles.

        Yields:
            ``Request`` objects for the links found on the page, followed by
            one ``TutsplusItem`` per matching title.
        """
        # Links already requested from this response. The absolute URL is
        # what gets compared, so the check matches what was stored.
        seen_links = set()
        for href in response.xpath('//a/@href').extract():
            # urljoin leaves absolute hrefs alone and resolves relative ones.
            link = response.urljoin(href)
            if link in seen_links:
                continue
            seen_links.add(link)
            yield Request(link, self.parse)

        titles = response.xpath(
            '//a[contains(@class, "media__link")]/text()'
        ).extract()
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item
Then it should be run as:
scrapy runspider crawler.py arg1 arg2
How do I achieve this?
You can do that by overriding the init method of your spider like this.
class MySpider(Spider):
    """Spider for bbc.com that accepts two command-line arguments.

    The arguments are supplied with ``-a arg1=... -a arg2=...`` and stored on
    the instance as ``self.arg1`` and ``self.arg2``.
    """

    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]
    arg1 = None
    arg2 = None

    def __init__(self, arg1=None, arg2=None, *args, **kwargs):
        """Store the spider arguments, then run the base initialiser.

        Args:
            arg1: First spider argument; ``None`` when not supplied.
            arg2: Second spider argument; ``None`` when not supplied.
        """
        # Defaults keep the spider constructible when it is started
        # without any ``-a`` options.
        self.arg1 = arg1
        self.arg2 = arg2
        super(MySpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Request every link on the page once, then yield its titles.

        Yields:
            ``Request`` objects for the links found on the page, followed by
            one ``TutsplusItem`` per matching title.
        """
        # Links already requested from this response. The absolute URL is
        # what gets compared, so the check matches what was stored.
        seen_links = set()
        for href in response.xpath('//a/@href').extract():
            # urljoin leaves absolute hrefs alone and resolves relative ones.
            link = response.urljoin(href)
            if link in seen_links:
                continue
            seen_links.add(link)
            yield Request(link, self.parse)

        titles = response.xpath(
            '//a[contains(@class, "media__link")]/text()'
        ).extract()
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item
Then run your spider like
scrapy crawl tutsplus -a arg1=arg1 -a arg2=arg2
Related
I am working on crawling Google search results using Scrapy. This is the code, and it works well for getting search results.
GoogleBot.py:
class GoogleBotsSpider(scrapy.Spider):
    """Print the title and link of each result on a Google search page."""

    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']

    def parse(self, response):
        """Print every result title followed by its link."""
        redirect_prefix = "/url?q="
        for page in response.xpath('//*[@id="main"]'):
            titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
            links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
            for title, link in zip(titles, links):
                print(title)
                # str.lstrip() removes any of the given characters, not the
                # prefix, and would also eat the start of the target URL.
                if link.startswith(redirect_prefix):
                    link = link[len(redirect_prefix):]
                print(link)
My next step is to use a Scrapy "pipeline" to save the results to a CSV file.
Here is the code that I have written so far.
setting.py:
# Enable GooglePipeline as an item pipeline; 300 is the value Scrapy uses to
# order it relative to other enabled pipelines.
ITEM_PIPELINES = {'GoogleScrapy.pipelines.GooglePipeline': 300,}
pipelines.py:
from scrapy.exporters import CsvItemExporter
class GooglePipeline(object):
    """Item pipeline that exports every item to GoogleSearchResult.csv."""

    def __init__(self):
        """Open the output file and start the CSV exporter."""
        # The exporter is handed a file opened in binary mode.
        self.file = open("GoogleSearchResult.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Export ``item`` and hand it back unchanged."""
        self.exporter.export_item(item)
        return item
This is my modified spider code.
GoogleBot.py:
def parse(self, response):
    """Collect result titles and links into ``item`` and yield it."""
    item = {}
    all_page = response.xpath('//*[@id="main"]')
    for page in all_page:
        item['title'] = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
        item['link'] = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
        # NOTE: `title` and `link` are no longer assigned in this version
        # (the values now live in item['title'] / item['link']), so the next
        # line raises the UnboundLocalError reported below.
        for title, link in zip(title, link):
            print(title)
            print(link.lstrip("/url?q="))
        yield item
It raises an error at these lines:
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
I get this error:
for title, link in zip(title, link):
UnboundLocalError: local variable 'title' referenced before assignment
Here is the working output according to your comment.
import scrapy
class GoogleBotsSpider(scrapy.Spider):
    """Yield one ``{'Title', 'Link'}`` dict per Google search result."""

    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']

    def parse(self, response):
        """Pair each result title with its link and yield them as items."""
        for page in response.xpath('//*[@id="main"]'):
            titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
            links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
            # Pair titles and links positionally. Nesting one loop inside
            # the other would emit every title with every link.
            for title, link in zip(titles, links):
                yield {
                    'Title': title,
                    'Link': link,
                }
I have a spider that crawls CourseTalk, with a pipeline that saves two types of items:
moocs.csv which contains the course data.
moocs_review.csv which contains the reviews data.
This is the spider code I have:
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from urlparse import urljoin
from moocs.items import MoocsItem,MoocsReviewItem
class MoocsSpiderSpider(scrapy.Spider):
    """Crawl the CourseTalk data-science listing and each course's reviews."""

    name = "moocs_spider"
    #allowed_domains = ["https://www.coursetalk.com/subjects/data-science/courses"]
    start_urls = (
        'https://www.coursetalk.com/subjects/data-science/courses',
    )

    def parse(self, response):
        """Request the course pages of a listing page, then the next listing page."""
        courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
        courses_url = [urljoin(response.url, relative_url) for relative_url in response.xpath(courses_xpath).extract()]
        # Only the first three courses of each listing page are followed.
        for course_url in courses_url[0:3]:
            print(course_url)
            yield Request(url=course_url, callback=self.parse_reviews)
        # NOTE: extract() returns a list, which is then passed as the request
        # URL; this is the pagination problem discussed in the answer below.
        next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
        yield Request(url=next_page_url, callback=self.parse)

    def parse_reviews(self, response):
        """Yield one review item per review on a course page, then the course item."""
        #print response.body
        loader = ItemLoader(item=MoocsItem(), response=response)
        loader.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
        loader.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
        loader.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
        loader.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
        loader.add_value('course_link', response.url)
        loader.add_value('course_provider', response.url)
        loader.add_xpath('course_cost', '//*[@class="course-details-panel__course-cost"]//text()')
        loader.add_xpath('university', '//*[@class="course-info__school__name"]//text()[2]')
        #'//*[@class="course-info__school__name"]'
        item = loader.load_item()

        for review in response.xpath('//*[@class="review-body"]'):
            review_loader = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
            # Each review carries the course title so it can be matched to its course.
            review_loader.add_value('course_title', item['course_title'])
            review_loader.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
            review_loader.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
            review_loader.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
            review_loader.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
            review_loader.add_xpath('score', './/*[@class="sr-only"]//text()')
            yield review_loader.load_item()

        yield item
Which goes to each course page and save the details into the corresponding item. I'm getting the pagination here:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
The spider goes to next pages but the result is not saved in the output file.
I'm guessing the problem is in the pipeline, where the files are created:
class MultiCSVItemPipeline(object):
    """Export items to one CSV file per item type listed in ``SaveTypes``."""

    # Directory the CSV files are written to (used as a plain string prefix).
    CSVDir = '/moocs/scripts/moocs/moocs/'
    # Item type names; each one gets its own ``<name>.csv`` file.
    SaveTypes = ['moocs','moocsreview']

    def __init__(self):
        """Hook the open/close handlers up to the spider signals."""
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Open one file and start one exporter per save type."""
        # CSVDir is a class attribute, so it must be reached through self;
        # a bare ``CSVDir`` is not visible inside a method.
        self.files = {
            name: open(self.CSVDir + name + '.csv', 'w+b')
            for name in self.SaveTypes
        }
        self.exporters = {
            name: CsvItemExporter(self.files[name])
            for name in self.SaveTypes
        }
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish every exporter, then close every file."""
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for csv_file in self.files.values():
            csv_file.close()

    def process_item(self, item, spider):
        """Export ``item`` with the exporter for its type and return it."""
        # NOTE(review): item_type is not defined in this snippet; presumably a
        # helper that maps an item to one of the SaveTypes names. Confirm.
        what = item_type(item)
        if what in self.SaveTypes:
            self.exporters[what].export_item(item)
        return item
Are you sure the spider is doing the pagination properly?
When you do this:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
extract() returns a list of results that you are then passing into the url parameter of the Request:
yield Request(url=next_page_url, callback=self.parse)
But url must be a string or unicode value, so, doing that will generate the following error:
TypeError: Request url must be str or unicode, got list:
It can be solved by using the extract_first() method, and I would also check that the value is not None:
next_page_url = response.xpath('//*[#class="js-course-pagination"]//a[contains(#aria-label,"Next")]/#href').extract_first()
if next_page_url:
yield Request(url=next_page_url) # parse is the callback by default
Please, try this and tell me if it solved your problem
If you use -t csv, this will also work, instead of a pipeline:
scrapy crawl moocs -t csv -o moocs.csv --loglevel=INFO
This will automatically create a file in spider folder.
I'm using Scrapy. The following is the code for test.py in spider folder.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(BaseSpider):
    """Collect the title and link of each listing on a Craigslist page."""

    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://seattle.craigslist.org/npo/"]

    def parse(self, response):
        """Return one ``CraigslistSampleItem`` per ``span.pl`` element."""
        hxs = HtmlXPathSelector(response)
        rows = hxs.select("//span[@class='pl']")
        items = []
        # A separate loop name avoids rebinding the collection being iterated.
        for row in rows:
            item = CraigslistSampleItem()
            item["title"] = row.select("a/text()").extract()
            item["link"] = row.select("a/@href").extract()
            items.append(item)
        return items
Essentially, I want to iterate over my URL list and pass each URL into the MySpider class as start_urls. Could anyone give me a suggestion on how to do this?
Instead of having "statically defined" start_urls you need to override start_requests() method:
from scrapy.http import Request
class MySpider(BaseSpider):
    """Spider whose start requests are built at run time, not from ``start_urls``."""

    name = "craig"
    allowed_domains = ["craigslist.org"]

    def start_requests(self):
        """Yield one ``Request`` per URL in the list."""
        list_of_urls = [...]  # reading urls from a text file, for example
        for url in list_of_urls:
            yield Request(url)

    def parse(self, response):
        ...
I saw this link: "Pass Scrapy Spider a list of URLs to crawl via .txt file".
This changes the list of start urls. I want to scrape webpages for each domain(from a file) and put results into a separate file(named after the domain).
I have scraped data for a website, but I specified the start URL and allowed_domains in the spider itself. How can I change this to use an input file?
Update 1:
This is the code that I tried:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
class AppleItem(Item):
    """Item with a ``reference_link`` field and an ``rss_link`` field."""

    reference_link = Field()
    rss_link = Field()
class AppleSpider(CrawlSpider):
    """Crawl the domains listed in domains.txt and record links containing "pdf"."""

    name = 'apple'
    allowed_domains = []
    start_urls = []
    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def __init__(self):
        """Fill ``allowed_domains`` and ``start_urls`` from ./domains.txt."""
        # NOTE(review): the base-class initialiser is never called here;
        # presumably that is why the `_rules` attribute reported in the
        # error below is missing. Confirm against CrawlSpider.__init__.
        for line in open('./domains.txt', 'r').readlines():
            self.allowed_domains.append(line)
            self.start_urls.append('http://%s' % line)

    def parse_item(self, response):
        """Build one item per matching link and write them to ``<name>.csv``."""
        sel = HtmlXPathSelector(response)
        rsslinks = sel.select('//a[contains(@href, "pdf")]/@href').extract()
        items = []
        for rss in rsslinks:
            item = AppleItem()
            item['reference_link'] = response.url
            item['rss_link'] = rsslinks
            items.append(item)
        # The file name is the second-to-last "/"-separated part of the URL.
        filename = response.url.split("/")[-2]
        open(filename+'.csv', 'wb').write(items)
I get an error when I run this: AttributeError: 'AppleSpider' object has no attribute '_rules'
You can use the __init__ method of the spider class to read the file and overwrite start_urls and allowed_domains.
Suppose we have file domains.txt with content:
example1.com
example2.com
...
Example:
class MySpider(BaseSpider):
    """Spider whose domains and start URLs are read from ./domains.txt."""

    name = "myspider"
    allowed_domains = []
    start_urls = []

    def __init__(self, *args, **kwargs):
        """Load one domain per line from ./domains.txt.

        Extra positional and keyword arguments are passed on to the base
        initialiser.
        """
        super(MySpider, self).__init__(*args, **kwargs)
        # Build per-instance lists instead of appending to the class-level
        # ones, which are shared by every instance.
        self.allowed_domains = []
        self.start_urls = []
        with open('./domains.txt', 'r') as domains_file:
            for line in domains_file:
                # Drop the trailing newline and skip blank lines.
                domain = line.strip()
                if not domain:
                    continue
                self.allowed_domains.append(domain)
                self.start_urls.append('http://%s' % domain)

    def parse(self, response):
        """Write the data parsed from the page to a file named after the URL."""
        # here you will get data by parsing the page,
        # then put your data into a single file
        # from scrapy tutorial http://doc.scrapy.org/en/latest/intro/tutorial.html
        filename = response.url.split("/")[-2]
        # `your_data` is a placeholder for whatever was extracted above.
        with open(filename, 'wb') as output_file:
            output_file.write(your_data)
I have a spider written as below, but it doesn't seem to be getting to the function parse. Could someone take a quick look and let me know if I'm missing something. Am I implementing the SgmlLinkExtractor properly?
The spider should pick out all the links from the left sidebar, create a request from them, then parse the next page for a facebook link. It should also do this for other pages as specified in the SgmlLinkExtractor. At the moment, the spider is running, but not parsing any pages.
class PrinzSpider(CrawlSpider):
    """Follow the sidebar links of prinzwilly.de and look for Facebook links."""

    name = "prinz"
    allowed_domains = ["prinzwilly.de"]
    start_urls = ["http://www.prinzwilly.de/"]
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r'veranstaltungen-(.*)', ),
            ),
            callback='parse'
        ),
    )

    def parse(self, response):
        """Yield one request per link in the ``mainNav2`` list."""
        hxs = HtmlXPathSelector(response)
        startlinks = hxs.select("//ul[@id='mainNav2']/li/a")
        print(startlinks)
        for link in startlinks:
            # NOTE: extract() returns a list, so the list itself ends up as
            # the request URL (first problem named in the answer below).
            giglink = link.select('@href').extract()
            item = GigItem()
            item['gig_link'] = giglink
            # NOTE: the callback is given as a string here (second problem
            # named in the answer below).
            request = Request(item['gig_link'], callback='parse_gig_page')
            # NOTE(review): this sets `meta` on the item; the request's meta
            # is probably what was intended, since parse_gig_page reads
            # response.meta['item'].
            item.meta['item'] = item
            yield request

    def parse_gig_page(self, response):
        """Print any Facebook link found in the gig table and return the item."""
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        gig_content = hxs.select("//div[@class='n']/table/tbody").extract()
        fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
        print('********** FB LINK ******** %s' % (fb_link,))
        return item
EDIT **
settings.py
# Project settings for the gigscraper bot.
BOT_NAME = 'gigscraper'
SPIDER_MODULES = ['gigscraper.spiders']
NEWSPIDER_MODULE = 'gigscraper.spiders'
# The setting name must be spelled ITEM_PIPELINES; a misspelled name is never
# read, so the pipeline would never run.
# NOTE(review): newer Scrapy releases expect a dict mapping the pipeline path
# to an order value instead of a list; confirm against the installed version.
ITEM_PIPELINES = ['gigscraper.pipelines.GigscraperPipeline']
items.py
from scrapy.item import Item, Field
class GigItem(Item):
    """Item with a single ``gig_link`` field."""

    gig_link = Field()
pipelines.py
class GigscraperPipeline(object):
    """Pass-through pipeline that prints a marker for each item it receives."""

    def process_item(self, item, spider):
        """Print a marker line and return ``item`` unchanged."""
        # The parenthesised single-argument form prints the same text whether
        # ``print`` is a statement or a function.
        print('here I am in the pipeline')
        return item
Two problems:
extract() returns a list, you are missing [0]
Request's callback should not be a string, use self.parse_gig_page
Here's the modified code (working):
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
class GigItem(Item):
    """Item with a single ``gig_link`` field."""

    gig_link = Field()
class PrinzSpider(CrawlSpider):
    """Follow the sidebar links of prinzwilly.de and look for Facebook links."""

    name = "prinz"
    allowed_domains = ["prinzwilly.de"]
    start_urls = ["http://www.prinzwilly.de/"]
    # NOTE(review): the Scrapy documentation advises against using `parse` as
    # a CrawlSpider rule callback, because CrawlSpider uses `parse` itself.
    # It is kept here as posted; confirm before reusing.
    rules = (Rule(SgmlLinkExtractor(allow=(r'veranstaltungen-(.*)',)), callback='parse'),)

    def parse(self, response):
        """Yield one request per ``mainNav2`` link, carrying its item in ``meta``."""
        hxs = HtmlXPathSelector(response)
        startlinks = hxs.select("//ul[@id='mainNav2']/li/a")
        for link in startlinks:
            hrefs = link.select('@href').extract()
            # extract() returns a list; skip anchors that have no href
            # rather than failing on hrefs[0].
            if not hrefs:
                continue
            item = GigItem()
            item['gig_link'] = hrefs[0]
            yield Request(item['gig_link'], callback=self.parse_gig_page, meta={'item': item})

    def parse_gig_page(self, response):
        """Print the Facebook links found in the gig table and return the item."""
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        tables = hxs.select("//div[@class='n']/table/tbody").extract()
        # Search an empty string when the table is absent so the item is
        # still returned.
        gig_content = tables[0] if tables else ''
        # Dots are escaped so they only match literal dots in the host name.
        fb_link = re.findall(r'(?:www\.facebook\.com/)(.*)', gig_content)
        print('********** FB LINK ******** %s' % (fb_link,))
        return item
Hope that helps.