Using Scrapy to crawl websites and only scrape pages containing keywords

Using Scrapy to crawl websites and only scrape pages containing keywords - python

I'm trying to crawl various websites looking for particular keywords of interest and only scraping those pages. I've written the script to run as a standalone Python script rather than the traditional Scrapy project structure (following this example) and using the CrawlSpider class. The idea is that from a given homepage the Spider will crawl pages within that domain and only scrape links from pages which contain the keyword. I'm also trying to save a copy of the page when I find one containing the keyword. The previous version of this question related to a syntax error (see comments below, thanks #tegancp for helping me clear that up) but now although my code runs I am still unable to crawl links only on pages of interest as intended.
I think I want to either i) remove the call to LinkExtractor in the __init__ function or ii) only call LinkExtractor from within __init__ but with a rule based on what I find when I visit that page rather than some attribute of the URL. I can't do i) because the CrawlSpider class wants a rule and I can't do ii) because LinkExtractor doesn't have a process_links option like the old SgmlLinkExtractor which appears to be deprecated. I'm new to Scrapy so wondering if my only option is to write my own LinkExtractor?
from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor
# define an item class
class GenItem(Item):
url = Field()
# define a spider
class GenSpider(CrawlSpider):
name = "genspider3"
# requires 'start_url', 'allowed_domains' and 'folderpath' to be passed as string arguments IN THIS PARTICULAR ORDER!!!
def __init__(self):
self.start_urls = [sys.argv[1]]
self.allowed_domains = [sys.argv[2]]
self.folder = sys.argv[3]
self.writefile1 = self.folder + 'hotlinks.txt'
self.writefile2 = self.folder + 'pages.txt'
self.rules = [Rule(LinkExtractor(allow_domains=(sys.argv[2],)), follow=True, callback='parse_links')]
super(GenSpider, self).__init__()
def parse_start_url(self, response):
# get list of links on start_url page and process using parse_links
list(self.parse_links(response))
def parse_links(self, response):
# if this page contains a word of interest save the HTML to file and crawl the links on this page
theHTML = response.body
if 'keyword' in theHTML:
with open(self.writefile2, 'a+') as f2:
f2.write(theHTML + '\n')
with open(self.writefile1, 'a+') as f1:
f1.write(response.url + '\n')
for link in LinkExtractor(allow_domains=(sys.argv[2],)).extract_links(response):
linkitem = GenItem()
linkitem['url'] = link.url
log.msg(link.url)
with open(self.writefile1, 'a+') as f1:
f1.write(link.url + '\n')
return linkitem
# callback fired when the spider is closed
def callback(spider, reason):
stats = spider.crawler.stats.get_stats() # collect/log stats?
# stop the reactor
reactor.stop()
# instantiate settings and provide a custom configuration
settings = Settings()
#settings.set('DEPTH_LIMIT', 2)
settings.set('DOWNLOAD_DELAY', 0.25)
# instantiate a crawler passing in settings
crawler = Crawler(settings)
# instantiate a spider
spider = GenSpider()
# configure signals
crawler.signals.connect(callback, signal=signals.spider_closed)
# configure and start the crawler
crawler.configure()
crawler.crawl(spider)
crawler.start()
# start logging
log.start(loglevel=log.DEBUG)
# start the reactor (blocks execution)
reactor.run()

Related

With Scrapy, How do I check links on a single page is allowed from robots.txt file?

With Scrapy, I will scrape a single page (via script and not from console) to check all the links on this page if they are allowed by the robots.txt file.
In the scrapy.robotstxt.RobotParser abstract base class, I found the method allowed(url, user_agent), but I don't see how to use it.
import scrapy
class TestSpider(scrapy.Spider):
name = "TestSpider"
def __init__(self):
super(TestSpider, self).__init__()
def start_requests(self):
yield scrapy.Request(url='http://httpbin.org/', callback=self.parse)
def parse(self, response):
if 200 <= response.status < 300:
links = scrapy.linkextractors.LinkExtractor.extract_links(response)
for idx, link in enumerate(links):
# How can I check each link is allowed by robots.txt file?
# => allowed(link.url , '*')
# self.crawler.engine.downloader.middleware.middlewares
# self.crawler AttributeError: 'TestSpider' object has no attribute 'crawler'
To run 'TestSpider' spider, in settings.py set
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
Go to the project’s top level directory and run:
scrapy crawl TestSpider
Appreciate any help.
My solution:
import scrapy
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
from scrapy.utils.httpobj import urlparse_cached
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class TestSpider(CrawlSpider):
name = "TestSpider"
def __init__(self):
super(TestSpider, self).__init__()
self.le = LinkExtractor(unique=True, allow_domains=self.allowed_domains)
self._rules = [
Rule(self.le, callback=self.parse)
]
def start_requests(self):
self._robotstxt_middleware = None
for middleware in self.crawler.engine.downloader.middleware.middlewares:
if isinstance(middleware, RobotsTxtMiddleware):
self._robotstxt_middleware = middleware
break
yield scrapy.Request(url='http://httpbin.org/', callback=self.parse_robotstxt)
def parse_robotstxt(self, response):
robotstxt_middleware = None
for middleware in self.crawler.engine.downloader.middleware.middlewares:
if isinstance(middleware, RobotsTxtMiddleware):
robotstxt_middleware = middleware
break
url = urlparse_cached(response)
netloc = url.netloc
self._robotsTxtParser = None
if robotstxt_middleware and netloc in robotstxt_middleware._parsers:
self._robotsTxtParser = robotstxt_middleware._parsers[netloc]
return self.parse(response)
def parse(self, response):
if 200 <= response.status < 300:
links = self.le.extract_links(response)
for idx, link in enumerate(links):
# Check if link target is forbidden by robots.txt
if self._robotsTxtParser:
if not self._robotsTxtParser.allowed(link.url, "*"):
print(link.url,' Disallow by robotstxt file')

Parser implementations are listed a bit higher on the page than the link you posted.
Protego parser
Based on Protego:
implemented in Python
is compliant with Google’s Robots.txt Specification
supports wildcard matching
uses the length based rule
Scrapy uses this parser by default.
So, if you want the same results as scrapy gives by default, use protego.
The usage is as follows (robotstxt being the contents of a robots.txt file):
>>> from protego import Protego
>>> rp = Protego.parse(robotstxt)
>>> rp.can_fetch("http://example.com/profiles", "mybot")
False
It is also possible to identify and reuse the robots middleware currently in use, but it's probably more trouble than it's worth for most use cases.
Edit:
If you really want to reuse the middleware, your spider has access to downloader middlewares through self.crawler.engine.downloader.middleware.middlewares.
From there, you need to identify the robots middleware (possibly by class name?) and the parser you need (from the middleware's _parsers attribute).
Finally, you'd use the parser's can_fetch() method to check your links.

2 functions in scrapy spider and the second one not running

I am using scrapy to get the content inside some urls on a page, similar to this question here:
Use scrapy to get list of urls, and then scrape content inside those urls
I am able to get the subURLs from my start urls(first def), However, my second def doesn't seem to be passing through. And the result file is empty. I have tested the content inside the function in scrapy shell and it is getting the info I want, but not when I am running the spider.
import scrapy
from scrapy.selector import Selector
#from scrapy import Spider
from WheelsOnlineScrapper.items import Dealer
from WheelsOnlineScrapper.url_list import urls
import logging
from urlparse import urljoin
logger = logging.getLogger(__name__)
class WheelsonlinespiderSpider(scrapy.Spider):
logger.info('Spider starting')
name = 'wheelsonlinespider'
rotate_user_agent = True # lives in middleware.py and settings.py
allowed_domains = ["https://wheelsonline.ca"]
start_urls = urls # this list is created in url_list.py
logger.info('URLs retrieved')
def parse(self, response):
subURLs = []
partialURLs = response.css('.directory_name::attr(href)').extract()
for i in partialURLs:
subURLs = urljoin('https://wheelsonline.ca/', i)
yield scrapy.Request(subURLs, callback=self.parse_dealers)
logger.info('Dealer ' + subURLs + ' fetched')
def parse_dealers(self, response):
logger.info('Beginning of page')
dlr = Dealer()
#Extracting the content using css selectors
try:
dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first() + ' ' + response.css(".dealer_head_aux_name::text").extract_first()
except TypeError:
dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first()
dlr['MailingAddress'] = ','.join(response.css(".dealer_address_right::text").extract())
dlr['PhoneNumber'] = response.css(".dealer_head_phone::text").extract_first()
logger.info('Dealer fetched ' + dlr['DealerName'])
yield dlr
logger.info('End of page')

Your allowed_domains list contains the protocol (https). It should have only the domain name as per the documentation:
allowed_domains = ["wheelsonline.ca"]
Also, you should've received a message in your log:
URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://wheelsonline.ca in allowed_domains

How can I reuse the parse method of my scrapy Spider-based spider in an inheriting CrawlSpider?

I currently have a Spider-based spider that I wrote for crawling an input JSON array of start_urls:
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from foo.items import AtlanticFirearmsItem
from scrapy.contrib.loader import ItemLoader
import json
import datetime
import re
class AtlanticFirearmsSpider(Spider):
name = "atlantic_firearms"
allowed_domains = ["atlanticfirearms.com"]
def __init__(self, start_urls='[]', *args, **kwargs):
super(AtlanticFirearmsSpider, self).__init__(*args, **kwargs)
self.start_urls = json.loads(start_urls)
def parse(self, response):
l = ItemLoader(item=AtlanticFirearmsItem(), response=response)
product = l.load_item()
return product
I can call it from the command line like so, and it does a wonderful job:
scrapy crawl atlantic_firearms -a start_urls='["http://www.atlanticfirearms.com/component/virtuemart/shipping-rifles/ak-47-receiver-aam-47-detail.html", "http://www.atlanticfirearms.com/component/virtuemart/shipping-accessories/nitride-ak47-7-62x39mm-barrel-detail.html"]'
However, I'm trying to add a CrawlSpider-based spider for crawling the entire site that inherits from it and re-uses the parse method logic. My first attempt looked like this:
class AtlanticFirearmsCrawlSpider(CrawlSpider, AtlanticFirearmsSpider):
name = "atlantic_firearms_crawler"
start_urls = [
"http://www.atlanticfirearms.com"
]
rules = (
# I know, I need to update these to LxmlLinkExtractor
Rule(SgmlLinkExtractor(allow=['detail.html']), callback='parse'),
Rule(SgmlLinkExtractor(allow=[], deny=['/bro', '/news', '/howtobuy', '/component/search', 'askquestion'])),
)
Running this spider with
scrapy crawl atlantic_firearms_crawler
crawls the site but never parses any items. I think it's because CrawlSpider apparently has its own definition of parse, so somehow I'm screwing things up.
When I change callback='parse' to callback='parse_item' and rename the parse method in AtlanticFirearmsSpider to parse_item, it works wonderfully, crawling the whole site and parsing items successfully. But then if I try to call my original atlantic_firearms spider again, it errors out with NotImplementedError, apparently because Spider-based spiders really want one to define the parse method as parse.
What's the best way for me to re-use my logic between these spiders so that I can both feed a JSON array of start_urls as well as do full-site crawls?

You can avoid multiple inheritance here.
Combine both spiders in a single one. If start_urls would be passed from the command-line - it would behave like a CrawlSpider, otherwise like a regular spider:
from scrapy import Item
from scrapy.contrib.spiders import CrawlSpider, Rule
from foo.items import AtlanticFirearmsItem
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.linkextractors import LinkExtractor
import json
class AtlanticFirearmsSpider(CrawlSpider):
name = "atlantic_firearms"
allowed_domains = ["atlanticfirearms.com"]
def __init__(self, start_urls=None, *args, **kwargs):
if start_urls:
self.start_urls = json.loads(start_urls)
self.rules = []
self.parse = self.parse_response
else:
self.start_urls = ["http://www.atlanticfirearms.com/"]
self.rules = [
Rule(LinkExtractor(allow=['detail.html']), callback='parse_response'),
Rule(LinkExtractor(allow=[], deny=['/bro', '/news', '/howtobuy', '/component/search', 'askquestion']))
]
super(AtlanticFirearmsSpider, self).__init__(*args, **kwargs)
def parse_response(self, response):
l = ItemLoader(item=AtlanticFirearmsItem(), response=response)
product = l.load_item()
return product
Or, alternatively, just extract the logic inside the parse() method into a library function and call from both spiders that would not be related, separate spiders.

Confused about running Scrapy from within a Python script

Following document, I can run scrapy from a Python script, but I can't get the scrapy result.
This is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from items import DmozItem
class DmozSpider(BaseSpider):
name = "douban"
allowed_domains = ["example.com"]
start_urls = [
"http://www.example.com/group/xxx/discussion"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
rows = hxs.select("//table[#class='olt']/tr/td[#class='title']/a")
items = []
# print sites
for row in rows:
item = DmozItem()
item["title"] = row.select('text()').extract()[0]
item["link"] = row.select('#href').extract()[0]
items.append(item)
return items
Notice the last line, I try to use the returned parse result, if I run:
scrapy crawl douban
the terminal could print the return result
But I can't get the return result from the Python script. Here is my Python script:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from spiders.dmoz_spider import DmozSpider
from scrapy.xlib.pydispatch import dispatcher
def stop_reactor():
reactor.stop()
dispatcher.connect(stop_reactor, signal=signals.spider_closed)
spider = DmozSpider(domain='www.douban.com')
crawler = Crawler(Settings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
log.msg("------------>Running reactor")
result = reactor.run()
print result
log.msg("------------>Running stoped")
I try to get the result at the reactor.run(), but it return nothing,
How can I get the result?

Terminal prints the result because the default log level is set to DEBUG.
When you are running your spider from the script and call log.start(), the default log level is set to INFO.
Just replace:
log.start()
with
log.start(loglevel=log.DEBUG)
UPD:
To get the result as string, you can log everything to a file and then read from it, e.g.:
log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
reactor.run()
with open("results.log", "r") as f:
result = f.read()
print result
Hope that helps.

I found your question while asking myself the same thing, namely: "How can I get the result?". Since this wasn't answered here I endeavoured to find the answer myself and now that I have I can share it:
items = []
def add_item(item):
items.append(item)
dispatcher.connect(add_item, signal=signals.item_passed)
Or for scrapy 0.22 (http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script) replace the last line of my solution by:
crawler.signals.connect(add_item, signals.item_passed)
My solution is freely adapted from http://www.tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/.

in my case, i placed the script file at scrapy project level e.g. if scrapyproject/scrapyproject/spiders then i placed it at scrapyproject/myscript.py

Creating a generic scrapy spider

My question is really how to do the same thing as a previous question, but in Scrapy 0.14.
Using one Scrapy spider for several websites
Basically, I have GUI that takes parameters like domain, keywords, tag names, etc. and I want to create a generic spider to crawl those domains for those keywords in those tags. I've read conflicting things, using older versions of scrapy, by either overriding the spider manager class or by dynamically creating a spider. Which method is preferred and how do I implement and invoke the proper solution? Thanks in advance.
Here is the code that I want to make generic. It also uses BeautifulSoup. I paired it down so hopefully didn't remove anything crucial to understand it.
class MySpider(CrawlSpider):
name = 'MySpider'
allowed_domains = ['somedomain.com', 'sub.somedomain.com']
start_urls = ['http://www.somedomain.com']
rules = (
Rule(SgmlLinkExtractor(allow=('/pages/', ), deny=('', ))),
Rule(SgmlLinkExtractor(allow=('/2012/03/')), callback='parse_item'),
)
def parse_item(self, response):
contentTags = []
soup = BeautifulSoup(response.body)
contentTags = soup.findAll('p', itemprop="myProp")
for contentTag in contentTags:
matchedResult = re.search('Keyword1|Keyword2', contentTag.text)
if matchedResult:
print('URL Found: ' + response.url)
pass

You could create a run-time spider which is evaluated by the interpreter. This code piece could be evaluated at runtime like so:
a = open("test.py")
from compiler import compile
d = compile(a.read(), 'spider.py', 'exec')
eval(d)
MySpider
<class '__main__.MySpider'>
print MySpider.start_urls
['http://www.somedomain.com']

I use the Scrapy Extensions approach to extend the Spider class to a class named Masterspider that includes a generic parser.
Below is the very "short" version of my generic extended parser. Note that you'll need to implement a renderer with a Javascript engine (such as Selenium or BeautifulSoup) a as soon as you start working on pages using AJAX. And a lot of additional code to manage differences between sites (scrap based on column title, handle relative vs long URL, manage different kind of data containers, etc...).
What is interresting with the Scrapy Extension approach is that you can still override the generic parser method if something does not fit but I never had to. The Masterspider class checks if some methods have been created (eg. parser_start, next_url_parser...) under the site specific spider class to allow the management of specificies: send a form, construct the next_url request from elements in the page, etc.
As I'm scraping very different sites, there's always specificities to manage. That's why I prefer to keep a class for each scraped site so that I can write some specific methods to handle it (pre-/post-processing except PipeLines, Request generators...).
masterspider/sitespider/settings.py
EXTENSIONS = {
'masterspider.masterspider.MasterSpider': 500
}
masterspider/masterspdier/masterspider.py
# -*- coding: utf8 -*-
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from sitespider.items import genspiderItem
class MasterSpider(Spider):
def start_requests(self):
if hasattr(self,'parse_start'): # First page requiring a specific parser
fcallback = self.parse_start
else:
fcallback = self.parse
return [ Request(self.spd['start_url'],
callback=fcallback,
meta={'itemfields': {}}) ]
def parse(self, response):
sel = Selector(response)
lines = sel.xpath(self.spd['xlines'])
# ...
for line in lines:
item = genspiderItem(response.meta['itemfields'])
# ...
# Get request_url of detailed page and scrap basic item info
# ...
yield Request(request_url,
callback=self.parse_item,
meta={'item':item, 'itemfields':response.meta['itemfields']})
for next_url in sel.xpath(self.spd['xnext_url']).extract():
if hasattr(self,'next_url_parser'): # Need to process the next page URL before?
yield self.next_url_parser(next_url, response)
else:
yield Request(
request_url,
callback=self.parse,
meta=response.meta)
def parse_item(self, response):
sel = Selector(response)
item = response.meta['item']
for itemname, xitemname in self.spd['x_ondetailpage'].iteritems():
item[itemname] = "\n".join(sel.xpath(xitemname).extract())
return item
masterspider/sitespider/spiders/somesite_spider.py
# -*- coding: utf8 -*-
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from sitespider.items import genspiderItem
from masterspider.masterspider import MasterSpider
class targetsiteSpider(MasterSpider):
name = "targetsite"
allowed_domains = ["www.targetsite.com"]
spd = {
'start_url' : "http://www.targetsite.com/startpage", # Start page
'xlines' : "//td[something...]",
'xnext_url' : "//a[contains(#href,'something?page=')]/#href", # Next pages
'x_ondetailpage' : {
"itemprop123" : u"id('someid')//text()"
}
}
# def next_url_parser(self, next_url, response): # OPTIONAL next_url regexp pre-processor
# ...

Instead of having the variables name,allowed_domains, start_urls and rules attached to the class, you should write a MySpider.__init__, call CrawlSpider.__init__ from that passing the necessary arguments, and setting name, allowed_domains etc. per object.
MyProp and keywords also should be set within your __init__. So in the end you should have something like below. You don't have to add name to the arguments, as name is set by BaseSpider itself from kwargs:
class MySpider(CrawlSpider):
def __init__(self, allowed_domains=[], start_urls=[],
rules=[], findtag='', finditemprop='', keywords='', **kwargs):
CrawlSpider.__init__(self, **kwargs)
self.allowed_domains = allowed_domains
self.start_urls = start_urls
self.rules = rules
self.findtag = findtag
self.finditemprop = finditemprop
self.keywords = keywords
def parse_item(self, response):
contentTags = []
soup = BeautifulSoup(response.body)
contentTags = soup.findAll(self.findtag, itemprop=self.finditemprop)
for contentTag in contentTags:
matchedResult = re.search(self.keywords, contentTag.text)
if matchedResult:
print('URL Found: ' + response.url)

I am not sure which way is preferred, but I will tell you what I have done in the past. I am in no way sure that this is the best (or correct) way of doing this and I would be interested to learn what other people think.
I usually just override the parent class (CrawlSpider) and either pass in arguments and then initialize the parent class via super(MySpider, self).__init__() from within my own init-function or I pull in that data from a database where I have saved a list of links to be appended to start_urls earlier.

As far as crawling specific domains passed as arguments goes, I just override Spider.__init__:
class MySpider(scrapy.Spider):
"""
This spider will try to crawl whatever is passed in `start_urls` which
should be a comma-separated string of fully qualified URIs.
Example: start_urls=http://localhost,http://example.com
"""
def __init__(self, name=None, **kwargs):
if 'start_urls' in kwargs:
self.start_urls = kwargs.pop('start_urls').split(',')
super(Spider, self).__init__(name, **kwargs)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Using Scrapy to crawl websites and only scrape pages containing keywords - python

Related

With Scrapy, How do I check links on a single page is allowed from robots.txt file?

2 functions in scrapy spider and the second one not running

How can I reuse the parse method of my scrapy Spider-based spider in an inheriting CrawlSpider?

Confused about running Scrapy from within a Python script

Creating a generic scrapy spider

Categories

Resources