I am trying to automatically restart my spider once scraping has completed, specifically when a response came back with a bad status.
For example, I've got this code:
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from urlparse import urljoin
from bs4 import BeautifulSoup
from scrapy.spider import BaseSpider
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from datetime import datetime
import re
class level1(BaseSpider):
# Crawling Start
CrawlSpider.started_on = datetime.now()
name = "level1"
base_domain = 'http://www.google.com'
DOWNLOAD_DELAY = 3
restart=False
handle_httpstatus_list = [404, 302, 503, 999, 200] #add any other code you need
# Call sendEmail class
email = sendEmail()
# Call log settings
saveLog = runlog()
# Init
def __init__(self, url='', child='', parent=''):
self.start_urls = [url]
self.child = child
self.parent = parent
#run baby, run :)
super(level1, self).__init__(self.start_urls)
# On Spider Closed
dispatcher.connect(self.spider_closed, signals.spider_closed)
def spider_closed(self, reason):
if self.restart:
print "we need to retry"
super(level1, self).__init__(self.start_urls)
else:
print "ok"
# parsing time
work_time = datetime.now() - CrawlSpider.started_on
# Correct Finished
if reason == "finished":
print "finished"
def parse(self, response):
if response.status == 503:
self.restart = True
if response.status == 999:
self.restart = True
if str(response.status) == "200":
# Selector
sel = Selector(response)
todo
In the spider_closed method, I try to restart my spider when the response status is bad, but it does not work.
How can I resolve this?
I am not sure that calling __init__ is going to restart your spider.
Take a look at this link: http://doc.scrapy.org/en/1.0/topics/api.html?highlight=scrapy%20start#scrapy.crawler.CrawlerProcess.start
In the worst case, you could write a separate program that spawns the crawler using this core API (from the link) and restarts it as necessary. Though I agree that restarting from inside the spider script would be much simpler.
Related
Hi, I'm wondering how I can pass the scraping result, which is a pandas DataFrame, back to the module that created the spider.
import mySpider as mspider
def main():
spider1 = mspider.MySpider()
process = CrawlerProcess()
process.crawl(spider1)
process.start()
print(len(spider1.result))
Spider:
class MySpider(scrapy.Spider):
name = 'MySpider'
allowed_domains = config.ALLOWED_DOMAINS
result = pd.DataFrame(columns=...)
def start_requests(self):
yield Request(url=...,headers=config.HEADERS, callback=self.parse)
def parse(self, response):
*...Some Code of adding values to result...*
print("size: " + str(len(self.result)))
The value printed in the main method is 0, while in the parse method it is 1005. Could you tell me how I should pass the value between them?
I would like to do that because I'm running multiple spiders. After they finish scraping, I'll merge the results and save them to a file.
SOLUTION
def spider_closed(spider, reason):
print("Size" + str(len(spider.result)))
def main():
    """Run ``MySpider`` and report its result size when the crawl closes.

    The ``spider_closed`` handler is connected to a crawler created up
    front, and that same crawler is then handed to ``crawl()`` so the
    handler is attached to the crawl that actually runs.
    """
    crawler_process = CrawlerProcess()
    # Pass the spider *class*: the crawler builds its own spider instance,
    # which is the object later delivered to spider_closed() as `spider`.
    crawler = crawler_process.create_crawler(spider.MySpider)
    crawler.signals.connect(spider_closed, signals.spider_closed)
    # Crawl the crawler we connected to. Passing a spider here instead would
    # make the process create a second crawler that has no handler attached.
    crawler_process.crawl(crawler)
    crawler_process.start()
The main reason for this behavior is the asynchronous nature of Scrapy itself. The print(len(spider1.result)) line would be executed before the .parse() method is called.
There are multiple ways to wait for the spider to finish. I would use the spider_closed signal:
from scrapy import signals
def spider_closed(spider, reason):
    """Handle the ``spider_closed`` signal by printing the result size.

    :param spider: the spider instance that has just been closed.
    :param reason: close reason supplied with the signal (unused here).
    """
    print(len(spider.result))


crawler_process = CrawlerProcess(settings)
# create_crawler() needs to be told which spider to build; give it the class
# and let the crawler instantiate the spider itself.
crawler = crawler_process.create_crawler(mspider.MySpider)
crawler.signals.connect(spider_closed, signals.spider_closed)
# Schedule the crawl through the process (not crawler.crawl directly) so that
# start() tracks this crawl and waits for it.
crawler_process.crawl(crawler)
crawler_process.start()
I have an existing script (main.py) that requires data to be scraped.
I started a scrapy project for retrieving this data. Now, is there any way main.py can retrieve the data from scrapy as an Item generator, rather than persisting data using the Item pipeline?
Something like this would be really convenient, but I couldn't find out how to do it, if it's feasible at all.
for item in scrapy.process():
I found a potential solution there: https://tryolabs.com/blog/2011/09/27/calling-scrapy-python-script/, using multithreading's queues.
Even though I understand this behaviour is not compatible with distributed crawling, which is what Scrapy is intended for, I'm still a little surprised that you wouldn't have this feature available for smaller projects.
You could send json data out from the crawler and grab the results. It can be done as follows:
Having the spider:
class MySpider(scrapy.Spider):
    """Spider that accumulates matching page text and prints it as JSON.

    The JSON document is written to stdout when the spider closes, so a
    parent process can capture and decode it.
    """
    # some attributes
    # NOTE: defined on the class, so the list is shared by all instances.
    accomulated = []

    def parse(self, response):
        """Collect every text node of the page that passes conditionsAreOk."""
        # do your logic here
        page_text = response.xpath('//text()').extract()
        self.accomulated.extend(
            text for text in page_text if conditionsAreOk(text)
        )

    def closed(self, reason):
        """Dump the accumulated texts to stdout as a single JSON array."""
        # call when the crawler process ends
        # Local import keeps this snippet self-contained; the module is
        # `json` (the original `JSON` name does not exist).
        import json
        # Parenthesised single-argument print works on Python 2 and 3.
        print(json.dumps(self.accomulated))
Write a runner.py script like:
import sys
from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from spiders import MySpider
def main(argv):
    """Crawl the URL given as the first argument and exit when done.

    :param argv: command-line arguments without the program name;
        ``argv[0]`` is the URL handed to ``MySpider``.
    """
    target_url = argv[0]
    log_options = {'LOG_FORMAT': '%(levelname)s: %(message)s', 'LOG_ENABLED': False}
    configure_logging(log_options)

    crawl_runner = CrawlerRunner(get_project_settings())
    finished = crawl_runner.crawl(MySpider, url=target_url)
    # For multiple spiders in the same process, schedule each one with
    # crawl_runner.crawl(...) and wait on crawl_runner.join() instead.

    def _stop_reactor(_result):
        # Runs on success and on failure alike, so the script always exits.
        reactor.stop()

    finished.addBoth(_stop_reactor)
    # The script will block here until the crawling is finished.
    reactor.run()
if __name__ == "__main__":
main(sys.argv[1:])
And then call it from your main.py as:
import json, subprocess, sys, time
def main(argv):
# urlArray has http:// or https:// like urls
for url in urlArray:
p = subprocess.Popen(['python', 'runner.py', url ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
# do something with your data
print out
print json.loads(out)
# This just helps to watch logs
time.sleep(0.5)
if __name__ == "__main__":
main(sys.argv[1:])
Note
This is not the best way of using Scrapy as you know, but for fast results which do not require a complex post processing, this solution can provide what you need.
I hope it helps.
You can do it this way in a Twisted or Tornado app:
import collections
from twisted.internet.defer import Deferred
from scrapy.crawler import Crawler
from scrapy import signals
def scrape_items(crawler_runner, crawler_or_spidercls, *args, **kwargs):
    """
    Start a crawl and return an ItemCursor instance that lets the caller
    retrieve scraped items and wait for more items to become available.

    Extra positional and keyword arguments are forwarded to the runner's
    ``crawl()`` call.

    Example:

    .. code-block:: python

        @inlineCallbacks
        def f():
            runner = CrawlerRunner()
            async_items = scrape_items(runner, my_spider)
            while (yield async_items.fetch_next):
                item = async_items.next_item()
                # ...
            # ...

    This convoluted way to write a loop should become unnecessary
    in Python 3.5 because of ``async for``.
    """
    # create_crawler() requires scrapy >= 1.1rc1. On older versions use:
    #     crawler = crawler_or_spidercls
    #     if not isinstance(crawler_or_spidercls, Crawler):
    #         crawler = crawler_runner._create_crawler(crawler_or_spidercls)
    crawler = crawler_runner.create_crawler(crawler_or_spidercls)
    crawl_finished = crawler_runner.crawl(crawler, *args, **kwargs)
    return ItemCursor(crawl_finished, crawler)
class ItemCursor(object):
    """Queue of items scraped by one crawl, consumed through Deferreds.

    Items delivered by the crawler's ``item_scraped`` signal are appended to
    an internal deque. Consumers wait on :attr:`fetch_next` and then pop one
    item with :meth:`next_item`.
    """

    def __init__(self, crawl_d, crawler):
        """
        :param crawl_d: Deferred of the running crawl; its callback/errback
            marks this cursor as closed.
        :param crawler: crawler whose ``item_scraped`` signal feeds the queue.
        """
        self.crawl_d = crawl_d
        self.crawler = crawler
        # Set up all state before registering callbacks: the handlers below
        # read these attributes, and a Deferred that has already fired runs
        # newly added callbacks immediately.
        self.closed = False
        self._items_available = Deferred()
        self._items = collections.deque()

        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        crawl_d.addCallback(self._on_finished)
        crawl_d.addErrback(self._on_error)

    @staticmethod
    def _resolved(value):
        """Return a Deferred that has already fired with *value*."""
        d = Deferred()
        d.callback(value)
        return d

    def _on_item_scraped(self, item):
        """Queue *item* and wake up whoever waits on the current Deferred."""
        self._items.append(item)
        # Swap in a fresh Deferred *before* firing the old one, so a consumer
        # resumed by the callback that immediately asks for fetch_next again
        # is never handed the Deferred that is firing right now.
        waiting, self._items_available = self._items_available, Deferred()
        waiting.callback(True)

    def _on_finished(self, result):
        """Mark the cursor closed and release a pending waiter with False."""
        self.closed = True
        self._items_available.callback(False)

    def _on_error(self, failure):
        """Mark the cursor closed and propagate *failure* to a pending waiter."""
        self.closed = True
        self._items_available.errback(failure)

    @property
    def fetch_next(self):
        """
        A Deferred used with ``inlineCallbacks`` or ``gen.coroutine`` to
        asynchronously retrieve the next item, waiting for an item to be
        crawled if necessary. Resolves to ``False`` once the crawl is
        finished and every queued item has been handed out; otherwise
        :meth:`next_item` is guaranteed to return an item
        (a dict or a scrapy.Item instance).
        """
        # Check the queue first: items scraped while the consumer was busy
        # must still be delivered after the crawl has closed.
        if self._items:
            return self._resolved(True)
        if self.closed:
            # crawl is finished and nothing is left to hand out
            return self._resolved(False)
        # We're active, but item is not ready yet. Return a Deferred which
        # resolves to True if item is scraped or to False if crawl is stopped.
        return self._items_available

    def next_item(self):
        """Pop and return the oldest queued item, or ``None`` if none is queued.

        See :attr:`fetch_next`.
        """
        if not self._items:
            return None
        return self._items.popleft()
The main idea is to listen to item_scraped signal, and then wrap it to an object with a nicer API.
Note that you need an event loop in your main.py script for this to work; the example above works with twisted.defer.inlineCallbacks or tornado.gen.coroutine.
I can run crawl in a python script with the following recipe from wiki :
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
spider = FollowAllSpider(domain='scrapinghub.com')
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
As you can see, I can just pass the domain to FollowAllSpider, but my question is: how can I pass the start_urls (actually a date that will be added to a fixed URL) to my spider class using the above code?
This is my spider class:
class MySpider(CrawlSpider):
name = 'tw'
def __init__(self,date):
y,m,d=date.split('-') #this is a test , it could split with regex!
try:
y,m,d=int(y),int(m),int(d)
except ValueError:
raise 'Enter a valid date'
self.allowed_domains = ['mydomin.com']
self.start_urls = ['my_start_urls{}-{}-{}'.format(y,m,d)]
def parse(self, response):
questions = Selector(response).xpath('//div[#class="result-link"]/span/a/#href')
for question in questions:
item = PoptopItem()
item['url'] = question.extract()
yield item['url']
and this is my script :
from pdfcreator import convertor
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
#from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
from poptop.spiders.stackoverflow_spider import MySpider
from poptop.items import PoptopItem
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
date=raw_input('Enter the date with this format (d-m-Y) : ')
print date
spider=MySpider(date=date)
crawler.crawl(spider)
crawler.start()
log.start()
item=PoptopItem()
for url in item['url']:
convertor(url)
reactor.run() # the script will block here until the spider_closed signal was sent
If I just print the item, I get the following error:
2015-02-25 17:13:47+0330 [tw] ERROR: Spider must return Request, BaseItem or None, got 'unicode' in <GET test-link2015-1-17>
items:
import scrapy
class PoptopItem(scrapy.Item):
titles= scrapy.Field()
content= scrapy.Field()
url=scrapy.Field()
You need to modify your __init__() constructor to accept the date argument. Also, I would use datetime.strptime() to parse the date string:
from datetime import datetime
class MySpider(CrawlSpider):
    """Spider whose start URL is derived from a ``date`` keyword argument."""
    name = 'tw'
    allowed_domains = ['test.com']

    def __init__(self, *args, **kwargs):
        """
        :param date: (keyword) date string in ``%m-%d-%Y`` form.
        :raises ValueError: if ``date`` is missing or empty, or if
            ``strptime`` cannot parse it.
        """
        super(MySpider, self).__init__(*args, **kwargs)
        raw_date = kwargs.get('date')
        if raw_date:
            parsed = datetime.strptime(raw_date, "%m-%d-%Y")
            url_template = 'http://test.com/{dt.year}-{dt.month}-{dt.day}'
            self.start_urls = [url_template.format(dt=parsed)]
        else:
            raise ValueError('No date given')
Then, you would instantiate the spider this way:
spider = MySpider(date='01-01-2015')
Or, you can even avoid parsing the date at all, passing a datetime instance in the first place:
class MySpider(CrawlSpider):
    """Spider whose start URL is derived from a ``dt`` keyword argument."""
    name = 'tw'
    allowed_domains = ['test.com']

    def __init__(self, *args, **kwargs):
        """
        :param dt: (keyword) object exposing ``year``, ``month`` and ``day``,
            e.g. a ``datetime`` instance.
        :raises ValueError: if ``dt`` is missing.
        """
        super(MySpider, self).__init__(*args, **kwargs)
        dt = kwargs.get('dt')
        if not dt:
            raise ValueError('No date given')
        self.start_urls = ['http://test.com/{dt.year}-{dt.month}-{dt.day}'.format(dt=dt)]


# Plain decimal literals: `01` is an octal literal on Python 2 and a
# SyntaxError on Python 3.
spider = MySpider(dt=datetime(year=2014, month=1, day=1))
And, just FYI, see this answer as a detailed example about how to run Scrapy from script.
Here is my spider file amzspider.py
import sys
from scrapy.http import Request
import datetime
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class amazonScraperSpider(BaseSpider):
    """Collect product URLs and ASINs reachable from one Amazon listing URL.

    Request flow: start URL -> "more" links (``parse``) -> pagination links
    (``parseTopsPages``) -> product links (``parseProducts``). Both output
    files are rewritten from ``urls`` after every product page.
    """
    name = "Amazon_Scraper"
    allowed_domains = ["amazon.com"]
    # NOTE: defined on the class, so the list is shared by all instances.
    urls = []

    def __init__(self, url, product_file, asin_file):
        """
        :param url: listing URL to start crawling from.
        :param product_file: path of the file receiving product URLs.
        :param asin_file: path of the file receiving bare ASINs.
        """
        super(amazonScraperSpider, self).__init__()
        self.product_file = product_file
        self.asin_file = asin_file
        self.url = [url]
        self.start_urls = [url]

    def parse(self, response):
        """Follow every "more" link on the page, plus the start URL itself."""
        hxs = HtmlXPathSelector(response)
        tops = hxs.select("//*[@class='zg_more_link']/@href").extract()
        # The original `Tops.append = self.url` assigned to the list's
        # `append` attribute instead of adding the start URL to the list.
        tops.extend(self.url)
        for top in tops:
            yield Request(top, callback=self.parseTopsPages)

    def parseTopsPages(self, response):
        """Follow each pagination link of a listing page."""
        hxs = HtmlXPathSelector(response)
        page_links = hxs.select(
            "//div[@id='zg_paginationWrapper']//li/a/@href").extract()
        for page_link in page_links:
            yield Request(page_link, callback=self.parseProducts)

    def parseProducts(self, response):
        """Record the product URLs found on the page, then rewrite the files."""
        hxs = HtmlXPathSelector(response)
        products = hxs.select(
            "//div[@class='zg_itemWrapper']//div[@class='zg_title']/a/@href").extract()
        for product_link in products:
            cleaned = product_link.strip(' \t\n\r')
            # Keep only the first six '/'-separated parts of the link.
            self.urls.append('/'.join(cleaned.split('/')[:6]))
        self.save()

    def save(self):
        """Rewrite the product file and the ASIN file from ``self.urls``.

        The product file gets four consecutive sections (full URLs, URLs
        without the ``http://www.`` prefix, canonical product URLs, canonical
        product URLs without the scheme); the ASIN file gets one ASIN per line.
        """
        asins = [url.split("/")[-1] for url in self.urls]
        # `with` closes both files even if a write fails.
        with open(self.product_file, "w") as product_out, \
                open(self.asin_file, "w") as asin_out:
            product_out.writelines(url + "\n" for url in self.urls)
            product_out.writelines(
                url.replace("http://www.", "") + "\n" for url in self.urls)
            product_out.writelines(
                "http://www.amazon.com/gp/product/" + asin + "\n" for asin in asins)
            product_out.writelines(
                "amazon.com/gp/product/" + asin + "\n" for asin in asins)
            asin_out.writelines(asin + "\n" for asin in asins)
I call it from controller.py and I want to wait for it to finish (Block Thread) and only then continue with controller.py after it's done scraping work.
I call it this way:
spider = amzspider.amazonScraperSpider(url, settings['product_file'], settings['asins_file'])
The problem is that controller.py continues executing code without blocking until amzspider.py has finished.
Your main() function just creates an instance; it doesn't actually make it do anything. You should really call:
spider = amzspider.amazonScraperSpider(url, settings['product_file'], settings['asins_file'])
in controller.py. This will actually give you access to the instance; you don't need main(). You can then use the instance:
response = get_a_response() # whatever you do here
spider.parse(response) # give the spider work to do
Etc.
Without having seen the code for how you create your threads it seems you want to call join() on the thread running amzspider.main.
From the Python threading manual:
Other threads can call a thread’s join() method. This blocks the calling thread until the thread whose join() method is called is terminated.
I'm trying to execute a Scrapy spider from a separate script, and when I execute this script in a loop (for instance, to run the same spider with different parameters), I get ReactorAlreadyRunning. My snippet:
from celery import task
from episode.skywalker.crawlers import settings
from multiprocessing.queues import Queue
from scrapy import log, project, signals
from scrapy.settings import CrawlerSettings
from scrapy.spider import BaseSpider
from scrapy.spidermanager import SpiderManager
from scrapy.xlib.pydispatch import dispatcher
import multiprocessing
from twisted.internet.error import ReactorAlreadyRunning
class CrawlerWorker(multiprocessing.Process):
    """Process that runs one spider and puts the collected items on a queue."""

    def __init__(self, spider, result_queue):
        """
        :param spider: spider handed to the crawler in ``run()``.
        :param result_queue: queue that receives the list of collected items.
        """
        from scrapy.crawler import CrawlerProcess

        super(CrawlerWorker, self).__init__()
        self.spider = spider
        self.result_queue = result_queue
        self.items = []

        self.crawler = CrawlerProcess(CrawlerSettings(settings))
        # Install the crawler only if the project has none yet.
        already_installed = hasattr(project, 'crawler')
        if not already_installed:
            self.crawler.install()
        self.crawler.configure()

        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every item passed by the spider."""
        self.items.append(item)

    def run(self):
        """Crawl the spider, then publish the collected items on the queue."""
        self.crawler.crawl(self.spider)
        try:
            self.crawler.start()
        except ReactorAlreadyRunning:
            # Deliberately ignored: carry on and still report the items.
            pass
        self.crawler.stop()
        self.result_queue.put(self.items)
@task
def execute_spider(spider, **spider__kwargs):
    '''
    Execute spider within separate process

    @param spider: spider instance to crawl, or the name of a spider to
        create through the spider manager
    @param spider__kwargs: keyword arguments used when the spider is
        created from its name
    @return: list of the items collected by the worker process
    '''
    if not isinstance(spider, BaseSpider):
        manager = SpiderManager(settings.SPIDER_MODULES)
        spider = manager.create(spider, **spider__kwargs)
    result_queue = Queue()
    crawler = CrawlerWorker(spider, result_queue)
    crawler.start()
    # get() blocks until the worker has put its item list on the queue.
    # The original built this list and then dropped it; hand it to the caller.
    return list(result_queue.get())
My guess is that it is caused by running the Twisted reactor multiple times.
How can I avoid it? Is there in general a way to run the spiders without reactor?
I figured out what caused the problem: if you somehow call the execute_spider method inside the CrawlerWorker process (for instance via recursion), it tries to create a second reactor, which is not possible.
My solution: move all statements that cause recursive calls into the execute_spider method, so that they trigger the spider execution in the same process rather than in a secondary CrawlerWorker. I also added the following statement
try:
self.crawler.start()
except ReactorAlreadyRunning:
raise RecursiveSpiderCall("Spider %s was called from another spider recursively. Such behavior is not allowed" % (self.spider))
to catch unintentionally recursive calls of spiders.