Locally run all of the spiders in Scrapy

Locally run all of the spiders in Scrapy - python

Is there a way to run all of the spiders in a Scrapy project without using the Scrapy daemon? There used to be a way to run multiple spiders with scrapy crawl, but that syntax was removed and Scrapy's code changed quite a bit.
I tried creating my own command:
from scrapy.command import ScrapyCommand
from scrapy.utils.misc import load_object
from scrapy.conf import settings
class Command(ScrapyCommand):
    """Custom Scrapy command that attempts to run every spider in the project."""

    requires_project = True

    def syntax(self):
        """Return the usage string for this command."""
        return '[options]'

    def short_desc(self):
        """Return the one-line description of this command."""
        return 'Runs all of the spiders'

    def run(self, args, opts):
        """Schedule every spider listed by the spider manager, then start crawling."""
        spman_cls = load_object(settings['SPIDER_MANAGER_CLASS'])
        spiders = spman_cls.from_settings(settings)

        # Every spider is handed to the same shared self.crawler instance.
        for spider_name in spiders.list():
            spider = self.crawler.spiders.create(spider_name)
            self.crawler.crawl(spider)

        self.crawler.start()
But once a spider is registered with self.crawler.crawl(), I get assertion errors for all of the other spiders:
Traceback (most recent call last):
File "/usr/lib/python2.7/site-packages/scrapy/cmdline.py", line 138, in _run_command
cmd.run(args, opts)
File "/home/blender/Projects/scrapers/store_crawler/store_crawler/commands/crawlall.py", line 22, in run
self.crawler.crawl(spider)
File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 47, in crawl
return self.engine.open_spider(spider, requests)
File "/usr/lib/python2.7/site-packages/twisted/internet/defer.py", line 1214, in unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- <exception caught here> ---
File "/usr/lib/python2.7/site-packages/twisted/internet/defer.py", line 1071, in _inlineCallbacks
result = g.send(result)
File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 215, in open_spider
spider.name
exceptions.AssertionError: No free spider slots when opening 'spidername'
Is there any way to do this? I'd rather not start subclassing core Scrapy components just to run all of my spiders like this.

Why didn't you just use something like:
scrapy list|xargs -n 1 scrapy crawl
?

Here is an example that does not run inside a custom command, but runs the Reactor manually and creates a new Crawler for each spider:
from twisted.internet import reactor
from scrapy.crawler import Crawler
# scrapy.conf.settings singleton was deprecated last year
from scrapy.utils.project import get_project_settings
from scrapy import log
def setup_crawler(spider_name):
    """Create, configure and start a dedicated Crawler for ``spider_name``."""
    crawler = Crawler(settings)
    crawler.configure()
    spider = crawler.spiders.create(spider_name)
    crawler.crawl(spider)
    crawler.start()


log.start()
settings = get_project_settings()
# This crawler is only used to enumerate the project's spider names.
crawler = Crawler(settings)
crawler.configure()
for spider_name in crawler.spiders.list():
    setup_crawler(spider_name)
reactor.run()
You will have to design some signal system to stop the reactor when all spiders are finished.
EDIT: And here is how you can run multiple spiders in a custom command:
from scrapy.command import ScrapyCommand
from scrapy.utils.project import get_project_settings
from scrapy.crawler import Crawler
class Command(ScrapyCommand):
    """Custom Scrapy command that runs each project spider in its own Crawler."""

    requires_project = True

    def syntax(self):
        """Return the usage string for this command."""
        return '[options]'

    def short_desc(self):
        """Return the one-line description of this command."""
        return 'Runs all of the spiders'

    def run(self, args, opts):
        """Start one freshly configured Crawler per spider name."""
        settings = get_project_settings()

        # self.crawler supplies the spider names; each spider then gets
        # a separate Crawler instance of its own.
        for spider_name in self.crawler.spiders.list():
            crawler = Crawler(settings)
            crawler.configure()
            spider = crawler.spiders.create(spider_name)
            crawler.crawl(spider)
            crawler.start()

        self.crawler.start()

The answer by @Steven Almeroth fails in Scrapy 1.0; you should edit the script like this:
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
class Command(ScrapyCommand):
    """Custom Scrapy command that runs all project spiders except ``excludes``."""

    requires_project = True
    # Spider names that must not be scheduled by this command.
    excludes = ['spider1']

    def syntax(self):
        """Return the usage string for this command."""
        return '[options]'

    def short_desc(self):
        """Return the one-line description of this command."""
        return 'Runs all of the spiders'

    def run(self, args, opts):
        """Schedule every non-excluded spider on one CrawlerProcess and start it."""
        settings = get_project_settings()
        crawler_process = CrawlerProcess(settings)

        for spider_name in crawler_process.spider_loader.list():
            if spider_name in self.excludes:
                continue
            spider_cls = crawler_process.spider_loader.load(spider_name)
            crawler_process.crawl(spider_cls)

        # Started once, after all spiders have been scheduled.
        crawler_process.start()

This code works on my Scrapy version, 1.3.3 (save it in the same directory as scrapy.cfg):
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
setting = get_project_settings()
process = CrawlerProcess(setting)

# Schedule every spider known to the process, then start them all at once.
for spider_name in process.spiders.list():
    print("Running spider %s" % (spider_name))
    process.crawl(spider_name, query="dvh")  # query dvh is custom argument used in your scrapy

process.start()
For Scrapy 1.5.x (so you don't get the deprecation warning):
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
setting = get_project_settings()
process = CrawlerProcess(setting)

# Schedule every spider known to the spider loader, then start them all at once.
for spider_name in process.spider_loader.list():
    print("Running spider %s" % (spider_name))
    process.crawl(spider_name, query="dvh")  # query dvh is custom argument used in your scrapy

process.start()

Linux script
#!/bin/bash
for spider in $(scrapy list)
do
scrapy crawl "$spider" -o "$spider".json
done

Running all spiders in project using python
# Run all spiders in project implemented using Scrapy 2.7.0
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def main():
    """Schedule every spider in the project on one CrawlerProcess and run it."""
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    spiders_names = process.spider_loader.list()
    for s in spiders_names:
        process.crawl(s)
    # Started once, after all spiders have been scheduled.
    process.start()


if __name__ == '__main__':
    main()

Related

Scrapy: Running multiple spiders from the same python process via cmdLine fails

Here's the code:
if __name__ == '__main__':
    # Each call hands one "scrapy crawl" command line to Scrapy's cmdline module.
    cmdline.execute("scrapy crawl spider_a -L INFO".split())
    cmdline.execute("scrapy crawl spider_b -L INFO".split())
I intend to run multiple spiders from the same main entry point of a Scrapy project, but it turns out that only the first spider runs successfully, whereas the second one seems to be ignored. Any suggestions?

Just do
import subprocess
subprocess.call('for spider in spider_a spider_b; do scrapy crawl $spider -L INFO; done', shell=True)

From the scrapy documentation: https://doc.scrapy.org/en/latest/topics/practices.html#running-multiple-spiders-in-the-same-process
import scrapy
from scrapy.crawler import CrawlerProcess
from .spiders import Spider1, Spider2
process = CrawlerProcess()
# Schedule the spider classes brought in by the import above.
process.crawl(Spider1)
process.crawl(Spider2)
process.start() # the script will block here until all crawling jobs are finished
EDIT: If you wish to run multiple spiders two by two, you can do the following:
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
configure_logging()
runner = CrawlerRunner()
spiders = [Spider1, Spider2, Spider3, Spider4]


def join_spiders(spiders):
    """Setup a new runner with the provided spiders.

    Returns the Deferred from ``runner.join()`` so the caller can wait
    for the whole group.
    """
    runner = CrawlerRunner()
    # Add each spider to the current runner
    for spider in spiders:
        runner.crawl(spider)
    # Hand back the Deferred for the whole group instead of yielding it:
    # a plain generator would not be waited on by the caller.
    return runner.join()


@defer.inlineCallbacks
def crawl(group_by=2):
    """Run ``spiders`` in consecutive groups of ``group_by``, then stop the reactor."""
    # Wait for each group of `group_by` spiders before starting the next one
    for i in range(0, len(spiders), group_by):
        yield join_spiders(spiders[i:i + group_by])
    # When we finished running all the spiders, stop the twisted reactor
    reactor.stop()


crawl()
reactor.run() # the script will block here until the last crawl call is finished
I haven't tested all of this though, so let me know if it works!

Change settings for Scrapy CrawlerRunner

I'm trying to change the settings for Scrapy. I've managed to successfully do this for CrawlerProcess before. But I can't seem to get it to work for CrawlerRunner. The log should be disabled but I'm still seeing output from the log. What am I doing wrong? Thanks.
import scrapy
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
class MySpider1(scrapy.Spider):
    name = "spider1"


class MySpider2(scrapy.Spider):
    name = "spider2"


configure_logging()
s = get_project_settings()
s.update({
    "LOG_ENABLED": "False"
})
runner = CrawlerRunner(s)


@defer.inlineCallbacks
def crawl():
    """Run MySpider1, then MySpider2, then stop the reactor."""
    yield runner.crawl(MySpider1)
    yield runner.crawl(MySpider2)
    reactor.stop()


crawl()
reactor.run()

According to the docs and the API, you should use your settings to initialize the logger, so you should adjust your code like this:
# comment that line
# configure_logging()
s = get_project_settings()
s.update({
"LOG_ENABLED": "False"
})
# init the logger using setting
configure_logging(s)
runner = CrawlerRunner(s)
Then you will get what you want.

Easiest way to run scrapy crawler so it doesn't block the script

The official docs give many ways for running scrapy crawlers from code:
import scrapy
from scrapy.crawler import CrawlerProcess
class MySpider(scrapy.Spider):
    # Your spider definition
    ...


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(MySpider)
process.start() # the script will block here until the crawling is finished
But all of them block script until crawling is finished. What's the easiest way in python to run the crawler in a non-blocking, async manner?

I tried every solution I could find, and the only one that worked for me was this. But in order to make it work with Scrapy 1.1rc1 I had to tweak it a little bit:
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from billiard import Process
class CrawlerScript(Process):
    """Process subclass that crawls one spider and runs the reactor in ``run``."""

    def __init__(self, spider):
        """Build a Crawler for ``spider``'s class from the project settings."""
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(spider.__class__, settings)
        # Stop the reactor as soon as the spider_closed signal is sent.
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        """Start the crawl and run the reactor."""
        self.crawler.crawl(self.spider)
        reactor.run()
def crawl_async():
    """Crawl MySpider through a CrawlerScript process."""
    spider = MySpider()
    crawler = CrawlerScript(spider)
    crawler.start()
    # NOTE(review): join() presumably waits for the child process to exit;
    # confirm whether the caller is meant to wait here.
    crawler.join()
So now when I call crawl_async, it starts crawling and doesn't block my current thread. I'm absolutely new to Scrapy, so maybe this isn't a very good solution, but it worked for me.
I used these versions of the libraries:
cffi==1.5.0
Scrapy==1.1rc1
Twisted==15.5.0
billiard==3.3.0.22

Netimen's answer is correct. process.start() calls reactor.run(), which blocks the thread. Just that I don't think it is necessary to subclass billiard.Process. Although poorly documented, billiard.Process does have a set of APIs to call another function asynchronously without subclassing.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from billiard import Process
crawler = CrawlerProcess(get_project_settings())
# Arguments for the target go through ``kwargs``; passing stop_after_crawl
# to Process itself would not forward it to crawler.start.
process = Process(target=crawler.start, kwargs={'stop_after_crawl': False})


def crawl(*args, **kwargs):
    """Schedule a crawl with the given arguments and start the background process."""
    crawler.crawl(*args, **kwargs)
    process.start()
Note that if you don't have stop_after_crawl=False, you may run into ReactorNotRestartable exception when you run the crawler for more than twice.

How to stop reactor when both spiders finished

I have this code, and when both spiders have finished the program is still running.
#!C:\Python27\python.exe
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from carrefour.spiders.tesco import TescoSpider
from carrefour.spiders.carr import CarrSpider
from scrapy.utils.project import get_project_settings
import threading
import time
def tescofcn():
    """Configure and start a Crawler for TescoSpider using the project settings."""
    tescoSpider = TescoSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(tescoSpider)
    crawler.start()
def carrfcn():
    """Configure and start a Crawler for CarrSpider using the project settings."""
    carrSpider = CarrSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(carrSpider)
    crawler.start()
t1=threading.Thread(target=tescofcn)
t2=threading.Thread(target=carrfcn)
t1.start()
t2.start()
log.start()
reactor.run()
When I tried inserting this into both functions
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
, the spider that finished first stopped the reactor for both spiders, and the slower spider was terminated even though it had not finished.

What you could do is create a function that checks the list of running spiders and connect that to signals.spider_closed.
from scrapy.utils.trackref import iter_all
def close_reactor_if_no_spiders():
    """Stop the reactor when ``iter_all('Spider')`` yields no objects."""
    running_spiders = list(iter_all('Spider'))
    if not running_spiders:
        reactor.stop()
crawler.signals.connect(close_reactor_if_no_spiders, signal=signals.spider_closed)
Although, I still would recommend using scrapyd to manage running multiple spiders.

cannot import scrapy modules as library

I'm trying to run spiders from python script following scrapy document: http://doc.scrapy.org/en/latest/topics/practices.html
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
spider = FollowAllSpider(domain='scrapinghub.com')
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run() # the script will block here until the spider_closed signal was sent
But python just cannot import the module, the error looks like this:
Traceback (most recent call last):
...
from scrapy.crawler import Crawler
File "aappp/scrapy.py", line 1, in <module>
ImportError: No module named crawler
The issue is briefly mentioned in the FAQ of the Scrapy documentation, but it doesn't help me much.

Have you tried doing it this way?
from scrapy.project import crawler
(That's how it's done on http://doc.scrapy.org/en/latest/faq.html - looks like they already answered your question there.)
It also gives a more recent way of doing it and calls this previous method deprecated:
"This way to access the crawler object is deprecated, the code should be ported to use from_crawler class method, for example:
class SomeExtension(object):
    """Example extension that receives the crawler through ``from_crawler``."""

    @classmethod
    def from_crawler(cls, crawler):
        """Create an instance and store ``crawler`` on it as ``crawler``."""
        o = cls()
        o.crawler = crawler
        return o
"

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Locally run all of the spiders in Scrapy - python

Why didn't you just use something like: scrapy list|xargs -n 1 scrapy crawl ?

Linux script #!/bin/bash for spider in $(scrapy list) do scrapy crawl "$spider" -o "$spider".json done

Related

Scrapy: Running multiple spiders from the same python process via cmdLine fails

Change settings for Scrapy CrawlerRunner

Easiest way to run scrapy crawler so it doesn't block the script

How to stop reactor when both spiders finished

cannot import scrapy modules as library

Categories

Resources