Pass an argument to a Scrapy spider within a Python script

I can run a crawl in a Python script with the following recipe from the wiki:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
spider = FollowAllSpider(domain='scrapinghub.com')
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
As you can see, I can just pass the domain to FollowAllSpider, but my question is: how can I pass the start_urls (actually a date that will be appended to a fixed URL) to my spider class using the above code?
This is my spider class:
class MySpider(CrawlSpider):
    name = 'tw'

    def __init__(self, date):
        y, m, d = date.split('-')  # this is a test, it could split with a regex!
        try:
            y, m, d = int(y), int(m), int(d)
        except ValueError:
            raise ValueError('Enter a valid date')

        self.allowed_domains = ['mydomin.com']
        self.start_urls = ['my_start_urls{}-{}-{}'.format(y, m, d)]

    def parse(self, response):
        questions = Selector(response).xpath('//div[@class="result-link"]/span/a/@href')
        for question in questions:
            item = PoptopItem()
            item['url'] = question.extract()
            yield item['url']
And this is my script:
from pdfcreator import convertor
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
#from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
from poptop.spiders.stackoverflow_spider import MySpider
from poptop.items import PoptopItem
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
date=raw_input('Enter the date with this format (d-m-Y) : ')
print date
spider=MySpider(date=date)
crawler.crawl(spider)
crawler.start()
log.start()
item=PoptopItem()
for url in item['url']:
    convertor(url)
reactor.run() # the script will block here until the spider_closed signal was sent
If I just print the item, I'll get the following error:
2015-02-25 17:13:47+0330 [tw] ERROR: Spider must return Request, BaseItem or None, got 'unicode' in <GET test-link2015-1-17>
items:
import scrapy

class PoptopItem(scrapy.Item):
    titles = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()

You need to modify your __init__() constructor to accept the date argument. Also, I would use datetime.strptime() to parse the date string:
from datetime import datetime

class MySpider(CrawlSpider):
    name = 'tw'
    allowed_domains = ['test.com']

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)

        date = kwargs.get('date')
        if not date:
            raise ValueError('No date given')

        dt = datetime.strptime(date, "%m-%d-%Y")
        self.start_urls = ['http://test.com/{dt.year}-{dt.month}-{dt.day}'.format(dt=dt)]
Then, you would instantiate the spider this way:
spider = MySpider(date='01-01-2015')
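As an aside (not part of the original answer), when the spider is started from the command line rather than from a script, the same keyword argument can be passed with Scrapy's -a option:

scrapy crawl tw -a date=01-01-2015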
Or, you can even avoid parsing the date at all, passing a datetime instance in the first place:
class MySpider(CrawlSpider):
    name = 'tw'
    allowed_domains = ['test.com']

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)

        dt = kwargs.get('dt')
        if not dt:
            raise ValueError('No date given')

        self.start_urls = ['http://test.com/{dt.year}-{dt.month}-{dt.day}'.format(dt=dt)]

spider = MySpider(dt=datetime(year=2014, month=1, day=1))
And, just FYI, see this answer for a detailed example of how to run Scrapy from a script.
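For reference, a minimal sketch of the same idea (not from the linked answer) using CrawlerProcess, which manages the Twisted reactor itself so the reactor/log boilerplate shown at the top is not needed:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# CrawlerProcess starts and stops the reactor for you.
process = CrawlerProcess(get_project_settings())

# Keyword arguments are forwarded to the spider's __init__ via **kwargs.
process.crawl(MySpider, date='01-01-2015')
process.start()  # blocks until the crawl finishes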

Related

Scrapy return/pass data to another module

Hi, I'm wondering how I could pass the scraping result, which is a pandas DataFrame, to the module that created the spider.
import mySpider as mspider

def main():
    spider1 = mspider.MySpider()
    process = CrawlerProcess()
    process.crawl(spider1)
    process.start()
    print(len(spider1.result))
Spider:
class MySpider(scrapy.Spider):
    name = 'MySpider'
    allowed_domains = config.ALLOWED_DOMAINS
    result = pd.DataFrame(columns=...)

    def start_requests(self):
        yield Request(url=..., headers=config.HEADERS, callback=self.parse)

    def parse(self, response):
        # ...some code that adds values to result...
        print("size: " + str(len(self.result)))
The printed value in the main method is 0, while in the parse method it is 1005. Could you tell me how I should pass the value between them?
I would like to do that because I'm running multiple spiders. After they finish scraping, I'll merge the results and save them to a file.
SOLUTION
def spider_closed(spider, reason):
    print("Size" + str(len(spider.result)))

def main():
    now = datetime.now()
    spider1 = spider.MySpider()
    crawler_process = CrawlerProcess()
    crawler = crawler_process.create_crawler(spider1)
    crawler.signals.connect(spider_closed, signals.spider_closed)
    crawler_process.crawl(spider1)
    crawler_process.start()
The main reason for this behavior is the asynchronous nature of Scrapy itself. The print(len(spider1.result)) line would be executed before the .parse() method is called.
There are multiple ways to wait for the spider to finish. I would use the spider_closed signal:
from scrapy import signals

def spider_closed(spider, reason):
    print(len(spider.result))

spider1 = mspider.MySpider()

crawler_process = CrawlerProcess(settings)
crawler = crawler_process.create_crawler()
crawler.signals.connect(spider_closed, signals.spider_closed)
crawler.crawl(spider1)
crawler_process.start()

Change settings for Scrapy CrawlerRunner

I'm trying to change the settings for Scrapy. I've managed to successfully do this for CrawlerProcess before. But I can't seem to get it to work for CrawlerRunner. The log should be disabled but I'm still seeing output from the log. What am I doing wrong? Thanks.
import scrapy
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings

class MySpider1(scrapy.Spider):
    name = "spider1"

class MySpider2(scrapy.Spider):
    name = "spider2"

configure_logging()
s = get_project_settings()
s.update({
    "LOG_ENABLED": "False"
})
runner = CrawlerRunner(s)

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(MySpider1)
    yield runner.crawl(MySpider2)
    reactor.stop()

crawl()
reactor.run()
According to the docs and the API, you should use your settings to initialize the logger, so you should adjust your code like this:
# comment out that line
# configure_logging()

s = get_project_settings()
s.update({
    "LOG_ENABLED": "False"
})

# initialize the logger using the settings
configure_logging(s)
runner = CrawlerRunner(s)
Then you will get what you want.

Scrapy crawl multiple times in long running process

So, I made this class so that I can crawl on-demand using Scrapy:
from scrapy import signals
from scrapy.crawler import CrawlerProcess, Crawler
from scrapy.settings import Settings

class NewsCrawler(object):

    def __init__(self, spiders=[]):
        self.spiders = spiders
        self.settings = Settings()

    def crawl(self, start_date, end_date):
        crawled_items = []

        def add_item(item):
            crawled_items.append(item)

        process = CrawlerProcess(self.settings)

        for spider in self.spiders:
            crawler = Crawler(spider, self.settings)
            crawler.signals.connect(add_item, signals.item_scraped)
            process.crawl(crawler, start_date=start_date, end_date=end_date)

        process.start()

        return crawled_items
Basically, I have a long-running process and I will call the above class's crawl method multiple times, like this:
import time

crawler = NewsCrawler(spiders=[Spider1, Spider2])

while True:
    items = crawler.crawl(start_date, end_date)
    # do something with crawled items ...
    time.sleep(3600)
The problem is that the second time crawl is called, this error occurs: twisted.internet.error.ReactorNotRestartable.
From what I gathered, it's because the reactor can't be run again after it has been stopped. Is there any workaround for that?
Thanks!
This is a limitation of Scrapy (Twisted) at the moment, and it makes it hard to use Scrapy as a library.
What you can do is fork a new process which runs the crawler and stops the reactor when the crawl is finished. You can then wait for the join and spawn a new process after the crawl has finished. If you want to handle the items in your main process, you can post the results to a Queue. I would recommend using a customized pipeline for your items, though.
Have a look at the following answer by me: https://stackoverflow.com/a/22202877/2208253
You should be able to apply the same principles, but you would use multiprocessing instead of billiard.
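To illustrate that idea, here is a rough sketch (my own hedged example with made-up helper names, not the original answerer's code) of running each crawl in a fresh process and shipping the scraped items back over a Queue:

from multiprocessing import Process, Queue

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def _run_crawl(queue, spider_cls, **spider_kwargs):
    # Runs entirely in a child process, so the Twisted reactor
    # is created and stopped here without affecting the parent.
    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler(spider_cls)
    crawler.signals.connect(
        lambda item, response, spider: queue.put(item),
        signal=signals.item_scraped)
    process.crawl(crawler, **spider_kwargs)
    process.start()   # blocks until the crawl is finished
    queue.put(None)   # sentinel: tells the parent the crawl is done


def crawl_once(spider_cls, **spider_kwargs):
    queue = Queue()
    child = Process(target=_run_crawl, args=(queue, spider_cls),
                    kwargs=spider_kwargs)
    child.start()

    items = []
    while True:
        item = queue.get()   # drain while the child is still running
        if item is None:     # sentinel reached: crawl finished
            break
        items.append(item)

    child.join()
    return items

Calling crawl_once(SpiderClass, start_date=..., end_date=...) inside the hourly loop avoids ReactorNotRestartable, because each iteration gets its own reactor in its own process.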
Based on @bj-blazkowicz's answer above, I found a solution with CrawlerRunner, which is the recommended crawler to use when running multiple spiders, as stated in the docs: https://docs.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
There’s another Scrapy utility that provides more control over the crawling process: scrapy.crawler.CrawlerRunner. This class is a thin wrapper that encapsulates some simple helpers to run multiple crawlers, but it won’t start or interfere with existing reactors in any way.
Using this class the reactor should be explicitly run after scheduling your spiders. It’s recommended you use CrawlerRunner instead of CrawlerProcess if your application is already using Twisted and you want to run Scrapy in the same reactor.
Code in your main file:
from multiprocessing import Process, Queue
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from twisted.internet import reactor
# Enable logging for CrawlerRunner
configure_logging()
class CrawlerRunnerProcess(Process):
    def __init__(self, spider, q, *args):
        Process.__init__(self)
        self.runner = CrawlerRunner(get_project_settings())
        self.spider = spider
        self.q = q
        self.args = args

    def run(self):
        deferred = self.runner.crawl(self.spider, self.q, self.args)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run(installSignalHandlers=False)

# The wrapper to make it run multiple spiders, multiple times
def run_spider(spider, *args):  # optional arguments
    q = Queue()  # optional queue to return spider results
    runner = CrawlerRunnerProcess(spider, q, *args)
    runner.start()
    runner.join()
    return q.get()
Code in your spider file:
class MySpider(Spider):
    name = 'my_spider'

    def __init__(self, q, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.q = q  # optional queue
        self.args = args  # optional args

    def parse(self, response):
        my_item = MyItem()
        self.q.put(my_item)
        yield my_item
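For completeness, the wrapper above would then be used roughly like this (spider and item names as assumed in the answer; note that run_spider() returns whatever the single q.get() yields, i.e. the first item the spider put on the queue):

first_item = run_spider(MySpider)            # blocks until the child process exits
first_item = run_spider(MySpider, 'extra')   # optional positional args end up in spider.args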

How to automatically restart Scrapy when the scraping is completed

I'm trying to automatically restart my spider when the scraping is completed, more specifically when the response status is bad.
For example, I've got this code:
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from urlparse import urljoin
from bs4 import BeautifulSoup
from scrapy.spider import BaseSpider
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from datetime import datetime
import re

class level1(BaseSpider):
    # Crawling start
    CrawlSpider.started_on = datetime.now()
    name = "level1"
    base_domain = 'http://www.google.com'
    DOWNLOAD_DELAY = 3
    restart = False
    handle_httpstatus_list = [404, 302, 503, 999, 200]  # add any other code you need

    # Call sendEmail class
    email = sendEmail()

    # Call log settings
    saveLog = runlog()

    # Init
    def __init__(self, url='', child='', parent=''):
        self.start_urls = [url]
        self.child = child
        self.parent = parent

        # run baby, run :)
        super(level1, self).__init__(self.start_urls)

        # On spider closed
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, reason):
        if self.restart:
            print "we need to retry"
            super(level1, self).__init__(self.start_urls)
        else:
            print "ok"

        # parsing time
        work_time = datetime.now() - CrawlSpider.started_on

        # Correctly finished
        if reason == "finished":
            print "finished"

    def parse(self, response):
        if response.status == 503:
            self.restart = True

        if response.status == 999:
            self.restart = True

        if str(response.status) == "200":
            # Selector
            sel = Selector(response)
            todo
In the spider_closed method, I try to restart my spider when the response status is bad, but it doesn't work.
How can I resolve this?
I am not sure that calling __init__ is going to restart your spider.
Take a look at this link: http://doc.scrapy.org/en/1.0/topics/api.html?highlight=scrapy%20start#scrapy.crawler.CrawlerProcess.start
In the worst case, you could write a separate program that spawns the crawler using this core API (from the link) and restarts it as necessary, though I agree that a restart inside the spider script would be much simpler.
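A hedged sketch of that "separate program" idea (my own example, assuming the level1 spider from the question and a hypothetical needs_retry() check): run each attempt in its own process, so the reactor restriction never applies, and decide in the parent whether to go again:

from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_once(url):
    # Everything Twisted-related lives and dies inside this child process.
    process = CrawlerProcess(get_project_settings())
    process.crawl(level1, url=url)
    process.start()


def needs_retry():
    # Hypothetical check: e.g. read a flag file or database entry that the
    # spider writes in spider_closed when it saw a 503/999 response.
    return False


if __name__ == '__main__':
    while True:
        attempt = Process(target=run_once, args=('http://www.google.com',))
        attempt.start()
        attempt.join()
        if not needs_retry():
            break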

Locally run all of the spiders in Scrapy

Is there a way to run all of the spiders in a Scrapy project without using the Scrapy daemon? There used to be a way to run multiple spiders with scrapy crawl, but that syntax was removed and Scrapy's code changed quite a bit.
I tried creating my own command:
from scrapy.command import ScrapyCommand
from scrapy.utils.misc import load_object
from scrapy.conf import settings

class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spman_cls = load_object(settings['SPIDER_MANAGER_CLASS'])
        spiders = spman_cls.from_settings(settings)

        for spider_name in spiders.list():
            spider = self.crawler.spiders.create(spider_name)
            self.crawler.crawl(spider)

        self.crawler.start()
But once a spider is registered with self.crawler.crawl(), I get assertion errors for all of the other spiders:
Traceback (most recent call last):
  File "/usr/lib/python2.7/site-packages/scrapy/cmdline.py", line 138, in _run_command
    cmd.run(args, opts)
  File "/home/blender/Projects/scrapers/store_crawler/store_crawler/commands/crawlall.py", line 22, in run
    self.crawler.crawl(spider)
  File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 47, in crawl
    return self.engine.open_spider(spider, requests)
  File "/usr/lib/python2.7/site-packages/twisted/internet/defer.py", line 1214, in unwindGenerator
    return _inlineCallbacks(None, gen, Deferred())
--- <exception caught here> ---
  File "/usr/lib/python2.7/site-packages/twisted/internet/defer.py", line 1071, in _inlineCallbacks
    result = g.send(result)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 215, in open_spider
    spider.name
exceptions.AssertionError: No free spider slots when opening 'spidername'
Is there any way to do this? I'd rather not start subclassing core Scrapy components just to run all of my spiders like this.
Why didn't you just use something like this?

scrapy list | xargs -n 1 scrapy crawl
Here is an example that does not run inside a custom command, but runs the Reactor manually and creates a new Crawler for each spider:
from twisted.internet import reactor
from scrapy.crawler import Crawler
# scrapy.conf.settings singleton was deprecated last year
from scrapy.utils.project import get_project_settings
from scrapy import log

def setup_crawler(spider_name):
    crawler = Crawler(settings)
    crawler.configure()
    spider = crawler.spiders.create(spider_name)
    crawler.crawl(spider)
    crawler.start()

log.start()
settings = get_project_settings()
crawler = Crawler(settings)
crawler.configure()

for spider_name in crawler.spiders.list():
    setup_crawler(spider_name)

reactor.run()
You will have to design some signal system to stop the reactor when all spiders are finished.
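One possible sketch of such a signal system (my own example, built on the same old-style Crawler API as the code above): track the open spiders and stop the reactor when the last spider_closed fires:

from twisted.internet import reactor
from scrapy import signals

open_spiders = set()

def spider_closed(spider, reason):
    open_spiders.discard(spider)
    if not open_spiders:
        reactor.stop()   # the last spider finished, shut the reactor down

def setup_crawler(spider_name):
    crawler = Crawler(settings)
    crawler.signals.connect(spider_closed, signal=signals.spider_closed)
    crawler.configure()
    spider = crawler.spiders.create(spider_name)
    open_spiders.add(spider)
    crawler.crawl(spider)
    crawler.start()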
EDIT: And here is how you can run multiple spiders in a custom command:
from scrapy.command import ScrapyCommand
from scrapy.utils.project import get_project_settings
from scrapy.crawler import Crawler

class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        settings = get_project_settings()

        for spider_name in self.crawler.spiders.list():
            crawler = Crawler(settings)
            crawler.configure()
            spider = crawler.spiders.create(spider_name)
            crawler.crawl(spider)
            crawler.start()

        self.crawler.start()
The answer by @Steven Almeroth will fail in Scrapy 1.0; you should edit the script like this:
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

class Command(ScrapyCommand):
    requires_project = True
    excludes = ['spider1']

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        settings = get_project_settings()
        crawler_process = CrawlerProcess(settings)

        for spider_name in crawler_process.spider_loader.list():
            if spider_name in self.excludes:
                continue
            spider_cls = crawler_process.spider_loader.load(spider_name)
            crawler_process.crawl(spider_cls)

        crawler_process.start()
This code works on my Scrapy version, 1.3.3 (save it in the same directory as scrapy.cfg):
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

setting = get_project_settings()
process = CrawlerProcess(setting)

for spider_name in process.spiders.list():
    print("Running spider %s" % (spider_name))
    process.crawl(spider_name, query="dvh")  # query dvh is a custom argument used in your scrapy project

process.start()
For Scrapy 1.5.x (so you don't get the deprecation warning):
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

setting = get_project_settings()
process = CrawlerProcess(setting)

for spider_name in process.spider_loader.list():
    print("Running spider %s" % (spider_name))
    process.crawl(spider_name, query="dvh")  # query dvh is a custom argument used in your scrapy project

process.start()
Linux script
#!/bin/bash
for spider in $(scrapy list)
do
scrapy crawl "$spider" -o "$spider".json
done
Running all spiders in the project using Python:
# Run all spiders in a project implemented using Scrapy 2.7.0
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def main():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    spiders_names = process.spider_loader.list()
    for s in spiders_names:
        process.crawl(s)
    process.start()

if __name__ == '__main__':
    main()
