Callback for celery apply_async - python

I use Celery in my application to run periodic tasks. Here is a simplified example:

from datetime import timedelta

from celery import group
from celery.task import periodic_task

from myqueue import Queue

@periodic_task(run_every=timedelta(minutes=1))
def process_queue():
    queue = Queue()
    uid, questions = queue.pop()
    if uid is None:
        return
    job = group(do_stuff(q) for q in questions)
    job.apply_async()

def do_stuff(question):
    try:
        ...
    except:
        ...
        raise
As you can see in the example above, I use Celery to run async tasks, but (since it's a queue) I need to call queue.fail(uid) if an exception is raised in do_stuff, or queue.ack(uid) otherwise. In this situation it would be very clean and useful to have some callback from my task in both cases - on_failure and on_success.
I have read some documentation, but I have never seen practical examples of using callbacks with apply_async. Is it possible to do that?

Subclass the Task class and override the on_success and on_failure methods:

from celery import Task

class CallbackTask(Task):
    def on_success(self, retval, task_id, args, kwargs):
        '''
        retval – The return value of the task.
        task_id – Unique id of the executed task.
        args – Original arguments for the executed task.
        kwargs – Original keyword arguments for the executed task.
        '''
        pass

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        '''
        exc – The exception raised by the task.
        task_id – Unique id of the failed task.
        args – Original arguments for the task that failed.
        kwargs – Original keyword arguments for the task that failed.
        einfo – ExceptionInfo instance containing the traceback.
        '''
        pass

Use:

@celery.task(base=CallbackTask)  # this does the trick
def add(x, y):
    return x + y
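Applied to the queue scenario from the question, a minimal sketch could look like the following (assumptions: the uid is passed to each do_stuff call as its first positional argument, and Queue().ack / Queue().fail behave as described in the question):

from celery import Task

from myqueue import Queue

class QueueCallbackTask(Task):
    def on_success(self, retval, task_id, args, kwargs):
        # args[0] is assumed to be the uid handed to the task
        Queue().ack(args[0])

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        Queue().fail(args[0])

@celery.task(base=QueueCallbackTask)  # `celery` is your app instance, as in the answer above
def do_stuff(uid, question):
    pass  # the task body from the question goes here

Note that every sub-task in the group would then ack or fail the same uid, so in practice you may want to acknowledge only once the whole group has finished (for example via a chord).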

You can specify success and error callbacks via the link and link_error kwargs when you call apply_async. The Celery docs include a clear example: http://docs.celeryproject.org/en/latest/userguide/calling.html#linking-callbacks-errbacks
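For reference, a small hedged sketch of what linking looks like (the callback task names here are illustrative; in Celery 3.x the error callback is applied with the id of the failed parent task):

@celery.task
def on_done(result):
    # link callbacks receive the parent task's return value
    print('succeeded: %r' % (result,))

@celery.task
def on_error(task_id):
    # link_error callbacks receive the id of the failed parent task
    print('failed task: %s' % task_id)

do_stuff.apply_async((question,), link=on_done.s(), link_error=on_error.s())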

Related

How to get the celery.Task object from the @app.task

I have a task function:

@app.task(base=ProcessingTask)
def do_processing_task(args):

ProcessingTask inherits celery.Task:

class ProcessingTask(celery.Task):
    def on_success(self, res, task_id, args, kwargs):

I start the task remotely with:

result = app.send_task("workerTasks.do_processing_task", args=[args])

(I don't have access to the workerTasks file from the server file that calls this, so send_task is the route I need to take.)
Within do_processing_task I'd like to get the instance of the ProcessingTask object so I can add some data to it that I can use later in ProcessingTask.on_success.
Is this possible? Thanks.
Bound tasks
A task being bound means the first argument to the task will always be the task instance (self), just like Python bound methods:
from celery.utils.log import get_task_logger

logger = get_task_logger(__name__)

@task(bind=True)
def add(self, x, y):
    logger.info(self.request.id)
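Combining this with the custom base class from the question, a hedged sketch of stashing data on the task instance and reading it back in on_success (the attribute name extra_data is illustrative; keep in mind the task instance is reused across calls within a worker process):

class ProcessingTask(celery.Task):
    def on_success(self, res, task_id, args, kwargs):
        # read back whatever the task body stored on the instance
        print(getattr(self, 'extra_data', None))

@app.task(base=ProcessingTask, bind=True)
def do_processing_task(self, args):
    self.extra_data = 'computed during the task'
    return args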

How to pass argument while calling a function using schedule library?

I am wondering if someone can help me figure out how to pass an argument when calling a job function with the schedule library. There are a couple of examples of passing arguments, but nothing that covers using threading and a run_threaded helper.
In the code snippet below I am trying to pass 'sample_input' as an argument and am confused about how to define this parameter.

def run_threaded(job_func):
    job_thread = threading.Thread(target=job_func)
    job_thread.start()

@with_logging
def job(input_name):
    print("I'm running on thread %s" % threading.current_thread())
    main(input_name)

schedule.every(10).seconds.do(run_threaded, job('sample_input'))
You can get this to work by altering the method definitions and the scheduling call to something like the following.

# run_threaded now accepts the arguments intended for job_func
def run_threaded(job_func, *args, **kwargs):
    print "======", args, kwargs
    job_thread = threading.Thread(target=job_func, args=args, kwargs=kwargs)
    job_thread.start()

# Pass the arguments when scheduling.
schedule.every(10).seconds.do(run_threaded, job, "sample_input")
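An alternative that leaves run_threaded untouched is to bind the argument up front with functools.partial (a sketch; job, with_logging, and main are assumed from the question):

import functools
import threading

import schedule

def run_threaded(job_func):
    threading.Thread(target=job_func).start()

# schedule calls run_threaded, which starts the pre-bound job in a thread
schedule.every(10).seconds.do(run_threaded, functools.partial(job, 'sample_input'))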

concurrent.futures.ThreadPoolExecutor max_workers can't be 0

If I spin up a ThreadPoolExecutor(max_workers=0), it works with Python 3.4 and Python 2.7 but raises an error with Python 3.5 and Python 3.6. I'm trying to create a ThreadPoolExecutor and ensure that no task gets added to it. Currently I have created a subclass of ThreadPoolExecutor that raises an exception in the overridden submit method. Is there a better way to do this?
Simply put, with Python 3.5 and 3.6 the max_workers argument is not allowed to be 0, for sanity reasons. My solution was to make a more "mock-like" version of ThreadPoolExecutor that records whether anything was submitted to the pool, so that assertions can be made about that afterwards. I'll share the code here in case someone wants to reuse it for their own purposes.
import threading

from concurrent import futures

class RecordingThreadPool(futures.Executor):
    """A thread pool that records whether it was used."""
    def __init__(self, max_workers):
        self._tp_executor = futures.ThreadPoolExecutor(max_workers=max_workers)
        self._lock = threading.Lock()
        self._was_used = False

    def submit(self, fn, *args, **kwargs):
        with self._lock:
            self._was_used = True
        return self._tp_executor.submit(fn, *args, **kwargs)

    def was_used(self):
        with self._lock:
            return self._was_used
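A hedged usage sketch in a test (the function under test and the assertion style are illustrative):

pool = RecordingThreadPool(max_workers=1)

def code_under_test(executor):
    # expected to do its work without submitting anything to the pool
    pass

code_under_test(pool)
assert not pool.was_used(), 'unexpected task was submitted to the pool'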

how to pass arguments to a python fabric custom task

I cannot figure out how to actually pass arguments to a fabric custom task.
I have a bunch of tasks that all need to do the same setup, so I was hoping to subclass the task and have the base class do the setup and then run the specific subtasks. Both the setup code and the subtasks need access to some arguments that are passed in from the command-line to the task. I also need to be able to set default values for the arguments.
Original Attempt
My original attempt shows what I am trying to do without any sub classes.
This code works correctly.
The code below is in file tmp1.py:
from fabric.api import task

def do_setup(myarg):
    ''' common setup for all tasks '''
    print "in do_setup(myarg=%s)" % myarg
    # do setup using myarg for something important

@task
def actual_task1(myarg='default_value', alias='at'):
    print "In actual_task1(myarg=%s)" % myarg
    do_setup(myarg)
    # do rest of work ...

@task
def actual_task2(myarg='default_value', alias='at'):
    print "In actual_task2(myarg=%s)" % myarg
    do_setup(myarg)
    # do rest of work ...
I run it from the command line without any args and correctly see the default for myarg of 'default_value':
fab -f ./tmp1.py actual_task1
Prints:
In actual_task1(myarg=default_value)
in do_setup(myarg=default_value)
Done.
Then I call it with myarg='hello' and see that 'hello' gets passed through correctly:
fab -f ./tmp1.py actual_task1:myarg='hello'
It outputs:
In actual_task1(myarg=hello)
in do_setup(myarg=hello)
Done.
Attempt with a custom task
My next attempt is to make a common task to encapsulate the setup part.
This is copied from http://docs.fabfile.org/en/1.5/usage/tasks.html
The code below is in the file tmp2.py:
from fabric.api import task
from fabric.tasks import Task

def do_setup(myarg):
    ''' common setup for all tasks '''
    print "in do_setup(myarg=%s)" % myarg
    # do setup using myarg for something important

'''
Attempt to make a common task to encapsulate the setup part
copied from http://docs.fabfile.org/en/1.5/usage/tasks.html
'''
class CustomTask(Task):
    def __init__(self, func, myarg, *args, **kwargs):
        super(CustomTask, self).__init__(*args, **kwargs)
        print("=> __init__(myarg=%s, args=%s, kwargs=%s" % (myarg, args, kwargs))
        self.func = func
        self.myarg = myarg
        print "in __init__: self.func=", self.func, "self.myarg=", self.myarg

    def run(self, *args, **kwargs):
        return self.func(self.myarg, *args, **kwargs)

@task(task_class=CustomTask, myarg='default_value', alias='at')
def actual_task1():
    print "In actual_task1(myarg=%s)" % myarg
    # do rest of work ...
When run, there are two problems:
__init__ gets "default_value" instead of "Hello"
It complains that actual_task1() expects 0 arguments
I run it this way:
fab -f ./tmp2.py actual_task1:myarg="Hello"
Prints:
=> __init__(myarg=default_value, args=(), kwargs={'alias': 'at'}
in __init__: self.func= <function actual_task1 at 0x...> self.myarg= default_value
Traceback (most recent call last):
  File "/home/xxx/Documents/pyenvs/xxx/local/lib/python2.7/site-packages/fabric/main.py", line 743, in main
    *args, **kwargs
  File "/home/xxx/Documents/pyenvs/xxx/local/lib/python2.7/site-packages/fabric/tasks.py", line 405, in execute
    results['<local-only>'] = task.run(*args, **new_kwargs)
  File "/home/xxx/test_fab/tmp2.py", line 21, in run
    return self.func(self.myarg, *args, **kwargs)
TypeError: actual_task1() takes no arguments (1 given)
I spent quite a bit of time trying to make this work, but I cannot seem to solve the default_value issue. I must be missing something.
I would appreciate some help figuring out how to make this sample program run. The second version with the custom task needs to behave just like the original version I showed.
Thank you for any help with this issue.
Fixed example with setup:

from fabric.api import task
from fabric.tasks import Task

def do_setup(foo, verbose):
    ''' common setup for all tasks '''
    print "IN do_setup(foo=%s, verbose=%s)" % (foo, verbose)
    # do setup using foo and verbose...

class CustomTask(Task):
    def __init__(self, func, *args, **kwargs):
        '''
        The special args like hosts and roles do not show up in
        args and kwargs; they are stripped already.
        args and kwargs may contain task-specific special arguments
        (e.g. aliases, alias, default, and name) that customize the
        task. They are set in the @task decorator and cannot be passed
        on the command line. Note also that these special task
        arguments are not passed to the run method.
        Non-special arguments (there are none in this example) are
        set in the task decorator. These other arguments are not
        passed to the run method and cannot be overridden from the
        command line.
        Note that if you pass any "task-specific special arguments" or
        "non-special arguments declared in the task decorator" from the
        command line, they are treated as different arguments and the
        command-line values are passed to the run method but not to
        this method.
        '''
        super(CustomTask, self).__init__(*args, **kwargs)
        print "IN __init__(args=%s, kwargs=%s)" % (args, kwargs)
        self.func = func

    def run(self, foo='foo_default_val', verbose='verbose_default_val',
            *args, **kwargs):
        '''
        The arguments to this method will be:
        1) arguments from the actual task (e.g. foo and verbose). This method
           is where you set a default value for the arguments of the
           actual_task, not on the actual_task itself.
        2) task-specific arguments from the command line
           (e.g. actual_task:bar='xxx'). This example is not expecting any,
           so it strips them and does not pass them on to the actual
           function (i.e. it calls self.func with only foo and verbose and
           does not pass args and kwargs).
        '''
        print "IN run(foo=%s, verbose=%s, args=%s, kwargs=%s)" % \
            (foo, verbose, args, kwargs)
        do_setup(foo, verbose)
        return self.func(foo, verbose)

@task(task_class=CustomTask, alias="RUNME")
def actual_task(foo, verbose):
    print 'IN task actual_task(foo=%s, verbose=%s)' % (foo, verbose)
Run with only host specified on the command-line:
fab -f ./example_with_setup.py actual_task:host='hhh'
IN __init__(args=(), kwargs={'alias': 'RUNME'})
[hhh] Executing task 'actual_task'
IN run(foo=foo_default_val, verbose=verbose_default_val, args=(), kwargs={})
IN do_setup(foo=foo_default_val, verbose=verbose_default_val)
IN task actual_task(foo=foo_default_val, verbose=verbose_default_val)
Run specifying foo on the commandline:
fab -f ./example_with_setup.py actual_task:host='hhh',foo='bar'
IN __init__(args=(), kwargs={'alias': 'RUNME'})
[hhh] Executing task 'actual_task'
IN run(foo=bar, verbose=verbose_default_val, args=(), kwargs={})
IN do_setup(foo=bar, verbose=verbose_default_val)
IN task actual_task(foo=bar, verbose=verbose_default_val)
Run specifying both foo and verbose on the command-line:
fab -f ./example_with_setup.py actual_task:host='hhh',foo='bar',verbose=True
IN __init__(args=(), kwargs={'alias': 'RUNME'})
[hhh] Executing task 'actual_task'
IN run(foo=bar, verbose=True, args=(), kwargs={})
IN do_setup(foo=bar, verbose=True)
IN task actual_task(foo=bar, verbose=True)
In the custom class section, the function actual_task1 doesn't actually take arguments, so the only valid way to invoke your fabric file is:
fab -f ./tmp2.py actual_task1
Furthermore, I don't think you're actually calling do_setup in either CustomTask or actual_task1.
This is the fixed example.
# fixed the example from http://docs.fabfile.org/en/1.8/usage/tasks.html
from fabric.api import task
from fabric.tasks import Task

class CustomTask(Task):
    def __init__(self, func, myarg1, *args, **kwargs):
        '''
        The special args like hosts and roles do not show up in
        args and kwargs; they are stripped already.
        args and kwargs may contain task-specific special arguments
        (e.g. aliases, alias, default, and name) that customize the
        task. They are set in the @task decorator and cannot be passed
        on the command line. Note also that these special task
        arguments are not passed to the run method.
        Non-special arguments (in this example myarg1) are set in the task
        decorator. These other arguments are not passed to the run
        method and cannot be overridden from the command line.
        Note that if you pass any "task-specific special arguments" or
        "non-special arguments declared in the task decorator" from the
        command line, they are treated as different arguments and the
        command-line values are passed to the run method but not to
        this method.
        '''
        super(CustomTask, self).__init__(*args, **kwargs)
        print "IN __init__(myarg1=%s, args=%s, kwargs=%s)" % \
            (myarg1, args, kwargs)
        self.func = func
        self.myarg1 = myarg1

    def run(self, myarg2='default_value2', *args, **kwargs):
        '''
        The arguments to this method will be:
        1) arguments from the actual task (e.g. myarg2). This method
           is where you set a default value for the arguments of the
           actual_task, not on the actual_task itself.
        2) task-specific arguments from the command line
           (e.g. actual_task:foo='foo'). This example is not expecting
           any, so it strips them and does not pass them on to the
           actual function (i.e. it calls self.func with only myarg2 and
           does not pass args and kwargs).
        '''
        print "IN run(myarg2=%s, args=%s, kwargs=%s)" % \
            (myarg2, args, kwargs)
        return self.func(myarg2)

@task(task_class=CustomTask, myarg1='special_value', alias='RUNME')
def actual_task(myarg2):
    print "IN actual_task(myarg2=%s)" % myarg2
Run with only hosts specified on the command-line:
fab -f ./fixed_example actual_task:hosts="hhh"
IN __init__(myarg1=special_value, args=(), kwargs={'alias': 'RUNME'})
[hhh] Executing task 'actual_task'
IN run(myarg2=default_value2, args=(), kwargs={})
IN actual_task(myarg2=default_value2)
Run specifying myarg2 on the command-line:
fab -f ./fixed_example actual_task:hosts="hhh",myarg2="good_value"
IN __init__(myarg1=special_value, args=(), kwargs={'alias': 'RUNME'})
[hhh] Executing task 'actual_task'
IN run(myarg2=good_value, args=(), kwargs={})
IN actual_task(myarg2=good_value)
Bad run specifying myarg1 and alias on the command line. Notice that __init__ still gets the values specified in the task decorator and not the values from the command line, and that run now receives myarg1 and alias as extra arguments.
fab -f ./fixed_example actual_task:hosts="hhh",myarg1="myarg1_from_commandline",alias="alias_from_commandline"
IN __init__(myarg1=special_value, args=(), kwargs={'alias': 'RUNME'})
[hhh] Executing task 'actual_task'
IN run(myarg2=default_value2, args=(), kwargs={'alias': 'alias_from_commandline', 'myarg1': 'myarg1_from_commandline'})
IN actual_task(myarg2=default_value2)

How to best perform Multiprocessing within requests with the python Tornado server?

I am using Tornado, the non-blocking I/O Python web server. I have a class of GET requests which may take a significant amount of time to complete (think in the range of 5-10 seconds). The problem is that Tornado blocks on these requests, so subsequent fast requests are held up until the slow request completes.
I looked at https://github.com/facebook/tornado/wiki/Threading-and-concurrency and came to the conclusion that I wanted some combination of #3 (other processes) and #4 (other threads). #4 on its own had issues and I was unable to get reliable control back to the ioloop while another thread was doing the "heavy_lifting". (I assume this is due to the GIL and the fact that the heavy_lifting task has high CPU load and keeps pulling control away from the main ioloop, but that's a guess.)
So I have been prototyping a solution that runs the "heavy lifting" work for these slow GET requests in a separate process and then places a callback back into the Tornado ioloop when the process is done to finish the request. This frees up the ioloop to handle other requests.
I have created a simple example demonstrating a possible solution, but am curious to get feedback from the community on it.
My question is two-fold: How can this current approach be simplified? What pitfalls potentially exist with it?
The Approach
Utilize Tornado's builtin asynchronous decorator which allows a request to stay open and for the ioloop to continue.
Spawn a separate process for "heavy lifting" tasks using Python's multiprocessing module. I first attempted to use the threading module but was unable to get any reliable relinquishing of control back to the ioloop. It also appears that multiprocessing can take advantage of multiple cores.
Start a 'watcher' thread in the main ioloop process using the threading module whose job it is to watch a multiprocessing.Queue for the result of the "heavy lifting" task when it completes. This was needed because I needed a way to know that the heavy_lifting task had completed while still being able to notify the ioloop that the request was now finished.
Be sure that the 'watcher' thread relinquishes control to the main ioloop often with time.sleep(0) calls so that other requests continue to get processed promptly.
When there is a result in the queue then add a callback from the "watcher" thread using tornado.ioloop.IOLoop.instance().add_callback() which is documented to be the only safe way to call ioloop instances from other threads.
Be sure to then call finish() in the callback to complete the request and hand over a reply.
Below is some sample code showing this approach. multi_tornado.py is the server implementing the above outline and call_multi.py is a sample script that calls the server in two different ways to test the server. Both tests call the server with 3 slow GET requests followed by 20 fast GET requests. The results are shown for both running with and without the threading turned on.
In the "no threading" case the 3 slow requests block (each taking a little over a second to complete). A few of the 20 fast requests squeeze through in between some of the slow requests within the ioloop (not totally sure how that occurs - it could be an artifact of running both the server and the client test script on the same machine). The point here is that all of the fast requests are held up to varying degrees.
With threading enabled, the 20 fast requests all complete first, immediately, and the three slow requests complete at about the same time afterwards, as they each run in parallel. This is the desired behavior. The three slow requests take about 2.5 seconds to complete in parallel, whereas in the non-threaded case the three slow requests take about 3.5 seconds in total. So there is roughly a 35% speedup overall (I assume due to multicore sharing). But more importantly, the fast requests are handled immediately instead of waiting behind the slow ones.
I do not have a lot of experience with multithreaded programming, so while this seemingly works, I am curious to learn:
Is there a simpler way to accomplish this? What monsters may lurk within this approach?
(Note: A future tradeoff may be to just run more instances of Tornado with a reverse proxy like nginx doing load balancing. No matter what, I will be running multiple instances behind a load balancer, but I am concerned about just throwing hardware at this problem, since the hardware seems so directly coupled to the problem in terms of the blocking.)
Sample Code
multi_tornado.py (sample server):
import time
import threading
import multiprocessing
import math

from tornado.web import RequestHandler, Application, asynchronous
from tornado.ioloop import IOLoop

# run in some other process - put result in q
def heavy_lifting(q):
    t0 = time.time()
    for k in range(2000):
        math.factorial(k)
    t = time.time()
    q.put(t - t0)  # report time to compute in queue

class FastHandler(RequestHandler):
    def get(self):
        res = 'fast result ' + self.get_argument('id')
        print res
        self.write(res)
        self.flush()

class MultiThreadedHandler(RequestHandler):
    # Note: This handler can be called with threaded = True or False
    def initialize(self, threaded=True):
        self._threaded = threaded
        self._q = multiprocessing.Queue()

    def start_process(self, worker, callback):
        # method to start process and watcher thread
        self._callback = callback
        if self._threaded:
            # launch process
            multiprocessing.Process(target=worker, args=(self._q,)).start()
            # start watching for process to finish
            threading.Thread(target=self._watcher).start()
        else:
            # threaded = False just call directly and block
            worker(self._q)
            self._watcher()

    def _watcher(self):
        # watches the queue for process result
        while self._q.empty():
            time.sleep(0)  # relinquish control if not ready
        # put callback back into the ioloop so we can finish request
        response = self._q.get(False)
        IOLoop.instance().add_callback(lambda: self._callback(response))

class SlowHandler(MultiThreadedHandler):
    @asynchronous
    def get(self):
        # start a thread to watch for
        self.start_process(heavy_lifting, self._on_response)

    def _on_response(self, delta):
        _id = self.get_argument('id')
        res = 'slow result {} <--- {:0.3f} s'.format(_id, delta)
        print res
        self.write(res)
        self.flush()
        self.finish()  # be sure to finish request

application = Application([
    (r"/fast", FastHandler),
    (r"/slow", SlowHandler, dict(threaded=False)),
    (r"/slow_threaded", SlowHandler, dict(threaded=True)),
])

if __name__ == "__main__":
    application.listen(8888)
    IOLoop.instance().start()
call_multi.py (client tester):
import sys

from tornado.ioloop import IOLoop
from tornado import httpclient

def run(slow):
    def show_response(res):
        print res.body

    # make 3 "slow" requests on server
    requests = []
    for k in xrange(3):
        uri = 'http://localhost:8888/{}?id={}'
        requests.append(uri.format(slow, str(k + 1)))

    # followed by 20 "fast" requests
    for k in xrange(20):
        uri = 'http://localhost:8888/fast?id={}'
        requests.append(uri.format(k + 1))

    # show results as they return
    http_client = httpclient.AsyncHTTPClient()

    print 'Scheduling Get Requests:'
    print '------------------------'
    for req in requests:
        print req
        http_client.fetch(req, show_response)

    # execute requests on server
    print '\nStart sending requests....'
    IOLoop.instance().start()

if __name__ == '__main__':
    scenario = sys.argv[1]
    if scenario == 'slow' or scenario == 'slow_threaded':
        run(scenario)
Test Results
By running python call_multi.py slow (the blocking behavior):
Scheduling Get Requests:
------------------------
http://localhost:8888/slow?id=1
http://localhost:8888/slow?id=2
http://localhost:8888/slow?id=3
http://localhost:8888/fast?id=1
http://localhost:8888/fast?id=2
http://localhost:8888/fast?id=3
http://localhost:8888/fast?id=4
http://localhost:8888/fast?id=5
http://localhost:8888/fast?id=6
http://localhost:8888/fast?id=7
http://localhost:8888/fast?id=8
http://localhost:8888/fast?id=9
http://localhost:8888/fast?id=10
http://localhost:8888/fast?id=11
http://localhost:8888/fast?id=12
http://localhost:8888/fast?id=13
http://localhost:8888/fast?id=14
http://localhost:8888/fast?id=15
http://localhost:8888/fast?id=16
http://localhost:8888/fast?id=17
http://localhost:8888/fast?id=18
http://localhost:8888/fast?id=19
http://localhost:8888/fast?id=20
Start sending requests....
slow result 1 <--- 1.338 s
fast result 1
fast result 2
fast result 3
fast result 4
fast result 5
fast result 6
fast result 7
slow result 2 <--- 1.169 s
slow result 3 <--- 1.130 s
fast result 8
fast result 9
fast result 10
fast result 11
fast result 13
fast result 12
fast result 14
fast result 15
fast result 16
fast result 18
fast result 17
fast result 19
fast result 20
By running python call_multi.py slow_threaded (the desired behavior):
Scheduling Get Requests:
------------------------
http://localhost:8888/slow_threaded?id=1
http://localhost:8888/slow_threaded?id=2
http://localhost:8888/slow_threaded?id=3
http://localhost:8888/fast?id=1
http://localhost:8888/fast?id=2
http://localhost:8888/fast?id=3
http://localhost:8888/fast?id=4
http://localhost:8888/fast?id=5
http://localhost:8888/fast?id=6
http://localhost:8888/fast?id=7
http://localhost:8888/fast?id=8
http://localhost:8888/fast?id=9
http://localhost:8888/fast?id=10
http://localhost:8888/fast?id=11
http://localhost:8888/fast?id=12
http://localhost:8888/fast?id=13
http://localhost:8888/fast?id=14
http://localhost:8888/fast?id=15
http://localhost:8888/fast?id=16
http://localhost:8888/fast?id=17
http://localhost:8888/fast?id=18
http://localhost:8888/fast?id=19
http://localhost:8888/fast?id=20
Start sending requests....
fast result 1
fast result 2
fast result 3
fast result 4
fast result 5
fast result 6
fast result 7
fast result 8
fast result 9
fast result 10
fast result 11
fast result 12
fast result 13
fast result 14
fast result 15
fast result 19
fast result 20
fast result 17
fast result 16
fast result 18
slow result 2 <--- 2.485 s
slow result 3 <--- 2.491 s
slow result 1 <--- 2.517 s
If you're willing to use concurrent.futures.ProcessPoolExecutor instead of multiprocessing, this is actually very simple. Tornado's ioloop already supports concurrent.futures.Future, so they'll play nicely together out of the box. concurrent.futures is included in Python 3.2+, and has been backported to Python 2.x.
Here's an example:
import time
from concurrent.futures import ProcessPoolExecutor

from tornado.ioloop import IOLoop
from tornado import gen

def f(a, b, c, blah=None):
    print "got %s %s %s and %s" % (a, b, c, blah)
    time.sleep(5)
    return "hey there"

@gen.coroutine
def test_it():
    pool = ProcessPoolExecutor(max_workers=1)
    fut = pool.submit(f, 1, 2, 3, blah="ok")  # This returns a concurrent.futures.Future
    print("running it asynchronously")
    ret = yield fut
    print("it returned %s" % ret)
    pool.shutdown()

IOLoop.instance().run_sync(test_it)
Output:
running it asynchronously
got 1 2 3 and ok
it returned hey there
ProcessPoolExecutor has a more limited API than multiprocessing.Pool, but if you don't need the more advanced features of multiprocessing.Pool, it's worth using because the integration is so much simpler.
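Applied to the question's slow handlers, a hedged sketch of the same idea inside a request handler might look like this (handler, route, and worker names are illustrative; it assumes a Tornado version where @gen.coroutine alone keeps the request open, as in the example above):

import math
from concurrent.futures import ProcessPoolExecutor

import tornado.web
from tornado import gen
from tornado.ioloop import IOLoop

pool = ProcessPoolExecutor(max_workers=4)

def heavy_lifting(n):
    # CPU-bound work runs in a separate process
    for k in range(n):
        math.factorial(k)
    return 'did %d factorials' % n

class SlowHandler(tornado.web.RequestHandler):
    @gen.coroutine
    def get(self):
        # yielding the concurrent.futures.Future keeps the ioloop free
        result = yield pool.submit(heavy_lifting, 2000)
        self.write(result)

if __name__ == '__main__':
    tornado.web.Application([(r'/slow', SlowHandler)]).listen(8888)
    IOLoop.instance().start()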
multiprocessing.Pool can be integrated into the tornado I/O loop, but it's a bit messy. A much cleaner integration can be done using concurrent.futures (see my other answer for details), but if you're stuck on Python 2.x and can't install the concurrent.futures backport, here is how you can do it strictly using multiprocessing:
The multiprocessing.Pool.apply_async and multiprocessing.Pool.map_async methods both have an optional callback parameter, which means that both can potentially be plugged into a tornado.gen.Task. So in most cases, running code asynchronously in a sub-process is as simple as this:
import multiprocessing
import contextlib

from tornado import gen
from tornado.gen import Return
from tornado.ioloop import IOLoop
from functools import partial

def worker():
    print "async work here"

@gen.coroutine
def async_run(func, *args, **kwargs):
    result = yield gen.Task(pool.apply_async, func, args, kwargs)
    raise Return(result)

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    func = partial(async_run, worker)
    IOLoop().run_sync(func)
As I mentioned, this works well in most cases. But if worker() throws an exception, the callback is never called, which means the gen.Task never finishes, and you hang forever. Now, if you know that your work will never throw an exception (because you wrapped the whole thing in a try/except, for example), you can happily use this approach. However, if you want to let exceptions escape from your worker, the only solution I found was to subclass some multiprocessing components and make them call the callback even if the worker sub-process raised an exception:
from multiprocessing.pool import ApplyResult, Pool, RUN
import multiprocessing

class TornadoApplyResult(ApplyResult):
    def _set(self, i, obj):
        self._success, self._value = obj
        if self._callback:
            self._callback(self._value)
        self._cond.acquire()
        try:
            self._ready = True
            self._cond.notify()
        finally:
            self._cond.release()
        del self._cache[self._job]

class TornadoPool(Pool):
    def apply_async(self, func, args=(), kwds={}, callback=None):
        ''' Asynchronous equivalent of `apply()` builtin.
        This version will call `callback` even if an exception is
        raised by `func`.
        '''
        assert self._state == RUN
        result = TornadoApplyResult(self._cache, callback)
        self._taskqueue.put(([(result._job, None, func, args, kwds)], None))
        return result
...

if __name__ == "__main__":
    pool = TornadoPool(multiprocessing.cpu_count())
    ...
With these changes, the exception object will be returned by the gen.Task rather than the gen.Task hanging indefinitely. I also updated my async_run method to re-raise the exception when it's returned, and made some other changes to provide better tracebacks for exceptions thrown in the worker sub-processes. Here's the full code:
import sys
import time
import traceback
import multiprocessing
from multiprocessing.pool import Pool, ApplyResult, RUN
from functools import wraps

import tornado.web
from tornado.ioloop import IOLoop
from tornado.gen import Return
from tornado import gen

class WrapException(Exception):
    def __init__(self):
        exc_type, exc_value, exc_tb = sys.exc_info()
        self.exception = exc_value
        self.formatted = ''.join(traceback.format_exception(exc_type, exc_value, exc_tb))

    def __str__(self):
        return '\n%s\nOriginal traceback:\n%s' % (Exception.__str__(self), self.formatted)

class TornadoApplyResult(ApplyResult):
    def _set(self, i, obj):
        self._success, self._value = obj
        if self._callback:
            self._callback(self._value)
        self._cond.acquire()
        try:
            self._ready = True
            self._cond.notify()
        finally:
            self._cond.release()
        del self._cache[self._job]

class TornadoPool(Pool):
    def apply_async(self, func, args=(), kwds={}, callback=None):
        ''' Asynchronous equivalent of `apply()` builtin.
        This version will call `callback` even if an exception is
        raised by `func`.
        '''
        assert self._state == RUN
        result = TornadoApplyResult(self._cache, callback)
        self._taskqueue.put(([(result._job, None, func, args, kwds)], None))
        return result

@gen.coroutine
def async_run(func, *args, **kwargs):
    """ Runs the given function in a subprocess.
    This wraps the given function in a gen.Task and runs it
    in a multiprocessing.Pool. It is meant to be used as a
    Tornado co-routine. Note that if func returns an Exception
    (or an Exception sub-class), this function will raise the
    Exception, rather than return it.
    """
    result = yield gen.Task(pool.apply_async, func, args, kwargs)
    if isinstance(result, Exception):
        raise result
    raise Return(result)

def handle_exceptions(func):
    """ Raise a WrapException so we get a more meaningful traceback"""
    @wraps(func)
    def inner(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            raise WrapException()
    return inner

# Test worker functions
@handle_exceptions
def test2(x):
    raise Exception("eeee")

@handle_exceptions
def test(x):
    print x
    time.sleep(2)
    return "done"

class TestHandler(tornado.web.RequestHandler):
    @gen.coroutine
    def get(self):
        try:
            result = yield async_run(test, "inside get")
            self.write("%s\n" % result)
            result = yield async_run(test2, "hi2")
        except Exception as e:
            print("caught exception in get")
            self.write("Caught an exception: %s" % e)
        finally:
            self.finish()

app = tornado.web.Application([
    (r"/test", TestHandler),
])

if __name__ == "__main__":
    pool = TornadoPool(4)
    app.listen(8888)
    IOLoop.instance().start()
Here's how it behaves for the client:
dan@dan:~$ curl localhost:8888/test
done
Caught an exception:
Original traceback:
Traceback (most recent call last):
  File "./mutli.py", line 123, in inner
    return func(*args, **kwargs)
  File "./mutli.py", line 131, in test2
    raise Exception("eeee")
Exception: eeee
And if I send two simultaneous curl requests, we can see they're handled asynchronously on the server-side:
dan@dan:~$ ./mutli.py
inside get
inside get
caught exception inside get
caught exception inside get
Edit:
Note that this code becomes simpler with Python 3, because it introduces an error_callback keyword argument to all asynchronous multiprocessing.Pool methods. This makes it much easier to integrate with Tornado:
class TornadoPool(Pool):
    def apply_async(self, func, args=(), kwds={}, callback=None):
        ''' Asynchronous equivalent of `apply()` builtin.
        This version will call `callback` even if an exception is
        raised by `func`.
        '''
        super().apply_async(func, args, kwds, callback=callback,
                            error_callback=callback)

@gen.coroutine
def async_run(func, *args, **kwargs):
    """ Runs the given function in a subprocess.
    This wraps the given function in a gen.Task and runs it
    in a multiprocessing.Pool. It is meant to be used as a
    Tornado co-routine. Note that if func returns an Exception
    (or an Exception sub-class), this function will raise the
    Exception, rather than return it.
    """
    result = yield gen.Task(pool.apply_async, func, args, kwargs)
    raise Return(result)
All we need to do in our overridden apply_async is call the parent with the error_callback keyword argument, in addition to the callback kwarg. No need to override ApplyResult.
We can get even fancier by using a MetaClass in our TornadoPool, to allow its *_async methods to be called directly as if they were coroutines:
import time
from functools import wraps
from multiprocessing.pool import Pool

import tornado.web
from tornado import gen
from tornado.gen import Return
from tornado.gen import Arguments  # (args, kwargs) namedtuple used when a callback receives multiple values
from tornado import stack_context
from tornado.ioloop import IOLoop
from tornado.concurrent import Future

def _argument_adapter(callback):
    def wrapper(*args, **kwargs):
        if kwargs or len(args) > 1:
            callback(Arguments(args, kwargs))
        elif args:
            callback(args[0])
        else:
            callback(None)
    return wrapper

def PoolTask(func, *args, **kwargs):
    """ Task function for use with multiprocessing.Pool methods.
    This is very similar to tornado.gen.Task, except it sets the
    error_callback kwarg in addition to the callback kwarg. This
    way exceptions raised in pool worker methods get raised in the
    parent when the Task is yielded from.
    """
    future = Future()

    def handle_exception(typ, value, tb):
        if future.done():
            return False
        future.set_exc_info((typ, value, tb))
        return True

    def set_result(result):
        if future.done():
            return
        if isinstance(result, Exception):
            future.set_exception(result)
        else:
            future.set_result(result)

    with stack_context.ExceptionStackContext(handle_exception):
        cb = _argument_adapter(set_result)
        func(*args, callback=cb, error_callback=cb)
    return future

def coro_runner(func):
    """ Wraps the given func in a PoolTask and returns it. """
    @wraps(func)
    def wrapper(*args, **kwargs):
        return PoolTask(func, *args, **kwargs)
    return wrapper

class MetaPool(type):
    """ Wrap all *_async methods in Pool with coro_runner. """
    def __new__(cls, clsname, bases, dct):
        pdct = bases[0].__dict__
        for attr in pdct:
            if attr.endswith("async") and not attr.startswith('_'):
                setattr(bases[0], attr, coro_runner(pdct[attr]))
        return super().__new__(cls, clsname, bases, dct)

class TornadoPool(Pool, metaclass=MetaPool):
    pass

# Test worker functions
def test2(x):
    print("hi2")
    raise Exception("eeee")

def test(x):
    print(x)
    time.sleep(2)
    return "done"

class TestHandler(tornado.web.RequestHandler):
    @gen.coroutine
    def get(self):
        try:
            result = yield pool.apply_async(test, ("inside get",))
            self.write("%s\n" % result)
            result = yield pool.apply_async(test2, ("hi2",))
            self.write("%s\n" % result)
        except Exception as e:
            print("caught exception in get")
            self.write("Caught an exception: %s" % e)
            raise
        finally:
            self.finish()

app = tornado.web.Application([
    (r"/test", TestHandler),
])

if __name__ == "__main__":
    pool = TornadoPool()
    app.listen(8888)
    IOLoop.instance().start()
If your GET requests are taking that long, then Tornado is the wrong framework.
I suggest you use nginx to route the fast GETs to Tornado and the slower ones to a different server.
PeterBe has an interesting article where he runs multiple Tornado servers and sets one of them aside to be 'the slow one' for handling the long-running requests; see worrying-about-io-blocking. I would try this method.
