Custom json serialization in celery - python

I am trying to use celery with custom objects for which I have implemented a custom serializer, but the celery workers try to use pickling.
celeryconfig.py
broker_url = 'redis://localhost'
result_backend = 'redis://localhost'
imports = ('tasks',)
accept_content = ['application/x-json']
task_serializer = 'custom_json'
result_serializer = 'custom_json'
app.py
from celery import Celery
from . import serializers
from . import celeryconfig
from kombu import serialization
serialization.register(
    'custom_json',
    serializers.dumps,
    serializers.loads,
    content_type='application/x-json',
    content_encoding='utf-8',
)

app = Celery()
app.config_from_object(celeryconfig)

if __name__ == '__main__':
    app.start()
main.py
from .tasks import my_task
my_obj = CustomClass()
my_task.delay(my_obj)
This code works fine if my class is defined in pure Python:
class CustomClass:
    def __init__(self):
        ...
But my CustomClass actually comes from a Boost.Python binding that I import from an .so file, and then I get the following error from the worker:
[2020-04-11 16:25:08,102: INFO/MainProcess] Received task: my_task[f73a3119-65d7-4a04-9e0d-2bc25ad19dde]
...
RuntimeError: Pickling of "CustomClass" instances is not enabled (http://www.boost.org/libs/python/doc/v2/pickle.html)
I understand that the error message suggests digging into Boost.Python's pickle specifics, but the whole point of using a custom JSON serializer is to avoid going down that road.
So my question is: why is celery even trying to use pickling?
Edit: as a dummy example, the following non-picklable class (without Boost) yields this error:
Unrecoverable error: TypeError("can't pickle generator objects",)
import json

class NonPicklableClass:
    def __init__(self, arg):
        self.gen = (i for i in arg)

class CustomEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, NonPicklableClass):
            return {
                '__type__': 'custom',
                'raw': list(o.gen),
            }
        return o

def hook(o):
    dtype = o.get('__type__')
    if dtype == 'custom':
        return NonPicklableClass(o['raw'])
    return o  # leave ordinary dicts untouched

def dumps(o):
    return json.dumps(o, cls=CustomEncoder)

def loads(s):
    return json.loads(s, object_hook=hook)
I clearly must be misunderstanding something

I think I figured it out: the jobs are sent to the workers using the custom serializer.
However, within each worker, the data is then passed between the worker's processes using regular Python pickling.
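One way to sidestep that (a sketch of my own, not from the original post) is to keep the task arguments JSON-friendly and rebuild the object inside the task, so nothing non-picklable ever crosses a process boundary inside the worker:
# tasks.py -- hypothetical workaround; the imports below are assumptions
from app import app                  # assumes the Celery app from app.py is importable here
from mymodule import CustomClass     # hypothetical import of the Boost.Python binding

@app.task
def my_task(raw_data):
    # rebuild the non-picklable object inside the worker process;
    # assumes CustomClass (or a factory) can be constructed from plain data
    my_obj = CustomClass(raw_data)
    ...  # do the actual work with my_obj

# caller side (main.py): send plain, JSON-serialisable data instead of the object
# my_task.delay(raw_data)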


Celery with continuous deployment

I have a service that exposes an API which then feeds tasks; it is implemented with Falcon (API) and Celery (task management).
Specifically, my workers take a long time to load and their code looks something like this:
class HeavyOp(celery.Task):
    def __init__(self):
        self._asset = get_heavy_asset()  # <-- takes long time

    @property
    def asset(self):
        return self._asset

@app.task(base=HeavyOp)
def my_task(data):
    return my_task.asset.do_something(data)
What actually goes on is that in the __init__ function some object is being read from disk and held in memory for as long as the worker lives.
Sometimes, I want to update that object.
Is there a way to reload the worker, without downtime? As this is all behind an API, I don't wish to have those few minutes of loading the heavy object as downtime.
We can assume the host has more than 1 core, but the solution must be a single host solution.
I don't think you need a custom base task class. What you want is a single shared Asset instance that gets loaded after the worker has initialised and that you can reload from a task.
This approach works:
# worker.py
import os
import sys
import time

from celery import Celery
from celery.signals import worker_ready

app = Celery(include=('tasks',))

class Asset:
    def __init__(self):
        self.time = time.time()

class AssetLoader:
    __shared_state = {}

    def __init__(self):
        self.__dict__ = self.__shared_state
        if '_value' not in self.__dict__:
            self.get_heavy_asset()

    def get_heavy_asset(self):
        self._value = Asset()

    @property
    def value(self):
        return self._value

@worker_ready.connect
def after_worker_ready(sender, **kwargs):
    AssetLoader()
Here, I made AssetLoader a Borg class, but you can choose any other pattern/strategy to share a single instance of Asset. For illustrative purposes, I just capture the timestamp when executing get_heavy_asset.
# tasks.py
from worker import app, AssetLoader

@app.task(bind=True)
def load(self):
    AssetLoader().get_heavy_asset()
    return AssetLoader().value.time

@app.task(bind=True)
def my_task(self):
    return AssetLoader().value.time
Bear in mind that Asset is shared per worker process but not across workers. If you run with concurrency=1, it doesn't make a difference, but for anything else it does. But from what I gather in your use case, it should be fine either way.
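For completeness, a small usage sketch of my own under the same assumptions: the reload is just another task, so the producer can trigger it without restarting the worker.
# producer side -- a sketch; the asset is rebuilt in whichever pool process runs the task
from tasks import load, my_task

load.delay()      # rebuild the heavy asset without restarting the worker
my_task.delay()   # later tasks in that process see the refreshed asset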

Multiprocessing apply_async() not working on Ubuntu

I am running this code as a CherryPy web service on both Mac OS X and Ubuntu 14.04. Using multiprocessing on Python 3, I want to start the static method worker() asynchronously, within a process Pool.
The same code runs flawlessly on Mac OS X, but on Ubuntu 14.04 worker() does not run. That is, by debugging the code inside the POST method I am able to see that each line is executed - from
reqid = str(uuid.uuid4())
to
return handle_error(202, "Request ID: " + reqid)
Starting the same code in Ubuntu 14.04, it does not run the worker() method, not even a print() at the top of the method (which would be logged).
Here's the relevant code (I only omitted the handle_error() method):
import cherrypy
import json
from lib import get_parameters, handle_error
from multiprocessing import Pool
import os
from pymatbridge import Matlab
import requests
import shutil
import uuid
from xml.etree import ElementTree

class Schedule(object):
    exposed = True

    def __init__(self, mlab_path, pool):
        self.mlab_path = mlab_path
        self.pool = pool

    def POST(self, *paths, **params):
        if validate(cherrypy.request.headers):
            try:
                reqid = str(uuid.uuid4())
                path = os.path.join("results", reqid)
                os.makedirs(path)
                wargs = [(self.mlab_path, reqid)]
                self.pool.apply_async(Schedule.worker, wargs)
                return handle_error(202, "Request ID: " + reqid)
            except:
                return handle_error(500, "Internal Server Error")
        else:
            return handle_error(401, "Unauthorized")

    #### this is not executed ####
    @staticmethod
    def worker(args):
        mlab_path, reqid = args
        mlab = Matlab(executable=mlab_path)
        mlab.start()
        mlab.run_code("cd mlab")
        mlab.run_code("sched")
        a = mlab.get_variable("a")
        mlab.stop()
        return reqid
    ####

# to start the Web Service
if __name__ == "__main__":
    # start Web Service with some configuration
    global_conf = {
        "global": {
            "server.environment": "production",
            "engine.autoreload.on": True,
            "engine.autoreload.frequency": 5,
            "server.socket_host": "0.0.0.0",
            "log.screen": False,
            "log.access_file": "site.log",
            "log.error_file": "site.log",
            "server.socket_port": 8084
        }
    }
    cherrypy.config.update(global_conf)
    conf = {
        "/": {
            "request.dispatch": cherrypy.dispatch.MethodDispatcher(),
            "tools.encode.debug": True,
            "request.show_tracebacks": False
        }
    }
    pool = Pool(3)
    cherrypy.tree.mount(Schedule('matlab', pool), "/sched", conf)
    # activate signal handler
    if hasattr(cherrypy.engine, "signal_handler"):
        cherrypy.engine.signal_handler.subscribe()
    # start serving pages
    cherrypy.engine.start()
    cherrypy.engine.block()
Your logic is hiding the problem from you. The apply_async method returns an AsyncResult object which acts as a handler to the asynchronous task you just scheduled. As you ignore the outcome of the scheduled task, the whole thing looks like "failing silently".
If you try to get the results from that task, you'd see the real problem.
handler = self.pool.apply_async(Schedule.worker, wargs)
handler.get()
... traceback here ...
cPickle.PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
In short, you must ensure the arguments you pass to the Pool are picklable.
Instance and class methods are picklable if the object/class they belong to is picklable as well. Static methods are not picklable because they lose the association with the object itself, so the pickle library cannot serialise them correctly.
As a general rule, it is better to avoid scheduling anything other than top-level functions to a multiprocessing.Pool.
To run background tasks with CherryPy it's better to use an asynchronous task queue manager like Celery or RQ. These services are very easy to install and run, your tasks will run in a completely separate process, and if you need to scale because your load is increasing it will be very straightforward.
You have a simple example with Cherrypy here.
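For reference, a minimal sketch (mine, not from the original post) of the top-level-function approach: a module-level function is pickled by its qualified name, so the pool's child process can simply import it instead of pickling a class or static method.
from multiprocessing import Pool

def worker(args):                    # top-level function, not a @staticmethod on a class
    mlab_path, reqid = args
    # ... start Matlab and run the job here ...
    return reqid

if __name__ == "__main__":
    pool = Pool(3)
    handler = pool.apply_async(worker, [("matlab", "some-request-id")])
    print(handler.get())             # surfaces any exception instead of failing silently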
I solved it by changing the method from @staticmethod to @classmethod. Now the job runs inside the process Pool. I found classmethods to be more useful in this case, as explained here.
Thanks.

User defined celery task class : init is getting called during import

I am trying to use a Celery task as a class and am seeing the following behaviour. I guess I missed something. Let me first tell you what I am trying to achieve:
1. Create a class whose __init__ is called only once by Celery. This will set up the required params for my class. I am going to create a thread pool here.
2. Create an instance of this Celery task object in the producer and put jobs in it.
To achieve this I tried the naive example mentioned on the Celery site and created a sample class. I am starting the worker with:
celery -c 1 -A proj worker --loglevel=debug
It seems to work at first, but then I observed that __init__ of the task is called at import time in tester.py. I could suppress the extra initialisation by passing a flag, but __init__ being called during import is the real concern here.
Can you please point me to the correct usage of this example? I do not want __init__ of the task class to be called more often than what I invoke with the celery command. In a real-life scenario it would create unnecessary threads.
Also, if possible, point me to an example that is closest to the requirement mentioned above.
celery.py
from __future__ import absolute_import
from celery import Celery

app = Celery('proj',
             broker='amqp://',
             backend='amqp://',
             include=['proj.tasks'])

# Optional configuration, see the application user guide.
app.conf.update(
    CELERY_TASK_RESULT_EXPIRES=3600,
)

if __name__ == '__main__':
    app.start()
tasks.py
from __future__ import absolute_import
from proj.celery import app

class NaiveAuthenticateServer(app.Task):
    def __init__(self, celeryInst=1):
        if celeryInst == 1:
            print "Hi, I am celery instant"
        else:
            print "Did you invoke me from command"
        self.users = {'george': 'password'}

    def run(self, username, password):
        try:
            return self.users[username] == password
        except KeyError:
            return False
tester.py
from proj import tasks
obj = tasks.NaiveAuthenticateServer(0)
res = obj.delay('hi', 'hello')
print res.get()
o/p of tester.py
Hi, I am celery instant
Did you invoke me from command
False
You should not create an instance of the task class yourself, but rather let celery do that for you automatically when the process starts up.
Therefore, you need to define a task function that uses the base class:
@app.task(base=NaiveAuthenticateServer)
def my_task(arg1, arg2):
    print arg1, arg2
And then submit the task like this:
from proj import tasks
tasks.my_task.delay('hi', 'hello')
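Putting the two pieces together, a minimal sketch of the pattern (my reading of the answer; bind=True is my addition so the function can reach the state set up in __init__):
# tasks.py
from __future__ import absolute_import
from proj.celery import app

class NaiveAuthenticateServer(app.Task):
    def __init__(self):
        # one-time setup; Celery creates the instance, you never call this yourself
        self.users = {'george': 'password'}

@app.task(base=NaiveAuthenticateServer, bind=True)
def my_task(self, username, password):
    try:
        return self.users[username] == password
    except KeyError:
        return False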

Celery not running with flask application

I'm using Celery (3.1.8) in my Flask application, but it is not running my tasks. This is my configuration with the Flask application:
celery.py
from __future__ import absolute_import
from celery import Celery
from cuewords.settings import CELERY_BROKER_URL, CELERY_RESULT_BACKEND

app = Celery('proj',
             broker=CELERY_BROKER_URL,
             backend=CELERY_RESULT_BACKEND)

app.conf.update(CELERY_TASK_RESULT_EXPIRES=3600)

if __name__ == '__main__':
    app.start()
settings.py
CELERY_BROKER_URL='redis://localhost:6379/0'
CELERY_RESULT_BACKEND='redis://localhost:6379/0'
BROKER_TRANSPORT = 'redis'
api.py
class Webcontent(Resource):
    def post(self, session=session):
        args = self.parser.parse_args()
        site_url = args["url"]
        url_present = Websitecontent.site_url_present(session, site_url)
        if site_url.strip() != "" and not url_present:
            try:
                # add data and commit
                session.commit()
                websitecontent = Websitecontent(params*)
                websitecontent.update_url(id, session)
            except:
                session.rollback()
                raise
            finally:
                session.close()
        else:
            return "No data created / data already present"
And in my model I'm adding a method as a task:
model.py
from cuewords.celery import app

class Websitecontent(Base):
    @app.task(name='update_url')
    def update_url(self, id, session):
        ...  # code goes here
And this is how I run Celery from the command prompt:
celery -A cuewords.celery worker
I am also using Flower to monitor the tasks; I can see a worker running, but I can't see any tasks, it's empty. Any idea what I'm missing or doing wrong?
Thanks
The problem is that your tasks never get imported into the Python runtime when running the worker(s). The celery command is your entry point, and you're telling Celery to import your cuewords.celery module because that's where your app instance resides. However, this is where the chain of events ends and no further Python code is imported.
Now, the most common mistake is to import the tasks into the same module as the Celery app instance. Unfortunately this will result in two modules trying to import things from each other and will result in a circular import error. This is no good.
To get around this one could import the task functions into the Celery app module and register them without using the decorator style. For example:
from celery import Celery
from models import my_task
app = Celery()
app.task(name='my_task')(my_task)
This would remove the need to import the app instance in your model module.
However, you're using method tasks. Method tasks need to be treated differently from function tasks, as noted here: http://docs.celeryproject.org/en/latest/reference/celery.contrib.methods.html. They are associated with an instance of an object; in other words, the function is bound to a class. So to use the previous style of registering tasks, you'd need an instance of the class first. To get around this, you should consider making your tasks functions instead of methods.
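As a sketch of that last suggestion (mine, not the answerer's): move the work into a plain function task in its own module, so the worker can import it without touching the model class.
# cuewords/tasks.py -- hypothetical module; add 'cuewords.tasks' to the app's
# include= (or CELERY_IMPORTS) so the worker actually imports it
from cuewords.celery import app

@app.task(name='update_url')
def update_url(website_id):
    # look up the Websitecontent row by id and update it here,
    # instead of defining the task as a method on the model class
    ...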

Unit testing with django-celery?

I am trying to come up with a testing methodology for our django-celery project. I have read the notes in the documentation, but they didn't give me a good idea of what to actually do. I am not worried about testing the tasks in the actual daemons, just the functionality of my code. Mainly I am wondering:
How can we bypass task.delay() during the test (I tried setting CELERY_ALWAYS_EAGER = True but it made no difference)?
How do we use the test settings that are recommended (if that is the best way) without actually changing our settings.py?
Can we still use manage.py test or do we have to use a custom runner?
Overall any hints or tips for testing with celery would be very helpful.
I like to use the override_settings decorator on tests which need celery results to complete.
from django.test import TestCase
from django.test.utils import override_settings
from myapp.tasks import mytask

class AddTestCase(TestCase):

    @override_settings(CELERY_EAGER_PROPAGATES_EXCEPTIONS=True,
                       CELERY_ALWAYS_EAGER=True,
                       BROKER_BACKEND='memory')
    def test_mytask(self):
        result = mytask.delay()
        self.assertTrue(result.successful())
If you want to apply this to all tests you can use the celery test runner as described at http://docs.celeryproject.org/en/2.5/django/unit-testing.html, which basically sets these same settings, except for BROKER_BACKEND = 'memory'.
In settings:
TEST_RUNNER = 'djcelery.contrib.test_runner.CeleryTestSuiteRunner'
Look at the source for CeleryTestSuiteRunner and it's pretty clear what's happening.
Try setting:
BROKER_BACKEND = 'memory'
(Thanks to asksol's comment.)
Here's an excerpt from my testing base class that stubs out the apply_async method and records the calls to it (which includes Task.delay). It's a little gross, but it's managed to fit my needs over the past few months I've been using it.
from django.test import TestCase
from celery.task.base import Task
# For recent versions, Task has been moved to celery.app.task:
# from celery.app.task import Task
# See http://docs.celeryproject.org/en/latest/reference/celery.app.task.html

class CeleryTestCaseBase(TestCase):

    def setUp(self):
        super(CeleryTestCaseBase, self).setUp()
        self.applied_tasks = []
        self.task_apply_async_orig = Task.apply_async

        @classmethod
        def new_apply_async(task_class, args=None, kwargs=None, **options):
            self.handle_apply_async(task_class, args, kwargs, **options)

        # monkey patch the regular apply_async with our method
        Task.apply_async = new_apply_async

    def tearDown(self):
        super(CeleryTestCaseBase, self).tearDown()
        # Reset the monkey patch to the original method
        Task.apply_async = self.task_apply_async_orig

    def handle_apply_async(self, task_class, args=None, kwargs=None, **options):
        self.applied_tasks.append((task_class, tuple(args), kwargs))

    def assert_task_sent(self, task_class, *args, **kwargs):
        was_sent = any(task_class == task[0] and args == task[1] and kwargs == task[2]
                       for task in self.applied_tasks)
        self.assertTrue(was_sent, 'Task not called w/class %s and args %s' % (task_class, args))

    def assert_task_not_sent(self, task_class):
        was_sent = any(task_class == task[0] for task in self.applied_tasks)
        self.assertFalse(was_sent, 'Task was not expected to be called, but was. Applied tasks: %s' % self.applied_tasks)
Here's an "off the top of the head" example of how you'd use it in your test cases:
mymodule.py
from my_tasks import SomeTask

def run_some_task(should_run):
    if should_run:
        SomeTask.delay(1, some_kwarg=2)
test_mymodule.py
class RunSomeTaskTest(CeleryTestCaseBase):
    def test_should_run(self):
        run_some_task(should_run=True)
        self.assert_task_sent(SomeTask, 1, some_kwarg=2)

    def test_should_not_run(self):
        run_some_task(should_run=False)
        self.assert_task_not_sent(SomeTask)
Since I still see this come up in search results, overriding settings with
TEST_RUNNER = 'djcelery.contrib.test_runner.CeleryTestSuiteRunner'
worked for me, as per the Celery docs.
This is what I did
Inside myapp.tasks.py I have:
from celery import shared_task

@shared_task()
def add(a, b):
    return a + b
Inside myapp.test_tasks.py I have:
from django.test import TestCase, override_settings
from myapp.tasks import add

class TasksTestCase(TestCase):
    def setUp(self):
        ...

    @override_settings(CELERY_TASK_ALWAYS_EAGER=True,
                       CELERY_TASK_EAGER_PROPAGATES=True)
    def test_create_sections(self):
        result = add.delay(1, 2)
        assert result.successful() == True
        assert result.get() == 3
For everyone getting here in 2019: check out this article covering different strategies, including calling tasks synchronously.
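One of those strategies in brief (my sketch, reusing the add task from the previous answer): calling the task synchronously with .apply() runs it eagerly in the test process and skips the broker entirely.
from myapp.tasks import add

def test_add_synchronously():
    result = add.apply(args=(1, 2))   # runs in-process, no broker or worker needed
    assert result.successful()
    assert result.get() == 3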
