Undesired delay in the celery process - python

I am encountering an undesired delay in the celery process that I cannot explain. My intent is to manage live processing of incoming data (at a rate of 10 to 60 items per second). Processing of one piece of data is divided into two fully sequential tasks, but parallelization is used to start processing the next piece of data (with task 1) while processing of the current one (with task 2) is not finished yet. Getting the shortest delay in the process is of utmost importance since this is a live application.
Once in a while, I encounter a freeze in the process. To see where this problem came from, I started monitoring the occupation of my workers. It appeared to happen during the communication between workers. I designed the lightest and simplest example to illustrate it here.
Here is my code; as you can see, I have two tasks that do nothing but wait 10 ms each. I call them using celery chains once every 20 ms. I track each worker's occupation using the prerun and postrun signals along with logging. In most cases everything happens sequentially, as the time spent by both workers doesn't exceed the send rate.
from __future__ import absolute_import

import time

from celery import chain
from celery.signals import task_prerun, task_postrun
from celery import Celery
from kombu import Queue, Exchange

N_ITS = 100000  # Total number of chains sent
LOG_FILE = 'log_file.txt'  # Path to the log file


def write_to_log_file(text):
    with open(LOG_FILE, 'a') as f:
        f.write(text)


# Create celery app
app = Celery('live')
app.config_from_object('celeryconfig')
default_exchange = Exchange('default', type='direct')
app.conf.task_queues = tuple(Queue(route['queue'], default_exchange, routing_key=route['queue'])
                             for route in app.conf.task_routes.values() + [{'queue': 'default'}])
app.conf.update(result_expires=3600)


# Define functions that record timings
@task_prerun.connect()
def task_prerun(signal=None, sender=None, task_id=None, task=None, **kwargs):
    text = 'task_prerun; {0}; {1:.16g}\n'.format(task.name, time.time())
    write_to_log_file(text)


@task_postrun.connect()
def task_postrun(signal=None, sender=None, task_id=None, task=None, **kwargs):
    text = 'task_postrun; {0}; {1:.16g}\n'.format(task.name, time.time())
    write_to_log_file(text)


# Define tasks
@app.task
def task_1(i):
    print 'Executing task_1: {}'.format(i)
    time.sleep(0.01)


@app.task
def task_2(i):
    print 'Executing task_2: {}'.format(i)
    time.sleep(0.01)


# Send chained tasks
def main():
    celery_chains = []
    for i in range(N_ITS):
        print '[{}] - Dispatching tasks'.format(i)
        celery_chains.append(chain(task_1.si(i) | task_2.si(i))())
        time.sleep(0.02)

    # wait for all tasks to complete
    [c.get() for c in celery_chains]


if __name__ == '__main__':
    main()
Here is my Celery configuration as well, in case it's needed:
from __future__ import absolute_import

import os

name = 'live'

broker_url = 'pyamqp://{}'.format(os.environ.get('RMQ_HOST', 'localhost'))
print 'broker_url:', broker_url

include = ['live']

DEFAULT_QUEUE = 'celery'
# A named queue that's not already defined in task_queues will be created automatically.
task_create_missing_queues = True

broker_pool_limit = 10000

task_routes = {
    'live.task_1': {'queue': 'worker_1'},
    'live.task_2': {'queue': 'worker_2'}
}

# We always set the routing key to be the queue name so we do it here automatically.
for v in task_routes.values():
    v.update({'routing_key': v['queue']})

task_serializer = 'pickle'
result_serializer = 'pickle'
accept_content = ['json', 'pickle']

timezone = 'Europe/Paris'
enable_utc = True
For the broker, I use the docker image rabbitmq:3.6-alpine with its basic configuration, apart from enabling rabbitmq_management.
This results in the following worker occupation chronogram (the color indicates the index of the data being processed, so you can link tasks belonging to the same chain):
As you can see, usually everything goes well and task 2 is called right after task 1 is finished. However, sometimes (indicated by the arrows on the figure) task 2 doesn't start immediately even though worker 2 isn't occupied. This introduces a delay of 27 ms, which is more than twice the duration of a single task. It happened approximately every 2 seconds during this execution.
I did some additional investigation using the firehose tracer to study the message exchange in RabbitMQ, and it turns out that the messages are indeed sent on time. To my understanding, the worker waits before fetching the message and processing the task, but I cannot understand why.
I tried setting the broker pool limit to a high number but the issue remains.
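In case it helps, this is the kind of prefetch-related tuning I plan to look at next (a sketch only, with settings I have not yet verified; the celery worker -O fair option seems related as well):
# celeryconfig.py (sketch): prefetch/ack settings that might change how eagerly a worker reserves messages
worker_prefetch_multiplier = 1  # each worker process reserves at most one message ahead of time
task_acks_late = True           # acknowledge only after the task has run, so waiting messages aren't held back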

Related

Django rq-scheduler: jobs in scheduler don't get executed

In my Heroku application I successfully implemented background tasks. For this purpose I created a Queue object at the top of my views.py file and called queue.enqueue() in the appropriate view.
Now I'm trying to set up a repeated job with rq-scheduler's scheduler.schedule() method. I know it is not the best way to do it, but I call this method at the top of my views.py file. Whatever I do, I can't get it to work, even with a simple HelloWorld function.
views.py:
from datetime import datetime

from redis import Redis
from rq import Queue
from rq_scheduler import Scheduler

from worker import conn

q = Queue('default', connection=conn)  # the queue handed to the scheduler below
scheduler = Scheduler(queue=q, connection=conn)
print("SCHEDULER = ", scheduler)


def say_hello():
    print(" Hello world!")


scheduler.schedule(
    scheduled_time=datetime.utcnow(),  # Time for first execution, in UTC timezone
    func=say_hello,                    # Function to be queued
    interval=60,                       # Time before the function is called again, in seconds
    repeat=10,                         # Repeat this number of times (None means repeat forever)
    queue_name='default',
)
worker.py:
import os
import redis
from rq import Worker, Queue, Connection
import django

django.setup()

listen = ['high', 'default', 'low']

redis_url = os.getenv('REDISTOGO_URL')
if not redis_url:
    print("Set up Redis To Go first. Probably can't get env variable REDISTOGO_URL")
    raise RuntimeError("Set up Redis To Go first. Probably can't get env variable REDISTOGO_URL")

conn = redis.from_url(redis_url)

if __name__ == '__main__':
    with Connection(conn):
        print(" CREATING NEW WORKER IN worker.py")
        worker = Worker(map(Queue, listen))
        worker.work()
I'm checking the length of my queue before and after schedule(), but it looks like the length is always 0. I can also see that there are jobs when I call scheduler.get_jobs(), but those jobs don't get enqueued or performed, I think.
I also don't want to use another cron solution for my project; since I can already do background tasks with rq, it shouldn't be that hard to implement a repeated task, or should it?
I went through the documentation a couple of times and now I feel stuck, so I appreciate any help or advice I can get.
Using rq 1.6.1 and rq-scheduler 0.10.0 packages with Django 2.2.5 and Python 3.6.10
Edit: When I print the jobs in the scheduler, I see that their enqueued_at param is set to None. Am I missing something really simple?
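From the docs, my current understanding is that rq-scheduler needs its own long-running process that polls Redis and moves due jobs onto the queue; a rough sketch of what I think that would look like (reusing the same REDISTOGO_URL as worker.py — I haven't confirmed this is the missing piece):
# scheduler_runner.py - a sketch only; assumes the same Redis connection as worker.py
import os

import redis
from rq import Queue
from rq_scheduler import Scheduler

conn = redis.from_url(os.getenv('REDISTOGO_URL', 'redis://localhost:6379'))
queue = Queue('default', connection=conn)
scheduler = Scheduler(queue=queue, connection=conn, interval=60)  # check for due jobs every 60 s

if __name__ == '__main__':
    # Blocks forever, moving due jobs onto the 'default' queue;
    # a separate rq worker still has to be running to execute them.
    scheduler.run()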

Can't seem to make python celery signals work

I'm rather new to celery development and I have an issue implementing signals.
I have an application that consists of many different workers.
Currently it uses RabbitMQ as the broker and Redis as the backend.
Each worker has its own queue. This is the way we have it configured at the moment:
celery = Celery(queueDict['test'], broker=config.REDIS_SERVER, backend=config.REDIS_SERVER)

default_exchange = Exchange('default', type='direct')
test_queue = Queue(queueDict['test'], default_exchange, routing_key=queueDict['test'])

logger = get_task_logger(__name__)

celery.conf.task_queues = (test_queue, )


@celery.task(name='signal2', bind=True)
def signal2(self, param):
    print("dog" + param)
I would like to use signals so that I can catch failed tasks on any worker in the application. When I use it inside the same worker with a task_failure signal, it works.
But I would like to have another worker (or even my Flask app) catch these events, and I seem to be missing something...
Here is my current attempt at making it work.
celery = Celery('consumer', broker=config.REDIS_SERVER, backend=config.REDIS_SERVER)

default_exchange = Exchange('default', type='direct')
default_queue = Queue(queueDict['default'], default_exchange, routing_key=queueDict['default'])

logger = get_task_logger(__name__)

celery.conf.task_queues = (default_queue, )


@task_failure.connect
def process_failure_signal(sender=None, task_id=None, exception=None,
                           args=None, kwargs=None, traceback=None, einfo=None, **akwargs):
    msg = 'Signal exception: %s (%s)' % (
        exception.__class__.__name__, exception)
    exc_info = (type(exception), exception, traceback)
    extra = {
        'data': {
            'task_id': str(task_id),
            'sender': str(sender),
            'args': str(args),
            'kwargs': str(kwargs),
        }
    }
    logger.error(msg, exc_info=exc_info, extra=extra)
But it never receives any signals...
Thanks for the help.
DejanLekic was correct and the page he shared had exactly what I wanted.
for those interested:
https://docs.celeryproject.org/en/stable/userguide/monitoring.html#real-time-processing
This can be easily used to capture events and monitor tasks.
Real-time processing
To process events in real-time you need the following:
An event consumer (this is the Receiver)
A set of handlers called when events come in.
You can have different handlers for each event type, or a catch-all handler can be used ('*')
State (optional)
app.events.State is a convenient in-memory representation of tasks and workers in the cluster that’s updated as events come in.
It encapsulates solutions for many common things, like checking if a worker is still alive (by verifying heartbeats), merging event fields together as events come in, making sure time-stamps are in sync, and so on.
Combining these you can easily process events in real-time:
from celery import Celery


def my_monitor(app):
    state = app.events.State()

    def announce_failed_tasks(event):
        state.event(event)
        # task name is sent only with -received event, and state
        # will keep track of this for us.
        task = state.tasks.get(event['uuid'])

        print('TASK FAILED: %s[%s] %s' % (
            task.name, task.uuid, task.info(),))

    with app.connection() as connection:
        recv = app.events.Receiver(connection, handlers={
            'task-failed': announce_failed_tasks,
            '*': state.event,
        })
        recv.capture(limit=None, timeout=None, wakeup=True)


if __name__ == '__main__':
    app = Celery(broker='amqp://guest@localhost//')
    my_monitor(app)
Note: The wakeup argument to capture sends a signal to all workers to force them to send a heartbeat. This way you can immediately see workers when the monitor starts.
You can listen to specific events by specifying the handlers:
from celery import Celery


def my_monitor(app):
    state = app.events.State()

    def announce_failed_tasks(event):
        state.event(event)
        # task name is sent only with -received event, and state
        # will keep track of this for us.
        task = state.tasks.get(event['uuid'])

        print('TASK FAILED: %s[%s] %s' % (
            task.name, task.uuid, task.info(),))

    with app.connection() as connection:
        recv = app.events.Receiver(connection, handlers={
            'task-failed': announce_failed_tasks,
        })
        recv.capture(limit=None, timeout=None, wakeup=True)


if __name__ == '__main__':
    app = Celery(broker='amqp://guest@localhost//')
    my_monitor(app)
Monitoring and Management Guide — Celery 4.4.2 documentation

Launching celery task_monitor in django

Looking at the Celery docs, I can see that the task monitor is launched in a script (see below). In a Django implementation, as I understand it, this won't be the case, since I'll have to launch the task monitor in a thread.
Currently I'm launching the monitor the first time I run a job, then checking its state each subsequent time I run a job (see further below). This seems like a bad way to do it.
My overall question is: what is the correct way to instantiate the task monitor for Celery in a Django project? A good answer would also cover:
Is threading the accepted way to do this?
Should I launch this in a subprocess?
Do I need to be worried about volume going through the task monitor (and hence use threading)?
Is there a standard, widely accepted way to do this?
It seems I'm missing something really obvious.
# docs example - not implemented like this in my project
from celery import Celery


def my_monitor(app):
    state = app.events.State()

    def announce_failed_tasks(event):
        state.event(event)
        # task name is sent only with -received event, and state
        # will keep track of this for us.
        task = state.tasks.get(event['uuid'])

        print('TASK FAILED: %s[%s] %s' % (
            task.name, task.uuid, task.info(),))

    with app.connection() as connection:
        recv = app.events.Receiver(connection, handlers={
            'task-failed': announce_failed_tasks,
        })
        recv.capture(limit=None, timeout=None, wakeup=True)


if __name__ == '__main__':
    app = Celery(broker='amqp://guest@localhost//')
    # LAUNCHED HERE
    my_monitor(app)
# my current implementation

# If the celery_monitor is not instantiated, set it up
app = Celery('scheduler',
             broker=rabbit_url,   # Rabbit-MQ
             backend=redis_url,   # Redis
             include=tasks
             )

celery_monitor = Thread(target=build_monitor, args=[app], name='monitor-global', daemon=True)

# import celery_monitor into another module
global celery_monitor

if not celery_monitor.is_alive():
    try:
        celery_monitor.start()
        logger.debug('Celery Monitor - Thread Started (monitor-retry) ')
    except RuntimeError as e:  # occurs if thread is dead
        # create new instance if thread is dead
        logger.debug('Celery Monitor - Error restarting thread (monitor-retry): {}'.format(e))
        celery_monitor = Thread(target=build_monitor, args=[app], name='monitor-retry', daemon=True)
        celery_monitor.start()  # start thread
        logger.debug('Celery Monitor - Thread Re-Started (monitor-retry) ')
else:
    logger.debug('Celery Monitor - Thread is already alive. Dont do anything.')
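One alternative I've been considering is wrapping the monitor in a Django management command and running it as its own long-lived process next to the web server (a rough sketch only; the module path and command name are made up, and app/build_monitor are the objects from my code above):
# myapp/management/commands/run_celery_monitor.py - illustrative sketch only
from django.core.management.base import BaseCommand

from scheduler.monitoring import app, build_monitor  # hypothetical module holding my app and build_monitor


class Command(BaseCommand):
    help = 'Run the Celery event monitor as a standalone, long-lived process'

    def handle(self, *args, **options):
        # Blocks in recv.capture(); run it under supervisord/systemd like any other worker process.
        build_monitor(app)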

I am trying to run an endless worker thread (daemon) from within Django

I have a worker thread whose only task is to query a list of active users from the database every 10 minutes and to send them an SMS message if a certain condition is fulfilled (which is checked every minute); the worker thread does not hinder the main application at all.
So far I have managed to get the thread up and running, and sending SMS works just fine. However, for some reason the thread stops/gets killed after some random time (hours). I run a try/except Exception as e inside a while True to catch any errors, and I print out a message saying what error occurred.
Well, I never see any message and the thread is definitely down. Therefore, I suspect Gunicorn or Django of killing my thread somewhat gracefully.
I have put log and print statements all over the code but haven't found anything indicating why my thread is getting killed.
My wsgi.py file, where I call the function that starts my thread:
"""
WSGI config for django_web project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'django_web.settings')
application = get_wsgi_application()
'''
Start background services
Import has to happen after "get_wsgi_application()"; otherwise docker container crashes
'''
try:
from threading import Thread
from timesheet.admin import runWorkmateServices
runWorkmateServices()
except Exception as exp:
print(exp)
The function that is called from within wsgi.py. I double-check whether the thread was already started, to avoid having two running at once.
def runWorkmateServices(request=None):
    service_name = 'TimeKeeperWorkMateReminderService'
    thread_found = False

    for thread in threading.enumerate():
        if service_name in thread.name:
            thread_found = True
            break  # Leave loop now

    if thread_found:
        print(f'Service has already been started: {service_name}')
        if request:
            messages.add_message(request, messages.ERROR, f'Service has already been started: {service_name}')
    else:
        Thread(target=WorkMateReminders, args=(), name=service_name, daemon=True).start()
        print(f'Started Service: {service_name}')
        if request:
            messages.add_message(request, messages.SUCCESS, f'Started Service: {service_name}')
The worker thread itself
def WorkMateReminders():
    print('Thread Started: WorkMateReminders')
    timer = 0
    employees = User.objects.none()

    while True:
        try:
            # Update user list every n * sleep time (10 minutes)
            if timer % 10 == 0:
                timer = 0
                # Get active employees
                employees = User.objects.filter(is_active=True, profile__workmate_sms_reminders_activated=True)
                print(f'Employees updated at {datetime.now().date()} - {datetime.now().time()}: {employees}')

            WorkMateCheckClockOffTimes(employees=employees)
            WorkMateClockOnReminder(employees=employees)
            WorkMateEndOfBreakReminder(employees=employees)

            timer += 1  # increment timer
        except Exception as exp:
            print(f'Error: {exp}')

        time.sleep(60 * 1)
My goal is to have this worker thread running for as long as Django is up.
Most WSGI servers spawn workers that are killed/recycled fairly regularly, so spawning threads from those workers is not the best solution to your problem. There are several ways to go about this:
Cron
Create a management command that does what you want and configure cron to run it every 10 minutes.
Celery/Celerybeat
Set up a Celery worker; this is a process that runs asynchronously to your Django application, and using celerybeat you can have tasks run at intervals, as in the sketch below.
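For example, a minimal beat schedule could look like this (module, app, and task names are placeholders):
# celery_app.py - a minimal sketch; names are placeholders
from celery import Celery

app = Celery('myproject', broker='redis://localhost:6379/0')

app.conf.beat_schedule = {
    'check-users-every-10-minutes': {
        'task': 'myproject.tasks.check_active_users',  # the task that queries users and sends the SMS
        'schedule': 600.0,                             # seconds
    },
}
You then run celery -A celery_app worker and celery -A celery_app beat as two separate processes alongside Django.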

How to detect Celery Connection failure and switch to failover then back?

So our use case might be out of the remit of what Celery can do, but I thought I'd ask...
Use Case
We are planning on using a hosted/managed RabbitMQ cluster that Celery will be using as its broker.
We want to ensure that our app has zero downtime (obviously), so we're trying to figure out how to handle the event where our upstream cluster has a catastrophic failure and the entire cluster becomes unavailable.
Our thought is to have a standby Rabbit cluster so that, when the connection drops, we can automatically switch Celery to use that connection instead.
In the meantime, Celery determines whether the master cluster is up and running; when it is, all of the publishers reconnect to the master, and the workers drain the backup cluster and, once it is empty, switch back onto the master.
The issue
What I'm having difficulty with is capturing the connection failure, as it seems to happen deep within Celery and the exception doesn't bubble up to the app.
I can see that Celery has a BROKER_FAILOVER_STRATEGY configuration property, which would handle the initial swap, but it (seemingly) is only utilised when failover occurs, which doesn't fit our use case of swapping back to the master when it is back up.
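As I understand it, that strategy works together with a list of broker URLs, roughly like this (a sketch of my understanding, not our actual config):
# Sketch: Celery can be given a list of broker URLs and fails over between them.
from celery import Celery

app = Celery('app')
app.conf.broker_url = [
    'amqp://user:pass@rabbit-primary:5672//',   # placeholder hosts
    'amqp://user:pass@rabbit-standby:5672//',
]
app.conf.broker_failover_strategy = 'round-robin'  # or 'shuffle'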
I've also come across Celery's "bootsteps", but these are applied after Celery's own "Connection" bootstep which is where the exception is being thrown.
I have a feeling this approach is probably not the best one given the limitations I've been finding, but has anyone got any ideas on how I'd go about overriding the default Connection bootstep, or achieving this by different means?
This question is quite old, but maybe it is useful to someone. I'm using FastAPI with Celery 5.2.
run_api.py file:
import uvicorn

if __name__ == "__main__":
    port = 8893
    print("Starting API server on port {}".format(port))
    uvicorn.run("endpoints:app", host="localhost", port=port, access_log=False)
endpoints.py file:
import threading
import time
import os
import itertools
import random

from celery import Celery
from fastapi import FastAPI

# Create object for FastAPI
app = FastAPI()

# Create and configure Celery to manage queues
# ----
celery = Celery(__name__)
celery.conf.broker_url = ["redis://localhost:6379"]
celery.conf.result_backend = "redis://localhost:6379"
celery.conf.task_track_started = True
celery.conf.task_serializer = "pickle"
celery.conf.result_serializer = "pickle"
celery.conf.accept_content = ["pickle"]


def random_failover_strategy(servers):
    # The next lines are necessary for this to work, even if you don't use them:
    it = list(servers)  # don't modify callers list
    shuffle = random.shuffle

    for _ in itertools.repeat(None):
        # Do whatever action is required here to obtain the new url.
        # As an example, use a random port number:
        ra = random.randint(0, 100)
        it = [f"redis://localhost:{str(ra)}"]
        celery.conf.result_backend = it[0]

        shuffle(it)
        yield it[0]


celery.conf.broker_failover_strategy = random_failover_strategy

# Start the celery worker. I start it in a separate thread, so fastapi can run in parallel
worker = celery.Worker()


def start_worker():
    worker.start()


ce = threading.Thread(target=start_worker)
ce.start()
# ----


@app.get("/", tags=["root"])
def root():
    return {"message": ""}


@app.post("/test")
def test(num: int):
    task = test_celery.delay(num)
    print(f'task id: {task.id}')
    return {
        "task_id": task.id,
        "task_status": "PENDING"}


@celery.task(name="test_celery", bind=True)
def test_celery(self, num):
    self.update_state(state='PROGRESS')
    print("ENTERED PROCESS", num)
    time.sleep(100)
    print("EXITING PROCESS", num)
    return {'number': num}


@app.get("/result")
def result(id: str):
    task_result = celery.AsyncResult(id)
    if task_result.status == "SUCCESS":
        return {
            "task_status": task_result.status,
            "task_num": task_result.result['number']
        }
    else:
        return {
            "task_status": task_result.status,
            "task_num": None
        }
Place both files in the same folder. Run python3 run_api.py.
Enjoy!
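To try it out (assuming Redis is running on localhost:6379 and the API on port 8893), something along these lines:
import requests

# queue a task
resp = requests.post("http://localhost:8893/test", params={"num": 5})
task_id = resp.json()["task_id"]

# poll for the result
print(requests.get("http://localhost:8893/result", params={"id": task_id}).json())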
