When I have at least one skipped task in my DAG, the DAG run is still shown as a "success".
I'm using Slack alerts and their integration with Airflow.
I want to create a class so that when at least one task is skipped, the DAG is marked as "skipped" and an alert is sent to my channel. By the way, I'm using DatabricksRunNowOperator.
I thought about using this function in my class and checking the skipped state:
def __getstate__(self):
    state = dict(self.__dict__)
    del state["_log"]
    return state
And this is the function and the class that I created:
def _send_slack_alerts(channels, task_id, msg, context):
    tasks = []
    if ENV != 'prod':
        channels = [MY_CHANNEL]
    for channel in channels:
        failed_alert = SlackAPIPostOperator(
            task_id=task_id,
            channel=channel,
            slack_conn_id=SLACK_CONN,
            text=msg,
            username='airflow_dev')
        tasks.append(failed_alert.execute(context=context))
class MyOperator(BaseOperator):

    def __init__(self, context, *args, **kwargs):
        self._log = Log()
        self.env = ENV
        self.task = context.get('task_instance').task_id
        self.dag = context.get('task_instance').dag_id
        self.exec_date = context.get('task_instance').exec_date
        self.log_url = context.get('task_instance').log_url

    def __getstate__(self):
        state = dict(self.__dict__)
        del state["_log"]
        return state

    def execute(self, channels, context):
        if self.skipped:
            slack_message = "An operation was skipped in Airflow."
            _send_slack_alerts(channels, 'slack_failed', slack_message, context)
The problem is that the alert never reaches my channel.
Does someone have a good idea for this problem?
Thanks all anyway.
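As a point of reference, here is a minimal sketch (an assumption on my part, not the operator approach above): a DAG-level on_success_callback could inspect the finished run for skipped task instances and reuse _send_slack_alerts to post to the channel.

from airflow.utils.state import State

def alert_on_skipped_tasks(context):
    # 'dag_run' is available in the callback context; filter its task
    # instances down to the ones that ended up skipped.
    dag_run = context['dag_run']
    skipped = [ti.task_id for ti in dag_run.get_task_instances(state=State.SKIPPED)]
    if skipped:
        msg = f"Skipped tasks in DAG {dag_run.dag_id}: {', '.join(skipped)}"
        _send_slack_alerts([MY_CHANNEL], 'slack_skipped', msg, context)

# attach it when defining the DAG, e.g.
# dag = DAG(..., on_success_callback=alert_on_skipped_tasks)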
Sample logic
logic.py
@shared_task
def run_create_or_update_google_creative():
    return create_or_update_google_creative()


def create_or_update_google_creative():
    # do some logic
    ...


def run_db_sinc():
    result = run_create_or_update_google_creative.delay()
    job = CeleryJobResult(job_id=result.task_id, status=result.status)
    job.save()
    return 201, job.id
This is the structure of my Celery task call logic. First I call run_db_sinc; a new Celery task is generated and I immediately get the task_id value, which I save in the database and send as a response to the frontend. As long as the status is PENDING, the frontend keeps hitting the endpoint, which looks the task_id's status up in the database.
My question is: how do I know that the task has completed and the status has changed to SUCCESS? At what point, and how, should I do it? I know that it is possible to use a function like this:
from celery.result import AsyncResult

def get_task_status(task_id):
    task = AsyncResult(task_id)
    if task.status == 'SUCCESS':  # or the task has ended already
        job = CeleryJobResult.objects.get(job_id=task_id)
        job.status = task.status
        job.save()
    return task.status
But I can't understand at what point in time, and where in my code, I should call it.
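For illustration, one place it could be called (a sketch under my own assumptions, using a plain Django view for the endpoint the frontend already polls) is the status endpoint itself:

from django.http import JsonResponse

def job_status_view(request, job_id):
    # Refresh the stored status via AsyncResult on every poll before
    # answering the frontend.
    status = get_task_status(job_id)
    return JsonResponse({"job_id": job_id, "status": status})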
I think I've come up with an option that works: using threading inside run_db_sinc.
@shared_task
def run_create_or_update_google_creative():
    return create_or_update_google_creative()


def create_or_update_google_creative():
    # do some logic
    ...


def get_task_status(task_id: str) -> str:
    task = AsyncResult(task_id)
    return task.status


def check_task_status(task_id: str) -> None:
    status = get_task_status(task_id)
    while status not in ('custom status', ):  # placeholder for the terminal states
        time.sleep(1)
        status = get_task_status(task_id)
    job = CeleryJobResult.objects.get(job_id=task_id)
    job.status = status
    job.save()
    logger.info(f"Task {task_id} completed with status: {status}")
    return None


def run_db_sinc():
    result = run_create_or_update_google_creative.delay()
    job = CeleryJobResult(job_id=result.task_id, status=result.status)
    job.save()
    t = threading.Thread(target=check_task_status, args=(result.task_id,))
    t.start()
    return 201, job.id
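A note on the design choice: a thread per task works, but it ties a long-lived thread to every request. An alternative sketch (my assumption, not part of the original code) is to let the worker persist the final state itself through Celery's task_postrun signal:

from celery.signals import task_postrun

@task_postrun.connect
def store_job_status(sender=None, task_id=None, state=None, **kwargs):
    # Runs in the worker after every task finishes; filter to the task we care about.
    if sender is not None and sender.name.endswith('run_create_or_update_google_creative'):
        CeleryJobResult.objects.filter(job_id=task_id).update(status=state)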
I'm trying to use aiohttp to make a sort of advanced reverse proxy.
I want to take the content of an HTTP request and pass it on to a new HTTP request without pulling it all into memory. With a single upstream the task is fairly easy: the aiohttp server exposes the request content as a StreamReader, and the aiohttp client can accept a StreamReader as a request body.
The problem is that I want to send the original request to several upstreams, or, for example, simultaneously send the content to an upstream and write it to disk.
Are there any tools to broadcast the content of a StreamReader?
I've tried to write a naive broadcaster, but it fails on large objects. What am I doing wrong?
import asyncio
from asyncio import StreamReader, Task
from typing import Iterable


class StreamBroadcast:

    async def __do_broadcast(self):
        while True:
            chunk = await self.__source.read(self.__n)
            if not chunk:
                break
            for output in self.__sinks:
                output.feed_data(chunk)
        for output in self.__sinks:
            output.feed_eof()

    def __init__(self, source: StreamReader, sinks_count: int, n: int = -1):
        self.__source = source
        self.__n = n
        self.__sinks = [StreamReader() for i in range(sinks_count)]
        self.__task = asyncio.create_task(self.__do_broadcast())

    @property
    def sinks(self) -> Iterable[StreamReader]:
        return self.__sinks

    @property
    def ready(self) -> Task:
        return self.__task
Well, I've looked through the asyncio sources and discovered that I should use a Transport to pump data over a stream. Here is my solution.
import asyncio
from asyncio import StreamReader, StreamWriter, ReadTransport, StreamReaderProtocol
from typing import Iterable


class _BroadcastReadTransport(ReadTransport):
    """
    Internal class, is not meant to be instantiated manually
    """

    def __init__(self, source: StreamReader, sinks: Iterable[StreamReader]):
        super().__init__()
        self.__source = source
        self.__sinks = tuple(StreamReaderProtocol(s) for s in sinks)
        for sink in sinks:
            sink.set_transport(self)
        self.__waiting_for_data = len(self.__sinks)
        asyncio.create_task(self.__broadcast_next_chunk(), name='initial-chunk-broadcast')

    def is_reading(self):
        return self.__waiting_for_data == len(self.__sinks)

    def pause_reading(self):
        self.__waiting_for_data -= 1

    async def __broadcast_next_chunk(self):
        data = await self.__source.read()
        if data:
            for sink in self.__sinks:
                sink.data_received(data)
            if self.is_reading():
                asyncio.create_task(self.__broadcast_next_chunk())
        else:
            for sink in self.__sinks:
                sink.eof_received()

    def resume_reading(self):
        self.__waiting_for_data += 1
        if self.__waiting_for_data == len(self.__sinks):
            asyncio.create_task(self.__broadcast_next_chunk(), name='chunk-broadcast')

    @property
    def is_completed(self):
        return self.__source.at_eof()


class StreamBroadcast:
    def __init__(self, source: StreamReader, sinks_count: int):
        self.__source = source
        self.__sinks = tuple(StreamReader() for _ in range(sinks_count))
        self.__transport = _BroadcastReadTransport(self.__source, self.__sinks)

    @property
    def sinks(self) -> Iterable[StreamReader]:
        return self.__sinks

    @property
    def is_completed(self):
        return self.__transport.is_completed
Hopefully I'll package it as a pip module one day.
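For completeness, here is a rough usage sketch of how the class could be wired into a handler. The upstream URL and file path are placeholders, and request.content is aiohttp's reader, which works here only because the broadcaster just calls .read() and .at_eof() on the source:

import asyncio
import aiohttp
from aiohttp import web
from asyncio import StreamReader

async def _iter_chunks(sink: StreamReader, chunk_size: int = 64 * 1024):
    # Adapt an asyncio StreamReader into the async iterable the aiohttp
    # client accepts as a request body.
    while True:
        chunk = await sink.read(chunk_size)
        if not chunk:
            break
        yield chunk

async def handle(request: web.Request) -> web.Response:
    broadcast = StreamBroadcast(request.content, sinks_count=2)
    upstream_sink, file_sink = broadcast.sinks

    async def save_to_disk(path: str = '/tmp/body.bin'):  # placeholder path
        # Blocking file I/O, acceptable for a sketch.
        with open(path, 'wb') as f:
            async for chunk in _iter_chunks(file_sink):
                f.write(chunk)

    async with aiohttp.ClientSession() as session:
        disk_task = asyncio.create_task(save_to_disk())
        async with session.post('http://upstream.example/ingest',  # placeholder upstream
                                data=_iter_chunks(upstream_sink)) as resp:
            status = resp.status
        await disk_task
    return web.Response(status=status)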
I am using Luigi to extract different user actions and save each as a CSV simultaneously.
The idea is to look at my source data, find the unique actions, and create CSVs named after each of those actions.
class data_filter(luigi.Task):
    task = luigi.Parameter()

    def run(self):
        data_filter = full_file[full_file['properties_url'].str.contains(task)]
        data_filter.to_csv('/Users/Documents/Data/'+str(task)+'.csv')

    def requires(self):
        return []

    def output(self):
        return luigi.LocalTarget('/Users/Documents/Data/'+str(task)+'.csv')


#chaining tasks with wrapper
class wrapper(luigi.WrapperTask):
    def requires(self):
        file = pd.read_csv('/Users/Desktop/attr.csv')
        actions = file.utm_source.unique()
        task_list = []
        for current_task in actions:
            task_list.append(data_filter(task=current_task))
        return task_list

    def run(self):
        print('Wrapper has ended')
        pd.DataFrame().to_csv('/Users/Documents/Data/wrangle.csv')

    def output(self):
        return luigi.LocalTarget('/Users/Documents/Data/dwrangle.csv')


if __name__ == '__main__':
    luigi.run(wrapper())
The wrapper should tie everything up by looking at all unique actions, assigning them to task_list, and running task_list, while assigning the current action I am iterating through to the task = luigi.Parameter in my data_filter class.
However this returns the error message:
return luigi.LocalTarget('/Users/emmanuels/Documents/GitHub/Springboard-DSC/Springboard-DSC/Capstone 1 - Attribution Model/Data/'+str(task)+'.csv')
NameError: name 'task' is not defined
and
===== Luigi Execution Summary =====
Scheduled 1 tasks of which:
* 1 failed scheduling:
- 1 wrapper()
Did not run any tasks
This progress looks :( because there were tasks whose scheduling failed
I just want to figure out what I am doing wrong.
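For comparison, here is a minimal sketch (my own, not from the original code) of how a luigi.Parameter is normally referenced inside a task's methods, i.e. through self; the output path is a placeholder:

import luigi

class data_filter(luigi.Task):
    task = luigi.Parameter()

    def run(self):
        # The parameter is an attribute of the task instance, so it is
        # read as self.task rather than the bare name task.
        with self.output().open('w') as out:
            out.write(str(self.task))

    def output(self):
        return luigi.LocalTarget('/tmp/' + str(self.task) + '.csv')  # placeholder path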
I use Celery to update RSS feeds in my news aggregation site. I use one @task for each feed, and things seem to work nicely.
There's a detail that I'm not sure I handle well though: all feeds are updated once every minute with a @periodic_task, but what if a feed is still updating from the last periodic task when a new one is started? (For example if the feed is really slow, or offline, and the task is held in a retry loop.)
Currently I store task results and check their status like this:
import socket

from datetime import timedelta
from celery.decorators import task, periodic_task

from aggregator.models import Feed

_results = {}


@periodic_task(run_every=timedelta(minutes=1))
def fetch_articles():
    for feed in Feed.objects.all():
        if feed.pk in _results:
            if not _results[feed.pk].ready():
                # The task is not finished yet
                continue
        _results[feed.pk] = update_feed.delay(feed)


@task()
def update_feed(feed):
    try:
        feed.fetch_articles()
    except socket.error as exc:
        update_feed.retry(args=[feed], exc=exc)
Maybe there is a more sophisticated/robust way of achieving the same result using some Celery mechanism that I've missed?
Based on MattH's answer, you could use a decorator like this:
from django.core.cache import cache
import functools


def single_instance_task(timeout):
    def task_exc(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            lock_id = "celery-single-instance-" + func.__name__
            acquire_lock = lambda: cache.add(lock_id, "true", timeout)
            release_lock = lambda: cache.delete(lock_id)
            if acquire_lock():
                try:
                    func(*args, **kwargs)
                finally:
                    release_lock()
        return wrapper
    return task_exc
then, use it like so...
@periodic_task(run_every=timedelta(minutes=1))
@single_instance_task(60*10)
def fetch_articles():
    # yada yada...
    ...
From the official documentation: Ensuring a task is only executed one at a time.
Using https://pypi.python.org/pypi/celery_once seems to do the job really nicely, including reporting errors and testing against some parameters for uniqueness.
You can do things like:
from celery_once import QueueOnce
from myapp.celery import app
from time import sleep


@app.task(base=QueueOnce, once=dict(keys=('customer_id',)))
def start_billing(customer_id, year, month):
    sleep(30)
    return "Done!"
which just needs the following settings in your project:
ONCE_REDIS_URL = 'redis://localhost:6379/0'
ONCE_DEFAULT_TIMEOUT = 60 * 60 # remove lock after 1 hour in case it was stale
If you're looking for an example that doesn't use Django, then try this one (caveat: it uses Redis instead, which I was already using).
The decorator code is as follows (full credit to the author of the article, go read it):
import redis

REDIS_CLIENT = redis.Redis()


def only_one(function=None, key="", timeout=None):
    """Enforce only one celery task at a time."""

    def _dec(run_func):
        """Decorator."""

        def _caller(*args, **kwargs):
            """Caller."""
            ret_value = None
            have_lock = False
            lock = REDIS_CLIENT.lock(key, timeout=timeout)
            try:
                have_lock = lock.acquire(blocking=False)
                if have_lock:
                    ret_value = run_func(*args, **kwargs)
            finally:
                if have_lock:
                    lock.release()

            return ret_value

        return _caller

    return _dec(function) if function is not None else _dec
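Since only the decorator itself is shown above, here is a rough usage sketch (the task body and lock key are placeholders):

# Hypothetical usage: the key names the lock, and the timeout releases a
# stale lock if a worker dies mid-run.
@task()
@only_one(key="update_all_feeds_lock", timeout=60 * 5)
def update_all_feeds():
    for feed in Feed.objects.all():
        feed.fetch_articles()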
I was wondering why nobody has mentioned using celery.app.control.inspect().active() to get the list of currently running tasks. Is it not real-time? Because otherwise it would be very easy to implement; for instance:
from functools import wraps


def unique_task(callback, *decorator_args, **decorator_kwargs):
    """
    Decorator to ensure only one instance of the task is running at once.
    """
    @wraps(callback)
    def _wrapper(celery_task, *args, **kwargs):
        active_queues = celery_task.app.control.inspect().active()
        if active_queues:
            for queue in active_queues:
                for running_task in active_queues[queue]:
                    # Discard the currently running task from the list.
                    if celery_task.name == running_task['name'] and celery_task.request.id != running_task['id']:
                        return f'Task "{callback.__name__}()" cancelled! already running...'
        return callback(celery_task, *args, **kwargs)
    return _wrapper
And then just apply the decorator to the corresponding tasks:
@celery.task(bind=True)
@unique_task
def my_task(self):
    # task executed once at a time.
    pass
This solution is for Celery running on a single host with concurrency greater than 1. Other dependency-free kinds of locks (i.e. without Redis), apart from file-based ones, don't work with concurrency greater than 1.
from datetime import datetime, timedelta
from fcntl import flock, LOCK_EX, LOCK_NB
from hashlib import md5
from os.path import join
from time import sleep

from celery.task import PeriodicTask  # old-style Celery import


class Lock(object):
    def __init__(self, filename):
        self.f = open(filename, 'w')

    def __enter__(self):
        try:
            flock(self.f.fileno(), LOCK_EX | LOCK_NB)
            return True
        except IOError:
            pass
        return False

    def __exit__(self, *args):
        self.f.close()


class SinglePeriodicTask(PeriodicTask):
    abstract = True
    run_every = timedelta(seconds=1)

    def __call__(self, *args, **kwargs):
        lock_filename = join('/tmp',
                             md5(self.name).hexdigest())
        with Lock(lock_filename) as is_locked:
            if is_locked:
                super(SinglePeriodicTask, self).__call__(*args, **kwargs)
            else:
                print 'already working'


class SearchTask(SinglePeriodicTask):
    restart_delay = timedelta(seconds=60)

    def run(self, *args, **kwargs):
        print self.name, 'start', datetime.now()
        sleep(5)
        print self.name, 'end', datetime.now()