Python async: limit concurrent coroutines per second - python

My use case is the following:
I'm using Python 3.8.
I have an async function analyse_doc that is a wrapper for an HTTP request to a web service.
I have approx. 1000 docs to analyse as fast as possible. The service allows for 15 transactions per second (and not 15 concurrent requests at any second). So the first second I can send 15, then the 2nd second I can send 15 again, and so on. If I try to hit the service more than 15 times per second I get a 429 error, or sometimes a 503/504 error (server is busy…).
My question is: is it possible to implement something in Python that effectively sends 15 requests per second asynchronously, then waits 1 second, then does it again until the queue is empty? Also, some tasks might fail, and those failing tasks might need a rerun at some point.
So far my code is the following (unbounded parallelism… not even a semaphore), but it handles retry.
tasks = {asyncio.create_task(analyse_doc(doc)): doc for doc in documents}
pending = set(tasks)

# Handle retry
while pending:
    # backoff in case of 429
    await asyncio.sleep(1)
    # concurrent call, return_when all completed
    finished, pending = await asyncio.wait(
        pending, return_when=asyncio.ALL_COMPLETED
    )
    # check if a task has an exception and register it for a new run
    for task in finished:
        doc = tasks[task]
        if task.exception():
            new_task = asyncio.create_task(analyse_doc(doc))
            tasks[new_task] = doc
            pending.add(new_task)

You could try adding another sleeping task into the mix to drive the request generation. Something like this:
import asyncio
import random

ONE_SECOND = 1
CONCURRENT_TASK_LIMIT = 2
TASKS_TO_CREATE = 10

loop = asyncio.new_event_loop()

work_todo = []
work_in_progress = []

# just creates arbitrary work to do
def create_tasks():
    for i in range(TASKS_TO_CREATE):
        work_todo.append(worker_task(i))

    # muddle this up to see how drain works
    random.shuffle(work_todo)

# represents the actual work
async def worker_task(index):
    print(f"i am worker {index} and i am starting")
    await asyncio.sleep(index)
    print(f"i am worker {index} and i am done")

# gets the next 'concurrent' workload segment (if there is one)
def get_next_tasks():
    todo = []
    i = 0
    while i < CONCURRENT_TASK_LIMIT and len(work_todo) > 0:
        todo.append(work_todo.pop())
        i += 1
    return todo

# drains down any outstanding tasks and closes the loop
async def are_we_done_yet():
    print('draining')
    await asyncio.gather(*work_in_progress)
    loop.stop()
    # closes out the program
    print('done')

# puts work on the queue every tick (1 second)
async def work():
    next_tasks = get_next_tasks()
    if len(next_tasks) > 0:
        print(f'found {len(next_tasks)} tasks to do')
        for task in next_tasks:
            # schedules the work, puts it in the in-progress pile
            work_in_progress.append(loop.create_task(task))
        # this is the 'tick' or speed work gets scheduled on
        await asyncio.sleep(ONE_SECOND)
        # every 'tick' we add this task onto the loop again unless there isn't any more to do...
        loop.create_task(work())
    else:
        # ... if there isn't any to do we just enter drain mode
        await are_we_done_yet()

# bootstrap the process
create_tasks()
loop.create_task(work())
loop.run_forever()
Updated version with a simulated exception
import asyncio
import random

ONE_SECOND = 1
CONCURRENT_TASK_LIMIT = 2
TASKS_TO_CREATE = 10

loop = asyncio.new_event_loop()

work_todo = []
work_in_progress = []

# just creates arbitrary work to do
def create_tasks():
    for i in range(TASKS_TO_CREATE):
        work_todo.append(worker_task(i))

    # muddle this up to see how drain works
    random.shuffle(work_todo)

# represents the actual work
async def worker_task(index):
    try:
        print(f"i am worker {index} and i am starting")
        await asyncio.sleep(index)
        if index % 9 == 0:
            print('simulating error')
            raise NotImplementedError("some error happened")
        print(f"i am worker {index} and i am done")
    except Exception:
        # put this work back on the pile (fudge the index so it doesn't throw this time)
        work_todo.append(worker_task(index + 1))

# gets the next 'concurrent' workload segment (if there is one)
def get_next_tasks():
    todo = []
    i = 0
    while i < CONCURRENT_TASK_LIMIT and len(work_todo) > 0:
        todo.append(work_todo.pop())
        i += 1
    return todo

# drains down any outstanding tasks and closes the loop
async def are_we_done_yet():
    print('draining')
    await asyncio.gather(*work_in_progress)
    if len(work_todo) > 0:
        loop.create_task(work())
        print('found some retries')
    else:
        loop.stop()
        # closes out the program
        print('done')

# puts work on the queue every tick (1 second)
async def work():
    next_tasks = get_next_tasks()
    if len(next_tasks) > 0:
        print(f'found {len(next_tasks)} tasks to do')
        for task in next_tasks:
            # schedules the work, puts it in the in-progress pile
            work_in_progress.append(loop.create_task(task))
        # this is the 'tick' or speed work gets scheduled on
        await asyncio.sleep(ONE_SECOND)
        # every 'tick' we add this task onto the loop again unless there isn't any more to do...
        loop.create_task(work())
    else:
        # ... if there isn't any to do we just enter drain mode
        await are_we_done_yet()

# bootstrap the process
create_tasks()
loop.create_task(work())
loop.run_forever()
This just simulates something going wrong and re-queues the failed task. If the error happens after the main work method has finished, it won't get re-queued, which is why the are_we_done_yet method checks for and reruns any outstanding retries. This isn't particularly optimal, as it waits for the drain before checking everything else, but it gives you an idea of an implementation.
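For reference, here is a rough, self-contained sketch of the same tick-driven idea using an asyncio.Queue and asyncio.run (Python 3.8+). It assumes analyse_doc raises an exception on a 429/5xx response, and it leaves out capping the number of retries:

import asyncio

REQUESTS_PER_SECOND = 15

async def run_all(documents, analyse_doc):
    todo = asyncio.Queue()
    for doc in documents:
        todo.put_nowait(doc)
    in_flight = set()
    results = []

    async def run_one(doc):
        try:
            results.append((doc, await analyse_doc(doc)))
        except Exception:
            # assumed behaviour: analyse_doc raises on 429/5xx, so re-queue
            # the doc and let a later tick retry it (no retry cap here)
            todo.put_nowait(doc)

    while not todo.empty() or in_flight:
        # fire at most 15 requests on this tick, without waiting for earlier ones
        for _ in range(min(REQUESTS_PER_SECOND, todo.qsize())):
            task = asyncio.create_task(run_one(todo.get_nowait()))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)
        await asyncio.sleep(1)  # one tick per second keeps the 15/sec budget
    return results

# results = asyncio.run(run_all(documents, analyse_doc))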

Related

Execute asyncio task as soon as possible

I would like to know how I can execute the task group 'tg_fast' immediately, and afterwards continue the task group 'tg_main' (or start it again if continuing is not possible).
Using asyncio.gather() gives the same result as the TaskGroup.
import asyncio

async def another_coro(i):
    print(i)
    await asyncio.sleep(.1)

async def coro(i):
    if i == 1:
        async with asyncio.TaskGroup() as tg_fast:
            tg_fast.create_task(another_coro(i * 10))
            tg_fast.create_task(another_coro(i * 100))
        # await asyncio.gather(*[another_coro(i * 10), another_coro(i * 100)])
    else:
        print(i)
        await asyncio.sleep(.1)

async def main():
    async with asyncio.TaskGroup() as tg_main:
        for i in range(0, 3):
            tg_main.create_task(coro(i))

asyncio.run(main(), debug=True)
The output is 0 => 2 => 10 => 100.
But I would like a method to get 0 => 10 => 100 => ... OR 0 => 100 => 10 => ...
The goal is to start 10 and 100 after 0 and before 2.
Thank you very much for your help.
Edit:
I want to call 'another_coro' simultaneously, not wait for one and start the second one after.
And I don't need them to finish; I can execute both until they await asyncio.sleep(.1) and then continue the event loop.
For this to work, you have to deliberately add another mechanism to prioritize tasks, and it has to be applied explicitly to the other tasks in the "non-priority" group.
It could be done by, for example, subclassing asyncio.TaskGroup and adding a priority mechanism to the __aexit__ method, so that when a group is about to be exited (and all its tasks are about to be awaited), it checks a central registry of all instances of your specialized TaskGroup for a group with greater priority still running, and waits until that one exits.
That would work without needing to change any code in your tasks - just how you instantiate your groups - but on the other hand, it would not prevent the non-prioritized tasks from stepping in and running parts at any other point in the code where they await (or otherwise yield to the asyncio loop).
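As an illustration only (not code from this answer), a subclass along those lines might look like the sketch below on Python 3.11+; the class name, the class-level registry, and the sleep(0) busy-yield are all assumptions of this sketch rather than a tested recipe:

import asyncio

class PriorityTaskGroup(asyncio.TaskGroup):
    _running = set()  # central registry of live groups (shared by all instances)

    def __init__(self, priority=10):
        super().__init__()
        self.priority = priority

    async def __aenter__(self):
        type(self)._running.add(self)
        return await super().__aenter__()

    async def __aexit__(self, exc_type, exc, tb):
        try:
            # before awaiting our own tasks, keep yielding to the loop while a
            # more urgent group (lower number == higher priority) is still running
            while any(g is not self and g.priority < self.priority
                      for g in type(self)._running):
                await asyncio.sleep(0)
            return await super().__aexit__(exc_type, exc, tb)
        finally:
            type(self)._running.discard(self)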
Another approach, for which I wrote the snippet below, requires you to change the tasks that are to have lower priority and call a specialized sleep in them at certain points (it can be called with a delay of 0, just like asyncio.sleep). The points where these calls are placed become explicit points where your tasks will yield priority to the tasks that should run first.
This allows greater flexibility and is more explicit, and it is guaranteed to pause your lower-priority work - the downside being that you have to explicitly add the "checkpoints" to your code.
Note that this works because the modified .sleep method simply does not return while any other higher-priority task is running.
import asyncio
from heapq import heappush, heapify

granularity = 0.01

class PriorityGroups:
    def __init__(self):
        self.priority_queue = []
        self.counter = 0

    async def sleep(self, delay, priority=10):
        counter = self.counter
        self.counter += 1
        steps = delay / granularity
        step = 0
        heappush(self.priority_queue, (priority, counter))
        try:
            while step < steps or (self.priority_queue and self.priority_queue[0][0] < priority):
                # sleep one granularity step at a time (also avoids dividing by zero when delay == 0)
                await asyncio.sleep(granularity)
                step += 1
        finally:
            self.priority_queue.remove((priority, counter))
            heapify(self.priority_queue)

priority_group = PriorityGroups()

async def another_coro(i, priority=1):
    await priority_group.sleep(.1, priority)
    print(i)

async def coro(i):
    if i == 1:
        async with asyncio.TaskGroup() as tg_fast:
            tg_fast.create_task(another_coro(i * 10))
            tg_fast.create_task(another_coro(i * 100))
        # await asyncio.gather(*[another_coro(i * 10), another_coro(i * 100)])
    else:
        await priority_group.sleep(.1)
        print(i)

async def main():
    async with asyncio.TaskGroup() as tg_main:
        for i in range(0, 3):
            tg_main.create_task(coro(i))

asyncio.run(main(), debug=True)
So just place calls to the same instance of PriorityGroups.sleep, optionally passing a lower number for the priority (== higher priority), in the things that should run first. Having the control placed in an instance of PriorityGroups even means you can have parallel nested groups of tasks and priority tasks, and one group won't interfere with the others.

Process async results in another thread - app architecture (python-3.7)

I have a program that receives data (trades) from the Binance API.
This data will be processed and visualized in a web app with Dash and Plotly.
In order to get the best performance and the smallest delay, my program has 3 threads:
Thread 1 - Binance API - get requests - Trades
if __name__ == "__main__":
    try:
        loop = asyncio.get_event_loop()
        binance_thread = threading.Thread(target=start_thread_1)
        ...

def start_thread_1():
    loop.run_until_complete(main(api_key, secret_key))

async def main(api_key, secret_key):
    client = await AsyncClient.create(api_key, secret_key)
    await trades_listener(client)

async def trades_listener(client):
    bm = BinanceSocketManager(client)
    symbol = 'BTCUSDT'
    async with bm.trade_socket(symbol=symbol) as stream:
        while True:
            msg = await stream.recv()
            event_type = msg['e']
            ...
            trade = Trade(event_type, ...)
            # <-- save trade SOMEWHERE to process in the other thread? save to: process_trades_list
Thread 2 - Web App - Displays Trades and Processed Trades Data
web_thread = threading.Thread(target=webserver.run_server)
...
not worth mentioning
Thread 3 - Process Data - Process Trades (calculate RSI, filter big trades, etc)
if __name__ == "__main__":
    try:
        loop = asyncio.get_event_loop()
        binance_thread = threading.Thread(target=start_thread_1)
        web_thread = threading.Thread(target=webserver.run_server)
        process_thread = threading.Thread(target=start_thread_3)
        ...
        .start()
        .sleep()
        etc.
        .join()

def start_thread_3():
    process_trades()

def process_trades():
    global process_trades_list
    while True:
        while len(process_trades_list) > 0:
            trade = process_trades_list[0]
            process_trades_list.pop(0)
            # ...do calculation etc.
HOW can I save / hand over the data from thread_1 / the async thread to thread_3?
I tried to put the trades into a list called process_trades_list and then loop with while len(process_trades_list) > 0 over all trades.
In the loop I pop() processed trades from the list - but this somehow seems to break the program without throwing errors.
What's the best way to get this done?
It is possible that the async stream gets spammed by new incoming trades, and I want to minimize the load.
Here you want a queue.Queue instead of a list. Your last code snippet would look something like this:
import queue

if __name__ == "__main__":
    try:
        q = queue.Queue()
        binance_thread = threading.Thread(target=start_thread_1,
                                          args=(q,))
        web_thread = threading.Thread(target=webserver.run_server)
        process_thread = threading.Thread(target=process_trades,
                                          args=(q,), daemon=True)
        ...
        .start()
        .sleep()
        etc.
        .join()

def process_trades(q):
    while True:
        trade = q.get()
        # ...do calculation etc.
I eliminated the call to get_event_loop since you didn't use the returned object. I eliminated the start_thread_3 function, which is not necessary.
I made thread-3 a daemon, so it will not keep your application open if everything else is finished.
The queue should be created once, in the main thread, and passed explicitly to each thread that needs to access it. That eliminates the need for a global variable.
The process_trades function becomes much simpler. The q.get() call blocks until an object is available. It also pops the object off the queue.
Next you must also modify thread-1 to put objects onto the queue, like this:
def start_thread_1(q):
    asyncio.run(main(api_key, secret_key, q))

async def main(api_key, secret_key, q):
    client = await AsyncClient.create(api_key, secret_key)
    await trades_listener(client, q)

async def trades_listener(client, q):
    bm = BinanceSocketManager(client)
    symbol = 'BTCUSDT'
    async with bm.trade_socket(symbol=symbol) as stream:
        while True:
            msg = await stream.recv()
            event_type = msg['e']
            ...
            trade = Trade(event_type, ...)
            q.put(trade)
The q.put function is how you safely put a trade object into the queue, which will then result in activity in thread-3.
I modified the start_thread_1 function: this is a good place to start the event loop mechanism for this thread.
You ask about avoiding spam attacks on your program. Queues have methods that allow you to limit their size, and possibly throw away trades if they become full.
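For example, a minimal sketch of that idea (the maxsize of 1000 and the helper name offer are arbitrary choices for illustration, not something from your code):

import queue

q = queue.Queue(maxsize=1000)

def offer(trade):
    try:
        q.put_nowait(trade)
    except queue.Full:
        # shed load: drop this trade instead of stalling the socket loop
        pass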
I don't understand what you are trying to do with the if __name__ == '__main__' logic in thread-1. The program can have only one entry point, and only one module named '__main__'. It looks to me like that has to be thread-3.

Read in parallel and write sequentially?

I have the following code, which reads and writes for each id sequentially.
async def main():
    id = 0
    while id < 1000:
        data = await read_async(id)
        await data.write_async(f'{id}.csv')
        id += 1
read_async() takes several minutes and write_async() takes less than one minute to run. Now I want to:
1. Run read_async(id) in parallel. However, at most 3 calls can run in parallel because of a memory limitation.
2. Run write_async sequentially, i.e., write_async(n+1) cannot run before write_async(n).
You could use a queue and a fixed number of tasks for reading, and write from the main task. The main task can use an event to find out that new data is available from the readers, and a shared dict to get it from them. For example (untested):
async def reader(q, id_to_data, data_ready):
    while True:
        id = await q.get()
        data = await read_async(id)
        id_to_data[id] = data
        data_ready.set()

async def main():
    q = asyncio.Queue()
    for id in range(1000):
        await q.put(id)
    id_to_data = {}
    data_ready = asyncio.Event()
    readers = [asyncio.create_task(reader(q, id_to_data, data_ready))
               for _ in range(3)]
    for id in range(1000):
        while True:
            # wait for the current ID to appear before writing
            if id in id_to_data:
                data = id_to_data.pop(id)
                await data.write_async(f'{id}.csv')
                break  # move on to the next ID
            else:
                # wait for new data and try again
                await data_ready.wait()
                data_ready.clear()
    for r in readers:
        r.cancel()
Using a separate queue for results instead of the event wouldn't work because a queue is unordered. A priority queue would fix that, but it would still immediately return the lowest id currently available, whereas the writer needs the next id in order to process all ids in order.
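For comparison, here is a shorter sketch of the same constraints using asyncio.Semaphore; it assumes read_async and write_async exist as in the question, and that creating 1000 Task objects up front is acceptable:

import asyncio

async def bounded_read(sem, id):
    async with sem:  # at most 3 reads are in flight at once
        return await read_async(id)

async def main():
    sem = asyncio.Semaphore(3)
    tasks = [asyncio.create_task(bounded_read(sem, id)) for id in range(1000)]
    for id, task in enumerate(tasks):
        data = await task  # awaited in id order, so writes stay sequential
        await data.write_async(f'{id}.csv')

asyncio.run(main())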

Python asyncio: stop and start a task from another task without lossing state?

I would like to stop a Python asyncio task from another task and start it again when some condition in the second task happens.
Please note that I don't want to cancel the coroutine of the first task (the state of that coroutine when it stopped should be available). Also, I don't care about the exact state the first task is in; I just want the event loop to stop running the first task until told otherwise by the second.
I hope this example code helps in understanding the problem:
import asyncio

async def coroutine1():
    i = 0
    while True:
        i += 1
        print("coroutine1: " + str(i))
        await asyncio.sleep(1)

async def coroutine2(task1):
    i = 0
    while True:
        i += 1
        if (i > 3) and (i < 10):
            pass  # TODO: stop task1 here
        else:
            pass  # TODO: Maybe check if task1 is running
                  # and start task1 again if it's not?
        print("coroutine2: " + str(i))
        await asyncio.sleep(1)

async def main_coroutine():
    loop = asyncio.get_event_loop()
    task1 = loop.create_task(coroutine1())
    task2 = loop.create_task(coroutine2(task1))
    done, pending = await asyncio.wait(
        [task1, task2],
        return_when=asyncio.FIRST_COMPLETED,
    )

loop = asyncio.get_event_loop()
loop.run_until_complete(main_coroutine())
loop.close()
loop.close()
I would like to stop a python asyncio task from another task and start it again when some condition in the second task happens.
I assume you control the task creation, but don't want to touch the implementation of the coroutine. In your case, you control coroutine2 and main_coroutine, but not the insides of coroutine1.
In that case you can wrap the coroutine in an __await__ that, instead of the normal yield from loop, checks your stopped flag and waits for a future that tells it when to resume.
class Stoppable:
    def __init__(self, coro):
        self._coro_iter = coro.__await__()
        self._stopped = None

    def __await__(self):
        while True:
            while self._stopped:
                print('awaiting stopped')
                yield from self._stopped.__await__()
            try:
                v = next(self._coro_iter)
            except StopIteration as e:
                return e.value
            yield v

    def stop(self):
        loop = asyncio.get_event_loop()
        self._stopped = loop.create_future()

    def start(self):
        if self._stopped is not None:
            self._stopped.set_result(None)
            self._stopped = None
You can use the wrapper to modify coroutine2 to stop and resume the execution of coroutine1 at will:
async def coroutine2(s):
    i = 0
    while True:
        i += 1
        if i == 3:
            print('stopping coroutine1')
            s.stop()
        elif i == 10:
            print('restarting coroutine1')
            s.start()
        print("coroutine2: " + str(i))
        await asyncio.sleep(1)

async def main_coroutine():
    loop = asyncio.get_event_loop()
    s = Stoppable(coroutine1())
    fut1 = asyncio.ensure_future(s)
    task2 = loop.create_task(coroutine2(s))
    done, pending = await asyncio.wait(
        [fut1, task2], return_when=asyncio.FIRST_COMPLETED)
The way the wrapper works is by unrolling the loop inherent in yield from. For example, to just delegate __await__ to another coroutine, one would write:
def __await__(self):
    yield from self._coro_iter
Written like this, you can't implement stopping because the yield from contains an implicit loop that yields all the values produced by the underlying iterator - something like:
def __await__(self):
    while True:
        try:
            v = next(self._coro_iter)
        except StopIteration as e:
            return e.value
        yield v
Taken like this, it is easy enough to add an if that checks for _stopped at each iteration pass, meaning each time we're resumed by the event loop. The remaining hurdle is that one cannot just busy-loop until _stopped is rescinded - we must yield something else to allow the event loop to resume running other coroutines. Fortunately that is easily achieved by making _stopped a future, and yielding from the future. When the future's result is set, we will be automatically resumed and continue executing the wrapped coroutine.
It seems it can't be done.
It's possible to cancel an ongoing task with task1.cancel(), and it's possible to create a new task with asyncio.get_event_loop().create_task(newTask).
It's also possible to get the coroutine of a running task with task1._coro, but if we try to create a task again with a previously scheduled coroutine we will get a RuntimeError exception. This is the discussion where they decided it: https://bugs.python.org/issue25887
Finally, a possible way of accomplishing the desired effect is using an asyncio.Queue object:
import asyncio

async def coroutine1(stop_queue):
    i = 0
    while True:
        if stop_queue.empty():  # if the queue is empty keep working
            i += 1
            print("coroutine1: " + str(i))
        await asyncio.sleep(1)

async def coroutine2(stop_queue):
    i = 0
    while True:
        i += 1
        if i == 3:
            await stop_queue.put("whatever..")  # put something in the queue
        if i == 11:
            await stop_queue.get()  # take something from the queue
        print("coroutine2: " + str(i))
        await asyncio.sleep(1)

async def main_coroutine():
    stop_queue = asyncio.Queue()
    done, pending = await asyncio.wait(
        [coroutine1(stop_queue), coroutine2(stop_queue)],
        return_when=asyncio.ALL_COMPLETED,
    )

loop = asyncio.get_event_loop()
loop.run_until_complete(main_coroutine())
loop.close()
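For completeness, if editing coroutine1 is acceptable, an asyncio.Event can serve as a simple pause gate; the following is a minimal sketch of that pattern (not part of the answers above), where clear() pauses the worker at its next checkpoint and set() resumes it without losing its local state:

import asyncio

async def coroutine1(gate):
    i = 0
    while True:
        await gate.wait()  # suspends here while the gate is cleared
        i += 1
        print("coroutine1: " + str(i))
        await asyncio.sleep(1)

async def coroutine2(gate):
    i = 0
    while True:
        i += 1
        if i == 3:
            gate.clear()  # pause coroutine1 at its next checkpoint
        elif i == 10:
            gate.set()    # resume coroutine1 where it left off
        print("coroutine2: " + str(i))
        await asyncio.sleep(1)

async def main():
    gate = asyncio.Event()
    gate.set()  # start unpaused
    await asyncio.gather(coroutine1(gate), coroutine2(gate))

asyncio.run(main())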

Python asyncio - consumer blocking with asyncio.Event()

I have a program with one producer and two slow consumers, and I'd like to rewrite it with coroutines in such a way that each consumer handles only the last value (i.e. skips new values generated while processing the old ones) produced for it (I used threads and threading.Queue(), but it blocks on put(), because the queue is full most of the time).
After reading the answer to this question I decided to use asyncio.Event and asyncio.Queue. I wrote this prototype program:
import asyncio

async def l(event, q):
    h = 1
    while True:
        # ready
        event.set()
        # get value to process
        a = await q.get()
        # process it
        print(a * h)
        h *= 2

async def m(event, q):
    i = 1
    while True:
        # pass element to consumer, when it's ready
        if event.is_set():
            await q.put(i)
            event.clear()
        # produce value
        i += 1

el = asyncio.get_event_loop()
ev = asyncio.Event()
qu = asyncio.Queue(2)
tasks = [
    asyncio.ensure_future(l(ev, qu)),
    asyncio.ensure_future(m(ev, qu))
]
el.run_until_complete(asyncio.gather(*tasks))
el.close()
and I have noticed that the l coroutine blocks on the q.get() line and doesn't print anything.
It works as I expect after adding asyncio.sleep() to both (I get 1, 11, 21, ...):
import asyncio
import time

async def l(event, q):
    h = 1
    a = 1
    event.set()
    while True:
        # await asyncio.sleep(1)
        a = await q.get()
        # process it
        await asyncio.sleep(1)
        print(a * h)
        event.set()

async def m(event, q):
    i = 1
    while True:
        # pass element to consumer, when it's ready
        if event.is_set():
            await q.put(i)
            event.clear()
        await asyncio.sleep(0.1)
        # produce value
        i += 1

el = asyncio.get_event_loop()
ev = asyncio.Event()
qu = asyncio.Queue(2)
tasks = [
    asyncio.ensure_future(l(ev, qu)),
    asyncio.ensure_future(m(ev, qu))
]
el.run_until_complete(asyncio.gather(*tasks))
el.close()
...but I'm looking for a solution without it.
Why is that? How can I fix it? I think I cannot call await l() from m, as both of them have state (in the original program the first draws a solution with PyGame and the second plots results).
The code is not working as expected because the task running the m function is never suspended. The task will continue to increment i whenever event.is_set() == False. Because this task is never suspended, the task running function l will never be called. Therefore, you need a way to suspend the task running function m. One way of suspending is awaiting another coroutine; that is the reason why asyncio.sleep works as expected.
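To illustrate that point with your prototype (a minimal tweak, assuming the same l, event, and queue as in your first snippet, not the full solution below): yielding with a zero-delay sleep in the branch where the event is not set is enough to let the consumer task run:

async def m(event, q):
    i = 1
    while True:
        # pass element to consumer, when it's ready
        if event.is_set():
            await q.put(i)
            event.clear()
        else:
            # give control back to the event loop so l() can be scheduled
            await asyncio.sleep(0)
        # produce value
        i += 1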
I think the following code will work as you expect. The LeakyQueue ensures that only the last value from the producer is processed by the consumer. Since the producer and consumer run at a similar pace here, the consumer will consume all values produced by the producer. If you increase the delay argument, you can simulate the consumer only processing the last value created by the producer.
import asyncio

class LeakyQueue(asyncio.Queue):
    async def put(self, item):
        if self.full():
            await self.get()
        await super().put(item)

async def consumer(queue, delay=0):
    h = 1
    while True:
        a = await queue.get()
        if delay:
            await asyncio.sleep(delay)
        print('consumer', a)
        h += 2

async def producer(queue):
    i = 1
    while True:
        await asyncio.ensure_future(queue.put(i))
        print('producer', i)
        i += 1

loop = asyncio.get_event_loop()
queue = LeakyQueue(maxsize=1)
tasks = [
    asyncio.ensure_future(consumer(queue, 0)),
    asyncio.ensure_future(producer(queue))
]
loop.run_until_complete(asyncio.gather(*tasks))
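A related sketch, in case subclassing is not wanted (this is only an assumption-laden variant of the same "keep only the latest value" idea, using put_nowait/get_nowait on a size-1 queue):

import asyncio

async def producer(queue):
    i = 0
    while True:
        if queue.full():
            queue.get_nowait()  # discard the stale value
        queue.put_nowait(i)
        i += 1
        await asyncio.sleep(0)  # yield so the consumer can run

async def consumer(queue):
    while True:
        a = await queue.get()
        await asyncio.sleep(1)  # simulate slow processing
        print('consumer', a)

async def main():
    queue = asyncio.Queue(maxsize=1)
    await asyncio.gather(producer(queue), consumer(queue))

asyncio.run(main())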
