asyncio.gather vs list comprehension

asyncio.gather vs list comprehension - python

In the following code, are get_all_details_1 and get_all_details_2 behave the same?
async def get_details(category, limit):
async with limit:
# ...
limit = asyncio.Semaphore(4)
a_list = .... # a big list
async def get_all_details_1():
b_list = await asyncio.gather(*[get_details(x, limit) for x in a_list])
# ....
async def get_all_details_2():
b_list = [await get_details(x, limit) for x in a_list]
# ....

Absolutely not! Example:
import asyncio
import time
async def slowtask():
await asyncio.sleep(1)
async def gather():
await asyncio.gather(*[slowtask() for _ in range(10)])
async def listcomp():
[await slowtask() for _ in range(10)]
start = time.time()
asyncio.run(gather())
print("gather", time.time() - start)
start = time.time()
asyncio.run(listcomp())
print("listcomp", time.time() - start)
gives us:
gather 1.0030405521392822
listcomp 10.015443325042725
asyncio.gather properly allows multiple async tasks to run asynchronously while the list comprehension awaits one after the other, leading to effectively serial code.

Related

async generator with slow consumer

If I have a slow consumer of an async generator that emits values at a quick frequency, and I only care about consuming the latest value (i.e. I'm not fussed about dropping values), is there a way to achieve this in an eloquent way? I've taken a look at aiostream but I couldn't seem to find anything that fits.
Here is a simple example:
import asyncio
import aiostream
async def main():
xs = aiostream.stream.count(interval=0.2)
async with xs.stream() as stream:
async for x in stream: # do something here to drop updates that aren't processed in time
print(x)
await asyncio.sleep(1.0)
if __name__ == "__main__":
asyncio.run(main())

I propose you to use a class that handles the external generator, since I don't know any source to do that.
The class can consume internally the generator in a task and keep only the last value. It's going to be like a wrapper over the generator you really want to consume.
import asyncio
class RelaxedGenerator:
def __init__(self, async_gen):
self.last_value = None # the last value generated
self.consumed_last = True # flags the last value as consumed
self.async_gen = async_gen # generator which we can drop values
self.exhausted = False # flags the generator as fully consumed
#classmethod
async def start(cls, async_gen):
self = cls(async_gen())
asyncio.create_task(self.generate())
return self
async def generate(self):
# here you can consume the external async generator
# and save only the last value for further process
while True:
try:
self.last_value = await self.async_gen.__anext__()
self.consumed_last = False
except StopAsyncIteration:
self.exhausted = True
break
async def stream(self):
while not self.exhausted:
if self.consumed_last:
await asyncio.sleep(0.01) # avoids block the loop
continue
self.consumed_last = True
yield self.last_value
Testing with a simple generator:
import asyncio
from random import uniform
async def numbers_stream(max_=100):
next_int = -1
while next_int < max_:
next_int += 1
yield next_int
await asyncio.sleep(0.2)
async def main():
gen = await RelaxedGenerator.start(numbers_stream)
async for value in gen.stream():
print(value, end=", ", flush=True)
await asyncio.sleep(uniform(1, 2))
asyncio.run(main())
Output:
0, 6, 15, 21, 28, 38, 43, 48, 57, 65, 73, 81, 89, 96,
Other things to keep in mind is if you want to process the last value or if the generator you are working with is going to be exhausted or not in practice. Here I assume that you don't care about last value and the generator can be exhausted.

You could add a queue between your producer and consumer which forgets old results. Unfortunately, there is no implementation for it in the standard library, but it is almost there. If you check the implementation of asyncio.Queue you will notice the use of collections.deque, see https://github.com/python/cpython/blob/3.10/Lib/asyncio/queues.py#L49.
The collections.deque takes the optional argument maxlen to discard previously added items, see https://docs.python.org/3/library/collections.html#collections.deque.
Making use of it, enables us to create our custom queue, which only keeps the last n items.
import asyncio
import collections
class RollingQueue(asyncio.Queue):
def _init(self, maxsize):
self._queue = collections.deque(maxlen=maxsize)
def full(self):
return False
Now you could use this queue as follows:
async def numbers(nmax):
for n in range(nmax):
yield n
await asyncio.sleep(0.3)
async def fill_queue(producer, queue):
async for item in producer:
queue.put_nowait(item)
queue.put_nowait(None)
queue1 = RollingQueue(1)
numgen = numbers(10)
task = fill_queue(numgen, queue1)
asyncio.create_task(task)
while True:
res = await queue1.get()
if res is None:
break
print(res)
await asyncio.sleep(1)
Where I set the queue size to 1 to just keep the last item as required in your question.

Using a combination of the two provided answers, I came up with the following solution which seems to work quite well:
import asyncio
import aiostream
import collections
class RollingQueue(asyncio.Queue):
def _init(self, maxsize):
self._queue = collections.deque(maxlen=maxsize)
def full(self):
return False
#aiostream.operator(pipable=True)
async def drop_stream(source, max_n=1):
queue = RollingQueue(max_n)
exhausted = False
async def inner_task():
async with aiostream.streamcontext(source) as streamer:
async for item in streamer:
queue.put_nowait(item)
nonlocal exhausted
exhausted = True
task = asyncio.create_task(inner_task())
try:
while not exhausted:
item = await queue.get()
yield item
finally:
task.cancel()
async def main():
xs = aiostream.stream.count(interval=0.2) | drop_stream.pipe(1) | aiostream.pipe.take(5)
async with xs.stream() as stream:
async for x in stream:
print(x)
await asyncio.sleep(1.0)
if __name__ == "__main__":
asyncio.run(main())

Synchronize asyncio Queue

I am planning to have an asyncio Queue based producer-consumer implementation for a processing of realtime data where sending out data in correct time order is vital. So here is the code snippet of it :
async def produce(Q, n_jobs):
for i in range(n_jobs):
print(f"Producing :{i}")
await Q.put(i)
async def consume(Q):
while True:
n = await Q.get()
print(f"Consumed :{n}")
x = do_sometask_and_return_the_result(n)
print(f"Finished :{n} and Result: {x}")
async def main(loop):
Q = asyncio.Queue(loop=loop, maxsize=3)
await asyncio.wait([produce(Q, 10), consume(Q), consume(Q), consume(Q)])
print("Done")
Here the producer produces data and puts it into the asyncio Queue. I have multiple consumers to consume and process the data. While seeing the outputs, the order is maintained while printing "Consumed :{n}" (as in 1,2,3,4... and so on) , this is completely fine. but, since the function do_sometask_and_return_the_result(n) takes variable time to return the result, the order is not maintained in the next print of n "Finished :{n}" (as in 2,1,4,3,5,...).
Is there any way to synchronize this data as I need to maintain the order of printing the results? I want to see 1,2,3,4,.. sequential prints for 'n' even after do_sometask_and_return_the_result(n).

You could use a priority queue system (using the python heapq library) to reorder your jobs after they are complete. Something like this.
# add these variables at class/global scope
priority_queue = []
current_job_id = 1
job_id_dict = {}
async def produce(Q, n_jobs):
# same as above
async def consume(Q):
while True:
n = await Q.get()
print(f"Consumed :{n}")
x = do_sometask_and_return_the_result(n)
await process_result(n, x)
async def process_result(n, x):
heappush(priority_queue, n)
job_id_dict[n] = x
while current_job_id == priority_queue[0]:
job_id = heappop(priority_queue)
print(f"Finished :{job_id} and Result: {job_id_dict[job_id]}")
current_job_id += 1
async def main(loop):
Q = asyncio.Queue(loop=loop, maxsize=3)
await asyncio.wait([produce(Q, 10), consume(Q), consume(Q), consume(Q)])
print("Done")
For more information on the heapq module: https://docs.python.org/3/library/heapq.html

Beginner async/await question for api requests

I want speed up some API requests... for that I try to figure out how to do and copy some code which run but when I try my own code its no longer asynchrone. Maybe someone find the fail?
Copy Code (guess from stackoverflow):
#!/usr/bin/env python3
import asyncio
#asyncio.coroutine
def func_normal():
print('A')
yield from asyncio.sleep(5)
print('B')
return 'saad'
#asyncio.coroutine
def func_infinite():
for i in range(10):
print("--%d" % i)
return 'saad2'
loop = asyncio.get_event_loop()
tasks = func_normal(), func_infinite()
a, b = loop.run_until_complete(asyncio.gather(*tasks))
print("func_normal()={a}, func_infinite()={b}".format(**vars()))
loop.close()
My "own" code (I need at the end a list returned and merge the results of all functions):
import asyncio
import time
#asyncio.coroutine
def say_after(start,count,say,yep=True):
retl = []
if yep:
time.sleep(5)
for x in range(start,count):
retl.append(x)
print(say)
return retl
def main():
print(f"started at {time.strftime('%X')}")
loop = asyncio.get_event_loop()
tasks = say_after(10,20,"a"), say_after(20,30,"b",False)
a, b = loop.run_until_complete(asyncio.gather(*tasks))
print("func_normal()={a}, func_infinite()={b}".format(**vars()))
loop.close()
c = a + b
#print(c)
print(f"finished at {time.strftime('%X')}")
main()
Or I m completly wrong and should solve that with multithreading? What would be the best way for API requests that returns a list that I need to merge?

Added comment for each section that needs improvement. Removed some to simply code.
In fact, I didn't find any performance uplift with using range() wrapped in coroutine and using async def, might worth with heavier operations.
import asyncio
import time
# #asyncio.coroutine IS DEPRECATED since python 3.8
#asyncio.coroutine
def say_after(wait=True):
result = []
if wait:
print("I'm sleeping!")
time.sleep(5)
print("'morning!")
# This BLOCKs thread, but release GIL so other thread can run.
# But asyncio runs in ONE thread, so this still harms simultaneity.
# normal for is BLOCKING operation.
for i in range(5):
result.append(i)
print(i, end='')
print()
return result
def main():
start = time.time()
# Loop argument will be DEPRECATED from python 3.10
# Make main() as coroutine, then use asyncio.run(main()).
# It will be in asyncio Event loop, without explicitly passing Loop.
loop = asyncio.get_event_loop()
tasks = say_after(), say_after(False)
# As we will use asyncio.run(main()) from now on, this should be await-ed.
a, b = loop.run_until_complete(asyncio.gather(*tasks))
print(f"Took {time.time() - start:5f}")
loop.close()
main()
Better way:
import asyncio
import time
async def say_after(wait=True):
result = []
if wait:
print("I'm sleeping!")
await asyncio.sleep(2) # 'await' a coroutine version of it instead.
print("'morning!")
# wrap iterator in generator - or coroutine
async def asynchronous_range(end):
for _i in range(end):
yield _i
# use it with async for
async for i in asynchronous_range(5):
result.append(i)
print(i, end='')
print()
return result
async def main():
start = time.time()
tasks = say_after(), say_after(False)
a, b = await asyncio.gather(*tasks)
print(f"Took {time.time() - start:5f}")
asyncio.run(main())
Result
Your code:
DeprecationWarning: "#coroutine" decorator is deprecated since Python 3.8, use "async def" instead
def say_after(wait=True):
I'm sleeping!
'morning!
01234
01234
Took 5.003802
Better async code:
I'm sleeping!
01234
'morning!
01234
Took 2.013863
Note that fixed code now finish it's job while other task is sleeping.

How to yield from an async for loop using asyncio?

I'm trying to write a simple asynchronous data batch generator, but having troubles with understanding how to yield from an async for loop. Here I've written a simple class illustrating my idea:
import asyncio
from typing import List
class AsyncSimpleIterator:
def __init__(self, data: List[str], batch_size=None):
self.data = data
self.batch_size = batch_size
self.doc2index = self.get_doc_ids()
def get_doc_ids(self):
return list(range(len(self.data)))
async def get_batch_data(self, doc_ids):
print("get_batch_data() running")
page = [self.data[j] for j in doc_ids]
return page
async def get_docs(self, batch_size):
print("get_docs() running")
_batch_size = self.batch_size or batch_size
batches = [self.doc2index[i:i + _batch_size] for i in
range(0, len(self.doc2index), _batch_size)]
for _, doc_ids in enumerate(batches):
docs = await self.get_batch_data(doc_ids)
yield docs, doc_ids
async def main(self):
print("main() running")
async for res in self.get_docs(batch_size=2):
print(res) # how to yield instead of print?
def gen_batches(self):
# how to get results of self.main() here?
loop = asyncio.get_event_loop()
loop.run_until_complete(self.main())
loop.close()
DATA = ["Hello, world!"] * 4
iterator = AsyncSimpleIterator(DATA)
iterator.gen_batches()
So, my question is, how to yield a result from main() to gather it inside gen_batches()?
When I print the result inside main(), I get the following output:
main() running
get_docs() running
get_batch_data() running
(['Hello, world!', 'Hello, world!'], [0, 1])
get_batch_data() running
(['Hello, world!', 'Hello, world!'], [2, 3])

I'm trying to write a simple asynchronous data batch generator, but having troubles with understanding how to yield from an async for loop
Yielding from an async for works like a regular yield, except that it also has to be collected by an async for or equivalent. For example, the yield in get_docs makes it an async generator. If you replace print(res) with yield res in main(), it will make main() an async generator as well.
the generator in main() should exhaust in gen_batches(), so I can gather all results in gen_batches()
To collect the values produced by an async generator (such as main() with print(res) replaced with yield res), you can use a helper coroutine:
def gen_batches(self):
loop = asyncio.get_event_loop()
async def collect():
return [item async for item in self.main()]
items = loop.run_until_complete(collect())
loop.close()
return items
The collect() helper makes use of a PEP 530 asynchronous comprehension, which can be thought of as syntactic sugar for the more explicit:
async def collect():
l = []
async for item in self.main():
l.append(item)
return l

A working solution based on #user4815162342 answer to the original question:
import asyncio
from typing import List
class AsyncSimpleIterator:
def __init__(self, data: List[str], batch_size=None):
self.data = data
self.batch_size = batch_size
self.doc2index = self.get_doc_ids()
def get_doc_ids(self):
return list(range(len(self.data)))
async def get_batch_data(self, doc_ids):
print("get_batch_data() running")
page = [self.data[j] for j in doc_ids]
return page
async def get_docs(self, batch_size):
print("get_docs() running")
_batch_size = self.batch_size or batch_size
batches = [self.doc2index[i:i + _batch_size] for i in
range(0, len(self.doc2index), _batch_size)]
for _, doc_ids in enumerate(batches):
docs = await self.get_batch_data(doc_ids)
yield docs, doc_ids
def gen_batches(self):
loop = asyncio.get_event_loop()
async def collect():
return [j async for j in self.get_docs(batch_size=2)]
items = loop.run_until_complete(collect())
loop.close()
return items
DATA = ["Hello, world!"] * 4
iterator = AsyncSimpleIterator(DATA)
result = iterator.gen_batches()
print(result)

Builtin way to transform asynchronous iterable to synchronous iterable list

Python3.6 now asynchronous iterables. Is there builtin way to transform a asynchronous iterable to a synchronous iterable.
I currently have this helper function, but it feels very un-pythonic. Is there a better way to do this?
async def aiter_to_list(aiter):
l = []
async for i in aiter:
l.append(i)
return l

From Python 3.6 you can use Asynchronous Comprehensions
async def async_iter():
for i in range(0,5):
yield i
# async comprehension
sync_list = [gen async for gen in async_iter()]
print(sync_list) # [0, 1, 2, 3, 4]

You can use aiostream.stream.list:
from aiostream import stream
async def agen():
yield 1
yield 2
yield 3
async def main():
lst = await stream.list(agen())
print(lst) # prints [1, 2, 3]
More operators and examples in the documentation.

Your "asynchronous to synchronous" helper is itself asynchronous; not a big change at all. In general: no, you cannot make something asynchronous synchronous. An asynchronous value will be supplied "sometime later"; you cannot make that into "now" because the value doesn't exist "now" and you will have to wait for it, asynchronously.

These functions allow you to convert from / to iterable <==> async iterable, not just simple lists.
Basic imports
import asyncio
import threading
import time
DONE = object()
TIMEOUT = 0.001
The function to_sync_iterable will convert any async iterable to a sync iterable:
def to_sync_iterable(async_iterable, maxsize = 0):
def sync_iterable():
queue = asyncio.Queue(maxsize=maxsize)
loop = asyncio.get_event_loop()
t = threading.Thread(target=_run_coroutine, args=(loop, async_iterable, queue))
t.daemon = True
t.start()
while True:
if not queue.empty():
x = queue.get_nowait()
if x is DONE:
break
else:
yield x
else:
time.sleep(utils.TIMEOUT)
t.join()
return sync_iterable()
def _run_coroutine(loop, async_iterable, queue):
loop.run_until_complete(_consume_async_iterable(async_iterable, queue))
async def _consume_async_iterable(async_iterable, queue):
async for x in async_iterable:
await queue.put(x)
await queue.put(DONE)
You can use it like this:
async def slow_async_generator():
yield 0
await asyncio.sleep(1)
yield 1
await asyncio.sleep(1)
yield 2
await asyncio.sleep(1)
yield 3
for x in to_sync_iterable(slow_async_generator()):
print(x)
The function to_async_iterable will convert any sync iterable to an async iterable:
def to_async_iterable(iterable, maxsize = 0):
async def async_iterable():
queue = asyncio.Queue(maxsize=maxsize)
loop = asyncio.get_event_loop()
task = loop.run_in_executor(None, lambda: _consume_iterable(loop, iterable, queue))
while True:
x = await queue.get()
if x is DONE:
break
else:
yield x
await task
return async_iterable()
def _consume_iterable(loop, iterable, queue):
for x in iterable:
while True:
if not queue.full():
loop.call_soon_threadsafe(queue.put_nowait, x)
break
else:
time.sleep(TIMEOUT)
while True:
if not queue.full():
loop.call_soon_threadsafe(queue.put_nowait, DONE)
break
else:
time.sleep(TIMEOUT)
This one is specially useful for asyncio programs because it won't block the event loop even if the the sync iterable blocks. You can use it like this:
def slow_sync_generator():
yield 0
time.sleep(1)
yield 1
time.sleep(1)
yield 2
time.sleep(1)
yield 3
async def async_task():
async for x in to_async_iterable(slow_sync_generator()):
print(x)
asyncio.get_event_loop().run_until_complete(async_task())

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

asyncio.gather vs list comprehension - python

Related

async generator with slow consumer

Synchronize asyncio Queue

Beginner async/await question for api requests

How to yield from an async for loop using asyncio?

Builtin way to transform asynchronous iterable to synchronous iterable list

Categories

Resources