How to yield from an async for loop using asyncio? - python

I'm trying to write a simple asynchronous data batch generator, but having troubles with understanding how to yield from an async for loop. Here I've written a simple class illustrating my idea:
import asyncio
from typing import List
class AsyncSimpleIterator:
    """Serve ``data`` in batches through asyncio coroutines (question version)."""

    def __init__(self, data: List[str], batch_size=None):
        self.data = data
        self.batch_size = batch_size
        self.doc2index = self.get_doc_ids()

    def get_doc_ids(self):
        """Return the positional index of every entry in ``self.data``."""
        return [*range(len(self.data))]

    async def get_batch_data(self, doc_ids):
        """Return the documents stored at the positions in ``doc_ids``."""
        print("get_batch_data() running")
        return [self.data[position] for position in doc_ids]

    async def get_docs(self, batch_size):
        """Asynchronously yield ``(docs, doc_ids)`` pairs, one per batch.

        The instance-level ``batch_size`` takes precedence over the argument
        whenever it is truthy.
        """
        print("get_docs() running")
        size = self.batch_size or batch_size
        for start in range(0, len(self.doc2index), size):
            doc_ids = self.doc2index[start:start + size]
            docs = await self.get_batch_data(doc_ids)
            yield docs, doc_ids

    async def main(self):
        """Consume ``get_docs()`` and print every batch."""
        print("main() running")
        async for res in self.get_docs(batch_size=2):
            print(res)  # how to yield instead of print?

    def gen_batches(self):
        """Drive ``main()`` to completion on the event loop."""
        # how to get results of self.main() here?
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.main())
        loop.close()
# Demo input: four identical documents.
DATA = ["Hello, world!"] * 4
iterator = AsyncSimpleIterator(DATA)
# Runs main() to completion; the batches are only printed, nothing is returned.
iterator.gen_batches()
So, my question is, how to yield a result from main() to gather it inside gen_batches()?
When I print the result inside main(), I get the following output:
main() running
get_docs() running
get_batch_data() running
(['Hello, world!', 'Hello, world!'], [0, 1])
get_batch_data() running
(['Hello, world!', 'Hello, world!'], [2, 3])

I'm trying to write a simple asynchronous data batch generator, but having troubles with understanding how to yield from an async for loop
Yielding from an async for works like a regular yield, except that it also has to be collected by an async for or equivalent. For example, the yield in get_docs makes it an async generator. If you replace print(res) with yield res in main(), it will make main() an async generator as well.
the generator in main() should exhaust in gen_batches(), so I can gather all results in gen_batches()
To collect the values produced by an async generator (such as main() with print(res) replaced with yield res), you can use a helper coroutine:
def gen_batches(self):
    """Exhaust the ``self.main()`` async generator and return its items as a list.

    Must be called from synchronous code (no event loop running in this
    thread).  Safe to call repeatedly: every call gets its own event loop.
    """
    async def collect():
        # PEP 530 asynchronous comprehension.
        return [item async for item in self.main()]

    # asyncio.run() creates a fresh loop and closes it afterwards, instead of
    # closing the thread's shared loop, which broke any later call.
    return asyncio.run(collect())
The collect() helper makes use of a PEP 530 asynchronous comprehension, which can be thought of as syntactic sugar for the more explicit:
async def collect():
    """Explicit-loop spelling of ``[item async for item in self.main()]``."""
    items = []
    async for item in self.main():
        items.append(item)
    return items

A working solution based on @user4815162342's answer to the original question:
import asyncio
from typing import List
class AsyncSimpleIterator:
    """Serve ``data`` in batches, produced asynchronously and gathered synchronously."""

    def __init__(self, data: List[str], batch_size=None):
        self.data = data
        self.batch_size = batch_size
        self.doc2index = self.get_doc_ids()

    def get_doc_ids(self):
        """Return the positional index of every entry in ``self.data``."""
        return list(range(len(self.data)))

    async def get_batch_data(self, doc_ids):
        """Return the documents stored at the positions in ``doc_ids``."""
        print("get_batch_data() running")
        return [self.data[j] for j in doc_ids]

    async def get_docs(self, batch_size):
        """Asynchronously yield ``(docs, doc_ids)`` pairs, one per batch.

        The instance-level ``batch_size`` takes precedence over the argument
        whenever it is truthy.
        """
        print("get_docs() running")
        _batch_size = self.batch_size or batch_size
        batches = [self.doc2index[i:i + _batch_size] for i in
                   range(0, len(self.doc2index), _batch_size)]
        for doc_ids in batches:
            docs = await self.get_batch_data(doc_ids)
            yield docs, doc_ids

    def gen_batches(self, batch_size=2):
        """Collect every batch from ``get_docs()`` into a list.

        ``batch_size`` is forwarded to ``get_docs()`` (default 2, the value
        that used to be hard-coded).  Must be called from synchronous code;
        safe to call repeatedly because each call uses its own event loop.
        """
        async def collect():
            return [batch async for batch in self.get_docs(batch_size=batch_size)]

        # asyncio.run() owns the loop's lifecycle; closing the thread's shared
        # loop by hand made a second call fail.
        return asyncio.run(collect())
# Demo input: four identical documents.
DATA = ["Hello, world!"] * 4
iterator = AsyncSimpleIterator(DATA)
# gen_batches() returns the list of (docs, doc_ids) batches.
result = iterator.gen_batches()
print(result)

Related

How to asynchronously run functions within a for-loop in Python?

Hi I was wondering how to asynchronously call a function within a for-loop in Python, allowing the for-loop to execute more quickly. bar() in this case is a time intensive function, which is why I want the calls to it to be nonblocking.
Here is what I want to refactor:
def bar(item):
    # manipulate item (placeholder for the time-intensive work that builds newItem)
    return newItem


newItems = []
for item in items:
    newItem = bar(item)
    newItems.append(newItem)
Here is what I've tried:
async def bar(item):
    # manipulate item (placeholder for the time-intensive work that builds newItem)
    return newItem


async def foo():
    # Calling bar() only creates coroutine objects; gather() is what runs them.
    pending = [bar(item) for item in items]
    return await asyncio.gather(*pending)


newItems = asyncio.run(foo())
This doesn't seem to work as each function call still waits for the previous one to finish before starting. I would love tips on what I might be doing wrong. Thank you so much for any and all help!
If your tasks are really async you can do it the following way:
import asyncio
async def bar(item: int) -> int:
    """Square ``item`` after a five-second awaitable pause."""
    # manipulate item
    print("Started")
    await asyncio.sleep(5)
    squared = item ** 2
    print("Finished")
    return squared
async def foo():
    """Run ``bar`` for the items 1..9 concurrently and return the results."""
    pending = [bar(number) for number in range(1, 10)]
    return await asyncio.gather(*pending)


if __name__ == '__main__':
    print(asyncio.run(foo()))

async generator with slow consumer

If I have a slow consumer of an async generator that emits values at a quick frequency, and I only care about consuming the latest value (i.e. I'm not fussed about dropping values), is there a way to achieve this in an eloquent way? I've taken a look at aiostream but I couldn't seem to find anything that fits.
Here is a simple example:
import asyncio
import aiostream
async def main():
    """Print values from a 0.2 s counter while taking 1 s to handle each one."""
    counter = aiostream.stream.count(interval=0.2)
    async with counter.stream() as ticks:
        # do something here to drop updates that aren't processed in time
        async for x in ticks:
            print(x)
            await asyncio.sleep(1.0)


if __name__ == "__main__":
    asyncio.run(main())
I suggest wrapping the external generator in a class, since I don't know of any existing library that does this.
The class can consume internally the generator in a task and keep only the last value. It's going to be like a wrapper over the generator you really want to consume.
import asyncio
class RelaxedGenerator:
    """Wrap an async generator so that a slow consumer only sees its latest value.

    A background task drains the wrapped generator and keeps just the most
    recent value; ``stream()`` hands that value out once each.  A value that
    is still unconsumed when the source finishes is not delivered.
    """

    def __init__(self, async_gen):
        self.last_value = None  # the last value generated
        self.consumed_last = True  # flags the last value as consumed
        self.async_gen = async_gen  # generator whose values may be dropped
        self.exhausted = False  # flags the generator as fully consumed
        self._task = None  # background task running generate()

    @classmethod
    async def start(cls, async_gen):
        """Create a wrapper around ``async_gen()`` and start draining it.

        ``async_gen`` is the generator *function*; it is called here without
        arguments.  Must be awaited inside a running event loop.
        """
        self = cls(async_gen())
        # Keep a reference: the loop only holds weak references to tasks, so
        # an unreferenced task may be garbage-collected before it finishes.
        self._task = asyncio.create_task(self.generate())
        return self

    async def generate(self):
        """Drain the wrapped generator, remembering only the newest value."""
        try:
            async for value in self.async_gen:
                self.last_value = value
                self.consumed_last = False
        finally:
            # Also reached when the source raises, so stream() terminates
            # instead of polling forever; the error stays on the task.
            self.exhausted = True

    async def stream(self):
        """Yield each not-yet-consumed latest value until the source ends."""
        while not self.exhausted:
            if self.consumed_last:
                await asyncio.sleep(0.01)  # avoids blocking the loop
                continue
            self.consumed_last = True
            yield self.last_value
Testing with a simple generator:
import asyncio
from random import uniform
async def numbers_stream(max_=100):
    """Yield 0, 1, 2, ... pausing 0.2 s after each value, up to ``max_``."""
    latest = -1
    while latest < max_:
        latest = latest + 1
        yield latest
        await asyncio.sleep(0.2)
async def main():
    """Consume the relaxed stream slowly, printing each value that gets through."""
    relaxed = await RelaxedGenerator.start(numbers_stream)
    async for value in relaxed.stream():
        print(value, end=", ", flush=True)
        pause = uniform(1, 2)
        await asyncio.sleep(pause)


asyncio.run(main())
Output:
0, 6, 15, 21, 28, 38, 43, 48, 57, 65, 73, 81, 89, 96,
Other things to keep in mind are whether you want to process the final value, and whether the generator you are working with can actually be exhausted in practice. Here I assume that you don't care about the final value and that the generator can be exhausted.
You could add a queue between your producer and consumer which forgets old results. Unfortunately, there is no implementation for it in the standard library, but it is almost there. If you check the implementation of asyncio.Queue you will notice the use of collections.deque, see https://github.com/python/cpython/blob/3.10/Lib/asyncio/queues.py#L49.
The collections.deque takes the optional argument maxlen to discard previously added items, see https://docs.python.org/3/library/collections.html#collections.deque.
Making use of it, enables us to create our custom queue, which only keeps the last n items.
import asyncio
import collections
class RollingQueue(asyncio.Queue):
    """Queue that never blocks producers and keeps only the newest items.

    With ``maxsize > 0`` the oldest entries are silently discarded once
    ``maxsize`` items are stored.  With ``maxsize <= 0`` (the
    ``asyncio.Queue`` default) the queue is unbounded.
    """

    def _init(self, maxsize):
        # deque(maxlen=0) would discard every item, so map the "unbounded"
        # convention of asyncio.Queue onto maxlen=None.
        self._queue = collections.deque(maxlen=maxsize if maxsize > 0 else None)

    def full(self):
        # Never report full: put()/put_nowait() must always succeed, the
        # deque evicts old items by itself.
        return False
Now you could use this queue as follows:
async def numbers(nmax):
    """Yield the integers below ``nmax``, pausing 0.3 s after each one."""
    for value in range(nmax):
        yield value
        await asyncio.sleep(0.3)
async def fill_queue(producer, queue):
    """Copy every item of ``producer`` into ``queue``, then append ``None``."""
    push = queue.put_nowait
    async for produced in producer:
        push(produced)
    push(None)  # end-of-stream marker for the consumer
async def main():
    """Consume the newest number once per second until the producer finishes."""
    queue1 = RollingQueue(1)
    numgen = numbers(10)
    # create_task() needs a running loop; keep the reference so the task is
    # not garbage-collected while it is still filling the queue.
    task = asyncio.create_task(fill_queue(numgen, queue1))
    while True:
        res = await queue1.get()
        if res is None:
            break
        print(res)
        await asyncio.sleep(1)
    await task


# `await` is only valid inside a coroutine, so the demo runs through asyncio.run().
asyncio.run(main())
Where I set the queue size to 1 to just keep the last item as required in your question.
Using a combination of the two provided answers, I came up with the following solution which seems to work quite well:
import asyncio
import aiostream
import collections
class RollingQueue(asyncio.Queue):
    """Queue that never blocks producers and keeps only the newest items.

    With ``maxsize > 0`` the oldest entries are silently discarded once
    ``maxsize`` items are stored.  With ``maxsize <= 0`` (the
    ``asyncio.Queue`` default) the queue is unbounded.
    """

    def _init(self, maxsize):
        # deque(maxlen=0) would discard every item, so map the "unbounded"
        # convention of asyncio.Queue onto maxlen=None.
        self._queue = collections.deque(maxlen=maxsize if maxsize > 0 else None)

    def full(self):
        # Never report full: put()/put_nowait() must always succeed, the
        # deque evicts old items by itself.
        return False
@aiostream.operator(pipable=True)
async def drop_stream(source, max_n=1):
    """Forward items from ``source``, keeping only the ``max_n`` newest unread ones.

    A background task drains ``source`` into a RollingQueue while the
    consumer pulls from that queue at its own pace.  The stream ends once the
    source is exhausted and the queue has been emptied; an error raised by
    the source is re-raised to the consumer.
    """
    queue = RollingQueue(max_n)
    wake_up = object()  # sentinel that releases a consumer blocked in get()
    exhausted = False

    async def inner_task():
        nonlocal exhausted
        try:
            async with aiostream.streamcontext(source) as streamer:
                async for item in streamer:
                    queue.put_nowait(item)
        finally:
            exhausted = True
            # Only wake the consumer when nothing is buffered, so the sentinel
            # never evicts a real item from the rolling queue.
            if queue.empty():
                queue.put_nowait(wake_up)

    task = asyncio.create_task(inner_task())
    try:
        while not (exhausted and queue.empty()):
            item = await queue.get()
            if item is wake_up:
                break
            yield item
        await task  # surfaces an exception raised by the source, if any
    finally:
        task.cancel()
async def main():
    """Print five values from a fast counter while consuming one per second."""
    pipeline = (
        aiostream.stream.count(interval=0.2)
        | drop_stream.pipe(1)
        | aiostream.pipe.take(5)
    )
    async with pipeline.stream() as values:
        async for x in values:
            print(x)
            await asyncio.sleep(1.0)


if __name__ == "__main__":
    asyncio.run(main())

Asyncio python - TypeError: A Future, a coroutine or an awaitable is required

One of the async functions returns an async generator object. I added loop.run_until_complete(func()), but it still throws the error "TypeError: A Future, a coroutine or an awaitable is required". Below is the code. I'm trying to fetch the records from Neo4j asynchronously. I got the async Neo4j class from GitHub. I'm new to this async concept.
from concurrent import futures
import neo4j
from neo4j import GraphDatabase, basic_auth
import time
import traceback
import asyncio
RETRY_WAITS = [0, 1, 4]  # How long to wait after each successive failure.


class Neo4j:
    """Neo4j database API.

    Blocking driver calls are pushed onto a thread pool so they can be
    awaited from coroutines running on ``loop``.
    """

    def __init__(self, config, loop):
        """Store ``config``/``loop`` and connect, retrying per RETRY_WAITS.

        ``config`` must provide the keys 'url', 'user' and 'pass'.
        """
        self.config = config
        self.loop = loop
        self.executor = futures.ThreadPoolExecutor(max_workers=30)
        for retry_wait in RETRY_WAITS:
            try:
                self.init_driver()
                break
            except Exception:
                # The last entry only marks the final attempt: its wait is
                # never slept, the error is re-raised instead.
                if retry_wait == RETRY_WAITS[-1]:
                    raise
                print('WARNING: retrying to Init DB; err:')
                traceback.print_exc()
                time.sleep(retry_wait)  # blocking wait before the next attempt

    def init_driver(self):
        """(Re)create the driver from the stored configuration."""
        auth = basic_auth(self.config['user'], self.config['pass'])
        self.driver = GraphDatabase.driver(self.config['url'], auth=auth)

    async def afetch_start(self, query):
        """Open a read session, run ``query`` and return ``(session, records)``."""
        session = self.driver.session(access_mode=neo4j.READ_ACCESS)

        def run():
            return session.run(query).records()

        return session, await self.loop.run_in_executor(self.executor, run)

    async def afetch_iterate(self, session, iter):
        """Asynchronously yield each record of ``iter`` as a dict.

        Every ``next()`` call runs in the thread pool.  ``session`` is
        accepted for interface compatibility and is not used here.
        """
        def iterate():
            # Exhaustion is reported as None rather than StopIteration, which
            # asyncio futures refuse to carry.
            return next(iter, None)

        while True:
            res = await self.loop.run_in_executor(self.executor, iterate)
            if res is None:
                return
            yield dict(res)

    async def afetch(self, query):
        """Asynchronously yield the rows of ``query``.

        Connection failures are retried per RETRY_WAITS, re-creating the
        driver between attempts.
        """
        for retry_wait in RETRY_WAITS:
            try:
                session, records = await self.afetch_start(query)
                break
            except (BrokenPipeError, neo4j.exceptions.ServiceUnavailable):
                if retry_wait == RETRY_WAITS[-1]:
                    raise
                await asyncio.sleep(retry_wait)
                await self.loop.run_in_executor(self.executor, self.init_driver)
        try:
            async for x in self.afetch_iterate(session, records):
                yield x
        finally:
            # Also runs when the consumer stops early or iteration fails, so
            # the session is not leaked.
            await self.loop.run_in_executor(self.executor, session.close)

    async def afetch_one(self, query):
        """Return the first row of ``query``, or None when there is none."""
        rows = self.afetch(query)
        try:
            async for row in rows:
                return row
            return None
        finally:
            # Close the generator explicitly so afetch() releases its session
            # now instead of at garbage collection.
            await rows.aclose()

    async def aexec(self, query):
        """Run ``query`` to completion, discarding its rows."""
        async for _ in self.afetch(query):
            pass
# Connection settings read by Neo4j.init_driver().
config={'url':"bolt://localhost",'user':'neo4j','pass':'pwd'}
loop=asyncio.get_event_loop()
n=Neo4j(config,loop)
# NOTE: afetch() contains `yield`, so calling it returns an async generator,
# not an awaitable. This is the call the question reports the TypeError for.
loop.run_until_complete(n.afetch("MATCH(p:Person)-[:Acted_in]->(mv:Movies) RETURN p.name as actors"))
loop.close()
--EDIT
I have modified the code to work properly. The query returns 218K rows and it takes 5 minutes to extract the complete list, while the same async operation in C# completes in just 2 seconds. It looks like the above code still doesn't run asynchronously.
It's very hard to tell what exactly happens without reproducible example, but I'll take a guess. You probably pass async generator object in a loop, you shouldn't do it. A way to work with async generators is to use async for. Here's example:
import asyncio
async def func():  # async generator
    """Yield 1, 2 and 3."""
    for value in (1, 2, 3):
        yield value
async def main():
    """Print every value produced by ``func()``."""
    values = func()
    async for i in values:  # get values from async generator
        print(i)


asyncio.run(main())  # can be used instead of loop.run_until_complete(main())

Can't pickle coroutine objects when ProcessPoolExecutor is used in class

I'm trying to get asyncio to work with subprocesses and concurrency limits. I've accomplished this in a functional style, but when I tried to implement the same logic in OOP style several problems showed up, mostly "Can't pickle coroutine/generator" errors. I tracked down some of these, but not all.
import asyncio
from concurrent.futures import ProcessPoolExecutor
from itertools import islice
from random import randint
class async_runner(object):
    """Run coroutine functions in worker processes, a limited number at a time."""

    def __init__(self):
        self.futures = []  # container to store current futures
        self.futures_total = []
        self.loop = asyncio.get_event_loop()  # main event_loop
        self.executor = ProcessPoolExecutor()
        self.limit = 1

    def run(self, func, *args):
        """Run ``func(*args)`` to completion on a temporary event loop."""
        temp_loop = asyncio.new_event_loop()
        try:
            coro = func(*args)
            asyncio.set_event_loop(temp_loop)
            return temp_loop.run_until_complete(coro)
        finally:
            temp_loop.close()

    def limit_futures(self, futures, limit):
        """Yield awaitables that each resolve to the result of a finished future.

        Only the first ``limit`` futures are taken from ``futures``.
        """
        self.futures_total = iter(futures)
        self.futures = list(islice(self.futures_total, limit))

        async def first_to_finish():
            while True:
                await asyncio.sleep(0)
                for f in self.futures:
                    if not f.done():  # here raised TypeError: can't pickle coroutine objects
                        continue
                    print(f.done())
                    self.futures.remove(f)
                    try:
                        #newf = next(self.futures_total)
                        #self.futures.append(newf)
                        print(f.done())
                    except StopIteration as e:
                        pass
                    return f.result()

        while self.futures:
            yield first_to_finish()

    async def run_limited(self, func, args, limit):
        """Submit ``func`` once per hard-coded input and await the results."""
        self.limit = int(limit)
        self.futures_total = (
            self.loop.run_in_executor(self.executor, self.run, func, x)
            for x in range(110000, 119990)
        )
        for ret in self.limit_futures(self.futures_total, 4):  # limitation - 4 per all processes
            await ret

    def set_execution(self, func, args, limit):
        """Run ``run_limited`` to completion on the main event loop."""
        return self.loop.run_until_complete(self.run_limited(func, args, limit))
async def asy(x):
    """Sleep for a random 1-3 seconds, then return ``x``."""
    print('enter: ', x)
    delay = randint(1, 3)
    await asyncio.sleep(delay)
    print('finishing ', x)
    return x


runner = async_runner()
# NOTE(review): `urls` is not defined anywhere in this snippet.
ret = runner.set_execution(asy, urls, 2)
print(ret)
But this works fine:
import asyncio
from concurrent.futures import ProcessPoolExecutor
from itertools import islice
import time
async def asy(x):
    """Sleep for one second, then return ``x`` unchanged."""
    print('enter: ', x)
    pause = 1
    await asyncio.sleep(pause)
    print('finishing ', x)
    return x
def run(corofn, *args):
    """Run ``corofn(*args)`` to completion on a new event loop and return its result.

    Module-level so that it can be handed to a process pool.
    """
    event_loop = asyncio.new_event_loop()
    try:
        pending = corofn(*args)
        asyncio.set_event_loop(event_loop)
        return event_loop.run_until_complete(pending)
    finally:
        event_loop.close()
def limit_futures(futures, limit):
    """Yield awaitables, each resolving to the result of the next finished future.

    At most ``limit`` futures are taken from ``futures`` at a time; whenever
    one finishes, the next one is pulled in.  If ``futures`` is lazy (for
    example a generator of ``run_in_executor`` calls), that pull is what
    starts the next job.  Each yielded awaitable must be awaited before the
    next one is requested.
    """
    futures = iter(futures)  # accept any iterable, not only iterators
    running = list(islice(futures, limit))
    print(len(running))

    async def first_to_finish():
        # Sleep until something completes instead of spinning on sleep(0).
        done, _ = await asyncio.wait(running, return_when=asyncio.FIRST_COMPLETED)
        # Keep submission order when several futures finished together.
        finished = next(f for f in running if f in done)
        running.remove(finished)
        running.extend(islice(futures, 1))  # top up with the next future, if any
        return finished.result()

    while running:
        yield first_to_finish()
async def main():
    """Run ``asy`` in worker processes, four at a time."""
    loop = asyncio.get_running_loop()
    # The context manager shuts the pool down when the work is finished.
    with ProcessPoolExecutor() as executor:
        futures = (loop.run_in_executor(executor, run, asy, x) for x in range(110000, 119990))
        # CASE balls to the wall! (no limit):
        #     await asyncio.gather(*futures)
        for ret in limit_futures(futures, 4):  # limitation - 4 per all processes
            await ret


if __name__ == '__main__':
    start = time.time()
    # CASE single:
    #     ret = [asy(x) for x in range(510000, 510040)]
    #     exit()
    asyncio.run(main())
    print("Elapsed time: {:.3f} sec".format(time.time() - start))
I can't understand why the multiprocessing module tries to pickle things only when objects are in use, but not in the other scenario.
The reason multiprocessing needs to pickle the async_runner instance is that self.run is a bound method, meaning that it "contains" the async_runner instance.
Since you're not actually using self in the run method, you can just make it a staticmethod to avoid this problem.

Builtin way to transform asynchronous iterable to synchronous iterable list

Python 3.6 now has asynchronous iterables. Is there a built-in way to transform an asynchronous iterable into a synchronous iterable?
I currently have this helper function, but it feels very un-pythonic. Is there a better way to do this?
async def aiter_to_list(aiter):
    """Collect every item of the asynchronous iterable ``aiter`` into a list."""
    collected = []
    async for element in aiter:
        collected.append(element)
    return collected
From Python 3.6 you can use Asynchronous Comprehensions
import asyncio


async def async_iter():
    """Asynchronously yield 0 through 4."""
    for i in range(5):
        yield i


async def main():
    """Collect ``async_iter()`` into a plain list."""
    # async comprehension: only valid inside a coroutine
    sync_list = [item async for item in async_iter()]
    return sync_list


print(asyncio.run(main()))  # [0, 1, 2, 3, 4]
You can use aiostream.stream.list:
from aiostream import stream
async def agen():
    """Yield 1, 2 and 3."""
    for value in (1, 2, 3):
        yield value


async def main():
    """Collect ``agen()`` with aiostream and print the resulting list."""
    print(await stream.list(agen()))  # prints [1, 2, 3]
More operators and examples in the documentation.
Your "asynchronous to synchronous" helper is itself asynchronous; not a big change at all. In general: no, you cannot make something asynchronous synchronous. An asynchronous value will be supplied "sometime later"; you cannot make that into "now" because the value doesn't exist "now" and you will have to wait for it, asynchronously.
These functions allow you to convert from / to iterable <==> async iterable, not just simple lists.
Basic imports
import asyncio
import threading
import time
DONE = object()  # sentinel marking the end of a stream; compared by identity
TIMEOUT = 0.001  # polling interval in seconds


class _Failure:
    """Carries an exception from the producer thread to the consumer."""

    def __init__(self, error):
        self.error = error


def to_sync_iterable(async_iterable, maxsize=0):
    """Return a plain generator over the items of ``async_iterable``.

    The async iterable is driven on its own event loop in a daemon thread and
    its items are handed over through a thread-safe queue.  ``maxsize`` bounds
    that queue (0 means unbounded); when it is full the producer waits.  An
    exception raised by the async iterable is re-raised to the consumer.
    """
    import queue as thread_queue  # local import: asyncio.Queue is not thread-safe

    def sync_iterable():
        buffer = thread_queue.Queue(maxsize=maxsize)
        # A dedicated loop, so this also works if the calling thread's loop
        # is busy or closed.
        loop = asyncio.new_event_loop()
        worker = threading.Thread(
            target=_run_coroutine, args=(loop, async_iterable, buffer), daemon=True
        )
        worker.start()
        while True:
            x = buffer.get()  # blocks until the producer delivers something
            if x is DONE:
                break
            if isinstance(x, _Failure):
                worker.join()
                raise x.error
            yield x
        worker.join()

    return sync_iterable()


def _run_coroutine(loop, async_iterable, queue):
    """Thread target: drain ``async_iterable`` into ``queue`` on ``loop``."""
    try:
        loop.run_until_complete(_consume_async_iterable(async_iterable, queue))
    finally:
        loop.close()


async def _consume_async_iterable(async_iterable, queue):
    """Put every item on the thread-safe ``queue``, then DONE or a _Failure."""
    try:
        async for x in async_iterable:
            # Blocking put: this loop serves only this iterable, so stalling
            # it while the queue is full is the intended back-pressure.
            queue.put(x)
    except Exception as error:
        queue.put(_Failure(error))
    else:
        queue.put(DONE)
You can use it like this:
async def slow_async_generator():
    """Yield 0, 1, 2, 3 with a one-second pause between consecutive values."""
    for value in range(4):
        if value:
            await asyncio.sleep(1)
        yield value


for item in to_sync_iterable(slow_async_generator()):
    print(item)
The function to_async_iterable will convert any sync iterable to an async iterable:
def to_async_iterable(iterable, maxsize=0):
    """Return an async generator over the items of the (blocking) ``iterable``.

    The iterable is consumed in the loop's default executor, so a slow or
    blocking iterable does not stall the event loop.  ``maxsize`` bounds the
    hand-over queue (0 means unbounded).  An exception raised by the iterable
    is re-raised once the items produced before it have been delivered.
    """
    async def async_iterable():
        queue = asyncio.Queue(maxsize=maxsize)
        loop = asyncio.get_running_loop()
        task = loop.run_in_executor(None, _consume_iterable, loop, iterable, queue)
        while True:
            x = await queue.get()
            if x is DONE:
                break
            yield x
        await task  # propagates an exception raised by the iterable

    return async_iterable()


def _consume_iterable(loop, iterable, queue):
    """Worker-thread side: feed ``iterable`` into the asyncio ``queue``, then DONE."""
    def put(item):
        # Run queue.put() on the loop thread and wait for it. Checking full()
        # here and scheduling put_nowait() could overfill a bounded queue.
        asyncio.run_coroutine_threadsafe(queue.put(item), loop).result()

    try:
        for x in iterable:
            put(x)
    finally:
        # Always terminate the stream, even when the iterable raised.
        put(DONE)
This one is especially useful for asyncio programs because it won't block the event loop even if the sync iterable blocks. You can use it like this:
def slow_sync_generator():
    """Yield 0, 1, 2, 3, blocking for one second between consecutive values."""
    yield 0
    time.sleep(1)
    yield 1
    time.sleep(1)
    yield 2
    time.sleep(1)
    yield 3


async def async_task():
    """Print each value of the blocking generator from a coroutine."""
    async for x in to_async_iterable(slow_sync_generator()):
        print(x)


# asyncio.run() replaces get_event_loop().run_until_complete(), which is
# deprecated when no event loop is running.
asyncio.run(async_task())

Categories

Resources