Does calling thread.join() blocks the event loop in an asynchronous context? - python

I'm implementing a web API using aiohttp, deployed using gunicorn with UVloop enabled --worker-class aiohttp.GunicornUVLoopWebWorker. Therefore, my code always runs in an asynchronous context. I had the ideia of implementing parallel jobs in the handling of requests for better performance.
I'm not using asyncio because i want Parallelism, not Concurrency.
I'm aware of multiprocessing and the GIL problem in python. But joining a process also applies to my question.
Here is an example:
from aiohttp.web import middleware
#middleware
async def context_init(request, handler):
request.context = {}
request.context['threads'] = []
ret = await handler(request)
for thread in request.context['threads']:
thread.join()
return ret
Taking into account that thread.join() or process.join() blocks the current thread, this will block the event loop (As far as my knowledge goes). How can I join asynchronously? What I want can be represented figuratively as this: await thread.join() or await process.join().
Update:
Thanks to #user4815162342 I was able to write proper code for my project:
Middleware:
from aiohttp.web import middleware
from util.process_session import ProcessSession
#middleware
async def context_init(request, handler):
request.context = {}
request.context['process_session'] = ProcessSession()
request.context['processes'] = {}
ret = await handler(request)
await request.context['process_session'].wait_for_all()
return ret
Util:
import asyncio
import concurrent.futures
from functools import partial
class ProcessSession():
def __init__(self):
self.loop = asyncio.get_running_loop()
self.pool = concurrent.futures.ProcessPoolExecutor()
self.futures = []
async def wait_for_all(self):
await asyncio.wait(self.futures)
def add_process(self, f, *args, **kwargs):
ret = self.loop.run_in_executor(self.pool, partial(f, *args, **kwargs))
self.futures.append(ret)
return ret
class ProcessBase():
def __init__(self, process_session, f, *args, **kwargs):
self.future = process_session.add_process(f, *args, **kwargs)
async def wait(self):
await asyncio.wait([self.future])
return self.future.result()

Answering your question: Yes, it does block the event loop.
I found that ThreadPoolExecutor works pretty well on this situations.
from util.process_session import ProcessSession
from concurrent.futures.thread import ThreadPoolExecutor
import asyncio
from aiohttp.web import middleware
#middleware
async def context_init(request, handler):
request.context = {}
request.context['threads'] = []
ret = await handler(request)
with ThreadPoolExecutor(1) as executor:
await asyncio.get_event_loop().run_in_executor(executor,
functools.partial(join_threads, data={
'threads': request.context['threads']
}))
return ret
def join_threads(threads):
for t in threads:
t.join()

I found a solution using multiprocesses. It can be done using a Pool. The standard lib provides some "async" methods (It's not really async, it just separates the initialization of the process from the process' output): apply_async
Using a simple async wrapper, I managed to deliver what I wanted:
from multiprocessing import Pool
from async_converter import sync_to_async
import asyncio
def f(x):
i = 0
while i < 10000000 * x:
i = i + 1
print("Finished: " + str(x))
return i
async def run():
print("Started with run")
with Pool(processes=4) as pool: # start 4 worker processes
result1 = pool.apply_async(f, (10,)) # evaluate "f(10)" asynchronously
result2 = pool.apply_async(f, (2,))
res1 = await sync_to_async(result1.get)()
print(res1)
res2 = await sync_to_async(result2.get)()
print(res2)
async def dummy(output):
print(output)
async def main():
# Schedule three calls *concurrently*:
await asyncio.gather(
run(),
dummy("Nice"),
dummy("Async"),
dummy("Loop"),
dummy("Perfect"),
dummy("Dummy1"),
dummy("Dummy2"),
dummy("Dummy3"),
dummy("Dummy4"),
dummy("Dummy5"),
dummy("Dummy6"),
dummy("Dummy7"),
dummy("Dummy8"),
dummy("Dummy9"),
dummy("Dummy10"),
)
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()
outputs:
Perfect
Dummy6
Nice
Dummy1
Dummy7
Started with run
Dummy2
Dummy8
Dummy3
Dummy9
Async
Dummy4
Dummy10
Loop
Dummy5
Finished: 2
Finished: 10
100000000
20000000
Parallelism with asyncio :)

Related

Aggregation of 2 RabbitMQ messages does not work properly (messages hanging unacked)

I need to listen tasks on 2 queues, so I wrote the code below, but it has a problem. Currently it behaves like this: if the code started when 2 queues were full, it works great. But if queues were empty one of them was, the code reads messages, but does not proccess them (does not send ack, does not do the logic). But the messages became unacked, until I stop the code. I do not see any reason to be them unacked and unprocessed.
I can't understand what is wrong with the code? May be there is another way to aggregate 2 or more queues like this?
# task_processor.py
from aio_pika import IncomingMessage
class TaskProcessor:
MAX_TASKS_PER_INSTANCE = 1
def __init__(self):
self._tasks = []
def can_accept_new_task(self) -> bool:
return len(self._tasks) < self.MAX_TASKS_PER_INSTANCE
async def process(self, message: IncomingMessage):
self._tasks.append(message)
print(message.body)
await message.ack()
self._tasks.pop()
# main.py
import asyncio
from asyncio import QueueEmpty
from typing import Callable
import aio_pika
from aio_pika import RobustQueue
from dotenv import load_dotenv
load_dotenv()
from core.logger.logger import logger
from core.services.rabbitmq.task_processor.task_processor import TaskProcessor
async def get_single_task(queue: RobustQueue):
while True:
try:
msg = await queue.get(timeout=3600)
return msg
except QueueEmpty:
await asyncio.sleep(3)
except asyncio.exceptions.TimeoutError:
logger.warning('queue timeout error')
pass
except Exception as ex:
logger.error(f"{queue} errored", exc_info=ex)
async def task_aggregator(queue1: RobustQueue, queue2: RobustQueue, should_take_new_task_cb: Callable):
while True:
if should_take_new_task_cb():
queue2, queue1 = queue1, queue2
gen1 = get_single_task(queue1)
gen2 = get_single_task(queue2)
done, _ = await asyncio.wait([gen1, gen2], return_when=asyncio.FIRST_COMPLETED)
for item in done:
result = item.result()
yield result
else:
await asyncio.sleep(1)
async def tasks(queue1: RobustQueue, queue2: RobustQueue, should_take_new_task_cb: Callable):
async for task in task_aggregator(queue1, queue2, should_take_new_task_cb):
yield task
async def main():
connection = await aio_pika.connect_robust(
f"amqp://user:password#host:port/vhost?heartbeat={180}"
)
channel1 = connection.channel()
channel2 = connection.channel()
await channel1.initialize()
await channel2.initialize()
queue1 = await channel1.get_queue('queue1')
queue2 = await channel2.get_queue('queue2')
task_processor = TaskProcessor()
task_generator = tasks(queue1, queue2, task_processor.can_accept_new_task)
while True:
if task_processor.can_accept_new_task():
task = await anext(task_generator)
await task_processor.process(task)
else:
await asyncio.sleep(1)
if __name__ == '__main__':
asyncio.run(main())

Copying contexvars.Context between tasks

I have a program (an ASGI server) that is structured roughly like this:
import asyncio
import contextvars
ctxvar = contextvars.ContextVar("ctx")
async def lifepsan():
ctxvar.set("spam")
async def endpoint():
assert ctxvar.get() == "spam"
async def main():
ctx = contextvars.copy_context()
task = asyncio.create_task(lifepsan())
await task
task = asyncio.create_task(endpoint())
await task
asyncio.run(main())
Because the lifespan event / endpoints are run in tasks, they can't share contextvars.
This is by design: tasks copy the context before executing, so lifespan can't set ctxvar properly.
This is the desired behavior for endpoints, but I would like for execution to appear like this (from a user's perspective):
async def lifespan():
ctxvar.set("spam")
await endpoint()
In other words, the endpoints are executed in their own independent context, but within the context of the lifespan.
I tried to get this to work by using contextlib.copy_context():
import asyncio
import contextvars
ctxvar = contextvars.ContextVar("ctx")
async def lifepsan():
ctxvar.set("spam")
print("set")
async def endpoint():
print("get")
assert ctxvar.get() == "spam"
async def main():
ctx = contextvars.copy_context()
task = ctx.run(asyncio.create_task, lifepsan())
await task
endpoint_ctx = ctx.copy()
task = endpoint_ctx.run(asyncio.create_task, endpoint())
await task
asyncio.run(main())
As well as:
async def main():
ctx = contextvars.copy_context()
task = asyncio.create_task(ctx.run(lifespan))
await task
endpoint_ctx = ctx.copy()
task = asyncio.create_task(endpoint_ctx.run(endpoint))
await task
However it seems that contextvars.Context.run does not work this way (I guess the context is bound when the coroutine is created but not when it is executed).
Is there a simple way to achieve the desired behavior, without restructuring how the tasks are being created or such?
Here's what I came up with, inspired by PEP 555 and asgiref:
from contextvars import Context, ContextVar, copy_context
from typing import Any
def _set_cvar(cvar: ContextVar, val: Any):
cvar.set(val)
class CaptureContext:
def __init__(self) -> None:
self.context = Context()
def __enter__(self) -> "CaptureContext":
self._outer = copy_context()
return self
def sync(self):
final = copy_context()
for cvar in final:
if cvar not in self._outer:
# new contextvar set
self.context.run(_set_cvar, cvar, final.get(cvar))
else:
final_val = final.get(cvar)
if self._outer.get(cvar) != final_val:
# value changed
self.context.run(_set_cvar, cvar, final_val)
def __exit__(self, *args: Any):
self.sync()
def restore_context(context: Context) -> None:
"""Restore `context` to the current Context"""
for cvar in context.keys():
try:
cvar.set(context.get(cvar))
except LookupError:
cvar.set(context.get(cvar))
Usage:
import asyncio
import contextvars
ctxvar = contextvars.ContextVar("ctx")
async def lifepsan(cap: CaptureContext):
with cap:
ctxvar.set("spam")
async def endpoint():
assert ctxvar.get() == "spam"
async def main():
cap = CaptureContext()
await asyncio.create_task(lifepsan(cap))
restore_context(cap.context)
task = asyncio.create_task(endpoint())
await task
asyncio.run(main())
The sync() method is provided in case the task is long-running and you need to capture the context before it finishes. A somewhat contrived example:
import asyncio
import contextvars
ctxvar = contextvars.ContextVar("ctx")
async def lifepsan(cap: CaptureContext, event: asyncio.Event):
with cap:
ctxvar.set("spam")
cap.sync()
event.set()
await asyncio.sleep(float("inf"))
async def endpoint():
assert ctxvar.get() == "spam"
async def main():
cap = CaptureContext()
event = asyncio.Event()
asyncio.create_task(lifepsan(cap, event))
await event.wait()
restore_context(cap.context)
task = asyncio.create_task(endpoint())
await task
asyncio.run(main())
I think it would still be much nicer if contextvars.Context.run worked with coroutines.
This feature will be supported in Python 3.11: https://github.com/python/cpython/issues/91150
You will be able to write:
async def main():
ctx = contextvars.copy_context()
task = asyncio.create_task(lifepsan(), context=ctx)
await task
endpoint_ctx = ctx.copy()
task = asyncio.create_task(endpoint(), context=endpoint_ctx)
await task
In the meantime, in current Python versions you will need a backport of this feature. I can't think of a good one, but a bad one is here.

Elapsed time of coroutines/futures using asyncio.gather()

I have a list of async queries which I'm gathering with asyncio.gather() & waiting with asyncio.run_until_complete(). Something like:
queries = [
async_query_a(),
async_query_b()
]
loop = asyncio.get_event_loop()
tasks = asyncio.gather(*queries)
results = loop.run_until_complete(tasks)
I would like to know the "waiting time" of each of the queries. Something like a #log_performance wrapper which logs the elapsed time of the future/coroutine completion.
Here is an example implementation of timecoro timeing a coroutine function.
import asyncio
import functools
import logging
import random
import time
def timecoro(corofn):
#functools.wraps(corofn)
async def wrapper(*args, **kwargs):
start = time.time()
try:
result = await corofn(*args, **kwargs)
except Exception:
finish = time.time() - start
logging.info('%s failed in %.2f', corofn, finish)
raise
else:
finish = time.time() - start
logging.info('%s succeeded in %.2f', corofn, finish)
return result
return wrapper
#timecoro
async def async_query_a():
await asyncio.sleep(random.randint(0, 4))
#timecoro
async def async_query_b():
await asyncio.sleep(random.randint(0, 4))
raise RuntimeError
async def main():
queries = [
async_query_a(),
async_query_b(),
]
await asyncio.gather(*queries)
if __name__ == '__main__':
logging.basicConfig(level='INFO')
asyncio.run(main())

Can't pickle coroutine objects when ProcessPoolExecutor is used in class

I'm trying to get asyncio work with subprocesses and limitations. I've accomplish this in functional way, but when I tried to implement same logic in opp style several problems showd up. Mostly Can't pickle coroutine/generator errors. I tracked some of theese, but not all
import asyncio
from concurrent.futures import ProcessPoolExecutor
from itertools import islice
from random import randint
class async_runner(object):
def __init__(self):
self.futures = [] # container to store current futures
self.futures_total = []
self.loop = asyncio.get_event_loop() # main event_loop
self.executor = ProcessPoolExecutor()
self.limit = 1
def run(self, func, *args):
temp_loop = asyncio.new_event_loop()
try:
coro = func(*args)
asyncio.set_event_loop(temp_loop)
ret = temp_loop.run_until_complete(coro)
return ret
finally:
temp_loop.close()
def limit_futures(self, futures, limit):
self.futures_total = iter(futures)
self.futures = [future for future in islice(self.futures_total,0,limit)]
async def first_to_finish():
while True:
await asyncio.sleep(0)
for f in self.futures:
if f.done(): # here raised TypeError: can't pickle coroutine objects
print(f.done())
self.futures.remove(f)
try:
#newf = next(self.futures_total)
#self.futures.append(newf)
print(f.done())
except StopIteration as e:
pass
return f.result()
while len(self.futures) > 0:
yield first_to_finish()
async def run_limited(self, func, args, limit):
self.limit = int(limit)
self.futures_total = (self.loop.run_in_executor(self.executor, self.run, func, x) for x in range(110000,119990))
for ret in self.limit_futures(self.futures_total, 4): # limitation - 4 per all processes
await ret
def set_execution(self, func, args, limit):
ret = self.loop.run_until_complete(self.run_limited(func, args, limit))
return ret
async def asy(x):
print('enter: ', x)
await asyncio.sleep(randint(1,3))
print('finishing ', x)
return x
runner = async_runner()
ret = runner.set_execution(asy,urls,2)
print(ret)
But this works fine:
import asyncio
from concurrent.futures import ProcessPoolExecutor
from itertools import islice
import time
async def asy(x):
print('enter: ', x)
await asyncio.sleep(1)
print('finishing ', x)
return x
def run(corofn, *args):
loop = asyncio.new_event_loop()
try:
coro = corofn(*args)
asyncio.set_event_loop(loop)
ret = loop.run_until_complete(coro)
#print(ret)
return ret
finally:
loop.close()
def limit_futures(futures, limit):
futures_sl = [
c for c in islice(futures, 0, limit)
]
print(len(futures_sl))
async def first_to_finish(futures):
while True:
await asyncio.sleep(0)
for f in futures_sl:
if f.done():
futures_sl.remove(f)
try:
newf = next(futures)
futures_sl.append(newf)
except StopIteration as e:
pass
return f.result()
while len(futures_sl) > 0:
yield first_to_finish(futures)
async def main():
loop = asyncio.get_event_loop()
executor = ProcessPoolExecutor()
futures = (loop.run_in_executor(executor, run, asy, x) for x in range(110000,119990))
'''
CASE balls to the wall!
await asyncio.gather(*futures)
'''
for ret in limit_futures(futures, 4): # limitation - 4 per all processes
await ret
if __name__ == '__main__':
start = time.time()
'''
# CASE single
ret = [asy(x) for x in range(510000,510040)]
exit()
'''
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
print("Elapsed time: {:.3f} sec".format(time.time() - start))
I've cant understand why multiprocessing module trying to pickle anything only when objects are in use, but not in any scenario
The reason why multiprocessing needs to pickle the async_runner instance is because self.runner is a bound method, meaning that it "contains" the async_runner instance.
Since you're not actually using self in the run method, you can just make it a staticmethod to avoid this problem.

python asyncio add_done_callback with async def

I have 2 functions: The first one, def_a, is an asynchronous function and the second one is def_b which is a regular function and called with the result of def_a as a callback with the add_done_callback function.
My code looks like this:
import asyncio
def def_b(result):
next_number = result.result()
# some work on the next_number
print(next_number + 1)
async def def_a(number):
await some_async_work(number)
return number + 1
loop = asyncio.get_event_loop()
task = asyncio.ensure_future(def_a(1))
task.add_done_callback(def_b)
response = loop.run_until_complete(task)
loop.close()
And it's work perfectly.
The problem began when also the second function, def_b, became asynchronous. Now it looks like this:
async def def_b(result):
next_number = result.result()
# some asynchronous work on the next_number
print(next_number + 1)
But now I can not provide it to the add_done_callback function, because it's not a regular function.
My question is- Is it possible and how can I provide def_b to the add_done_callback function if def_b is asynchronous?
add_done_callback is considered a "low level" interface. When working with coroutines, you can chain them in many ways, for example:
import asyncio
async def my_callback(result):
print("my_callback got:", result)
return "My return value is ignored"
async def coro(number):
await asyncio.sleep(number)
return number + 1
async def add_success_callback(fut, callback):
result = await fut
await callback(result)
return result
loop = asyncio.get_event_loop()
task = asyncio.ensure_future(coro(1))
task = add_success_callback(task, my_callback)
response = loop.run_until_complete(task)
print("response:", response)
loop.close()
Keep in mind add_done_callback will still call the callback if your future raises an exception (but calling result.result() will raise it).
This only works for one future job, if you have multiple async jobs, they will blocks each other, a better way is using asyncio.as_completed() to iterate future list:
import asyncio
async def __after_done_callback(future_result):
# await for something...
pass
async def __future_job(number):
await some_async_work(number)
return number + 1
loop = asyncio.get_event_loop()
tasks = [asyncio.ensure_future(__future_job(x)) for x in range(100)] # create 100 future jobs
for f in asyncio.as_completed(tasks):
result = await f
await __after_done_callback(result)
loop.close()
You can try the aiodag library. It's a very lightweight wrapper around asyncio that abstracts away some of the async plumbing that you usually have to think about. From this example you won't be able to tell that things are running asynchronously since it's just 1 task that depends on another, but it is all running async.
import asyncio
from aiodag import task
#task
async def def_b(result):
# some asynchronous work on the next_number
print(result + 1)
#task
async def def_a(number):
await asyncio.sleep(number)
return number + 1
async def main():
a = def_a(1)
b = def_b(a) # this makes task b depend on task a
return await b
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)
response = loop.run_until_complete(main())

Categories

Resources