Read in parallel and write sequentially? - python

I have the following code, which reads and writes for each id sequentially.
async def main():
    id = 0
    while id < 1000:
        data = await read_async(id)
        await data.write_async(f'{id}.csv')
        id += 1
read_async() takes several minutes and write_async() takes less than one minute to run. Now I want to:
- Run read_async(id) in parallel. However, at most 3 calls can run in parallel because of a memory limitation.
- Run write_async sequentially, i.e., write_async(n+1) cannot run before write_async(n).

You could use a queue and a fixed number of tasks for reading, and write from the main task. The main task can use an event to find out that new data is available from the readers, and a shared dict to get it from them. For example (untested):
import asyncio

async def reader(q, id_to_data, data_ready):
    while True:
        id = await q.get()
        data = await read_async(id)
        id_to_data[id] = data
        data_ready.set()

async def main():
    q = asyncio.Queue()
    for id in range(1000):
        await q.put(id)
    id_to_data = {}
    data_ready = asyncio.Event()
    readers = [asyncio.create_task(reader(q, id_to_data, data_ready))
               for _ in range(3)]
    for id in range(1000):
        while True:
            # wait for the current ID to appear before writing
            if id in id_to_data:
                data = id_to_data.pop(id)
                await data.write_async(f'{id}.csv')
                break  # move on to the next ID
            else:
                # wait for new data and try again
                await data_ready.wait()
                data_ready.clear()
    for r in readers:
        r.cancel()
Using a separate queue for results instead of the event wouldn't work, because results would arrive in completion order rather than in id order. A priority queue would fix that, but it would still immediately return the lowest id currently available, whereas the writer needs the next id in order to process all ids in order.
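For comparison, the same ordering constraint can also be met without the event/dict bookkeeping: cap the concurrent reads with an asyncio.Semaphore and await the read tasks in id order, so the writes stay sequential. This is an untested sketch that reuses the question's read_async/write_async, not part of the answer above.

import asyncio

async def bounded_read(sem, id):
    async with sem:                      # at most 3 reads hold the semaphore at once
        return await read_async(id)      # read_async as defined in the question

async def main():
    sem = asyncio.Semaphore(3)
    tasks = [asyncio.create_task(bounded_read(sem, id)) for id in range(1000)]
    for id, task in enumerate(tasks):
        data = await task                      # wait for this id's read specifically
        await data.write_async(f'{id}.csv')    # writes therefore happen in id order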

Execute asyncio task as soon as possible

I would like to know how I can execute the task group 'tg_fast' immediately, and afterwards continue the task group 'tg_main' (or start it again if continuing is not possible).
Using asyncio.gather() gives the same result as the TaskGroup.
import asyncio

async def another_coro(i):
    print(i)
    await asyncio.sleep(.1)

async def coro(i):
    if i == 1:
        async with asyncio.TaskGroup() as tg_fast:
            tg_fast.create_task(another_coro(i * 10))
            tg_fast.create_task(another_coro(i * 100))
        # await asyncio.gather(*[another_coro(i * 10), another_coro(i * 100)])
    else:
        print(i)
        await asyncio.sleep(.1)

async def main():
    async with asyncio.TaskGroup() as tg_main:
        for i in range(0, 3):
            tg_main.create_task(coro(i))

asyncio.run(main(), debug=True)
The printing order is 0 => 2 => 10 => 100.
But I would like a way to get 0 => 10 => 100 => ... OR 0 => 100 => 10 => ...
The goal is to start 10 and 100 after 0 and before 2.
Thank you very much for your help.
Edit:
I want to run both 'another_coro' calls simultaneously, not wait for one and then start the second.
And I don't need them to finish: it is enough to run both up to the await asyncio.sleep(.1) and then continue with the event loop.
For this to work, you have to deliberately add another mechanism to prioritize tasks, and it has to be applied explicitly to the other tasks in the "non-priority" group.
It could be done, for example, by subclassing asyncio.TaskGroup and adding a priority mechanism to the __aexit__ method, so that when a group is about to exit (and all its tasks are about to be awaited), it checks a central registry of all instances of your specialized TaskGroup and, if a group with greater priority is running, waits until that one exits.
That would work without needing to change any code in your tasks - just how you instantiate your groups - but on the other hand, it would not prevent the non-prioritized tasks from stepping through and running parts of their code at any other point where they await (or otherwise yield to the asyncio loop).
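A rough, untested sketch of that first idea could look like the following (the class name and registry are my own invention, not a standard API), with the same caveat: it only delays the lower-priority group's exit, it does not stop tasks that are already scheduled.

import asyncio

class PriorityTaskGroup(asyncio.TaskGroup):
    _active = []                      # central registry of live groups

    def __init__(self, priority=10):
        super().__init__()
        self.priority = priority      # lower number == higher priority

    async def __aenter__(self):
        PriorityTaskGroup._active.append(self)
        return await super().__aenter__()

    async def __aexit__(self, et, exc, tb):
        try:
            # keep this group open while any more important group is still running
            while any(g.priority < self.priority
                      for g in PriorityTaskGroup._active if g is not self):
                await asyncio.sleep(0.01)
            return await super().__aexit__(et, exc, tb)
        finally:
            PriorityTaskGroup._active.remove(self)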
Another approach, for which I wrote the snippet below, requires you to change the lower-priority tasks at certain points and call a specialized sleep in them (it can be called with a "0" delay, just like asyncio.sleep). The points where these calls are placed become explicit points where your tasks yield priority to the tasks that should run first.
This allows greater flexibility and is more explicit, and it is guaranteed to pause your lower-priority work - the downside being that you have to explicitly add these "checkpoints" to your code.
Note that this works because the modified .sleep method simply does not return while any other higher-priority task is running.
import asyncio
from heapq import heappush, heapify

granularity = 0.01

class PriorityGroups:
    def __init__(self):
        self.priority_queue = []
        self.counter = 0

    async def sleep(self, delay, priority=10):
        counter = self.counter
        self.counter += 1
        # split the delay into small steps so the queue can be re-checked;
        # use at least one step so a delay of 0 still just yields to the loop
        steps = max(delay / granularity, 1)
        step_delay = delay / steps
        step = 0
        heappush(self.priority_queue, (priority, counter))
        try:
            while step < steps or (self.priority_queue and self.priority_queue[0][0] < priority):
                await asyncio.sleep(step_delay)
                step += 1
        finally:
            self.priority_queue.remove((priority, counter))
            heapify(self.priority_queue)
priority_group = PriorityGroups()

async def another_coro(i, priority=1):
    await priority_group.sleep(.1, priority)
    print(i)

async def coro(i):
    if i == 1:
        async with asyncio.TaskGroup() as tg_fast:
            tg_fast.create_task(another_coro(i * 10))
            tg_fast.create_task(another_coro(i * 100))
        # await asyncio.gather(*[another_coro(i * 10), another_coro(i * 100)])
    else:
        await priority_group.sleep(.1)
        print(i)

async def main():
    async with asyncio.TaskGroup() as tg_main:
        for i in range(0, 3):
            tg_main.create_task(coro(i))

asyncio.run(main(), debug=True)
So - just place calls to the same instance's PriorityGroups.sleep, optionally passing a lower number for the priority (== higher priority) for things that should run first. Having the control placed in an instance of PriorityGroups even means you can have parallel nested groups of tasks and priority tasks, and one group won't interfere with the others.
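As a hypothetical illustration of that last point (the names here are invented, building on the PriorityGroups class above), two independent instances arbitrate priority only among their own callers:

metrics_group = PriorityGroups()
io_group = PriorityGroups()

async def urgent_metrics():
    await metrics_group.sleep(.1, priority=1)   # runs ahead of other metrics work

async def background_metrics():
    await metrics_group.sleep(.1)               # yields to urgent_metrics only

async def io_task():
    await io_group.sleep(.1)                    # unaffected by the metrics group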

Python Async Limit Concurrent coroutines per second

My use case is the following:
I'm using Python 3.8.
I have an async function analyse_doc that is a wrapper for an HTTP request to a web service.
I have approx. 1000 docs to analyse as fast as possible. The service allows 15 transactions per second (and not 15 concurrent requests at any second). So in the first second I can send 15, in the next second I can send 15 again, and so on. If I try to hit the service more than 15 times per second I get a 429 error, or sometimes a 503/504 error (server is busy…).
My question is: is it possible to implement something in Python that effectively sends 15 requests per second asynchronously, then waits 1 second, then does it again until the queue is empty? Also, some tasks might fail, and those failing tasks might need a rerun at some point.
So far my code is the following (unbounded parallelism… not even a semaphore), but it handles retry.
tasks = {asyncio.create_task(analyse_async(doc)): doc for doc in documents}
pending = set(tasks)
# Handle retry
while pending:
    # backoff in case of 429
    await asyncio.sleep(1)
    # concurrent call, return_when all completed
    finished, pending = await asyncio.wait(
        pending, return_when=asyncio.ALL_COMPLETED
    )
    # check if a task has an exception and register it for a new run
    for task in finished:
        doc = tasks[task]
        if task.exception():
            new_task = asyncio.create_task(analyse_async(doc))
            tasks[new_task] = doc
            pending.add(new_task)
You could try adding another sleep task into the mix to drive the request generation. Something like this:
import asyncio
import random

ONE_SECOND = 1
CONCURRENT_TASK_LIMIT = 2
TASKS_TO_CREATE = 10

loop = asyncio.new_event_loop()

work_todo = []
work_in_progress = []

# just creates arbitrary work to do
def create_tasks():
    for i in range(TASKS_TO_CREATE):
        work_todo.append(worker_task(i))
    # muddle this up to see how drain works
    random.shuffle(work_todo)

# represents the actual work
async def worker_task(index):
    print(f"i am worker {index} and i am starting")
    await asyncio.sleep(index)
    print(f"i am worker {index} and i am done")

# gets the next 'concurrent' workload segment (if there is one)
def get_next_tasks():
    todo = []
    i = 0
    while i < CONCURRENT_TASK_LIMIT and len(work_todo) > 0:
        todo.append(work_todo.pop())
        i += 1
    return todo

# drains down any outstanding tasks and closes the loop
async def are_we_done_yet():
    print('draining')
    await asyncio.gather(*work_in_progress)
    loop.stop()
    # closes out the program
    print('done')

# puts work on the queue every tick (1 second)
async def work():
    next_tasks = get_next_tasks()
    if len(next_tasks) > 0:
        print(f'found {len(next_tasks)} tasks to do')
        for task in next_tasks:
            # schedules the work, puts it in the in-progress pile
            work_in_progress.append(loop.create_task(task))
        # this is the 'tick' or speed work gets scheduled on
        await asyncio.sleep(ONE_SECOND)
        # every 'tick' we add this task onto the loop again unless there isn't any more to do...
        loop.create_task(work())
    else:
        # ... if there isn't any to do we just enter drain mode
        await are_we_done_yet()

# bootstrap the process
create_tasks()
loop.create_task(work())
loop.run_forever()
Updated version with a simulated exception
import asyncio
import random

ONE_SECOND = 1
CONCURRENT_TASK_LIMIT = 2
TASKS_TO_CREATE = 10

loop = asyncio.new_event_loop()

work_todo = []
work_in_progress = []

# just creates arbitrary work to do
def create_tasks():
    for i in range(TASKS_TO_CREATE):
        work_todo.append(worker_task(i))
    # muddle this up to see how drain works
    random.shuffle(work_todo)

# represents the actual work
async def worker_task(index):
    try:
        print(f"i am worker {index} and i am starting")
        await asyncio.sleep(index)
        if index % 9 == 0:
            print('simulating error')
            raise NotImplementedError("some error happened")
        print(f"i am worker {index} and i am done")
    except Exception:
        # put this work back on the pile (fudge the index so it doesn't throw this time)
        work_todo.append(worker_task(index + 1))

# gets the next 'concurrent' workload segment (if there is one)
def get_next_tasks():
    todo = []
    i = 0
    while i < CONCURRENT_TASK_LIMIT and len(work_todo) > 0:
        todo.append(work_todo.pop())
        i += 1
    return todo

# drains down any outstanding tasks and closes the loop
async def are_we_done_yet():
    print('draining')
    await asyncio.gather(*work_in_progress)
    if len(work_todo) > 0:
        loop.create_task(work())
        print('found some retries')
    else:
        loop.stop()
        # closes out the program
        print('done')

# puts work on the queue every tick (1 second)
async def work():
    next_tasks = get_next_tasks()
    if len(next_tasks) > 0:
        print(f'found {len(next_tasks)} tasks to do')
        for task in next_tasks:
            # schedules the work, puts it in the in-progress pile
            work_in_progress.append(loop.create_task(task))
        # this is the 'tick' or speed work gets scheduled on
        await asyncio.sleep(ONE_SECOND)
        # every 'tick' we add this task onto the loop again unless there isn't any more to do...
        loop.create_task(work())
    else:
        # ... if there isn't any to do we just enter drain mode
        await are_we_done_yet()

# bootstrap the process
create_tasks()
loop.create_task(work())
loop.run_forever()
This just simulates something going wrong and re-queues the failed task. If the error happens after the main work method has finished, it won't get re-queued, so the are-we-done-yet method would need to check for and rerun any failed tasks. That isn't particularly optimal, as it waits to drain before checking everything else, but it gives you an idea of an implementation.
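If you don't need batches to overlap, a simpler (and stricter) pure-asyncio variant is to fire at most 15 requests with asyncio.gather(return_exceptions=True), requeue the failures, and sleep out the rest of the one-second window before the next batch. This is an untested sketch; analyse_async stands in for the question's wrapper around the HTTP call, and there is no cap on retries.

import asyncio

RATE = 15        # transactions allowed per second
ONE_SECOND = 1

async def run_all(documents):
    pending_docs = list(documents)
    results = {}
    while pending_docs:
        batch, pending_docs = pending_docs[:RATE], pending_docs[RATE:]
        started = asyncio.get_running_loop().time()
        outcomes = await asyncio.gather(
            *(analyse_async(doc) for doc in batch), return_exceptions=True
        )
        for doc, outcome in zip(batch, outcomes):
            if isinstance(outcome, Exception):
                pending_docs.append(doc)          # simple retry: requeue the failure
            else:
                results[doc] = outcome
        # wait out whatever is left of the one-second window before the next batch
        elapsed = asyncio.get_running_loop().time() - started
        if pending_docs and elapsed < ONE_SECOND:
            await asyncio.sleep(ONE_SECOND - elapsed)
    return results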

Process async results in another thread - app architecture (python-3.7)

I have a program that receives data (trades) from the Binance API.
This data is processed and visualized in a web app with Dash and Plotly.
In order to get the best performance and the smallest delay, my program has 3 threads:
Thread 1 - Binance API - get requests - Trades
if __name__ == "__main__":
    try:
        loop = asyncio.get_event_loop()
        binance_thread = threading.Thread(target=start_thread_1)
        ...

def start_thread_1():
    loop.run_until_complete(main(api_key, secret_key))

async def main(api_key, secret_key):
    client = await AsyncClient.create(api_key, secret_key)
    await trades_listener(client)

async def trades_listener(client):
    bm = BinanceSocketManager(client)
    symbol = 'BTCUSDT'
    async with bm.trade_socket(symbol=symbol) as stream:
        while True:
            msg = await stream.recv()
            event_type = msg['e']
            ...
            trade = Trade(event_type, ...)
            # <-- save trade SOMEWHERE so the other thread can process it? save to: process_trades_list
Thread 2 - Web App - Displays Trades and Processed Trades Data
web_thread = threading.Thread(target=webserver.run_server)
...
not worth mentioning
Thread 3 - Process Data - Process Trades (calculate RSI, filter big trades, etc)
if __name__ == "__main__":
    try:
        loop = asyncio.get_event_loop()
        binance_thread = threading.Thread(target=start_thread_1)
        web_thread = threading.Thread(target=webserver.run_server)
        process_thread = threading.Thread(target=start_thread_3)
        ...
        .start()
        .sleep()
        etc.
        .join()

def start_thread_3():
    process_trades()

def process_trades():
    global process_trades_list
    while True:
        while len(process_trades_list) > 0:
            trade = process_trades_list[0]
            process_trades_list.pop(0)
            # ...do calculation etc.
How can I safely hand the data over from thread_1 (the async thread) to thread_3?
I tried to put the trades into a list called process_trades_list and then loop over all trades while len(process_trades_list) > 0.
In the loop I pop() processed trades from the list - but this somehow seems to break the program without throwing errors.
What's the best way to get this done?
It is possible that the async stream gets spammed by new incoming trades, and I want to minimize the load.
Here you want a queue.Queue instead of a list. Your last code snippet would look something like this:
import queue

if __name__ == "__main__":
    try:
        q = queue.Queue()
        binance_thread = threading.Thread(target=start_thread_1,
                                          args=(q,))
        web_thread = threading.Thread(target=webserver.run_server)
        process_thread = threading.Thread(target=process_trades,
                                          args=(q,), daemon=True)
        ...
        .start()
        .sleep()
        etc.
        .join()

def process_trades(q):
    while True:
        trade = q.get()
        # ...do calculation etc.
I eliminated the call to get_event_loop since you didn't use the returned object. I eliminated the start_thread_3 function, which is not necessary.
I made thread-3 a daemon, so it will not keep your application open if everything else is finished.
The queue should be created once, in the main thread, and passed explicitly to each thread that needs to access it. That eliminates the need for a global variable.
The process_trades function becomes much simpler. The q.get() call blocks until an object is available, and it also pops the object off the queue.
Next you must also modify thread-1 to put objects onto the queue, like this:
def start_thread_1(q):
    asyncio.run(main(api_key, secret_key, q))

async def main(api_key, secret_key, q):
    client = await AsyncClient.create(api_key, secret_key)
    await trades_listener(client, q)

async def trades_listener(client, q):
    bm = BinanceSocketManager(client)
    symbol = 'BTCUSDT'
    async with bm.trade_socket(symbol=symbol) as stream:
        while True:
            msg = await stream.recv()
            event_type = msg['e']
            ...
            trade = Trade(event_type, ...)
            q.put(trade)
The q.put function is how you safely put a trade object into the queue, which will then result in activity in thread-3.
I modified the start_thread_1 function: this is a good place to start the event loop mechanism for this thread.
You ask about avoiding spam attacks on your program. Queues have methods that allow you to limit their size, so you can throw away trades if the queue becomes full.
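For example, a bounded queue.Queue combined with put_nowait lets the stream listener drop (or log) trades instead of blocking when the consumer falls behind; the maxsize and helper name here are just an illustration:

import queue

q = queue.Queue(maxsize=1000)     # cap how many unprocessed trades can pile up

def offer(q, trade):
    try:
        q.put_nowait(trade)       # never blocks the stream listener
    except queue.Full:
        pass                      # consumer is behind; drop or log the trade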
I don't understand what you are trying to do with the if __name__ == '__main__' logic in thread-1. The program can have only one entry point, and only one module named '__main__'. It looks to me like that has to be thread-3.

Synchronize asyncio Queue

I am planning to have an asyncio Queue based producer-consumer implementation for processing realtime data, where sending out the data in the correct time order is vital. So here is the code snippet:
async def produce(Q, n_jobs):
    for i in range(n_jobs):
        print(f"Producing :{i}")
        await Q.put(i)

async def consume(Q):
    while True:
        n = await Q.get()
        print(f"Consumed :{n}")
        x = do_sometask_and_return_the_result(n)
        print(f"Finished :{n} and Result: {x}")

async def main(loop):
    Q = asyncio.Queue(loop=loop, maxsize=3)
    await asyncio.wait([produce(Q, 10), consume(Q), consume(Q), consume(Q)])
    print("Done")
Here the producer produces data and puts it into the asyncio Queue. I have multiple consumers to consume and process the data. Looking at the output, the order is maintained when printing "Consumed :{n}" (1, 2, 3, 4... and so on), which is completely fine. But since the function do_sometask_and_return_the_result(n) takes a variable amount of time to return the result, the order is not maintained in the next print of n, "Finished :{n}" (2, 1, 4, 3, 5, ...).
Is there any way to synchronize this data, as I need to maintain the order in which the results are printed? I want to see sequential prints of n (1, 2, 3, 4, ...) even after do_sometask_and_return_the_result(n).
You could use a priority queue system (using the Python heapq library) to reorder your jobs after they are complete. Something like this:
from heapq import heappush, heappop

# add these variables at class/global scope
priority_queue = []
current_job_id = 1
job_id_dict = {}

async def produce(Q, n_jobs):
    ...  # same as above

async def consume(Q):
    while True:
        n = await Q.get()
        print(f"Consumed :{n}")
        x = do_sometask_and_return_the_result(n)
        await process_result(n, x)

async def process_result(n, x):
    global current_job_id
    heappush(priority_queue, n)
    job_id_dict[n] = x
    # flush results in job-id order as soon as the next expected id is available
    while priority_queue and current_job_id == priority_queue[0]:
        job_id = heappop(priority_queue)
        print(f"Finished :{job_id} and Result: {job_id_dict[job_id]}")
        current_job_id += 1

async def main(loop):
    Q = asyncio.Queue(loop=loop, maxsize=3)
    await asyncio.wait([produce(Q, 10), consume(Q), consume(Q), consume(Q)])
    print("Done")
For more information on the heapq module: https://docs.python.org/3/library/heapq.html

Can a python pool worker return values from initialization?

TL;DR I want to collect the accumulated data in the globals of each worker when the pool is finished processing
Description of what I think I'm missing
As I'm new to multiprocessing, I don't know all the features that exist. I am looking for a way to make a worker return the value it was initialized with (after manipulating that value a few million times). Then I hope I can collect and merge all these values at the end of the program when all the 'jobs' are done.
import multiprocessing as mp
from collections import defaultdict, Counter
from customtools import load_regexes #, . . .
import gzip
import nltk

result_dict = None
regexes = None

def create_worker():
    global result_dict
    global regexes
    result_dict = defaultdict(Counter)  # I want to return this at the end
    # these are a bunch of huge regexes
    regexes = load_regexes()
These functions represent the way I load and process data. The data is a big gzip file with articles.
def load_data(semaphore):
    with gzip.open('some10Gbfile') as f:
        for line in f:
            semaphore.acquire()
            yield str(line, 'utf-8')

def worker_job(line):
    global regexes
    global result_dict
    hits = defaultdict(Counter)
    for sent in nltk.sent_tokenize(line[3:]):
        for rename, regex in regexes.items():
            for hit in regex.finditer(sent):
                hits[rename][hit.group(0)] += 1
    # and more and more... results = _filter(_extract(hits))
    # store some data in results_dict here . . .
    return filtered_hits
class ResultEater():
    def __init__(self):
        self.wordscounts = defaultdict(Counter)
        self.filtered = Counter()

    def eat_results(self, filtered_hits):
        for k, v in filtered_hits.items():
            for i, c in v.items():
                self.wordscounts[k][i] += c
This is the main program
if __name__ == '__main__':
    pool = mp.Pool(mp.cpu_count(), initializer=create_worker)
    semaphore = mp.Semaphore(50)
    loader = load_data(semaphore)
    results = ResultEater()
    for intermediate_result in pool.imap_unordered(worker_job, loader, chunksize=10):
        results.eat_results(intermediate_result)
        semaphore.release()
    # results.eat_workers(the_leftover_workers_or_something)
    results.print()
I don't really think I understand how exactly returning the data incrementally isn't sufficient, but it seems like you need some sort of finalization function to send the data, similar to how you have an initialization function. Unfortunately, I don't think this sort of thing exists for mp.Pool, so you'll need to use a couple of mp.Process'es, send input args, and return results with a couple of mp.Queue's.
On a side note, your use of Semaphore is unnecessary, as the call to the "load_data" iterator always happens on the main process. I have moved that to another "producer" process, which puts inputs onto a queue, which is also already synchronized automatically by default. This allows you to have one process for gathering inputs and several processes for processing the inputs to outputs, and leaves the main (parent) process to gather outputs. If the "producer" generating the inputs is IO limited by file read speed (very likely), it could also be a thread rather than a process, but in this case the difference is probably minimal.
I have created an example of a custom "Pool" which allows you to return some data at the end of each worker's "life" using the aforementioned "producer-consumer" scheme. There are print statements to track what is going on in each process, but please also read the comments to track what's going on and why:
import multiprocessing as mp
from time import sleep
from queue import Empty

class ExitFlag:
    def __init__(self, exit_value=None):
        self.exit_value = exit_value  # optionally pass value along with exit flag

def producer_func(input_q, n_workers):
    for i in range(100):  # 100 lines of some long file
        print(f"put {i}")
        input_q.put(i)  # put each line of the file to the work queue
    print('stopping consumers')
    for i in range(n_workers):
        input_q.put(ExitFlag())  # send shut down signal to each of the workers
    print('producer exiting')

def consumer_func(input_q, output_q, work_func):
    counter = 0
    while True:
        try:
            item = input_q.get(timeout=.1)  # never wait forever on a "get". It's a recipe for deadlock.
        except Empty:
            continue
        print(f"get {item}")
        if isinstance(item, ExitFlag):
            break
        else:
            counter += 1
            output_q.put(work_func(item))
    output_q.put(ExitFlag(exit_value=counter))
    print('consumer exiting')

def work_func(number):
    sleep(.1)  # some heavy nltk work...
    return number * 2

if __name__ == '__main__':
    input_q = mp.Queue(maxsize=10)  # only bother limiting size if you have memory usage constraints
    output_q = mp.Queue(maxsize=10)
    n_workers = mp.cpu_count()
    # generate the input from another process (this could just as easily be a thread,
    # as it seems it will be IO limited anyway)
    producer = mp.Process(target=producer_func, args=(input_q, n_workers))
    producer.start()
    consumers = [mp.Process(target=consumer_func, args=(input_q, output_q, work_func))
                 for _ in range(n_workers)]
    for c in consumers: c.start()
    total = 0
    stop_signals = 0
    exit_values = []
    while True:
        try:
            item = output_q.get(timeout=.1)
        except Empty:
            continue
        if isinstance(item, ExitFlag):
            stop_signals += 1
            if item.exit_value is not None:
                exit_values.append(item.exit_value)  # do something with the return at the end
            if stop_signals >= n_workers:  # stop waiting for more results once all consumers finish
                break
        else:
            total += item  # do something with the incremental return values
    print(total)
    print(exit_values)

    # cleanup
    producer.join()
    print("producer joined")
    for c in consumers: c.join()
    print("consumers joined")
