Timeout handling while using run_in_executor and asyncio - python

I'm using asyncio to run a piece of blocking code like this:
result = await loop.run_in_executor(None, long_running_function)
My question is: Can I impose a timeout for the execution of long_running_function?
Basically I don't want long_running_function to last more than 2 seconds and I can't do proper timeout handling within it because that function comes from a third-party library.

A warning about cancelling long running functions:
Although wrapping the Future returned by loop.run_in_executor with an asyncio.wait_for call will allow the event loop to stop waiting for long_running_function after some x seconds, it won't necessarily stop the underlying long_running_function. This is one of the shortcomings of concurrent.futures and to the best of my knowledge there is no simple way to just cancel a concurrent.futures.Future.
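If the work really does have to be stopped, one workaround (not from the answers here, just a minimal sketch; it assumes the callable and its arguments are picklable and that you can live without its return value) is to run it in a separate process and terminate that process when the timeout expires:

import asyncio
import multiprocessing
import time

def long_running_function():
    # stands in for the third-party blocking call
    time.sleep(10)

async def run_with_hard_timeout(timeout):
    proc = multiprocessing.Process(target=long_running_function)
    proc.start()
    loop = asyncio.get_running_loop()
    try:
        # proc.join() blocks, so push it to the default thread pool executor
        await asyncio.wait_for(loop.run_in_executor(None, proc.join), timeout)
    except asyncio.TimeoutError:
        proc.terminate()  # unlike a thread, a process can actually be killed
        proc.join()
        raise

if __name__ == '__main__':
    try:
        asyncio.run(run_with_hard_timeout(2))
    except asyncio.TimeoutError:
        print('timed out, worker process terminated')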

You could use asyncio.wait_for:
future = loop.run_in_executor(None, long_running_function)
result = await asyncio.wait_for(future, timeout)
(The loop argument seen in older examples was deprecated in Python 3.8 and removed in 3.10.)
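For completeness, a short usage sketch (long_running_function is the placeholder name from the question); the timeout surfaces as asyncio.TimeoutError:

import asyncio

async def main():
    loop = asyncio.get_running_loop()
    future = loop.run_in_executor(None, long_running_function)
    try:
        result = await asyncio.wait_for(future, timeout=2)
    except asyncio.TimeoutError:
        # we stop waiting here, but (see the warning above) the executor
        # thread keeps running long_running_function to completion
        result = None
    print(result)

asyncio.run(main())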

Although this does not use run_in_executor, here is a workaround I use to wrap a blocking function as a coroutine with timeout handling:
import asyncio
import threading
import time
import ctypes

def terminate_thread(t: threading.Thread, exc_type=SystemExit):
    # Raise exc_type asynchronously inside thread t. The exception is only
    # delivered when the thread next executes Python bytecode, so a thread
    # blocked inside a C call is not interrupted until that call returns.
    if not t.is_alive():
        return
    try:
        tid = next(tid for tid, tobj in threading._active.items() if tobj is t)
    except StopIteration:
        raise ValueError("tid not found")
    if ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exc_type)) != 1:
        raise SystemError("PyThreadState_SetAsyncExc failed")

class AsyncResEvent(asyncio.Event):
    # An asyncio.Event that also carries a result (or exception) set from another thread.
    def __init__(self):
        super().__init__()
        self.res = None
        self.is_exc = False
        self._loop = asyncio.get_event_loop()

    def set(self, data=None) -> None:
        self.res = data
        self.is_exc = False
        self._loop.call_soon_threadsafe(super().set)

    def set_exception(self, exc) -> None:
        self.res = exc
        self.is_exc = True
        self._loop.call_soon_threadsafe(super().set)

    async def wait(self, timeout: float | None = None):
        await asyncio.wait_for(super().wait(), timeout)
        if self.is_exc:
            raise self.res
        else:
            return self.res

async def sub_thread_async(func, *args, _timeout: float | None = None, **kwargs):
    # Run func(*args, **kwargs) in a plain thread and await its result, optionally with a timeout.
    res = AsyncResEvent()

    def f():
        try:
            res.set(func(*args, **kwargs))
        except Exception as e:
            res.set_exception(e)
        except SystemExit:
            res.set_exception(TimeoutError)

    (t := threading.Thread(target=f)).start()
    try:
        return await res.wait(_timeout)
    except TimeoutError:
        raise TimeoutError
    finally:
        if not res.is_set():
            terminate_thread(t)

_lock = threading.Lock()

def test(n):
    _tid = threading.get_ident()
    for i in range(n):
        with _lock:
            print(f'print from thread {_tid} ({i})')
        time.sleep(1)
    return n

async def main():
    res_normal = await asyncio.gather(*(sub_thread_async(test, 5) for _ in range(2)))
    print(res_normal)  # [5, 5]
    res_normal_2 = await asyncio.gather(*(sub_thread_async(test, 2, _timeout=3) for _ in range(2)))
    print(res_normal_2)  # [2, 2]
    res_should_not_get = await asyncio.gather(*(sub_thread_async(test, 5, _timeout=3) for _ in range(2)))
    print(res_should_not_get)  # TimeoutError

if __name__ == '__main__':
    asyncio.new_event_loop().run_until_complete(main())

Related

Python: Read a huge file line by line and send it to multiprocessing or a thread

I have been trying to get my code to work for many days and I am desperate.
I've scoured the internet, but I still can't find a solution.
I have a 9 GB text file encoded in "latin-1" -> 737 022 387 lines, each containing a string.
I would like to read each line and send it in an HTTP PUT request that waits for a response, and return TRUE or FALSE depending on whether the response is 200 or 400.
The PUT request takes about 1 to 3 seconds, so to speed up the processing I would like to use either threads or multiprocessing.
To start, I simulate my PUT request with a sleep of 3 seconds, and even that I can't get to work.
This code splits my string into characters, and I don't know why:
from multiprocessing import Pool
from time import sleep

def process_line(line):
    sleep(3)
    print(line)
    return True

if __name__ == "__main__":
    pool = Pool(2)
    peon = open(r'D:\txtFile', encoding="latin-1")
    for line in peon:
        res = pool.map(process_line, line)
        print(res)
This one gives the error TypeError: process_line() takes 1 positional argument but 17 were given:
import multiprocessing
from multiprocessing import Pool
from time import sleep

def process_line(line):
    sleep(3)
    print(line)
    return True

if __name__ == "__main__":
    pool = Pool(2)
    with open(r"d:\txtFile", encoding="latin-1") as file:
        res = pool.apply(process_line, file.readline())
        print(res)
And this one crashes the computer:
from multiprocessing import Pool
from time import sleep

def process_line(line):
    sleep(3)
    print(line)
    return True

if __name__ == "__main__":
    pool = Pool(2)
    peon = open(r'D:\txtFile', encoding="latin-1")
    for line in peon:
        res = pool.map(process_line, peon)
        print(res)
The problem seems unrealistic, though: shooting 737,022,387 requests! Calculate how many months that would take from a single computer!
Still, a better way to do this task is to read the file line by line in a separate thread and insert the lines into a queue, and then process the queue with multiple processes.
Solution 1:
from multiprocessing import Queue, Process
from threading import Thread
from time import sleep

urls_queue = Queue()
max_process = 4

def read_urls():
    with open('urls_file.txt', 'r') as f:
        for url in f:
            urls_queue.put(url.strip())
            print('put url: {}'.format(url.strip()))

    # put DONE to tell send_request_processor to exit
    for i in range(max_process):
        urls_queue.put("DONE")

def send_request(url):
    print('send request: {}'.format(url))
    sleep(1)
    print('recv response: {}'.format(url))

def send_request_processor():
    print('start send request processor')
    while True:
        url = urls_queue.get()
        if url == "DONE":
            break
        else:
            send_request(url)

def main():
    file_reader_thread = Thread(target=read_urls)
    file_reader_thread.start()

    procs = []
    for i in range(max_process):
        p = Process(target=send_request_processor)
        procs.append(p)
        p.start()

    for p in procs:
        p.join()

    print('all done')

    # wait for all tasks in the queue
    file_reader_thread.join()

if __name__ == '__main__':
    main()
Demo: https://onlinegdb.com/Elfo5bGFz
Solution 2:
You can use tornado asynchronous networking library
from tornado import gen
from tornado.ioloop import IOLoop
from tornado.queues import Queue

q = Queue(maxsize=2)

async def consumer():
    async for item in q:
        try:
            print('Doing work on %s' % item)
            await gen.sleep(0.01)
        finally:
            q.task_done()

async def producer():
    with open('urls_file.txt', 'r') as f:
        for url in f:
            await q.put(url)
            print('Put %s' % url)

async def main():
    # Start consumer without waiting (since it never finishes).
    IOLoop.current().spawn_callback(consumer)
    await producer()  # Wait for producer to put all tasks.
    await q.join()    # Wait for consumer to finish all tasks.
    print('Done')

# producer and consumer can run in parallel
IOLoop.current().run_sync(main)
Using multiprocessing.pool.Pool.imap is a step in the right direction, but the problem is that with so much input you will be feeding the input task queue faster than the processing pool can take tasks off the queue and return results. Consequently, the task queue will keep growing and you will exhaust memory. What is needed is a way to "throttle" imap so that it blocks once the task queue holds N tasks. A reasonable default for N is twice the pool size, which ensures that when a pool process completes a task there is no delay before it finds another one to work on. Hence we create the classes BoundedQueueProcessPool (multiprocessing) and BoundedQueueThreadPool (multithreading):
import multiprocessing.pool
import multiprocessing
import threading

class ImapResult():
    def __init__(self, semaphore, result):
        self._semaphore = semaphore
        self.it = result.__iter__()

    def __iter__(self):
        return self

    def __next__(self):
        try:
            elem = self.it.__next__()
            self._semaphore.release()
            return elem
        except StopIteration:
            raise
        except:
            self._semaphore.release()
            raise

class BoundedQueuePool:
    def __init__(self, limit, semaphore):
        self._limit = limit
        self._semaphore = semaphore

    def release(self, result, callback=None):
        self._semaphore.release()
        if callback:
            callback(result)

    def apply_async(self, func, args=(), kwds={}, callback=None, error_callback=None):
        self._semaphore.acquire()
        callback_fn = self.release if callback is None else lambda result: self.release(result, callback=callback)
        error_callback_fn = self.release if error_callback is None else lambda result: self.release(result, callback=error_callback)
        return super().apply_async(func, args, kwds, callback=callback_fn, error_callback=error_callback_fn)

    def imap(self, func, iterable, chunksize=1):
        def new_iterable(iterable):
            for elem in iterable:
                self._semaphore.acquire()
                yield elem

        if chunksize > self._limit:
            raise ValueError(f'chunksize argument exceeds {self._limit}')
        result = super().imap(func, new_iterable(iterable), chunksize)
        return ImapResult(self._semaphore, result)

    def imap_unordered(self, func, iterable, chunksize=1):
        def new_iterable(iterable):
            for elem in iterable:
                self._semaphore.acquire()
                yield elem

        if chunksize > self._limit:
            raise ValueError(f'chunksize argument exceeds {self._limit}')
        result = super().imap_unordered(func, new_iterable(iterable), chunksize)
        return ImapResult(self._semaphore, result)

class BoundedQueueProcessPool(BoundedQueuePool, multiprocessing.pool.Pool):
    def __init__(self, *args, max_waiting_tasks=None, **kwargs):
        multiprocessing.pool.Pool.__init__(self, *args, **kwargs)
        if max_waiting_tasks is None:
            max_waiting_tasks = self._processes
        elif max_waiting_tasks < 0:
            raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
        limit = self._processes + max_waiting_tasks
        BoundedQueuePool.__init__(self, limit, multiprocessing.BoundedSemaphore(limit))

class BoundedQueueThreadPool(BoundedQueuePool, multiprocessing.pool.ThreadPool):
    def __init__(self, *args, max_waiting_tasks=None, **kwargs):
        multiprocessing.pool.ThreadPool.__init__(self, *args, **kwargs)
        if max_waiting_tasks is None:
            max_waiting_tasks = self._processes
        elif max_waiting_tasks < 0:
            raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
        limit = self._processes + max_waiting_tasks
        BoundedQueuePool.__init__(self, limit, threading.BoundedSemaphore(limit))

#######################################################################

from time import sleep

def process_line(line):
    sleep(3)
    # the lines already have line end characters:
    print(line, end='')
    return True

if __name__ == "__main__":
    pool = BoundedQueueProcessPool(2)
    with open("test.txt") as file:
        for res in pool.imap(process_line, file):
            #print(res)
            pass
    pool.close()
    pool.join()
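Since the real workload here is an HTTP PUT (I/O-bound rather than CPU-bound), the thread-based variant may be the better fit; a sketch reusing the classes above, where put_line is a hypothetical stand-in for the actual request:

from time import sleep

def put_line(line):
    sleep(1)  # stands in for the PUT request returning True/False
    return True

if __name__ == "__main__":
    pool = BoundedQueueThreadPool(8)
    with open("test.txt", encoding="latin-1") as file:
        for res in pool.imap(put_line, file):
            pass
    pool.close()
    pool.join()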

How to have a non-blocking indefinite loop that can be interrupted in Python?

I am trying to create a process that runs indefinitely until being shut down (either from the command prompt or from another notebook cell) by changing a class member. My attempt looks like the following:
def async_run(*awaitables: Awaitable, timeout: float = None):
    loop = asyncio.get_event_loop()
    if not awaitables:
        if loop.is_running():
            return
        loop.run_forever()
        result = None
        if sys.version_info >= (3, 7):
            all_tasks = asyncio.all_tasks(loop)  # type: ignore
        else:
            all_tasks = asyncio.Task.all_tasks()  # type: ignore
        if all_tasks:
            # cancel pending tasks
            f = asyncio.gather(*all_tasks)
            f.cancel()
            try:
                loop.run_until_complete(f)
            except asyncio.CancelledError:
                pass
    else:
        if len(awaitables) == 1:
            future = awaitables[0]
        else:
            future = asyncio.gather(*awaitables)
        if timeout:
            future = asyncio.wait_for(future, timeout)
        task = asyncio.ensure_future(future)

        def onError(_):
            task.cancel()

        global_error_event.connect(onError)
        try:
            result = loop.run_until_complete(task)
        except asyncio.CancelledError as e:
            raise global_error_event.value() or e
        finally:
            global_error_event.disconnect(onError)
    return result

class Platform:
    def __init__(self):
        self.is_active: bool = True

    async def kickstart(self):
        while self.is_active:
            await async_run(asyncio.sleep(0))
However it seems that the kickstart() function would simply finish running, even though my intention is for it to run indefinitely. If I remove the async, it does run indefinitely but then it becomes blocking. I'd appreciate some help on this, thanks in advance.
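A minimal sketch (not from the question) of the usual asyncio pattern for this: schedule the loop as a background task so it never blocks the caller, and interrupt it by flipping the flag or cancelling the task. It assumes an event loop is already running, as in a notebook, and that get_event_loop() returns it:

import asyncio

class Platform:
    def __init__(self):
        self.is_active: bool = True
        self._task = None

    async def kickstart(self):
        while self.is_active:
            # one unit of work per iteration, then yield to the event loop
            await asyncio.sleep(1)

    def start(self):
        # schedule kickstart() on the running loop without blocking the caller
        self._task = asyncio.get_event_loop().create_task(self.kickstart())

    def stop(self):
        self.is_active = False  # or: self._task.cancel()

# in one notebook cell:   p = Platform(); p.start()
# in a later cell:        p.stop()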

asyncio multiple infinite loops using selenium

I'm trying to translate a thread-based script into an asyncio-based one. I want to run multiple concurrent mycheck() coroutines that are infinite loops. I'm having difficulties because inside the mycheck function that will become a coroutine, I call both a plain function (some_http_request_function) and object methods that use the selenium library to automate a browser. I don't know which functions should be defined as async and which parts of the code should be awaited. Here is the thread-based code I want to speed up:
class foo:
    def __init__(self):
        self.drivers = []
        for j in range(2):
            self.drivers.append(webdriver.Chrome("chromedriver.exe"))
        self.list1 = ["foo", "bar"]
        self.urls = ["url1", "url2"]

    def method1(self, index):
        return self.list1[index]

    def method2(self, index):
        if index == 1:
            return self.method3(index)
        else:
            return self.method4(index)

    def method3(self, index):
        if self.drivers[index] == None:
            self.drivers[index] = webdriver.Chrome("chromedriver.exe")
        self.drivers[index].get(self.urls[index])
        try:
            availability = self.drivers[index].find_element_by_xpath("xpath-example")
            if availability.text != "Some text":
                self.drivers[index].close()
                self.drivers[index] = None
                return True
            else:
                self.drivers[index].close()
                self.drivers[index] = None
                return False
        except Exception as e:
            self.drivers[index].close()
            self.drivers[index] = None
            return False

def some_http_request_function(element):
    response = requests.get(element)
    return response.json()

def mycheck(index):
    while True:
        print("started")
        availability = bot.method2(index)
        if availability:
            some_http_request_function(bot.method1(index))
        while availability:
            time.sleep(300)
            availability = bot.method2(index)

if __name__ == '__main__':
    bot = foo()
    for i in range(2):
        t = Thread(target=mycheck, args=(i,))
        t.start()
I tried making these changes:
class foo:
    def __init__(self):
        self.drivers = []
        for j in range(2):
            self.drivers.append(webdriver.Chrome("chromedriver.exe"))
        self.list1 = ["foo", "bar"]
        self.urls = ["url1", "url2"]

    def method1(self, index):
        return self.list1[index]

    async def method2(self, index):
        if index == 1:
            return await self.method3(index)
        else:
            return await self.method4(index)

    async def method3(self, index):
        if self.drivers[index] == None:
            self.drivers[index] = webdriver.Chrome("chromedriver.exe")
        self.drivers[index].get(self.urls[index])
        try:
            availability = self.drivers[index].find_element_by_xpath("xpath-example")
            if availability.text != "Some text":
                self.drivers[index].close()
                self.drivers[index] = None
                return True
            else:
                self.drivers[index].close()
                self.drivers[index] = None
                return False
        except Exception as e:
            self.drivers[index].close()
            self.drivers[index] = None
            return False

def some_http_request_function(element):
    response = requests.get(element)
    return response.json()

async def mycheck(index):
    while True:
        print("started")
        availability = await bot.method2(index)
        if availability:
            some_http_request_function(bot.method1(index))
        while availability:
            await asyncio.sleep(300)
            availability = await bot.method2(index)

async def main():
    await asyncio.gather(mycheck(0), mycheck(1))

if __name__ == '__main__':
    bot = foo()
    asyncio.run(main())
but I'm pretty sure the code is not executing the way I want, because the two "started" prints are not close to one another; I expected them to run together. What am I doing wrong?
Thanks in advance.
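One common approach (a sketch, not a definitive answer): selenium and requests calls are blocking, so marking method2/method3 as async does not by itself let the two loops interleave. Assuming Python 3.9+ for asyncio.to_thread, you can keep those methods synchronous exactly as in the thread-based version and push each blocking call to a worker thread:

import asyncio

async def mycheck(bot, index):
    while True:
        print("started")
        # run the blocking selenium call in a thread so both loops interleave
        availability = await asyncio.to_thread(bot.method2, index)
        if availability:
            await asyncio.to_thread(some_http_request_function, bot.method1(index))
        while availability:
            await asyncio.sleep(300)
            availability = await asyncio.to_thread(bot.method2, index)

async def main():
    bot = foo()  # the class from the thread-based version above
    await asyncio.gather(mycheck(bot, 0), mycheck(bot, 1))

if __name__ == '__main__':
    asyncio.run(main())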

Can't pickle coroutine objects when ProcessPoolExecutor is used in class

I'm trying to get asyncio to work with subprocesses and to limit the number of concurrent tasks. I've accomplished this in a functional way, but when I tried to implement the same logic in OOP style, several problems showed up, mostly "Can't pickle coroutine/generator" errors. I tracked down some of these, but not all:
import asyncio
from concurrent.futures import ProcessPoolExecutor
from itertools import islice
from random import randint

class async_runner(object):
    def __init__(self):
        self.futures = []  # container to store current futures
        self.futures_total = []
        self.loop = asyncio.get_event_loop()  # main event_loop
        self.executor = ProcessPoolExecutor()
        self.limit = 1

    def run(self, func, *args):
        temp_loop = asyncio.new_event_loop()
        try:
            coro = func(*args)
            asyncio.set_event_loop(temp_loop)
            ret = temp_loop.run_until_complete(coro)
            return ret
        finally:
            temp_loop.close()

    def limit_futures(self, futures, limit):
        self.futures_total = iter(futures)
        self.futures = [future for future in islice(self.futures_total, 0, limit)]

        async def first_to_finish():
            while True:
                await asyncio.sleep(0)
                for f in self.futures:
                    if f.done():  # here raised TypeError: can't pickle coroutine objects
                        print(f.done())
                        self.futures.remove(f)
                        try:
                            #newf = next(self.futures_total)
                            #self.futures.append(newf)
                            print(f.done())
                        except StopIteration as e:
                            pass
                        return f.result()

        while len(self.futures) > 0:
            yield first_to_finish()

    async def run_limited(self, func, args, limit):
        self.limit = int(limit)
        self.futures_total = (self.loop.run_in_executor(self.executor, self.run, func, x) for x in range(110000, 119990))
        for ret in self.limit_futures(self.futures_total, 4):  # limitation - 4 per all processes
            await ret

    def set_execution(self, func, args, limit):
        ret = self.loop.run_until_complete(self.run_limited(func, args, limit))
        return ret

async def asy(x):
    print('enter: ', x)
    await asyncio.sleep(randint(1, 3))
    print('finishing ', x)
    return x

runner = async_runner()
ret = runner.set_execution(asy, urls, 2)
print(ret)
But this works fine:
import asyncio
from concurrent.futures import ProcessPoolExecutor
from itertools import islice
import time

async def asy(x):
    print('enter: ', x)
    await asyncio.sleep(1)
    print('finishing ', x)
    return x

def run(corofn, *args):
    loop = asyncio.new_event_loop()
    try:
        coro = corofn(*args)
        asyncio.set_event_loop(loop)
        ret = loop.run_until_complete(coro)
        #print(ret)
        return ret
    finally:
        loop.close()

def limit_futures(futures, limit):
    futures_sl = [
        c for c in islice(futures, 0, limit)
    ]
    print(len(futures_sl))

    async def first_to_finish(futures):
        while True:
            await asyncio.sleep(0)
            for f in futures_sl:
                if f.done():
                    futures_sl.remove(f)
                    try:
                        newf = next(futures)
                        futures_sl.append(newf)
                    except StopIteration as e:
                        pass
                    return f.result()

    while len(futures_sl) > 0:
        yield first_to_finish(futures)

async def main():
    loop = asyncio.get_event_loop()
    executor = ProcessPoolExecutor()
    futures = (loop.run_in_executor(executor, run, asy, x) for x in range(110000, 119990))
    '''
    CASE balls to the wall!
    await asyncio.gather(*futures)
    '''
    for ret in limit_futures(futures, 4):  # limitation - 4 per all processes
        await ret

if __name__ == '__main__':
    start = time.time()
    '''
    # CASE single
    ret = [asy(x) for x in range(510000,510040)]
    exit()
    '''
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print("Elapsed time: {:.3f} sec".format(time.time() - start))
I can't understand why the multiprocessing module tries to pickle these objects only in the class-based version, but not in the functional one.
The reason multiprocessing needs to pickle the async_runner instance is that self.run is a bound method, meaning that it "contains" the async_runner instance.
Since you're not actually using self in the run method, you can just make it a staticmethod to avoid this problem.
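A minimal sketch of that change, leaving the rest of async_runner as in the question; the call site self.loop.run_in_executor(self.executor, self.run, func, x) can stay the same, because accessing a staticmethod through the instance yields the plain function, which pickles by reference:

import asyncio

class async_runner(object):
    # ... rest of the class unchanged ...

    @staticmethod
    def run(func, *args):
        # self is no longer needed, so nothing unpicklable is sent to the workers
        temp_loop = asyncio.new_event_loop()
        try:
            coro = func(*args)
            asyncio.set_event_loop(temp_loop)
            return temp_loop.run_until_complete(coro)
        finally:
            temp_loop.close()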

Why does asyncio.Queue.get() fail?

Python 3, asyncio.
Task A puts a number into a queue every 2 s;
task B waits on queue.get() with a 1 s timeout in a forever loop.
I don't know why task B can't get the number from the queue; if the timeout is bigger than 2 s, queue.get() is OK.
import asyncio

class Test:
    def __init__(self, loop=None):
        self._queue = asyncio.Queue(loop=loop)
        self._future = asyncio.Future(loop=loop)

    @asyncio.coroutine
    def run(self):
        asyncio.async(self._feed_queue(2))
        asyncio.async(self._recv())
        # yield from asyncio.sleep(10.0)
        # self._future.set_exception('Fail')

    @asyncio.coroutine
    def _feed_queue(self, interval):
        v = 0
        while True:
            yield from asyncio.sleep(interval)
            print("feed")
            yield from self._queue.put(v)
            v = v + 1

    @asyncio.coroutine
    def _recv(self):
        while True:
            try:
                print('wait')
                # r = yield from asyncio.wait([self._queue.get(), self._future], return_when=asyncio.FIRST_COMPLETED, timeout=1.0)
                # for x in r[0]:
                #     if x.exception():
                #         raise x.exception()
                #     print("recv", x.result())
                try:
                    r = yield from asyncio.wait_for(self._queue.get(), timeout=1.0)
                    print(r)
                except:
                    continue
                # print("recv", r)
                # in done set
            except BaseException as e:
                print(e)
                break
        print("quit")

loop = asyncio.get_event_loop()
t = Test(loop=loop)
asyncio.async(t.run())
loop.run_forever()
output:
wait
wait
feed
wait
wait
feed
wait
wait
feed
wait
The first call to self._queue.get() may return the value, but the value is not used and a new call is made.
Here it is adjusted to reuse that call (and to use asyncio.ensure_future; otherwise AssertionError("yield from wasn't used with future",) is raised):
@asyncio.coroutine
def _recv(self):
    future = None
    while True:
        try:
            if future is None:
                future = asyncio.ensure_future(self._queue.get())
            r = yield from asyncio.wait(
                [future, self._future],
                return_when=asyncio.FIRST_COMPLETED,
                timeout=1.0
            )
            for x in r[0]:
                if x.exception():
                    raise x.exception()
                if x is future:
                    future = None
                print("recv", x.result())
        except BaseException as e:
            print(e)
            break
    print("quit")
I used asyncio.ensure_future instead of asyncio.async because the latter is deprecated.
The original code is OK on Python 3.5.1, where asyncio.Queue was re-written.
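For reference, a sketch of the same producer/consumer in modern async/await syntax (Python 3.7+); with the rewritten asyncio.Queue, a 1-second wait_for timeout simply retries the get without losing items:

import asyncio

async def feed_queue(queue, interval):
    v = 0
    while True:
        await asyncio.sleep(interval)
        print("feed")
        await queue.put(v)
        v += 1

async def recv(queue):
    while True:
        try:
            r = await asyncio.wait_for(queue.get(), timeout=1.0)
        except asyncio.TimeoutError:
            continue  # nothing arrived within 1 s, try again
        print("recv", r)

async def main():
    queue = asyncio.Queue()
    await asyncio.gather(feed_queue(queue, 2), recv(queue))

asyncio.run(main())  # runs until interrupted (Ctrl+C)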
