How to use Python Concurrent Futures with decorators

I'm using a decorator for the thread pool executor:
from functools import wraps
from .bounded_pool_executor import BoundedThreadPoolExecutor

_DEFAULT_POOL = BoundedThreadPoolExecutor(max_workers=5)

def threadpool(f, executor=None):
    @wraps(f)
    def wrap(*args, **kwargs):
        return (executor or _DEFAULT_POOL).submit(f, *args, **kwargs)
    return wrap
where the BoundedThreadPoolExecutor is defined here
When I try to use concurrent futures in a function decorated with @threadpool and then wait for all the futures with as_completed, like
def get_results_as_completed(futures):
    # finished, pending = wait(futures, return_when=ALL_COMPLETED)
    futures_results = as_completed(futures)
    for f in futures_results:
        try:
            yield f.result()
        except:
            pass
for some worker defined like
from thread_support import threadpool
from time import sleep
from random import randint

@threadpool
def my_worker():
    res = {}
    # do something
    sleep(randint(1, 5))
    return res
if __name__ == "__main__":
    futures_results = get_results_as_completed(futures)
    for r in futures_results:
        results.append(r)
I cannot get the futures to complete despite the .result() call, which results in an infinite loop over futures_results. Why?
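For comparison, here is a minimal, self-contained sketch of the same decorator pattern built on the standard-library ThreadPoolExecutor instead of BoundedThreadPoolExecutor; the worker and the call site below are my own assumptions, not the original code. Run this way, the futures returned by the decorated function do complete under as_completed:

from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import wraps
from random import randint
from time import sleep

_DEFAULT_POOL = ThreadPoolExecutor(max_workers=5)

def threadpool(f, executor=None):
    @wraps(f)
    def wrap(*args, **kwargs):
        # submit() returns a Future immediately; the caller collects them
        return (executor or _DEFAULT_POOL).submit(f, *args, **kwargs)
    return wrap

@threadpool
def my_worker(i):
    sleep(randint(1, 3))
    return {"job": i}

if __name__ == "__main__":
    futures = [my_worker(i) for i in range(10)]
    results = []
    for f in as_completed(futures):
        results.append(f.result())
    print(results)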

Related

Python Read huge file line per line and send it to multiprocessing or thread

I have been trying to get my code to work for many days and I am desperate.
I've scoured the internet, but I still can't find a solution.
I have a 9 GB text file encoded in "latin-1" -> 737,022,387 lines, each line containing a string.
I would like to read each line and send it in an HTTP PUT request that waits for a response, and return TRUE or FALSE depending on whether the response is 200 or 400.
The PUT request takes about 1 to 3 seconds, so to speed up the processing time I would like to use either threads or multiprocessing.
To start, I simulate my PUT request with a sleep of 3 seconds, and even that I can't get to work.
This code splits my string into characters, I don't know why...
from multiprocessing import Pool
from time import sleep

def process_line(line):
    sleep(3)
    print(line)
    return True

if __name__ == "__main__":
    pool = Pool(2)
    peon = open(r'D:\txtFile', encoding="latin-1")
    for line in peon:
        res = pool.map(process_line, line)
        print(res)
This gives the error: TypeError: process_line() takes 1 positional argument but 17 were given
import multiprocessing
from multiprocessing import Pool
from time import sleep

def process_line(line):
    sleep(3)
    print(line)
    return True

if __name__ == "__main__":
    pool = Pool(2)
    with open(r"d:\txtFile", encoding="latin-1") as file:
        res = pool.apply(process_line, file.readline())
        print(res)
And this crashes the computer:
from multiprocessing import Pool
from time import sleep

def process_line(line):
    sleep(3)
    print(line)
    return True

if __name__ == "__main__":
    pool = Pool(2)
    peon = open(r'D:\txtFile', encoding="latin-1")
    for line in peon:
        res = pool.map(process_line, peon)
        print(res)
The problem seems unrealistic, though: firing 737,022,387 requests! Calculate how many months that would take from a single computer!
Still, a better way to do this task is to read the file line by line in a separate thread and insert the lines into a queue, and then process the queue with multiple processes.
Solution 1:

from multiprocessing import Queue, Process
from threading import Thread
from time import sleep

urls_queue = Queue()
max_process = 4

def read_urls():
    with open('urls_file.txt', 'r') as f:
        for url in f:
            urls_queue.put(url.strip())
            print('put url: {}'.format(url.strip()))
    # put DONE to tell send_request_processor to exit
    for i in range(max_process):
        urls_queue.put("DONE")

def send_request(url):
    print('send request: {}'.format(url))
    sleep(1)
    print('recv response: {}'.format(url))

def send_request_processor():
    print('start send request processor')
    while True:
        url = urls_queue.get()
        if url == "DONE":
            break
        else:
            send_request(url)

def main():
    file_reader_thread = Thread(target=read_urls)
    file_reader_thread.start()
    procs = []
    for i in range(max_process):
        p = Process(target=send_request_processor)
        procs.append(p)
        p.start()
    for p in procs:
        p.join()
    print('all done')
    # wait for all tasks in the queue
    file_reader_thread.join()

if __name__ == '__main__':
    main()
Demo: https://onlinegdb.com/Elfo5bGFz
Solution 2:

You can use the Tornado asynchronous networking library:

from tornado import gen
from tornado.ioloop import IOLoop
from tornado.queues import Queue

q = Queue(maxsize=2)

async def consumer():
    async for item in q:
        try:
            print('Doing work on %s' % item)
            await gen.sleep(0.01)
        finally:
            q.task_done()

async def producer():
    with open('urls_file.txt', 'r') as f:
        for url in f:
            await q.put(url)
            print('Put %s' % url)

async def main():
    # Start consumer without waiting (since it never finishes).
    IOLoop.current().spawn_callback(consumer)
    await producer()  # Wait for producer to put all tasks.
    await q.join()    # Wait for consumer to finish all tasks.
    print('Done')

# producer and consumer can run in parallel
IOLoop.current().run_sync(main)
Using the method multiprocessing.pool.imap is a step in the right direction, but the problem is that with so much input you will be feeding the input task queue faster than the processing pool can take tasks off the queue and return results. Consequently, the task queue will continue to grow and you will exhaust memory. What is needed is a way to "throttle" imap so that it blocks once the task queue has N tasks on it. I think a reasonable default value for N is twice the pool size, to ensure that when a pool process completes work on a task there is no delay before it finds another task to work on. Hence we create the classes BoundedQueueProcessPool (multiprocessing) and BoundedQueueThreadPool (multithreading):
import multiprocessing.pool
import multiprocessing
import threading

class ImapResult():
    def __init__(self, semaphore, result):
        self._semaphore = semaphore
        self.it = result.__iter__()

    def __iter__(self):
        return self

    def __next__(self):
        try:
            elem = self.it.__next__()
            self._semaphore.release()
            return elem
        except StopIteration:
            raise
        except:
            self._semaphore.release()
            raise

class BoundedQueuePool:
    def __init__(self, limit, semaphore):
        self._limit = limit
        self._semaphore = semaphore

    def release(self, result, callback=None):
        self._semaphore.release()
        if callback:
            callback(result)

    def apply_async(self, func, args=(), kwds={}, callback=None, error_callback=None):
        self._semaphore.acquire()
        callback_fn = self.release if callback is None else lambda result: self.release(result, callback=callback)
        error_callback_fn = self.release if error_callback is None else lambda result: self.release(result, callback=error_callback)
        return super().apply_async(func, args, kwds, callback=callback_fn, error_callback=error_callback_fn)

    def imap(self, func, iterable, chunksize=1):
        def new_iterable(iterable):
            for elem in iterable:
                self._semaphore.acquire()
                yield elem
        if chunksize > self._limit:
            raise ValueError(f'chunksize argument exceeds {self._limit}')
        result = super().imap(func, new_iterable(iterable), chunksize)
        return ImapResult(self._semaphore, result)

    def imap_unordered(self, func, iterable, chunksize=1):
        def new_iterable(iterable):
            for elem in iterable:
                self._semaphore.acquire()
                yield elem
        if chunksize > self._limit:
            raise ValueError(f'chunksize argument exceeds {self._limit}')
        result = super().imap_unordered(func, new_iterable(iterable), chunksize)
        return ImapResult(self._semaphore, result)

class BoundedQueueProcessPool(BoundedQueuePool, multiprocessing.pool.Pool):
    def __init__(self, *args, max_waiting_tasks=None, **kwargs):
        multiprocessing.pool.Pool.__init__(self, *args, **kwargs)
        if max_waiting_tasks is None:
            max_waiting_tasks = self._processes
        elif max_waiting_tasks < 0:
            raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
        limit = self._processes + max_waiting_tasks
        BoundedQueuePool.__init__(self, limit, multiprocessing.BoundedSemaphore(limit))

class BoundedQueueThreadPool(BoundedQueuePool, multiprocessing.pool.ThreadPool):
    def __init__(self, *args, max_waiting_tasks=None, **kwargs):
        multiprocessing.pool.ThreadPool.__init__(self, *args, **kwargs)
        if max_waiting_tasks is None:
            max_waiting_tasks = self._processes
        elif max_waiting_tasks < 0:
            raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
        limit = self._processes + max_waiting_tasks
        BoundedQueuePool.__init__(self, limit, threading.BoundedSemaphore(limit))

#######################################################################

from time import sleep

def process_line(line):
    sleep(3)
    # the lines already have line end characters:
    print(line, end='')
    return True

if __name__ == "__main__":
    pool = BoundedQueueProcessPool(2)
    with open("test.txt") as file:
        for res in pool.imap(process_line, file):
            # print(res)
            pass
    pool.close()
    pool.join()
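As a usage note, the real workload from the question (an HTTP PUT per line, returning True on 200 and False otherwise) could be plugged into process_line. This is a hedged sketch assuming the third-party requests package and a hypothetical endpoint URL; it is not code from the original answer:

import requests

API_URL = "http://example.com/api"  # hypothetical endpoint

def process_line(line):
    # send the line in a PUT request and map the status code to a boolean
    resp = requests.put(API_URL, data=line.strip().encode("latin-1"), timeout=30)
    return resp.status_code == 200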

Download timeout function in a loop

I am trying to download some data from the web (web scraping). I have a list of URLs; a few of them take too much time, and thus the loop gets stuck there. I am implementing a function that times out after a certain threshold value so that the loop can continue.
For example, if the downloading_source looks like this:
import time
import numpy as np

def downloading_source(x):
    wt = np.random.randint(1, 50)
    print("waiting time", wt)
    time.sleep(wt)
    return x**2
For the demo, I am taking random values for the time.sleep, and the timeout function looks like this:
import errno
import os
import signal
import functools

class TimeoutError(Exception):
    pass

def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
    def decorator(func):
        def _handle_timeout(signum, frame):
            raise TimeoutError(error_message)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(seconds)
            try:
                result = func(*args, **kwargs)
            finally:
                signal.alarm(0)
            return result
        return wrapper
    return decorator
The download loop:
# Timeout after 5 seconds
@timeout(5)
def long_running_function2(x):
    return downloading_source(x)

all_urls = list(range(1, 100))
downloaded_data = []
for url in all_urls:
    try:
        print(url)
        down_data = long_running_function2(url)
    except Exception as e:
        pass
    downloaded_data.append(down_data)
It's working; I was wondering, is there any better way to do this?
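Not from the original post, but one commonly used alternative is to run each download in a worker thread and rely on the timeout parameter of Future.result(), which avoids SIGALRM and therefore also works on Windows and outside the main thread. A minimal sketch, assuming the same downloading_source function defined above:

from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout

all_urls = list(range(1, 100))
downloaded_data = []

with ThreadPoolExecutor(max_workers=8) as executor:
    for url in all_urls:
        future = executor.submit(downloading_source, url)
        try:
            # wait at most 5 seconds for this download
            downloaded_data.append(future.result(timeout=5))
        except FutureTimeout:
            # unlike the SIGALRM version, the slow call is not interrupted;
            # its worker thread keeps running in the background until it finishes
            pass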

Does calling thread.join() block the event loop in an asynchronous context?

I'm implementing a web API using aiohttp, deployed using gunicorn with uvloop enabled (--worker-class aiohttp.GunicornUVLoopWebWorker). Therefore, my code always runs in an asynchronous context. I had the idea of implementing parallel jobs in the handling of requests for better performance.
I'm not using asyncio because I want parallelism, not concurrency.
I'm aware of multiprocessing and the GIL problem in Python, but joining a process also applies to my question.
Here is an example:
from aiohttp.web import middleware

@middleware
async def context_init(request, handler):
    request.context = {}
    request.context['threads'] = []
    ret = await handler(request)
    for thread in request.context['threads']:
        thread.join()
    return ret
Taking into account that thread.join() or process.join() blocks the current thread, this will block the event loop (as far as my knowledge goes). How can I join asynchronously? What I want can be represented figuratively as await thread.join() or await process.join().
Update:
Thanks to @user4815162342 I was able to write proper code for my project:
Middleware:

from aiohttp.web import middleware
from util.process_session import ProcessSession

@middleware
async def context_init(request, handler):
    request.context = {}
    request.context['process_session'] = ProcessSession()
    request.context['processes'] = {}
    ret = await handler(request)
    await request.context['process_session'].wait_for_all()
    return ret

Util:

import asyncio
import concurrent.futures
from functools import partial

class ProcessSession():
    def __init__(self):
        self.loop = asyncio.get_running_loop()
        self.pool = concurrent.futures.ProcessPoolExecutor()
        self.futures = []

    async def wait_for_all(self):
        await asyncio.wait(self.futures)

    def add_process(self, f, *args, **kwargs):
        ret = self.loop.run_in_executor(self.pool, partial(f, *args, **kwargs))
        self.futures.append(ret)
        return ret

class ProcessBase():
    def __init__(self, process_session, f, *args, **kwargs):
        self.future = process_session.add_process(f, *args, **kwargs)

    async def wait(self):
        await asyncio.wait([self.future])
        return self.future.result()
Answering your question: yes, it does block the event loop.
I found that ThreadPoolExecutor works pretty well in these situations.
import asyncio
import functools
from concurrent.futures.thread import ThreadPoolExecutor
from aiohttp.web import middleware

@middleware
async def context_init(request, handler):
    request.context = {}
    request.context['threads'] = []
    ret = await handler(request)
    with ThreadPoolExecutor(1) as executor:
        await asyncio.get_event_loop().run_in_executor(
            executor, functools.partial(join_threads, request.context['threads']))
    return ret

def join_threads(threads):
    for t in threads:
        t.join()
I found a solution using multiple processes. It can be done using a Pool. The standard library provides some "async" methods (they are not really async; they just separate the start of the process from the retrieval of its output), such as apply_async.
Using a simple async wrapper, I managed to deliver what I wanted:
from multiprocessing import Pool
from async_converter import sync_to_async
import asyncio

def f(x):
    i = 0
    while i < 10000000 * x:
        i = i + 1
    print("Finished: " + str(x))
    return i

async def run():
    print("Started with run")
    with Pool(processes=4) as pool:  # start 4 worker processes
        result1 = pool.apply_async(f, (10,))  # evaluate "f(10)" asynchronously
        result2 = pool.apply_async(f, (2,))
        res1 = await sync_to_async(result1.get)()
        print(res1)
        res2 = await sync_to_async(result2.get)()
        print(res2)

async def dummy(output):
    print(output)

async def main():
    # Schedule the calls *concurrently*:
    await asyncio.gather(
        run(),
        dummy("Nice"),
        dummy("Async"),
        dummy("Loop"),
        dummy("Perfect"),
        dummy("Dummy1"),
        dummy("Dummy2"),
        dummy("Dummy3"),
        dummy("Dummy4"),
        dummy("Dummy5"),
        dummy("Dummy6"),
        dummy("Dummy7"),
        dummy("Dummy8"),
        dummy("Dummy9"),
        dummy("Dummy10"),
    )

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()
outputs:
Perfect
Dummy6
Nice
Dummy1
Dummy7
Started with run
Dummy2
Dummy8
Dummy3
Dummy9
Async
Dummy4
Dummy10
Loop
Dummy5
Finished: 2
Finished: 10
100000000
20000000
Parallelism with asyncio :)
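If the external async_converter helper is not available, roughly the same awaiting can be done with the standard library alone. A minimal sketch of my own (not part of the original answer), reusing the f defined above and the event loop's default thread pool:

import asyncio
from multiprocessing import Pool

async def run_stdlib():
    loop = asyncio.get_event_loop()
    with Pool(processes=4) as pool:
        result = pool.apply_async(f, (10,))
        # AsyncResult.get() blocks, so run it in the default thread pool executor
        value = await loop.run_in_executor(None, result.get)
        print(value)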

Class Decorators Singleton?

So for example, I'm making an async decorator and want to limit the number of concurrent threads:
from multiprocessing import cpu_count
from threading import Thread

class async:
    def __init__(self, function):
        self.func = function
        self.max_threads = cpu_count()
        self.current_threads = []

    def __call__(self, *args, **kwargs):
        func_thread = Thread(target=self.func, args=args, kwargs=kwargs)
        func_thread.start()
        self.current_threads.append(func_thread)
        while len(self.current_threads) > self.max_threads:
            self.current_threads = [t for t in self.current_threads if t.isAlive()]
from time import sleep

@async
def printA():
    sleep(1)
    print "A"

@async
def printB():
    sleep(1)
    print "B"
Is this going to limit the total number of concurrent threads? I.e., if I had 8 cores, would the current code end up having 16+ threads due to two separate async objects existing?
If so, how would I fix that?
Thanks!
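Not part of the original post, but one possible fix for the per-instance counting is to move the limit into state shared by all instances of the decorator, for example a class-level semaphore. A rough sketch (the name limited_async and the Python 3 style are my own assumptions):

from multiprocessing import cpu_count
from threading import BoundedSemaphore, Thread

class limited_async:
    # one semaphore shared by every decorated function
    _slots = BoundedSemaphore(cpu_count())

    def __init__(self, function):
        self.func = function

    def __call__(self, *args, **kwargs):
        def run():
            try:
                self.func(*args, **kwargs)
            finally:
                limited_async._slots.release()
        # blocks until one of the shared slots frees up
        limited_async._slots.acquire()
        Thread(target=run).start()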

Tornado gen.sleep add delay

I'm trying to add a delay between requests in an asynchronous way.
When I use Tornado gen.sleep(x), my function (launch) doesn't get executed.
If I remove yield from yield gen.sleep(1.0), the function is called, but no delay is added.
How do I add a delay between requests in my for loop? I need to limit the requests per second to an external API.
If I use time.sleep, the response is delayed until after all requests are completed.
I tried adding the @gen.engine decorator to the launch function, with no results.
Code:

import collections
import tornado.httpclient

class BacklogClient(object):
    MAX_CONCURRENT_REQUESTS = 20

    def __init__(self, ioloop):
        self.ioloop = ioloop
        self.client = tornado.httpclient.AsyncHTTPClient(max_clients=self.MAX_CONCURRENT_REQUESTS)
        self.client.configure(None, defaults=dict(connect_timeout=20, request_timeout=30))
        self.backlog = collections.deque()
        self.concurrent_requests = 0

    def __get_callback(self, function):
        def wrapped(*args, **kwargs):
            self.concurrent_requests -= 1
            self.try_run_request()
            return function(*args, **kwargs)
        return wrapped

    def try_run_request(self):
        while self.backlog and self.concurrent_requests < self.MAX_CONCURRENT_REQUESTS:
            request, callback = self.backlog.popleft()
            self.client.fetch(request, callback=callback)
            self.concurrent_requests += 1

    def fetch(self, request, callback=None):
        wrapped = self.__get_callback(callback)
        self.backlog.append((request, wrapped))
        self.try_run_request()


import time
from tornado import ioloop, httpclient, gen

class TornadoBacklog:
    def __init__(self):
        self.queue = 0
        self.debug = 1
        self.toProcess = [
            'http://google.com',
            'http://yahoo.com',
            'http://nytimes.com',
            'http://msn.com',
            'http://cnn.com',
            'http://twitter.com',
            'http://facebook.com',
        ]

    def handle_request(self, response):
        print response.code
        if not self.backlog.backlog and self.backlog.concurrent_requests == 0:
            ioloop.IOLoop.instance().stop()

    def launch(self):
        self.ioloop = ioloop.IOLoop.current()
        self.backlog = BacklogClient(self.ioloop)
        for item in self.toProcess:
            yield gen.sleep(1.0)
            print item
            self.backlog.fetch(
                httpclient.HTTPRequest(
                    item,
                    method='GET',
                    headers=None,
                ),
                self.handle_request
            )
        self.ioloop.start()

def main():
    start_time = time.time()
    scraper = TornadoBacklog()
    scraper.launch()
    elapsed_time = time.time() - start_time
    print('Process took %f seconds processed %d items.' % (elapsed_time, len(scraper.toProcess)))

if __name__ == "__main__":
    main()
Reference: https://github.com/tornadoweb/tornado/issues/1400
Tornado coroutines have two components:
They contain "yield" statements
They are decorated with "gen.coroutine"

Use the "coroutine" decorator on your "launch" function:

@gen.coroutine
def launch(self):

Run a Tornado coroutine from start to finish like this:

tornado.ioloop.IOLoop.current().run_sync(launch)

Remove the call to "ioloop.start" from your "launch" function: the loop runs the "launch" function, not vice versa.
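Putting the answer's three points together, here is a minimal sketch of the delayed request loop (my own rewrite, not the answerer's verbatim code; for simplicity it waits for each response before sending the next request instead of going through BacklogClient):

from tornado import gen, httpclient, ioloop

@gen.coroutine
def launch():
    client = httpclient.AsyncHTTPClient()
    for url in ['http://google.com', 'http://yahoo.com', 'http://nytimes.com']:
        yield gen.sleep(1.0)  # one-second delay between requests
        response = yield client.fetch(url, method='GET')
        print(response.code)

if __name__ == "__main__":
    # the loop runs the coroutine, not vice-versa; no ioloop.start() needed
    ioloop.IOLoop.current().run_sync(launch)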
