I want to use multiprocessing.Pool, but it cannot abort a task after a timeout. I found a solution and modified it slightly.
from multiprocessing import util, Pool, TimeoutError
from multiprocessing.dummy import Pool as ThreadPool
import threading
import sys
from functools import partial
import time


def worker(y):
    print("worker sleep {} sec, thread: {}".format(y, threading.current_thread()))
    start = time.time()
    while True:
        if time.time() - start >= y:
            break
        time.sleep(0.5)
        # show work progress
        print(y)
    return y


def collect_my_result(result):
    print("Got result {}".format(result))


def abortable_worker(func, *args, **kwargs):
    timeout = kwargs.get('timeout', None)
    p = ThreadPool(1)
    res = p.apply_async(func, args=args)
    try:
        # Wait timeout seconds for func to complete.
        out = res.get(timeout)
    except TimeoutError:
        print("Aborting due to timeout {}".format(args[0]))
        # kill the worker itself when we get a TimeoutError
        sys.exit(1)
    else:
        return out


def empty_func():
    pass


if __name__ == "__main__":
    TIMEOUT = 4
    util.log_to_stderr(util.DEBUG)
    pool = Pool(processes=4)

    # k - time for the job to sleep
    featureClass = [(k,) for k in range(20, 0, -1)]  # list of arguments
    for f in featureClass:
        # check for an available worker
        pool.apply(empty_func)
        # run the job with a timeout
        abortable_func = partial(abortable_worker, worker, timeout=TIMEOUT)
        pool.apply_async(abortable_func, args=f, callback=collect_my_result)

    time.sleep(TIMEOUT)
    pool.terminate()
    print("exit")
The main modification is that the worker process exits with sys.exit(1). This kills the worker process and its job thread, but I'm not sure this is a good solution. What potential problems can I run into when a process terminates itself while a job is still running?
There is no inherent risk in stopping a running job; the OS will take care of terminating the process correctly.
If your job is writing to files, you might end up with lots of truncated files on your disk.
Small issues might also occur if you write to databases or are connected to some remote process.
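On the truncated-files point, a common mitigation (a sketch I'm adding here, not part of the original answer; write_atomically is a hypothetical helper) is to write to a temporary file and rename it into place only once the write has finished, so a worker killed mid-write never leaves a half-written target behind:

import os
import tempfile

def write_atomically(path, data):
    # write to a temp file in the target directory, then rename it into
    # place; os.replace is atomic on the same filesystem, so a worker
    # killed mid-write leaves at worst a stray temp file, never a
    # truncated target file
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
    try:
        with os.fdopen(fd, "w") as f:
            f.write(data)
        os.replace(tmp_path, path)
    except BaseException:
        os.unlink(tmp_path)
        raise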
Nevertheless, the standard Python Pool does not support worker termination on task timeout, and terminating processes abruptly might lead to weird behaviour within your application.
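To illustrate that point, here is a minimal sketch (my addition, not part of the original answer) of what the standard library offers: AsyncResult.get(timeout=...) raises TimeoutError in the caller, but the worker running the task keeps going.

from multiprocessing import Pool, TimeoutError
import time

def slow(x):
    time.sleep(10)  # simulates a long-running job
    return x

if __name__ == "__main__":
    with Pool(1) as pool:
        res = pool.apply_async(slow, (1,))
        try:
            res.get(timeout=2)  # raises TimeoutError after 2 seconds
        except TimeoutError:
            # only the caller is interrupted; the worker process is still
            # busy inside slow() and is not terminated by the timeout
            print("timed out, but the worker keeps running")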
Pebble processing Pool does support timing-out tasks.
from pebble import ProcessPool
from concurrent.futures import TimeoutError

TIMEOUT_SECONDS = 5

def function(one, two):
    return one + two

with ProcessPool() as pool:
    future = pool.schedule(function, args=(1, 2), timeout=TIMEOUT_SECONDS)

    try:
        result = future.result()
    except TimeoutError:
        print("Future: %s took more than 5 seconds to complete" % future)
Related
I'm trying to launch a function (my_function) and stop its execution after a certain amount of time has elapsed.
So I tried the multiprocessing library and everything works well. Here is the code, where my_function() has been changed to only create a dummy message.
from multiprocessing import Queue, Process
from multiprocessing.queues import Empty
import time

timeout = 1
# timeout = 3

def my_function(something):
    time.sleep(2)
    return f'my message: {something}'

def wrapper(something, queue):
    message = "too late..."
    try:
        message = my_function(something)
        return message
    finally:
        queue.put(message)

try:
    queue = Queue()
    params = ("hello", queue)
    child_process = Process(target=wrapper, args=params)
    child_process.start()
    output = queue.get(timeout=timeout)
    print(f"ok: {output}")
except Empty:
    timeout_message = f"Timeout {timeout}s reached"
    print(timeout_message)
finally:
    if 'child_process' in locals():
        child_process.kill()
You can test and verify that, depending on whether timeout=1 or timeout=3, I can trigger an error or not.
My main problem is that the real my_function() is a torch model inference for which I would like to limit the number of threads (to 4, let's say).
One could easily do so if my_function were in the main process, but in my example I tried a lot of tricks to limit it in the child process without any success (using threadpoolctl.threadpool_limits(4), torch.set_num_threads(4), os.environ["OMP_NUM_THREADS"]=4, os.environ["MKL_NUM_THREADS"]=4).
I'm completely open to other solutions that can monitor the execution time of a function while limiting the number of threads used by that function.
thanks
Regards
You can limit the number of simultaneous processes with Pool (https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool).
You can also set the maximum number of tasks done per child with maxtasksperchild. Check it out.
Here is a sample from SuperFastPython by Jason Brownlee:
# SuperFastPython.com
# example of limiting the number of tasks per child in the process pool
from time import sleep
from multiprocessing.pool import Pool
from multiprocessing import current_process

# task executed in a worker process
def task(value):
    # get the current process
    process = current_process()
    # report a message
    print(f'Worker is {process.name} with {value}', flush=True)
    # block for a moment
    sleep(1)

# protect the entry point
if __name__ == '__main__':
    # create and configure the process pool
    with Pool(2, maxtasksperchild=3) as pool:
        # issue tasks to the process pool
        for i in range(10):
            pool.apply_async(task, args=(i,))
        # close the process pool
        pool.close()
        # wait for all tasks to complete
        pool.join()
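The same Pool also accepts an initializer that runs once in each worker before any task, which is the usual place for per-worker setup such as thread limits. A minimal sketch under stated assumptions (limit_threads and infer are hypothetical placeholders; a 'spawn' context is used so the environment variables are set before any numeric library in the worker initialises its thread pool):

import os
from multiprocessing import get_context

def limit_threads(n):
    # runs once per worker before any task; with a 'spawn' context each
    # worker is a fresh interpreter, so libraries imported later in the
    # worker see these variables when they initialise
    os.environ["OMP_NUM_THREADS"] = str(n)
    os.environ["MKL_NUM_THREADS"] = str(n)

def infer(x):
    # placeholder for the real model inference
    return x * 2

if __name__ == "__main__":
    ctx = get_context("spawn")
    with ctx.Pool(processes=2, initializer=limit_threads, initargs=(4,)) as pool:
        print(pool.map(infer, range(8)))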
I'm trying to code a kind of task manager in Python. It's based on a job queue; the main thread is in charge of adding jobs to this queue. I have made this class to handle the queued jobs, which is able to limit the number of concurrent processes and handle the output of the finished processes.
Here comes the problem: in the _check_jobs_status function I don't get an updated returncode value for each process, regardless of its status (running, finished...). job.returncode is always None, so the if statement never matches and jobs are never removed from the processing jobs list.
I know it can be done with process.communicate() or process.wait(), but I don't want to block the thread that launches the processes. Is there any other way to do it, maybe using a ProcessPoolExecutor? The queue can receive new jobs at any time and I need to be able to handle them.
Thank you all for your time and support :)
from queue import Queue
import subprocess
from threading import Thread
from time import sleep

class JobQueueManager(Queue):
    def __init__(self, maxsize: int):
        super().__init__(maxsize)
        self.processing_jobs = []
        self.process = None
        self.jobs_launcher = Thread(target=self._worker_job)
        self.processing_jobs_checker = Thread(target=self._check_jobs_status)
        self.jobs_launcher.start()
        self.processing_jobs_checker.start()

    def _worker_job(self):
        while True:
            # Run at most 3 jobs concurrently
            if self.not_empty and len(self.processing_jobs) < 3:
                # Get a job from the queue
                job = self.get()
                # Execute a task without blocking the thread
                self.process = subprocess.Popen(job)
                self.processing_jobs.append(self.process)
                # useful if queue.join() is used to block the queue
                self.task_done()
            else:
                print("Waiting 4s for jobs")
                sleep(4)

    def _check_jobs_status(self):
        while True:
            # Check if jobs are finished
            for job in self.processing_jobs:
                # Successfully completed
                if job.returncode == 0:
                    self.processing_jobs.remove(job)
            # Wait 4 seconds and repeat
            sleep(4)

def main():
    q = JobQueueManager(100)
    task = ["stress", "--cpu", "1", "--timeout", "20"]
    for i in range(10):  # put 10 tasks in the queue
        q.put(task)
    q.join()  # block until all tasks are done

if __name__ == "__main__":
    main()
Answering myself: I have come up with a working solution. The JobExecutor class handles the pool of processes in a custom way. The watch_completed_tasks function tries to watch and handle the output of tasks when they are done. This way everything is done with only two threads, and the main thread is not blocked when submitting processes.
import subprocess
from threading import Timer
from concurrent.futures import ProcessPoolExecutor, as_completed
import logging

def launch_job(job):
    process = subprocess.Popen(job, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(f"launching {process.pid}")
    return [process.pid, process.stdout.read(), process.stderr.read()]

class JobExecutor(ProcessPoolExecutor):
    def __init__(self, max_workers: int):
        super().__init__(max_workers)
        self.futures = []
        self.watch_completed_tasks()

    def submit(self, command):
        future = super().submit(launch_job, command)
        self.futures.append(future)
        return future

    def watch_completed_tasks(self):
        # Manage task completion
        for completed_task in as_completed(self.futures):
            print(f"FINISHED task with PID {completed_task.result()[0]}")
            self.futures.remove(completed_task)
        # call this function every 5 seconds
        timer_thread = Timer(5.0, self.watch_completed_tasks)
        timer_thread.setName("TasksWatcher")
        timer_thread.start()

def main():
    executor = JobExecutor(max_workers=5)
    for i in range(10):
        task = ["stress",
                "--cpu", "1",
                "--timeout", str(i + 5)]
        executor.submit(task)

if __name__ == "__main__":
    main()
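A side note on the original returncode problem (my addition, not part of the answer above): Popen.returncode is only refreshed when the child process is reaped, and Popen.poll() does exactly that without blocking, which is why the earlier _check_jobs_status loop always saw None. A minimal sketch of the non-blocking check:

import subprocess
import time

job = subprocess.Popen(["sleep", "2"])

# poll() reaps the child if it has exited and returns its exit status,
# or None while it is still running; it never blocks the calling thread
while job.poll() is None:
    print("still running...")
    time.sleep(1)

print(f"finished with returncode {job.returncode}")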
I am running a piece of Python code in which multiple threads are run through a ThreadPoolExecutor. Each thread is supposed to perform a task (fetch a webpage, for example). What I want to be able to do is to terminate all threads, even if one of them fails. For instance:
with ThreadPoolExecutor(self._num_threads) as executor:
    jobs = []
    for path in paths:
        kw = {"path": path}
        jobs.append(executor.submit(start, **kw))

    for job in futures.as_completed(jobs):
        result = job.result()
        print(result)

def start(*args, **kwargs):
    # fetch the page
    if(success):
        return True
    else:
        # Signal all threads to stop
Is it possible to do so? The results returned by the threads are useless to me unless all of them are successful, so if even one of them fails, I would like to save the execution time of the rest of the threads and terminate them immediately. The actual code obviously does relatively lengthy tasks with a couple of failure points.
If you are done with threads and want to look into processes, then this piece of code looks very promising and simple, with almost the same syntax as threads but using the multiprocessing module.
When the timeout expires the process is terminated, which is very convenient.
import multiprocessing

def get_page(*args, **kwargs):
    # your web page downloading code goes here
    pass

def start_get_page(timeout, *args, **kwargs):
    p = multiprocessing.Process(target=get_page, args=args, kwargs=kwargs)
    p.start()
    p.join(timeout)
    if p.is_alive():
        # stop the downloading 'thread'
        p.terminate()
        # and then do any post-error processing here

if __name__ == "__main__":
    start_get_page(timeout, *args, **kwargs)
I have created an answer for a similar question I had, which I think will work for this question.
Terminate executor using ThreadPoolExecutor from concurrent.futures module
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

NUM_REQUESTS = 100

def long_request(id):
    sleep(1)
    # Simulate a bad response
    if id == 10:
        return {"data": {"valid": False}}
    else:
        return {"data": {"valid": True}}

def check_results(results):
    valid = True
    for result in results:
        valid = result["data"]["valid"]
    return valid

def main():
    futures = []
    responses = []
    num_requests = 0
    with ThreadPoolExecutor(max_workers=10) as executor:
        for request_index in range(NUM_REQUESTS):
            future = executor.submit(long_request, request_index)
            # Future list
            futures.append(future)
        for future in as_completed(futures):
            is_responses_valid = check_results(responses)
            # Cancel all future requests if one is invalid
            if not is_responses_valid:
                executor.shutdown(wait=False)
            else:
                # Append valid responses
                num_requests += 1
                responses.append(future.result())
    return num_requests

if __name__ == "__main__":
    requests = main()
    print("Num Requests: ", requests)
In my code I used multiprocessing
import multiprocessing as mp

pool = mp.Pool()
for i in range(threadNumber):
    pool.apply_async(publishMessage, args=(map_metrics, connection_parameters...,))

pool.close()
pool.terminate()
This is how I would do it:
import concurrent.futures

def start(*args, **kwargs):
    # fetch the page
    if(success):
        return True
    else:
        return False

with concurrent.futures.ProcessPoolExecutor() as executor:
    results = [executor.submit(start, {"path": path}) for path in paths]
    concurrent.futures.wait(results, timeout=10, return_when=concurrent.futures.FIRST_COMPLETED)
    for f in concurrent.futures.as_completed(results):
        f_success = f.result()
        if not f_success:
            executor.shutdown(wait=False, cancel_futures=True)  # shutdown if one fails
        else:
            pass  # do stuff here
If any result is not True, all pending futures are cancelled and the executor is shut down immediately; tasks that are already running still finish.
You can try to use StoppableThread from func-timeout.
But terminating threads is strongly discouraged, and if you need to kill a thread, you probably have a design problem. Look at the alternatives: asyncio coroutines, or multiprocessing with its proper cancel/terminate functionality.
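For illustration, a minimal sketch of the asyncio route (fetch_page is a hypothetical placeholder for the real page fetch): asyncio.wait_for cancels the awaited coroutine cleanly when the timeout expires.

import asyncio

async def fetch_page(path):
    # placeholder for real async I/O (e.g. an aiohttp request)
    await asyncio.sleep(5)
    return path

async def main():
    try:
        # cancel the coroutine cleanly if it exceeds the timeout
        result = await asyncio.wait_for(fetch_page("/index"), timeout=1)
        print(result)
    except asyncio.TimeoutError:
        print("cancelled after timeout")

asyncio.run(main())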
This problem seems to have been eluding me: all the solutions are more like workarounds and add quite a bit of complexity to the code.
Since it's been a good while since any posts regarding this were made, are there any simple solutions to the following: upon detecting a keyboard interrupt, cleanly exit all the child processes and terminate the program?
The code below is a snippet of my multiprocessing structure; I'd like to preserve as much of it as possible while adding the needed functionality:
from multiprocessing import Pool, Lock
import time

def multiprocess_init(l):
    global lock
    lock = l

def synchronous_print(i):
    with lock:
        print i
        time.sleep(1)

if __name__ == '__main__':
    lock = Lock()
    pool = Pool(processes=5, initializer=multiprocess_init, initargs=(lock, ))
    for i in range(1, 20):
        pool.map_async(synchronous_print, [i])
    pool.close()  # necessary to prevent zombies
    pool.join()   # wait for all processes to finish
The short answer is to move to Python 3. Python 2 has multiple problems with thread/process synchronization that have been fixed in Python 3.
In your case, multiprocessing will doggedly recreate your child processes every time you send a keyboard interrupt, and pool.close will get stuck and never exit. You can reduce the problem by explicitly exiting the child processes with os._exit and by waiting for the individual results from map_async so that you don't get stuck in pool.close.
from multiprocessing import Pool, Lock
import time
import os

def multiprocess_init(l):
    global lock
    lock = l
    print("initialized child")

def synchronous_print(i):
    try:
        with lock:
            print i
            time.sleep(1)
    except KeyboardInterrupt:
        print("exit child")
        os._exit(2)

if __name__ == '__main__':
    lock = Lock()
    pool = Pool(processes=5, initializer=multiprocess_init, initargs=(lock, ))
    results = []
    for i in range(1, 20):
        results.append(pool.map_async(synchronous_print, [i]))
    for result in results:
        print('wait result')
        result.wait()
    pool.close()  # necessary to prevent zombies
    pool.join()   # wait for all processes to finish
    print("Join completes")
I have reserved some nodes on a SLURM cluster and want to run a python script on these nodes.
On one node (server) a python script should fill a queue with jobs and dispatch these jobs to the clients.
Most of the time this works fine, but occasionally the script stalls.
When using Ctrl+C it turns out that, in that case, one node (or sometimes more) seems to be stuck in <Finalize object, dead>:
^Csrun: interrupt (one more within 1 sec to abort)
srun: task 30: running
srun: tasks 0-29,31-39: exited
^Csrun: sending Ctrl-C to job 1075185.14
Exception KeyboardInterrupt: KeyboardInterrupt() in <Finalize object, dead> ignored
srun: Job step aborted: Waiting up to 2 seconds for job step to finish.
slurmd[cluster-112]: *** STEP 1075185.14 KILLED AT 2014-04-03T09:11:23 WITH SIGNAL 9 ***
I have no clue what the reason could be; it looks like it might be something related to the garbage collector.
This is the script I run:
#!/usr/bin/env python
import os
import multiprocessing.managers
import Queue
import sys
import subprocess
import socket
import errno
import time

class QueueManager(multiprocessing.managers.SyncManager):
    pass

def worker(i, my_slurm_proc_id):
    print 'hello %i (proc=%i)' % (i, my_slurm_proc_id)
    time.sleep(0.1)

def run_server(first_slurm_node, N_procs):
    queue = Queue.Queue()
    barrier = multiprocessing.BoundedSemaphore(N_procs-1)
    QueueManager.register('get_queue', callable=lambda: queue)
    QueueManager.register('get_barrier', callable=lambda: barrier)

    for i in range(5000):
        queue.put(i)

    m = QueueManager(address=(first_slurm_node, 50000), authkey='abracadabra')
    m.start()

    for i in range(N_procs-1):
        barrier.acquire(True)

    m.get_queue().join()  # somehow just 'queue.join()' doesn't work here

def run_client(my_slurm_proc_id, first_slurm_node):
    QueueManager.register('get_queue')
    QueueManager.register('get_barrier')
    m = QueueManager(address=(first_slurm_node, 50000), authkey='abracadabra')
    m.connect()

    barrier = m.get_barrier()
    barrier.acquire(True)

    queue = m.get_queue()
    while not queue.empty():
        try:
            data = queue.get_nowait()
        except Queue.Empty:
            break
        worker(data, my_slurm_proc_id)
        queue.task_done()

    queue = None
    barrier.release()
    barrier = None

def main():
    slurm_job_nodelist = subprocess.check_output('scontrol show hostname'.split(' ') + [os.environ['SLURM_JOB_NODELIST']]).split('\n')
    master_node = slurm_job_nodelist[0]
    my_slurm_proc_id = int(os.environ['SLURM_PROCID'])
    N_procs = int(os.environ['SLURM_NPROCS'])

    if my_slurm_proc_id == 0:
        run_server(master_node, N_procs)
    else:
        run_client(my_slurm_proc_id, master_node)

if __name__ == '__main__':
    main()