How to kill all threads conditioned on the status of one thread? - python

I have n threads running simultaneously. These threads are processing a list containing m test cases. For example, thread n-1 is working on item m[i-1] while thread n is working on item m[i]. I want to stop all threads if, for example, thread n-1 fails or returns a signal. How can I achieve this?
Here is an MWE.
This is my processing function:
import subprocess

i = 0  # global counter of executed unit tests

def process(input_addr):
    global i
    i += 1
    print('Total number of executed unit tests: {}'.format(i))
    print("executed {}. thread".format(input_addr))
    try:
        command = 'python3 ' + input_addr
        result = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        msg, err = result.communicate()
        if msg.decode('utf-8') != '':
            stat = parse_shell(msg.decode('utf-8'))
            if stat:
                print('Test Failed')
                return True
        else:
            stat = parse_shell(err)
            if stat:
                print('Test Failed')
                return True
    except Exception as e:
        print("thread.\nMessage: {}".format(e))
Here is my pool:
def pre_run_test_files(self):
    with Pool(10) as p:
        p.map(process, self.test_files)
I am using:
from multiprocessing import Pool

You can have your worker function, process, simply raise an exception and use an error_callback function with apply_async that calls terminate on the pool, as in the following demo:
from multiprocessing import Pool

def process(i):
    import time
    time.sleep(1)
    if i == 6:
        raise ValueError(f'Bad value: {i}')
    print(i, flush=True)

def my_error_callback(e):
    pool.terminate()
    print(e)

if __name__ == '__main__':
    pool = Pool(4)
    for i in range(20):
        pool.apply_async(process, args=(i,), error_callback=my_error_callback)
    # wait for all tasks to complete
    pool.close()
    pool.join()
Prints:
0
1
3
2
4
5
7
Bad value: 6
You should be able to adapt the above code to your particular problem.
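For instance, a rough adaptation of the above to the process function from your question might look like the sketch below. parse_shell and self.test_files come from your post; the TestFailure exception and the rest of the wiring are assumptions, not a drop-in implementation.

import subprocess
from multiprocessing import Pool

class TestFailure(Exception):
    pass

def process(input_addr):
    print("executed {}. thread".format(input_addr))
    command = 'python3 ' + input_addr
    result = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    msg, err = result.communicate()
    output = msg.decode('utf-8')
    # Raise instead of returning True so that error_callback fires:
    if output != '':
        if parse_shell(output):
            raise TestFailure('Test Failed: {}'.format(input_addr))
    elif parse_shell(err):
        raise TestFailure('Test Failed: {}'.format(input_addr))

def my_error_callback(e):
    # Runs in the main process as soon as any task raises:
    print(e)
    pool.terminate()

def pre_run_test_files(self):
    global pool
    pool = Pool(10)
    for f in self.test_files:
        pool.apply_async(process, args=(f,), error_callback=my_error_callback)
    pool.close()
    pool.join()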
Update
Because your original code used the map method, there is a second solution that uses the method imap_unordered, which returns an iterator that on every iteration yields the next return value from your worker function, process, or raises an exception if your worker function raised one. With imap_unordered these results are returned in arbitrary completion order rather than in task-submission order, but when the default chunksize argument of 1 is used, this arbitrary order is typically task-completion order. This is what you want, so that you can detect an exception at the earliest possible time and terminate the pool. Of course, if you cared about the return values from process, you would use the method imap so that the results are returned in task-submission order. But in that case, even though the exception is raised for i == 6 and that task happened to be the first one to complete, its exception could not be returned until the results for the tasks submitted for i == 0 through 5 had been returned.
In the following code a pool size of 8 is used, and all tasks first sleep for 1 second before printing their arguments and returning, except for the case of i == 6, which raises an exception immediately. Using imap_unordered we have:
from multiprocessing import Pool

def process(i):
    import time
    # raise an exception immediately for i == 6 without sleeping
    if i != 6:
        time.sleep(1)
    else:
        raise ValueError(f'Bad value: {i}')
    print(i, flush=True)

if __name__ == '__main__':
    pool = Pool(8)
    results = pool.imap_unordered(process, range(20))
    try:
        # Iterate results as tasks complete until
        # we are done or one raises an exception:
        for result in results:
            # we don't care about the return value:
            pass
    except Exception as e:
        pool.terminate()
        print(e)
    pool.close()
    pool.join()
Prints:
Bad value: 6
If we replace the call to imap_unordered with a call to imap, then the output is:
0
1
2
3
4
5
Bad value: 6
The first solution, using apply_async with an error_callback argument, allows the exception to be acted upon as soon as it occurs. If you care about the results in task-submission order, you can save the multiprocessing.pool.AsyncResult objects returned by apply_async in a list and call get on them. Try the following code with RAISE_EXCEPTION set to True and then to False:
from multiprocessing import Pool
import time

RAISE_EXCEPTION = True

def process(i):
    if RAISE_EXCEPTION and i == 6:
        raise ValueError(f'Bad value: {i}')
    time.sleep(1)
    return i  # instead of printing

def my_error_callback(e):
    global got_exception
    got_exception = True
    pool.terminate()
    print(e)

if __name__ == '__main__':
    got_exception = False
    pool = Pool(4)
    async_results = [pool.apply_async(process, args=(i,), error_callback=my_error_callback) for i in range(20)]
    # Wait for all tasks to complete:
    pool.close()
    pool.join()
    if not got_exception:
        for async_result in async_results:
            print(async_result.get())

I found the solution:
def process(i, input_addr, event):
    kill_flag = False
    if not event.is_set():
        print('Total number of executed unit tests: {}'.format(i))
        print("executed {}. thread".format(input_addr))
        try:
            command = 'python3 ' + input_addr
            result = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            msg, err = result.communicate()
            if msg.decode('utf-8') != '':
                stat = parse_shell(msg.decode('utf-8'))
                if stat:
                    print('Test Failed')
                    kill_flag = True
                    # all_run.append(input_addr)
                    # write_list_to_txt(input_addr, valid_tests)
                else:
                    kill_flag = False
            else:
                stat = parse_shell(err)
                if stat:
                    print('Test Failed')
                    kill_flag = True
                    # all_run.append(input_addr)
                    # write_list_to_txt(input_addr, valid_tests)
                else:
                    kill_flag = False
        except Exception as e:
            print("thread.\nMessage: {}".format(e))
    if kill_flag:
        event.set()

def manager(self):
    p = multiprocessing.Pool(10)
    m = multiprocessing.Manager()
    event = m.Event()
    for i, f in enumerate(self.test_files):
        p.apply_async(process, (i, f, event))
    p.close()
    event.wait()
    p.terminate()
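One caveat with this solution (my note, not part of the original post): if no test ever fails, nothing calls event.set(), so event.wait() blocks forever. A minimal sketch of a workaround, assuming the same process function and imports as above, is to keep the AsyncResult objects and poll until either the event is set or every task has finished:

import time

def manager(self):
    p = multiprocessing.Pool(10)
    m = multiprocessing.Manager()
    event = m.Event()
    results = [p.apply_async(process, (i, f, event))
               for i, f in enumerate(self.test_files)]
    p.close()
    # Stop waiting as soon as a failure sets the event,
    # or once all tasks are done:
    while not event.is_set() and not all(r.ready() for r in results):
        time.sleep(0.5)
    p.terminate()
    p.join()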

Related

Kill all "workers" on "listener" error (multiprocessing, manager and queue set-up)

I'm using multiprocessing to run workers on different files in parallel. The workers' results are put into a queue. A listener gets the results from the queue and writes them to a file.
Sometimes the listener might run into errors (of various origins). In this case, the listener silently dies, but all other processes continue running (rather surprisingly, worker errors cause all processes to terminate).
I would like to stop all processes (workers, listener, etc.) when the listener catches an error. How can this be done?
The scheme of my code is as follows:
import sys
import multiprocessing as mp

def worker(file_path, q):
    ## do something
    q.put(1.)
    return True

def listener(q):
    while True:
        m = q.get()
        if m == 'kill':
            break
        else:
            try:
                pass  # do something and write to file
            except Exception as err:
                # raise error
                tb = sys.exc_info()[2]
                raise err.with_traceback(tb)

def main():
    manager = mp.Manager()
    q = manager.Queue(maxsize=3)
    with mp.Pool(5) as pool:
        watcher = pool.apply_async(listener, (q,))
        files = ['path_1', 'path_2', 'path_3']
        jobs = [pool.apply_async(worker, (p, q)) for p in files]
        # fire off workers
        for job in jobs:
            job.get()
        # kill the listener when done
        q.put('kill')

# run
if __name__ == "__main__":
    main()
I tried introducing event = manager.Event() and using it as a flag in main():
## inside the pool, after starting workers
while True:
if event.is_set():
for job in jobs:
job.terminate()
No success. Calling os._exit(1) in the listener's exception block raises a broken pipe error, but the processes are not killed.
I also tried setting daemon = True,
for job in jobs:
    job.daemon = True
Did not help.
In fact, to handle listener exceptions, I'm using a callable, as required by apply_async, so that they are not entirely silenced. This complicates the situation, but not by much.
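If the callable in question is apply_async's error_callback argument, the relevant part currently looks roughly like this (the callback name is a placeholder, and pool, q, and listener are the objects from the scheme above):

def on_listener_error(err):
    # error_callback: runs in the main process if listener raises
    print('Listener failed:', err)

# inside main(), the listener is started with:
watcher = pool.apply_async(listener, (q,), error_callback=on_listener_error)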
Thank you in advance.
As always there are many ways to accomplish what you're after, but I would probably suggest using an Event to signal that the processes should quit. I also would not use a Pool in this instance, as it only really simplifies things for simple cases where you need something like map. More complicated use cases quickly make it easier to just build your own "pool" with the functionality you need.
from multiprocessing import Process, Queue, Event
from random import random

def might_fail(a):
    assert(a > .001)

def worker(args_q: Queue, result_q: Queue, do_quit: Event):
    try:
        while not do_quit.is_set():
            args = args_q.get()
            if args is None:
                break
            else:
                # do something
                result_q.put(random())
    finally:  # signal that worker is exiting even if exception is raised
        result_q.put(None)  # signal listener that worker is exiting

def listener(result_q: Queue, do_quit: Event, n_workers: int):
    n_completed = 0
    while n_workers > 0:
        res = result_q.get()
        if res is None:
            n_workers -= 1
        else:
            n_completed += 1
            try:
                might_fail(res)
            except:
                do_quit.set()  # let main continue
                print(n_completed)
                raise  # reraise error after we signal others to stop
    do_quit.set()  # let main continue
    print(n_completed)

if __name__ == "__main__":
    args_q = Queue()
    result_q = Queue()
    do_quit = Event()
    n_workers = 4

    listener_p = Process(target=listener, args=(result_q, do_quit, n_workers))
    listener_p.start()

    for _ in range(n_workers):
        worker_p = Process(target=worker, args=(args_q, result_q, do_quit))
        worker_p.start()

    for _ in range(1000):
        args_q.put("some/file.txt")

    for _ in range(n_workers):
        args_q.put(None)

    do_quit.wait()
    print('done')

Right way to exit from function called by multiprocessing.Pool?

How to exit from a function called by multiprocessing.Pool
Here is an example of the code I am using. When I put a condition to exit from the function worker and run this as a script in a terminal, it halts and does not exit.
import multiprocessing

def worker(n):
    if n == 4:
        exit("wrong number")  # tried to use sys.exit(1), did not work
    return n*2

def caller(mylist, n=1):
    n_cores = n if n > 1 else multiprocessing.cpu_count()
    print(n_cores)
    pool = multiprocessing.Pool(processes=n_cores)
    result = pool.map(worker, mylist)
    pool.close()
    pool.join()
    return result

l = [2, 3, 60, 4]
myresult = caller(l, 4)
As I said, I don't think you can exit the process running the main script from a worker process.
You haven't explained exactly why you want to do this, so this answer is a guess, but perhaps raising a custom Exception and handling it in an explicit except as shown below would be an acceptable way to work around the limitation.
import multiprocessing
import sys

class WorkerStopException(Exception):
    pass

def worker(n):
    if n == 4:
        raise WorkerStopException()
    return n*2

def caller(mylist, n=1):
    n_cores = n if n > 1 else multiprocessing.cpu_count()
    print(n_cores)
    pool = multiprocessing.Pool(processes=n_cores)
    try:
        result = pool.map(worker, mylist)
    except WorkerStopException:
        sys.exit("wrong number")
    pool.close()
    pool.join()
    return result

if __name__ == '__main__':
    l = [2, 3, 60, 4]
    myresult = caller(l, 4)
Output displayed when run:
4
wrong number
(The 4 is the number of CPUs my system has.)
The thing with pool.map is that it raises exceptions from child processes only after all tasks are finished. But your comments sound like you need immediate abortion of all processing as soon as a wrong value is detected in any process. This would be a job for pool.apply_async then.
pool.apply_async offers an error_callback, which you can use to let the pool terminate. Workers are fed item-wise instead of chunk-wise as with the pool.map variants, so you get the chance for an early exit on each processed argument.
I'm basically reusing my answer from here:
from time import sleep
from multiprocessing import Pool

def f(x):
    sleep(x)
    print(f"f({x})")
    if x == 4:
        raise ValueError(f'wrong number: {x}')
    return x * 2

def on_error(e):
    if type(e) is ValueError:
        global terminated
        terminated = True
        pool.terminate()
        print(f"oops: {type(e).__name__}('{e}')")

def main():
    global pool
    global terminated

    terminated = False
    pool = Pool(4)

    results = [pool.apply_async(f, (x,), error_callback=on_error)
               for x in range(10)]
    pool.close()
    pool.join()

    if not terminated:
        for r in results:
            print(r.get())

if __name__ == '__main__':
    main()
Output:
f(0)
f(1)
f(2)
f(3)
f(4)
oops: ValueError('wrong number: 4')
Process finished with exit code 0

Python Multiprocessing - terminate / restart worker process

I have a bunch of long-running processes that I would like to split up into multiple processes. That part I can do, no problem. The issue I run into is that sometimes these processes go into a hung state. To address this issue I would like to be able to set a time threshold for each task that a process is working on. When that time threshold is exceeded, I would like to restart or terminate the task.
Originally my code was very simple, using a process pool; however, with the pool I could not figure out how to retrieve the processes inside the pool, never mind how to restart / terminate a process in the pool.
I have resorted to using a queue and process objects as is illustrated in this example (https://pymotw.com/2/multiprocessing/communication.html#passing-messages-to-processes) with some changes.
My attempts to figure this out are in the code below. In its current state the process does not actually get terminated. Further to that, I cannot figure out how to get the process to move on to the next task after the current task is terminated. Any suggestions / help appreciated; perhaps I'm going about this the wrong way.
Thanks
import multiprocess
import time

class Consumer(multiprocess.Process):

    def __init__(self, task_queue, result_queue, startTimes, name=None):
        multiprocess.Process.__init__(self)
        if name:
            self.name = name
        print 'created process: {0}'.format(self.name)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.startTimes = startTimes

    def stopProcess(self):
        elapseTime = time.time() - self.startTimes[self.name]
        print 'killing process {0} {1}'.format(self.name, elapseTime)
        self.task_queue.cancel_join_thread()
        self.terminate()
        # now want to get the process to start processing another job

    def run(self):
        '''
        The process subclass calls this on a separate process.
        '''
        proc_name = self.name
        print proc_name
        while True:
            # pulling the next task off the queue and starting it
            # on the current process.
            task = self.task_queue.get()
            self.task_queue.cancel_join_thread()
            if task is None:
                # Poison pill means shutdown
                #print '%s: Exiting' % proc_name
                self.task_queue.task_done()
                break
            self.startTimes[proc_name] = time.time()
            answer = task()
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):
    def __init__(self, a, b, startTimes):
        self.a = a
        self.b = b
        self.startTimes = startTimes
        self.taskName = 'taskName_{0}_{1}'.format(self.a, self.b)

    def __call__(self):
        import time
        import os
        print 'new job in process pid:', os.getpid(), self.taskName
        if self.a == 2:
            time.sleep(20000)  # simulate a hung process
        else:
            time.sleep(3)  # pretend to take some time to do the work
        return '%s * %s = %s' % (self.a, self.b, self.a * self.b)

    def __str__(self):
        return '%s * %s' % (self.a, self.b)

if __name__ == '__main__':
    # Establish communication queues
    # tasks = this is the work queue and results is for results or completed work
    tasks = multiprocess.JoinableQueue()
    results = multiprocess.Queue()

    #parentPipe, childPipe = multiprocess.Pipe(duplex=True)
    mgr = multiprocess.Manager()
    startTimes = mgr.dict()

    # Start consumers
    numberOfProcesses = 4
    processObjs = []
    for processNumber in range(numberOfProcesses):
        processObj = Consumer(tasks, results, startTimes)
        processObjs.append(processObj)

    for process in processObjs:
        process.start()

    # Enqueue jobs
    num_jobs = 30
    for i in range(num_jobs):
        tasks.put(Task(i, i + 1, startTimes))

    # Add a poison pill for each process object
    for i in range(numberOfProcesses):
        tasks.put(None)

    # process monitor loop
    killProcesses = {}
    executing = True
    while executing:
        allDead = True
        for process in processObjs:
            name = process.name
            #status = consumer.status.getStatusString()
            status = process.is_alive()
            pid = process.ident
            elapsedTime = 0
            if name in startTimes:
                elapsedTime = time.time() - startTimes[name]
            if elapsedTime > 10:
                process.stopProcess()

            print "{0} - {1} - {2} - {3}".format(name, status, pid, elapsedTime)
            if allDead and status:
                allDead = False
        if allDead:
            executing = False
        time.sleep(3)

    # Wait for all of the tasks to finish
    #tasks.join()

    # Start printing results
    while num_jobs:
        result = results.get()
        print 'Result:', result
        num_jobs -= 1
I generally recommend against subclassing multiprocessing.Process, as it leads to code that is hard to read.
I'd rather encapsulate your logic in a function and run it in a separate process. This keeps the code much cleaner and more intuitive.
Nevertheless, rather than reinventing the wheel, I'd recommend you use a library which already solves the issue for you, such as Pebble or billiard.
For example, the Pebble library allows you to easily set timeouts on processes running independently or within a Pool.
Running your function within a separate process with a timeout:
from pebble import concurrent
from concurrent.futures import TimeoutError

@concurrent.process(timeout=10)
def function(foo, bar=0):
    return foo + bar

future = function(1, bar=2)

try:
    result = future.result()  # blocks until results are ready
except TimeoutError as error:
    print("Function took longer than %d seconds" % error.args[1])
The same example, but with a process Pool:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

with ProcessPool(max_workers=5, max_tasks=10) as pool:
    future = pool.schedule(function, args=[1], timeout=10)

    try:
        result = future.result()  # blocks until results are ready
    except TimeoutError as error:
        print("Function took longer than %d seconds" % error.args[1])
In both cases, the timing out process will be automatically terminated for you.
A simpler solution than reimplementing the Pool is to design a mechanism that times out the function you are running.
For instance:
from time import sleep
import signal

class TimeoutError(Exception):
    pass

def handler(signum, frame):
    raise TimeoutError()

def run_with_timeout(func, *args, timeout=10, **kwargs):
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(timeout)
    try:
        res = func(*args, **kwargs)
    except TimeoutError as exc:
        print("Timeout")
        res = exc
    finally:
        signal.alarm(0)
    return res

def test():
    sleep(4)
    print("ok")

if __name__ == "__main__":
    import multiprocessing as mp
    p = mp.Pool()
    print(p.apply_async(run_with_timeout, args=(test,),
                        kwds={"timeout": 1}).get())
signal.alarm sets a timeout, and when that timeout expires it runs the handler, which stops the execution of your function.
EDIT: If you are using a Windows system, it seems to be a bit more complicated, as signal does not implement SIGALRM. Another solution is to use the C-level Python API. This code has been adapted from this SO answer, with a bit of adaptation to work on 64-bit systems. I have only tested it on Linux, but it should work the same on Windows.
import threading
import ctypes
from time import sleep

class TimeoutError(Exception):
    pass

def run_with_timeout(func, *args, timeout=10, **kwargs):
    interupt_tid = int(threading.get_ident())

    def interupt_thread():
        # Call the low level C python api using ctypes. tid must be converted
        # to c_long to be valid.
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            ctypes.c_long(interupt_tid), ctypes.py_object(TimeoutError))
        if res == 0:
            print(threading.enumerate())
            print(interupt_tid)
            raise ValueError("invalid thread id")
        elif res != 1:
            # "if it returns a number greater than one, you're in trouble,
            # and you should call it again with exc=NULL to revert the effect"
            ctypes.pythonapi.PyThreadState_SetAsyncExc(
                ctypes.c_long(interupt_tid), 0)
            raise SystemError("PyThreadState_SetAsyncExc failed")

    timer = threading.Timer(timeout, interupt_thread)
    try:
        timer.start()
        res = func(*args, **kwargs)
    except TimeoutError as exc:
        print("Timeout")
        res = exc
    else:
        timer.cancel()
    return res

def test():
    sleep(4)
    print("ok")

if __name__ == "__main__":
    import multiprocessing as mp
    p = mp.Pool()
    print(p.apply_async(run_with_timeout, args=(test,),
                        kwds={"timeout": 1}).get())
    print(p.apply_async(run_with_timeout, args=(test,),
                        kwds={"timeout": 5}).get())
For long running processes and/or long iterators, spawned workers might hang after some time. To prevent this, there are two built-in techniques:
Restart workers after they have delivered maxtasksperchild tasks from the queue.
Pass timeout to pool.imap.next(), catch the TimeoutError, and finish the rest of the work in another pool.
The following wrapper implements both, as a generator. This also works when replacing stdlib multiprocessing with multiprocess.
import multiprocessing as mp

def imap(
    func,
    iterable,
    *,
    processes=None,
    maxtasksperchild=42,
    timeout=42,
    initializer=None,
    initargs=(),
    context=mp.get_context("spawn")
):
    """Multiprocessing imap, restarting workers after maxtasksperchild tasks to avoid zombies.

    Example:
        >>> list(imap(str, range(5)))
        ['0', '1', '2', '3', '4']

    Raises:
        mp.TimeoutError: if the next result cannot be returned within timeout seconds.

    Yields:
        Ordered results as they come in.
    """
    with context.Pool(
        processes=processes,
        maxtasksperchild=maxtasksperchild,
        initializer=initializer,
        initargs=initargs,
    ) as pool:
        it = pool.imap(func, iterable)
        while True:
            try:
                yield it.next(timeout)
            except StopIteration:
                return
To catch the TimeoutError:
>>> import time
>>> iterable = list(range(10))
>>> results = []
>>> try:
...     for i, result in enumerate(imap(time.sleep, iterable, processes=2, timeout=2)):
...         results.append(result)
... except mp.TimeoutError:
...     print("Failed to process the following subset of iterable:", iterable[i:])
Failed to process the following subset of iterable: [2, 3, 4, 5, 6, 7, 8, 9]

Python multiprocessing.pool failed to stop after finishing all the tasks

I have implemented a parser like this,
import multiprocessing
import time

def foo(i):
    try:
        pass  # some codes
    except Exception, e:
        print e

def worker(i):
    foo(i)
    time.sleep(i)
    return i

if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=4)
    result = pool.map_async(worker, range(15))
    while not result.ready():
        print("num left: {}".format(result._number_left))
        time.sleep(1)
    real_result = result.get()
    pool.close()
    pool.join()
My parser actually finishes all the processes, but the results are not available, i.e. it's still inside the while loop and printing num left: 2. How do I stop this? And I don't want the value of the real_result variable.
I'm running Ubuntu 14.04, Python 2.7.
The corresponding part of my code looks like:
async_args = ((date, kw_dict) for date in dates)
pool = Pool(processes=4)
no_rec = []

def check_for_exit(msg):
    print msg
    if last_date in msg:
        print 'Terminating the pool'
        pool.terminate()

try:
    result = pool.map_async(parse_date_range, async_args)
    while not result.ready():
        print("num left: {}".format(result._number_left))
        sleep(1)
    real_result = result.get(5)
    passed_dates = []
    for x, y in real_result:
        passed_dates.append(x)
        if y:
            no_rec.append(y[0])
    # if last_date in passed_dates:
    #     print 'Terminating the pool'
    #     pool.terminate()
    pool.close()
except:
    print 'Pool error'
    pool.terminate()
    print traceback.format_exc()
finally:
    pool.join()
My bet is that you have a faulty parse_date_range, which causes a worker process to terminate without producing any result or Python exception. Probably libc's exit is called by a C module/lib due to a really nasty error.
This code reproduces the infinite loop you observe:
import sys
import multiprocessing
import time

def parse_date_range(i):
    if i == 5:
        sys.exit(1)  # or raise SystemExit;
                     # other exceptions are handled by the pool
    time.sleep(i/19.)
    return i

if __name__ == "__main__":
    pool = multiprocessing.Pool(4)
    result = pool.map_async(parse_date_range, range(15))
    while not result.ready():
        print("num left: {}".format(result._number_left))
        time.sleep(1)
    real_result = result.get()
    pool.close()
    pool.join()
Hope this'll help.
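If that is indeed the cause, one possible workaround (my sketch, not part of the original answer) is to wrap the real worker so that SystemExit is converted into an ordinary return value, which keeps map_async's bookkeeping intact. Note that this only helps when the worker exits via Python's SystemExit; if a C extension calls exit() directly, the process still dies and a different approach is needed.

import multiprocessing
import time

def parse_date_range(i):
    # stand-in for the real function; i == 5 simulates the hard exit
    if i == 5:
        raise SystemExit(1)
    time.sleep(i / 19.)
    return i

def safe_parse_date_range(args):
    # Convert SystemExit (and anything else) into a normal return value
    # so the pool never loses track of a task.
    try:
        return ('ok', parse_date_range(args))
    except BaseException as e:
        return ('error', repr(e))

if __name__ == "__main__":
    pool = multiprocessing.Pool(4)
    results = pool.map_async(safe_parse_date_range, range(15)).get()
    pool.close()
    pool.join()
    print([r for r in results if r[0] == 'error'])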

How to exit all the joined processes in case any one has an exception - Python

I have a Python pool of processes; if an exception occurs in any one of the processes, I want to exit the execution of the pool.
I have joined all the processes in the pool, so the join waits for every process to finish.
If I raise sys.exit(1) inside the target function, the system goes into an infinite wait because the join is still waiting for the process to complete.
How can I exit the execution while using join in the code?
from multiprocessing import Pool
import time
import sys

def printer(ip):
    try:
        for _ in xrange(5):
            print ip+str(_)
            time.sleep(1.0)
    except Exception as e:
        print e
        sys.exit(2)

def test():
    pool = Pool(processes=2)
    for i in ["hello", 5]:
        result = pool.apply_async(printer, (i,))
    pool.close()
    pool.join()
    print "good bye"

test()
Just return the status of the operation to the parent process and use a callback to react to failures.
import time
from multiprocessing import Pool

def printer(ip):
    try:
        for _ in xrange(5):
            print ip+str(_)
            time.sleep(1.0)
        return True
    except Exception as e:
        print e
        return False

class Worker():
    def __init__(self):
        self.pool = Pool(processes=2)

    def callback(self, result):
        if not result:
            print "Error raised in child process. Terminating.."
            self.pool.terminate()

    def do_job(self):
        for i in ["hello", 5]:
            self.pool.apply_async(printer, (i,), callback=self.callback)
        self.pool.close()
        self.pool.join()
        print "good bye"

def test():
    w = Worker()
    w.do_job()
