I'm running some Matlab code in parallel from inside a Python context (I know, but that's what's going on), and I'm hitting an import error involving matlab.double. The same code works fine in a multiprocessing.Pool, so I am having trouble figuring out what the problem is. Here's a minimal reproducing test case.
import matlab
from multiprocessing import Pool
from joblib import Parallel, delayed

# A global object that I would like to be available in the parallel subroutine
x = matlab.double([[0.0]])

def f(i):
    print(i, x)

with Pool(4) as p:
    p.map(f, range(10))
# This prints 1, [[0.0]]\n2, [[0.0]]\n... as expected

for _ in Parallel(4, backend='multiprocessing')(delayed(f)(i) for i in range(10)):
    pass
# This also prints 1, [[0.0]]\n2, [[0.0]]\n... as expected

# Now run with the default `backend='loky'`
for _ in Parallel(4)(delayed(f)(i) for i in range(10)):
    pass
# ^ this crashes.
So, the only problematic one is the one using the 'loky' backend.
The full traceback is:
exception calling callback for <Future at 0x7f63b5a57358 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback:
'''
Traceback (most recent call last):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
call_item = call_queue.get(block=True, timeout=timeout)
File "~/miniconda3/envs/myenv/lib/python3.6/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/mlarray.py", line 31, in <module>
from _internal.mlarray_sequence import _MLArrayMetaClass
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/_internal/mlarray_sequence.py", line 3, in <module>
from _internal.mlarray_utils import _get_strides, _get_size, \
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/_internal/mlarray_utils.py", line 4, in <module>
import matlab
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/__init__.py", line 24, in <module>
from mlarray import double, single, uint8, int8, uint16, \
ImportError: cannot import name 'double'
'''
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
callback(self)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 309, in __call__
self.parallel.dispatch_next()
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 731, in dispatch_next
if not self.dispatch_one_batch(self._original_iterator):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 510, in apply_async
future = self._workers.submit(SafeFunction(func))
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/reusable_executor.py", line 151, in submit
fn, *args, **kwargs)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 1022, in submit
raise self._flags.broken
joblib.externals.loky.process_executor.BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
joblib.externals.loky.process_executor._RemoteTraceback:
'''
Traceback (most recent call last):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
call_item = call_queue.get(block=True, timeout=timeout)
File "~/miniconda3/envs/myenv/lib/python3.6/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/mlarray.py", line 31, in <module>
from _internal.mlarray_sequence import _MLArrayMetaClass
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/_internal/mlarray_sequence.py", line 3, in <module>
from _internal.mlarray_utils import _get_strides, _get_size, \
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/_internal/mlarray_utils.py", line 4, in <module>
import matlab
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/__init__.py", line 24, in <module>
from mlarray import double, single, uint8, int8, uint16, \
ImportError: cannot import name 'double'
'''
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "test.py", line 20, in <module>
for _ in Parallel(4)(delayed(f)(i) for i in range(10)):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 934, in __call__
self.retrieve()
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 833, in retrieve
self._output.extend(job.get(timeout=self.timeout))
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 521, in wrap_future_result
return future.result(timeout=timeout)
File "~/miniconda3/envs/myenv/lib/python3.6/concurrent/futures/_base.py", line 432, in result
return self.__get_result()
File "~/miniconda3/envs/myenv/lib/python3.6/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
callback(self)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 309, in __call__
self.parallel.dispatch_next()
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 731, in dispatch_next
if not self.dispatch_one_batch(self._original_iterator):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 510, in apply_async
future = self._workers.submit(SafeFunction(func))
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/reusable_executor.py", line 151, in submit
fn, *args, **kwargs)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 1022, in submit
raise self._flags.broken
joblib.externals.loky.process_executor.BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
Looking at the traceback, it seems like the root cause is an issue importing the matlab package in the child process.
It's probably worth noting that this all runs just fine if I instead define x = np.array([[0.0]]) (after importing numpy as np). And of course the main process has no problem with any matlab imports, so I am not sure why the child process would.
I'm not sure if this error has anything in particular to do with the matlab package, or if it's something to do with global variables and cloudpickle or loky. In my application it would help to stick with loky, so I'd appreciate any insight!
I should also note that I'm using the official Matlab engine for Python: https://www.mathworks.com/help/matlab/matlab-engine-for-python.html. I suppose that might make it hard for others to try out the test cases, so I wish I could reproduce this error with a type other than matlab.double, but I haven't found another yet.
Digging around more, I've noticed that the process of importing the matlab package is more circular than I would expect, and I'm speculating that this could be part of the problem. The issue is that when import matlab is run by loky's _ForkingPickler, first the file matlab/mlarray.py is imported, which imports some other files, one of which contains import matlab; this causes matlab/__init__.py to be run, which internally has from mlarray import double, single, uint8, ..., and that is the line that causes the crash.
Could this circularity be the issue? If so, why can I import this module in the main process but not in the loky backend?
The error is caused by an incorrect loading order of global objects in the child processes. It can be seen clearly in the traceback
_ForkingPickler.loads(res) -> ... -> import matlab -> from mlarray import ...
that matlab is not yet imported when the global variable x is loaded by cloudpickle.
joblib with loky seems to treat modules as normal global objects and send them dynamically to the child processes. joblib doesn't record the order in which those objects/modules were defined. Therefore they are loaded (initialized) in a random order in the child processes.
A simple workaround is to manually pickle the matlab object and load it after importing matlab inside your function.
import matlab
import pickle

px = pickle.dumps(matlab.double([[0.0]]))

def f(i):
    import matlab
    x = pickle.loads(px)
    print(i, x)
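With this workaround in place, Parallel(4)(delayed(f)(i) for i in range(10)) should run under the default loky backend, because the explicit import matlab inside f guarantees the package is fully initialized before the pickled array is loaded.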
Of course you can also use joblib.dump and joblib.load to serialize the objects.
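For example, a minimal sketch of that variant, assuming an in-memory io.BytesIO buffer is acceptable in place of a temporary file (joblib.dump and joblib.load accept file objects):

import io
import joblib
import matlab

buf = io.BytesIO()
joblib.dump(matlab.double([[0.0]]), buf)  # serialize in the parent process
payload = buf.getvalue()

def f(i):
    import matlab  # make sure the package is fully initialized in the worker
    x = joblib.load(io.BytesIO(payload))
    print(i, x)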
Use initializer
Thanks to the suggestion from @Aaron, you can also use an initializer (for loky) to import matlab before loading x.
Currently there's no simple API to specify initializer. So I wrote a simple function:
def with_initializer(self, f_init):
    # Overwrite initializer hook in the Loky ProcessPoolExecutor
    # https://github.com/tomMoral/loky/blob/f4739e123acb711781e46581d5ed31ed8201c7a9/loky/process_executor.py#L850
    hasattr(self._backend, '_workers') or self.__enter__()
    origin_init = self._backend._workers._initializer
    def new_init():
        origin_init()
        f_init()
    self._backend._workers._initializer = new_init if callable(origin_init) else f_init
    return self
It is a little bit hacky but works well with the current version of joblib and loky.
Then you can use it like:
import matlab
from joblib import Parallel, delayed

x = matlab.double([[0.0]])

def f(i):
    print(i, x)

def _init_matlab():
    import matlab

with Parallel(4) as p:
    for _ in with_initializer(p, _init_matlab)(delayed(f)(i) for i in range(10)):
        pass
I hope the developers of joblib will add an initializer argument to the constructor of Parallel in the future.
I have a list of 80,000 strings that I am running through a discourse parser, and in order to increase the speed of this process I have been trying to use the python multiprocessing package.
The parser code requires Python 2.7, and I am currently running it on a 2-core Ubuntu machine using a subset of the strings. For short lists, e.g. 20 strings, the process runs without an issue on both cores; however, if I run a list of about 100 strings, both workers freeze at different points (so in some cases worker 1 won't stop until a few minutes after worker 2). This happens before all the strings are finished and anything is returned. Each time, the cores stop at the same point when the same mapping function is used, but the points differ if I try a different mapping function, i.e. map vs. map_async vs. imap.
I have tried removing the strings at those indices, which did not have any effect, and those strings run fine in a shorter list. Based on print statements I included, when the process appears to freeze the current iteration seems to finish for the current string; it just does not move on to the next string. It takes about an hour of run time to reach the spot where both workers have frozen, and I have not been able to reproduce the issue in less time. The code involving the multiprocessing commands is:
def main(initial_file, chunksize=2):
    entered_file = pd.read_csv(initial_file)
    entered_file = entered_file.ix[:, 0].tolist()
    pool = multiprocessing.Pool()
    result = pool.map_async(discourse_process, entered_file, chunksize=chunksize)
    pool.close()
    pool.join()
    with open("final_results.csv", 'w') as file:
        writer = csv.writer(file)
        for listitem in result.get():
            writer.writerow([listitem[0], listitem[1]])

if __name__ == '__main__':
    main(sys.argv[1])
When I stop the process with Ctrl-C (which does not always work), the error message I receive is:
^CTraceback (most recent call last):
File "Combined_Script.py", line 94, in <module>
main(sys.argv[1])
File "Combined_Script.py", line 85, in main
pool.join()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 474, in join
p.join()
File "/usr/lib/python2.7/multiprocessing/process.py", line 145, in join
res = self._popen.wait(timeout)
File "/usr/lib/python2.7/multiprocessing/forking.py", line 154, in wait
return self.poll(0)
File "/usr/lib/python2.7/multiprocessing/forking.py", line 135, in poll
pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
Process PoolWorker-1:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 117, in worker
put((job, i, result))
File "/usr/lib/python2.7/multiprocessing/queues.py", line 390, in put
wacquire()
KeyboardInterrupt
^CProcess PoolWorker-2:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 117, in worker
put((job, i, result))
File "/usr/lib/python2.7/multiprocessing/queues.py", line 392, in put
return send(obj)
KeyboardInterrupt
Error in atexit._run_exitfuncs:
Traceback (most recent call last):
File "/usr/lib/python2.7/atexit.py", line 24, in _run_exitfuncs
func(*targs, **kargs)
File "/usr/lib/python2.7/multiprocessing/util.py", line 305, in _exit_function
_run_finalizers(0)
File "/usr/lib/python2.7/multiprocessing/util.py", line 274, in _run_finalizers
finalizer()
File "/usr/lib/python2.7/multiprocessing/util.py", line 207, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 500, in _terminate_pool
outqueue.put(None) # sentinel
File "/usr/lib/python2.7/multiprocessing/queues.py", line 390, in put
wacquire()
KeyboardInterrupt
Error in sys.exitfunc:
Traceback (most recent call last):
File "/usr/lib/python2.7/atexit.py", line 24, in _run_exitfuncs
func(*targs, **kargs)
File "/usr/lib/python2.7/multiprocessing/util.py", line 305, in _exit_function
_run_finalizers(0)
File "/usr/lib/python2.7/multiprocessing/util.py", line 274, in _run_finalizers
finalizer()
File "/usr/lib/python2.7/multiprocessing/util.py", line 207, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 500, in _terminate_pool
outqueue.put(None) # sentinel
File "/usr/lib/python2.7/multiprocessing/queues.py", line 390, in put
wacquire()
KeyboardInterrupt
When I look at the memory in another command window using htop, memory usage is at <3% once the workers freeze. This is my first attempt at parallel processing, and I am not sure what else I might be missing.
I was not able to solve the issue with multiprocessing pool, however I came across the loky package and was able to use it to run my code with the following lines:
executor = loky.get_reusable_executor(timeout=200, kill_workers=True)
results = executor.map(discourse_process, entered_file)
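For context, here is roughly how that executor could slot into the main function from the question. This is only a sketch: it assumes discourse_process and the CSV layout from the question are unchanged, and that loky's default worker count is acceptable.

import csv
import loky
import pandas as pd

def main(initial_file):
    entered_file = pd.read_csv(initial_file)
    entered_file = entered_file.ix[:, 0].tolist()
    # Same executor settings as above; map() yields results in input order.
    executor = loky.get_reusable_executor(timeout=200, kill_workers=True)
    results = executor.map(discourse_process, entered_file)
    with open("final_results.csv", 'w') as out:
        writer = csv.writer(out)
        for listitem in results:
            writer.writerow([listitem[0], listitem[1]])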
You could set a timeout on retrieving the result; if it is not ready in time, an error is raised:
try:
    result.get(timeout=1)
except multiprocessing.TimeoutError:
    print("Error while retrieving the result")
Also, you could check whether your process was successful with:
import time

while True:
    try:
        result.successful()  # raises an error until the result is ready
        break
    except Exception:
        print("Result is not yet successful")
        time.sleep(1)
Finally, checking out https://docs.python.org/2/library/multiprocessing.html is helpful.
I have the following code:
from multiprocessing import Process, Manager, Event

manager = Manager()
shared_Queue = manager.Queue(10)
ev = Event()

def do_this(shared_queue, ev):
    while not ev.is_set():
        if not shared_Queue.__getattribute__('empty')():
            item = shared_queue.get()
            print item
    print 'released!'

subprocs = []
for i in xrange(10):
    subproc = Process(target=do_this, args=(shared_Queue, ev,))
    subprocs.append(subproc)
    subproc.start()
Now, if I run this and ask whether these processes are alive:
for subproc in subprocs: print subproc.is_alive()
of course I get all Trues.
After doing a couple of these (note: there is no error if I don't do them):
shared_Queue.put(3)
shared_Queue.put(5)
Now I want to set the Event to kill all of them using:
ev.set()
But then, instead of seeing 'released!' 10 times, I get a varying number of these prints, and after about 2 to 5 seconds I get a bunch of errors:
released!
released!
released!
released!
released!
released!
released!
Process Process-10:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "<input>", line 10, in do_this
File "<string>", line 2, in get
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/managers.py", line 759, in _callmethod
kind, result = conn.recv()
EOFError
Process Process-5:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "<input>", line 10, in do_this
File "<string>", line 2, in get
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/managers.py", line 759, in _callmethod
kind, result = conn.recv()
EOFError
Process Process-7:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "<input>", line 10, in do_this
File "<string>", line 2, in get
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/managers.py", line 759, in _callmethod
kind, result = conn.recv()
EOFError
Why is it that some processes are unable to recognize the Event set and show up as errors later? Is there a better way to signal them to die?
Thanks for the comment, stovfl; you are right, ev.set() does not kill anything, I was carelessly using that word.
As for the issue I was having, I learned that the multiprocessing Queue is process- and thread-safe, meaning my process will halt just before writing something into the Queue if the Queue is already full.
If I try to set the event while some of the processes are still waiting to write something to the full Queue, they will not recognize that the event is set.
The key was to empty the Queue, let the processes write to it, and then get back to the first line where they can check on the event!
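For illustration, here is a sketch of a worker loop that avoids the problem of a worker blocking forever on the queue (my own suggestion, not the original code): a timed get() returns control regularly, so the worker always comes back around to check the event. It keeps the Python 2 style of the question; on Python 3 the import would be from queue import Empty.

from Queue import Empty  # Python 2 name for the queue-empty exception

def do_this(shared_queue, ev):
    while not ev.is_set():
        try:
            # Wait at most half a second, then re-check the event.
            item = shared_queue.get(timeout=0.5)
        except Empty:
            continue
        print item
    print 'released!'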
First of all, I know there are quite a few threads about multiprocessing in Python already, but none of them seems to solve my problem.
Here is my problem:
I want to implement Random Forest Algorithm, and a naive way to do so would be like this:
def random_tree(Data):
    tree = calculation(Data)
    forest.append(tree)

forest = list()
for i in range(300):
    random_tree(Data)
And the forest with 300 "trees" inside would be my final result. In this case, how do I turn this code into a multiprocessing version?
Update:
I just tried Mukund M K's method, in a very simplified script:
from multiprocessing import Pool
import numpy as np

def f(x):
    return 2*x

data = np.array([1,2,5])
pool = Pool(processes=4)
forest = pool.map(f, (data for i in range(4)))
# I use range() instead of xrange() because I am using Python 3.4
And now... the script just runs forever. I open a Python shell and enter the script line by line, and these are the messages I get:
Process SpawnPoolWorker-1:
Process SpawnPoolWorker-2:
Traceback (most recent call last):
Process SpawnPoolWorker-3:
Traceback (most recent call last):
Process SpawnPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
File "E:\Anaconda3\lib\multiprocessing\process.py", line 254, in _bootstrap
self.run()
File "E:\Anaconda3\lib\multiprocessing\process.py", line 254, in _bootstrap
self.run()
File "E:\Anaconda3\lib\multiprocessing\process.py", line 254, in _bootstrap
self.run()
File "E:\Anaconda3\lib\multiprocessing\process.py", line 254, in _bootstrap
self.run()
File "E:\Anaconda3\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "E:\Anaconda3\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "E:\Anaconda3\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "E:\Anaconda3\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "E:\Anaconda3\lib\multiprocessing\pool.py", line 108, in worker
task = get()
File "E:\Anaconda3\lib\multiprocessing\pool.py", line 108, in worker
task = get()
File "E:\Anaconda3\lib\multiprocessing\pool.py", line 108, in worker
task = get()
File "E:\Anaconda3\lib\multiprocessing\pool.py", line 108, in worker
task = get()
File "E:\Anaconda3\lib\multiprocessing\queues.py", line 357, in get
return ForkingPickler.loads(res)
File "E:\Anaconda3\lib\multiprocessing\queues.py", line 357, in get
return ForkingPickler.loads(res)
AttributeError: Can't get attribute 'f' on
AttributeError: Can't get attribute 'f' on
File "E:\Anaconda3\lib\multiprocessing\queues.py", line 357, in get
return ForkingPickler.loads(res)
AttributeError: Can't get attribute 'f' on
File "E:\Anaconda3\lib\multiprocessing\queues.py", line 357, in get
return ForkingPickler.loads(res)
AttributeError: Can't get attribute 'f' on
Update: I edited my sample code according to some other example code like this:
from multiprocessing import Pool
import numpy as np

def f(x):
    return 2*x

if __name__ == '__main__':
    data = np.array([1,2,3])
    with Pool(5) as p:
        result = p.map(f, (data for i in range(300)))
And it works now. What I need to do now is fill this in with a more sophisticated algorithm.
Yet another question on my mind is: why does this code work, while the previous version didn't?
You can do it with multiprocessing this way:
from multiprocessing import Pool

def random_tree(Data):
    return calculation(Data)

pool = Pool(processes=4)
forest = pool.map(random_tree, (Data for i in range(300)))
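One caveat, as the asker's update above illustrates: with the spawn start method (Windows, or code typed into an interactive shell), the pool creation and the map call need to sit behind an if __name__ == '__main__': guard, and random_tree must be defined at module level in an importable file. A sketch of the same answer with the guard, still assuming calculation and Data come from your own code:

from multiprocessing import Pool

def random_tree(Data):
    return calculation(Data)

if __name__ == '__main__':
    pool = Pool(processes=4)
    forest = pool.map(random_tree, (Data for i in range(300)))
    pool.close()
    pool.join()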
The processing package might help you. Check it out here.
I've never used the multiprocessing library before, so all advice is welcome.
I've got a Python program that uses the multiprocessing library to do some memory-intensive tasks in multiple processes, which occasionally runs out of memory (I'm working on optimizations, but that's not what this question is about). Sometimes, an out-of-memory error gets thrown in a way that I can't seem to catch (output below), and then the program hangs on pool.join() (I'm using multiprocessing.Pool). How can I make the program do something other than wait indefinitely when this problem occurs?
Ideally, the memory error is propagated back to the main process, which then dies.
Here's the memory error:
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib64/python2.7/threading.py", line 811, in __bootstrap_inner
self.run()
File "/usr/lib64/python2.7/threading.py", line 764, in run
self.__target(*self.__args, **self.__kwargs)
File "/usr/lib64/python2.7/multiprocessing/pool.py", line 325, in _handle_workers
pool._maintain_pool()
File "/usr/lib64/python2.7/multiprocessing/pool.py", line 229, in _maintain_pool
self._repopulate_pool()
File "/usr/lib64/python2.7/multiprocessing/pool.py", line 222, in _repopulate_pool
w.start()
File "/usr/lib64/python2.7/multiprocessing/process.py", line 130, in start
self._popen = Popen(self)
File "/usr/lib64/python2.7/multiprocessing/forking.py", line 121, in __init__
self.pid = os.fork()
OSError: [Errno 12] Cannot allocate memory
And here's where I manage the multiprocessing:
mp_pool = mp.Pool(processes=num_processes)
mp_results = list()
for datum in input_data:
    data_args = {
        'value': 0  # actually some other simple dict key/values
    }
    mp_results.append(mp_pool.apply_async(_process_data, args=(common_args, data_args)))

mp_pool.close()
mp_pool.join()  # hangs here when that thread dies..

for result_async in mp_results:
    result = result_async.get()
    # do stuff to collect results

# rest of the code
When I interrupt the hanging program, I get:
Process process_003:
Traceback (most recent call last):
File "/opt/rh/python27/root/usr/lib64/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/opt/rh/python27/root/usr/lib64/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "/opt/rh/python27/root/usr/lib64/python2.7/multiprocessing/pool.py", line 102, in worker
task = get()
File "/opt/rh/python27/root/usr/lib64/python2.7/multiprocessing/queues.py", line 374, in get
return recv()
racquire()
KeyboardInterrupt
This is actually a known bug in Python's multiprocessing module, fixed in Python 3 (here's a summarizing blog post I found). There's a patch attached to Python issue 22393, but it hasn't been officially applied.
Basically, if one of a multiprocessing pool's sub-processes dies unexpectedly (out of memory, killed externally, etc.), the pool will wait indefinitely.
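For illustration, here is a sketch of one way to get the failure surfaced instead of a hang on Python 3: concurrent.futures.ProcessPoolExecutor marks the pool as broken and raises BrokenProcessPool when a worker dies abruptly. This is a workaround suggestion of mine, not the patch referenced above; the worker below kills itself to simulate the OOM killer (POSIX only).

import os
import signal
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.process import BrokenProcessPool

def die(_):
    # Simulate a worker being killed abruptly (e.g. by the OOM killer).
    os.kill(os.getpid(), signal.SIGKILL)

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=2) as ex:
        try:
            list(ex.map(die, range(4)))
        except BrokenProcessPool:
            print("A worker died; the main process can clean up and exit.")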