Error pickling a `matlab` object in joblib `Parallel` context - python

I'm running some Matlab code in parallel from inside a Python context (I know, but that's what's going on), and I'm hitting an import error involving matlab.double. The same code works fine in a multiprocessing.Pool, so I am having trouble figuring out what the problem is. Here's a minimal reproducing test case.
import matlab
from multiprocessing import Pool
from joblib import Parallel, delayed
# A global object that I would like to be available in the parallel subroutine
x = matlab.double([[0.0]])
def f(i):
print(i, x)
with Pool(4) as p:
p.map(f, range(10))
# This prints 1, [[0.0]]\n2, [[0.0]]\n... as expected
for _ in Parallel(4, backend='multiprocessing')(delayed(f)(i) for i in range(10)):
pass
# This also prints 1, [[0.0]]\n2, [[0.0]]\n... as expected
# Now run with default `backend='loky'`
for _ in Parallel(4)(delayed(f)(i) for i in range(10)):
pass
# ^ this crashes.
So, the only problematic one is the one using the 'loky' backend.
The full traceback is:
exception calling callback for <Future at 0x7f63b5a57358 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback:
'''
Traceback (most recent call last):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
call_item = call_queue.get(block=True, timeout=timeout)
File "~/miniconda3/envs/myenv/lib/python3.6/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/mlarray.py", line 31, in <module>
from _internal.mlarray_sequence import _MLArrayMetaClass
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/_internal/mlarray_sequence.py", line 3, in <module>
from _internal.mlarray_utils import _get_strides, _get_size, \
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/_internal/mlarray_utils.py", line 4, in <module>
import matlab
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/__init__.py", line 24, in <module>
from mlarray import double, single, uint8, int8, uint16, \
ImportError: cannot import name 'double'
'''
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
callback(self)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 309, in __call__
self.parallel.dispatch_next()
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 731, in dispatch_next
if not self.dispatch_one_batch(self._original_iterator):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 510, in apply_async
future = self._workers.submit(SafeFunction(func))
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/reusable_executor.py", line 151, in submit
fn, *args, **kwargs)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 1022, in submit
raise self._flags.broken
joblib.externals.loky.process_executor.BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
joblib.externals.loky.process_executor._RemoteTraceback:
'''
Traceback (most recent call last):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
call_item = call_queue.get(block=True, timeout=timeout)
File "~/miniconda3/envs/myenv/lib/python3.6/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/mlarray.py", line 31, in <module>
from _internal.mlarray_sequence import _MLArrayMetaClass
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/_internal/mlarray_sequence.py", line 3, in <module>
from _internal.mlarray_utils import _get_strides, _get_size, \
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/_internal/mlarray_utils.py", line 4, in <module>
import matlab
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/matlab/__init__.py", line 24, in <module>
from mlarray import double, single, uint8, int8, uint16, \
ImportError: cannot import name 'double'
'''
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "test.py", line 20, in <module>
for _ in Parallel(4)(delayed(f)(i) for i in range(10)):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 934, in __call__
self.retrieve()
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 833, in retrieve
self._output.extend(job.get(timeout=self.timeout))
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 521, in wrap_future_result
return future.result(timeout=timeout)
File "~/miniconda3/envs/myenv/lib/python3.6/concurrent/futures/_base.py", line 432, in result
return self.__get_result()
File "~/miniconda3/envs/myenv/lib/python3.6/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
callback(self)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 309, in __call__
self.parallel.dispatch_next()
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 731, in dispatch_next
if not self.dispatch_one_batch(self._original_iterator):
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 510, in apply_async
future = self._workers.submit(SafeFunction(func))
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/reusable_executor.py", line 151, in submit
fn, *args, **kwargs)
File "~/miniconda3/envs/myenv/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 1022, in submit
raise self._flags.broken
joblib.externals.loky.process_executor.BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
Looking at the traceback, it seems like the root cause is an issue importing the matlab package in the child process.
It's probably worth noting that this all runs just fine if instead I had defined x = np.array([[0.0]]) (after importing numpy as np). And of course the main process has no problem with any matlab imports, so I am not sure why the child process would.
I'm not sure if this error has anything in particular to do with the matlab package, or if it's something to do with global variables and cloudpickle or loky. In my application it would help to stick with loky, so I'd appreciate any insight!
I should also note that I'm using the official Matlab engine for Python: https://www.mathworks.com/help/matlab/matlab-engine-for-python.html. I suppose that might make it hard for others to try out the test cases, so I wish I could reproduce this error with a type other than matlab.double, but I haven't found another yet.
Digging around more, I've noticed that the process of importing the matlab package is more circular than I would expect, and I'm speculating that this could be part of the problem? The issue is that when import matlab is run by loky's _ForkingPickler, first some file matlab/mlarray.py is imported, which imports some other files, one of which contains import matlab, and this causes matlab/__init__.py to be run, which internally has from mlarray import double, single, uint8, ... which is the line that causes the crash.
Could this circularity be the issue? If so, why can I import this module in the main process but not in the loky backend?

The error is caused by incorrect loading order of global objects in the child processes. It can be seen clearly in the traceback
_ForkingPickler.loads(res) -> ... -> import matlab -> from mlarray import ...
that matlab is not yet imported when the global variable x is loaded by cloudpickle.
joblib with loky seems to treat modules as normal global objects and send them dynamically to the child processes. joblib doesn't record the order in which those objects/modules were defined. Therefore they are loaded (initialized) in a random order in the child processes.
A simple workaround is to manually pickle the matlab object and load it after importing matlab inside your function.
import matlab
import pickle
px = pickle.dumps(matlab.double([[0.0]]))
def f(i):
import matlab
x=pickle.loads(px)
print(i, x)
Of course you can also use the joblib.dumps and loads to serialize the objects.
Use initializer
Thanks to the suggestion of #Aaron, you can also use an initializer (for loky) to import Matlab before loading x.
Currently there's no simple API to specify initializer. So I wrote a simple function:
def with_initializer(self, f_init):
# Overwrite initializer hook in the Loky ProcessPoolExecutor
# https://github.com/tomMoral/loky/blob/f4739e123acb711781e46581d5ed31ed8201c7a9/loky/process_executor.py#L850
hasattr(self._backend, '_workers') or self.__enter__()
origin_init = self._backend._workers._initializer
def new_init():
origin_init()
f_init()
self._backend._workers._initializer = new_init if callable(origin_init) else f_init
return self
It is a little bit hacky but works well with the current version of joblib and loky.
Then you can use it like:
import matlab
from joblib import Parallel, delayed
x = matlab.double([[0.0]])
def f(i):
print(i, x)
def _init_matlab():
import matlab
with Parallel(4) as p:
for _ in with_initializer(p, _init_matlab)(delayed(f)(i) for i in range(10)):
pass
I hope the developers of joblib will add initializer argument to the constructor of Parallel in the future.

Related

Multiprocessing; How to debug: _pickle.PicklingError: Could not pickle object as excessively deep recursion required

I have a simulation which I can run using Python code and want to create multiple instances of it using a SubProcVecEnv from stable-baselines3. This uses subprocessing to run the simulations on different cores and it was working before I made a number of changes to my code. However, now I receive the error below and do not know how to debug it, because I don't understand which part of my code is causing it. Is there a way to find out which object/ method is causing the recursion depth being exceeded? I also do not remember writing a recursive method anywhere in my code. Researching the error message was not successful.
/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/gym/logger.py:30: UserWarning: WARN: Box bound precision lowered by casting to float32
Traceback (most recent call last):
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py", line 563, in dump
return Pickler.dump(self, obj)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py", line 639, in reducer_override
if sys.version_info[:2] < (3, 7) and _is_parametrized_type_hint(obj): # noqa # pragma: no branch
RecursionError: maximum recursion depth exceeded in comparison
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/philipp/Code/ba_pw/train.py", line 84, in <module>
venv = utils.make_venv(env_class, network, params, remote_ports, monitor_log_dir)
File "/home/philipp/Code/ba_pw/sumo_rl/utils/utils.py", line 170, in make_venv
return vec_env.SubprocVecEnv(env_fs)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 106, in __init__
process.start()
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/multiprocessing/context.py", line 291, in _Popen
return Popen(process_obj)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/multiprocessing/popen_forkserver.py", line 35, in __init__
super().__init__(process_obj)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/multiprocessing/popen_forkserver.py", line 47, in _launch
reduction.dump(process_obj, buf)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/multiprocessing/reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/stable_baselines3/common/vec_env/base_vec_env.py", line 372, in __getstate__
return cloudpickle.dumps(self.var)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py", line 73, in dumps
cp.dump(obj)
File "/home/philipp/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py", line 570, in dump
raise pickle.PicklingError(msg) from e
_pickle.PicklingError: Could not pickle object as excessively deep recursion required.
I finally figured out a solution using the answer to this question:
I looks like the object which I want to pickle has too many layers. I called:
sys.setrecursionlimit(3000)
and now it works.

Multiprocessing using Pool class in Python giving Pickling error

I am trying to run a simple multiprocessing example in python3.6 in a zeppelin notebook(in windows) but I am not able to execute it. Below is the code that i used:
def sqrt(x):
return x**0.5
numbers = [i for i in range(1000000)]
with Pool() as pool:
sqrt_ls = pool.map(sqrt, numbers)
After running this code I am getting the following error:
Traceback (most recent call last):
File "/tmp/zeppelin_python-3196160128578820301.py", line 315, in <module>
exec(code, _zcUserQueryNameSpace)
File "<stdin>", line 6, in <module>
File "/usr/lib64/python3.6/multiprocessing/pool.py", line 266, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/lib64/python3.6/multiprocessing/pool.py", line 644, in get
raise self._value
File "/usr/lib64/python3.6/multiprocessing/pool.py", line 424, in _handle_tasks
put(task)
File "/usr/lib64/python3.6/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/usr/lib64/python3.6/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function sqrt at 0x7f6f84f1a620>: attribute lookup sqrt on __main__ failed
I am not sure if its just me who is facing the issue. As i have seen so many articles where people can run the code easily. If you know the solution please help
Thanks
From the multiprocessing documentation:
Note: Functionality within this package requires that the main module be importable by the children. This is covered in Programming guidelines however it is worth pointing out here. This means that some examples, such as the Pool examples will not work in the interactive interpreter.
Notebooks are running Python interactive interpreters behind the scene, so it's probably why you get this error. You can try to run your code from within a if __name__ == '__main__': statement.
A Zeppelin notebook does not emulate a normal module well enough to support the pickling that is used to identify the correct operation to another process. You can put all the functions you want to call into a proper module that you import in the usual fashion.

How to run decorated function in a separate and terminatable process?

I am dealing with an existing test suite, where we want to implement a timeout functionality, which will cause a hanging test to time out and then move on with its regular teardown/cleanup.
I am toying with the idea of running each test in a process, which I can terminate after e.g. a timeout of 3 seconds. Ideally, I don't want to modify the test cases and instead just add a decorator indicating the test is affected by this timeout behavior.
This is what I have, a minimal example:
import multiprocessing
import sys
from time import sleep
def timeout(func):
def wrapper():
proc = multiprocessing.Process(target=func)
proc.start()
sleep(3)
proc.terminate()
return wrapper
#timeout
def my_test():
while True:
sleep(1)
if __name__ == "__main__":
my_test()
But for some reason, it seems pickle cannot deal with this and the decorator somehow messes up the reference to the function, as this error is hit:
$ python multiproc.py
2019-11-07 07:34:37.098 | DEBUG | __main__:wrapper:13 - In wrapper
Traceback (most recent call last):
File "multiproc.py", line 30, in <module>
my_test()
File "multiproc.py", line 15, in wrapper
proc.start()
File "C:\Python36\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Python36\lib\multiprocessing\context.py", line 223, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Python36\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "C:\Python36\lib\multiprocessing\popen_spawn_win32.py", line 65, in __init__
reduction.dump(process_obj, to_child)
File "C:\Python36\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function my_test at 0x000001C0A7E87400>: it's not the same object as __main__.my_test
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Python36\lib\multiprocessing\spawn.py", line 99, in spawn_main
new_handle = reduction.steal_handle(parent_pid, pipe_handle)
File "C:\Python36\lib\multiprocessing\reduction.py", line 82, in steal_handle
_winapi.PROCESS_DUP_HANDLE, False, source_pid)
OSError: [WinError 87] The parameter is incorrect
Does anyone have an idea if this can be solved without modifying the existing test case?

assert group is none

I am trying to run a preprocessing pipeline using nipype and I get the following error message:
Traceback (most recent call last):
File "preprocscript.py", line 211, in <module>
preproc.run('MultiProc', plugin_args={'n_procs': 8})
File "/sw/anaconda/3/lib/python3.6/site-packages/nipype/pipeline/engine/workflows.py", line 579, in run
runner = plugin_mod(plugin_args=plugin_args)
File "/sw/anaconda/3/lib/python3.6/site-packages/nipype/pipeline/plugins/multiproc.py", line 162, in __init__
initargs=(self._cwd,)
File "/sw/anaconda/3/lib/python3.6/multiprocessing/pool.py", line 175, in __init__
self._repopulate_pool()
File "/sw/anaconda/3/lib/python3.6/multiprocessing/pool.py", line 236, in _repopulate_pool
self._wrap_exception)
File "/sw/anaconda/3/lib/python3.6/multiprocessing/pool.py", line 250, in _repopulate_pool_static
wrap_exception)
File "/sw/anaconda/3/lib/python3.6/multiprocessing/process.py", line 73, in __init__
assert group is None, 'group argument must be None for now'
AssertionError: group argument must be None for now
and I am not sure what exactly might be wrong in my code that leads to this or if this is an issue with my software.
I am on a linux system and use python 3.6.
The module you are using has a ProcessPoolExecuter being used in it. In Python 3.7 they added some additional arguments to that class, namely initargs which is what is being called in nipype multiprocess module you are using. Unfortunately it is not backwards compatible to 3.6 and they did not write in another way to use that module.
Your options are to upgrade or not use the multiprocessing portion of nipype.

Generic function typing in Python

I am running under Python 3.7 on Linux Ubuntu 18.04 under Eclipse 4.8 and Pydev.
The declaration:
args:Dict[str: Optional[Any]] = {}
is in a module that is imported from my testing code. and it is flagged with the following error message from typing.py:
TypeError: Parameters to generic types must be types. Got slice(<class 'str'>, typing.Union[typing.Any, NoneType], None). The stack trace follows: Finding files... done. Importing test modules ... Traceback (most recent call last): File "/Data/WiseOldBird/Eclipse/pool/plugins/org.python.pydev.core_7.0.3.201811082356/pysrc/_pydev_runfiles/pydev_runfiles.py", line 468, in __get_module_from_str
mod = __import__(modname) File "/Data/WiseOldBird/Workspaces/WikimediaAccess/WikimediaLoader/Tests/wiseoldbird/loaders/TestWikimediaLoader.py", line 10, in <module>
from wiseoldbird.application_controller import main File "/Data/WiseOldBird/Workspaces/WikimediaAccess/WikimediaLoader/src/wiseoldbird/application_controller.py", line 36, in <module>
args:Dict[str: Optional[Any]] = {} File "/usr/local/lib/python3.7/typing.py", line 251, in inner
return func(*args, **kwds) File "/usr/local/lib/python3.7/typing.py", line 626, in __getitem__
params = tuple(_type_check(p, msg) for p in params) File "/usr/local/lib/python3.7/typing.py", line 626, in <genexpr>
params = tuple(_type_check(p, msg) for p in params) File "/usr/local/lib/python3.7/typing.py", line 139, in _type_check
raise TypeError(f"{msg} Got {arg!r:.100}.") TypeError: Parameters
This prevents my testing module from being imported.
What am I doing wrong?
The proper syntax for a dict's type is
Dict[str, Optional[Any]]
When you write [a: b], Python interprets this as a slice, i.e. the thing that makes taking parts of arrays work, like a[1:10]. You can see this in the error message: Got slice(<class 'str'>, typing.Union[typing.Any, NoneType], None).

Categories

Resources