I have a function that I'm trying to run in parallel. The function is of the form:
def parallelFunc(curUser):
    if curUser in neighbors.getUsers():  # neighbors is a global object of a class
        userData = createData.createData(inpMat1, inpMat2, inpMat3, neighbors.getNeighbors(curUser))
        # inpMatX are numpy arrays/scipy sparse arrays/lists with global scope
        return userData  # tried returning a double value too, get the same error
    else:
        return 0

pp = Pool(4)  # tried with different values
ret = pp.map(parallelFunc, userList)
When I try running this, I get the following error:
ValueError: invalid literal for float(): 1.235443508738e
The error is raised in multiprocessing/pool.pyc. I'm doing this in an IPython notebook. Any ideas as to why this would happen?
Stack trace:
ValueError Traceback (most recent call last)
<ipython-input-99-2731048b72e2> in <module>()
3
4 #st = time.time()
----> 5 ret = pp.map(parallelFunc, userList)
6 #ft = time.time()
7
/opt/Anaconda/lib/python2.7/multiprocessing/pool.pyc in map(self, func, iterable, chunksize)
249 '''
250 assert self._state == RUN
--> 251 return self.map_async(func, iterable, chunksize).get()
252
253 def imap(self, func, iterable, chunksize=1):
/opt/Anaconda/lib/python2.7/multiprocessing/pool.pyc in get(self, timeout)
565 return self._value
566 else:
--> 567 raise self._value
568
569 def _set(self, i, obj):
ValueError: invalid literal for float(): 1.34716296703.978260894942e+06
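Since parallelFunc leans entirely on globals (neighbors, the inpMatX objects), one possible reading of the spliced number in the traceback (1.34716296703.978260894942e+06 looks like two floats run together) is that some of that global state wraps a shared resource, such as an open file handle inherited by the forked workers, whose reads then interleave. That is only an assumption, since the question does not show where the numbers are parsed. Below is a minimal sketch of the initializer pattern that gives each worker its own copy of the state; load_data and the file path are placeholders, not names from the question:

def _init_worker(path):
    # runs once in each worker process, so every worker opens and parses
    # its own copy instead of sharing a handle opened in the parent
    global _worker_state
    _worker_state = load_data(path)  # load_data and path are hypothetical

def parallelFunc(curUser):
    # use _worker_state here instead of objects created in the parent process
    ...

pp = Pool(4, initializer=_init_worker, initargs=('input_data.dat',))
ret = pp.map(parallelFunc, userList)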
I have the following class (shortened for concision), which standardises SMILES strings upon instantiation. I have been trying to speed up the process by using all my CPUs through the multiprocessing package in Python 3.7.4.
class Standardiser(object):
    def __call__(self):
        return self.prepare_dataset()

    def __init__(self, DataFrame):
        self.DataFrame = DataFrame
        self.standardiser = mv.Standardizer()
        self.salt_remover = SaltRemover()
        self.accepted_atoms = ['H', 'C', 'N', 'O', 'F', 'S', 'Cl', 'Br', 'I', 'P']
        self.pool = mp.Pool(processes=mp.cpu_count())

    def prepare_dataset(self, standardise=True, remove_charge=False):
        standard_smiles = []
        if standardise:
            standardised_smiles = [self.pool.apply_async(self.standardise_compound, args=(x,)).get()
                                   for x in self.DataFrame['Molecule']]
        DataFrame = pd.concat([self.DataFrame[['Activity', 'Molecule']], pd.Series(standardised_smiles)], axis=1)
        return DataFrame

    def standardise_compound(self, mol, min_heavy_atoms=0, max_heavy_atoms=50, max_len=150, remove_charge=False):
        try:
            # selected_fragment is produced by code elided in this shortened version
            if selected_fragment is None:
                return None
            if remove_charge:
                mol = remove_charge_mol(selected_fragment)
            if min_heavy_atoms <= mol.GetNumHeavyAtoms() <= max_heavy_atoms:
                smiles = Chem.MolToSmiles(selected_fragment, isomericSmiles=False, canonical=True)
                if len(smiles) <= max_len:
                    return smiles
        except Exception as e:
            print(e)
I instantiate it with the relevant DataFrame and then call it, but I get the following error:
NotImplementedError Traceback (most recent call last)
<ipython-input-60-1c181cd43d85> in <module>()
1 standardise = Standardiser(df[:100])
----> 2 dff = standardise()
3 dff.head()
<ipython-input-59-a6677d6c7724> in __call__(self)
4
5 def __call__(self):
----> 6 return self.prepare_dataset()
7
8 def __init__(self, DataFrame):
<ipython-input-59-a6677d6c7724> in prepare_dataset(self, standardise, remove_charge)
22
---> 23 standardised_smiles = [self.pool.apply(self.standardise_compound, args = (x,)).get() for x in self.DataFrame['Molecule']]
24
25 DataFrame = pd.concat([self.DataFrame[['Activity','Molecule']], pd.Series(standardised_smiles)], axis = 1)
<ipython-input-59-a6677d6c7724> in <listcomp>(.0)
22
---> 23 standardised_smiles = [self.pool.apply(self.standardise_compound, args = (x,)).get() for x in self.DataFrame['Molecule']]
24
25 DataFrame = pd.concat([self.DataFrame[['Activity','Molecule']], pd.Series(standardised_smiles)], axis = 1)
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in apply(self, func, args, kwds)
257 '''
258 assert self._state == RUN
--> 259 return self.apply_async(func, args, kwds).get()
260
261 def map(self, func, iterable, chunksize=None):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/connection.py in send(self, obj)
204 self._check_closed()
205 self._check_writable()
--> 206 self._send_bytes(_ForkingPickler.dumps(obj))
207
208 def recv_bytes(self, maxlength=None):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/reduction.py in dumps(cls, obj, protocol)
49 def dumps(cls, obj, protocol=None):
50 buf = io.BytesIO()
---> 51 cls(buf, protocol).dump(obj)
52 return buf.getbuffer()
53
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in __reduce__(self)
526 def __reduce__(self):
527 raise NotImplementedError(
--> 528 'pool objects cannot be passed between processes or pickled'
529 )
530
NotImplementedError: pool objects cannot be passed between processes or pickled
There is no explicit pickling happening anywhere in the class, so I was wondering whether there is a problem with the way the multiprocessing is implemented.
EDIT
I have converted the standardise_compound function into a @classmethod and the error thrown has changed to:
standardise_mol() missing 1 required positional argument: 'mol'
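For what it's worth, pickling does happen here even without an explicit pickle call: pool.apply_async(self.standardise_compound, ...) ships the bound method to the workers, and a bound method drags its whole instance along, including self.pool, which is exactly what raises the NotImplementedError. Below is a hedged sketch of one way around it, keeping the Pool off the instance and the worker at module level; the worker body is elided, mirroring the shortened class above:

import multiprocessing as mp

def standardise_compound(mol):
    # module-level worker: pickling it does not capture any instance state
    ...  # standardisation logic elided, as in the shortened class above

class Standardiser(object):
    def __init__(self, DataFrame):
        self.DataFrame = DataFrame
        # deliberately no self.pool attribute: the instance stays picklable

    def prepare_dataset(self):
        # the Pool lives only in this local scope and is never pickled
        with mp.Pool(processes=mp.cpu_count()) as pool:
            return pool.map(standardise_compound, list(self.DataFrame['Molecule']))

As a side note, apply_async(...).get() inside a list comprehension also waits for each result before submitting the next, so nothing actually runs in parallel; pool.map submits the whole batch at once.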
I want to use ggplot2 within Jupyter Notebook. However, when I try to make an R magic cell and introduce a variable, I get an error.
Here is the code (one paragraph indicates one cell):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import rpy2
%matplotlib inline

from rpy2.robjects import pandas2ri
pandas2ri.activate()
%load_ext rpy2.ipython

%%R
library(ggplot2)

data = pd.read_csv('train_titanic.csv')

%%R -i data -w 900 -h 480 -u px
With this last cell, I get the following error (including the traceback):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasdataframe(obj)
54 try:
---> 55 od[name] = conversion.py2rpy(values)
56 except Exception as e:
~/anaconda3/envs/catenv/lib/python3.7/functools.py in wrapper(*args, **kw)
839
--> 840 return dispatch(args[0].__class__)(*args, **kw)
841
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasseries(obj)
125 if type(x) is not homogeneous_type:
--> 126 raise ValueError('Series can only be of one type, or None.')
127 # TODO: Could this be merged with obj.type.name == 'O' case above ?
ValueError: Series can only be of one type, or None.
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_object(cls, obj)
367 try:
--> 368 mv = memoryview(obj)
369 res = cls.from_memoryview(mv)
TypeError: memoryview: a bytes-like object is required, not 'Series'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-14-75e210679e4a> in <module>
----> 1 get_ipython().run_cell_magic('R', '-i data -w 900 -h 480 -u px', '\n\n')
~/anaconda3/envs/catenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2360 with self.builtin_trap:
2361 args = (magic_arg_s, cell)
-> 2362 result = fn(*args, **kwargs)
2363 return result
2364
</home/morgan/anaconda3/envs/catenv/lib/python3.7/site-packages/decorator.py:decorator-gen-130> in R(self, line, cell, local_ns)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/ipython/rmagic.py in R(self, line, cell, local_ns)
721 raise NameError("name '%s' is not defined" % input)
722 with localconverter(converter) as cv:
--> 723 ro.r.assign(input, val)
724
725 tmpd = self.setup_graphics(args)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in __call__(self, *args, **kwargs)
190 kwargs[r_k] = v
191 return (super(SignatureTranslatedFunction, self)
--> 192 .__call__(*args, **kwargs))
193
194
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in __call__(self, *args, **kwargs)
111
112 def __call__(self, *args, **kwargs):
--> 113 new_args = [conversion.py2rpy(a) for a in args]
114 new_kwargs = {}
115 for k, v in kwargs.items():
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in <listcomp>(.0)
111
112 def __call__(self, *args, **kwargs):
--> 113 new_args = [conversion.py2rpy(a) for a in args]
114 new_kwargs = {}
115 for k, v in kwargs.items():
~/anaconda3/envs/catenv/lib/python3.7/functools.py in wrapper(*args, **kw)
838 '1 positional argument')
839
--> 840 return dispatch(args[0].__class__)(*args, **kw)
841
842 funcname = getattr(func, '__name__', 'singledispatch function')
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasdataframe(obj)
59 'The error is: %s'
60 % (name, str(e)))
---> 61 od[name] = StrVector(values)
62
63 return DataFrame(od)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/vectors.py in __init__(self, obj)
382
383 def __init__(self, obj):
--> 384 super().__init__(obj)
385 self._add_rops()
386
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in __init__(self, obj)
286 super().__init__(obj)
287 elif isinstance(obj, collections.abc.Sized):
--> 288 super().__init__(type(self).from_object(obj).__sexp__)
289 else:
290 raise TypeError('The constructor must be called '
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_object(cls, obj)
370 except (TypeError, ValueError):
371 try:
--> 372 res = cls.from_iterable(obj)
373 except ValueError:
374 msg = ('The class methods from_memoryview() and '
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _(*args, **kwargs)
26 def _cdata_res_to_rinterface(function):
27 def _(*args, **kwargs):
---> 28 cdata = function(*args, **kwargs)
29 # TODO: test cdata is of the expected CType
30 return _cdata_to_rinterface(cdata)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_iterable(cls, iterable, populate_func)
317 if populate_func is None:
318 cls._populate_r_vector(iterable,
--> 319 r_vector)
320 else:
321 populate_func(iterable, r_vector)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _populate_r_vector(cls, iterable, r_vector)
300 r_vector,
301 cls._R_SET_VECTOR_ELT,
--> 302 cls._CAST_IN)
303
304 @classmethod
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _populate_r_vector(iterable, r_vector, set_elt, cast_value)
237 def _populate_r_vector(iterable, r_vector, set_elt, cast_value):
238 for i, v in enumerate(iterable):
--> 239 set_elt(r_vector, i, cast_value(v))
240
241
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _as_charsxp_cdata(x)
430 return x.__sexp__._cdata
431 else:
--> 432 return conversion._str_to_charsxp(x)
433
434
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _str_to_charsxp(val)
118 s = rlib.R_NaString
119 else:
--> 120 cchar = _str_to_cchar(val)
121 s = rlib.Rf_mkCharCE(cchar, _CE_UTF8)
122 return s
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _str_to_cchar(s, encoding)
97 def _str_to_cchar(s, encoding: str = 'utf-8'):
98 # TODO: use isStrinb and installTrChar
---> 99 b = s.encode(encoding)
100 return ffi.new('char[]', b)
101
AttributeError: 'float' object has no attribute 'encode'
So it seems I cannot even start an R magic cell that imports my pandas DataFrame. However, I have tried creating R vectors inside the cell, and I can plot these using ggplot2 with no issues.
I am using Python 3.7.6, rpy2 3.1.0, and jupyter-notebook 6.0.3 on Ubuntu 18.04.2 LTS under Windows Subsystem for Linux.
The problem is most likely that one (or more) of the columns holds more than one type, which makes it impossible to transfer the data into an R vector (an R vector can hold only one data type). The traceback may be overwhelming, but here is the relevant part:
ValueError: Series can only be of one type, or None.
Which column is it? That is difficult to say without looking at the dataset you load, but my general solution is to check the types in the columns:
types = data.applymap(type).apply(set)
types[types.apply(len) > 1]
Anything returned by the snippet above would be a candidate culprit. There are many different ways of dealing with the problem, depending on the exact nature of the data. Workarounds that I frequently use include:
- calling data = data.infer_objects() - helps if pandas did not catch up with a dtype change and still stores the data as (suboptimal) Python objects
- filling NaN with an empty string or a string constant if you have missing values in a string column (e.g. str_columns = str_columns.fillna(''))
- dates.apply(pd.to_datetime, axis=1) if you have datetime objects but the dtype is object
- using df.applymap(lambda x: datetime.combine(x, datetime.min.time()) if not isinstance(x, datetime) else x) if you have a mixture of date and datetime objects
In some very rare cases pandas stores the data differently than rpy2 expects (following certain manipulations); then writing the DataFrame to a CSV file and reading it back from disk helps - but this is likely not what you are facing here, since you start from a freshly read DataFrame.
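To make these workarounds concrete, here is a hedged sketch that strings them together for the failure mode visible at the bottom of the traceback above ('float' object has no attribute 'encode', i.e. NaN floats sitting inside an otherwise-string column); nothing here is specific to the Titanic data:

import pandas as pd

# let pandas re-infer dtypes it may have missed
data = data.infer_objects()

# list columns that still mix more than one Python type
types = data.applymap(type).apply(set)
print(types[types.apply(len) > 1])

# repair string columns where the mixture is just NaN floats among strings
for col in data.columns[data.dtypes == object]:
    values = data[col]
    if values.map(lambda x: isinstance(x, str) or pd.isna(x)).all():
        data[col] = values.fillna('')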
I just noticed there might be an even simpler reason for the problem. For some reason, pandas2ri requires you to call pandas2ri.activate() after importing it. This solved the problem for me.
I am unable to run joblib with my function, which takes a numpy array, a list of trained Keras models, and a list of strings as parameters.
I tried creating the parameters as a namedtuple and even as a class with immutable properties. Any ideas?
Params = collections.namedtuple('Params', ['inputs', 'y_list', 'trained_models'])
p = Params(inputs, y_list, trained_models)
or
class Params:
    def __init__(self, inputs, y_list, trained_models):
        super(Params, self).__setattr__("inputs", inputs)
        super(Params, self).__setattr__("y_list", y_list)
        super(Params, self).__setattr__("trained_models", trained_models)
The function I would like to run in parallel:
def predict(params):
    inputs = params.inputs
    y_list = params.y_list
    trained_models = params.trained_models
    # process and vectorize inputs
    X = new_X(inputs)
    X_vect = vect.transform(X)
    predictions = dict()
    for y in y_list:
        y_field = trained_models[y].predict(X_vect)
        # evaluate model
        if y_field[0] > 0.05:
            return None, None
        predictions[y] = y_field[0]
    return X, predictions
Calling the function in parallel:
r = Parallel(n_jobs=4, verbose=5)(
    delayed(predict)(p)
    for c in range(100))
Error:
TypeError                                 Traceback (most recent call last)
<timed exec> in <module>()
~/.conda/envs/mlgpu/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
787 # consumption.
788 self._iterating = False
--> 789 self.retrieve()
790 # Make sure that we get a last message telling us we are done
791 elapsed_time = time.time() - self._start_time
~/.conda/envs/mlgpu/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
697 try:
698 if getattr(self._backend, 'supports_timeout', False):
--> 699 self._output.extend(job.get(timeout=self.timeout))
700 else:
701 self._output.extend(job.get())
~/.conda/envs/mlgpu/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
~/.conda/envs/mlgpu/lib/python3.6/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
~/.conda/envs/mlgpu/lib/python3.6/site-packages/joblib/pool.py in send(obj)
369 def send(obj):
370 buffer = BytesIO()
--> 371 CustomizablePickler(buffer, self._reducers).dump(obj)
372 self._writer.send_bytes(buffer.getvalue())
373 self._send = send
TypeError: can't pickle _thread.lock objects
You should create your own class, because you don't know whether collections.namedtuple has non-picklable parts.
I ran into a similar problem some months ago, when I was adding lambda functions to a class in order to pass it as an argument. Since lambda functions are not picklable (by the pickle module), that raised an error.
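A hedged sketch of such a class, with an up-front picklability check on every field; the check usually points straight at whichever field is smuggling the _thread.lock (often the trained Keras models, which then need to be loaded inside the worker rather than shipped to it):

import pickle

class Params:
    """Plain container whose fields are verified picklable at construction."""
    def __init__(self, inputs, y_list, trained_models):
        self.inputs = inputs
        self.y_list = y_list
        self.trained_models = trained_models
        # fail fast with the offending field's name instead of deep in joblib
        for name, value in vars(self).items():
            try:
                pickle.dumps(value)
            except Exception as exc:
                raise TypeError('field %r is not picklable: %s' % (name, exc))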
I'd like to modify the simpy Store get functionality by altering the object that is returned in _do_get:
class mod_Store(Store):
    def _do_get(self, event):
        super()._do_get(event)
        event.value.tagged = True

env = Environment()
s = mod_Store(env)

class thing:
    pass

def putter():
    while True:
        yield s.put(thing())
        yield env.timeout(5)

def getter():
    while True:
        t = yield s.get()
        yield env.timeout(3)

env.process(putter())
env.process(getter())
env.run(until=20)
Basically, before returning the object back to getter, I'd like to add an attribute (tagged). But this code produces the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-50-27e09b52b41a> in getter()
19 while True:
---> 20 t = yield s.get()
21 yield env.timeout(3)
C:\ProgramData\Anaconda3\lib\site-packages\simpy\resources\base.py in __init__(self, resource)
77 self.callbacks.append(resource._trigger_put)
---> 78 resource._trigger_get(None)
79
C:\ProgramData\Anaconda3\lib\site-packages\simpy\resources\base.py in _trigger_get(self, put_event)
223 get_event = self.get_queue[idx]
--> 224 proceed = self._do_get(get_event)
225 if not get_event.triggered:
<ipython-input-50-27e09b52b41a> in _do_get(self, event)
3 super()._do_get(event)
----> 4 event.value.tagged = True
5
C:\ProgramData\Anaconda3\lib\site-packages\simpy\events.py in value(self)
132 if self._value is PENDING:
--> 133 raise AttributeError('Value of %s is not yet available' % self)
134 return self._value
AttributeError: Value of <StoreGet() object at 0x9030c88> is not yet available
The above exception was the direct cause of the following exception:
AttributeError Traceback (most recent call last)
<ipython-input-50-27e09b52b41a> in <module>()
24 env.process(getter())
25
---> 26 env.run(until=20)
C:\ProgramData\Anaconda3\lib\site-packages\simpy\core.py in run(self, until)
135 try:
136 while True:
--> 137 self.step()
138 except StopSimulation as exc:
139 return exc.args[0] # == until.value
C:\ProgramData\Anaconda3\lib\site-packages\simpy\core.py in step(self)
227 exc = type(event._value)(*event._value.args)
228 exc.__cause__ = event._value
--> 229 raise exc
AttributeError: Value of <StoreGet() object at 0x9030c88> is not yet available
Looking through base.py, I see that _trigger_get is called in the __init__ of Get, so it makes sense that this fails, but how can I accomplish what I want? One workaround I have is to do the attribute assignment in a try/except that swallows the AttributeError, but this feels like a hack.
Also, if I add a print statement at the beginning of _do_get, it actually prints twice and then raises the exception, which I find weird.
Maybe you can subclass the StoreGet event and set the attribute when its succeed() method is called. You'd also need to subclass Store, like this:
from simpy.core import BoundClass
from simpy.resources.store import Store, StoreGet

class TaggedStoreGet(StoreGet):
    def succeed(self, value=None):
        value.tagged = True  # tag the retrieved object itself, so the getter sees it
        return super().succeed(value)

class TaggedStore(Store):
    get = BoundClass(TaggedStoreGet)
Haven’t tested it, but I think it might work.
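A hedged usage sketch against the putter/getter from the question (equally untested, so treat it as an illustration rather than a verified fix):

from simpy import Environment

env = Environment()
s = TaggedStore(env)

class thing:
    pass

def putter():
    while True:
        yield s.put(thing())
        yield env.timeout(5)

def getter():
    while True:
        t = yield s.get()
        assert t.tagged  # set by TaggedStoreGet.succeed before the item arrives
        yield env.timeout(3)

env.process(putter())
env.process(getter())
env.run(until=20)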
I get the error TypeError: 'NoneType' object is not iterable
when I try to geocode over 200K IPs using the following code excerpt. Note that when I geocode just 100K IPs, no errors occur:
# function that takes a (ip, response) pair and geocodes the IP individually for multithreading
def new_geocode_ip_ver2(ipandresponse):
    if ipandresponse[1] != '0':
        match = geolite2.lookup(ipandresponse[0])
        if match is not None:
            lat, lng = match.location
        else:
            lat = "GC failed"
            lng = "GC failed"
    else:
        lat = "Non-pingable, no GC"
        lng = "Non-pingable, no GC"
    return lat, lng

# spawn threads for geocoding IPs
pool = ThreadPool(200)
latlng_result = pool.map(new_geocode_ip_ver2, ip_response)
pool.close()
pool.join()
The error message is provided below. Can anyone tell me why this error occurs only above 100K IPs? I am trying to build my program so that it can geocode a million+ IPs. Do I have some kind of memory leak by any chance?
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-6-26e36f25645e> in <module>()
32 #spawn threads for geocoding IPs
33 pool=ThreadPool(200)
---> 34 latlng_result=pool.map(new_geocode_ip_ver2,ip_response)
35 pool.close()
36 pool.join()
C:\Users\sibag\Anaconda2\lib\multiprocessing\pool.pyc in map(self, func, iterable, chunksize)
249 '''
250 assert self._state == RUN
--> 251 return self.map_async(func, iterable, chunksize).get()
252
253 def imap(self, func, iterable, chunksize=1):
C:\Users\sibag\Anaconda2\lib\multiprocessing\pool.pyc in get(self, timeout)
565 return self._value
566 else:
--> 567 raise self._value
568
569 def _set(self, i, obj):
TypeError: 'NoneType' object is not iterable
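For what it's worth, the only unpacking site in the worker is lat, lng = match.location, so a hedged guess is that somewhere past the first 100K there is an IP for which the lookup succeeds but location is None (an assumption worth verifying against the geolite2 data), which would raise exactly this TypeError and would be data-dependent rather than a memory leak. A defensive variant of the worker under that assumption:

def new_geocode_ip_ver2(ipandresponse):
    if ipandresponse[1] != '0':
        match = geolite2.lookup(ipandresponse[0])
        # guard both a missing match and a match without coordinates
        if match is not None and match.location is not None:
            lat, lng = match.location
        else:
            lat = lng = "GC failed"
    else:
        lat = lng = "Non-pingable, no GC"
    return lat, lng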