xarray StopIteration error when running groupby in loop - python

I'm receiving a StopIteration error when attempting to use the groupby function in xarray. The error only occurs when looping through a list of files; if a single file path is passed in, no error is generated. I've also tried using xr.open_mfdataset to open the full directory of files, but this produced the same error.
for path in in_files:
    ds = xr.open_dataset(path)
    ds['index'] = county_mask
    ds = ds.set_coords('index')
    ds = ds.where(ds['index'].isin(cotton_county_keys))
    ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
Produces the error:
StopIteration Traceback (most recent call last)
<ipython-input-91-f26bf31efda5> in <module>()
6 ds = ds.set_coords('index')
7 ds = ds.where(ds['index'].isin(cotton_county_keys))
----> 8 ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\common.py in wrapped_func(self, dim, keep_attrs, skipna, **kwargs)
52 return self.reduce(func, dim, keep_attrs, skipna=skipna,
53 numeric_only=numeric_only, allow_lazy=True,
---> 54 **kwargs)
55 else:
56 def wrapped_func(self, dim=None, keep_attrs=False, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
652 def reduce_dataset(ds):
653 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 654 return self.apply(reduce_dataset)
655
656 def assign(self, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in apply(self, func, **kwargs)
607 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
608 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 609 return self._combine(applied)
610
611 def _combine(self, applied):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in _combine(self, applied)
611 def _combine(self, applied):
612 """Recombine the applied objects like the original."""
--> 613 applied_example, applied = peek_at(applied)
614 coord, dim, positions = self._infer_concat_args(applied_example)
615 combined = concat(applied, dim)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\utils.py in peek_at(iterable)
113 """
114 gen = iter(iterable)
--> 115 peek = next(gen)
116 return peek, itertools.chain([peek], gen)
117
StopIteration:
As does:
ds = xr.open_dataset(in_files[0])
ds['index'] = county_mask
ds = ds.set_coords('index')
ds = ds.where(ds['index'].isin(cotton_county_keys))
ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
However, a hard-coded file path works perfectly:
path = r'V:\ARL\Weather\Product_Development\US_PRISM_DATA\daily_temp\PRISM_daily_temp_1993-01-08'
ds = xr.open_dataset(path)
ds['index'] = county_mask
ds = ds.set_coords('index')
ds = ds.where(ds['index'].isin(cotton_county_keys))
ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
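One possible diagnostic, purely a guess from the traceback: the StopIteration is raised by peek_at calling next() on an empty generator, which means groupby('index') found zero groups for that file. That can happen if county_mask does not line up with a file's lat/lon coordinates, so the assigned index ends up all NaN and nothing passes the isin() mask. A minimal sketch of a check inside the loop (in_files, county_mask and cotton_county_keys are the variables from the question):

import xarray as xr

for path in in_files:
    ds = xr.open_dataset(path)
    ds['index'] = county_mask
    ds = ds.set_coords('index')
    # Count the grid cells that actually carry a cotton-county key for this file;
    # zero groups is what makes groupby's internal peek_at() hit StopIteration.
    n_valid = int(ds['index'].isin(cotton_county_keys).sum())
    if n_valid == 0:
        print('no matching grid cells in', path, '- check coordinate alignment')
        continue
    ds = ds.where(ds['index'].isin(cotton_county_keys))
    print(ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index().head())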

Related

Scipy Fourier Transform KeyError: 'ALIGNED'?

I'm trying to run a fast Fourier transform on a pandas DataFrame that I have. I am using the Kepler exoplanet dataset, here, and a specific notebook for it, here. I recreate the code in cells 27-30 (note that the code in cell 29 is executed elsewhere, so both dataframes have the same shape as in the original notebook), which looks as follows:
import numpy as np
import scipy

def spectrum_getter(X):
    Spectrum = scipy.fft.fft(X, n=X.size)
    return np.abs(Spectrum)

x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)
x_test_FT = x_test.apply(spectrum_getter, axis=1)
Both x_train_OS and x_test are pandas.core.frame.DataFrame. Running this produces:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [245], in <module>
----> 1 x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)
2 x_test_FT = x_test.apply(spectrum_getter, axis=1)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\frame.py:8827, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8816 from pandas.core.apply import frame_apply
8818 op = frame_apply(
8819 self,
8820 func=func,
(...)
8825 kwargs=kwargs,
8826 )
-> 8827 return op.apply().__finalize__(self, method="apply")
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
724 elif self.raw:
725 return self.apply_raw()
--> 727 return self.apply_standard()
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:851, in FrameApply.apply_standard(self)
850 def apply_standard(self):
--> 851 results, res_index = self.apply_series_generator()
853 # wrap results
854 return self.wrap_results(results, res_index)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:867, in FrameApply.apply_series_generator(self)
864 with option_context("mode.chained_assignment", None):
865 for i, v in enumerate(series_gen):
866 # ignore SettingWithCopy here in case the user mutates
--> 867 results[i] = self.f(v)
868 if isinstance(results[i], ABCSeries):
869 # If we have a view on v, we need to make a copy because
870 # series_generator will swap out the underlying data
871 results[i] = results[i].copy(deep=False)
Input In [244], in spectrum_getter(X)
3 def spectrum_getter(X):
----> 4 Spectrum = scipy.fft.fft(X, n=X.size)
5 return np.abs(Spectrum)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_backend.py:22, in _ScipyBackend.__ua_function__(method, args, kwargs)
20 if fn is None:
21 return NotImplemented
---> 22 return fn(*args, **kwargs)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_pocketfft\basic.py:17, in c2c(forward, x, n, axis, norm, overwrite_x, workers, plan)
14 if plan is not None:
15 raise NotImplementedError('Passing a precomputed plan is not yet '
16 'supported by scipy.fft functions')
---> 17 tmp = _asfarray(x)
18 overwrite_x = overwrite_x or _datacopied(tmp, x)
19 norm = _normalization(norm, forward)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_pocketfft\helper.py:97, in _asfarray(x)
95 dtype = x.dtype.newbyteorder('=')
96 # Always align input
---> 97 copy = not x.flags['ALIGNED']
98 return np.array(x, dtype=dtype, copy=copy)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\flags.py:98, in Flags.__getitem__(self, key)
96 def __getitem__(self, key):
97 if key not in self._keys:
---> 98 raise KeyError(key)
100 return getattr(self, key)
KeyError: 'ALIGNED'
I attempted to convert the dataframe to a numpy array, but ran into other issues. What am I doing wrong here?
I ran into the same error, so I converted my data type to a DataFrame and it solved my problem.
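For reference, a minimal sketch of another common fix, assuming the same row-wise layout as the notebook (the DataFrame below is only a stand-in for x_train_OS): convert each row to a plain NumPy array before handing it to scipy.fft, because scipy checks x.flags['ALIGNED'] and a pandas Series exposes pandas' own Flags object, which has no such key; that is exactly the KeyError in the traceback.

import numpy as np
import pandas as pd
import scipy.fft

def spectrum_getter(X):
    # X arrives as a pandas Series (one row); .to_numpy() gives scipy an
    # ndarray whose .flags supports the 'ALIGNED' lookup.
    x = X.to_numpy()
    return np.abs(scipy.fft.fft(x, n=x.size))

# Hypothetical stand-in for x_train_OS, just to show the call pattern.
x_train_OS = pd.DataFrame(np.random.rand(5, 8))
x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)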

NotImplementedError: Failed in nopython mode pipeline. Use of unknown opcode MAP_ADD at line 116 of <ipython-input-287-147d4798a88b>

I am trying to run some code with Numba and I get errors.
What I want to do is compute the cosine similarity with a cosinus_sparse function. I use this class method in the search class method, then I call search in the get_result method. Although I added the @jit decorator before each method, I get the implementation error shown below.
Here is my code:
import numpy as np
from numba import jit
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import math

class Search:
    def __init__(self, corpus, method='XTERM', stop_words='english', max_df=1.0, min_df=1, max_features=None):
        self.corpus = corpus
        self.method = method
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.vectorization()
        self.get_shape()
        self.features_names = self.bag_of_word.get_feature_names()

    def vectorization(self):
        if self.method == 'XTERM':
            self.bag_of_word = CountVectorizer(stop_words=self.stop_words,
                                               max_df=self.max_df, min_df=self.min_df,
                                               max_features=self.max_features)
            self.corpus_vectorized = self.bag_of_word.fit_transform(self.corpus)
        elif self.method == 'TFxIDF':
            self.bag_of_word = TfidfVectorizer(stop_words=self.stop_words,
                                               max_df=self.max_df, min_df=self.min_df,
                                               max_features=self.max_features)
            self.corpus_vectorized = self.bag_of_word.fit_transform(self.corpus)
        else:
            raise MethodError('Method provided is not valid')

    def get_shape(self):
        self.n_docs, self.n_terms = self.corpus_vectorized.shape

    def get_query(self, query):
        self.indexes = [self.features_names.index(q) for q in query if q in self.features_names]
        self.query_vec = np.zeros(self.n_terms)
        self.query_vec[self.indexes] = 1

    @staticmethod
    @jit(nopython=True)
    def cosinus_sparse(i, j):
        num = i.dot(j)
        spars = i * i.transpose()
        den = math.sqrt(spars[0, 0]) * math.sqrt(sum(j * j))
        if den > 0:
            return int(num) / den
        else:
            return 0

    @jit(nopython=True)
    def search(self, q) -> dict:
        cc = {i: self.cosinus_sparse(self.corpus_vectorized[i, :], q) for i in range(self.n_docs)}
        cc = sorted(cc.items(), key=lambda x: x[1], reverse=True)
        return cc

    @jit
    def get_result(self) -> list:
        self.result = self.search(self.query_vec)

    def result_announcer(self):
        self.search_lenght = len([i for i in self.result if i[1] > 0])
        print('{} documents linked to your query were found'.format(self.search_lenght))

    def verif_query_vec(self, query):
        if int(sum(self.query_vec)) != len(query):
            raise QueryError('Error in query or query_vec')

    def processing(self, query):
        try:
            self.get_query(query)
            self.verif_query_vec(query)
            self.get_result()
        except NameError:
            self.vectorisation()
            self.get_shape()
            self.get_feature_names()
            self.get_query(query)
            self.verif_query_vec(query)
            self.get_result()

import ipywidgets as widgets
from IPython.display import display

text = widgets.Text(
    value='',
    placeholder='Type words',
    description='String:',
    disabled=False
)
method_radio = widgets.RadioButtons(
    options=['XTERM', 'TFxIDF'],
    # value='TF',
    description='Method:',
    disabled=False
)
submit = widgets.Button(description='Search')
display(widgets.VBox([text, method_radio, submit]))

def handle_submit(sender):
    global query
    query = text.value.lower().split(' ')
    method = method_radio.value
    # instantiate the search object
    global search_obj
    search_obj = Search(corpus=corpus, method=method)
    search_obj.processing(query)

submit.on_click(handle_submit)
Here is the error:
NotImplementedError Traceback (most recent call last)
<ipython-input-288-025a488daa60> in handle_submit(sender)
27 global search_obj
28 search_obj = Search(corpus=corpus, method=method, )
---> 29 search_obj.processing(query)
30
31 submit.on_click(handle_submit)
<ipython-input-287-147d4798a88b> in processing(self, query)
167 self.get_query(query)
168 self.verif_query_vec(query)
--> 169 self.get_result()
170
171 except NameError:
~\Anaconda3\lib\site-packages\numba\dispatcher.py in _compile_for_args(self, *args, **kws)
365 e.patch_message(''.join(e.args) + help_msg)
366 # ignore the FULL_TRACEBACKS config, this needs reporting!
--> 367 raise e
368
369 def inspect_llvm(self, signature=None):
~\Anaconda3\lib\site-packages\numba\dispatcher.py in _compile_for_args(self, *args, **kws)
322 argtypes.append(self.typeof_pyval(a))
323 try:
--> 324 return self.compile(tuple(argtypes))
325 except errors.TypingError as e:
326 # Intercept typing error that may be due to an argument
~\Anaconda3\lib\site-packages\numba\compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
~\Anaconda3\lib\site-packages\numba\dispatcher.py in compile(self, sig)
653
654 self._cache_misses[sig] += 1
--> 655 cres = self._compiler.compile(args, return_type)
656 self.add_overload(cres)
657 self._cache.save_overload(sig, cres)
~\Anaconda3\lib\site-packages\numba\dispatcher.py in compile(self, args, return_type)
80 args=args, return_type=return_type,
81 flags=flags, locals=self.locals,
---> 82 pipeline_class=self.pipeline_class)
83 # Check typing error if object mode is used
84 if cres.typing_error is not None and not flags.enable_pyobject:
~\Anaconda3\lib\site-packages\numba\compiler.py in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
924 pipeline = pipeline_class(typingctx, targetctx, library,
925 args, return_type, flags, locals)
--> 926 return pipeline.compile_extra(func)
927
928
~\Anaconda3\lib\site-packages\numba\compiler.py in compile_extra(self, func)
372 self.lifted = ()
373 self.lifted_from = None
--> 374 return self._compile_bytecode()
375
376 def compile_ir(self, func_ir, lifted=(), lifted_from=None):
~\Anaconda3\lib\site-packages\numba\compiler.py in _compile_bytecode(self)
855 """
856 assert self.func_ir is None
--> 857 return self._compile_core()
858
859 def _compile_ir(self):
~\Anaconda3\lib\site-packages\numba\compiler.py in _compile_core(self)
842 self.define_pipelines(pm)
843 pm.finalize()
--> 844 res = pm.run(self.status)
845 if res is not None:
846 # Early pipeline completion
~\Anaconda3\lib\site-packages\numba\compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
~\Anaconda3\lib\site-packages\numba\compiler.py in run(self, status)
253 # No more fallback pipelines?
254 if is_final_pipeline:
--> 255 raise patched_exception
256 # Go to next fallback pipeline
257 else:
~\Anaconda3\lib\site-packages\numba\compiler.py in run(self, status)
244 try:
245 event(stage_name)
--> 246 stage()
247 except _EarlyPipelineCompletion as e:
248 return e.result
~\Anaconda3\lib\site-packages\numba\compiler.py in stage_inline_pass(self)
582 self.flags.auto_parallel,
583 self.parfor_diagnostics.replaced_fns)
--> 584 inline_pass.run()
585 # Remove all Dels, and re-run postproc
586 post_proc = postproc.PostProcessor(self.func_ir)
~\Anaconda3\lib\site-packages\numba\inline_closurecall.py in run(self)
75
76 if guard(self._inline_closure,
---> 77 work_list, block, i, func_def):
78 modified = True
79 break # because block structure changed
~\Anaconda3\lib\site-packages\numba\ir_utils.py in guard(func, *args, **kwargs)
1358 """
1359 try:
-> 1360 return func(*args, **kwargs)
1361 except GuardException:
1362 return None
~\Anaconda3\lib\site-packages\numba\inline_closurecall.py in _inline_closure(self, work_list, block, i, func_def)
212 inline_closure_call(self.func_ir,
213 self.func_ir.func_id.func.__globals__,
--> 214 block, i, func_def, work_list=work_list)
215 return True
216
~\Anaconda3\lib\site-packages\numba\inline_closurecall.py in inline_closure_call(func_ir, glbls, block, i, callee, typingctx, arg_typs, typemap, calltypes, work_list)
253 callee_closure = callee.closure if hasattr(callee, 'closure') else callee.__closure__
254 # first, get the IR of the callee
--> 255 callee_ir = get_ir_of_code(glbls, callee_code)
256 callee_blocks = callee_ir.blocks
257
~\Anaconda3\lib\site-packages\numba\ir_utils.py in get_ir_of_code(glbls, fcode)
1572 f.__name__ = fcode.co_name
1573 from numba import compiler
-> 1574 ir = compiler.run_frontend(f)
1575 # we need to run the before inference rewrite pass to normalize the IR
1576 # XXX: check rewrite pass flag?
~\Anaconda3\lib\site-packages\numba\compiler.py in run_frontend(func)
168 interp = interpreter.Interpreter(func_id)
169 bc = bytecode.ByteCode(func_id=func_id)
--> 170 func_ir = interp.interpret(bc)
171 post_proc = postproc.PostProcessor(func_ir)
172 post_proc.run()
~\Anaconda3\lib\site-packages\numba\interpreter.py in interpret(self, bytecode)
101 # Data flow analysis
102 self.dfa = dataflow.DataFlowAnalysis(self.cfa)
--> 103 self.dfa.run()
104
105 # Temp states during interpretation
~\Anaconda3\lib\site-packages\numba\dataflow.py in run(self)
26 def run(self):
27 for blk in self.cfa.iterliveblocks():
---> 28 self.infos[blk.offset] = self.run_on_block(blk)
29
30 def run_on_block(self, blk):
~\Anaconda3\lib\site-packages\numba\dataflow.py in run_on_block(self, blk)
76 for offset in blk:
77 inst = self.bytecode[offset]
---> 78 self.dispatch(info, inst)
79 return info
80
~\Anaconda3\lib\site-packages\numba\dataflow.py in dispatch(self, info, inst)
86 fname = "op_%s" % inst.opname.replace('+', '_')
87 fn = getattr(self, fname, self.handle_unknown_opcode)
---> 88 fn(info, inst)
89
90 def handle_unknown_opcode(self, info, inst):
~\Anaconda3\lib\site-packages\numba\dataflow.py in handle_unknown_opcode(self, info, inst)
91 msg = "Use of unknown opcode {} at line {} of {}"
92 raise NotImplementedError(msg.format(inst.opname, inst.lineno,
---> 93 self.bytecode.func_id.filename))
94
95 def dup_topx(self, info, inst, count):
NotImplementedError: Failed in nopython mode pipeline (step: inline calls to locally defined closures)
Use of unknown opcode MAP_ADD at line 116 of <ipython-input-287-147d4798a88b>
How do I fix this error?
Thanks a lot for your help.
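One hedged workaround sketch (not from the thread): MAP_ADD is the opcode Python emits for dict comprehensions, and the Numba version shown in the traceback cannot handle it while inlining closures. Rewriting the dict comprehension in search as an explicit loop avoids the opcode; the attribute names below assume the Search class from the question.

# Same logic as Search.search, without the dict comprehension.
def search(self, q):
    scores = {}
    for i in range(self.n_docs):
        scores[i] = self.cosinus_sparse(self.corpus_vectorized[i, :], q)
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

Note that nopython mode also cannot compile code that touches self or scipy sparse matrices, so it is probably simpler to keep @jit only on small numeric kernels and leave search and get_result as plain Python.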

random forest calculation with Huge Sparse Data

I am trying to calculate a random forest on huge, sparse multilabel data. The dataset has 94 targets; some of them are barely used (2 out of 650000) and some aren't used at all. But I run out of RAM (32 GB) and can't compute everything in one go, so I followed this guide for "batching" a random forest:
https://stats.stackexchange.com/questions/327335/batch-learning-w-random-forest-sklearn
When I tried to predict something I got the error below.
So I tried a different approach: calculating a random forest on parts of the data and merging them afterwards:
import time
from copy import copy

import numpy as np
from sklearn.ensemble import RandomForestClassifier

def combine(all_ensembles):
    """Combine the sub-estimators of a group of ensembles

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.ensemble import ExtraTreesClassifier
    >>> iris = load_iris()
    >>> X, y = iris.data, iris.target
    >>> all_ensembles = [ExtraTreesClassifier(n_estimators=4).fit(X, y)
    ...                  for i in range(3)]
    >>> big = combine(all_ensembles)
    >>> len(big.estimators_)
    12
    >>> big.n_estimators
    12
    >>> big.score(X, y)
    1.0
    """
    final_ensemble = copy(all_ensembles[0])
    final_ensemble.estimators_ = []
    for ensemble in all_ensembles:
        final_ensemble.estimators_ += ensemble.estimators_
    # Required in old versions of sklearn
    final_ensemble.n_estimators = len(final_ensemble.estimators_)
    return final_ensemble

forest_model = None
forest_model_final = None
start = time.time()
for e in range(5):  # 5 passes through the data
    print("Epoch:", e)
    for batch_index, (X, y) in enumerate(dataloader_dict['Train_and_Validation']):
        forest_model = RandomForestClassifier(warm_start=False, n_estimators=1,
                                              n_jobs=parameters['num_workers'])
        X = np.squeeze(X.numpy(), axis=1)
        y = np.squeeze(y.numpy(), axis=1)
        y_one_hot = np.array(y > parameters['threshold'], dtype=int)
        forest_model.fit(X, y_one_hot)
        if forest_model_final is not None:
            forest_model_final = combine([forest_model_final, forest_model])
        else:
            forest_model_final = forest_model
end = time.time()
print("Time (s): %s" % (end - start))
I get the same error when I try to predict something with the calculated random forest.
Error:
ValueError: non-broadcastable output operand with shape (50,1) doesn't match the broadcast shape (50,2)
I found a similar question, Unexpected exception when combining random forest trees, but I don't understand what I should do now.
Full Traceback:
ValueError Traceback (most recent call last)
<ipython-input-10-4f8ce9181286> in <module>
7 yval = np.squeeze(yval.numpy(), axis=1)
8 y_one_hot = yval > parameters['threshold']
----> 9 yval_pred = forest_model_final.predict_proba(Xval)
10 #Todo stuff
11 acc_batch = accuracy_score(y_one_hot, yval_pred)
~/anaconda3/envs/column-labeling/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X)
667 delayed(_accumulate_prediction)(e.predict_proba, X, all_proba,
668 lock)
--> 669 for e in self.estimators_)
670
671 for proba in all_proba:
~/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
1014
1015 with self._backend.retrieval_context():
-> 1016 self.retrieve()
1017 # Make sure that we get a last message telling us we are done
1018 elapsed_time = time.time() - self._start_time
~/.local/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
906 try:
907 if getattr(self._backend, 'supports_timeout', False):
--> 908 self._output.extend(job.get(timeout=self.timeout))
909 else:
910 self._output.extend(job.get())
~/anaconda3/envs/column-labeling/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
~/anaconda3/envs/column-labeling/lib/python3.6/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
117 job, i, func, args, kwds = task
118 try:
--> 119 result = (True, func(*args, **kwds))
120 except Exception as e:
121 if wrap_exception and func is not _helper_reraises_exception:
~/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in __call__(self, *args, **kwargs)
598 def __call__(self, *args, **kwargs):
599 try:
--> 600 return self.func(*args, **kwargs)
601 except KeyboardInterrupt:
602 # We capture the KeyboardInterrupt and reraise it as
~/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/anaconda3/envs/column-labeling/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in _accumulate_prediction(predict, X, out, lock)
453 else:
454 for i in range(len(out)):
--> 455 out[i] += prediction[i]
456
457
ValueError: non-broadcastable output operand with shape (50,1) doesn't match the broadcast shape (50,2)
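A hedged diagnostic, not from the linked question: the shapes (50, 1) and (50, 2) suggest that trees trained on different batches disagree about how many classes a given target has, because rare labels are missing from some batches; predict_proba then tries to add probability arrays with different column counts in out[i] += prediction[i]. A small check along these lines (same_classes is a hypothetical helper, not part of sklearn) can confirm that before combining:

import numpy as np

def same_classes(f1, f2):
    """True if two fitted forests report identical class sets for every output."""
    c1, c2 = f1.classes_, f2.classes_
    if isinstance(c1, list):  # multi-output / multilabel case
        return len(c1) == len(c2) and all(np.array_equal(a, b) for a, b in zip(c1, c2))
    return np.array_equal(c1, c2)

If this returns False for a freshly fitted forest_model against forest_model_final, that batch is missing at least one label for some target, which is what produces the broadcast error at prediction time.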

pool objects cannot be passed between processes or pickled

I have the following class (shortened for concision), which standardises SMILES strings upon instantiation. I have been trying to speed up the process by using all my CPUs with the multiprocessing package in Python 3.7.4.
import multiprocessing as mp
import pandas as pd
# Imports implied by the shortened class (assumed to come from RDKit / MolVS):
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
import molvs as mv

class Standardiser(object):
    def __call__(self):
        return self.prepare_dataset()

    def __init__(self, DataFrame):
        self.DataFrame = DataFrame
        self.standardiser = mv.Standardizer()
        self.salt_remover = SaltRemover()
        self.accepted_atoms = ['H', 'C', 'N', 'O', 'F', 'S', 'Cl', 'Br', 'I', 'P']
        self.pool = mp.Pool(processes=mp.cpu_count())

    def prepare_dataset(self, standardise=True, remove_charge=False):
        standard_smiles = []
        if standardise:
            standardised_smiles = [self.pool.apply_async(self.standardise_compound, args=(x,)).get()
                                   for x in self.DataFrame['Molecule']]
        DataFrame = pd.concat([self.DataFrame[['Activity', 'Molecule']],
                               pd.Series(standardised_smiles)], axis=1)
        return DataFrame

    def standardise_compound(self, mol, min_heavy_atoms=0, max_heavy_atoms=50, max_len=150, remove_charge=False):
        try:
            if selected_fragment is None:
                return None
            if remove_charge:
                mol = remove_charge_mol(selected_fragment)
            if min_heavy_atoms <= mol.GetNumHeavyAtoms() <= max_heavy_atoms:
                smiles = Chem.MolToSmiles(selected_fragment, isomericSmiles=False, canonical=True)
                if len(smiles) <= max_len:
                    return smiles
        except Exception as e:
            print(e)
I instantiate it with the relevant DataFrame and then call it, but the following error is thrown:
NotImplementedError Traceback (most recent call last)
<ipython-input-60-1c181cd43d85> in <module>()
1 standardise = Standardiser(df[:100])
----> 2 dff = standardise()
3 dff.head()
<ipython-input-59-a6677d6c7724> in __call__(self)
4
5 def __call__(self):
----> 6 return self.prepare_dataset()
7
8 def __init__(self, DataFrame):
<ipython-input-59-a6677d6c7724> in prepare_dataset(self, standardise, remove_charge)
22
---> 23 standardised_smiles = [self.pool.apply(self.standardise_compound, args = (x,)).get() for x in self.DataFrame['Molecule']]
24
25 DataFrame = pd.concat([self.DataFrame[['Activity','Molecule']], pd.Series(standardised_smiles)], axis = 1)
<ipython-input-59-a6677d6c7724> in <listcomp>(.0)
22
---> 23 standardised_smiles = [self.pool.apply(self.standardise_compound, args = (x,)).get() for x in self.DataFrame['Molecule']]
24
25 DataFrame = pd.concat([self.DataFrame[['Activity','Molecule']], pd.Series(standardised_smiles)], axis = 1)
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in apply(self, func, args, kwds)
257 '''
258 assert self._state == RUN
--> 259 return self.apply_async(func, args, kwds).get()
260
261 def map(self, func, iterable, chunksize=None):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/connection.py in send(self, obj)
204 self._check_closed()
205 self._check_writable()
--> 206 self._send_bytes(_ForkingPickler.dumps(obj))
207
208 def recv_bytes(self, maxlength=None):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/reduction.py in dumps(cls, obj, protocol)
49 def dumps(cls, obj, protocol=None):
50 buf = io.BytesIO()
---> 51 cls(buf, protocol).dump(obj)
52 return buf.getbuffer()
53
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in __reduce__(self)
526 def __reduce__(self):
527 raise NotImplementedError(
--> 528 'pool objects cannot be passed between processes or pickled'
529 )
530
NotImplementedError: pool objects cannot be passed between processes or pickled
There is no explicit pickling happening anywhere in the class, so I was wondering whether there is a problem with the way the multiprocessing is implemented.
EDIT
I have converted the standardise_compound function into a @classmethod and the error thrown has changed to:
standardise_mol() missing 1 required positional argument: 'mol'
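For what it is worth, a minimal sketch of one common way around the original error (not the asker's final code): apply_async(self.standardise_compound, ...) has to pickle the bound method, which drags the whole instance, including self.pool, across to the worker, and Pool objects refuse to be pickled. Keeping the pool out of instance state and sending the workers a plain module-level function sidesteps that; the standardisation body below is only a placeholder.

import multiprocessing as mp
import pandas as pd

def standardise_compound(mol):
    # Placeholder for the standardisation logic, acting only on `mol`
    # so nothing else needs to be pickled and shipped to the workers.
    return mol

class Standardiser(object):
    def __init__(self, DataFrame):
        self.DataFrame = DataFrame

    def prepare_dataset(self):
        # Create the pool locally instead of storing it on self.
        with mp.Pool(processes=mp.cpu_count()) as pool:
            standardised = pool.map(standardise_compound, self.DataFrame['Molecule'])
        return pd.concat([self.DataFrame[['Activity', 'Molecule']],
                          pd.Series(standardised)], axis=1)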

Python Numba jit NotImplementedError list comprehension

I want to use Numba to speed up a calculation that builds its result with a list comprehension.
from numba import jit

# General function to generate overlapping windows from a dataframe
@jit
def overlapping_windows(index, wl=256, noverlap=128):
    l = len(index)
    res = [[s, s + wl] for s in xrange(0, l, noverlap) if s + wl < l]
    return res

overlapping_windows([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 4, 2)
However, I get a NotImplementedError and I'm not sure why.
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-45-ce0579185abe> in <module>()
6 return res
7
----> 8 overlapping_windows([1,2,3,4,5,6,7,8,9,10],4,2)
~/anaconda/lib/python2.7/site-packages/numba/dispatcher.pyc in _compile_and_call(self, *args, **kws)
123 assert not kws
124 sig = tuple([typeof_pyval(a) for a in args])
--> 125 self.jit(sig)
126 return self(*args, **kws)
127
~/anaconda/lib/python2.7/site-packages/numba/dispatcher.pyc in jit(self, sig, **kws)
118 """Alias of compile(sig, **kws)
119 """
--> 120 return self.compile(sig, **kws)
121
122 def _compile_and_call(self, *args, **kws):
~/anaconda/lib/python2.7/site-packages/numba/dispatcher.pyc in compile(self, sig, locals, **targetoptions)
106 cres = compiler.compile_extra(typingctx, targetctx, self.py_func,
107 args=args, return_type=return_type,
--> 108 flags=flags, locals=locs)
109
110 # Check typing error if object mode is used
~/anaconda/lib/python2.7/site-packages/numba/compiler.pyc in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals)
85 Use ``None`` to indicate
86 """
---> 87 bc = bytecode.ByteCode(func=func)
88 if config.DEBUG:
89 print(bc.dump())
~/anaconda/lib/python2.7/site-packages/numba/bytecode.pyc in __init__(self, func)
275 raise ByteCodeSupportError("does not support cellvars")
276
--> 277 table = utils.SortedMap(ByteCodeIter(code))
278 labels = set(dis.findlabels(code.co_code))
279 labels.add(0)
~/anaconda/lib/python2.7/site-packages/numba/utils.pyc in __init__(self, seq)
44 self._values = []
45 self._index = {}
---> 46 for i, (k, v) in enumerate(sorted(seq)):
47 self._index[k] = i
48 self._values.append((k, v))
~/anaconda/lib/python2.7/site-packages/numba/bytecode.pyc in next(self)
195 ts = "offset=%d opcode=%x opname=%s"
196 tv = offset, opcode, dis.opname[opcode]
--> 197 raise NotImplementedError(ts % tv)
198 if info.argsize:
199 arg = self.read_arg(info.argsize)
NotImplementedError: offset=66 opcode=5e opname=LIST_APPEND
It most likely means that the version of numba you are using does not support functions with list comprehensions.
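A workaround sketch based on that explanation (only a guess, using range in place of the question's Python 2 xrange): build the windows with an explicit loop instead of a list comprehension, so the old bytecode frontend never sees the LIST_APPEND opcode. Upgrading Numba, which has supported list comprehensions for a long time, is the other option.

from numba import jit

@jit
def overlapping_windows(index, wl=256, noverlap=128):
    l = len(index)
    res = []
    for s in range(0, l, noverlap):
        if s + wl < l:
            # append inside a plain loop instead of a list comprehension
            res.append([s, s + wl])
    return res

print(overlapping_windows([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 4, 2))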
