I have the following class (shortened for brevity), which standardises SMILES strings upon instantiation. I have been trying to speed up the process by using all my CPUs for parallel processing via the multiprocessing package in Python 3.7.4.
# Imports inferred from the shortened snippet; `mv` is assumed to be MolVS.
import multiprocessing as mp

import pandas as pd
import molvs as mv
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover


class Standardiser(object):

    def __call__(self):
        return self.prepare_dataset()

    def __init__(self, DataFrame):
        self.DataFrame = DataFrame
        self.standardiser = mv.Standardizer()
        self.salt_remover = SaltRemover()
        self.accepted_atoms = ['H', 'C', 'N', 'O', 'F', 'S', 'Cl', 'Br', 'I', 'P']
        self.pool = mp.Pool(processes=mp.cpu_count())

    def prepare_dataset(self, standardise=True, remove_charge=False):
        standard_smiles = []
        if standardise:
            standardised_smiles = [self.pool.apply_async(self.standardise_compound, args=(x,)).get()
                                   for x in self.DataFrame['Molecule']]
        DataFrame = pd.concat([self.DataFrame[['Activity', 'Molecule']], pd.Series(standardised_smiles)], axis=1)
        return DataFrame

    def standardise_compound(self, mol, min_heavy_atoms=0, max_heavy_atoms=50, max_len=150, remove_charge=False):
        try:
            # NOTE: the fragment-selection step that assigns `selected_fragment` was cut
            # when shortening the class for this post.
            if selected_fragment is None:
                return None
            if remove_charge:
                mol = remove_charge_mol(selected_fragment)
            if min_heavy_atoms <= mol.GetNumHeavyAtoms() <= max_heavy_atoms:
                smiles = Chem.MolToSmiles(selected_fragment, isomericSmiles=False, canonical=True)
                if len(smiles) <= max_len:
                    return smiles
        except Exception as e:
            print(e)
I instantiate it with the relevant DataFrame and then call it, but the following error is thrown:
NotImplementedError Traceback (most recent call last)
<ipython-input-60-1c181cd43d85> in <module>()
1 standardise = Standardiser(df[:100])
----> 2 dff = standardise()
3 dff.head()
<ipython-input-59-a6677d6c7724> in __call__(self)
4
5 def __call__(self):
----> 6 return self.prepare_dataset()
7
8 def __init__(self, DataFrame):
<ipython-input-59-a6677d6c7724> in prepare_dataset(self, standardise, remove_charge)
22
---> 23 standardised_smiles = [self.pool.apply(self.standardise_compound, args = (x,)).get() for x in self.DataFrame['Molecule']]
24
25 DataFrame = pd.concat([self.DataFrame[['Activity','Molecule']], pd.Series(standardised_smiles)], axis = 1)
<ipython-input-59-a6677d6c7724> in <listcomp>(.0)
22
---> 23 standardised_smiles = [self.pool.apply(self.standardise_compound, args = (x,)).get() for x in self.DataFrame['Molecule']]
24
25 DataFrame = pd.concat([self.DataFrame[['Activity','Molecule']], pd.Series(standardised_smiles)], axis = 1)
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in apply(self, func, args, kwds)
257 '''
258 assert self._state == RUN
--> 259 return self.apply_async(func, args, kwds).get()
260
261 def map(self, func, iterable, chunksize=None):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/connection.py in send(self, obj)
204 self._check_closed()
205 self._check_writable()
--> 206 self._send_bytes(_ForkingPickler.dumps(obj))
207
208 def recv_bytes(self, maxlength=None):
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/reduction.py in dumps(cls, obj, protocol)
49 def dumps(cls, obj, protocol=None):
50 buf = io.BytesIO()
---> 51 cls(buf, protocol).dump(obj)
52 return buf.getbuffer()
53
~/.conda/envs/dalkeCourse/lib/python3.6/multiprocessing/pool.py in __reduce__(self)
526 def __reduce__(self):
527 raise NotImplementedError(
--> 528 'pool objects cannot be passed between processes or pickled'
529 )
530
NotImplementedError: pool objects cannot be passed between processes or pickled
I am not pickling anything explicitly anywhere in the class, so I was wondering whether there is a problem with the way the multiprocessing is implemented.
EDIT
I have converted the standardise_compound function into a @classmethod and the error thrown has changed to:
standardise_mol() missing 1 required positional argument: 'mol'
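For context on where the error comes from (hedged, since the full class is not shown): pool.apply_async(self.standardise_compound, ...) has to pickle the bound method, and pickling a bound method pickles the whole instance, which carries self.pool; pool objects refuse to be pickled, hence the NotImplementedError even though nothing is pickled explicitly. A minimal sketch of one way around this, using a module-level worker and a pool created locally inside prepare_dataset (the placeholder worker body and the 'Standardised' column name are my assumptions):

import multiprocessing as mp

import pandas as pd
from rdkit import Chem

# Module-level worker: only the molecule is pickled and sent to the worker,
# so no instance (and therefore no Pool) crosses the process boundary.
# The real body would be the standardise_compound logic from the class;
# this placeholder just canonicalises the SMILES.
def standardise_compound(mol, max_heavy_atoms=50, max_len=150):
    try:
        smiles = Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)
        return smiles if len(smiles) <= max_len else None
    except Exception as exc:
        print(exc)
        return None

class Standardiser(object):
    def __init__(self, DataFrame):
        self.DataFrame = DataFrame

    def __call__(self):
        return self.prepare_dataset()

    def prepare_dataset(self):
        # Create the pool here instead of storing it on self, so the instance
        # stays picklable even if it is ever sent to a worker process.
        with mp.Pool(processes=mp.cpu_count()) as pool:
            standardised_smiles = pool.map(standardise_compound,
                                           list(self.DataFrame['Molecule']))
        return pd.concat([self.DataFrame[['Activity', 'Molecule']].reset_index(drop=True),
                          pd.Series(standardised_smiles, name='Standardised')], axis=1)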
Related
I'm trying to make a Telegram alarm bot, but I get the error "TypeError: Object of type DataFrame is not JSON serializable".
Here is my code:
import FinanceDataReader as fdr
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import datetime
from datetime import date
import sys
fdr.__version__
import schedule
import time
import pytz
import telegram

count = 1

def job():
    # global counter
    global count
    count += 1

    # current time in Korea, weekday number for weekend handling
    now = datetime.datetime.now(pytz.timezone('Asia/Seoul'))
    today = date.today()
    weekend = today.weekday()

    # quiet hours: do not send alerts before 09:00 or after 15:00
    if now.hour >= 15 or now.hour <= 9 or weekend == [5, 6]:
        return

    API_KEY = 'My key'
    bot = telegram.Bot(token=API_KEY)
    bot.get_updates()
    # for i in updates:
    #     print(i.message['chat']['id'])

    # KOSDAQ index
    code = 'KQ11'
    df = fdr.DataReader('KQ11', '2022-08').reset_index()

    # assign the 3-, 5- and 10-day moving averages
    df['close_sma3d'] = df['Close'].rolling(3).mean()
    df['close_sma5d'] = df['Close'].rolling(5).mean()
    df['close_sma10d'] = df['Close'].rolling(10).mean()

    # rebuild the dataframe
    # df = df.to_dict()
    # df.rename(columns={0: 'Date', 1: 'Close', 2: 'Open', 3: 'High', 4: 'Low', 5: 'Volume', 6: 'Change', 7: 'close_sma3d', 8: 'close_sma5d', 9: 'close_sma10d'}, inplace = True)
    df2 = df.loc[:, ['Date', 'Close', 'close_sma3d', 'close_sma5d', 'close_sma10d']].iloc[-1:]
    alerts = df2[(df2['Close'] > df2['close_sma3d']) | (df2['Close'] > df2['close_sma5d']) | (df2['Close'] > df2['close_sma10d'])]

    def display(row):
        # prints: "Signal! KOSDAQ close ... 3-day SMA ... 5-day SMA ... 10-day SMA ..."
        print(f" - {row['Date']} Signal 발생! 코스닥_현재가 {row['Close']} 3일이동평균 {row['close_sma3d']:.2f} 5일이동평균 {row['close_sma5d']:.2f} 10일이동평균 {row['close_sma10d']:.2f}")

    Market_timing = alerts.apply(display, axis=1)

    if count % 1 == 0:
        bot.sendMessage(chat_id = 'Mykey', text = Market_timing)
    else:
        print('대기 중입니다..')  # "Waiting..."

# run every 2 hours (the schedule call below actually fires every minute)
schedule.every(1).minutes.do(job)

print('Start App..')

while True:
    schedule.run_pending()
    time.sleep(1)
And here is the error:
TypeError Traceback (most recent call last)
Input In [3], in <cell line: 49>()
46 print('Start App..')
48 while True:
---> 49 schedule.run_pending()
50 time.sleep(1)
File ~\miniconda3\envs\py38\lib\site-packages\schedule\__init__.py:780, in run_pending()
776 def run_pending() -> None:
777 """Calls :meth:`run_pending <Scheduler.run_pending>` on the
778 :data:`default scheduler instance <default_scheduler>`.
779 """
--> 780 default_scheduler.run_pending()
File ~\miniconda3\envs\py38\lib\site-packages\schedule\__init__.py:100, in Scheduler.run_pending(self)
98 runnable_jobs = (job for job in self.jobs if job.should_run)
99 for job in sorted(runnable_jobs):
--> 100 self._run_job(job)
File ~\miniconda3\envs\py38\lib\site-packages\schedule\__init__.py:172, in Scheduler._run_job(self, job)
171 def _run_job(self, job: "Job") -> None:
--> 172 ret = job.run()
173 if isinstance(ret, CancelJob) or ret is CancelJob:
174 self.cancel_job(job)
File ~\miniconda3\envs\py38\lib\site-packages\schedule\__init__.py:661, in Job.run(self)
658 return CancelJob
660 logger.debug("Running job %s", self)
--> 661 ret = self.job_func()
662 self.last_run = datetime.datetime.now()
663 self._schedule_next_run()
Input In [3], in job()
35 Market_timing = alerts.apply(display, axis=1)
37 if count % 1 == 0:
---> 38 bot.sendMessage(chat_id = '1760120639', text = Market_timing)
39 else:
40 print('대기 중입니다..')
File ~\miniconda3\envs\py38\lib\site-packages\telegram\bot.py:133, in log.<locals>.decorator(*args, **kwargs)
130 #functools.wraps(func)
131 def decorator(*args: object, **kwargs: object) -> RT: # pylint: disable=W0613
132 logger.debug('Entering: %s', func.__name__)
--> 133 result = func(*args, **kwargs)
134 logger.debug(result)
135 logger.debug('Exiting: %s', func.__name__)
File ~\miniconda3\envs\py38\lib\site-packages\telegram\bot.py:525, in Bot.send_message(self, chat_id, text, parse_mode, disable_web_page_preview, disable_notification, reply_to_message_id, reply_markup, timeout, api_kwargs, allow_sending_without_reply, entities, protect_content)
522 if entities:
523 data['entities'] = [me.to_dict() for me in entities]
--> 525 return self._message( # type: ignore[return-value]
526 'sendMessage',
527 data,
528 disable_notification=disable_notification,
529 reply_to_message_id=reply_to_message_id,
530 reply_markup=reply_markup,
531 allow_sending_without_reply=allow_sending_without_reply,
532 timeout=timeout,
533 api_kwargs=api_kwargs,
534 protect_content=protect_content,
535 )
File ~\miniconda3\envs\py38\lib\site-packages\telegram\bot.py:339, in Bot._message(self, endpoint, data, reply_to_message_id, disable_notification, reply_markup, allow_sending_without_reply, timeout, api_kwargs, protect_content)
336 else:
337 data['media'].parse_mode = None
--> 339 result = self._post(endpoint, data, timeout=timeout, api_kwargs=api_kwargs)
341 if result is True:
342 return result
File ~\miniconda3\envs\py38\lib\site-packages\telegram\bot.py:298, in Bot._post(self, endpoint, data, timeout, api_kwargs)
295 # Drop any None values because Telegram doesn't handle them well
296 data = {key: value for key, value in data.items() if value is not None}
--> 298 return self.request.post(
299 f'{self.base_url}/{endpoint}', data=data, timeout=effective_timeout
300 )
File ~\miniconda3\envs\py38\lib\site-packages\telegram\utils\request.py:364, in Request.post(self, url, data, timeout)
359 result = self._request_wrapper('POST', url, fields=data, **urlopen_kwargs)
360 else:
361 result = self._request_wrapper(
362 'POST',
363 url,
--> 364 body=json.dumps(data).encode('utf-8'),
365 headers={'Content-Type': 'application/json'},
366 **urlopen_kwargs,
367 )
369 return self._parse(result)
File ~\miniconda3\envs\py38\lib\json\__init__.py:231, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
226 # cached encoder
227 if (not skipkeys and ensure_ascii and
228 check_circular and allow_nan and
229 cls is None and indent is None and separators is None and
230 default is None and not sort_keys and not kw):
--> 231 return _default_encoder.encode(obj)
232 if cls is None:
233 cls = JSONEncoder
File ~\miniconda3\envs\py38\lib\json\encoder.py:199, in JSONEncoder.encode(self, o)
195 return encode_basestring(o)
196 # This doesn't pass the iterator directly to ''.join() because the
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
File ~\miniconda3\envs\py38\lib\json\encoder.py:257, in JSONEncoder.iterencode(self, o, _one_shot)
252 else:
253 _iterencode = _make_iterencode(
254 markers, self.default, _encoder, self.indent, floatstr,
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
File ~\miniconda3\envs\py38\lib\json\encoder.py:179, in JSONEncoder.default(self, o)
160 def default(self, o):
161 """Implement this method in a subclass such that it returns
162 a serializable object for ``o``, or calls the base implementation
163 (to raise a ``TypeError``).
(...)
177
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
TypeError: Object of type DataFrame is not JSON serializable
I tried the approach from the question "TypeError: Object of type 'DataFrame' is not JSON serializable", but had a problem making a column.
Any help would be appreciated.
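For reference, a hedged reading of the traceback: alerts.apply(display, axis=1) returns a pandas object (and display() prints rather than returns, so Market_timing holds None values), while bot.sendMessage JSON-encodes whatever is passed as text, which therefore has to be a plain string. A minimal sketch along those lines, assuming the goal is to send the formatted alert line(s) as the message (format_alert is my name):

def format_alert(row):
    # build the message text instead of printing it
    return (f" - {row['Date']} Signal! Close {row['Close']} "
            f"3-day SMA {row['close_sma3d']:.2f} 5-day SMA {row['close_sma5d']:.2f} "
            f"10-day SMA {row['close_sma10d']:.2f}")

messages = alerts.apply(format_alert, axis=1)  # Series of strings, one per alert row
if not messages.empty:
    bot.sendMessage(chat_id='Mykey', text='\n'.join(messages))  # send a plain str
else:
    print('No signal this run.')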
I am trying to track some particles in a video using trackpy.
I'm following the walkthrough from the website:
http://soft-matter.github.io/trackpy/v0.4.2/tutorial/walkthrough.html
After processing a few frames (usually around 14, sometimes 0), it gives me a ValueError that says "generator already executing".
I cannot figure out how to solve this issue; I hope someone can.
Python: 3.9.4
Trackpy: 0.5.0
The full error:
ValueError Traceback (most recent call last)
<ipython-input-8-ff6dcf7a7595> in <module>
----> 1 f = tp.batch(frames[100:300], masksize, minmass=minmass, invert=True);
~\.conda\envs\trackpyenv\lib\site-packages\trackpy\feature.py in batch(frames, diameter, output, meta, processes, after_locate, **kwargs)
556 all_features = []
557 for i, features in enumerate(map_func(curried_locate, frames)):
--> 558 image = frames[i]
559 if hasattr(image, 'frame_no') and image.frame_no is not None:
560 frame_no = image.frame_no
~\.conda\envs\trackpyenv\lib\site-packages\slicerator\__init__.py in __getitem__(self, key)
234 if not (isinstance(key, slice) or
235 isinstance(key, collections.Iterable)):
--> 236 return self._get(self._map_index(key))
237 else:
238 rel_indices, new_length = key_to_indices(key, len(self))
~\.conda\envs\trackpyenv\lib\site-packages\slicerator\__init__.py in _get(self, key)
205
206 def _get(self, key):
--> 207 return self._ancestor[key]
208
209 def _map_index(self, key):
~\.conda\envs\trackpyenv\lib\site-packages\slicerator\__init__.py in __getitem__(self, i)
478 indices, new_length = key_to_indices(i, len(self))
479 if new_length is None:
--> 480 return self._get(indices)
481 else:
482 return Slicerator(self, indices, new_length, self._propagate_attrs)
~\.conda\envs\trackpyenv\lib\site-packages\slicerator\__init__.py in _get(self, key)
459 # We need to copy here: else any _proc_func that acts inplace would
460 # change the ancestor value.
--> 461 return self._proc_func(*(copy(a[key]) for a in self._ancestors))
462
463 def __repr__(self):
~\.conda\envs\trackpyenv\lib\site-packages\slicerator\__init__.py in <genexpr>(.0)
459 # We need to copy here: else any _proc_func that acts inplace would
460 # change the ancestor value.
--> 461 return self._proc_func(*(copy(a[key]) for a in self._ancestors))
462
463 def __repr__(self):
~\.conda\envs\trackpyenv\lib\site-packages\slicerator\__init__.py in __getitem__(self, i)
186 indices, new_length = key_to_indices(i, len(self))
187 if new_length is None:
--> 188 return self._get(indices)
189 else:
190 return cls(self, indices, new_length, propagate_attrs)
~\.conda\envs\trackpyenv\lib\site-packages\pims\base_frames.py in __getitem__(self, key)
96 """__getitem__ is handled by Slicerator. In all pims readers, the data
97 returning function is get_frame."""
---> 98 return self.get_frame(key)
99
100 def __iter__(self):
~\.conda\envs\trackpyenv\lib\site-packages\pims\base_frames.py in get_frame(self, i)
590 coords.update(**{k: v for k, v in zip(self.iter_axes, iter_coords)})
591
--> 592 result = self._get_frame_wrapped(**coords)
593 if hasattr(result, 'metadata'):
594 metadata = result.metadata
~\.conda\envs\trackpyenv\lib\site-packages\pims\imageio_reader.py in get_frame_2D(self, **coords)
100 def get_frame_2D(self, **coords):
101 i = coords['t'] if 't' in coords else 0
--> 102 frame = self.reader.get_data(i)
103 return Frame(frame, frame_no=i, metadata=frame.meta)
104
~\.conda\envs\trackpyenv\lib\site-packages\imageio\core\format.py in get_data(self, index, **kwargs)
344 self._BaseReaderWriter_last_index = index
345 try:
--> 346 im, meta = self._get_data(index, **kwargs)
347 except StopIteration:
348 raise IndexError(index)
~\.conda\envs\trackpyenv\lib\site-packages\imageio\plugins\ffmpeg.py in _get_data(self, index)
379 else:
380 if (index < self._pos) or (index > self._pos + 100):
--> 381 self._initialize(index)
382 else:
383 self._skip_frames(index - self._pos - 1)
~\.conda\envs\trackpyenv\lib\site-packages\imageio\plugins\ffmpeg.py in _initialize(self, index)
393 # Close the current generator, and thereby terminate its subprocess
394 if self._read_gen is not None:
--> 395 self._read_gen.close()
396
397 iargs = []
ValueError: generator already executing
I stumbled on the same (or similar) issue.
The root cause seems to be trying to use more than one process to execute the batch code, while some internal function call isn't thread-safe.
A workaround would be to disable multi-processes by calling batch with processes=1, e.g.:
f = tp.batch(frames[100:300], masksize, minmass=minmass, invert=True, processes=1);
See trackpy.batch for reference.
I call it a workaround because it makes the code execute serially, one frame at a time. Then again, better than not executing at all...
I am trying to run some code with Numba and I get errors.
What I want to do is compute the cosine similarity with a cosinus_sparse function. I use this method inside the search method, and then call search in get_result. Although I added the @jit decorator to each of these methods, I get the implementation error shown below.
Here is my code:
import numpy as np
from numba import jit
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import math

class Search:

    def __init__(self, corpus, method='XTERM', stop_words='english', max_df=1.0, min_df=1, max_features=None):
        self.corpus = corpus
        self.method = method
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.vectorization()
        self.get_shape()
        self.features_names = self.bag_of_word.get_feature_names()

    def vectorization(self):
        if self.method == 'XTERM':
            self.bag_of_word = CountVectorizer(stop_words=self.stop_words,
                                               max_df=self.max_df, min_df=self.min_df,
                                               max_features=self.max_features)
            self.corpus_vectorized = self.bag_of_word.fit_transform(self.corpus)
        elif self.method == 'TFxIDF':
            self.bag_of_word = TfidfVectorizer(stop_words=self.stop_words,
                                               max_df=self.max_df, min_df=self.min_df,
                                               max_features=self.max_features)
            self.corpus_vectorized = self.bag_of_word.fit_transform(self.corpus)
        else:
            raise MethodError('Method provided is not valid')

    def get_shape(self):
        self.n_docs, self.n_terms = self.corpus_vectorized.shape

    def get_query(self, query):
        self.indexes = [self.features_names.index(q) for q in query if q in self.features_names]
        self.query_vec = np.zeros(self.n_terms)
        self.query_vec[self.indexes] = 1

    @staticmethod
    @jit(nopython=True)
    def cosinus_sparse(i, j):
        num = i.dot(j)
        spars = i * i.transpose()
        den = math.sqrt(spars[0, 0]) * math.sqrt(sum(j * j))
        if den > 0:
            return int(num) / den
        else:
            return 0

    @jit(nopython=True)
    def search(self, q) -> dict:
        cc = {i: self.cosinus_sparse(self.corpus_vectorized[i, :], q) for i in range(self.n_docs)}
        cc = sorted(cc.items(), key=lambda x: x[1], reverse=True)
        return cc

    @jit
    def get_result(self) -> list:
        self.result = self.search(self.query_vec)

    def result_announcer(self):
        self.search_lenght = len([i for i in self.result if i[1] > 0])
        print('{} documents linked to your query where found'.format(search_lenght))

    def verif_query_vec(self, query):
        if int(sum(self.query_vec)) != len(query):
            raise QueryError('Error in query or query_vec')

    def processing(self, query):
        try:
            self.get_query(query)
            self.verif_query_vec(query)
            self.get_result()
        except NameError:
            self.vectorisation()
            self.get_shape()
            self.get_feature_names()
            self.get_query(query)
            self.verif_query_vec(query)
            self.get_result()
import ipywidgets as widgets
from IPython.display import display

text = widgets.Text(
    value='',
    placeholder='Type words',
    description='String:',
    disabled=False
)

method_radio = widgets.RadioButtons(
    options=['XTERM', 'TFxIDF'],
    # value='TF',
    description='Method:',
    disabled=False
)

submit = widgets.Button(description='Search')

display(widgets.VBox([text, radio, submit]))

def handle_submit(sender):
    global query
    query = text.value.lower().split(' ')
    method = method_radio.value
    # instantiate the search object
    global search_obj
    search_obj = Search(corpus=corpus, method=method, )
    search_obj.processing(query)

submit.on_click(handle_submit)
Here is the error
NotImplementedError Traceback (most recent call last)
<ipython-input-288-025a488daa60> in handle_submit(sender)
27 global search_obj
28 search_obj = Search(corpus=corpus, method=method, )
---> 29 search_obj.processing(query)
30
31 submit.on_click(handle_submit)
<ipython-input-287-147d4798a88b> in processing(self, query)
167 self.get_query(query)
168 self.verif_query_vec(query)
--> 169 self.get_result()
170
171 except NameError:
~\Anaconda3\lib\site-packages\numba\dispatcher.py in _compile_for_args(self, *args, **kws)
365 e.patch_message(''.join(e.args) + help_msg)
366 # ignore the FULL_TRACEBACKS config, this needs reporting!
--> 367 raise e
368
369 def inspect_llvm(self, signature=None):
~\Anaconda3\lib\site-packages\numba\dispatcher.py in _compile_for_args(self, *args, **kws)
322 argtypes.append(self.typeof_pyval(a))
323 try:
--> 324 return self.compile(tuple(argtypes))
325 except errors.TypingError as e:
326 # Intercept typing error that may be due to an argument
~\Anaconda3\lib\site-packages\numba\compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
~\Anaconda3\lib\site-packages\numba\dispatcher.py in compile(self, sig)
653
654 self._cache_misses[sig] += 1
--> 655 cres = self._compiler.compile(args, return_type)
656 self.add_overload(cres)
657 self._cache.save_overload(sig, cres)
~\Anaconda3\lib\site-packages\numba\dispatcher.py in compile(self, args, return_type)
80 args=args, return_type=return_type,
81 flags=flags, locals=self.locals,
---> 82 pipeline_class=self.pipeline_class)
83 # Check typing error if object mode is used
84 if cres.typing_error is not None and not flags.enable_pyobject:
~\Anaconda3\lib\site-packages\numba\compiler.py in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
924 pipeline = pipeline_class(typingctx, targetctx, library,
925 args, return_type, flags, locals)
--> 926 return pipeline.compile_extra(func)
927
928
~\Anaconda3\lib\site-packages\numba\compiler.py in compile_extra(self, func)
372 self.lifted = ()
373 self.lifted_from = None
--> 374 return self._compile_bytecode()
375
376 def compile_ir(self, func_ir, lifted=(), lifted_from=None):
~\Anaconda3\lib\site-packages\numba\compiler.py in _compile_bytecode(self)
855 """
856 assert self.func_ir is None
--> 857 return self._compile_core()
858
859 def _compile_ir(self):
~\Anaconda3\lib\site-packages\numba\compiler.py in _compile_core(self)
842 self.define_pipelines(pm)
843 pm.finalize()
--> 844 res = pm.run(self.status)
845 if res is not None:
846 # Early pipeline completion
~\Anaconda3\lib\site-packages\numba\compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
~\Anaconda3\lib\site-packages\numba\compiler.py in run(self, status)
253 # No more fallback pipelines?
254 if is_final_pipeline:
--> 255 raise patched_exception
256 # Go to next fallback pipeline
257 else:
~\Anaconda3\lib\site-packages\numba\compiler.py in run(self, status)
244 try:
245 event(stage_name)
--> 246 stage()
247 except _EarlyPipelineCompletion as e:
248 return e.result
~\Anaconda3\lib\site-packages\numba\compiler.py in stage_inline_pass(self)
582 self.flags.auto_parallel,
583 self.parfor_diagnostics.replaced_fns)
--> 584 inline_pass.run()
585 # Remove all Dels, and re-run postproc
586 post_proc = postproc.PostProcessor(self.func_ir)
~\Anaconda3\lib\site-packages\numba\inline_closurecall.py in run(self)
75
76 if guard(self._inline_closure,
---> 77 work_list, block, i, func_def):
78 modified = True
79 break # because block structure changed
~\Anaconda3\lib\site-packages\numba\ir_utils.py in guard(func, *args, **kwargs)
1358 """
1359 try:
-> 1360 return func(*args, **kwargs)
1361 except GuardException:
1362 return None
~\Anaconda3\lib\site-packages\numba\inline_closurecall.py in _inline_closure(self, work_list, block, i, func_def)
212 inline_closure_call(self.func_ir,
213 self.func_ir.func_id.func.__globals__,
--> 214 block, i, func_def, work_list=work_list)
215 return True
216
~\Anaconda3\lib\site-packages\numba\inline_closurecall.py in inline_closure_call(func_ir, glbls, block, i, callee, typingctx, arg_typs, typemap, calltypes, work_list)
253 callee_closure = callee.closure if hasattr(callee, 'closure') else callee.__closure__
254 # first, get the IR of the callee
--> 255 callee_ir = get_ir_of_code(glbls, callee_code)
256 callee_blocks = callee_ir.blocks
257
~\Anaconda3\lib\site-packages\numba\ir_utils.py in get_ir_of_code(glbls, fcode)
1572 f.__name__ = fcode.co_name
1573 from numba import compiler
-> 1574 ir = compiler.run_frontend(f)
1575 # we need to run the before inference rewrite pass to normalize the IR
1576 # XXX: check rewrite pass flag?
~\Anaconda3\lib\site-packages\numba\compiler.py in run_frontend(func)
168 interp = interpreter.Interpreter(func_id)
169 bc = bytecode.ByteCode(func_id=func_id)
--> 170 func_ir = interp.interpret(bc)
171 post_proc = postproc.PostProcessor(func_ir)
172 post_proc.run()
~\Anaconda3\lib\site-packages\numba\interpreter.py in interpret(self, bytecode)
101 # Data flow analysis
102 self.dfa = dataflow.DataFlowAnalysis(self.cfa)
--> 103 self.dfa.run()
104
105 # Temp states during interpretation
~\Anaconda3\lib\site-packages\numba\dataflow.py in run(self)
26 def run(self):
27 for blk in self.cfa.iterliveblocks():
---> 28 self.infos[blk.offset] = self.run_on_block(blk)
29
30 def run_on_block(self, blk):
~\Anaconda3\lib\site-packages\numba\dataflow.py in run_on_block(self, blk)
76 for offset in blk:
77 inst = self.bytecode[offset]
---> 78 self.dispatch(info, inst)
79 return info
80
~\Anaconda3\lib\site-packages\numba\dataflow.py in dispatch(self, info, inst)
86 fname = "op_%s" % inst.opname.replace('+', '_')
87 fn = getattr(self, fname, self.handle_unknown_opcode)
---> 88 fn(info, inst)
89
90 def handle_unknown_opcode(self, info, inst):
~\Anaconda3\lib\site-packages\numba\dataflow.py in handle_unknown_opcode(self, info, inst)
91 msg = "Use of unknown opcode {} at line {} of {}"
92 raise NotImplementedError(msg.format(inst.opname, inst.lineno,
---> 93 self.bytecode.func_id.filename))
94
95 def dup_topx(self, info, inst, count):
NotImplementedError: Failed in nopython mode pipeline (step: inline calls to locally defined closures)
Use of unknown opcode MAP_ADD at line 116 of <ipython-input-287-147d4798a88b>
How do I fix this error?
Thanks a lot for your help.
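Not a definitive fix, but two hedged observations. First, "Use of unknown opcode MAP_ADD" usually means the installed Numba release cannot handle the bytecode your Python version produces for the dict comprehension inside search, so upgrading Numba (or replacing the comprehension with an explicit loop) may change the error. Second, nopython mode cannot compile methods that operate on self, scipy sparse matrices, or other arbitrary Python objects, so decorating search, get_result and cosinus_sparse will keep failing even then. A sketch of the usual pattern, keeping the class un-jitted and extracting only the numeric kernel into a standalone function on dense NumPy arrays (cosinus_dense and the dense conversion are my assumptions):

import numpy as np
from numba import njit

@njit(cache=True)
def cosinus_dense(row, query):
    # cosine similarity between two dense 1-D float64 arrays
    num = 0.0
    row_norm = 0.0
    query_norm = 0.0
    for k in range(row.shape[0]):
        num += row[k] * query[k]
        row_norm += row[k] * row[k]
        query_norm += query[k] * query[k]
    den = np.sqrt(row_norm) * np.sqrt(query_norm)
    return num / den if den > 0.0 else 0.0

# Called from plain (un-jitted) Python, e.g. inside Search.search:
# scores = {
#     i: cosinus_dense(self.corpus_vectorized[i, :].toarray().ravel().astype(np.float64),
#                      self.query_vec.astype(np.float64))
#     for i in range(self.n_docs)
# }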
I'm receiving a StopIteration error when attempting to use the groupby function in xarray. The error only occurs when attempting to loop through a list of files; if a single file path is used, no error is generated. I've also tried using xr.open_mfdataset to open the full directory of files, but this produced the same error.
for path in in_files:
    ds = xr.open_dataset(path)
    ds['index'] = county_mask
    ds = ds.set_coords('index')
    ds = ds.where(ds['index'].isin(cotton_county_keys))
    ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
Produces the error:
StopIteration Traceback (most recent call last)
<ipython-input-91-f26bf31efda5> in <module>()
6 ds = ds.set_coords('index')
7 ds = ds.where(ds['index'].isin(cotton_county_keys))
----> 8 ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\common.py in wrapped_func(self, dim, keep_attrs, skipna, **kwargs)
52 return self.reduce(func, dim, keep_attrs, skipna=skipna,
53 numeric_only=numeric_only, allow_lazy=True,
---> 54 **kwargs)
55 else:
56 def wrapped_func(self, dim=None, keep_attrs=False, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
652 def reduce_dataset(ds):
653 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 654 return self.apply(reduce_dataset)
655
656 def assign(self, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in apply(self, func, **kwargs)
607 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
608 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 609 return self._combine(applied)
610
611 def _combine(self, applied):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in _combine(self, applied)
611 def _combine(self, applied):
612 """Recombine the applied objects like the original."""
--> 613 applied_example, applied = peek_at(applied)
614 coord, dim, positions = self._infer_concat_args(applied_example)
615 combined = concat(applied, dim)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\utils.py in peek_at(iterable)
113 """
114 gen = iter(iterable)
--> 115 peek = next(gen)
116 return peek, itertools.chain([peek], gen)
117
StopIteration:
As does:
ds = xr.open_dataset(in_files[0])
ds['index'] = county_mask
ds = ds.set_coords('index')
ds = ds.where(ds['index'].isin(cotton_county_keys))
ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
However, a single file path works perfectly:
path = r'V:\ARL\Weather\Product_Development\US_PRISM_DATA\daily_temp\PRISM_daily_temp_1993-01-08'
ds = xr.open_dataset(path)
ds['index'] = county_mask
ds = ds.set_coords('index')
ds = ds.where(ds['index'].isin(cotton_county_keys))
ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
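One hedged guess about the bare StopIteration: it is raised from xarray's peek_at(), which calls next() on the stream of groups, so it fires when groupby('index') has no groups at all, i.e. when every value of 'index' becomes NaN after the where() mask for a particular file. A small check inside the loop (a sketch, reusing the names from the question) would confirm whether that is what happens on the failing files:

for path in in_files:
    ds = xr.open_dataset(path)
    ds['index'] = county_mask
    ds = ds.set_coords('index')
    ds = ds.where(ds['index'].isin(cotton_county_keys))
    # If the mask matched nothing, there are no groups left and the bare
    # StopIteration above is exactly what this xarray version raises.
    n_valid = int(ds['index'].notnull().sum())
    if n_valid == 0:
        print(f'skipping {path}: no cells match cotton_county_keys')
        continue
    out = ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()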
I have a large time series data set which I want to process with Dask.
Apart from a few other columns, there is a column 'id' that identifies individuals, a column transc_date that identifies the date, and a column transc_time that identifies the time at which an individual made a transaction.
The data is sorted using:
df = df.map_partitions(lambda x: x.sort_values(['id', 'transc_date', 'transc_time'], ascending=[True, True, True]))
transc_time is of type int and transc_date is of type datetime64.
I want to create a new column which gives me for each individual the number of days since the last transaction. For this I created the following function:
def get_diff_since_last_trans(df, plot=True):
    df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
    diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
    if plot:
        sns.distplot(diffs.values, kde=False, rug=False)
    return diffs
When I try this function on a small subset of the data (200k rows) it works as intended, but when I use it on the full data set I get the ValueError below.
I first dropped all ids with fewer than 10 occurrences. transc_date does not contain NaNs; it only contains datetime64 entries.
Any idea what's going wrong?
ValueError Traceback (most recent call last)
<ipython-input-12-551d7256f328> in <module>()
1 a = get_diff_first_last_trans(df, plot=False)
----> 2 b = get_diff_since_last_trans(df, plot=False)
3 plot_trans_diff(a,b)
<ipython-input-10-8f83d4571659> in get_diff_since_last_trans(df, plot)
12 def get_diff_since_last_trans(df, plot=True):
13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
---> 14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
16 sns.distplot(diffs.values, kde = False, rug = False)
~/venv/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
133 dask.base.compute
134 """
--> 135 (result,) = compute(self, traverse=False, **kwargs)
136 return result
137
~/venv/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
331 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
332 else (None, a) for a in args]
--> 333 results = get(dsk, keys, **kwargs)
334 results_iter = iter(results)
335 return tuple(a if f is None else f(next(results_iter), *a)
~/venv/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
1997 secede()
1998 try:
-> 1999 results = self.gather(packed, asynchronous=asynchronous)
2000 finally:
2001 for f in futures.values():
~/venv/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1435 return self.sync(self._gather, futures, errors=errors,
1436 direct=direct, local_worker=local_worker,
-> 1437 asynchronous=asynchronous)
1438
1439 #gen.coroutine
~/venv/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
590 return future
591 else:
--> 592 return sync(self.loop, func, *args, **kwargs)
593
594 def __repr__(self):
~/venv/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(1000000)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/venv/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/venv/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/venv/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/venv/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1313 six.reraise(type(exception),
1314 exception,
-> 1315 traceback)
1316 if errors == 'skip':
1317 bad_keys.add(key)
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~/venv/lib/python3.6/site-packages/dask/dataframe/rolling.py in overlap_chunk()
30 parts = [p for p in (prev_part, current_part, next_part) if p is not None]
31 combined = pd.concat(parts)
---> 32 out = func(combined, *args, **kwargs)
33 if prev_part is None:
34 before = None
<ipython-input-10-8f83d4571659> in <lambda>()
11
12 def get_diff_since_last_trans(df, plot=True):
---> 13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
~/venv/lib/python3.6/site-packages/pandas/core/groupby.py in wrapper()
737 *args, **kwargs)
738 except (AttributeError):
--> 739 raise ValueError
740
741 return wrapper
ValueError:
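A hedged note on the empty ValueError: it is raised by pandas' groupby wrapper, which (as the last frame shows) converts an AttributeError inside the grouped diff() into a bare ValueError, so one plausible explanation is that on some overlap chunks the concatenated 'transc_date' column loses its datetime dtype and diff() no longer applies. A minimal sketch that coerces the dtype per chunk and declares the output metadata explicitly (the helper name last_trans_diff is mine; meta is the standard dask keyword for declaring output types):

import pandas as pd

def last_trans_diff(part):
    # make sure the date column really is datetime64 on every overlap chunk
    part = part.copy()
    part['transc_date'] = pd.to_datetime(part['transc_date'])
    return part.groupby('id')['transc_date'].diff()

df['diff_last'] = df.map_overlap(
    last_trans_diff,
    before=10, after=10,
    meta=('diff_last', 'timedelta64[ns]'),  # result is a Series of timedeltas
)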