I'm trying to run a fast Fourier transform on a pandas DataFrame. I am using the Kepler exoplanet dataset, here, and a specific notebook for it, here. I recreate the code in cells 27-30 (note that the code in cell 29 is executed elsewhere, so both dataframes have the same shape as in the original notebook), which looks as follows:
import numpy as np
import scipy.fft

def spectrum_getter(X):
    Spectrum = scipy.fft.fft(X, n=X.size)
    return np.abs(Spectrum)

x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)
x_test_FT = x_test.apply(spectrum_getter, axis=1)
Both x_train_OS and x_test are pandas.core.frame.DataFrame. Running this produces:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [245], in <module>
----> 1 x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)
2 x_test_FT = x_test.apply(spectrum_getter, axis=1)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\frame.py:8827, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8816 from pandas.core.apply import frame_apply
8818 op = frame_apply(
8819 self,
8820 func=func,
(...)
8825 kwargs=kwargs,
8826 )
-> 8827 return op.apply().__finalize__(self, method="apply")
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
724 elif self.raw:
725 return self.apply_raw()
--> 727 return self.apply_standard()
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:851, in FrameApply.apply_standard(self)
850 def apply_standard(self):
--> 851 results, res_index = self.apply_series_generator()
853 # wrap results
854 return self.wrap_results(results, res_index)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:867, in FrameApply.apply_series_generator(self)
864 with option_context("mode.chained_assignment", None):
865 for i, v in enumerate(series_gen):
866 # ignore SettingWithCopy here in case the user mutates
--> 867 results[i] = self.f(v)
868 if isinstance(results[i], ABCSeries):
869 # If we have a view on v, we need to make a copy because
870 # series_generator will swap out the underlying data
871 results[i] = results[i].copy(deep=False)
Input In [244], in spectrum_getter(X)
3 def spectrum_getter(X):
----> 4 Spectrum = scipy.fft.fft(X, n=X.size)
5 return np.abs(Spectrum)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_backend.py:22, in _ScipyBackend.__ua_function__(method, args, kwargs)
20 if fn is None:
21 return NotImplemented
---> 22 return fn(*args, **kwargs)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_pocketfft\basic.py:17, in c2c(forward, x, n, axis, norm, overwrite_x, workers, plan)
14 if plan is not None:
15 raise NotImplementedError('Passing a precomputed plan is not yet '
16 'supported by scipy.fft functions')
---> 17 tmp = _asfarray(x)
18 overwrite_x = overwrite_x or _datacopied(tmp, x)
19 norm = _normalization(norm, forward)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_pocketfft\helper.py:97, in _asfarray(x)
95 dtype = x.dtype.newbyteorder('=')
96 # Always align input
---> 97 copy = not x.flags['ALIGNED']
98 return np.array(x, dtype=dtype, copy=copy)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\flags.py:98, in Flags.__getitem__(self, key)
96 def __getitem__(self, key):
97 if key not in self._keys:
---> 98 raise KeyError(key)
100 return getattr(self, key)
KeyError: 'ALIGNED'
I attempted to convert the dataframe to a numpy array, but ran into other issues. What am I doing wrong here?
I ran into the same error, so I converted my data type to a DataFrame and it solved my problem.
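For the KeyError: 'ALIGNED' itself, a common workaround (a hedged sketch, not the notebook's own fix) is to make sure scipy.fft receives a plain NumPy array rather than a pandas Series, for example by passing raw=True to DataFrame.apply:

import numpy as np
import scipy.fft

def spectrum_getter(X):
    # with raw=True, X arrives as a bare ndarray, so scipy.fft never
    # touches pandas' Flags object and the KeyError is avoided
    return np.abs(scipy.fft.fft(X, n=X.size))

x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1, raw=True)
x_test_FT = x_test.apply(spectrum_getter, axis=1, raw=True)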
import pickle
import numpy as np
import matplotlib.pyplot as plt

img = np.reshape(train.iloc[0], (1, 32, 32))  # get the first element from the training set
# imshow needs an image of shape (height, width, channel) rather than (channel, height, width)
imgView = np.transpose(img, (1, 2, 0))
plt.imshow(imgView)
ValueError Traceback (most recent call last)
Input In [41], in <cell line: 8>()
5 import numpy as np
6 import matplotlib.pyplot as plt
----> 8 img = np.reshape(train.iloc[0],(1,32,32)) # get the first element from list
10 # inorder to view in imshow we need image of type (height,width, channel) rather than (channel, height,width)
11 imgView=np.transpose(img, (1,2,0))
File <__array_function__ internals>:180, in reshape(*args, **kwargs)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\core\fromnumeric.py:298, in reshape(a, newshape, order)
198 @array_function_dispatch(_reshape_dispatcher)
199 def reshape(a, newshape, order='C'):
200 """
201 Gives a new shape to an array without changing its data.
202
(...)
296 [5, 6]])
297 """
--> 298 return _wrapfunc(a, 'reshape', newshape, order=order)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\core\fromnumeric.py:54, in _wrapfunc(obj, method, *args, **kwds)
52 bound = getattr(obj, method, None)
53 if bound is None:
---> 54 return _wrapit(obj, method, *args, **kwds)
56 try:
57 return bound(*args, **kwds)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\core\fromnumeric.py:47, in _wrapit(obj, method, *args, **kwds)
45 if not isinstance(result, mu.ndarray):
46 result = asarray(result)
---> 47 result = wrap(result)
48 return result
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py:2095, in NDFrame.__array_wrap__(self, result, context)
2093 return res
2094 d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
-> 2095 return self._constructor(res, **d).__finalize__(self, method="__array_wrap__")
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\series.py:442, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
440 index = default_index(len(data))
441 elif is_list_like(data):
--> 442 com.require_length_match(data, index)
444 # create/copy the manager
445 if isinstance(data, (SingleBlockManager, SingleArrayManager)):
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\common.py:557, in require_length_match(data, index)
553 """
554 Check the length of data matches the length of the index.
555 """
556 if len(data) != len(index):
--> 557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
ValueError: Length of values (1) does not match length of index (1024)
I am trying to convert a CIFAR-10 pickle file into images, but it fails with:
ValueError: Length of values (1) does not match length of index (1024)
How can I fix it?
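The traceback shows that np.reshape tries to wrap its (1, 32, 32) result back into the original length-1024 Series via pandas' __array_wrap__, which is where the length mismatch comes from. A hedged fix (assuming train holds one flattened 32x32 image per row) is to drop to a plain NumPy array before reshaping:

import numpy as np
import matplotlib.pyplot as plt

# convert the row to a bare ndarray so numpy does not try to
# re-wrap the (1, 32, 32) result as a length-1024 Series
img = np.reshape(train.iloc[0].to_numpy(), (1, 32, 32))
imgView = np.transpose(img, (1, 2, 0))  # (channel, h, w) -> (h, w, channel)
# squeeze the single channel away, since imshow only accepts
# (M, N), (M, N, 3) or (M, N, 4) shapes
plt.imshow(imgView.squeeze(), cmap='gray')
plt.show()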
I want to use ggplot2 within Jupyter Notebook. However, when I try to make an R magic cell and introduce a variable, I get an error.
Here is the code (one paragraph indicates one cell):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import rpy2
%matplotlib inline
from rpy2.robjects import pandas2ri
pandas2ri.activate()
%load_ext rpy2.ipython
%%R
library(ggplot2)
data = pd.read_csv('train_titanic.csv')
%%R -i data -w 900 -h 480 -u px
With this last cell, I get the following error (including traceback):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasdataframe(obj)
54 try:
---> 55 od[name] = conversion.py2rpy(values)
56 except Exception as e:
~/anaconda3/envs/catenv/lib/python3.7/functools.py in wrapper(*args, **kw)
839
--> 840 return dispatch(args[0].__class__)(*args, **kw)
841
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasseries(obj)
125 if type(x) is not homogeneous_type:
--> 126 raise ValueError('Series can only be of one type, or None.')
127 # TODO: Could this be merged with obj.type.name == 'O' case above ?
ValueError: Series can only be of one type, or None.
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_object(cls, obj)
367 try:
--> 368 mv = memoryview(obj)
369 res = cls.from_memoryview(mv)
TypeError: memoryview: a bytes-like object is required, not 'Series'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-14-75e210679e4a> in <module>
----> 1 get_ipython().run_cell_magic('R', '-i data -w 900 -h 480 -u px', '\n\n')
~/anaconda3/envs/catenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2360 with self.builtin_trap:
2361 args = (magic_arg_s, cell)
-> 2362 result = fn(*args, **kwargs)
2363 return result
2364
</home/morgan/anaconda3/envs/catenv/lib/python3.7/site-packages/decorator.py:decorator-gen-130> in R(self, line, cell, local_ns)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/ipython/rmagic.py in R(self, line, cell, local_ns)
721 raise NameError("name '%s' is not defined" % input)
722 with localconverter(converter) as cv:
--> 723 ro.r.assign(input, val)
724
725 tmpd = self.setup_graphics(args)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in __call__(self, *args, **kwargs)
190 kwargs[r_k] = v
191 return (super(SignatureTranslatedFunction, self)
--> 192 .__call__(*args, **kwargs))
193
194
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in __call__(self, *args, **kwargs)
111
112 def __call__(self, *args, **kwargs):
--> 113 new_args = [conversion.py2rpy(a) for a in args]
114 new_kwargs = {}
115 for k, v in kwargs.items():
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in <listcomp>(.0)
111
112 def __call__(self, *args, **kwargs):
--> 113 new_args = [conversion.py2rpy(a) for a in args]
114 new_kwargs = {}
115 for k, v in kwargs.items():
~/anaconda3/envs/catenv/lib/python3.7/functools.py in wrapper(*args, **kw)
838 '1 positional argument')
839
--> 840 return dispatch(args[0].__class__)(*args, **kw)
841
842 funcname = getattr(func, '__name__', 'singledispatch function')
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasdataframe(obj)
59 'The error is: %s'
60 % (name, str(e)))
---> 61 od[name] = StrVector(values)
62
63 return DataFrame(od)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/vectors.py in __init__(self, obj)
382
383 def __init__(self, obj):
--> 384 super().__init__(obj)
385 self._add_rops()
386
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in __init__(self, obj)
286 super().__init__(obj)
287 elif isinstance(obj, collections.abc.Sized):
--> 288 super().__init__(type(self).from_object(obj).__sexp__)
289 else:
290 raise TypeError('The constructor must be called '
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_object(cls, obj)
370 except (TypeError, ValueError):
371 try:
--> 372 res = cls.from_iterable(obj)
373 except ValueError:
374 msg = ('The class methods from_memoryview() and '
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _(*args, **kwargs)
26 def _cdata_res_to_rinterface(function):
27 def _(*args, **kwargs):
---> 28 cdata = function(*args, **kwargs)
29 # TODO: test cdata is of the expected CType
30 return _cdata_to_rinterface(cdata)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_iterable(cls, iterable, populate_func)
317 if populate_func is None:
318 cls._populate_r_vector(iterable,
--> 319 r_vector)
320 else:
321 populate_func(iterable, r_vector)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _populate_r_vector(cls, iterable, r_vector)
300 r_vector,
301 cls._R_SET_VECTOR_ELT,
--> 302 cls._CAST_IN)
303
304 @classmethod
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _populate_r_vector(iterable, r_vector, set_elt, cast_value)
237 def _populate_r_vector(iterable, r_vector, set_elt, cast_value):
238 for i, v in enumerate(iterable):
--> 239 set_elt(r_vector, i, cast_value(v))
240
241
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _as_charsxp_cdata(x)
430 return x.__sexp__._cdata
431 else:
--> 432 return conversion._str_to_charsxp(x)
433
434
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _str_to_charsxp(val)
118 s = rlib.R_NaString
119 else:
--> 120 cchar = _str_to_cchar(val)
121 s = rlib.Rf_mkCharCE(cchar, _CE_UTF8)
122 return s
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _str_to_cchar(s, encoding)
97 def _str_to_cchar(s, encoding: str = 'utf-8'):
98 # TODO: use isStrinb and installTrChar
---> 99 b = s.encode(encoding)
100 return ffi.new('char[]', b)
101
AttributeError: 'float' object has no attribute 'encode'
So I find that it is not possible to even start an R magic cell while importing my pandas dataframe object. However, I have tried creating R vectors inside the cell, and I can plot these using ggplot2 with no issues.
I am using Python 3.7.6, rpy2 3.1.0, jupyter-notebook 6.0.3, and Ubuntu 18.04.2 LTS on Windows Subsystem for Linux.
The problem is most likely with one (or more) columns having more than one type - therefore it is impossible to transfer the data into an R vector (which can hold only one data type). The traceback may be overwhelming, but here is the relevant part:
ValueError: Series can only be of one type, or None.
Which column is it? That is difficult to say without looking at the dataset you load, but my general solution is to check the types in the columns:
types = data.applymap(type).apply(set)
types[types.apply(len) > 1]
Anything returned by the snippet above would be a candidate culprit. There are many different ways of dealing with the problem, depending on the exact nature of the data. Workarounds that I frequently use include (a combined sketch follows below):
calling data = data.infer_objects() - helps if the pandas did not catch up with a dtype change and still stores the data with (suboptimal) Python objects
filling NaN with an empty string or a string constant if you have missing values in a string column (e.g. str_columns = str_columns.fillna(''))
dates.apply(pd.to_datetime, axis=1) if you have datetime objects but the dtype is object
using df.applymap(lambda x: datetime.combine(x, datetime.min.time()) if not isinstance(x, datetime) else x) if you have a mixture of date and datetime objects
In some very rare cases pandas stores the data differently than expected by rpy2 (following certain manipulations); then writing the dataframe to a csv file and reading it back from disk helps - but this is likely not what you are facing here, since you start from a freshly read dataframe.
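Here is a minimal sketch of the check-and-fix cycle on a hypothetical mixed-type frame (the column names are illustrative, not from the question's dataset):

import pandas as pd

data = pd.DataFrame({
    'name': ['Alice', None, 'Carol'],               # str mixed with None
    'age': pd.array([22, 38, 26], dtype='object'),  # ints stored as Python objects
})

data = data.infer_objects()             # let pandas re-infer tighter dtypes
data['name'] = data['name'].fillna('')  # make the string column homogeneous

# re-run the check from above: no column should report more than one type now
types = data.applymap(type).apply(set)
assert (types.apply(len) <= 1).all()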
I just noticed there might be an even simpler reason for the problem. For some reason, pandas2ri requires you to call pandas2ri.activate() after importing it. This solved the problem for me.
I am working on comparing the calculation speed of Dask and NumPy for different data sizes. I understand that Dask can perform computations on data in parallel, and that it splits the data into chunks so that the data size can be larger than RAM. When using the Dask code below, I get a memory error (shown at the bottom) with a square array of size 42000.
import dask.array as da  # dask.array, not plain dask, provides da.random
import time

size = 42000
y = da.random.random(size=(size, size), chunks=(size // 8, size // 8))
start = time.time()
y = y.dot(y * 2)  # arbitrary dot product calculation
y.compute()
end = time.time()
print(str(end - start) + " seconds")
However, I do not get any error when running similar code with Numpy.
import numpy as np
import time
size = 42000
x = np.random.random(size = (size,size))
start = time.time()
x = x.dot(x*2) #arbitrary dot product calculation
end = time.time()
print(str(end-start) + " seconds")
I therefore do not understand why Dask throws a memory error when NumPy doesn't, especially since Dask should be able to partition the data. Is there any explanation/solution to this?
Edit: I have only had this problem with the dot product. I have tested with mean without any problems.
MemoryError Traceback (most recent call last)
<ipython-input-3-a3af599b673a> in <module>()
3 start = time.time()
4 y = y.dot(y*2)
----> 5 y.compute()
6 end = time.time()
7 print(str(end-start) + " seconds")
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
152 dask.base.compute
153 """
--> 154 (result,) = compute(self, traverse=False, **kwargs)
155 return result
156
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
405 keys = [x.__dask_keys__() for x in collections]
406 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 407 results = get(dsk, keys, **kwargs)
408 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
409
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, **kwargs)
73 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
74 cache=cache, get_id=_thread_get_id,
---> 75 pack_exception=pack_exception, **kwargs)
76
77 # Cleanup pools associated to dead threads
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
519 _execute_task(task, data) # Re-execute locally
520 else:
--> 521 raise_exception(exc, tb)
522 res, worker_id = loads(res_info)
523 state['cache'][key] = res
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
65 if exc.__traceback__ is not tb:
66 raise exc.with_traceback(tb)
---> 67 raise exc
68
69 else:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
288 try:
289 task, data = loads(task_info)
--> 290 result = _execute_task(task, data)
291 id = get_id()
292 result = dumps((result, id))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\local.py in _execute_task(arg, cache, dsk)
269 func, args = arg[0], arg[1:]
270 args2 = [_execute_task(a, cache) for a in args]
--> 271 return func(*args2)
272 elif not ishashable(arg):
273 return arg
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\compatibility.py in apply(func, args, kwargs)
46 def apply(func, args, kwargs=None):
47 if kwargs:
---> 48 return func(*args, **kwargs)
49 else:
50 return func(*args)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\fromnumeric.py in sum(a, axis, dtype, out, keepdims)
1880 return sum(axis=axis, dtype=dtype, out=out, **kwargs)
1881 return _methods._sum(a, axis=axis, dtype=dtype,
-> 1882 out=out, **kwargs)
1883
1884
~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\_methods.py in _sum(a, axis, dtype, out, keepdims)
30
31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
---> 32 return umr_sum(a, axis, dtype, out, keepdims)
33
34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False):
MemoryError:
During the final stage, when Dask stitches things together, it will probably need around 2x memory for the output.
Generally, you probably shouldn't use Dask if your computation fits in memory. NumPy with a modern BLAS implementation (OpenBLAS, MKL, ...) will probably perform better.
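If you do want the Dask version to finish on limited RAM, one hedged option (an assumption about your setup, not part of the answer above) is to avoid materializing the full result in memory at all and stream it to disk instead, e.g. via zarr:

import dask.array as da

size = 42000
y = da.random.random(size=(size, size), chunks=(size // 8, size // 8))
z = y.dot(y * 2)

# write the ~14 GB float64 result chunk by chunk instead of calling
# z.compute(), so it never has to fit in RAM at once; requires the
# zarr package to be installed
da.to_zarr(z, 'dot_result.zarr')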
I'm receiving a StopIteration error when attempting to use the groupby function in xarray. The error only occurs when attempting to loop through a list of files; if a single file path is input, no error is generated. I've also tried using xr.open_mfdataset to open the full directory of files, but this produced the same error.
import xarray as xr

for path in in_files:
    ds = xr.open_dataset(path)
    ds['index'] = county_mask
    ds = ds.set_coords('index')
    ds = ds.where(ds['index'].isin(cotton_county_keys))
    ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
Produces the error:
StopIteration Traceback (most recent call last)
<ipython-input-91-f26bf31efda5> in <module>()
6 ds = ds.set_coords('index')
7 ds = ds.where(ds['index'].isin(cotton_county_keys))
----> 8 ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\common.py in wrapped_func(self, dim, keep_attrs, skipna, **kwargs)
52 return self.reduce(func, dim, keep_attrs, skipna=skipna,
53 numeric_only=numeric_only, allow_lazy=True,
---> 54 **kwargs)
55 else:
56 def wrapped_func(self, dim=None, keep_attrs=False, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
652 def reduce_dataset(ds):
653 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 654 return self.apply(reduce_dataset)
655
656 def assign(self, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in apply(self, func, **kwargs)
607 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
608 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 609 return self._combine(applied)
610
611 def _combine(self, applied):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\groupby.py in _combine(self, applied)
611 def _combine(self, applied):
612 """Recombine the applied objects like the original."""
--> 613 applied_example, applied = peek_at(applied)
614 coord, dim, positions = self._infer_concat_args(applied_example)
615 combined = concat(applied, dim)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\xarray\core\utils.py in peek_at(iterable)
113 """
114 gen = iter(iterable)
--> 115 peek = next(gen)
116 return peek, itertools.chain([peek], gen)
117
StopIteration:
As does:
ds = xr.open_dataset(in_files[0])
ds['index'] = county_mask
ds = ds.set_coords('index')
ds = ds.where(ds['index'].isin(cotton_county_keys))
ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
However, a single file path works perfectly:
path = r'V:\ARL\Weather\Product_Development\US_PRISM_DATA\daily_temp\PRISM_daily_temp_1993-01-08'
ds = xr.open_dataset(path)
ds['index'] = county_mask
ds = ds.set_coords('index')
ds = ds.where(ds['index'].isin(cotton_county_keys))
ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()
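One hedged way to narrow this down (an assumption based on the traceback, not a confirmed fix): the StopIteration comes from peek_at() finding no groups to iterate over, which would happen if the where() mask removed every matching value in some files. Checking the mask before reducing may isolate the offending paths:

import xarray as xr

for path in in_files:
    ds = xr.open_dataset(path)
    ds['index'] = county_mask
    ds = ds.set_coords('index')
    ds = ds.where(ds['index'].isin(cotton_county_keys))
    # if the mask wiped out every value, groupby has nothing to
    # iterate over, which surfaces as the StopIteration above
    if int(ds['index'].count()) == 0:
        print('no matching counties in', path)
        continue
    ds.groupby('index').mean('stacked_lat_lon').to_dataframe().reset_index()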
I have a large time series data set which I want to process with Dask.
Apart from a few other columns, there is a column called 'id' which identifies individuals, a column transc_date which identifies the date, and a column transc_time identifying the time when an individual made a transaction.
The data is sorted using:
df = df.map_partitions(lambda x: x.sort_values(['id', 'transc_date', 'transc_time'], ascending=[True, True, True]))
transc_time is of type int and transc_date is of type datetime64.
I want to create a new column which gives me for each individual the number of days since the last transaction. For this I created the following function:
import seaborn as sns

def get_diff_since_last_trans(df, plot=True):
    df['diff_last'] = df.map_overlap(
        lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
    diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
    if plot:
        sns.distplot(diffs.values, kde=False, rug=False)
    return diffs
When I try this function on a small subset of the data (200k rows) it works as intended, but when I use it on the full data set I get the ValueError below.
I dropped all ids which have fewer than 10 occurrences first. transc_date does not contain NaNs; it only contains datetime64 entries.
Any idea what's going wrong?
ValueError Traceback (most recent call last)
<ipython-input-12-551d7256f328> in <module>()
1 a = get_diff_first_last_trans(df, plot=False)
----> 2 b = get_diff_since_last_trans(df, plot=False)
3 plot_trans_diff(a,b)
<ipython-input-10-8f83d4571659> in get_diff_since_last_trans(df, plot)
12 def get_diff_since_last_trans(df, plot=True):
13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
---> 14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
16 sns.distplot(diffs.values, kde = False, rug = False)
~/venv/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
133 dask.base.compute
134 """
--> 135 (result,) = compute(self, traverse=False, **kwargs)
136 return result
137
~/venv/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
331 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
332 else (None, a) for a in args]
--> 333 results = get(dsk, keys, **kwargs)
334 results_iter = iter(results)
335 return tuple(a if f is None else f(next(results_iter), *a)
~/venv/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
1997 secede()
1998 try:
-> 1999 results = self.gather(packed, asynchronous=asynchronous)
2000 finally:
2001 for f in futures.values():
~/venv/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1435 return self.sync(self._gather, futures, errors=errors,
1436 direct=direct, local_worker=local_worker,
-> 1437 asynchronous=asynchronous)
1438
1439 @gen.coroutine
~/venv/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
590 return future
591 else:
--> 592 return sync(self.loop, func, *args, **kwargs)
593
594 def __repr__(self):
~/venv/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(1000000)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/venv/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/venv/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/venv/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/venv/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1313 six.reraise(type(exception),
1314 exception,
-> 1315 traceback)
1316 if errors == 'skip':
1317 bad_keys.add(key)
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~/venv/lib/python3.6/site-packages/dask/dataframe/rolling.py in overlap_chunk()
30 parts = [p for p in (prev_part, current_part, next_part) if p is not None]
31 combined = pd.concat(parts)
---> 32 out = func(combined, *args, **kwargs)
33 if prev_part is None:
34 before = None
<ipython-input-10-8f83d4571659> in <lambda>()
11
12 def get_diff_since_last_trans(df, plot=True):
---> 13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
~/venv/lib/python3.6/site-packages/pandas/core/groupby.py in wrapper()
737 *args, **kwargs)
738 except (AttributeError):
--> 739 raise ValueError
740
741 return wrapper
ValueError:
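A hedged debugging step (an assumption drawn from the last traceback frame, where pandas catches an AttributeError and re-raises it as a bare ValueError): reproduce the failing step eagerly on a pair of neighboring partitions, so the real underlying exception surfaces instead of being swallowed:

import pandas as pd

# mimic what overlap_chunk does: concatenate adjacent partitions,
# then run the same groupby diff; the genuine AttributeError should
# now propagate instead of the bare ValueError seen above
chunk = pd.concat([df.get_partition(0).compute(),
                   df.get_partition(1).compute()])
chunk.groupby('id')['transc_date'].diff()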