Dask groupby with multiple columns issue - python

I have a dataframe created using the dataframe.from_delayed method that has the following columns:
_id, hour_timestamp, http_method, total_hits, username, hour, weekday.
Some details on the source dataframe:
hits_rate_stats._meta.dtypes
_id object
hour_timestamp datetime64[ns]
http_method object
total_hits object
username object
hour int64
weekday int64
dtype: object
meta index:
RangeIndex(start=0, stop=0, step=1)
When I execute the following code:
my_df_grouped = my_df.groupby(['username', 'http_method', 'weekday', 'hour'])
my_df_grouped.total_hits.sum().reset_index().compute()
I get the following exception:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-27-b24b24fc86db> in <module>()
----> 1 hits_rate_stats_grouped.total_hits.sum().reset_index().compute()
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/base.pyc in compute(self, **kwargs)
141 dask.base.compute
142 """
--> 143 (result,) = compute(self, traverse=False, **kwargs)
144 return result
145
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/base.pyc in compute(*args, **kwargs)
390 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
391 else (None, a) for a in args]
--> 392 results = get(dsk, keys, **kwargs)
393 results_iter = iter(results)
394 return tuple(a if f is None else f(next(results_iter), *a)
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
2039 secede()
2040 try:
-> 2041 results = self.gather(packed, asynchronous=asynchronous)
2042 finally:
2043 for f in futures.values():
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in gather(self, futures, errors, maxsize, direct, asynchronous)
1476 return self.sync(self._gather, futures, errors=errors,
1477 direct=direct, local_worker=local_worker,
-> 1478 asynchronous=asynchronous)
1479
1480 @gen.coroutine
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in sync(self, func, *args, **kwargs)
601 return future
602 else:
--> 603 return sync(self.loop, func, *args, **kwargs)
604
605 def __repr__(self):
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/utils.pyc in sync(loop, func, *args, **kwargs)
251 e.wait(10)
252 if error[0]:
--> 253 six.reraise(*error[0])
254 else:
255 return result[0]
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/utils.pyc in f()
235 yield gen.moment
236 thread_state.asynchronous = True
--> 237 result[0] = yield make_coro()
238 except Exception as exc:
239 logger.exception(exc)
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/gen.pyc in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/concurrent.pyc in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/gen.pyc in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in _gather(self, futures, errors, direct, local_worker)
1354 six.reraise(type(exception),
1355 exception,
-> 1356 traceback)
1357 if errors == 'skip':
1358 bad_keys.add(key)
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/dataframe/core.pyc in apply_and_enforce()
3354 return meta
3355 c = meta.columns if isinstance(df, pd.DataFrame) else meta.name
-> 3356 return _rename(c, df)
3357 return df
3358
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/dataframe/core.pyc in _rename()
3391 # deep=False doesn't doesn't copy any data/indices, so this is cheap
3392 df = df.copy(deep=False)
-> 3393 df.columns = columns
3394 return df
3395 elif isinstance(df, (pd.Series, pd.Index)):
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/generic.pyc in __setattr__()
3625 try:
3626 object.__getattribute__(self, name)
-> 3627 return object.__setattr__(self, name, value)
3628 except AttributeError:
3629 pass
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/generic.pyc in _set_axis()
557
558 def _set_axis(self, axis, labels):
--> 559 self._data.set_axis(axis, labels)
560 self._clear_item_cache()
561
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/internals.pyc in set_axis()
3072 raise ValueError('Length mismatch: Expected axis has %d elements, '
3073 'new values have %d elements' %
-> 3074 (old_len, new_len))
3075
3076 self.axes[axis] = new_labels
ValueError: Length mismatch: Expected axis has 5 elements, new values have 2 elements
When I do my_df_grouped.count().reset_index().compute() it works as it should, but when I do my_df_grouped.sum().reset_index().compute() I get:
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/groupby.pyc in _get_grouper()
2830 raise ValueError('No group keys passed!')
2831 else:
-> 2832 raise ValueError('multiple levels only valid with '
2833 'MultiIndex')
2834
ValueError: multiple levels only valid with MultiIndex
Reproducing locally with dummy data doesn't give me these errors. What could be going wrong?
EDIT:
It seems it is losing the MultiIndex. If I do this:
total_hits = my_df_grouped.total_hits.sum()
total_hits._meta.index = pd.MultiIndex(levels=[[], [], [], []], labels=[[], [], [], []], names=['username', 'http_method', 'weekday', 'hour'])
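Putting the pieces together, a minimal sketch of the full workaround, assuming the grouped series above (note: on newer pandas versions the labels= argument of pd.MultiIndex is called codes=):
import pandas as pd

total_hits = my_df_grouped.total_hits.sum()
# Patch the meta so dask knows the result carries a 4-level MultiIndex;
# reset_index() should then emit one column per group key plus the value.
total_hits._meta.index = pd.MultiIndex(
    levels=[[], [], [], []],
    labels=[[], [], [], []],
    names=['username', 'http_method', 'weekday', 'hour'])
result = total_hits.reset_index().compute()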

Related

df.apply(hurst_function) gave TypeError: must be real number, not tuple in Python

I have a dataframe column that contains the ratio of some numbers.
On that column, I want to apply the hurst function using the df.apply() method.
I don't know if the error is with df.apply or with the hurst_function.
Consider the code, which calculates the Hurst exponent on a column using the df.apply method:
import hurst

def hurst_function(df_col_slice):
    display(df_col_slice)
    return hurst.compute_Hc(df_col_slice)

def func(df_col):
    results = round(df_col.rolling(101).apply(hurst_function)[100:], 1)
    return results

func(df_col)
I get the error:
Input In [73], in func(df_col)
---> 32 results = round(df_col.rolling(101).apply(hurst_function)[100:],1)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\window\rolling.py:1843, in Rolling.apply(self, func, raw, engine, engine_kwargs, args, kwargs)
1822 @doc(
1823 template_header,
1824 create_section_header("Parameters"),
(...)
1841 kwargs: dict[str, Any] | None = None,
1842 ):
-> 1843 return super().apply(
1844 func,
1845 raw=raw,
1846 engine=engine,
1847 engine_kwargs=engine_kwargs,
1848 args=args,
1849 kwargs=kwargs,
1850 )
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\window\rolling.py:1315, in RollingAndExpandingMixin.apply(self, func, raw, engine, engine_kwargs, args, kwargs)
1312 else:
1313 raise ValueError("engine must be either 'numba' or 'cython'")
-> 1315 return self._apply(
1316 apply_func,
1317 numba_cache_key=numba_cache_key,
1318 numba_args=numba_args,
1319 )
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\window\rolling.py:590, in BaseWindow._apply(self, func, name, numba_cache_key, numba_args, **kwargs)
587 return result
589 if self.method == "single":
--> 590 return self._apply_blockwise(homogeneous_func, name)
591 else:
592 return self._apply_tablewise(homogeneous_func, name)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\window\rolling.py:442, in BaseWindow._apply_blockwise(self, homogeneous_func, name)
437 """
438 Apply the given function to the DataFrame broken down into homogeneous
439 sub-frames.
440 """
441 if self._selected_obj.ndim == 1:
--> 442 return self._apply_series(homogeneous_func, name)
444 obj = self._create_data(self._selected_obj)
445 if name == "count":
446 # GH 12541: Special case for count where we support date-like types
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\window\rolling.py:431, in BaseWindow._apply_series(self, homogeneous_func, name)
428 except (TypeError, NotImplementedError) as err:
429 raise DataError("No numeric types to aggregate") from err
--> 431 result = homogeneous_func(values)
432 return obj._constructor(result, index=obj.index, name=obj.name)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\window\rolling.py:582, in BaseWindow._apply.<locals>.homogeneous_func(values)
579 return func(x, start, end, min_periods, *numba_args)
581 with np.errstate(all="ignore"):
--> 582 result = calc(values)
584 if numba_cache_key is not None:
585 NUMBA_FUNC_CACHE[numba_cache_key] = func
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\window\rolling.py:579, in BaseWindow._apply.<locals>.homogeneous_func.<locals>.calc(x)
571 start, end = window_indexer.get_window_bounds(
572 num_values=len(x),
573 min_periods=min_periods,
574 center=self.center,
575 closed=self.closed,
576 )
577 self._check_window_bounds(start, end, len(x))
--> 579 return func(x, start, end, min_periods, *numba_args)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\window\rolling.py:1342, in RollingAndExpandingMixin._generate_cython_apply_func.<locals>.apply_func(values, begin, end, min_periods, raw)
1339 if not raw:
1340 # GH 45912
1341 values = Series(values, index=self._on)
-> 1342 return window_func(values, begin, end, min_periods)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\_libs\window\aggregations.pyx:1315, in pandas._libs.window.aggregations.roll_apply()
TypeError: must be real number, not tuple
What can I do to solve this?
Edit: display(df_col_slice) is giving the following output:
0 0.282043
1 0.103355
2 0.537766
3 0.491976
4 0.535050
...
96 0.022696
97 0.438995
98 -0.131486
99 0.248250
100 1.246463
Length: 101, dtype: float64
The hurst.compute_Hc function returns a tuple of 3 values:
H, c, vals = compute_Hc(df_col_slice)
where H is the Hurst exponent and c is some constant.
But pandas._libs.window.aggregations.roll_apply() expects the applied function to return a single scalar, the reduced result of the rolling window.
That's why your hurst_function needs to return a single value, for example the Hurst exponent H.
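A minimal sketch of that fix, keeping the rest of the code above unchanged:
import hurst

def hurst_function(df_col_slice):
    # compute_Hc returns the tuple (H, c, vals); rolling.apply needs a
    # scalar, so return only the Hurst exponent H.
    H, c, vals = hurst.compute_Hc(df_col_slice)
    return H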

Error when trying to access dask dataframe - ValueError: Length of passed values is 0, index implies

I'm getting the above error when trying to compute a dask dataframe. Here's what I'm doing (taking a pandas dataframe, converting year to datetime, then merging it with another dataframe):
from dask import dataframe as dd
#setup variables
df1x = dd.from_pandas(df1, npartitions=4).reset_index() # cudf.DataFrame.from_pandas(FullMerge)
df2x = dd.from_pandas(df2, npartitions=4).reset_index() #cudf.DataFrame.from_pandas(emissions)
# add year
df1x['year'] = dd.to_datetime(df1x.date_x,unit='ns') #pd.to_datetime(df1['date_x'])
df2x['year'] = dd.to_datetime(df2x.year,unit='ns')
#we must rename emissions DF values to match fullMerge so data can merge correctly
df2x = df2x.rename(columns={'reference_name': 'Name'})
# map revenueOut to df1 #set it to value
df1x['value'] = df1x[['year', 'Name']].merge(df2x, how='left').revenueOutput
It seems to work (no errors), but when I want to view the results, I get the above error:
df1x.to_csv('myfiles.csv', single_file = True)
I get this stack trace (if it helps):
ValueError Traceback (most recent call last)
<ipython-input-10-78b6500075c4> in <module>
----> 1 df1x.to_csv('myfiles.csv', single_file = True)
2 # dd.compute(Full_df)
/usr/local/lib/python3.7/dist-packages/dask/dataframe/core.py in to_csv(self, filename, **kwargs)
1344 from .io import to_csv
1345
-> 1346 return to_csv(self, filename, **kwargs)
1347
1348 def to_json(self, filename, *args, **kwargs):
/usr/local/lib/python3.7/dist-packages/dask/dataframe/io/csv.py in to_csv(df, filename, single_file, encoding, mode, name_function, compression, compute, scheduler, storage_options, header_first_partition_only, **kwargs)
787 )
788 if compute:
--> 789 delayed(values).compute(scheduler=scheduler)
790 return [f.path for f in files]
791 else:
/usr/local/lib/python3.7/dist-packages/dask/base.py in compute(self, **kwargs)
164 dask.base.compute
165 """
--> 166 (result,) = compute(self, traverse=False, **kwargs)
167 return result
168
/usr/local/lib/python3.7/dist-packages/dask/base.py in compute(*args, **kwargs)
435 keys = [x.__dask_keys__() for x in collections]
436 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 437 results = schedule(dsk, keys, **kwargs)
438 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
439
/usr/local/lib/python3.7/dist-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
82 get_id=_thread_get_id,
83 pack_exception=pack_exception,
---> 84 **kwargs
85 )
86
/usr/local/lib/python3.7/dist-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
484 _execute_task(task, data) # Re-execute locally
485 else:
--> 486 raise_exception(exc, tb)
487 res, worker_id = loads(res_info)
488 state["cache"][key] = res
/usr/local/lib/python3.7/dist-packages/dask/local.py in reraise(exc, tb)
314 if exc.__traceback__ is not tb:
315 raise exc.with_traceback(tb)
--> 316 raise exc
317
318
/usr/local/lib/python3.7/dist-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
220 try:
221 task, data = loads(task_info)
--> 222 result = _execute_task(task, data)
223 id = get_id()
224 result = dumps((result, id))
/usr/local/lib/python3.7/dist-packages/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/usr/local/lib/python3.7/dist-packages/dask/optimization.py in __call__(self, *args)
980 if not len(args) == len(self.inkeys):
981 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 982 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
983
984 def __reduce__(self):
/usr/local/lib/python3.7/dist-packages/dask/core.py in get(dsk, out, cache)
149 for key in toposort(dsk):
150 task = dsk[key]
--> 151 result = _execute_task(task, cache)
152 cache[key] = result
153 result = _execute_task(out, cache)
/usr/local/lib/python3.7/dist-packages/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/usr/local/lib/python3.7/dist-packages/dask/core.py in <genexpr>(.0)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/usr/local/lib/python3.7/dist-packages/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/usr/local/lib/python3.7/dist-packages/dask/utils.py in apply(func, args, kwargs)
28 def apply(func, args, kwargs=None):
29 if kwargs:
---> 30 return func(*args, **kwargs)
31 else:
32 return func(*args)
/usr/local/lib/python3.7/dist-packages/dask/dataframe/core.py in apply_and_enforce(*args, **kwargs)
5072 func = kwargs.pop("_func")
5073 meta = kwargs.pop("_meta")
-> 5074 df = func(*args, **kwargs)
5075 if is_dataframe_like(df) or is_series_like(df) or is_index_like(df):
5076 if not len(df):
/usr/local/lib/python3.7/dist-packages/dask/dataframe/shuffle.py in partitioning_index(df, npartitions)
604 An array of int64 values mapping each record to a partition.
605 """
--> 606 return hash_object_dispatch(df, index=False) % int(npartitions)
607
608
/usr/local/lib/python3.7/dist-packages/dask/utils.py in __call__(self, arg, *args, **kwargs)
504 """
505 meth = self.dispatch(type(arg))
--> 506 return meth(arg, *args, **kwargs)
507
508 @property
/usr/local/lib/python3.7/dist-packages/dask/dataframe/utils.py in hash_object_pandas(obj, index, encoding, hash_key, categorize)
470 ):
471 return pd.util.hash_pandas_object(
--> 472 obj, index=index, encoding=encoding, hash_key=hash_key, categorize=categorize
473 )
474
/usr/local/lib/python3.7/dist-packages/pandas/core/util/hashing.py in hash_pandas_object(obj, index, encoding, hash_key, categorize)
134 h = _combine_hash_arrays(hashes, num_items)
135
--> 136 h = Series(h, index=obj.index, dtype="uint64", copy=False)
137 else:
138 raise TypeError(f"Unexpected type for hashing {type(obj)}")
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
312 if len(index) != len(data):
313 raise ValueError(
--> 314 f"Length of passed values is {len(data)}, "
315 f"index implies {len(index)}."
316 )
ValueError: Length of passed values is 0, index implies 41478.
I'm not sure what to do as the pandas version is working.
IIUC, the following line is not something dask will handle well:
df1x['value'] = df1x[['year', 'Name']].merge(df2x, how='left').revenueOutput
The reason is that partitions must be aligned when assigning the variable (df1x['value'] = ...), while merge (in general) does not yield the same alignment (df1x[['year', 'Name']].merge(df2x, how='left')). This is not an issue when all data is in memory.
If df2y defined below fits into memory, then one possible option is to do it with .map_partitions:
# make sure this fits into memory
df2y = df2x[['year', 'Name', 'revenueOutput']].compute()
def add_value(df):
    df = df.merge(df2y, how='left')
    df['value'] = df['revenueOutput']
    return df
df1x = df1x.map_partitions(add_value)
If df2y does not fit into memory, then it might be possible to do an explicit dask merge and then use the merged dataframe for further analysis:
merged_df = dd.merge(df1x, df2x, on=['year', 'Name'], how='left')
merged_df['value'] = merged_df['revenueOutput']
# I assume that the line above is needed for some further
# transformation, but if that's not the case, then
# a simple column rename is more efficient
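For completeness, the rename variant would look something like this (a sketch, assuming value is only needed as revenueOutput under another name):
from dask import dataframe as dd

merged_df = dd.merge(df1x, df2x, on=['year', 'Name'], how='left')
# Rename the column in place instead of copying it into a new one.
merged_df = merged_df.rename(columns={'revenueOutput': 'value'})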

TypeError: Cannot cast IntervalArray to dtype float64 when transform(pd.qcut, x)

Here's my dataset
result score
1 0.786
1 0.896
0 0.435
1 0.563
0 0.145
Here's my code
import pandas as pd
intervals = data.groupby('result')['score'].transform(pd.qcut, 10)
Here's the error
TypeError Traceback (most recent call last)
/opt/conda/lib/python3.8/site-packages/pandas/core/arrays/interval.py in astype(self, dtype, copy)
708 try:
--> 709 return np.asarray(self).astype(dtype, copy=copy)
710 except (TypeError, ValueError) as err:
TypeError: float() argument must be a string or a number, not 'pandas._libs.interval.Interval'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-88-429b3ee3f973> in <module>
1 data['score'] = pd.to_numeric(data['score'])
----> 2 intervals = data.groupby('result')['score'].transform(pd.qcut, 10)
3 data['Bin_low'] = pd.IntervalIndex(intervals).left
4 data['Bin_high'] = pd.IntervalIndex(intervals).right
/opt/conda/lib/python3.8/site-packages/pandas/core/groupby/generic.py in transform(self, func, engine, engine_kwargs, *args, **kwargs)
491
492 if not isinstance(func, str):
--> 493 return self._transform_general(
494 func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
495 )
/opt/conda/lib/python3.8/site-packages/pandas/core/groupby/generic.py in _transform_general(self, func, engine, engine_kwargs, *args, **kwargs)
557 dtype = self._selected_obj.dtype
558 if is_numeric_dtype(dtype):
--> 559 result = maybe_downcast_to_dtype(result, dtype)
560
561 result.name = self._selected_obj.name
/opt/conda/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in maybe_downcast_to_dtype(result, dtype)
150 dtype = np.dtype(dtype)
151
--> 152 converted = maybe_downcast_numeric(result, dtype, do_round)
153 if converted is not result:
154 return converted
/opt/conda/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in maybe_downcast_numeric(result, dtype, do_round)
250 and not is_string_dtype(result.dtype)
251 ):
--> 252 return result.astype(dtype)
253
254 return result
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5544 else:
5545 # else, only a single dtype is given
-> 5546 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
5547 return self._constructor(new_data).__finalize__(self, method="astype")
5548
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
593 self, dtype, copy: bool = False, errors: str = "raise"
594 ) -> "BlockManager":
--> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
596
597 def convert(
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
404 applied = b.apply(f, **kwargs)
405 else:
--> 406 applied = getattr(b, f)(**kwargs)
407 result_blocks = _extend_blocks(applied, result_blocks)
408
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
568 if self.is_extension:
569 try:
--> 570 values = self.values.astype(dtype)
571 except (ValueError, TypeError):
572 if errors == "ignore":
/opt/conda/lib/python3.8/site-packages/pandas/core/arrays/interval.py in astype(self, dtype, copy)
710 except (TypeError, ValueError) as err:
711 msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
--> 712 raise TypeError(msg) from err
713
714 @classmethod
TypeError: Cannot cast IntervalArray to dtype float64
What should I do to cast the IntervalArray?
Since qcut returns a Series with the same indexing, you can just use apply, which works fine:
intervals = df.groupby('result')['score'].apply(pd.qcut, 10)
Output:
0 (0.741, 0.786]
1 (0.874, 0.896]
2 (0.406, 0.435]
3 (0.5619999999999999, 0.608]
4 (0.144, 0.174]
Name: score, dtype: interval
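Continuing with the question's own follow-up lines, the bin edges can then be extracted from the resulting interval Series (a sketch based on the code in the question):
intervals = df.groupby('result')['score'].apply(pd.qcut, 10)
# Left/right interval bounds as plain float columns.
df['Bin_low'] = pd.IntervalIndex(intervals).left
df['Bin_high'] = pd.IntervalIndex(intervals).right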

ValueError when using .diff() with dask dataframe

I have a large time series data set which I want to process with Dask.
Apart from a few other columns, there is a column called 'id' which identifies individuals, a column transc_date which identifies the date, and a column transc_time identifying the time when an individual made a transaction.
The data is sorted using:
df = df.map_partitions(lambda x: x.sort_values(['id', 'transc_date', 'transc_time'], ascending=[True, True, True]))
transc_time is of type int and transc_date is of type datetime64.
I want to create a new column which gives me for each individual the number of days since the last transaction. For this I created the following function:
def get_diff_since_last_trans(df, plot=True):
    df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
    diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
    if plot:
        sns.distplot(diffs.values, kde=False, rug=False)
    return diffs
When I try this function on a small subset of the data (200k rows) it works as intended. But when I use it on the full data set I get the ValueError below.
I first dropped all ids which have fewer than 10 occurrences. transc_date does not contain NaNs; it only contains datetime64 entries.
Any idea what's going wrong?
ValueError Traceback (most recent call last)
<ipython-input-12-551d7256f328> in <module>()
1 a = get_diff_first_last_trans(df, plot=False)
----> 2 b = get_diff_since_last_trans(df, plot=False)
3 plot_trans_diff(a,b)
<ipython-input-10-8f83d4571659> in get_diff_since_last_trans(df, plot)
12 def get_diff_since_last_trans(df, plot=True):
13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
---> 14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
16 sns.distplot(diffs.values, kde = False, rug = False)
~/venv/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
133 dask.base.compute
134 """
--> 135 (result,) = compute(self, traverse=False, **kwargs)
136 return result
137
~/venv/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
331 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
332 else (None, a) for a in args]
--> 333 results = get(dsk, keys, **kwargs)
334 results_iter = iter(results)
335 return tuple(a if f is None else f(next(results_iter), *a)
~/venv/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
1997 secede()
1998 try:
-> 1999 results = self.gather(packed, asynchronous=asynchronous)
2000 finally:
2001 for f in futures.values():
~/venv/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1435 return self.sync(self._gather, futures, errors=errors,
1436 direct=direct, local_worker=local_worker,
-> 1437 asynchronous=asynchronous)
1438
1439 @gen.coroutine
~/venv/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
590 return future
591 else:
--> 592 return sync(self.loop, func, *args, **kwargs)
593
594 def __repr__(self):
~/venv/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(1000000)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/venv/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/venv/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/venv/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/venv/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1313 six.reraise(type(exception),
1314 exception,
-> 1315 traceback)
1316 if errors == 'skip':
1317 bad_keys.add(key)
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~/venv/lib/python3.6/site-packages/dask/dataframe/rolling.py in overlap_chunk()
30 parts = [p for p in (prev_part, current_part, next_part) if p is not None]
31 combined = pd.concat(parts)
---> 32 out = func(combined, *args, **kwargs)
33 if prev_part is None:
34 before = None
<ipython-input-10-8f83d4571659> in <lambda>()
11
12 def get_diff_since_last_trans(df, plot=True):
---> 13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
~/venv/lib/python3.6/site-packages/pandas/core/groupby.py in wrapper()
737 *args, **kwargs)
738 except (AttributeError):
--> 739 raise ValueError
740
741 return wrapper
ValueError:

Pandas groupby and describe flags AttributeError

I have a bunch of data stored in vals. The indices are monotonic, but not continuous. I'm attempting to do some analysis on histograms of the data, so I've created the following structure:
hist = pd.DataFrame(vals)
hist['bins'] = pd.cut(vals, 100)
This is data taken from an experimental instrument and I know that some of the bins have only 1 or 2 counts in them, which I'm trying to remove. I've tried using groupby as follows and get the following error (Full traceback included at the end of the note):
hist.groupby('bins').describe()
AttributeError: 'Categorical' object has no attribute 'flags'
However, when I do the following, the error does not show up and I get the expected result:
In[]: hist.index = hist.bins
In[]: hist['bins'] = hist.index
In[]: desc = hist.groupby('bins').describe()
In[]: desc.index.names = ['bins', 'describe']
Out[]: **describe with MultiIndex for rows.**
If I don't include the second line hist['bins'] = hist.index, I still get an AttributeError: 'Categorical' object has no attribute 'flags' and to the best that I can tell, the traceback is identical.
Can someone explain what the flags are and why they only seem to work when I set the index to bins and then replace the bins by the version stored in the index?
My end goal is to remove the data for bins with counts <= 6. If someone has an easier workaround than the way I'm going after it, I'd also be grateful.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-f606a051f2e4> in <module>()
----> 1 hist.groupby('bins').describe()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\displayhook.pyc in __call__(self, result)
245 self.start_displayhook()
246 self.write_output_prompt()
--> 247 format_dict, md_dict = self.compute_format_data(result)
248 self.write_format_data(format_dict, md_dict)
249 self.update_user_ns(result)
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\displayhook.pyc in compute_format_data(self, result)
155
156 """
--> 157 return self.shell.display_formatter.format(result)
158
159 def write_format_data(self, format_dict, md_dict=None):
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\formatters.pyc in format(self, obj, include, exclude)
150 md = None
151 try:
--> 152 data = formatter(obj)
153 except:
154 # FIXME: log the exception
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\formatters.pyc in __call__(self, obj)
479 type_pprinters=self.type_printers,
480 deferred_pprinters=self.deferred_printers)
--> 481 printer.pretty(obj)
482 printer.flush()
483 return stream.getvalue()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\lib\pretty.pyc in pretty(self, obj)
360 if callable(meth):
361 return meth(obj, self, cycle)
--> 362 return _default_pprint(obj, self, cycle)
363 finally:
364 self.end_group()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\lib\pretty.pyc in _default_pprint(obj, p, cycle)
480 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
481 # A user-provided repr.
--> 482 p.text(repr(obj))
483 return
484 p.begin_group(1, '<')
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __repr__(self)
62 Yields Bytestring in Py2, Unicode String in py3.
63 """
---> 64 return str(self)
65
66
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __str__(self)
42 if compat.PY3:
43 return self.__unicode__()
---> 44 return self.__bytes__()
45
46 def __bytes__(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __bytes__(self)
54
55 encoding = get_option("display.encoding")
---> 56 return self.__unicode__().encode(encoding, 'replace')
57
58 def __repr__(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\frame.pyc in __unicode__(self)
507 width = None
508 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 509 line_width=width, show_dimensions=show_dimensions)
510
511 return buf.getvalue()
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\frame.pyc in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, line_width, max_rows, max_cols, show_dimensions)
1340 max_rows=max_rows,
1341 max_cols=max_cols,
-> 1342 show_dimensions=show_dimensions)
1343 formatter.to_string()
1344
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\format.pyc in __init__(self, frame, buf, columns, col_space, header, index, na_rep, formatters, justify, float_format, sparsify, index_names, line_width, max_rows, max_cols, show_dimensions, **kwds)
345 self.columns = frame.columns
346
--> 347 self._chk_truncate()
348
349 def _chk_truncate(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\format.pyc in _chk_truncate(self)
410 else:
411 row_num = max_rows_adj // 2
--> 412 frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :]))
413 self.tr_row_num = row_num
414
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _get_new_axes(self)
957 new_axes[i] = ax
958
--> 959 new_axes[self.axis] = self._get_concat_axis()
960 return new_axes
961
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _get_concat_axis(self)
1009
1010 if self.keys is None:
-> 1011 concat_axis = _concat_indexes(indexes)
1012 else:
1013 concat_axis = _make_concat_multiindex(indexes, self.keys,
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _concat_indexes(indexes)
1027
1028 def _concat_indexes(indexes):
-> 1029 return indexes[0].append(indexes[1:])
1030
1031
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\index.pyc in append(self, other)
4603 arrays = []
4604 for i in range(self.nlevels):
-> 4605 label = self.get_level_values(i)
4606 appended = [o.get_level_values(i) for o in other]
4607 arrays.append(label.append(appended))
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\index.pyc in get_level_values(self, level)
4239 unique = self.levels[num] # .values
4240 labels = self.labels[num]
-> 4241 filled = com.take_1d(unique.values, labels, fill_value=unique._na_value)
4242 values = unique._simple_new(filled, self.names[num],
4243 freq=getattr(unique, 'freq', None),
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\common.pyc in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
829 out_shape[axis] = len(indexer)
830 out_shape = tuple(out_shape)
--> 831 if arr.flags.f_contiguous and axis == arr.ndim - 1:
832 # minor tweak that can make an order-of-magnitude difference
833 # for dataframes initialized directly from 2-d ndarrays
AttributeError: 'Categorical' object has no attribute 'flags'
This looks to be a bug with Categorical data that will be corrected in version 0.17.0 (issue here).
In the meantime, you could just cast the category to an object dtype - this is what was happening when you assigned to the index and back.
df['bins'] = df['bins'].astype(str)
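For the stated end goal of removing bins with counts <= 6, a short sketch building on that cast (groupby().filter keeps only the groups satisfying the predicate):
hist['bins'] = hist['bins'].astype(str)
# Keep only the rows whose bin has more than 6 counts.
hist = hist.groupby('bins').filter(lambda g: len(g) > 6)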
