Numpy command to calculate sine (and cosine) consumes all RAM - python

I am trying to calculate sine and cosine of month number (e.g. Jan=1, Feb=2, ... Dec=12) for a series of observations that covers ~5 years:
def get_sin(value, max_value):
    """Return the sine of ``value`` mapped onto one full cycle of ``max_value``.

    Computes ``sin(value * 2*pi / max_value)``, e.g. a month number with
    ``max_value=12``.

    Args:
        value: Position within the cycle; a scalar or anything ``np.sin``
            accepts (array, Series).
        max_value: Length of one full cycle.

    Returns:
        The sine of the scaled value, as produced by ``np.sin``.
    """
    return np.sin(value * (2. * np.pi / max_value))
def get_cosine(value, max_value):
    """Return the cosine of ``value`` mapped onto one full cycle of ``max_value``.

    Computes ``cos(value * 2*pi / max_value)``, e.g. a month number with
    ``max_value=12``.

    Args:
        value: Position within the cycle; a scalar or anything ``np.cos``
            accepts (array, Series).
        max_value: Length of one full cycle.

    Returns:
        The cosine of the scaled value, as produced by ``np.cos``.
    """
    return np.cos(value * (2. * np.pi / max_value))
I run the following command on the data:
df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(month, 12), axis=1)
However, my desktop RAM is exhausted, and then I get the following MemoryError:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
Input In [466], in <cell line: 1>()
----> 1 df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(month, 12), axis=1)
File ~\Anaconda3\lib\site-packages\pandas\core\frame.py:8839, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8828 from pandas.core.apply import frame_apply
8830 op = frame_apply(
8831 self,
8832 func=func,
(...)
8837 kwargs=kwargs,
8838 )
-> 8839 return op.apply().__finalize__(self, method="apply")
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
724 elif self.raw:
725 return self.apply_raw()
--> 727 return self.apply_standard()
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:854, in FrameApply.apply_standard(self)
851 results, res_index = self.apply_series_generator()
853 # wrap results
--> 854 return self.wrap_results(results, res_index)
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:880, in FrameApply.wrap_results(self, results, res_index)
878 # see if we can infer the results
879 if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 880 return self.wrap_results_for_axis(results, res_index)
882 # dict of scalars
883
884 # the default dtype of an empty Series will be `object`, but this
885 # code can be hit by df.mean() where the result should have dtype
886 # float64 even if it's an empty Series.
887 constructor_sliced = self.obj._constructor_sliced
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:1027, in FrameColumnApply.wrap_results_for_axis(self, results, res_index)
1023 result.index = res_index
1025 # we may want to infer results
1026 else:
-> 1027 result = self.infer_to_same_shape(results, res_index)
1029 return result
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:1033, in FrameColumnApply.infer_to_same_shape(self, results, res_index)
1031 def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
1032 """infer the results to the same shape as the input object"""
-> 1033 result = self.obj._constructor(data=results)
1034 result = result.T
1036 # set the index
File ~\Anaconda3\lib\site-packages\pandas\core\frame.py:636, in DataFrame.__init__(self, data, index, columns, dtype, copy)
630 mgr = self._init_mgr(
631 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
632 )
634 elif isinstance(data, dict):
635 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 636 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
637 elif isinstance(data, ma.MaskedArray):
638 import numpy.ma.mrecords as mrecords
File ~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py:494, in dict_to_mgr(data, index, columns, dtype, typ, copy)
487 arrays = [
488 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
489 ]
491 if copy:
492 # arrays_to_mgr (via form_blocks) won't make copies for EAs
493 # dtype attr check to exclude EADtype-castable strs
--> 494 arrays = [
495 x
496 if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
497 else x.copy()
498 for x in arrays
499 ]
500 # TODO: can we get rid of the dt64tz special case above?
502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
File ~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py:497, in <listcomp>(.0)
487 arrays = [
488 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
489 ]
491 if copy:
492 # arrays_to_mgr (via form_blocks) won't make copies for EAs
493 # dtype attr check to exclude EADtype-castable strs
494 arrays = [
495 x
496 if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
--> 497 else x.copy()
498 for x in arrays
499 ]
500 # TODO: can we get rid of the dt64tz special case above?
502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
File ~\Anaconda3\lib\site-packages\pandas\core\generic.py:6032, in NDFrame.copy(self, deep)
5926 #final
5927 def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
5928 """
5929 Make a copy of this object's indices and data.
5930
(...)
6030 dtype: object
6031 """
-> 6032 data = self._mgr.copy(deep=deep)
6033 self._clear_item_cache()
6034 return self._constructor(data).__finalize__(self, method="copy")
File ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py:603, in BaseBlockManager.copy(self, deep)
600 else:
601 new_axes = list(self.axes)
--> 603 res = self.apply("copy", deep=deep)
605 res.axes = new_axes
607 if self.ndim > 1:
608 # Avoid needing to re-compute these
File ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py:304, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
302 applied = b.apply(f, **kwargs)
303 else:
--> 304 applied = getattr(b, f)(**kwargs)
305 except (TypeError, NotImplementedError):
306 if not ignore_failures:
File ~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py:643, in Block.copy(self, deep)
641 values = self.values
642 if deep:
--> 643 values = values.copy()
644 return type(self)(values, placement=self._mgr_locs, ndim=self.ndim)
File ~\Anaconda3\lib\site-packages\pandas\core\arrays\masked.py:680, in BaseMaskedArray.copy(self)
678 def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
679 data, mask = self._data, self._mask
--> 680 data = data.copy()
681 mask = mask.copy()
682 return type(self)(data, mask, copy=False)
MemoryError: Unable to allocate 404. KiB for an array with shape (51724,) and data type float64
I suppose there is something very inefficient with my coding. Can anybody suggest what I am doing wrong?
UPDATE:
I noticed something very weird about variable 'month'. I used
df_ufvdate['month'] = df_ufvdate['month'].astype('int64')
to convert 'month' into an integer and when I run df_ufvdate.info(max_cols=250, show_counts='True') I see that 'month' is type 'int64':
month 51724 non-null int64
However, when I run
df_ufvdate['month'].describe()
I get that 'month' is type 'float64':
count 51724.000000
mean 8.030895
std 3.693370
min 1.000000
25% 5.000000
50% 9.000000
75% 11.000000
max 12.000000
Name: month, dtype: float64
Here is more info on df_ufvdate:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51724 entries, 1 to 62618
Data columns (total 211 columns)
dtypes: Int64(34), float64(105), int64(1), object(71)
memory usage: 85.3+ MB
Here is my desktop specs:
Windows 64,
RAM: 24GB,
Jupyter: 6.4.8,
Python 3.9.12 (main, Apr 4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]

I got it fixed:
def get_sin(row, column, max_value):
    """Return the sine of ``row[column]`` mapped onto one cycle of ``max_value``.

    Computes ``sin(row[column] * 2*pi / max_value)``.

    Args:
        row: Object indexable by ``column``, e.g. the row that
            ``DataFrame.apply(..., axis=1)`` passes to the callback.
        column: Key of the value to transform.
        max_value: Length of one full cycle (12 for month numbers).

    Returns:
        The sine of the scaled value, as produced by ``np.sin``.
    """
    value = row[column]
    return np.sin(value * (2. * np.pi / max_value))
def get_cosine(row, column, max_value):
    """Return the cosine of ``row[column]`` mapped onto one cycle of ``max_value``.

    Computes ``cos(row[column] * 2*pi / max_value)``.

    Args:
        row: Object indexable by ``column``, e.g. the row that
            ``DataFrame.apply(..., axis=1)`` passes to the callback.
        column: Key of the value to transform.
        max_value: Length of one full cycle (12 for month numbers).

    Returns:
        The cosine of the scaled value, as produced by ``np.cos``.
    """
    value = row[column]
    return np.cos(value * (2. * np.pi / max_value))
and then these lambdas will do the trick:
df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(row, 'month', 12), axis=1)
df_ufvdate['month_cosine'] = df_ufvdate.apply(lambda row: get_cosine(row, 'month', 12), axis=1)
Thanks to all who commented on this question!

Related

.describe() and .info() not working for me in Jupyter Notebook

I am trying to use the describe method to get summary statistics of my data, but I keep getting this error message. Is there any way to sort this out? The .info() method also gives me the same problem.
TypeError Traceback (most recent call last)
<ipython-input-28-614cd2726f37> in <module>
----> 1 players_final.describe()
~\anaconda3\lib\site-packages\pandas\core\generic.py in describe(self, percentiles, include, exclude)
10265 elif (include is None) and (exclude is None):
10266 # when some numerics are found, keep only numerics
> 10267 data = self.select_dtypes(include=[np.number])
10268 if len(data.columns) == 0:
10269 data = self
~\anaconda3\lib\site-packages\pandas\core\frame.py in select_dtypes(self, include, exclude)
3420 # the "union" of the logic of case 1 and case 2:
3421 # we get the included and excluded, and return their logical and
-> 3422 include_these = Series(not bool(include), index=self.columns)
3423 exclude_these = Series(not bool(exclude), index=self.columns)
3424
~\anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
309 data = data.copy()
310 else:
--> 311 data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
312
313 data = SingleBlockManager(data, index, fastpath=True)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
710 value = maybe_cast_to_datetime(value, dtype)
711
--> 712 subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)
713
714 else:
~\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1231 value = ensure_str(value)
1232
-> 1233 subarr = np.empty(length, dtype=dtype)
1234 subarr.fill(value)
1235
TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type
​

Not able to perform mean aggregation on group by DataFrame in pandas

I have below dataset
I want to compute the mean of the 'horsepower' column after grouping by the 'cylinders' and 'model year' columns using pandas. I am running the code in a Jupyter notebook.
Below is my code:
df = pd.read_csv('auto_mpg.csv')
df.groupby(['cylinders','model year']).agg({'horsepower':'mean'})
Basically, I am performing first group by on column 'cylinders' and 'model year' and then performing aggregation operation to get mean value.
I am getting below error:
DataError Traceback (most recent call last)
<ipython-input-105-967f7e0151c3> in <module>
2 #Creating a DataFrame grouped on cylinders and model_year and finding mean, min and max of horsepower
3 df = pd.read_csv('auto_mpg.csv')
----> 4 df.groupby(['cylinders','model year']).agg({'horsepower':['mean']})
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
949 func = maybe_mangle_lambdas(func)
950
--> 951 result, how = self._aggregate(func, *args, **kwargs)
952 if how is None:
953 return result
~\anaconda3\lib\site-packages\pandas\core\base.py in _aggregate(self, arg, *args, **kwargs)
414
415 try:
--> 416 result = _agg(arg, _agg_1dim)
417 except SpecificationError:
418
~\anaconda3\lib\site-packages\pandas\core\base.py in _agg(arg, func)
381 result = {}
382 for fname, agg_how in arg.items():
--> 383 result[fname] = func(fname, agg_how)
384 return result
385
~\anaconda3\lib\site-packages\pandas\core\base.py in _agg_1dim(name, how, subset)
365 "nested dictionary is ambiguous in aggregation"
366 )
--> 367 return colg.aggregate(how)
368
369 def _agg_2dim(how):
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
244 # but not the class list / tuple itself.
245 func = maybe_mangle_lambdas(func)
--> 246 ret = self._aggregate_multiple_funcs(func)
247 if relabeling:
248 ret.columns = columns
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _aggregate_multiple_funcs(self, arg)
317 obj._reset_cache()
318 obj._selection = name
--> 319 results[base.OutputKey(label=name, position=idx)] = obj.aggregate(func)
320
321 if any(isinstance(x, DataFrame) for x in results.values()):
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
238
239 if isinstance(func, str):
--> 240 return getattr(self, func)(*args, **kwargs)
241
242 elif isinstance(func, abc.Iterable):
~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in mean(self, numeric_only)
1391 Name: B, dtype: float64
1392 """
-> 1393 return self._cython_agg_general(
1394 "mean",
1395 alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
1049
1050 if len(output) == 0:
-> 1051 raise DataError("No numeric types to aggregate")
1052
1053 return self._wrap_aggregated_output(output, index=self.grouper.result_index)
DataError: No numeric types to aggregate
However, the min and max aggregations on the 'horsepower' column work successfully:
df = pd.read_csv('auto_mpg.csv')
df.groupby(['cylinders','model year']).agg({'horsepower':['min','max']})
I loaded the auto-mpg dataset from https://www.kaggle.com/uciml/autompg-dataset/version/3nd
and managed to replicate the problem.
The root cause is that horsepower column is loaded as type object with missing values represented as question mark strings (?), for example:
df[df.horsepower.str.contains("\?")]
Pandas doesn't know how to take the mean of question marks, so the solution would be casting the column to float:
# Convert non digit strings to NaN
df.loc[~df.horsepower.str.isdigit(), "horsepower"] = np.NaN
# Cast to float
df.horsepower = df.horsepower.astype("float")
# Aggregate
df.groupby(["cylinders", "model year"]).agg({"horsepower": "mean"})
Used pandas==1.1.5 and numpy==1.19.5.
Check the data type. I see the root cause error at the bottom of your post:
raise DataError("No numeric types to aggregate")
If the data type is right, then put 'mean' in brackets:
agg({'horsepower': ['mean']})
Try this
df = pd.read_csv('auto_mpg.csv')
# Take the mean of every column per group, then select the horsepower column.
df.groupby(['cylinders','model year']).mean()["horsepower"]
df.groupby(['cylinders','model year']).mean() will give you the mean of each column and then you are selecting the horsepower variable to get the desired columns from the df on which groupby and mean operations were performed.

ValueError: Shape of passed values is (37679, 43), indices imply (37679, 41)

I am trying to group horse data by races. I am using the pivot function to try to do this, but I keep getting a ValueError.
def group_horse_and_result(element):
    """Sort key for the column tuples of the pivoted frame.

    Args:
        element: A two-item sequence; ``element[0]`` is compared against
            ``'placing'`` and ``element[1]`` must support ``+`` with an int.

    Returns:
        ``100 + element[1]`` when ``element[0] == 'placing'``, otherwise
        ``element[1]`` unchanged, so 'placing' entries are offset by 100
        relative to the rest.
    """
    if element[0] == 'placing':
        return 100 + element[1]
    return element[1]
data = data.pivot(index='id', columns='barrier', values=data.columns[2:])
rearranged_columns = sorted(list(data.columns.values), key=group_horse_and_result)
data = data[rearranged_columns]
print(data.head())
data.fillna(0)
And I keep getting this error result:
AssertionError Traceback (most recent call last)
<ipython-input-253-97da160dc172> in <module>
5 return element[1]
6
----> 7 data = data.pivot(index='race_id', columns='placing', values=data.columns[2:])
8 rearranged_columns = sorted(list(data.columns.values), key=group_horse_and_result)
9 data = data[rearranged_columns]
~\anaconda3\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
6672 from pandas.core.reshape.pivot import pivot
6673
-> 6674 return pivot(self, index=index, columns=columns, values=values)
6675
6676 _shared_docs[
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot(data, index, columns, values)
470 # Exclude tuple because it is seen as a single column name
471 values = cast(Sequence[Label], values)
--> 472 indexed = data._constructor(
473 data[values]._values, index=index, columns=values
474 )
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
496 else:
--> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
498
499 # For data is list-like, or Iterable (will consume into list)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy)
232 block_values = [values]
233
--> 234 return create_block_manager_from_blocks(block_values, [columns, index])
235
236
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1663 ]
1664
-> 1665 mgr = BlockManager(blocks, axes)
1666 mgr._consolidate_inplace()
1667 return mgr
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
147
148 if do_integrity_check:
--> 149 self._verify_integrity()
150
151 # Populate known_consolidate, blknos, and blklocs lazily
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
326 raise construction_error(tot_items, block.shape[1:], self.axes)
327 if len(self.items) != tot_items:
--> 328 raise AssertionError(
329 "Number of manager items must equal union of "
330 f"block items\n# manager items: {len(self.items)}, # "
AssertionError: Number of manager items must equal union of block items
# manager items: 42, # tot_items: 44
Is this something to do with my data pre-processing, or is my code wrong here? I am relatively new to coding, so apologies if the wording of my question is off. The table shape is (37679, 44).
It might be because of duplicates among the columns.
The duplicate columns can be identified using data.columns.duplicated().

xarray.open_mfdataset "ValueError: axes don't match array"

I'm using python xarray's open_mfdataset to open a dataset that's spread out among multiple HDF5 files. I have 2 datasets that should be nearly identical in structure. The first dataset works just fine, but when I try and open the other, I get the error:
ValueError: axes don't match array
I have no idea what is causing the error, and every google search results in questions about neural networks, which seem to trip the same error but for a different reason.
edit: I should share, the line the error is occurring on looks like this:
df = xr.open_mfdataset("/some/directory/*.h5", concat_dim='TIME')
I'd share the files, but they're quite large and I'm not sure the legalities of doing so, and I'm not sure how to reproduce the problem without them.
edit 2:
I think I found the problem. The data's (x,y,z) dimensions are (300, 300, 60). However, the model I'm using uses the same named dimension for x and y since it's the same number of nx and ny (this is very silly imo). So the header of the file states:
-> % ncdump -h icefix-A-2014-02-05-120000-g1.h5
netcdf icefix-A-2014-02-05-120000-g1 {
dimensions:
phony_dim_0 = 300 ;
phony_dim_1 = 60 ;
phony_dim_2 = 2 ;
phony_dim_3 = 5 ;
phony_dim_4 = 11 ;
And a 3D variable looks something like
float CCP(phony_dim_1, phony_dim_0, phony_dim_0) ;
And I think xarray is having problems dealing with a variable having duplicate named dimensions. I'm not sure if there's a way to fix this other than to alter the way the model outputs its files.
The full traceback is as follows:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-cb4a12bd492e> in <module>
----> 1 control_d1 = xr.open_mfdataset(datadir + "feb2014_control/icefix*g1.h5", concat_dim='TIME')
~/anaconda3/lib/python3.7/site-packages/xarray/backends/api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, lock, data_vars, coords, autoclose, parallel, **kwargs)
717 data_vars=data_vars, coords=coords,
718 infer_order_from_coords=infer_order_from_coords,
--> 719 ids=ids)
720 except ValueError:
721 for ds in datasets:
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _auto_combine(datasets, concat_dims, compat, data_vars, coords, infer_order_from_coords, ids)
551 # Repeatedly concatenate then merge along each dimension
552 combined = _combine_nd(combined_ids, concat_dims, compat=compat,
--> 553 data_vars=data_vars, coords=coords)
554 return combined
555
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _combine_nd(combined_ids, concat_dims, data_vars, coords, compat)
473 data_vars=data_vars,
474 coords=coords,
--> 475 compat=compat)
476 combined_ds = list(combined_ids.values())[0]
477 return combined_ds
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat)
491 datasets = combined_ids.values()
492 new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat,
--> 493 data_vars, coords)
494 return new_combined_ids
495
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _auto_combine_1d(datasets, concat_dim, compat, data_vars, coords)
509 concatenated = [_auto_concat(list(ds_group), dim=dim,
510 data_vars=data_vars, coords=coords)
--> 511 for id, ds_group in grouped_by_vars]
512 else:
513 concatenated = datasets
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in <listcomp>(.0)
509 concatenated = [_auto_concat(list(ds_group), dim=dim,
510 data_vars=data_vars, coords=coords)
--> 511 for id, ds_group in grouped_by_vars]
512 else:
513 concatenated = datasets
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _auto_concat(datasets, dim, data_vars, coords)
367 'explicitly')
368 dim, = concat_dims
--> 369 return concat(datasets, dim=dim, data_vars=data_vars, coords=coords)
370
371
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
118 raise TypeError('can only concatenate xarray Dataset and DataArray '
119 'objects, got %s' % type(first_obj))
--> 120 return f(objs, dim, data_vars, coords, compat, positions)
121
122
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
303 if k in concat_over:
304 vars = ensure_common_dims([ds.variables[k] for ds in datasets])
--> 305 combined = concat_vars(vars, dim, positions)
306 insert_result_variable(k, combined)
307
~/anaconda3/lib/python3.7/site-packages/xarray/core/variable.py in concat(variables, dim, positions, shortcut)
2083 along the given dimension.
2084 """
-> 2085 variables = list(variables)
2086 if all(isinstance(v, IndexVariable) for v in variables):
2087 return IndexVariable.concat(variables, dim, positions, shortcut)
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in ensure_common_dims(vars)
296 common_shape = tuple(non_concat_dims.get(d, dim_len)
297 for d in common_dims)
--> 298 var = var.set_dims(common_dims, common_shape)
299 yield var
300
~/anaconda3/lib/python3.7/site-packages/xarray/core/variable.py in set_dims(self, dims, shape)
1209 expanded_var = Variable(expanded_dims, expanded_data, self._attrs,
1210 self._encoding, fastpath=True)
-> 1211 return expanded_var.transpose(*dims)
1212
1213 def _stack_once(self, dims, new_dim):
~/anaconda3/lib/python3.7/site-packages/xarray/core/variable.py in transpose(self, *dims)
1152 return self.copy(deep=False)
1153
-> 1154 data = as_indexable(self._data).transpose(axes)
1155 return type(self)(dims, data, self._attrs, self._encoding,
1156 fastpath=True)
~/anaconda3/lib/python3.7/site-packages/xarray/core/indexing.py in transpose(self, order)
1210
1211 def transpose(self, order):
-> 1212 return self.array.transpose(order)
1213
1214
~/anaconda3/lib/python3.7/site-packages/dask/array/core.py in transpose(self, *axes)
1331 See Also
1332 --------
-> 1333 da.store
1334 h5py.File.create_dataset
1335 """
~/anaconda3/lib/python3.7/site-packages/dask/array/routines.py in transpose(a, axes)
136
137
--> 138 #derived_from(np)
139 def swapaxes(a, axis1, axis2):
140 if axis1 == axis2:
ValueError: axes don't match array

Pandas groupby and describe flags AttributeError

I have a bunch of data stored in vals. The indices are monotonic, but not continuous. I'm attempting to do some analysis on histograms of the data, so I've created the following structure:
hist = pd.DataFrame(vals)
hist['bins'] = pd.cut(vals, 100)
This is data taken from an experimental instrument and I know that some of the bins have only 1 or 2 counts in them, which I'm trying to remove. I've tried using groupby as follows and get the following error (Full traceback included at the end of the note):
hist.groupby('bins').describe()
AttributeError: 'Categorical' object has no attribute 'flags'
However, when I do the following, the error does not show up and I get the expected result:
In[]: hist.index = hist.bins
In[]: hist['bins'] = hist.index
In[]: desc = hist.groupby('bins').describe()
In[]: desc.index.names = ['bins', 'describe']
Out[]: **describe with MultiIndex for rows.**
If I don't include the second line hist['bins'] = hist.index, I still get an AttributeError: 'Categorical' object has no attribute 'flags' and to the best that I can tell, the traceback is identical.
Can someone explain what the flags are and why they only seem to work when I set the index to bins and then replace the bins by the version stored in the index?
My end goal is to remove the data for bins with counts <= 6. If someone has an easier workaround than the way I'm going after it, I'd also be grateful.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-f606a051f2e4> in <module>()
----> 1 hist.groupby('bins').describe()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\displayhook.pyc in __call__(self, result)
245 self.start_displayhook()
246 self.write_output_prompt()
--> 247 format_dict, md_dict = self.compute_format_data(result)
248 self.write_format_data(format_dict, md_dict)
249 self.update_user_ns(result)
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\displayhook.pyc in compute_format_data(self, result)
155
156 """
--> 157 return self.shell.display_formatter.format(result)
158
159 def write_format_data(self, format_dict, md_dict=None):
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\formatters.pyc in format(self, obj, include, exclude)
150 md = None
151 try:
--> 152 data = formatter(obj)
153 except:
154 # FIXME: log the exception
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\formatters.pyc in __call__(self, obj)
479 type_pprinters=self.type_printers,
480 deferred_pprinters=self.deferred_printers)
--> 481 printer.pretty(obj)
482 printer.flush()
483 return stream.getvalue()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\lib\pretty.pyc in pretty(self, obj)
360 if callable(meth):
361 return meth(obj, self, cycle)
--> 362 return _default_pprint(obj, self, cycle)
363 finally:
364 self.end_group()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\lib\pretty.pyc in _default_pprint(obj, p, cycle)
480 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
481 # A user-provided repr.
--> 482 p.text(repr(obj))
483 return
484 p.begin_group(1, '<')
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __repr__(self)
62 Yields Bytestring in Py2, Unicode String in py3.
63 """
---> 64 return str(self)
65
66
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __str__(self)
42 if compat.PY3:
43 return self.__unicode__()
---> 44 return self.__bytes__()
45
46 def __bytes__(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __bytes__(self)
54
55 encoding = get_option("display.encoding")
---> 56 return self.__unicode__().encode(encoding, 'replace')
57
58 def __repr__(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\frame.pyc in __unicode__(self)
507 width = None
508 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 509 line_width=width, show_dimensions=show_dimensions)
510
511 return buf.getvalue()
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\frame.pyc in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, line_width, max_rows, max_cols, show_dimensions)
1340 max_rows=max_rows,
1341 max_cols=max_cols,
-> 1342 show_dimensions=show_dimensions)
1343 formatter.to_string()
1344
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\format.pyc in __init__(self, frame, buf, columns, col_space, header, index, na_rep, formatters, justify, float_format, sparsify, index_names, line_width, max_rows, max_cols, show_dimensions, **kwds)
345 self.columns = frame.columns
346
--> 347 self._chk_truncate()
348
349 def _chk_truncate(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\format.pyc in _chk_truncate(self)
410 else:
411 row_num = max_rows_adj // 2
--> 412 frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :]))
413 self.tr_row_num = row_num
414
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _get_new_axes(self)
957 new_axes[i] = ax
958
--> 959 new_axes[self.axis] = self._get_concat_axis()
960 return new_axes
961
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _get_concat_axis(self)
1009
1010 if self.keys is None:
-> 1011 concat_axis = _concat_indexes(indexes)
1012 else:
1013 concat_axis = _make_concat_multiindex(indexes, self.keys,
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _concat_indexes(indexes)
1027
1028 def _concat_indexes(indexes):
-> 1029 return indexes[0].append(indexes[1:])
1030
1031
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\index.pyc in append(self, other)
4603 arrays = []
4604 for i in range(self.nlevels):
-> 4605 label = self.get_level_values(i)
4606 appended = [o.get_level_values(i) for o in other]
4607 arrays.append(label.append(appended))
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\index.pyc in get_level_values(self, level)
4239 unique = self.levels[num] # .values
4240 labels = self.labels[num]
-> 4241 filled = com.take_1d(unique.values, labels, fill_value=unique._na_value)
4242 values = unique._simple_new(filled, self.names[num],
4243 freq=getattr(unique, 'freq', None),
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\common.pyc in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
829 out_shape[axis] = len(indexer)
830 out_shape = tuple(out_shape)
--> 831 if arr.flags.f_contiguous and axis == arr.ndim - 1:
832 # minor tweak that can make an order-of-magnitude difference
833 # for dataframes initialized directly from 2-d ndarrays
AttributeError: 'Categorical' object has no attribute 'flags'
This looks to be a bug with Categorical data that will be corrected in version 0.17.0 (issue here).
In the meantime, you could just cast the category to an object dtype - this is what was happening when you assigned to the index and back.
df['bins'] = df['bins'].astype(str)

Categories

Resources