How can I solve the "nested renamer is not supported" error in Python?

SpecificationError Traceback (most recent call last)
<ipython-input-42-d850d85f8342> in <module>
----> 1 train_label=extract_feature(train,train_label)
<ipython-input-33-23ab8dbf7d96> in extract_feature(df, train)
1 def extract_feature(df,train):
----> 2 t=groupy_feature(df,'ship','x',['max','min','mean','std','median','std','skew','sum'])
3 train=pd.merge(train,t,on='ship',how='left')
4 t=groupy_feature(df,'ship','y',['max','min','mean','std','median','std','skew','sum'])
5 train=pd.merge(train,t,on='ship',how='left')
<ipython-input-32-63d47754fe81> in groupy_feature(df, key, target, aggs)
4 agg_dict[f'{target}_{agg}']=agg
5 print(agg_dict)
----> 6 t=df.groupby(key)[target].agg(agg_dict).reset_index()
7 return t
~\AppData\Roaming\Python\Python37\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, *args, **kwargs)
251 # but not the class list / tuple itself.
252 func = _maybe_mangle_lambdas(func)
--> 253 ret = self._aggregate_multiple_funcs(func)
254 if relabeling:
255 ret.columns = columns
~\AppData\Roaming\Python\Python37\site-packages\pandas\core\groupby\generic.py in _aggregate_multiple_funcs(self, arg)
292 # GH 15931
293 if isinstance(self._selected_obj, Series):
--> 294 raise SpecificationError("nested renamer is not supported")
295
296 columns = list(arg.keys())
**SpecificationError: nested renamer is not supported**

I see the term 'std' twice in
t=groupy_feature(df,'ship','x',['max','min','mean','std','median','std','skew','sum'])

Related

TypeError: loop of ufunc does not support argument 0 of type float which has no callable exp method

Here's my dataset
Id B C
1 0.784 -1.6745
2 2.123 -2.8934
Here's what I try
import numpy as np
df.apply(lambda x: np.exp(x)/(1+np.exp(x)))
The error message
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
AttributeError: 'float' object has no attribute 'exp'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-43-288751cc2e1d> in <module>
----> 1 A.apply(lambda x: np.exp(x)/(1+np.exp(x)))
~/.local/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7550 kwds=kwds,
7551 )
-> 7552 return op.get_result()
7553
7554 def applymap(self, func) -> "DataFrame":
~/.local/lib/python3.6/site-packages/pandas/core/apply.py in get_result(self)
183 return self.apply_raw()
184
--> 185 return self.apply_standard()
186
187 def apply_empty_result(self):
~/.local/lib/python3.6/site-packages/pandas/core/apply.py in apply_standard(self)
274
275 def apply_standard(self):
--> 276 results, res_index = self.apply_series_generator()
277
278 # wrap results
~/.local/lib/python3.6/site-packages/pandas/core/apply.py in apply_series_generator(self)
303 for i, v in enumerate(series_gen):
304 # ignore SettingWithCopy here in case the user mutates
--> 305 results[i] = self.f(v)
306 if isinstance(results[i], ABCSeries):
307 # If we have a view on v, we need to make a copy because
<ipython-input-43-288751cc2e1d> in <lambda>(x)
----> 1 A.apply(lambda x: np.exp(x)/(1+np.exp(x)))
~/.local/lib/python3.6/site-packages/pandas/core/series.py in __array_ufunc__(self, ufunc, method, *inputs, **kwargs)
724
725 inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
--> 726 result = getattr(ufunc, method)(*inputs, **kwargs)
727
728 name = names[0] if len(set(names)) == 1 else None
TypeError: loop of ufunc does not support argument 0 of type float which has no callable exp method
Looks like a data type issue.
df = pd.DataFrame({'B':[0.784,2.123], 'C':[-1.6745,-2.8934]})
print(df.info())
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 B 2 non-null float64
1 C 2 non-null float64
dtypes: float64(2)
memory usage: 160.0 bytes
Apply:
df.apply(lambda x: np.exp(x)/(1+np.exp(x)))
print(df)
Result:
B C
0 0.784 -1.6745
1 2.123 -2.8934
Reproduce error:
df = pd.DataFrame({'B':[0.784,2.123], 'C':[-1.6745,-2.8934]}, dtype=object)
df.apply(lambda x: np.exp(x)/(1+np.exp(x)))

Dask IndexError: list index out of range

So I have a folder called "data", say, containing many CSV files
import dask.dataframe as dd
df = dd.read_csv('data/*.csv')
df.head()
df.column_1.mean().compute()
The above lines of code work perfectly and the dask.compute method does its job. But when I add the "include_path_column=True" parameter to the dd.read_csv() function call, I get the following error:
IndexError: list index out of range
When I expand the error I get:
IndexError Traceback (most recent call last)
<ipython-input-129-4be67235bebb> in <module>
----> 1 df['H_hp'].mean().compute()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
165 dask.base.compute
166 """
--> 167 (result,) = compute(self, traverse=False, **kwargs)
168 return result
169
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
444 )
445
--> 446 dsk = collections_to_dsk(collections, optimize_graph, **kwargs)
447 keys, postcomputes = [], []
448 for x in collections:
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in collections_to_dsk(collections, optimize_graph, **kwargs)
216 dsk, keys = _extract_graph_and_keys(val)
217 groups[opt] = (dsk, keys)
--> 218 _opt = opt(dsk, keys, **kwargs)
219 _opt_list.append(_opt)
220
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/dataframe/optimize.py in optimize(dsk, keys, **kwargs)
19 dsk = fuse_roots(dsk, keys=flat_keys)
20
---> 21 dsk = ensure_dict(dsk)
22
23 if isinstance(keys, list):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/utils.py in ensure_dict(d)
1030 dd_id = id(dd)
1031 if dd_id not in seen:
-> 1032 result.update(dd)
1033 seen.add(dd_id)
1034 return result
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/dataframe/io/csv.py in __getitem__(self, key)
80
81 if self.paths is not None:
---> 82 path_info = (self.colname, self.paths[i], self.paths)
83 else:
84 path_info = None
IndexError: list index out of range

Python Pandas Style to every nth row

I'm working on a Python project w/ Pandas and looking to implement a style to every Nth row. I've been able to select every Nth row using iloc but cannot get the style to work with a basic function. Here's my example in context:
data = [[1,2,3],[2,3,4],[3,4,5],[4,5,6]]
df = pd.DataFrame(data)
df
0 1 2
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
df.iloc[1::2, :]
0 1 2
1 2 3 4
3 4 5 6
At this point everything returns as normal, but when applying the function below, I receive a "Too many indexers" error which I can't seem to resolve:
def highlight_everyother(s):
if s.iloc[1::2, :]:
return ['background-color: yellow']*3
df.style.apply(highlight_everyother, axis=1)
ERROR:
IndexingError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _repr_html_(self)
180 Hooks into Jupyter notebook rich display system.
181 """
--> 182 return self.render()
183
184 #Appender(
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in render(self, **kwargs)
535 * table_attributes
536 """
--> 537 self._compute()
538 # TODO: namespace all the pandas keys
539 d = self._translate()
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _compute(self)
610 r = self
611 for func, args, kwargs in self._todo:
--> 612 r = func(self)(*args, **kwargs)
613 return r
614
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _apply(self, func, axis, subset, **kwargs)
618 data = self.data.loc[subset]
619 if axis is not None:
--> 620 result = data.apply(func, axis=axis, result_type="expand", **kwargs)
621 result.columns = data.columns
622 else:
~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
6876 kwds=kwds,
6877 )
-> 6878 return op.get_result()
6879
6880 def applymap(self, func) -> "DataFrame":
~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
184 return self.apply_raw()
185
--> 186 return self.apply_standard()
187
188 def apply_empty_result(self):
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
311
312 # compute the result using the series generator
--> 313 results, res_index = self.apply_series_generator()
314
315 # wrap results
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
339 else:
340 for i, v in enumerate(series_gen):
--> 341 results[i] = self.f(v)
342 keys.append(v.name)
343
<ipython-input-49-a5b996f8d6c8> in highlight_everyother(s)
11
12 def highlight_everyother(s):
---> 13 if s.iloc[1::2, :]:
14 return ['background-color: yellow']*3
15
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
2065 def _getitem_tuple(self, tup: Tuple):
2066
-> 2067 self._has_valid_tuple(tup)
2068 try:
2069 return self._getitem_lowerdim(tup)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)
699 for i, k in enumerate(key):
700 if i >= self.ndim:
--> 701 raise IndexingError("Too many indexers")
702 try:
703 self._validate_key(k, i)
IndexingError: Too many indexers
Any help would be appreciated. Thank you.
I would apply on axis=0 in case df is not indexed by a RangeIndex:
def highlight_everyother(s):
return ['background-color: yellow; color:blue' if x%2==1 else ''
for x in range(len(s))]
df.style.apply(highlight_everyother)
Output:
You are passing one row at a time to highlight_everyother. That's why you were getting the error. The below should work.
def highlight_everyother(s):
if s.name%2==1:
return ['background-color: yellow']*3
else:
return ['background-color: white']*3
df.style.apply(highlight_everyother, axis=1)

Drop duplicates where some rows contain lists and others ints/strings

I have a dataframe where I want to drop rows that have duplicate IDs. For the most part, the IDs are ints and strings. Some of the ID entries, however, are lists of multiple IDs. I cannot split up these lists, but when trying to drop duplicates I get an error. For reference, I used df = df['ID'].astype(str) and it made no difference in the errors shown below.
Code for df:
d = {'ID': [999,
123,
F41,
99W21,
662,
123,
[552, F430, R111],
44482,
F41,
[M192, 5527, 7890, 111120]
]}
df = pd.Dataframe(data=d)
The input df ID column looks something like:
Index ID
-------------
0 999
1 123
2 F41
3 99W21
4 662
5 123
6 [552, F430, R111]
7 44482
8 F41
9 [M192, 5527, 7890, 111120]
And I would like to drop duplicates such that the output is:
Index ID
-------------
0 999
1 123
2 F41
3 99W21
4 662
5 [552, F430, R111]
6 44482
7 [M192, 5527, 7890, 111120]
I have tried df.drop_duplicates(subset=['ID'], inplace=True) which gives me the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-13-0186aa1e1043> in <module>
3 # Reset index and drop CID duplicates
----> 4 df.drop_duplicates(subset=['ID'], inplace=True)
5 df.reset_index(drop=True, inplace=True)
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in drop_duplicates(self, subset, keep, inplace)
4907
4908 inplace = validate_bool_kwarg(inplace, "inplace")
-> 4909 duplicated = self.duplicated(subset, keep=keep)
4910
4911 if inplace:
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in duplicated(self, subset, keep)
4967
4968 vals = (col.values for name, col in self.items() if name in subset)
-> 4969 labels, shape = map(list, zip(*map(f, vals)))
4970
4971 ids = get_group_index(labels, shape, sort=False, xnull=False)
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in f(vals)
4945 def f(vals):
4946 labels, shape = algorithms.factorize(
-> 4947 vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)
4948 )
4949 return labels.astype("i8", copy=False), len(shape)
/usr/local/lib/python3.6/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
206 else:
207 kwargs[new_arg_name] = new_arg_value
--> 208 return func(*args, **kwargs)
209
210 return wrapper
/usr/local/lib/python3.6/dist-packages/pandas/core/algorithms.py in factorize(values, sort, order, na_sentinel, size_hint)
670
671 labels, uniques = _factorize_array(
--> 672 values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
673 )
674
/usr/local/lib/python3.6/dist-packages/pandas/core/algorithms.py in _factorize_array(values, na_sentinel, size_hint, na_value)
506 table = hash_klass(size_hint or len(values))
507 uniques, labels = table.factorize(
--> 508 values, na_sentinel=na_sentinel, na_value=na_value
509 )
510
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.factorize()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable._unique()
TypeError: unhashable type: 'list'
And also df = pd.DataFrame(np.unique(df), columns=df.columns), which gives the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-14-5b335a526fd5> in <module>
3 # Reset index and drop CID duplicates
----> 4 df = pd.DataFrame(np.unique(df), columns=df.columns)
5 df.reset_index(drop=True, inplace=True)
<__array_function__ internals> in unique(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
260 ar = np.asanyarray(ar)
261 if axis is None:
--> 262 ret = _unique1d(ar, return_index, return_inverse, return_counts)
263 return _unpack_tuple(ret)
264
/usr/local/lib/python3.6/dist-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
308 aux = ar[perm]
309 else:
--> 310 ar.sort()
311 aux = ar
312 mask = np.empty(aux.shape, dtype=np.bool_)
TypeError: '<' not supported between instances of 'float' and 'str'
If there is a way around this, I am not sure what it is, so any help would be useful.
The "unhashable type: 'list'" error means pandas is trying to hash a list, which is not a hashable object.
All of Python's immutable built-in objects are hashable, while no mutable containers (such as lists or dictionaries) are.
Try converting the column to strings, dropping the duplicates, and then changing the result back to a DataFrame:
df = df['ID'].astype(str).drop_duplicates().to_frame()

Possible bug with `xarray.Dataset.groupby()`?

I'm using Xarray version 0.8.0, Python 3.5.1, on Mac OS X El Capitan 10.11.6.
The following code works as expected.
id_data_array = xarray.DataArray([280, 306, 280], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 280
score (index) float64 0.8358 0.7536 0.9495
======
<xarray.Dataset>
Dimensions: (id: 2)
Coordinates:
* id (id) int64 280 306
Data variables:
score (id) int64 2 1
In [ ]:
However, if I change just one little thing, to make the elements of id_data_array all distinct, then there is an error.
Code:
id_data_array = xarray.DataArray([280, 306, 120], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 120
score (index) float64 0.1353 0.0437 0.1687
======
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-92-cc412270ba2e> in <module>()
5 print(score_dataset)
6 print("======")
----> 7 print(score_dataset.groupby("id").count())
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/common.py in wrapped_func(self, dim, keep_attrs, **kwargs)
44 return self.reduce(func, dim, keep_attrs,
45 numeric_only=numeric_only, allow_lazy=True,
---> 46 **kwargs)
47 return wrapped_func
48
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
605 def reduce_dataset(ds):
606 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 607 return self.apply(reduce_dataset)
608
609 def assign(self, **kwargs):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in apply(self, func, **kwargs)
562 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
563 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 564 combined = self._concat(applied)
565 result = self._maybe_restore_empty_groups(combined)
566 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in _concat(self, applied)
570 concat_dim, positions = self._infer_concat_args(applied_example)
571
--> 572 combined = concat(applied, concat_dim)
573 reordered = _maybe_reorder(combined, concat_dim, positions)
574 return reordered
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
114 raise TypeError('can only concatenate xarray Dataset and DataArray '
115 'objects, got %s' % type(first_obj))
--> 116 return f(objs, dim, data_vars, coords, compat, positions)
117
118
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
276 if coord is not None:
277 # add concat dimension last to ensure that its in the final Dataset
--> 278 result[coord.name] = coord
279
280 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in __setitem__(self, key, value)
536 raise NotImplementedError('cannot yet use a dictionary as a key '
537 'to set Dataset values')
--> 538 self.update({key: value})
539
540 def __delitem__(self, key):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in update(self, other, inplace)
1434 dataset.
1435 """
-> 1436 variables, coord_names, dims = dataset_update_method(self, other)
1437
1438 return self._replace_vars_and_dims(variables, coord_names, dims,
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in dataset_update_method(dataset, other)
490 priority_arg = 1
491 indexes = dataset.indexes
--> 492 return merge_core(objs, priority_arg=priority_arg, indexes=indexes)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
371
372 coerced = coerce_pandas_values(objs)
--> 373 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
374 expanded = expand_variable_dicts(aligned)
375
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in deep_align(list_of_variable_maps, join, copy, indexes)
146 out.append(variables)
147
--> 148 aligned = partial_align(*targets, join=join, copy=copy, indexes=indexes)
149
150 for key, aligned_obj in zip(keys, aligned):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in partial_align(*objects, **kwargs)
109 valid_indexers = dict((k, v) for k, v in joined_indexes.items()
110 if k in obj.dims)
--> 111 result.append(obj.reindex(copy=copy, **valid_indexers))
112 return tuple(result)
113
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in reindex(self, indexers, method, tolerance, copy, **kw_indexers)
1216
1217 variables = alignment.reindex_variables(
-> 1218 self.variables, self.indexes, indexers, method, tolerance, copy=copy)
1219 return self._replace_vars_and_dims(variables)
1220
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in reindex_variables(variables, indexes, indexers, method, tolerance, copy)
218 target = utils.safe_cast_to_index(indexers[name])
219 indexer = index.get_indexer(target, method=method,
--> 220 **get_indexer_kwargs)
221
222 to_shape[name] = len(target)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
2080
2081 if not self.is_unique:
-> 2082 raise InvalidIndexError('Reindexing only valid with uniquely'
2083 ' valued Index objects')
2084
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
To me this seems buggy because if this is the desired behaviour then it would be very strange. Surely, we should include the case when all the elements of the DataArray we're grouping by are distinct?
Update
I've now uninstalled and reinstalled Xarray. The new Xarray is version 0.8.1, and it seems to work fine. So it may indeed be a bug in Xarray 0.8.0.

Categories

Resources