SpecificationError: nested renamer is not supported using groupby() - python

Could you please help me solve this issue in my code? After a spatial join, the pandas aggregation (groupby(), agg()) gives me the error below:
I have a data frame df and I use several columns from it to groupby:
In the below way I almost get the table (data frame) that I need. What is missing is an additional column that contains the number of rows in each group. In other words, I have the mean, but I would also like to know how many values were used to get these means.
In short: How do I get group-wise statistics for a dataframe?
Code:
def bin_the_midpoints(bins, midpoints):
b = bins.copy()
m = midpoints.copy()
reindexed = b.reset_index().rename(columns={'index':'bins_index'})
joined = gpd.tools.sjoin(reindexed, m)
bin_stats = joined.groupby('bins_index')['offset']\
.agg({'fold': len, 'min_offset': np.min})
return gpd.GeoDataFrame(b.join(bin_stats))
bin_stats = bin_the_midpoints(bins, midpoints)
Error:
---------------------------------------------------------------------------
SpecificationError Traceback (most recent call last)
Input In [103], in <cell line: 9>()
6 bin_stats = joined.groupby('bins_index')['offset']\
7 .agg({'fold': len, 'min_offset': np.min})
8 return gpd.GeoDataFrame(b.join(bin_stats))
----> 9 bin_stats = bin_the_midpoints(bins, midpoints)
Input In [103], in bin_the_midpoints(bins, midpoints)
4 reindexed = b.reset_index().rename(columns={'index':'bins_index'})
5 joined = gpd.tools.sjoin(reindexed, m)
----> 6 bin_stats = joined.groupby('bins_index')['offset']\
7 .agg({'fold': len, 'min_offset': np.min})
8 return gpd.GeoDataFrame(b.join(bin_stats))
File ~\anaconda3\envs\GeoSynapps\lib\site-packages\pandas\core\groupby\generic.py:271, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
267 elif isinstance(func, abc.Iterable):
268 # Catch instances of lists / tuples
269 # but not the class list / tuple itself.
270 func = maybe_mangle_lambdas(func)
--> 271 ret = self._aggregate_multiple_funcs(func)
272 if relabeling:
273 # error: Incompatible types in assignment (expression has type
274 # "Optional[List[str]]", variable has type "Index")
275 ret.columns = columns # type: ignore[assignment]
File ~\anaconda3\envs\GeoSynapps\lib\site-packages\pandas\core\groupby\generic.py:307, in SeriesGroupBy._aggregate_multiple_funcs(self, arg)
301 def _aggregate_multiple_funcs(self, arg) -> DataFrame:
302 if isinstance(arg, dict):
303
304 # show the deprecation, but only if we
305 # have not shown a higher level one
306 # GH 15931
--> 307 raise SpecificationError("nested renamer is not supported")
309 elif any(isinstance(x, (tuple, list)) for x in arg):
310 arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
SpecificationError: nested renamer is not supported

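The error is raised because you select a single column (['offset']) and then pass agg() a dict of {new_name: function}; on a SeriesGroupBy this "nested renamer" form is no longer supported. Use named aggregation instead, e.g. joined.groupby('bins_index')['offset'].agg(fold='size', min_offset='min'). More generally, read more about the agg method in pandas: you can easily add many calculations to it.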
For example you can write:
df.groupby(by=[...]).agg({'col1': ['count', 'sum', 'min']})

Related

How do I add a list to a column in pandas?

I'm trying to merge the columns kw1, kw2, kw3 shown here:
and have it in one separate column called keywords. This is what I tried:
df['keywords'] = list((df['kw1'], df['kw2'], df['kw3']))
df
but I'm getting this error:
ValueError Traceback (most recent call last)
Input In [13], in <cell line: 1>()
----> 1 df['keywords'] = list((df['kw1'], df['kw2'], df['kw3']))
2 df
File /lib/python3.10/site-packages/pandas/core/frame.py:3655, in DataFrame.__setitem__(self, key, value)
3652 self._setitem_array([key], value)
3653 else:
3654 # set column
-> 3655 self._set_item(key, value)
File /lib/python3.10/site-packages/pandas/core/frame.py:3832, in DataFrame._set_item(self, key, value)
3822 def _set_item(self, key, value) -> None:
3823 """
3824 Add series to DataFrame in specified column.
3825
(...)
3830 ensure homogeneity.
3831 """
-> 3832 value = self._sanitize_column(value)
3834 if (
3835 key in self.columns
3836 and value.ndim == 1
3837 and not is_extension_array_dtype(value)
3838 ):
3839 # broadcast across multiple columns if necessary
3840 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File /lib/python3.10/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
4532 return _reindex_for_setitem(value, self.index)
4534 if is_list_like(value):
-> 4535 com.require_length_match(value, self.index)
4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)
File /lib/python3.10/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
553 """
554 Check the length of data matches the length of the index.
555 """
556 if len(data) != len(index):
--> 557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
ValueError: Length of values (3) does not match length of index (141)
Is there a way to make it so that it turns it into a list like this [{value of kw1}, {value of kw2}, {value of kw3}]
You can do it like this
df['keywords'] = np.stack([df['kw1'], df['kw2'], df['kw3']], axis=1).tolist()
Pandas treats each element in the outermost list as a single value, so it complains that you only have three values (which are your three series) while you need 141 values for a new column, since your original frame has 141 rows.
Stacking the underlying numpy arrays of the three series on the last dimension gives you a shape (141,3) and converting them to list gives you a list of length 141, with each element being another list of length 3.
A more concise way is to extract three columns as another df and let pandas do the stacking for you
df['keywords'] = df[['kw1', 'kw2', 'kw3']].values.tolist()

dataframe error when comparing expression levels: TypeError: Unordered Categoricals can only compare equality or not

I am working with an anndata object gleaned from analyzing single-cell RNAseq data using scanpy to obtain clusters. This is far along in the process (near completed) and I am now trying to obtain a list of the average expression of certain marker genes in the leiden clusters from my data. I am getting an error at the following point.
# Backbone imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# Single Cell imports
import anndata
import scanpy as sc
markers = ["MS4A1", "CD72", "CD37", "CD79A", "CD79B","CD19"]
grouping_column = "leiden"
df = sc.get.obs_df(hy_bc, markers + [grouping_column])
mean_expression = df.loc[:, ~df.columns.isin([grouping_column])].mean(axis=0)
mean_expression:
MS4A1 1.594015
CD72 0.421510
CD37 1.858241
CD79A 1.801162
CD79B 1.180483
CD19 0.430246
dtype: float32
df, mean_expression = df.align(mean_expression, axis=1, copy=False)
Error happens here
g = (df > mean_expression).groupby(grouping_column)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [88], in <cell line: 1>()
----> 1 g = (df > mean_expression).groupby(grouping_column)
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\ops\common.py:70, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
66 return NotImplemented
68 other = item_from_zerodim(other)
---> 70 return method(self, other)
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\arraylike.py:56, in OpsMixin.__gt__(self, other)
54 #unpack_zerodim_and_defer("__gt__")
55 def __gt__(self, other):
---> 56 return self._cmp_method(other, operator.gt)
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\frame.py:6934, in DataFrame._cmp_method(self, other, op)
6931 self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
6933 # See GH#4537 for discussion of scalar op behavior
-> 6934 new_data = self._dispatch_frame_op(other, op, axis=axis)
6935 return self._construct_result(new_data)
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\frame.py:6985, in DataFrame._dispatch_frame_op(self, right, func, axis)
6979 # TODO: The previous assertion `assert right._indexed_same(self)`
6980 # fails in cases with empty columns reached via
6981 # _frame_arith_method_with_reindex
6982
6983 # TODO operate_blockwise expects a manager of the same type
6984 with np.errstate(all="ignore"):
-> 6985 bm = self._mgr.operate_blockwise(
6986 # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
6987 # incompatible type "Union[ArrayManager, BlockManager]"; expected
6988 # "ArrayManager"
6989 # error: Argument 1 to "operate_blockwise" of "BlockManager" has
6990 # incompatible type "Union[ArrayManager, BlockManager]"; expected
6991 # "BlockManager"
6992 right._mgr, # type: ignore[arg-type]
6993 array_op,
6994 )
6995 return self._constructor(bm)
6997 elif isinstance(right, Series) and axis == 1:
6998 # axis=1 means we want to operate row-by-row
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\internals\managers.py:1409, in BlockManager.operate_blockwise(self, other, array_op)
1405 def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
1406 """
1407 Apply array_op blockwise with another (aligned) BlockManager.
1408 """
-> 1409 return operate_blockwise(self, other, array_op)
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\internals\ops.py:63, in operate_blockwise(left, right, array_op)
61 res_blks: list[Block] = []
62 for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right):
---> 63 res_values = array_op(lvals, rvals)
64 if left_ea and not right_ea and hasattr(res_values, "reshape"):
65 res_values = res_values.reshape(1, -1)
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\ops\array_ops.py:269, in comparison_op(left, right, op)
260 raise ValueError(
261 "Lengths must match to compare", lvalues.shape, rvalues.shape
262 )
264 if should_extension_dispatch(lvalues, rvalues) or (
265 (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT)
266 and not is_object_dtype(lvalues.dtype)
267 ):
268 # Call the method on lvalues
--> 269 res_values = op(lvalues, rvalues)
271 elif is_scalar(rvalues) and isna(rvalues): # TODO: but not pd.NA?
272 # numpy does not like comparisons vs None
273 if op is operator.ne:
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\ops\common.py:70, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
66 return NotImplemented
68 other = item_from_zerodim(other)
---> 70 return method(self, other)
File C:\ProgramData\Anaconda3\envs\JHH216-hT246\lib\site-packages\pandas\core\arrays\categorical.py:141, in _cat_compare_op.<locals>.func(self, other)
139 if not self.ordered:
140 if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
--> 141 raise TypeError(
142 "Unordered Categoricals can only compare equality or not"
143 )
144 if isinstance(other, Categorical):
145 # Two Categoricals can only be compared if the categories are
146 # the same (maybe up to ordering, depending on ordered)
148 msg = "Categoricals can only be compared if 'categories' are the same."
TypeError: Unordered Categoricals can only compare equality or not
Code I have, but have not run yet because of the error:
frac = lambda z: sum(z) / z.shape[0]
frac.__name__ = "pos_frac"
g.aggregate([sum, frac])
It seems that your grouping column is a categorical column and not float or int. Try adding this line after the instantiation of the dataframe:
df = sc.get.obs_df(hy_bc, markers + [grouping_column])
df[grouping_column] = df[grouping_column].astype('int64')
Another issue I noticed: the expression df > mean_expression will produce all False values in leiden, because leiden has the value NaN in mean_expression. Therefore, when you use groupby, you will only have one group, which is the value False. A single group defeats the purpose of groupby. I am not sure what you are trying to do, but I wanted to point that out.

Unhashable list error when finding duplicates in a pandas dataframe

Hi, this is really confusing me, as I am using one command on a large dataframe:
df.duplicated(subset=None, keep='first')
This looks identical to what the documentation says of:
DataFrame.duplicated(subset=None, keep='first')
I'm just using df instead, however, all I get back is the following traceback:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-53-529f7b7a97fb> in <module>()
----> 1 df.duplicated(subset=None, keep='first')
/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in duplicated(self, subset, keep)
4383 vals = (col.values for name, col in self.iteritems()
4384 if name in subset)
-> 4385 labels, shape = map(list, zip(*map(f, vals)))
4386
4387 ids = get_group_index(labels, shape, sort=False, xnull=False)
/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in f(vals)
4364 def f(vals):
4365 labels, shape = algorithms.factorize(
-> 4366 vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
4367 return labels.astype('i8', copy=False), len(shape)
4368
/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
176 else:
177 kwargs[new_arg_name] = new_arg_value
--> 178 return func(*args, **kwargs)
179 return wrapper
180 return _deprecate_kwarg
/anaconda3/lib/python3.7/site-packages/pandas/core/algorithms.py in factorize(values, sort, order, na_sentinel, size_hint)
628 na_sentinel=na_sentinel,
629 size_hint=size_hint,
--> 630 na_value=na_value)
631
632 if sort and len(uniques) > 0:
/anaconda3/lib/python3.7/site-packages/pandas/core/algorithms.py in _factorize_array(values, na_sentinel, size_hint, na_value)
474 uniques = vec_klass()
475 labels = table.get_labels(values, uniques, 0, na_sentinel,
--> 476 na_value=na_value)
477
478 labels = _ensure_platform_int(labels)
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_labels()
TypeError: unhashable type: 'list'
What am I doing wrong?
From what I understand, you have lists in your data frame, and neither Python nor pandas can hash lists. You may have observed this if you have ever tried to use a list as a key in a dictionary. A simple workaround is to convert the lists to tuples, which are hashable, e.g. df['col'] = df['col'].apply(tuple) for each column that holds lists.

How to plot a pandas.core.series.Series as a bar graph?

I'm trying to plot a pandas series variable, which has a numeric id in one column and frequency of that id in the next column. I wish to plot these two as a bar graph with freq on the y-axis and id no. on the x-axis. However, there are too many rows, i.e. id numbers. Is there a way I can only plot the top 10 most frequently occurring ids?
executing this code - area_count.plot.bar
gives this error-
<bound method SeriesPlotMethods.bar of
<pandas.plotting._core.SeriesPlotMethods object at 0x0000019C68029908>>
I tried storing the top 20 values from this series into another variable using the following code:
for i in range(1,20):
f[i,:] = area_count[i,:]
But it showed this error:
ValueError Traceback (most recent call last)
<ipython-input-88-1020cb7bdfc3> in <module>
1 for i in range(1,20):
----> 2 f[i,:] = area_count[i,:]
~\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
909 key = check_bool_indexer(self.index, key)
910
--> 911 return self._get_with(key)
912
913 def _get_with(self, key):
~\Anaconda3\lib\site-packages\pandas\core\series.py in _get_with(self, key)
921 elif isinstance(key, tuple):
922 try:
--> 923 return self._get_values_tuple(key)
924 except Exception:
925 if len(key) == 1:
~\Anaconda3\lib\site-packages\pandas\core\series.py in _get_values_tuple(self, key)
966
967 if not isinstance(self.index, MultiIndex):
--> 968 raise ValueError('Can only tuple-index with a MultiIndex')
969
970 # If key is contained, would have returned by now
ValueError: Can only tuple-index with a MultiIndex
If I understand you correctly, you need the top 10 most frequently occurring ids. You can get them by turning your series into a dataframe like this:
x = df['id'].value_counts().sort_values(ascending = False).head(10).to_frame().reset_index()
x.rename(columns = {'index':'id', 'id': 'freq'}, inplace = True)
Now plot the graph:
x.plot.bar(x = 'id', y = 'freq')
Sample Output:

Error: all arrays must be same length. But they ARE the same length

I am doing some work on sentiment analysis. Here I have three arrays: the content of the sentences, the sentiment scores and the keywords.
I want to display them as a dataframe by pandas, but I got :
"ValueError: arrays must all be same length"
Here is some of my code:
print(len(text_sentences),len(score_list),len(keyinfo_list))
df = pd.DataFrame(text_sentences,score_list,keyinfo_list)
print(df)
Here are the results:
182 182 182
ValueError Traceback (most recent call last)
<ipython-input-15-cfb70aca07d1> in <module>()
21 print(len(text_sentences),len(score_list),len(keyinfo_list))
22
---> 23 df = pd.DataFrame(text_sentences,score_list,keyinfo_list)
24
25 print(df)
E:\learningsoft\anadonda\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
328 else:
329 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 330 copy=copy)
331 else:
332 mgr = self._init_dict({}, index, columns, dtype=dtype)
E:\learningsoft\anadonda\lib\site-packages\pandas\core\frame.py in _init_ndarray(self, values, index, columns, dtype, copy)
472 raise_with_traceback(e)
473
--> 474 index, columns = _get_axes(*values.shape)
475 values = values.T
476
E:\learningsoft\anadonda\lib\site-packages\pandas\core\frame.py in _get_axes(N, K, index, columns)
439 columns = _default_index(K)
440 else:
--> 441 columns = _ensure_index(columns)
442 return index, columns
443
E:\learningsoft\anadonda\lib\site-packages\pandas\core\indexes\base.py in _ensure_index(index_like, copy)
4015 if len(converted) > 0 and all_arrays:
4016 from .multi import MultiIndex
-> 4017 return MultiIndex.from_arrays(converted)
4018 else:
4019 index_like = converted
E:\learningsoft\anadonda\lib\site-packages\pandas\core\indexes\multi.py in from_arrays(cls, arrays, sortorder, names)
1094 for i in range(1, len(arrays)):
1095 if len(arrays[i]) != len(arrays[i - 1]):
-> 1096 raise ValueError('all arrays must be same length')
1097
1098 from pandas.core.categorical import _factorize_from_iterables
ValueError: all arrays must be same length
You can see all my three arrays contain 182 elements, so I don't understand why it said "all arrays must be same length".
You're passing the wrong data into pandas.DataFrame's initializer.
The way you're using it, you're essentially running:
pandas.DataFrame(data=text_sentences, index=score_list, columns=keyinfo_list)
This isn't what you want. You probably want to do something like this instead:
pd.DataFrame(data={
'sentences': text_sentences,
'scores': score_list,
'keyinfo': keyinfo_list
})

Categories

Resources