non-NDFrame object error using pandas.SparseSeries.from_coo() function - python

I am trying to convert a COO-type sparse matrix (from scipy.sparse) to a pandas sparse series. The documentation (http://pandas.pydata.org/pandas-docs/stable/sparse.html) says to use the command SparseSeries.from_coo(A). The conversion itself seems to be OK, but when I try to display the resulting series, this is what happens.
A 10x10 matrix seems OK:
import pandas as pd
import scipy.sparse as ss
import numpy as np
row = (np.random.random(10)*10).astype(int)
col = (np.random.random(10)*10).astype(int)
val = np.random.random(10)*10
sparse = ss.coo_matrix((val,(row,col)),shape=(10,10))
pss = pd.SparseSeries.from_coo(sparse)
print pss
0 7 1.416631
9 5.833902
1 0 4.131919
2 3 2.820531
7 2.227009
3 1 9.205619
4 4 8.309077
6 0 4.376921
7 6 8.444013
7 7.383886
dtype: float64
BlockIndex
Block locations: array([0])
Block lengths: array([10])
But a 100x100 matrix is not:
import pandas as pd
import scipy.sparse as ss
import numpy as np
row = (np.random.random(100)*100).astype(int)
col = (np.random.random(100)*100).astype(int)
val = np.random.random(100)*100
sparse = ss.coo_matrix((val,(row,col)),shape=(100,100))
pss = pd.SparseSeries.from_coo(sparse)
print pss
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-790-f0c22a601b93> in <module>()
7 sparse = ss.coo_matrix((val,(row,col)),shape=(100,100))
8 pss = pd.SparseSeries.from_coo(sparse)
----> 9 print pss
10
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\base.pyc in __str__(self)
45 if compat.PY3:
46 return self.__unicode__()
---> 47 return self.__bytes__()
48
49 def __bytes__(self):
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\base.pyc in __bytes__(self)
57
58 encoding = get_option("display.encoding")
---> 59 return self.__unicode__().encode(encoding, 'replace')
60
61 def __repr__(self):
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\sparse\series.pyc in __unicode__(self)
287 def __unicode__(self):
288 # currently, unicode is same as repr...fixes infinite loop
--> 289 series_rep = Series.__unicode__(self)
290 rep = '%s\n%s' % (series_rep, repr(self.sp_index))
291 return rep
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\series.pyc in __unicode__(self)
895
896 self.to_string(buf=buf, name=self.name, dtype=self.dtype,
--> 897 max_rows=max_rows)
898 result = buf.getvalue()
899
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\series.pyc in to_string(self, buf, na_rep, float_format, header, length, dtype, name, max_rows)
960 the_repr = self._get_repr(float_format=float_format, na_rep=na_rep,
961 header=header, length=length, dtype=dtype,
--> 962 name=name, max_rows=max_rows)
963
964 # catch contract violations
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\series.pyc in _get_repr(self, name, header, length, dtype, na_rep, float_format, max_rows)
989 na_rep=na_rep,
990 float_format=float_format,
--> 991 max_rows=max_rows)
992 result = formatter.to_string()
993
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\format.pyc in __init__(self, series, buf, length, header, na_rep, name, float_format, dtype, max_rows)
145 self.dtype = dtype
146
--> 147 self._chk_truncate()
148
149 def _chk_truncate(self):
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\format.pyc in _chk_truncate(self)
158 else:
159 row_num = max_rows // 2
--> 160 series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
161 self.tr_row_num = row_num
162 self.tr_series = series
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\tools\merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\tools\merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
803 for obj in objs:
804 if not isinstance(obj, NDFrame):
--> 805 raise TypeError("cannot concatenate a non-NDFrame object")
806
807 # consolidate
TypeError: cannot concatenate a non-NDFrame object
I don't really understand the error message - I think I am following the example in the documentation to the letter, just using my own COO matrix (could it be the size?)
Regards

I have an older pandas. It has the sparse code, but not the tocoo.
The pandas issue that has been filed in connection with this is:
https://github.com/pydata/pandas/issues/10818
I found the relevant conversion code on GitHub:
def _coo_to_sparse_series(A, dense_index=False):
""" Convert a scipy.sparse.coo_matrix to a SparseSeries.
Use the defaults given in the SparseSeries constructor. """
s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
s = s.sort_index()
s = s.to_sparse() # TODO: specify kind?
# ...
return s
With a smallish sparse matrix I construct and display without problems:
In [259]: Asml=sparse.coo_matrix(np.arange(10*5).reshape(10,5))
In [260]: s=pd.Series(Asml.data,pd.MultiIndex.from_arrays((Asml.row,Asml.col)))
In [261]: s=s.sort_index()
In [262]: s
Out[262]:
0 1 1
2 2
3 3
4 4
1 0 5
1 6
2 7
[... mine]
3 48
4 49
dtype: int32
In [263]: ssml=s.to_sparse()
In [264]: ssml
Out[264]:
0 1 1
2 2
3 3
4 4
1 0 5
[... mine]
2 47
3 48
4 49
dtype: int32
BlockIndex
Block locations: array([0])
Block lengths: array([49])
But with a larger array (more nonzero elements) I get a display error. I'm guessing it happens when the display of the (plain) series starts to use an ellipsis (...). I'm running Python 3, so I get a different error message from the one in the question.
....\pandas\core\base.pyc in __str__(self)
45 if compat.PY3:
46 return self.__unicode__() # py3
47 return self.__bytes__() # py2 route
e.g.:
In [265]: Asml=sparse.coo_matrix(np.arange(10*7).reshape(10,7))
In [266]: s=pd.Series(Asml.data,pd.MultiIndex.from_arrays((Asml.row,Asml.col)))
In [267]: s=s.sort_index()
In [268]: s
Out[268]:
0 1 1
2 2
3 3
4 4
5 5
6 6
1 0 7
1 8
2 9
3 10
4 11
5 12
6 13
2 0 14
1 15
...
7 6 55
8 0 56
1 57
[... mine]
Length: 69, dtype: int32
In [269]: ssml=s.to_sparse()
In [270]: ssml
Out[270]: <repr(<pandas.sparse.series.SparseSeries at 0xaff6bc0c>)
failed: AttributeError: 'SparseArray' object has no attribute '_get_repr'>
I'm not sufficiently familiar with pandas code and structures to deduce much more for now.

Related

Unable to allocate 208. GiB for an array with shape (27939587241,) and data type int64?

This is my code:
play_count_with_title = pd.merge(df_count, df_small[['song_id', 'title', 'release']], on = 'song_id' )
final_ratings = pd.merge(play_count_with_title, df_small[['song_id', 'artist_name']], on = 'song_id' )
final_ratings
The error I got is:
Unable to allocate 208. GiB for an array with shape (27939587241,) and data type int64
The traceback from inside the library is:
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:124, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
93 #Substitution("\nleft : DataFrame or named Series")
94 #Appender(_merge_doc, indents=0)
95 def merge(
(...)
108 validate: str | None = None,
109 ) -> DataFrame:
110 op = _MergeOperation(
111 left,
112 right,
(...)
122 validate=validate,
123 )
--> 124 return op.get_result(copy=copy)
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:773, in _MergeOperation.get_result(self, copy)
770 if self.indicator:
771 self.left, self.right = self._indicator_pre_merge(self.left, self.right)
--> 773 join_index, left_indexer, right_indexer = self._get_join_info()
775 result = self._reindex_and_concat(
776 join_index, left_indexer, right_indexer, copy=copy
777 )
778 result = result.__finalize__(self, method=self._merge_type)
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:1026, in _MergeOperation._get_join_info(self)
1022 join_index, right_indexer, left_indexer = _left_join_on_index(
1023 right_ax, left_ax, self.right_join_keys, sort=self.sort
1024 )
1025 else:
-> 1026 (left_indexer, right_indexer) = self._get_join_indexers()
1028 if self.right_index:
1029 if len(self.left) > 0:
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:1000, in _MergeOperation._get_join_indexers(self)
998 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
999 """return the join indexers"""
-> 1000 return get_join_indexers(
1001 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
1002 )
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:1610, in get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
1600 join_func = {
1601 "inner": libjoin.inner_join,
1602 "left": libjoin.left_outer_join,
(...)
1606 "outer": libjoin.full_outer_join,
1607 }[how]
1609 # error: Cannot call function of unknown type
-> 1610 return join_func(lkey, rkey, count, **kwargs)
File ~\anaconda3\lib\site-packages\pandas\_libs\join.pyx:48, in pandas._libs.join.inner_join()
As a beginner, I don't understand the error. Can you help me out?
It's hard to know what's going on without a sample of your data. However, this looks like the sort of problem you'd see if there are a lot of duplicated values in both dataframes.
Note that if there are multiple rows which match during the merge, then every combination of left and right rows is emitted by the merge.
For example, here's a tiny example of a 3-element DataFrame being merged with itself. The result has 9 elements!
In [7]: df = pd.DataFrame({'a': [1,1,1], 'b': [1,2,3]})
In [8]: df.merge(df, 'left', on='a')
Out[8]:
a b_x b_y
0 1 1 1
1 1 1 2
2 1 1 3
3 1 2 1
4 1 2 2
5 1 2 3
6 1 3 1
7 1 3 2
8 1 3 3
If your song_id column has a lot of duplicates in it, then the number of elements could be as many as N^2, i.e. 154377**2 == 23832258129 in the worst case.
Try using drop_duplicates('song_id') on each of the merge inputs to see what happens in that case.

I get an error of ValueError: Unable to coerce to Series, length must be 1: given 0

I want to run this code in python but I receive this error : ValueError: Unable to coerce to Series, length must be 1: given 0.
I have read a CSV file and want to run this code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.core.fromnumeric import sort
## import data set
data = pd.read_csv('trial_incomes.csv')
print(data.head(10))
def H_function(*args):
Hash=[]
for i in args:
hashedfunction=(6*(i)+1) % 5
Hash+=hashedfunction
print (Hash)
H_function(data)
Error:
ValueError Traceback (most recent call last)
Input In [58], in <cell line: 9>()
5 Hash+=hashedfunction
6 print (Hash)
----> 9 H_function(data)
Input In [58], in H_function(*args)
3 for i in args:
4 hashedfunction=(6*(i)+1) % 5
----> 5 Hash+=hashedfunction
6 print (Hash)
File ~\miniforge3\lib\site-packages\pandas\core\ops\common.py:72, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
68 return NotImplemented
70 other = item_from_zerodim(other)
---> 72 return method(self, other)
File ~\miniforge3\lib\site-packages\pandas\core\arraylike.py:107, in OpsMixin.__radd__(self, other)
105 #unpack_zerodim_and_defer("__radd__")
106 def __radd__(self, other):
--> 107 return self._arith_method(other, roperator.radd)
File ~\miniforge3\lib\site-packages\pandas\core\frame.py:7584, in DataFrame._arith_method(self, other, op)
7581 axis = 1 # only relevant for Series other case
7582 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
-> 7584 self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)
7586 new_data = self._dispatch_frame_op(other, op, axis=axis)
7587 return self._construct_result(new_data)
File ~\miniforge3\lib\site-packages\pandas\core\ops\__init__.py:283, in align_method_FRAME(left, right, axis, flex, level)
279 raise ValueError(
280 f"Unable to coerce list of {type(right[0])} to Series/DataFrame"
281 )
282 # GH17901
--> 283 right = to_series(right)
285 if flex is not None and isinstance(right, ABCDataFrame):
286 if not left._indexed_same(right):
File ~\miniforge3\lib\site-packages\pandas\core\ops\__init__.py:240, in align_method_FRAME.<locals>.to_series(right)
238 else:
239 if len(left.columns) != len(right):
--> 240 raise ValueError(
241 msg.format(req_len=len(left.columns), given_len=len(right))
242 )
243 right = left._constructor_sliced(right, index=left.columns)
244 return right
ValueError: Unable to coerce to Series, length must be 1: given 0
OK, there's so much you haven't told us, but here's the problem.
Your handling of the DataFrame is wrong. You are passing the DataFrame to a function whose parameter list is (*args). That collects the positional arguments into a tuple, so args ends up as a one-element tuple containing the whole DataFrame. When you do for i in args:, i ends up as that whole DataFrame. You end up doing your math on every value at once, but when you then execute Hash+=hashedfunction, Python hands the operation to pandas, which tries to add the empty list to the DataFrame element-wise. That fails, because a list of length 0 cannot be aligned with a frame that has 1 column.
I'm ASSUMING your CSV file has one column of numbers, and you want to do the math in your loop on every row of that column. If so, this is what you want:
import pandas as pd
import numpy as np
data = pd.read_csv('x.csv')
print(data.head(10))
def H_function(*args):
    """Apply the hash ``(6 * x + 1) % 5`` to each positional argument.

    Every argument is hashed as a whole. Passing a single pandas Series
    therefore yields a one-element list holding the element-wise hashed
    Series, not a flat list of numbers.

    Returns:
        list: one hashed result per argument, in call order.
    """
    return [(6 * value + 1) % 5 for value in args]
print(H_function(data['income']))
With this input:
income
1234.00
2345.00
3456.00
4576.00
12000.00
24000.00
36000.00
48000.00
60000.00
921.11
922.22
933.33
I get this output:
income
0 1234.00
1 2345.00
2 3456.00
3 4576.00
4 12000.00
5 24000.00
6 36000.00
7 48000.00
8 60000.00
9 921.11
[0 0.00
1 1.00
2 2.00
3 2.00
4 1.00
5 1.00
6 1.00
7 1.00
8 1.00
9 2.66
10 4.32
11 0.98
Name: income, dtype: float64]

Extract multiple dataframes from dictionary

I'm doing some single cell RNA-sequencing using scprep. I'm using the command
scprep.stats.differential_expression_by_cluster(data, clusters) where clusters is the output of scikit-learn's KMeans.
According to the docs, the output is dict(pd.DataFrame).
My output looks like this:
{0: difference rank
C1qb (ENSMUSG00000036905) 0.176254 0
C1qa (ENSMUSG00000036887) 0.145618 1
C1qc (ENSMUSG00000036896) 0.120607 2
Crybb1 (ENSMUSG00000029343) 0.105344 3
Tyrobp (ENSMUSG00000030579) 0.098916 4
... ... ...
mt-Co3 (ENSMUSG00000064358) -68.884323 16091
Malat1 (ENSMUSG00000092341) -77.371274 16092
Tuba1a (ENSMUSG00000072235) -91.835869 16093
Tmsb4x (ENSMUSG00000049775) -101.908864 16094
mt-Atp6 (ENSMUSG00000064357) -120.025289 16095
[16096 rows x 2 columns], 1: difference rank
Tmsb4x (ENSMUSG00000049775) 127.537848 0
Tuba1a (ENSMUSG00000072235) 91.644383 1
Tubb2b (ENSMUSG00000045136) 48.972048 2
mt-Atp6 (ENSMUSG00000064357) 41.105186 3
Stmn1 (ENSMUSG00000028832) 40.466334 4
... ... ...
Meg3 (ENSMUSG00000021268) -2.904875 16091
Hmgb2 (ENSMUSG00000054717) -4.784257 16092
Vim (ENSMUSG00000026728) -5.001676 16093
Dbi (ENSMUSG00000026385) -6.704505 16094
Fabp7 (ENSMUSG00000019874) -12.319859 16095
[16096 rows x 2 columns], 2: difference rank
Gria2 (ENSMUSG00000033981) 1.688701 0
Pou3f2 (ENSMUSG00000095139) 1.167767 1
Pou3f3 (ENSMUSG00000045515) 0.999804 2
Cldn5 (ENSMUSG00000041378) 0.971778 3
Robo2 (ENSMUSG00000052516) 0.877576 4
When I try pd.DataFrame.from_dict(dict) I get an error message
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-383-630287ba17f3> in <module>
----> 1 df = pd.DataFrame.from_dict(diff)
~/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in from_dict(cls, data, orient, dtype, columns)
1188 raise ValueError("only recognize index or columns for orient")
1189
-> 1190 return cls(data, index=index, columns=columns, dtype=dtype)
1191
1192 def to_numpy(self, dtype=None, copy=False):
~/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
409 )
410 elif isinstance(data, dict):
--> 411 mgr = init_dict(data, index, columns, dtype=dtype)
412 elif isinstance(data, ma.MaskedArray):
413 import numpy.ma.mrecords as mrecords
~/anaconda/lib/python3.6/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
255 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
256 ]
--> 257 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
258
259
~/anaconda/lib/python3.6/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
75 # figure out the index, if necessary
76 if index is None:
---> 77 index = extract_index(arrays)
78 else:
79 index = ensure_index(index)
~/anaconda/lib/python3.6/site-packages/pandas/core/internals/construction.py in extract_index(data)
356
357 if not indexes and not raw_lengths:
--> 358 raise ValueError("If using all scalar values, you must pass an index")
359
360 if have_series:
ValueError: If using all scalar values, you must pass an index
I've tried various methods such as pd.DataFrame.from_dict(dict, orient='index') , which gives me the following output
0
0 difference ran...
1 difference ran...
2 difference rank...
3 difference rank...
4 difference ran...
5 difference ran...
6 difference ran...
7 difference rank...
8 difference ran...
9 difference ran...
10 difference ran...
11 difference ran...
12 difference ran...
13 difference ran...
14 difference ran...
15 difference ran...
16 difference ran...
17 difference ran...
18 difference rank...
19 difference rank...
20 difference ran...
21 difference ran...
22 difference rank...
23 difference rank...
24 difference rank...
25 difference ran...
I want to have 26 different csv files that have gene names as rows and 'difference' and 'rank' as columns.
I looked into the original code on github and found that the result is written as so:
result = {cluster : differential_expression(
select.select_rows(data, idx=clusters==cluster),
select.select_rows(data, idx=clusters!=cluster),
measure = measure, direction = direction,
gene_names = gene_names, n_jobs = n_jobs)
for cluster in np.unique(clusters)}
How can I get the output I would like?
Thanks
Here is how you can retrieve the dataframes from the dictionary and save each one to its own file:
# `output` is the dictionary of pandas DataFrames returned by
# differential_expression_by_cluster, keyed by cluster label.
for key, dataframe in output.items():
    # Write one CSV per cluster; the index (gene names) becomes the first
    # column and 'difference' / 'rank' follow.
    dataframe.to_csv('{0}.csv'.format(key))

Python pdist: Setting an array element with a sequence

I have written the following code
arr_coord = []
for chains in structure:
for chain in chains:
for residue in chain:
for atom in residue:
x = atom.get_coord()
arr_coord.append({'X': [x[0]],'Y':[x[1]],'Z':[x[2]]})
coord_table = pd.DataFrame(arr_coord)
print(coord_table)
To generate the following dataframe
X Y Z
0 [-5.43] [28.077] [-0.842]
1 [-3.183] [26.472] [1.741]
2 [-2.574] [22.752] [1.69]
3 [-1.743] [21.321] [5.121]
4 [0.413] [18.212] [5.392]
5 [0.714] [15.803] [8.332]
6 [4.078] [15.689] [10.138]
7 [5.192] [12.2] [9.065]
8 [4.088] [12.79] [5.475]
9 [5.875] [16.117] [4.945]
10 [8.514] [15.909] [2.22]
11 [12.235] [15.85] [2.943]
12 [13.079] [16.427] [-0.719]
13 [10.832] [19.066] [-2.324]
14 [12.327] [22.569] [-2.163]
15 [8.976] [24.342] [-1.742]
16 [7.689] [25.565] [1.689]
17 [5.174] [23.336] [3.467]
18 [2.339] [24.135] [5.889]
19 [0.9] [22.203] [8.827]
20 [-1.217] [22.065] [11.975]
21 [0.334] [20.465] [15.09]
22 [0.0] [20.066] [18.885]
23 [2.738] [21.762] [20.915]
24 [4.087] [19.615] [23.742]
25 [7.186] [21.618] [24.704]
26 [8.867] [24.914] [23.91]
27 [11.679] [27.173] [24.946]
28 [10.76] [30.763] [25.731]
29 [11.517] [33.056] [22.764]
.. ... ... ...
431 [8.093] [34.654] [68.474]
432 [7.171] [32.741] [65.298]
433 [5.088] [35.626] [63.932]
434 [7.859] [38.22] [64.329]
435 [10.623] [35.908] [63.1]
436 [12.253] [36.776] [59.767]
437 [10.65] [35.048] [56.795]
438 [7.459] [34.084] [58.628]
439 [4.399] [35.164] [56.713]
440 [0.694] [35.273] [57.347]
441 [-1.906] [34.388] [54.667]
442 [-5.139] [35.863] [55.987]
443 [-8.663] [36.808] [55.097]
444 [-9.629] [40.233] [56.493]
445 [-12.886] [42.15] [56.888]
446 [-12.969] [45.937] [56.576]
447 [-14.759] [47.638] [59.485]
448 [-14.836] [51.367] [60.099]
449 [-11.607] [51.863] [58.176]
450 [-9.836] [48.934] [59.829]
451 [-8.95] [45.445] [58.689]
452 [-9.824] [42.599] [61.073]
453 [-8.559] [39.047] [60.598]
454 [-11.201] [36.341] [60.195]
455 [-11.561] [32.71] [59.077]
456 [-7.786] [32.216] [59.387]
457 [-5.785] [29.886] [61.675]
458 [-2.143] [29.222] [62.469]
459 [-0.946] [25.828] [61.248]
460 [2.239] [25.804] [63.373]
[461 rows x 3 columns]
What I intend to do is to create a Euclidean distance matrix using these X, Y, and Z values. I tried to do this using the pdist function
dist = pdist(coord_table, metric = 'euclidean')
distance_matrix = squareform(dist)
print(distance_matrix)
However, the interpreter gives the following error
ValueError: setting an array element with a sequence.
I am not sure how to interpret this error or how to fix it.
Change your loop so that each coordinate is stored as a plain number instead of a one-element list:
arr_coord = []
for chains in structure:
    for chain in chains:
        for residue in chain:
            for atom in residue:
                x = atom.get_coord()
                # Store plain scalars rather than one-element lists, so the
                # DataFrame built from arr_coord holds numbers instead of
                # list objects and pdist can convert it to a numeric array.
                arr_coord.append({'X': x[0], 'Y': x[1], 'Z': x[2]})

Format Pandas Pivot Table

I ran into a problem formatting a pivot table created by pandas.
I built a cross-table of two columns (A, B) from my source data using pandas.pivot_table, with A as the columns and B as the index.
>> df = PD.read_excel("data.xls")
>> table = PD.pivot_table(df,index=["B"],
values='Count',columns=["A"],aggfunc=[NUM.sum],
fill_value=0,margins=True,dropna= True)
>> table
It returns as:
sum
A 1 2 3 All
B
1 23 52 0 75
2 16 35 12 65
3 56 0 0 56
All 95 87 12 196
And I hope to have a format like this:
A All_B
1 2 3
1 23 52 0 75
B 2 16 35 12 65
3 56 0 0 56
All_A 95 87 12 196
How should I do this? Thanks very much in advance.
The table returned by pd.pivot_table is very convenient to work with (it has a single-level index and single-level columns) and normally does NOT require any further format manipulation. But if you insist on changing the format to the one you mentioned in the post, then you need to construct a multi-level index and multi-level columns using pd.MultiIndex. Here is an example of how to do it.
Before manipulation,
import pandas as pd
import numpy as np
np.random.seed(0)
a = np.random.randint(1, 4, 100)
b = np.random.randint(1, 4, 100)
df = pd.DataFrame(dict(A=a,B=b,Val=np.random.randint(1,100,100)))
table = pd.pivot_table(df, index='A', columns='B', values='Val', aggfunc=sum, fill_value=0, margins=True)
print(table)
B 1 2 3 All
A
1 454 649 770 1873
2 628 576 467 1671
3 376 247 481 1104
All 1458 1472 1718 4648
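After (the outer labels 'A' and 'B' are assigned by hand here, so make sure they match the columns= and index= arguments you actually passed to pivot_table):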
multi_level_column = pd.MultiIndex.from_arrays([['A', 'A', 'A', 'All_B'], [1,2,3,'']])
multi_level_index = pd.MultiIndex.from_arrays([['B', 'B', 'B', 'All_A'], [1,2,3,'']])
table.index = multi_level_index
table.columns = multi_level_column
print(table)
A All_B
1 2 3
B 1 454 649 770 1873
2 628 576 467 1671
3 376 247 481 1104
All_A 1458 1472 1718 4648

Categories

Resources