Replicate the tuple values to create random dataset in python [duplicate] - python

I have a pandas DataFrame with 100,000 rows and want to split it into 100 sections with 1000 rows in each of them.
How do I draw a random sample of a certain size (e.g. 50 rows) from just one of the 100 sections? The df is already ordered such that the first 1000 rows are from the first section, the next 1000 rows from the second, and so on.

You can use the sample method*:
In [11]: df = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], columns=["A", "B"])
In [12]: df.sample(2)
Out[12]:
A B
0 1 2
2 5 6
In [13]: df.sample(2)
Out[13]:
A B
3 7 8
0 1 2
*On one of the section DataFrames.
Note: If you have a larger sample size than the size of the DataFrame, this will raise an error unless you sample with replacement.
In [14]: df.sample(5)
ValueError: Cannot take a larger sample than population when 'replace=False'
In [15]: df.sample(5, replace=True)
Out[15]:
A B
0 1 2
1 3 4
2 5 6
3 7 8
1 3 4
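If you need reproducible draws, sample also accepts a random_state seed, e.g. (output omitted):
In [16]: df.sample(2, random_state=42)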

One solution is to use the choice function from numpy.
Say you want 50 entries out of 1000, you can use:
import numpy as np
chosen_idx = np.random.choice(1000, replace=False, size=50)
df_trimmed = df.iloc[chosen_idx]
This is of course not considering your block structure. If you want a 50 item sample from block i for example, you can do:
import numpy as np
block_start_idx = 1000 * i
chosen_idx = np.random.choice(1000, replace=False, size=50)
df_trimmed_from_block_i = df.iloc[block_start_idx + chosen_idx]

You could add a "section" column to your data then perform a groupby and sample:
import numpy as np
import pandas as pd
df = pd.DataFrame(
    {"x": np.arange(1_000 * 100), "section": np.repeat(np.arange(100), 1_000)}
)
# >>> df
# x section
# 0 0 0
# 1 1 0
# 2 2 0
# 3 3 0
# 4 4 0
# ... ... ...
# 99995 99995 99
# 99996 99996 99
# 99997 99997 99
# 99998 99998 99
# 99999 99999 99
#
# [100000 rows x 2 columns]
sample = df.groupby("section").sample(50)
# >>> sample
# x section
# 907 907 0
# 494 494 0
# 775 775 0
# 20 20 0
# 230 230 0
# ... ... ...
# 99740 99740 99
# 99272 99272 99
# 99863 99863 99
# 99198 99198 99
# 99555 99555 99
#
# [5000 rows x 2 columns]
adding .query("section == 42") (or similar) if you are interested in only a particular section, as shown in the sketch below.
Note this requires pandas 1.1.0, see the docs here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.DataFrameGroupBy.sample.html
For older versions, see the answer by @msh5678
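For example, a minimal sketch combining the two (using the df built above) to draw 50 rows from just section 42:
sample_42 = df.query("section == 42").sample(50)
# or, equivalently, filter after the grouped sample:
# sample_42 = df.groupby("section").sample(50).query("section == 42")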

Thank you, Jeff, but I received an error:
AttributeError: Cannot access callable attribute 'sample' of 'DataFrameGroupBy' objects, try using the 'apply' method
So instead of sample = df.groupby("section").sample(50) I suggest the command below:
df.groupby('section').apply(lambda grp: grp.sample(50))
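A small follow-up sketch, assuming you want a flat index like groupby().sample() would give (the apply version keeps the group key in a MultiIndex):
sample = (df.groupby('section')
            .apply(lambda grp: grp.sample(50))
            .reset_index(drop=True))  # drop the (section, original-row) MultiIndex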

This is a nice place for recursion.
import numpy as np

def main2():
    rows = 8  # say you have 8 rows; for real data use the row count, e.g. len(df)
    rands = []
    for i in range(rows):
        gen = fun(rands)
        rands.append(gen)
    print(rands)  # now range through random values

def fun(rands):
    gen = np.random.randint(0, 8)  # upper bound matches the number of rows above
    if gen in rands:
        return fun(rands)  # already drawn; recurse and try again
    else:
        return gen

if __name__ == "__main__":
    main2()
output: [6, 0, 7, 1, 3, 5, 4, 2]
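For comparison, a minimal non-recursive sketch of the same idea: np.random.permutation(n) returns the integers 0..n-1 in random order, i.e. a draw without repeats.
import numpy as np
rand_order = np.random.permutation(8)
print(rand_order)  # e.g. [6 0 7 1 3 5 4 2]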

Related

Is there a more efficient way to select rows from a PyArrow table based on contents of a column?

I asked a related question about a more idiomatic way to select rows from a PyArrow table based on the contents of a column. The answer from @joris looks great. But it looks like selecting rows purely in PyArrow with a row mask has performance issues with sparse selections.
Is there a more efficient way to do this, but stay purely in PyArrow without going back and forth between PyArrow and numpy?
Test case:
import pyarrow as pa
import pyarrow.compute as pc  # needed for pc.equal in the benchmark below
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
# Example table for data schema:
# Alternating rows with index 0 and 1
irow = np.arange(2**20)
dt = 17
df0 = pd.DataFrame({'timestamp': np.array((irow//2)*dt, dtype=np.int64),
                    'index': np.array(irow%2, dtype=np.int16),
                    'value': np.array(irow*0, dtype=np.int32)},
                   columns=['timestamp','index','value'])
ii = df0['index'] == 0
df0.loc[ii,'value'] = irow[ii]//2
ii = df0['index'] == 1
df0.loc[ii,'value'] = (np.sin(df0.loc[ii,'timestamp']*0.01)*10000).astype(np.int32)
# Insert rows with index 2 every 16 timestamps
irow = np.arange(10000)
subsample = 16
df1 = pd.DataFrame({'timestamp': np.array(irow*dt*subsample, dtype=np.int64),
                    'index': np.full_like(irow, 2, dtype=np.int16),
                    'value': np.array(irow*irow, dtype=np.int32)},
                   columns=['timestamp','index','value'],
                   index=irow*subsample*2+1.5)
df2 = pd.concat([df0,df1]).sort_index()
df2.index = pd.RangeIndex(len(df2))
print(df2)
table2 = pa.Table.from_pandas(df2)
# which prints:
timestamp index value
0 0 0 0
1 0 1 0
2 0 2 0
3 17 0 1
4 17 1 1691
... ... ... ...
1058571 8912845 1 9945
1058572 8912862 0 524286
1058573 8912862 1 9978
1058574 8912879 0 524287
1058575 8912879 1 9723
[1058576 rows x 3 columns]
to verify there is sparse content with index=2:
print(df2[df2['index']==2])
# which prints
timestamp index value
2 0 2 0
35 272 2 1
68 544 2 4
101 816 2 9
134 1088 2 16
... ... ... ...
329837 2718640 2 99900025
329870 2718912 2 99920016
329903 2719184 2 99940009
329936 2719456 2 99960004
329969 2719728 2 99980001
[10000 rows x 3 columns]
and benchmarking:
import time

# My method, which sloshes back and forth between PyArrow and numpy
def select_by_index_np(table, ival):
    value_index = table.column('index').to_numpy()
    row_indices = np.nonzero(value_index == ival)[0]
    return table.take(pa.array(row_indices))

# Stay in PyArrow: see https://stackoverflow.com/a/64579502/44330
def select_by_index(table, ival):
    value_index = table.column('index')
    index_type = value_index.type.to_pandas_dtype()
    mask = pc.equal(value_index, index_type(ival))
    return table.filter(mask)

def run_timing_test(table, ival, select_algorithm, nrep=100):
    t1 = time.time_ns()
    for _ in range(nrep):
        tsel = select_algorithm(table, ival)
    t2 = time.time_ns()
    print('%.0fus %20s(%s) -> %s' %
          ((t2-t1)/1000/nrep,
           select_algorithm.__name__,
           ival,
           tsel.column('value').to_numpy()))

run_timing_test(table2, 0, select_by_index)
run_timing_test(table2, 0, select_by_index_np)
run_timing_test(table2, 1, select_by_index)
run_timing_test(table2, 1, select_by_index_np)
run_timing_test(table2, 2, select_by_index)
run_timing_test(table2, 2, select_by_index_np)
# which prints
7639us select_by_index(0) -> [ 0 1 2 ... 524285 524286 524287]
7780us select_by_index_np(0) -> [ 0 1 2 ... 524285 524286 524287]
7789us select_by_index(1) -> [ 0 1691 3334 ... 9945 9978 9723]
8204us select_by_index_np(1) -> [ 0 1691 3334 ... 9945 9978 9723]
3840us select_by_index(2) -> [ 0 1 4 ... 99940009 99960004 99980001]
1611us select_by_index_np(2) -> [ 0 1 4 ... 99940009 99960004 99980001]
The two methods are comparable when the selected rows are a substantial portion of the table, but when they are a very small fraction, select_by_index_np (which marshals to numpy, finds the indices where the mask is True, and marshals back to PyArrow) is faster!
Is there an efficient way to do this but stay in PyArrow? (I don't see any pyarrow.compute equivalent to numpy.nonzero)
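A hedged side note: newer PyArrow releases appear to include pyarrow.compute.indices_nonzero, which, if available in your version, would let you keep the index-then-take pattern without leaving Arrow. An untested sketch:
def select_by_index_arrow(table, ival):
    # assumes pc.indices_nonzero exists in your PyArrow version
    mask = pc.equal(table.column('index'), ival)
    return table.take(pc.indices_nonzero(mask))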

Pandas to create new rows from each existing row

I have a short data frame and want to create new rows from the existing rows.
What the code below does now is multiply each row (every column) by a single random number between 3 and 5:
import pandas as pd
import random

data = {'Price': [59, 98, 79],
        'Stock': [53, 60, 60],
        'Delivery': [11, 7, 6]}
df = pd.DataFrame(data)

for row in range(df.shape[0]):
    new_row = round(df.loc[row] * random.randint(3, 5))
    new_row.name = 'new row'
    df = df.append([new_row])

print(df)
Price Stock Delivery
0 59 53 11
1 98 60 7
2 79 60 6
new row 295 265 55
new row 294 180 21
new row 316 240 24
Is it possible to multiply each row by different random numbers? For example:
the 1st row's 3 cells multiplied by (random) [3, 4, 5]
the 2nd row's 3 cells multiplied by (random) [4, 4, 3], etc.?
Thank you.
Change random to numpy's random.choice in your for loop:
np.random.choice(range(3, 6), 3)  # upper bound is exclusive, so use 6 to include 5
Use np.random.randint(3, 6, size=3). Actually, you can do it all at once:
df * np.random.randint(3,6, size=df.shape)
You may also generate the multiplication coefficients, with the same shape as df, independently, and then concat the element-wise multiplied df * mul with the original df:
N.B. This method avoids the notoriously slow .append(). Benchmark: 10,000 rows finished almost instantly with this method, while .append() took 40 seconds!
import numpy as np
np.random.seed(111) # reproducibility
mul = np.random.randint(3, 6, df.shape) # 6 not inclusive
df_new = pd.concat([df, df * mul], axis=0).reset_index(drop=True)
Output:
print(df_new)
Price Stock Delivery
0 59 53 11
1 98 60 7
2 79 60 6
3 177 159 33
4 294 300 28
5 395 300 30
print(mul) # check the coefficients
array([[3, 3, 3],
[3, 5, 4],
[5, 5, 5]])

Multiply entire column by a random number and store it as new column

I have a column with 100 rows and I want to generate multiple columns (say 100) from this column. These new columns should be generated by multiplying the first column with a random value. Is there a way to do it using Python? I have tried it in Excel, but that is a tedious task, as for every column I have to multiply the column by a randomly generated number (RANDBETWEEN(a,b)).
Let's assume you have a column of numeric data:
import numpy as np
import pandas as pd
import random

# random.randint(a, b) will choose a random integer between a and b (inclusive)
# this will create a column that is 96 elements long
col = [random.randint(0, 500) for i in range(96)]
Now, let's create more columns by leveraging a numpy.array which supports scalar multiplication of vectors:
arr = np.array(col)
# our dataframe has one column in it
df = pd.DataFrame(arr, columns=['x'])
a, b = 100, 5000 # set what interval to select random numbers from
Now, you can loop through to add in new columns
num_cols = 99
for i in range(num_cols):  # or however many columns you want to add
    df[i] = df.x * random.randint(a, b)
df.head()
x 0 1 2 3 4 5 6 ... 92 93 94 95 96 97 98 99
0 68 257040 214268 107576 266152 229568 309468 319668 ... 74460 25024 85952 320620 331840 175712 87788 254864
1 286 1081080 901186 452452 1119404 965536 1301586 1344486 ... 313170 105248 361504 1348490 1395680 739024 369226 1071928
2 421 1591380 1326571 666022 1647794 1421296 1915971 1979121 ... 460995 154928 532144 1985015 2054480 1087864 543511 1577908
3 13 49140 40963 20566 50882 43888 59163 61113 ... 14235 4784 16432 61295 63440 33592 16783 48724
4 344 1300320 1083944 544208 1346416 1161344 1565544 1617144 ... 376680 126592 434816 1621960 1678720 888896 444104 1289312
[5 rows x 101 columns]
You can use NumPy reshaping/broadcasting to multiply the column by random numbers:
a, b = 10 ,20
df = pd.DataFrame({'col':np.random.randint(0,500, 100)})
df['col'].values * np.random.randint(a, b, 100).reshape(-1,1)
To get the result in a Dataframe,
pd.DataFrame(df['col'].values * np.random.randint(a, b, 100).reshape(-1,1))
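Note that the broadcast above puts one random multiplier per row of the result; if, as in the question, each new column should be the original column times a single random value, orienting the product the other way does that. A hedged sketch using the same df:
multipliers = np.random.randint(a, b, size=100)  # one random value per new column
new_cols = pd.DataFrame(df['col'].values.reshape(-1, 1) * multipliers)
# new_cols.iloc[:, j] equals df['col'] * multipliers[j]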

Fast way to convert strings into lists of ints in a Pandas column?

I'm trying to compute the Hamming distance between all strings in a column in a large dataframe. I have over 100,000 rows in this column, so all pairwise combinations come to roughly 10x10^9 comparisons. These strings are short DNA sequences. I would like to quickly convert every string in the column to a list of integers, where a unique integer represents each character in the string. E.g.
"ACGTACA" -> [0, 1, 2, 3, 1, 2, 1]
then I use scipy.spatial.distance.pdist to quickly and efficiently compute the hamming distance between all of these. Is there a fast way to do this in Pandas?
I have tried using apply but it is pretty slow:
mapping = {"A":0, "C":1, "G":2, "T":3}
df.apply(lambda x: np.array([mapping[char] for char in x]))
get_dummies and other categorical operations don't apply here because they operate at the per-row level, not within a row.
Since Hamming distance doesn't care about magnitude differences, I can get about a 40-60% speedup just replacing df.apply(lambda x: np.array([mapping[char] for char in x])) with df.apply(lambda x: map(ord, x)) on made-up datasets.
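One caveat worth noting: on Python 3, map returns a lazy iterator, so the same trick needs a list (or a bytes view) to materialize the codes. A minimal sketch, assuming plain ASCII strings:
df.apply(lambda x: list(map(ord, x)))
# or avoid the Python-level loop entirely by viewing the ASCII bytes:
df.apply(lambda x: np.frombuffer(x.encode('ascii'), dtype=np.uint8))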
Create your test data
In [39]: pd.options.display.max_rows=12
In [40]: N = 100000
In [41]: chars = np.array(list('ABCDEF'))
In [42]: s = pd.Series(np.random.choice(chars, size=4 * np.prod(N)).view('S4'))
In [45]: s
Out[45]:
0 BEBC
1 BEEC
2 FEFA
3 BBDA
4 CCBB
5 CABE
...
99994 EEBC
99995 FFBD
99996 ACFB
99997 FDBE
99998 BDAB
99999 CCFD
dtype: object
These don't actually have to be the same length the way we are doing it.
In [43]: maxlen = s.str.len().max()
In [44]: result = pd.concat([ s.str[i].astype('category',categories=chars).cat.codes for i in range(maxlen) ], axis=1)
In [47]: result
Out[47]:
0 1 2 3
0 1 4 1 2
1 1 4 4 2
2 5 4 5 0
3 1 1 3 0
4 2 2 1 1
5 2 0 1 4
... .. .. .. ..
99994 4 4 1 2
99995 5 5 1 3
99996 0 2 5 1
99997 5 3 1 4
99998 1 3 0 1
99999 2 2 5 3
[100000 rows x 4 columns]
So you get a factorization according to the same categories (i.e. the codes are meaningful across columns).
And it's pretty fast:
In [46]: %timeit pd.concat([ s.str[i].astype('category',categories=chars).cat.codes for i in range(maxlen) ], axis=1)
10 loops, best of 3: 118 ms per loop
I didn't test the performance of this, but you could also try something like
atest = "ACGTACA"
alist = atest.replace('A', '3.').replace('C', '2.').replace('G', '1.').replace('T', '0.').split('.')
anumlist = [int(x) for x in alist if x.isdigit()]
results in:
[3, 2, 1, 0, 3, 2, 3]
Edit: Ok, so testing it with atest = "ACTACA"*100000 takes a while :/
Maybe not the best idea...
Edit 5:
Another improvement:
import datetime
import numpy as np

class Test(object):
    def __init__(self):
        self.mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    def char2num(self, astring):
        return [self.mapping[c] for c in astring]

def main():
    now = datetime.datetime.now()
    atest = "AGTCAGTCATG"*10000000
    t = Test()
    alist = t.char2num(atest)
    testme = np.array(alist)
    print(testme, len(testme))
    print(datetime.datetime.now() - now)

if __name__ == "__main__":
    main()
Takes about 16 seconds for 110,000,000 characters and keeps your processor busy instead of your RAM:
[0 2 3 ..., 0 3 2] 110000000
0:00:16.866659
There doesn't seem to be much difference between using ord or a dictionary-based lookup that exactly maps A->0, C->1 etc:
import pandas as pd
import numpy as np
bases = ['A', 'C', 'T', 'G']
rowlen = 4
nrows = 1000000
dna = pd.Series(np.random.choice(bases, nrows * rowlen).view('S%i' % rowlen))
lookup = dict(zip(bases, range(4)))
%timeit dna.apply(lambda row: map(lookup.get, row))
# 1 loops, best of 3: 785 ms per loop
%timeit dna.apply(lambda row: map(ord, row))
# 1 loops, best of 3: 713 ms per loop
Jeff's solution is also not far off in terms of performance:
%timeit pd.concat([dna.str[i].astype('category', categories=bases).cat.codes for i in range(rowlen)], axis=1)
# 1 loops, best of 3: 1.03 s per loop
A major advantage of this approach over mapping the rows to lists of ints is that the categories can then be viewed as a single (nrows, rowlen) uint8 array via the .values attribute, which could then be passed directly to pdist.
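To make that last step concrete, a minimal sketch (assuming result is the DataFrame of category codes built above and scipy is installed):
from scipy.spatial.distance import pdist
codes = result.values.astype(np.uint8)      # shape (nrows, rowlen)
distances = pdist(codes, metric='hamming')  # condensed pairwise Hamming distances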

Pandas dataframe total row

I have a dataframe, something like:
foo bar qux
0 a 1 3.14
1 b 3 2.72
2 c 2 1.62
3 d 9 1.41
4 e 3 0.58
and I would like to add a 'total' row to the end of dataframe:
foo bar qux
0 a 1 3.14
1 b 3 2.72
2 c 2 1.62
3 d 9 1.41
4 e 3 0.58
5 total 18 9.47
I've tried to use the sum command, but I end up with a Series which, although I can convert back to a DataFrame, doesn't maintain the data types:
tot_row = pd.DataFrame(df.sum()).T
tot_row['foo'] = 'tot'
tot_row.dtypes:
foo object
bar object
qux object
I would like to maintain the data types from the original data frame as I need to apply other operations to the total row, something like:
baz = 2*tot_row['qux'] + 3*tot_row['bar']
Update June 2022
df.append is now deprecated (and removed in pandas 2.0). You could use pd.concat instead, but it's probably easier to use df.loc['Total'] = df.sum(numeric_only=True), as Kevin Zhu commented. Or, better still, don't modify the data frame in place and keep your data separate from your summary statistics!
Append a totals row with
df.append(df.sum(numeric_only=True), ignore_index=True)
The conversion is necessary only if you have a column of strings or objects.
It's a bit of a fragile solution, so I'd recommend sticking to operations on the dataframe, though. E.g.
baz = 2*df['qux'].sum() + 3*df['bar'].sum()
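For completeness, a minimal sketch of the pd.concat route mentioned in the update above, which keeps the totals out of the original frame:
totals = df.sum(numeric_only=True).to_frame('Total').T  # one-row frame indexed 'Total'
df_with_total = pd.concat([df, totals])                  # df itself is left untouched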
df.loc["Total"] = df.sum()
works for me and I find it easier to remember. Am I missing something?
Probably wasn't possible in earlier versions.
I'd actually like to add the total row only temporarily though.
Adding it permanently is good for display but makes it a hassle in further calculations.
Just found
df.append(df.sum().rename('Total'))
This prints what I want in a Jupyter notebook and appears to leave the df itself untouched.
New Method
To get both row and column total:
import numpy as np
import pandas as pd
df = pd.DataFrame({'a': [10,20],'b':[100,200],'c': ['a','b']})
df.loc['Column_Total']= df.sum(numeric_only=True, axis=0)
df.loc[:,'Row_Total'] = df.sum(numeric_only=True, axis=1)
print(df)
a b c Row_Total
0 10.0 100.0 a 110.0
1 20.0 200.0 b 220.0
Column_Total 30.0 300.0 NaN 330.0
Use DataFrame.pivot_table with margins=True:
import pandas as pd
data = [('a',1,3.14),('b',3,2.72),('c',2,1.62),('d',9,1.41),('e',3,.58)]
df = pd.DataFrame(data, columns=('foo', 'bar', 'qux'))
Original df:
foo bar qux
0 a 1 3.14
1 b 3 2.72
2 c 2 1.62
3 d 9 1.41
4 e 3 0.58
Since pivot_table requires some sort of grouping (without the index argument, it'll raise a ValueError: No group keys passed!), and your original index is vacuous, we'll use the foo column:
df.pivot_table(index='foo',
               margins=True,
               margins_name='total',  # defaults to 'All'
               aggfunc=sum)
Voilà!
bar qux
foo
a 1 3.14
b 3 2.72
c 2 1.62
d 9 1.41
e 3 0.58
total 18 9.47
Alternative way (verified on Pandas 0.18.1):
import numpy as np
total = df.apply(np.sum)
total['foo'] = 'tot'
df.append(pd.DataFrame(total.values, index=total.keys()).T, ignore_index=True)
Result:
foo bar qux
0 a 1 3.14
1 b 3 2.72
2 c 2 1.62
3 d 9 1.41
4 e 3 0.58
5 tot 18 9.47
Building on JMZ's answer
df.append(df.sum(numeric_only=True), ignore_index=True)
if you want to continue using your current index you can name the sum series using .rename() as follows:
df.append(df.sum().rename('Total'))
This will add a row at the bottom of the table.
This is the way that I do it, by transposing and using the assign method in combination with a lambda function. It makes it simple for me.
df.T.assign(GrandTotal = lambda x: x.sum(axis=1)).T
Building on answer from Matthias Kauer.
To add row total:
df.loc["Row_Total"] = df.sum()
To add column total,
df.loc[:,"Column_Total"] = df.sum(axis=1)
New method [September 2022]
TL;DR:
Just use
df.style.concat(df.agg(['sum']).style)
for a solution that won't change your dataframe, works even if you have a "sum" in your index, and can be styled!
Explanation
In pandas 1.5.0, a new method named .style.concat() gives you the ability to display several dataframes together. This is a good way to show the total (or any other statistics), because it is not changing the original dataframe, and works even if you have an index named "sum" in your original dataframe.
For example:
import pandas as pd
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
df.style.concat(df.agg(['sum']).style)
and it will return a formatted table with a sum row appended, rendered nicely in Jupyter.
Styling
with a little longer code, you can even make the last row look different:
df.style.concat(
df.agg(['sum']).style
.set_properties(**{'background-color': 'yellow'})
)
to get the sum row highlighted in yellow.
See other ways to style (such as bold font or table lines) in the docs.
The following helped me add a column total and row total to a dataframe.
Assume dft1 is your original dataframe... now add a column total and row total with the following steps.
from io import StringIO
import pandas as pd
#create dataframe string
dfstr = StringIO(u"""
a;b;c
1;1;1
2;2;2
3;3;3
4;4;4
5;5;5
""")
#create dataframe dft1 from string
dft1 = pd.read_csv(dfstr, sep=";")
## add a column total to dft1
dft1['Total'] = dft1.sum(axis=1)
## add a row total to dft1 with the following steps
sum_row = dft1.sum(axis=0) #get sum_row first
dft1_sum=pd.DataFrame(data=sum_row).T #change it to a dataframe
dft1_sum=dft1_sum.reindex(columns=dft1.columns) #line up the col index to dft1
dft1_sum.index = ['row_total'] #change row index to row_total
dft1.append(dft1_sum) # append the row to dft1
Actually all proposed solutions render the original DataFrame unusable for any further analysis and can invalidate following computations, which will be easy to overlook and could lead to false results.
This is because you add a row to the data, which Pandas cannot differentiate from an additional row of data.
Example:
import pandas as pd
data = [1, 5, 6, 8, 9]
df = pd.DataFrame(data)
df
df.describe()
yields

   0
0  1
1  5
2  6
3  8
4  9

             0
count        5
mean       5.8
std    3.11448
min          1
25%          5
50%          6
75%          8
max          9
After
df.loc['Totals']= df.sum(numeric_only=True, axis=0)
the dataframe looks like this
          0
0         1
1         5
2         6
3         8
4         9
Totals   29
This looks nice, but the new row is treated as if it was an additional data item, so df.describe will produce false results:
            0
count       6
mean  9.66667
std   9.87252
min         1
25%      5.25
50%         7
75%      8.75
max        29
So: watch out! Apply this only after doing all other analyses of the data, or work on a copy of the DataFrame!
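A minimal sketch of the copy approach, assuming the df from the example above:
display_df = df.copy()                                # totals go on the copy only
display_df.loc['Totals'] = df.sum(numeric_only=True)  # df stays clean for describe() etc.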
When the "totals" need to be added to an index column:
totals = pd.DataFrame(df.sum(numeric_only=True)).transpose().set_index(pd.Index({"totals"}))
df.append(totals)
e.g.
(Pdb) df
count min bytes max bytes mean bytes std bytes sum bytes
row_0 837200 67412.0 368733992.0 2.518989e+07 5.122836e+07 2.108898e+13
row_1 299000 85380.0 692782132.0 2.845055e+08 2.026823e+08 8.506713e+13
row_2 837200 67412.0 379484173.0 8.706825e+07 1.071484e+08 7.289354e+13
row_3 239200 85392.0 328063972.0 9.870446e+07 1.016989e+08 2.361011e+13
row_4 59800 67292.0 383487021.0 1.841879e+08 1.567605e+08 1.101444e+13
row_5 717600 112309.0 379483824.0 9.687554e+07 1.103574e+08 6.951789e+13
row_6 119600 664144.0 358486985.0 1.611637e+08 1.171889e+08 1.927518e+13
row_7 478400 67300.0 593141462.0 2.824301e+08 1.446283e+08 1.351146e+14
row_8 358800 215002028.0 327493141.0 2.861329e+08 1.545693e+07 1.026645e+14
row_9 358800 202248016.0 321657935.0 2.684668e+08 1.865470e+07 9.632590e+13
(Pdb) totals = pd.DataFrame(df.sum(numeric_only=True)).transpose()
(Pdb) totals
count min bytes max bytes mean bytes std bytes sum bytes
0 4305600.0 418466685.0 4.132815e+09 1.774725e+09 1.025805e+09 6.365722e+14
(Pdb) totals = pd.DataFrame(df.sum(numeric_only=True)).transpose().set_index(pd.Index({"totals"}))
(Pdb) totals
count min bytes max bytes mean bytes std bytes sum bytes
totals 4305600.0 418466685.0 4.132815e+09 1.774725e+09 1.025805e+09 6.365722e+14
(Pdb) df.append(totals)
count min bytes max bytes mean bytes std bytes sum bytes
row_0 837200.0 67412.0 3.687340e+08 2.518989e+07 5.122836e+07 2.108898e+13
row_1 299000.0 85380.0 6.927821e+08 2.845055e+08 2.026823e+08 8.506713e+13
row_2 837200.0 67412.0 3.794842e+08 8.706825e+07 1.071484e+08 7.289354e+13
row_3 239200.0 85392.0 3.280640e+08 9.870446e+07 1.016989e+08 2.361011e+13
row_4 59800.0 67292.0 3.834870e+08 1.841879e+08 1.567605e+08 1.101444e+13
row_5 717600.0 112309.0 3.794838e+08 9.687554e+07 1.103574e+08 6.951789e+13
row_6 119600.0 664144.0 3.584870e+08 1.611637e+08 1.171889e+08 1.927518e+13
row_7 478400.0 67300.0 5.931415e+08 2.824301e+08 1.446283e+08 1.351146e+14
row_8 358800.0 215002028.0 3.274931e+08 2.861329e+08 1.545693e+07 1.026645e+14
row_9 358800.0 202248016.0 3.216579e+08 2.684668e+08 1.865470e+07 9.632590e+13
totals 4305600.0 418466685.0 4.132815e+09 1.774725e+09 1.025805e+09 6.365722e+14
Since I generally want to do this at the very end (right before printing), so as to avoid breaking the integrity of the dataframe, I created a summary_rows_cols method which returns a printable dataframe:
def summary_rows_cols(df: pd.DataFrame,
                      column_sum: bool = False,
                      column_avg: bool = False,
                      column_median: bool = False,
                      row_sum: bool = False,
                      row_avg: bool = False,
                      row_median: bool = False) -> pd.DataFrame:
    ret = df.copy()
    if column_sum: ret.loc['Sum'] = df.sum(numeric_only=True, axis=0)
    if column_avg: ret.loc['Avg'] = df.mean(numeric_only=True, axis=0)
    if column_median: ret.loc['Median'] = df.median(numeric_only=True, axis=0)
    if row_sum: ret.loc[:, 'Sum'] = df.sum(numeric_only=True, axis=1)
    if row_avg: ret.loc[:, 'Avg'] = df.mean(numeric_only=True, axis=1)
    if row_median: ret.loc[:, 'Median'] = df.median(numeric_only=True, axis=1)
    ret.fillna('-', inplace=True)
    return ret
This allows me to enter a generic (numeric) df and get a summarized output such as:
a b c Sum Median
0 1 4 7 12 4
1 2 5 8 15 5
2 3 6 9 18 6
Sum 6 15 24 - -
from:
data = {
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9]
}
df = pd.DataFrame(data)
printable = summary_rows_cols(df, row_sum=True, column_sum=True, row_median=True)
