Normalise values in bivariate distribution to 0-1 - python - python

I have a bivariate distribution below that is generated from the xy points for each Group in 'Int_1','Int_2'. The aim to to apply these points and return a multivariate distribution between the Groups. I then want to normalise the distribution value via Norm so the z-value ranges between 0 and 1. When looking at the z-value now via the colorer, the values vary between 0.24-0.72.
In a previous question, it was mentioned that I'm not actually returning a multivariate distribution. Rather a ratio of probabilities between the two groups.
import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal as mvn
import matplotlib.pyplot as plt
from scipy.interpolate import RectBivariateSpline
df = pd.DataFrame({'Int_1': [1.0, 2.0, 1.0, 3.0, 1.0, 2.0, 3.0, 2.0],
'Int_2': [1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0],
'Item_X': [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
'Item_Y': [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
'Period': [1, 1, 1, 1, 2, 2, 2, 2],
'Group': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
'Item': ['Y', 'Y', 'A', 'B', 'A', 'B', 'A', 'B'],
'id': ['1', '2', '3', '4', '1', '2', '3', '4']})
Group_A = [df[df['Group'] == 'A'][['Int_1','Int_2']].to_numpy()]
Group_B = [df[df['Group'] == 'B'][['Int_1','Int_2']].to_numpy()]
Item = [df[['Item_X','Item_Y']].to_numpy()]
period = df['Period'].drop_duplicates().reset_index(drop = True)
def bivart_func(member_no, location, time_index, group):
if group == 'A':
data = Group_A.copy()
elif group == 'B':
data = Group_B.copy()
else:
return
if np.all(np.isfinite(data[member_no][[time_index,time_index + 1],:])) & np.all(np.isfinite(Item[0][time_index,:])):
sxy = (data[member_no][time_index + 1,:] - data[member_no][time_index,:]) / (period[time_index + 1] - period[time_index])
mu = data[member_no][time_index,:] + sxy * 0.5
out = mvn.pdf(location,mu) / mvn.pdf(data[member_no][time_index,:],mu)
else:
out = np.zeros(location.shape[0])
return out
xx,yy = np.meshgrid(np.linspace(-10,10,200),np.linspace(-10,10,200))
Z_GA = np.zeros(40000)
Z_GB = np.zeros(40000)
for k in range(1):
Z_GA += bivart_func(k,np.c_[xx.flatten(),yy.flatten()],0,'A')
Z_GB += bivart_func(k,np.c_[xx.flatten(),yy.flatten()],0,'B')
fig, ax = plt.subplots(figsize=(8,8))
ax.set_xlim(-10,10)
ax.set_ylim(-10,10)
Z_GA = Z_GA.reshape((200,200))
Z_GB = Z_GB.reshape((200,200))
Norm = xx,yy, 1 / (1 + np.exp(Z_GB - Z_GA))
cfs = ax.contourf(*Norm, cmap = 'magma')
ax.scatter(Item[0][1,0],Item[0][1,1], color = 'white', edgecolor = 'black')
fig.colorbar(cfs, ax = ax)
#f = RectBivariateSpline(xx[0, :], yy[:, 0], Norm)
#z = f(df['Item_X'], df['Item_Y'], grid = False)

Is it what you expect:
Z = Z_GB - Z_GA
Norm = xx,yy, (Z - np.min(Z)) / (np.max(Z) - np.min(Z))
>>> np.min(Norm[2])
0.0
>>> np.max(Norm[2])
1.0

Related

Find FIRST and LAST non-zero values in each row of Pandas, color them, and save it to an Excel file (.xlsx)

I have a Pandas DataFrame:
import pandas as pd
df = pd.DataFrame([[0.0, 2.0, 0.0, 0.0, 5.0, 6.0, 7.0],
[1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 7.0],
[0.0, 0.0, 13.0, 14.0, 0.0, 16.0, 0.0]
]
, columns=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
A B C D E F G
0 0.0 2.0 0.0 0.0 5.0 6.0 7.0
1 1.0 0.0 1.0 3.0 0.0 0.0 7.0
2 0.0 0.0 13.0 14.0 0.0 16.0 17.0
And I would like to save it as an .xlsx file, with the first and last non-zero values in each row marked in color. something like:
I removed the index column though. The first column.
# import dependencies
import pandas as pd
import openpyxl
from openpyxl.styles import PatternFill
from openpyxl.utils import get_column_letter
# data
df = pd.DataFrame([[0.0, 2.0, 0.0, 0.0, 5.0, 6.0, 7.0],
[1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 7.0],
[0.0, 0.0, 13.0, 14.0, 0.0, 16.0, 0.0]
], columns=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
first_and_last_non_zeroes_index = []
for index, row in df.iterrows():
# all non zeroes index in a row
non_zeroes_index= [i for i, x in enumerate(row) if x>0]
# append the first and last non zero in a row to list
first_and_last_non_zeroes_index.append([non_zeroes_index[0],non_zeroes_index[-1]])
# output to excel
df.to_excel('output.xlsx', index=False)
# open excel
wb = openpyxl.load_workbook("output.xlsx")
ws = wb['Sheet1']
# set the color
fill_cell = PatternFill(patternType='solid',
fgColor='ffff00')
# color the appropriate cells
for index, row in enumerate(first_and_last_non_zeroes_index):
for col in row:
ws[f'{get_column_letter(col+1)}{index+2}'].fill = fill_cell
# save output
wb.save("output.xlsx")

interpolating categorical data in python? (nearest / previous value)

How would one interpolate categorical (non-float, or in more broad sense, non-numerical) data in python?
Test data
Here is an example dataset with string-valued y-values.
x = [1.4, 2.8, 3.1, 4.4, 5.2]
y = ['A', 'B', 'A', 'A', 'B']
Expected outputs
# with kind= 'nearest'
x_new = [1, 2, 3, 4, 5]
y_new = ['A', 'A', 'A', 'A', 'B']
# with kind= 'previous', fill_value = None
x_new = [1, 2, 3, 4, 5]
y_new = [None, 'A', 'B', 'A', 'A']
I was expecting that interp1d could do the job with kind='nearest' or kind='previous', but unfortunately that is not the case.
You can still use interp1d if you replace your target points with indicies. I.e. construct list of all unique values - in your case it will be ['A', 'B'], transition y to be indicies instead of strings (indicies converted to float - you will be ok as long as number of unique elements can be stored as float without losing precision).
After interpolating you'll just need to get back elements given result of interpolation. As long as you use 'previous' or 'nearest' you'll always get floating point value which is one of your original indicies.
UPD.
Even simpler version would be to use y_int = [float(i) for i in range(len(y))], as input for interp1d, then after you got your interpolation result just use it as index of y.
Example: kind='nearest'
from scipy.interpolate import interp1d
import numpy as np
x = [1.4, 2.8, 3.1, 4.4, 5.2]
y = ['A', 'B', 'A', 'A', 'B']
f = interp1d(x, range(len(y)), kind='nearest', fill_value=(0, len(y)-1), bounds_error=False)
y_idx = f(x_new)
y_new = [y[int(i)] for i in y_idx ]
# ['A', 'A', 'A', 'A', 'B']
Example: kind='previous'
from scipy.interpolate import interp1d
import numpy as np
x = [1.4, 2.8, 3.1, 4.4, 5.2]
y = ['A', 'B', 'A', 'A', 'B']
f = interp1d(x, range(len(y)), kind='previous', fill_value=-1, bounds_error=False)
y_idx = f(x_new)
y_new = [y[int(i)] if i != -1 else None for i in y_idx]
# [None, 'A', 'B', 'A', 'A']

Progress bar for pandas DataFrame multy operations with .agg()

I want to apply .agg pandas operations to a huge dataset
As an example, I have this code:
from tqdm import tqdm
import pandas as pd
df = pd.DataFrame({"A":[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
"B":[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0],
"C":[1.0, 1.5, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0],
"D":[2.0, 5.0, 3.0, 6.0, 4.0, 2.0, 5.0, 1.0, 2.0],
"E":['a', 'a', 'b', 'a', 'b', 'b', 'b', 'a', 'a']})
df2 = df.groupby('B').agg({
'C': 'mean',
'D': 'sum',
'E': lambda x: x.mode()
})
print(df2)
The problem is that my original dataset has 2.000.000 of rows. Transforming it to 130.000 takes some minutes and I would like to see a progress bar
I've tried with tqdm but I don't know how to apply it here. Is there any function similar to .progress_apply() but for .agg()?
This will print the progress as you go, where progress is measured by the fraction of the groups for which statistics are computed. But I'm not sure how much the loop will slow down your computations.
agger = {
'C': 'mean',
'D': 'sum',
'E': lambda x: x.mode()}
gcols = ['B'] # columns defining the groups
groupby = df.groupby(gcols)
ngroups = len(groupby)
gfrac = 0.3 # fraction of groups for which you want to print progress
gfrac_size = max((1, int(ngroups*gfrac)))
groups = []
rows = []
for i,g in enumerate(groupby):
if (i+1)%gfrac_size == 0:
print('{:.0f}% complete'.format(100*(i/ngroups)))
gstats = g[1].agg(agger)
if i==0:
if gstats.ndim==2:
newcols = gstats.columns.tolist()
else:
newcols = gstats.index.tolist()
groups.append(g[0])
rows.append(gstats.values.flat)
df3 = pd.DataFrame(np.vstack(rows), columns=newcols)
if len(gcols) == 1:
df3.index = groups
else:
df3.index = pd.MultiIndex.from_tuples(groups, names=gcols)
df3 = df3.astype(df[newcols].dtypes)
df3
C D E
1.0 1.5 10.0 a
2.0 3.0 12.0 b
3.0 7.0 8.0 a
An alternative (though somewhat hacky) way would be to take advantage of the fact that you use your own function lambda x: x.mode. Since you're already compromising speed using this function, you can write a class that stores information about progress. For example,
import pandas as pd
import numpy as np
df = pd.DataFrame({"A":[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
"B":[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0],
"C":[1.0, 1.5, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0],
"D":[2.0, 5.0, 3.0, 6.0, 4.0, 2.0, 5.0, 1.0, 2.0],
"E":['a', 'a', 'b', 'a', 'b', 'b', 'b', 'a', 'a']})
df2 = df.groupby('B').agg({
'C': 'mean',
'D': 'sum',
'E': lambda x: x.mode()
})
print(df2)
class ModeHack:
def __init__(self, size=5, N=10):
self.ix = 0
self.K = 1
self.size = size
self.N = N
def mode(self, x):
self.ix = self.ix + x.shape[0]
if self.K*self.size <= self.ix:
print('{:.0f}% complete'.format(100*self.ix/self.N))
self.K += 1
return x.mode()
def reset(self):
self.ix = 0
self.K = 1
mymode = ModeHack(size=int(.1*df.shape[0]), N=df.shape[0])
mymode.reset()
agger = {
'C': 'mean',
'D': 'sum',
'E': lambda x: mymode.mode(x)}
df3 = df.groupby('B').agg(agger)

Computing euclidean distance with multiple list in python

I'm writing a simple program to compute the euclidean distances between multiple lists using python. This is the code I have so fat
import math
euclidean = 0
euclidean_list = []
euclidean_list_complete = []
test1 = [[0.0, 0.0, 0.0, 152.0, 12.29], [0.0, 0.0, 0.357, 245.0, 10.4], [0.0, 0.0, 0.10, 200.0, 11.0]]
test2 = [[0.0, 0.0, 0.0, 72.0, 12.9], [0.0, 0.0, 0.0, 80.0, 11.3]]
for i in range(len(test2)):
for j in range(len(test1)):
for k in range(len(test1[0])):
euclidean += pow((test2[i][k]-test1[j][k]),2)
euclidean_list.append(math.sqrt(euclidean))
euclidean = 0
euclidean_list_complete.append(euclidean_list)
print euclidean_list_complete
my problem with this code is it doesn't print the output i want properly. The output should be
[[80.0023, 173.018, 128.014], [72.006, 165.002, 120.000]]
but instead, it prints
[[80.00232559119766, 173.01843095173416, 128.01413984400315, 72.00680592832875, 165.0028407300917, 120.00041666594329], [80.00232559119766, 173.01843095173416, 128.01413984400315, 72.00680592832875, 165.0028407300917, 120.00041666594329]]
I'm guessing it has something to do with the loop. What should I do to fix it? By the way, I don't want to use numpy or scipy for studying purposes
If it's unclear, I want to calculate the distance between lists on test2 to each lists on test1
Not sure what you are trying to achieve for 3 vectors, but for two the code has to be much, much simplier:
test2 = [[0.0, 0.0, 0.0, 72.0, 12.9], [0.0, 0.0, 0.0, 80.0, 11.3]]
def distance(list1, list2):
"""Distance between two vectors."""
squares = [(p-q) ** 2 for p, q in zip(list1, list2)]
return sum(squares) ** .5
d2 = distance(test2[0], test2[1])
With numpy is even a shorter statement.
PS. python 3 recommened
The question has partly been answered by #Evgeny. The answer the OP posted to his own question is an example how to not write Python code. Here is a shorter, faster and more readable solution, given test1 and test2 are lists like in the question:
def euclidean(v1, v2):
return sum((p-q)**2 for p, q in zip(v1, v2)) ** .5
d2 = []
for i in test2:
foo = [euclidean(i, j) for j in test1]
d2.append(foo)
print(d2)
#[[80.00232559119766, 173.01843095173416, 128.01413984400315],
# [72.00680592832875, 165.0028407300917, 120.00041666594329]]
test1 = [[0.0, 0.0, 0.0, 152.0, 12.29], [0.0, 0.0, 0.357, 245.0, 10.4], [0.0, 0.0, 0.10, 200.0, 11.0]]
test2 = [[0.0, 0.0, 0.0, 72.0, 12.9], [0.0, 0.0, 0.0, 80.0, 11.3]]
final_list = []
for a in test2:
temp = [] #temporary list
for b in test1:
dis = sum([pow(a[i] - b[i], 2) for i in range(len(a))])
temp.append(round(pow(dis, 0.5),4))
final_list.append(temp)
print(final_list)
I got it, the trick is to create the first euclidean list inside the first for loop, and then deleting the list after appending it to the complete euclidean list
import math
euclidean = 0
euclidean_list_complete = []
test1 = [[0.0, 0.0, 0.0, 152.0, 12.29], [0.0, 0.0, 0.357, 245.0, 10.4], [0.0, 0.0, 0.10, 200.0, 11.0]]
test2 = [[0.0, 0.0, 0.0, 72.0, 12.9], [0.0, 0.0, 0.0, 80.0, 11.3]]
for i in range(len(test2)):
euclidean_list = []
for j in range(len(test1)):
for k in range(len(test1[0])):
euclidean += pow((test2[i][k]-test1[j][k]),2)
euclidean_list.append(math.sqrt(euclidean))
euclidean = 0
euclidean_list.sort(reverse=True)
euclidean_list_complete.append(euclidean_list)
del euclidean_list
print euclidean_list_complete

stacked bar plot using matplotlib

I am generating bar plots using matplotlib and it looks like there is a bug with the stacked bar plot. The sum for each vertical stack should be 100. However, for X-AXIS ticks 65, 70, 75 and 80 we get completely arbitrary results which do not make any sense. I do not understand what the problem is. Please find the MWE below.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
header = ['a','b','c','d']
dataset= [('60.0', '65.0', '70.0', '75.0', '80.0', '85.0', '90.0', '95.0', '100.0', '105.0', '110.0', '115.0', '120.0', '125.0', '130.0', '135.0', '140.0', '145.0', '150.0', '155.0', '160.0', '165.0', '170.0', '175.0', '180.0', '185.0', '190.0', '195.0', '200.0'), (0.0, 25.0, 48.93617021276596, 83.01886792452831, 66.66666666666666, 66.66666666666666, 70.96774193548387, 84.61538461538461, 93.33333333333333, 85.0, 92.85714285714286, 93.75, 95.0, 100.0, 100.0, 100.0, 100.0, 80.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0), (0.0, 50.0, 36.17021276595745, 11.320754716981133, 26.666666666666668, 33.33333333333333, 29.03225806451613, 15.384615384615385, 6.666666666666667, 15.0, 7.142857142857142, 6.25, 5.0, 0.0, 0.0, 0.0, 0.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 12.5, 10.638297872340425, 3.7735849056603774, 4.444444444444445, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (100.0, 12.5, 4.25531914893617, 1.8867924528301887, 2.2222222222222223, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
X_AXIS = dataset[0]
matplotlib.rc('font', serif='Helvetica Neue')
matplotlib.rc('text', usetex='false')
matplotlib.rcParams.update({'font.size': 40})
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(18.5, 10.5)
configs = dataset[0]
N = len(configs)
ind = np.arange(N)
width = 0.4
p1 = plt.bar(ind, dataset[1], width, color='r')
p2 = plt.bar(ind, dataset[2], width, bottom=dataset[1], color='b')
p3 = plt.bar(ind, dataset[3], width, bottom=dataset[2], color='g')
p4 = plt.bar(ind, dataset[4], width, bottom=dataset[3], color='c')
plt.ylim([0,120])
plt.yticks(fontsize=12)
plt.ylabel(output, fontsize=12)
plt.xticks(ind, X_AXIS, fontsize=12, rotation=90)
plt.xlabel('test', fontsize=12)
plt.legend((p1[0], p2[0], p3[0], p4[0]), (header[0], header[1], header[2], header[3]), fontsize=12, ncol=4, framealpha=0, fancybox=True)
plt.show()
You need the bottom of each dataset to be the sum of all the datasets that came before. you may also need to convert the datasets to numpy arrays to add them together.
p1 = plt.bar(ind, dataset[1], width, color='r')
p2 = plt.bar(ind, dataset[2], width, bottom=dataset[1], color='b')
p3 = plt.bar(ind, dataset[3], width,
bottom=np.array(dataset[1])+np.array(dataset[2]), color='g')
p4 = plt.bar(ind, dataset[4], width,
bottom=np.array(dataset[1])+np.array(dataset[2])+np.array(dataset[3]),
color='c')
Alternatively, you could convert them to numpy arrays before you start plotting.
dataset1 = np.array(dataset[1])
dataset2 = np.array(dataset[2])
dataset3 = np.array(dataset[3])
dataset4 = np.array(dataset[4])
p1 = plt.bar(ind, dataset1, width, color='r')
p2 = plt.bar(ind, dataset2, width, bottom=dataset1, color='b')
p3 = plt.bar(ind, dataset3, width, bottom=dataset1+dataset2, color='g')
p4 = plt.bar(ind, dataset4, width, bottom=dataset1+dataset2+dataset3,
color='c')
Or finally if you want to avoid converting to numpy arrays, you could use a list comprehension:
p1 = plt.bar(ind, dataset[1], width, color='r')
p2 = plt.bar(ind, dataset[2], width, bottom=dataset[1], color='b')
p3 = plt.bar(ind, dataset[3], width,
bottom=[sum(x) for x in zip(dataset[1],dataset[2])], color='g')
p4 = plt.bar(ind, dataset[4], width,
bottom=[sum(x) for x in zip(dataset[1],dataset[2],dataset[3])],
color='c')
I found this such a pain that I wrote a function to do it. I'm sharing it in the hope that others find it useful:
import numpy as np
import matplotlib.pyplot as plt
def plot_stacked_bar(data, series_labels, category_labels=None,
show_values=False, value_format="{}", y_label=None,
colors=None, grid=True, reverse=False):
"""Plots a stacked bar chart with the data and labels provided.
Keyword arguments:
data -- 2-dimensional numpy array or nested list
containing data for each series in rows
series_labels -- list of series labels (these appear in
the legend)
category_labels -- list of category labels (these appear
on the x-axis)
show_values -- If True then numeric value labels will
be shown on each bar
value_format -- Format string for numeric value labels
(default is "{}")
y_label -- Label for y-axis (str)
colors -- List of color labels
grid -- If True display grid
reverse -- If True reverse the order that the
series are displayed (left-to-right
or right-to-left)
"""
ny = len(data[0])
ind = list(range(ny))
axes = []
cum_size = np.zeros(ny)
data = np.array(data)
if reverse:
data = np.flip(data, axis=1)
category_labels = reversed(category_labels)
for i, row_data in enumerate(data):
color = colors[i] if colors is not None else None
axes.append(plt.bar(ind, row_data, bottom=cum_size,
label=series_labels[i], color=color))
cum_size += row_data
if category_labels:
plt.xticks(ind, category_labels)
if y_label:
plt.ylabel(y_label)
plt.legend()
if grid:
plt.grid()
if show_values:
for axis in axes:
for bar in axis:
w, h = bar.get_width(), bar.get_height()
plt.text(bar.get_x() + w/2, bar.get_y() + h/2,
value_format.format(h), ha="center",
va="center")
Example:
plt.figure(figsize=(6, 4))
series_labels = ['Series 1', 'Series 2']
data = [
[0.2, 0.3, 0.35, 0.3],
[0.8, 0.7, 0.6, 0.5]
]
category_labels = ['Cat A', 'Cat B', 'Cat C', 'Cat D']
plot_stacked_bar(
data,
series_labels,
category_labels=category_labels,
show_values=True,
value_format="{:.1f}",
colors=['tab:orange', 'tab:green'],
y_label="Quantity (units)"
)
plt.savefig('bar.png')
plt.show()
This is probably your most convenient solution if you are willing to use Pandas:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
X_AXIS = ('60.0', '65.0', '70.0', '75.0', '80.0', '85.0', '90.0', '95.0', '100.0', '105.0', '110.0', '115.0', '120.0', '125.0', '130.0', '135.0', '140.0', '145.0', '150.0', '155.0', '160.0', '165.0', '170.0', '175.0', '180.0', '185.0', '190.0', '195.0', '200.0')
index = pd.Index(X_AXIS, name='test')
data = {'a': (0.0, 25.0, 48.94, 83.02, 66.67, 66.67, 70.97, 84.62, 93.33, 85.0, 92.86, 93.75, 95.0, 100.0, 100.0, 100.0, 100.0, 80.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0),
'b': (0.0, 50.0, 36.17, 11.32, 26.67, 33.33, 29.03, 15.38, 6.67, 15.0, 7.14, 6.25, 5.0, 0.0, 0.0, 0.0, 0.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
'c': (0.0, 12.5, 10.64, 3.77, 4.45, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
'd': (100.0, 12.5, 4.26, 1.89, 2.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)}
df = pd.DataFrame(data, index=index)
ax = df.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set_ylabel('foo')
plt.legend(title='labels', bbox_to_anchor=(1.0, 1), loc='upper left')
# plt.savefig('stacked.png') # if needed
plt.show()
If you're interested in ordered stacking (longest bars at bottom), here is how you can do it:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
a = pd.DataFrame({'a':[0.25, 0.5, 0.15, 0], 'b':[0.15, 0.25, 0.35, 0.15],
'c':[0.50, 0.15, 0.5, 0.35], 'd':[0.35, 0.35, 0.25, 0.5],})
# a b c d
# 0 0.25 0.15 0.50 0.35
# 1 0.50 0.25 0.15 0.35
# 2 0.15 0.35 0.50 0.25
# 3 0.00 0.15 0.35 0.50
fig, ax = plt.subplots()
x = a.index
indexes = np.argsort(a.values).T
heights = np.sort(a.values).T
order = -1
bottoms = heights[::order].cumsum(axis=0)
bottoms = np.insert(bottoms, 0, np.zeros(len(bottoms[0])), axis=0)
mpp_colors = dict(zip(a.columns, plt.rcParams['axes.prop_cycle'].by_key()['color']))
for btms, (idxs, vals) in enumerate(list(zip(indexes, heights))[::order]):
mps = np.take(np.array(a.columns), idxs)
ax.bar(x, height=vals, bottom=bottoms[btms], color=[mpp_colors[m] for m in mps])
ax.set_ylim(bottom=0, top=2)
plt.legend((np.take(np.array(a.columns), np.argsort(a.values)[0]))[::order], loc='upper right')
Here's a solution with a seaborn-like API. You can find an example usage here.
def stackedbarplot(data, stack_order=None, palette=None, **barplot_kws):
"""
Create a stacked barplot
Inputs:
| data <pd.DataFrame>: A wideform dataframe where the index is the variable to stack, the columns are different samples (x-axis), and the cells the counts (y-axis)
| stack_order <array-like>: The order for bars to be stacked (Default: given order)
| palette <array-like>: The colors to use for each value of `stack_order` (Default: husl)
| barplot_kws: Arguments to pass to sns.barplot()
Author: Michael Silverstein
Usage: https://github.com/michaelsilverstein/Pandas-and-Plotting/blob/master/lessons/stacked_bar_chart.ipynb
"""
# Order df
if stack_order is None:
stack_order = data.index
# Create palette if none
if palette is None:
palette = dict(zip(stack_order, sns.husl_palette(len(stack_order))))
# Compute cumsum
cumsum = data.loc[stack_order].cumsum()
# Melt for passing to seaborn
cumsum_stacked = cumsum.stack().reset_index(name='count')
# Get name of variable to stack and sample
stack_name, sample_name = cumsum_stacked.columns[:2]
# Plot bar plot
for s in stack_order[::-1]:
# Subset to this stack level
d = cumsum_stacked[cumsum_stacked[stack_name].eq(s)]
sns.barplot(x=sample_name, y='count', hue=stack_name, palette=palette, data=d, **barplot_kws)
return plt.gca()

Categories

Resources