# Sample data: values of A and B per X, plus the two percentage columns.
d = {
    'X': [1, 2, 3, 4],
    'A': [50, 40, 20, 60],
    '% of Total in A': [29.4, 23.5, 11.8, 35.3],
    'B': [25, 10, 5, 15],
    '% in A': [50, 25, 25, 25],
}
df = pd.DataFrame(d)
# Draw A, then draw B onto the same Axes in a different colour.
ax = df.plot.bar(x='X', y='A')
df.plot.bar(x='X', y='B', color='C2', ax=ax)
X A % of Total in A B % in A
0 1 50 29.4 25 50
1 2 40 23.5 10 25
2 3 20 11.8 5 25
3 4 60 35.3 15 25
I have the above dataframe and I know how to draw a stacked bar plot based on two columns A and B.
How do I add value labels on top of the bars? For example, for the first bar (index 0), I want to label 50 (29.4% of the total) above the blue bar, and 25 (50% in group) above the green bar within the blue bar.
Any help is appreciated.
The first bars are stored in ax.containers[0], the second in ax.containers[1]. You can call ax.bar_label(...) using these containers together with a list of the corresponding labels.
By the way, you are missing x= in the second bar plot.
import pandas as pd
from matplotlib import pyplot as plt

# Same sample data as in the question.
d = {
    'X': [1, 2, 3, 4],
    'A': [50, 40, 20, 60],
    '% of Total in A': [29.4, 23.5, 11.8, 35.3],
    'B': [25, 10, 5, 15],
    '% in A': [50, 25, 25, 25],
}
df = pd.DataFrame(d)

# A is drawn first and B second, both on the same Axes.
ax = df.plot.bar(x='X', y='A')
df.plot.bar(x='X', y='B', color='C2', ax=ax)

# Label each set of bars with its percentage column instead of the bar height.
bars_a, bars_b = ax.containers
ax.bar_label(bars_a, labels=df['% of Total in A'])
ax.bar_label(bars_b, labels=df['% in A'], color='white')
plt.show()
To further accentuate that B is a part of A, you could give them the same color, and hatch B. For example:
# Give A and B the same colour and hatch B, to show that B is a part of A.
ax = df.plot(x='X', y="A", kind="bar", color='dodgerblue')
df.plot(x='X', y="B", kind="bar", facecolor='dodgerblue', hatch='xx', rot=0, ax=ax)
# Label both sets of bars with their percentage columns, formatted as "<value> %".
ax.bar_label(ax.containers[0], labels=[f'{p} %' for p in df['% of Total in A']])
ax.bar_label(ax.containers[1], labels=[f'{p} %' for p in df['% in A']], color='white')
# Hide the top and right spines.
for spine in ['top', 'right']:
    ax.spines[spine].set_visible(False)
The bars are not correctly stacked. The patches are stacked in z-order, not vertically (y-order). Also, the x-axis is incorrect because x='X' is missing from the second plot.
Use zip to combine the containers and cols, and then pass the custom labels to the labels= parameter.
Also see Stacked Bar Chart with Centered Labels, and Adding value labels on a matplotlib bar chart for a thorough explanation about .bar_label.
# Plot A and B in a single call with stacked=True.
ax = df.plot(kind='bar', x='X', y=['A', 'B'], stacked=True, rot=0, color=['tab:blue', 'tab:green'])
ax.legend(bbox_to_anchor=(1, 1.02), loc='upper left')

# Columns to use for the alternate labels, in the same order as y=.
cols = ['% of Total in A', '% in A']

for c, col in zip(ax.containers, cols):
    # Use the alternate column for the labels instead of the bar height
    # (or the width, for horizontal bars).
    labels = [f'{v}%' for v in df[col]]
    # Remove the labels= argument if customized labels are not needed.
    ax.bar_label(c, labels=labels, label_type='edge')

# Add padding along the y-axis.
ax.margins(y=0.1)
Related
I have a dataset that looks like this:
# Sample data: one row per record, with its vintage, model, count and case flag.
df = pd.DataFrame({
    'Vintage': [
        '2016Q1', '2016Q1', '2016Q2', '2016Q3', '2016Q4',
        '2016Q1', '2016Q2', '2016Q2', '2016Q2', '2016Q3', '2016Q4',
    ],
    'Model': ['A'] * 5 + ['B'] * 6,
    'Count': [1] * 11,
    'Case': [0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0],
})
Vintage Model Count Case
0 2016Q1 A 1 0
1 2016Q1 A 1 1
2 2016Q2 A 1 1
3 2016Q3 A 1 0
4 2016Q4 A 1 1
5 2016Q1 B 1 1
6 2016Q2 B 1 0
7 2016Q2 B 1 0
8 2016Q2 B 1 1
9 2016Q3 B 1 1
10 2016Q4 B 1 0
What I need to do is:
Plot grouped bar chart, where vintage is the groups and model is the hue/color
Two line plots in the same chart that show the percentage of case over count, aka plot the division of case over count for each model and vintage.
I figured out how to do the first task with a pivot table but haven't been able to add the percentage from the same pivot.
This is the solution for point 1:
# Sum Count per Vintage (rows) and Model (columns). The column names are
# case-sensitive and must match the DataFrame: 'Vintage', 'Model', 'Count'.
dfp = df.pivot_table(index='Vintage', columns='Model', values='Count', aggfunc='sum')
# Grouped bar chart: one group per vintage, one bar per model.
dfp.plot(kind='bar', figsize=(8, 4), rot=45, ylabel='Frequency', title="Vintages")
I tried dividing between columns in the pivot table but it's not the right format to plot.
How can I do the percentage calculation and line plots without creating a different table?
Could the whole task be done with groupby instead? (as I find it easier to use in general)
Here's a solution using the seaborn plotting library, not sure if it's ok for you to use it for your problem
import matplotlib.pyplot as plt
import pandas as pd  # needed for pd.DataFrame below
import seaborn as sns

df = pd.DataFrame({
    'Vintage': ['2016Q1', '2016Q1', '2016Q2', '2016Q3', '2016Q4', '2016Q1', '2016Q2', '2016Q2', '2016Q2', '2016Q3', '2016Q4'],
    'Model': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B'],
    'Count': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'Case': [0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0],
})

# Sum Count and Case per (Vintage, Model), then derive Case / Count.
agg_df = df.groupby(['Vintage', 'Model']).sum().reset_index()
agg_df['Fraction'] = agg_df['Case'] / agg_df['Count']

# Grouped bars: Count per Vintage, coloured by Model.
sns.barplot(
    x='Vintage',
    y='Count',
    hue='Model',
    alpha=0.5,
    data=agg_df,
)

# One line per Model showing the fraction; legend=False suppresses this plot's legend.
sns.lineplot(
    x='Vintage',
    y='Fraction',
    hue='Model',
    marker='o',
    legend=False,
    data=agg_df,
)

plt.show()
plt.close()
IIUC you want the lines to be drawn on the same plot. I'd recommend creating a new y-axis after computing the division from the original df. Then you can plot the lines with seaborn:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df = pd.DataFrame({
    'Vintage': [
        '2016Q1', '2016Q1', '2016Q2', '2016Q3', '2016Q4',
        '2016Q1', '2016Q2', '2016Q2', '2016Q2', '2016Q3', '2016Q4',
    ],
    'Model': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B'],
    'Count': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'Case': [0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0],
})

# Bars: summed Count per Vintage, one column per Model.
dfp = df.pivot_table(index='Vintage', columns='Model', values='Count', aggfunc='sum')
ax1 = dfp.plot(kind='bar', figsize=(8, 4), rot=45, ylabel='Frequency', title="Vintages")

# Lines: Case as a percentage of Count for every (Vintage, Model) pair.
dfd = (
    df.groupby(["Vintage", "Model"])
    .sum()
    .assign(div_pct=lambda sums: 100 * sums["Case"] / sums["Count"])
    .reset_index()
)

# Second y-axis sharing the same x-axis as the bars.
ax2 = ax1.twinx()
sns.lineplot(data=dfd, x="Vintage", y="div_pct", hue="Model", style="Model", ax=ax2, markers=True, dashes=False)
plt.show()
Output:
I have the following dataframe:
Color Level Proportion
-------------------------------------
0 Blue 1 0.1
1 Blue 2 0.3
2 Blue 3 0.6
3 Red 1 0.2
4 Red 2 0.5
5 Red 3 0.3
Here I have 2 color categories, where each color category has 3 levels, and each entry has a proportion; the proportions sum to 1 within each color category. I want to make a stacked bar chart from this dataframe that has 2 stacked bars, one for each color category. Within each of those stacked bars will be the proportion for each level, all summing to 1. So while the bars will be "stacked" differently, the complete bars will all have the same length of 1.
I have tried this:
df.plot(kind='bar', stacked=True)
I then get this stacked bar chart, which is not what I want:
I want 2 stacked bars, and so a stacked bar for "Blue" and a stacked bar for "Red", where these bars are "stacked" by the proportions, with the colors of these stacks corresponding to each level. And so both of these bars would be of length 1 along the x-axis, which would be labelled "proportion". How can I fix my code to create this stacked bar chart?
Make a pivot and then plot it:
# pivot() returns a new DataFrame, so keep the result and plot that one:
# one row per Color, one column per Level, values taken from Proportion.
dfp = df.pivot(index='Color', columns='Level', values='Proportion')
dfp.plot(kind='bar', stacked=True)
Edit: Cleaner legend
You could create a Seaborn sns.histplot using the proportion as weights and the level as hue:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

df = pd.DataFrame({'Color': ['Blue'] * 3 + ['Red'] * 3,
                   'Level': [1, 2, 3] * 2,
                   'Proportion': [.1, .3, .6, .2, .5, .3]})
sns.set_style('white')
# Weight each row by its Proportion and stack the Levels within each Color.
ax = sns.histplot(data=df, x='Color', weights='Proportion', hue='Level', multiple='stack', palette='flare', shrink=0.75)
ax.set_ylabel('Proportion')
# Write each segment's value, to two decimals, at the centre of the segment.
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', fmt='%.2f')
# Place the legend outside the axes, to the right.
sns.move_legend(ax, loc='upper left', bbox_to_anchor=(1, 0.97))
sns.despine()
plt.tight_layout()
plt.show()
I have the following dataframe
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

print(f'matplotlib: {matplotlib.__version__}')
# 3.5.3

# Sample data: one row per observation with its Type, Length and label.
df = pd.DataFrame({
    'Type': ['Sentence', 'Array', 'String', '-', '-'] * 2 + ['Sentence'],
    'Length': [42, 21, 11, 6, 6] * 2 + [42],
    'label': [1, 1, 0, 0, 0] * 2 + [1],
})
print(df)
# Type Length label
#0 Sentence 42 1
#1 Array 21 1
#2 String 11 0
#3 - 6 0
#4 - 6 0
#5 Sentence 42 1
#6 Array 21 1
#7 String 11 0
#8 - 6 0
#9 - 6 0
#10 Sentence 42 1
I want to plot a stacked bar chart for an arbitrary column of the dataframe (either numerical, e.g. the Length column, or categorical, e.g. the Type column), stacked with respect to the label column, with annotations showing both count and percentage, so that small values from rare observations are also displayed. The following script gives me the wrong results:
# Stacked bar plot of the DataFrame, called without x/y arguments.
ax = df.plot.bar(stacked=True)
#ax = df[["Type","label"]].plot.bar(stacked=True)
#ax = df.groupby('Type').size().plot(kind='bar', stacked=True)
ax.legend(["0: Normal", "1: Anomaly"])
# Write each patch's height, formatted as a percentage, at the centre of the patch.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    ax.text(x + width / 2,
            y + height / 2,
            '{:.0f} %'.format(height),
            horizontalalignment='center',
            verticalalignment='center')
I can imagine that somehow I need to calculate the counts of the selected column with respect to label column:
## counts: value counts of every column (used for the labels)
counts = df.apply(lambda column: column.value_counts())
## percents: each count divided by its row total (used for the height of each bar)
row_totals = counts.sum(axis=1)
percents = counts.div(row_totals, axis=0)
I tried to solve the problem by using df.groupby(['selected column', 'label']), unsuccessfully. I collected all possible solutions in this Google Colab Notebook; nevertheless, I couldn't find a straightforward way to adapt them to my dataframe.
So far I have tried the following solution, inspired by this post, using df.groupby(['selected column', 'label']), unsuccessfully: I get TypeError: unsupported operand type(s) for +: 'int' and 'str' for total = sum(dff.sum()), and I can't figure out whether the problem is in the indexing or in the df transformation.
BTW, I collected all possible solutions in this Google Colab Notebook; nevertheless, I couldn't find a straightforward way to adapt them to my dataframe via Matplotlib. So I'm looking for an elegant way of using Seaborn or Plotly.
# Count the rows of every (Type, label) pair; note that this rebinds df to the grouped result.
df = df.groupby(["Type","label"]).count()
#dfp_Type = df.pivot_table(index='Type', columns='label', values= 'Length', aggfunc='mean')
# NOTE(review): values= receives the result of calling df.Type.size() rather than a column
# name — presumably one source of the reported failure; confirm.
dfp_Type = df.pivot_table(index='Type', columns='label', values= df.Type.size(), aggfunc='mean')
#dfp_Length = df.pivot_table(index='Length', columns='label', values= df.Length.size(), aggfunc='mean')
ax = dfp_Type.plot(kind='bar', stacked=True, rot=0)
# iterate through each bar container; `labels` is rebuilt on every pass but is never
# passed to bar_label below
for c in ax.containers: labels = [v.get_height() if v.get_height() > 0 else '' for v in c]
# add the annotations
# NOTE(review): as written, this call is outside the one-line loop above, so it only
# receives whatever container `c` was bound to last
ax.bar_label(c, fmt='%0.0f%%', label_type='center')
# move the legend
ax.legend(title='Class', bbox_to_anchor=(1, 1.02), loc='upper left')
plt.show()
output:
Expected output:
The values in Expected output do not match df in the OP, so the sample DataFrame has been updated.
Plot with pandas.DataFrame.plot, using kind='bar' and stacked=True. pandas uses and imports matplotlib as the default plotting backend, so there's no need to import other plotting libraries.
Resources:
How to aggregate unique count with pandas pivot_table for details about using aggfunc=len in .pivot_table.
How to add value labels on a bar chart for details and examples about .bar_label.
How to add multiple annotations to a bar plot & How to create and annotate a stacked proportional bar chart for adding count and percent to a bar plot.
Tested in python 3.10, pandas 1.4.3, matplotlib 3.5.1
import pandas as pd

# sample dataframe
df = pd.DataFrame({'Type': ['Sentence', 'Array', 'String', '-', '-', 'Sentence', 'Array', 'String', '-', '-', 'Sentence'],
                   'Length': [42, 21, 11, 6, 6, 42, 21, 11, 6, 6, 42],
                   'label': [1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0]})

# pivot the dataframe and get the number of rows for each (Type, label) pair
dfp = df.pivot_table(index='Type', columns='label', values='Length', aggfunc=len)

# get the total for each row
total = dfp.sum(axis=1)

# calculate the percent for each row
per = dfp.div(total, axis=0).mul(100).round(2)

# plot the pivoted dataframe
ax = dfp.plot(kind='bar', stacked=True, figsize=(10, 8), rot=0)

# set the annotation color for each Class; keys are the container labels (strings)
segment_colors = {'0': 'white', '1': 'black'}

# iterate through the containers
for c in ax.containers:
    # get the current segment label (a string); corresponds to column / legend
    label = c.get_label()

    # create custom labels with the bar height and the percent from the per column;
    # the column labels in per and dfp are int, so convert label to int
    labels = [f'{v.get_height()}\n({row}%)' if v.get_height() > 0 else '' for v, row in zip(c, per[int(label)])]

    # add the annotation
    ax.bar_label(c, labels=labels, label_type='center', fontweight='bold', color=segment_colors[label])

# move the legend
_ = ax.legend(title='Class', bbox_to_anchor=(1, 1.01), loc='upper left')
Comment Updates
How to always have a spot for 'Array' if it's not in the data:
Add 'Array' to dfp if it's not in dfp.index.
df.Type = pd.Categorical(df.Type, ['-', 'Array', 'Sentence', 'String'], ordered=True) does not ensure the missing categories are plotted.
How to have all the annotations, even if they're small:
Don't stack the bars, and set logy=True.
This uses the full-data, which was provided in a link.
import numpy as np  # needed for np.nan below

# pivot the dataframe and get the number of rows for each (Type, label) pair
dfp = df.pivot_table(index='Type', columns='label', values='Length', aggfunc=len)

# append an all-NaN 'Array' row if it's not included, so it still gets a spot on the x-axis
if 'Array' not in dfp.index:
    dfp = pd.concat([dfp, pd.DataFrame({0: [np.nan], 1: [np.nan]}, index=['Array'])])

# order the index
dfp = dfp.loc[['-', 'Array', 'Sentence', 'String'], :]

# calculate the percent for each row
per = dfp.div(dfp.sum(axis=1), axis=0).mul(100).round(2)

# plot the pivoted dataframe: side-by-side bars on a log-scaled y-axis
ax = dfp.plot(kind='bar', stacked=False, figsize=(10, 8), rot=0, logy=True, width=0.75)

# iterate through the containers
for c in ax.containers:
    # get the current segment label (a string); corresponds to column / legend
    label = c.get_label()

    # create custom labels with the bar height and the percent from the per column;
    # the column labels in per and dfp are int, so convert label to int
    labels = [f'{v.get_height()}\n({row}%)' if v.get_height() > 0 else '' for v, row in zip(c, per[int(label)])]

    # add the annotation
    ax.bar_label(c, labels=labels, label_type='edge', fontsize=10, fontweight='bold')

# move the legend
ax.legend(title='Class', bbox_to_anchor=(1, 1.01), loc='upper left')

# pad the spacing between the number and the edge of the figure
_ = ax.margins(y=0.1)
DataFrame Views
Based on the sample data in the OP
df
Type Length label
0 Sentence 42 1
1 Array 21 1
2 String 11 0
3 - 6 0
4 - 6 0
5 Sentence 42 1
6 Array 21 1
7 String 11 0
8 - 6 0
9 - 6 1
10 Sentence 42 0
dfp
label 0 1
Type
- 3.0 1.0
Array NaN 2.0
Sentence 1.0 2.0
String 2.0 NaN
total
Type
- 4.0
Array 2.0
Sentence 3.0
String 2.0
dtype: float64
per
label 0 1
Type
- 75.00 25.00
Array NaN 100.00
Sentence 33.33 66.67
String 100.00 NaN
I slightly adjusted the data so the graph would look identical to yours(e.g., Type:-'s label has three 0 and one 1)
df
###
Type Length label
0 Sentence 42 1
1 Array 21 1
2 String 11 0
3 - 6 0
4 - 6 0
5 Sentence 42 1
6 Array 21 1
7 String 11 0
8 - 6 0
9 - 6 1
10 Sentence 42 0
# Count the rows of every (Type, label) pair once; the label column is renamed to
# Class and stored as a string.
df_plot = (
    df.groupby(['Type', 'label'])
    .size()
    .reset_index(name='count')
    .rename(columns={'label': 'Class'})
    .astype({'Class': 'str'})
)
# Share of each Class within its Type, in percent. transform() returns a result
# aligned with df_plot's own index, so no positional re-attachment is needed.
pct = (100 * df_plot['count'] / df_plot.groupby('Type')['count'].transform('sum')).round(2)
df_plot['percentage'] = "(" + pct.astype(str) + '%)'
df_plot
###
Type Class count percentage
0 - 0 3 (75.0%)
1 - 1 1 (25.0%)
2 Array 1 2 (100.0%)
3 Sentence 0 1 (33.33%)
4 Sentence 1 2 (66.67%)
5 String 0 2 (100.0%)
import plotly.express as px  # px is used below

# Bar chart of count per Type, coloured by Class, on a log-scaled y-axis.
# Each bar's text is its count followed by the percentage on a new line.
fig = px.bar(df_plot,
             x='Type',
             y='count',
             color='Class',
             text=df_plot['count'].astype(str) + "<br>" + df_plot['percentage'],
             width=550,
             height=400,
             category_orders={'Type': ['-', 'Array', 'Sentence', 'String']},
             template='plotly_white',
             log_y=True
             )
fig.show('browser')
With your CSV file, I followed the same ELT steps to build df_plot2.
Because the counts for Class 0 and Class 1 differ hugely,
a stacked bar chart (the default setting) won't give you a distinguishable outcome,
so we can use barmode='group' instead:
# Text for every bar: the count, a line break, then the percentage.
bar_text = df_plot2['count'].astype(str) + "<br>" + df_plot2['percentage']
type_order = {'Type': ['-', 'Array', 'Sentence', 'String']}
class_colors = {'0': '#5DA597', '1': '#FFC851'}

# Grouped (side-by-side) bars of count per Type, coloured by Class, on a log-scaled y-axis.
fig2 = px.bar(
    df_plot2,
    x='Type',
    y='count',
    color='Class',
    barmode='group',
    text=bar_text,
    color_discrete_map=class_colors,
    category_orders=type_order,
    width=850,
    height=800,
    template='plotly_white',
    log_y=True,
)
fig2.update_yaxes(dtick=1)
fig2.show('browser')
I am trying to plot the following data as a horizontal stacked barplot. I would like to show the Week 1 and Week 2, as bars with the largest bar size ('Total') at the top and then descending down. The actual data is 100 lines so I arrived at using Seaborn catplots with kind='bar'. I'm not sure if possible to stack (like Matplotlib) so I opted to create two charts and overlay 'Week 1' on top of 'Total', for the same stacked effect.
However, when I run the code below I get two separate plots, and the chart title and axis appear on only one of the graphs. Am I able to combine this into one stacked horizontal chart? If there is an easier way, I would appreciate finding out.
Company           Week 1  Week 2  Total
Stanley Atherton  0       1       1
Dennis Auton      1       1       2
David Bailey      3       8       11
Alan Ball         5       2       7
Philip Barker     3       0       3
Mark Beirne       0       1       1
Phyllis Blitz     3       0       3
Simon Blower      4       2       6
Steven Branton    5       7       12
Rebecca Brown     0       4       4
(Names created from random name generator)
Code:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook magic, not plain Python syntax — presumably this snippet is meant to run under IPython/Jupyter.
%matplotlib inline
# Read the tab-delimited sample file.
data = pd.read_csv('Sample1.csv', delimiter="\t", error_bad_lines=False)
# NOTE(review): sorts on "Attending", which is not among the columns of the sample table
# shown with the question (Company, Week 1, Week 2, Total) — confirm the intended sort key.
data_rank = data.sort_values(["Attending", "Company"], ascending=[False,True])
sns.set(style="ticks")
# NOTE(review): `ax` is not assigned anywhere earlier in this snippet.
g = sns.catplot(y='Company', x='Total', data=data_rank, kind='bar', height=4, color='red', aspect=0.8, ax=ax)
ax2 =ax.twinx()
# g is rebound here, so the loop below only visits the axes of this second catplot.
g = sns.catplot(y='Company', x='Week 1', data=data_rank, kind='bar', height=4, color='blue', aspect=0.8, ax=ax2)
# Move the x-axis ticks and label to the top and make the bottom and top spines visible.
for ax in g.axes[0]:
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
ax.spines['bottom'].set_visible(True)
ax.spines['top'].set_visible(True)
plt.title("Company by week ", size=7)
catplot 1
catplot 2
I think something like this works.
# Draw the Total bars first, then draw the Week 1 bars over them on the same axes.
g = sns.barplot(y='Company', x='Total', data=data_rank, color='red', label='Total')
# The column is named 'Week 1' (with a space) in the data.
g = sns.barplot(y='Company', x='Week 1', data=data_rank, color='blue', label='Week 1')
plt.title("Company by week ", size=12)
plt.xlabel('Frequency')
plt.legend()
plt.show()
I'm trying to create a grouped, stacked bar chart.
Currently I have the following DataFrame:
>>> df
Value
Rating 1 2 3
Context Parameter
Total 1 43.312347 9.507902 1.580367
2 42.862649 9.482205 1.310549
3 43.710651 9.430811 1.400488
4 43.209559 9.803418 1.349094
5 42.541436 10.008994 1.220609
6 42.978286 9.430811 1.336246
7 42.734164 10.317358 1.606064
User 1 47.652348 11.138861 2.297702
2 47.102897 10.589411 1.848152
3 46.853147 10.139860 1.848152
4 47.252747 11.138861 1.748252
5 45.954046 10.239760 1.448551
6 46.353646 10.439560 1.498501
7 47.102897 11.338661 1.998002
I'd like to have for each Parameter the bars for Total and User grouped together.
This is the resulting chart with df.plot(kind='bar', stacked=True):
The bars themselves look right, but how do I get the bars for Total and User next to each other for each Parameter, ideally with some margin between the parameters?
The following approach allows grouped and stacked bars at the same time.
First the dataframe is sorted by parameter, context. Then the context is unstacked from the index, creating new columns for every context, value pair.
Finally, three bar plots are drawn over each other to visualize the stacked bars.
import pandas as pd
from matplotlib import pyplot as plt

# One row per (Context, Parameter) with the three values to stack.
rows = [
    ['Total', 1, 43.312347, 9.507902, 1.580367],
    ['Total', 2, 42.862649, 9.482205, 1.310549],
    ['Total', 3, 43.710651, 9.430811, 1.400488],
    ['Total', 4, 43.209559, 9.803418, 1.349094],
    ['Total', 5, 42.541436, 10.008994, 1.220609],
    ['Total', 6, 42.978286, 9.430811, 1.336246],
    ['Total', 7, 42.734164, 10.317358, 1.606064],
    ['User', 1, 47.652348, 11.138861, 2.297702],
    ['User', 2, 47.102897, 10.589411, 1.848152],
    ['User', 3, 46.853147, 10.139860, 1.848152],
    ['User', 4, 47.252747, 11.138861, 1.748252],
    ['User', 5, 45.954046, 10.239760, 1.448551],
    ['User', 6, 46.353646, 10.439560, 1.498501],
    ['User', 7, 47.102897, 11.338661, 1.998002],
]
df = pd.DataFrame(rows, columns=['Context', 'Parameter', 'Val1', 'Val2', 'Val3'])
df = df.set_index(['Context', 'Parameter'])

# Sort by Parameter, then Context, and move Context from the index into the
# columns, giving one column per (value, context) pair.
df0 = (
    df.reorder_levels(['Parameter', 'Context'])
    .sort_index()
    .unstack(level=-1)
)

colors = plt.cm.Paired.colors
fig, ax = plt.subplots()

# Three grouped bar plots drawn over each other; each later layer is a smaller
# partial sum, so together they look like stacked bars.
layers = [
    (df0['Val1'] + df0['Val2'] + df0['Val3'], [colors[1], colors[0]]),
    (df0['Val2'] + df0['Val3'], [colors[3], colors[2]]),
    (df0['Val3'], [colors[5], colors[4]]),
]
for heights, layer_colors in layers:
    heights.plot(kind='bar', color=layer_colors, rot=0, ax=ax)

# One legend entry per (value, context) column.
legend_labels = [f'{val} ({context})' for val, context in df0.columns]
ax.legend(legend_labels)
plt.tight_layout()
plt.show()
Here's a way to do it:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")

# reshape your data - ensure no index is set initially
# Stack the value columns into rows, keyed by (Parameter, Context). The helper
# column produced by reset_index ('level_2') is dropped by keyword, because the
# positional axis argument of DataFrame.drop is not accepted by pandas 2.x.
df1 = (df
       .set_index(['Parameter', 'Context'])
       .stack()
       .reset_index()
       .drop(columns='level_2')
       .rename(columns={0: 'value'}))
print(df1.head())
Parameter Context value
0 1 Total 43.312347
1 1 Total 9.507902
2 1 Total 1.580367
3 2 Total 42.862649
4 2 Total 9.482205
# Bars of value per Parameter, one hue per Context.
sns.barplot(data=df1, x='Parameter', y='value', hue='Context', errwidth=0.1)