Create a stacked bar plot and annotate with count and percent - python

I have the following dataframe
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
print('matplotlib: {}'.format(matplotlib.__version__))
# 3.5.3
df=pd.DataFrame({'Type': [ 'Sentence', 'Array', 'String', '-','-', 'Sentence', 'Array', 'String', '-','-', 'Sentence'],
'Length': [42,21,11,6,6,42,21,11,6,6,42],
'label': [1,1,0,0,0,1,1,0,0,0,1],
})
print(df)
# Type Length label
#0 Sentence 42 1
#1 Array 21 1
#2 String 11 0
#3 - 6 0
#4 - 6 0
#5 Sentence 42 1
#6 Array 21 1
#7 String 11 0
#8 - 6 0
#9 - 6 0
#10 Sentence 42 1
I want to plot a stacked bar chart for an arbitrary column of the dataframe (either numerical, e.g. the Length column, or categorical, e.g. the Type column), stacked with respect to the label column and annotated with both count and percentage, so that small values from rare observations are also displayed. The following script gives me the wrong results:
ax = df.plot.bar(stacked=True)
#ax = df[["Type","label"]].plot.bar(stacked=True)
#ax = df.groupby('Type').size().plot(kind='bar', stacked=True)
ax.legend(["0: normanl", "1: Anomaly"])
for p in ax.patches:
width, height = p.get_width(), p.get_height()
x, y = p.get_xy()
ax.text(x+width/2,
y+height/2,
'{:.0f} %'.format(height),
horizontalalignment='center',
verticalalignment='center')
I can imagine that somehow I need to calculate the counts of the selected column with respect to label column:
## counts will be used for the labels
counts = df.apply(lambda x: x.value_counts())
## percents will be used to determine the height of each bar
percents = counts.div(counts.sum(axis=1), axis=0)
I tried to solve the problem by using df.groupby(['selected column', 'label']) unsuccessfully. I collected all possible solutions in this Google Colab Notebook; nevertheless, I couldn't find a straightforward way to adapt them to my dataframe.
So far I have tried the following solution, inspired by this post, which uses df.groupby(['selected column', 'label']), without success: I get TypeError: unsupported operand type(s) for +: 'int' and 'str' for total = sum(dff.sum()), and I can't figure out whether the problem is in the indexing or in the df transformation.
BTW, I collected all possible solutions in this Google Colab Notebook; nevertheless, I couldn't find a straightforward way to adapt them to my dataframe via Matplotlib. So I'm looking for an elegant way of using Seaborn or Plotly.
df = df.groupby(["Type","label"]).count()
#dfp_Type = df.pivot_table(index='Type', columns='label', values= 'Length', aggfunc='mean')
dfp_Type = df.pivot_table(index='Type', columns='label', values= df.Type.size(), aggfunc='mean')
#dfp_Length = df.pivot_table(index='Length', columns='label', values= df.Length.size(), aggfunc='mean')
ax = dfp_Type.plot(kind='bar', stacked=True, rot=0)
# iterate through each bar container
for c in ax.containers: labels = [v.get_height() if v.get_height() > 0 else '' for v in c]
# add the annotations
ax.bar_label(c, fmt='%0.0f%%', label_type='center')
# move the legend
ax.legend(title='Class', bbox_to_anchor=(1, 1.02), loc='upper left')
plt.show()
output:
Expected output:

The values in Expected output do not match df in the OP, so the sample DataFrame has been updated.
Plot with pandas.DataFrame.plot, using kind='bar' and stacked=True. pandas uses and imports matplotlib as the default plotting backend, so there's no need to import other plotting libraries.
Resources:
How to aggregate unique count with pandas pivot_table for details about using aggfunc=len in .pivot_table.
How to add value labels on a bar chart for details and examples about .bar_label.
How to add multiple annotations to a bar plot & How to create and annotate a stacked proportional bar chart for adding count and percent to a bar plot.
Tested in python 3.10, pandas 1.4.3, matplotlib 3.5.1
import pandas as pd
# sample data: one row per observation, with its Type, Length and binary label
df = pd.DataFrame({
    'Type': ['Sentence', 'Array', 'String', '-', '-', 'Sentence', 'Array', 'String', '-', '-', 'Sentence'],
    'Length': [42, 21, 11, 6, 6, 42, 21, 11, 6, 6, 42],
    'label': [1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0],
})
# count the rows for every (Type, label) pair: Type becomes the index, label the columns
dfp = df.pivot_table(values='Length', index='Type', columns='label', aggfunc=len)
# number of rows per Type, summed across both label columns
total = dfp.sum(axis=1)
# share of each label within its Type, in percent with two decimals
per = (dfp.div(total, axis=0) * 100).round(2)
# plot the pivoted dataframe as stacked bars
ax = dfp.plot(kind='bar', stacked=True, figsize=(10, 8), rot=0)

# annotation text color for each Class, keyed by the container label (a string)
segment_colors = {'0': 'white', '1': 'black'}

# iterate through the containers
for c in ax.containers:
    # get the current segment label (a string); corresponds to column / legend
    label = c.get_label()
    # the column labels in per and dfp are int, so convert label to int
    percents = per[int(label)]
    # custom labels with the bar height and the percent from the per column;
    # bars whose height is not > 0 (e.g. NaN counts) get an empty annotation
    labels = [
        f'{v.get_height()}\n({row}%)' if v.get_height() > 0 else ''
        for v, row in zip(c, percents)
    ]
    # add the annotation in the middle of each segment
    ax.bar_label(c, labels=labels, label_type='center', fontweight='bold', color=segment_colors[label])

# move the legend outside the axes
_ = ax.legend(title='Class', bbox_to_anchor=(1, 1.01), loc='upper left')
Comment Updates
How to always have a spot for 'Array' if it's not in the data:
Add 'Array' to dfp if it's not in dfp.index.
df.Type = pd.Categorical(df.Type, ['-', 'Array', 'Sentence', 'String'], ordered=True) does not ensure the missing categories are plotted.
How to have all the annotations, even if they're small:
Don't stack the bars, and set logy=True.
This uses the full-data, which was provided in a link.
# pivot the dataframe and get len
dfp = df.pivot_table(index='Type', columns='label', values='Length', aggfunc=len)
# order the index and guarantee a row for every expected Type: reindex inserts
# an all-NaN row for any category (e.g. 'Array') that is absent from the data,
# so no numpy import or manual concat is needed
dfp = dfp.reindex(['-', 'Array', 'Sentence', 'String'])
# calculate the percent for each row
per = dfp.div(dfp.sum(axis=1), axis=0).mul(100).round(2)

# plot the pivoted dataframe; unstacked bars on a log y-axis keep small counts visible
ax = dfp.plot(kind='bar', stacked=False, figsize=(10, 8), rot=0, logy=True, width=0.75)

# iterate through the containers
for c in ax.containers:
    # get the current segment label (a string); corresponds to column / legend
    label = c.get_label()
    # create custom labels with the bar height and the percent from the per column
    # the column labels in per and dfp are int, so convert label to int
    labels = [
        f'{v.get_height()}\n({row}%)' if v.get_height() > 0 else ''
        for v, row in zip(c, per[int(label)])
    ]
    # add the annotation at the end of each bar
    ax.bar_label(c, labels=labels, label_type='edge', fontsize=10, fontweight='bold')

# move the legend
ax.legend(title='Class', bbox_to_anchor=(1, 1.01), loc='upper left')
# pad the spacing between the number and the edge of the figure
_ = ax.margins(y=0.1)
DataFrame Views
Based on the sample data in the OP
df
Type Length label
0 Sentence 42 1
1 Array 21 1
2 String 11 0
3 - 6 0
4 - 6 0
5 Sentence 42 1
6 Array 21 1
7 String 11 0
8 - 6 0
9 - 6 1
10 Sentence 42 0
dfp
label 0 1
Type
- 3.0 1.0
Array NaN 2.0
Sentence 1.0 2.0
String 2.0 NaN
total
Type
- 4.0
Array 2.0
Sentence 3.0
String 2.0
dtype: float64
per
label 0 1
Type
- 75.00 25.00
Array NaN 100.00
Sentence 33.33 66.67
String 100.00 NaN

I slightly adjusted the data so the graph would look identical to yours(e.g., Type:-'s label has three 0 and one 1)
df
###
Type Length label
0 Sentence 42 1
1 Array 21 1
2 String 11 0
3 - 6 0
4 - 6 0
5 Sentence 42 1
6 Array 21 1
7 String 11 0
8 - 6 0
9 - 6 1
10 Sentence 42 0
# count the rows for every (Type, label) pair once and reuse the result below
counts = df.groupby(['Type', 'label']).size()
# share of each label within its Type, in percent; transform('sum') returns the
# per-Type total aligned row-for-row with counts, so no apply/reordering is involved
percent = (100 * counts / counts.groupby(level=0).transform('sum')).round(2)

# long-form table for plotting: one row per (Type, Class) with its count
df_plot = counts.reset_index()
df_plot.columns = ['Type', 'Class', 'count']
# store Class as a string (the plotting calls key their colors by '0' / '1')
df_plot = df_plot.astype({'Class': 'str'})
# percentage text shown next to the count, e.g. "(75.0%)"
df_plot['percentage'] = percent.to_numpy().astype(str)
df_plot['percentage'] = "(" + df_plot['percentage'] + '%)'
df_plot
###
Type Class count percentage
0 - 0 3 (75.0%)
1 - 1 1 (25.0%)
2 Array 1 2 (100.0%)
3 Sentence 0 1 (33.33%)
4 Sentence 1 2 (66.67%)
5 String 0 2 (100.0%)
fig = px.bar(df_plot,
x='Type',
y='count',
color='Class',
text=df_plot['count'].astype(str) + "<br>" + df_plot['percentage'],
width=550,
height=400,
category_orders={'Type':['-','Array','Sentence','String']},
template='plotly_white',
log_y=True
)
fig.show('browser')
With your CSV file, I followed the same ELT steps to build df_plot2.
Because the counts for Class 0 and Class 1 differ hugely,
a stacked bar chart (the default setting) won't give you a distinguishable outcome,
so we can use barmode='group' instead:
fig2 = px.bar(df_plot2,
barmode='group',
x='Type',
y='count',
color='Class',
color_discrete_map={'0':'#5DA597', '1':'#FFC851'},
text=df_plot2['count'].astype(str) + "<br>" + df_plot2['percentage'],
width=850,
height=800,
category_orders={'Type': ['-', 'Array', 'Sentence', 'String']},
template='plotly_white',
log_y=True,
)
fig2.update_yaxes(dtick=1)
fig2.show('browser')

Related

Divide two columns in pivot table and plot grouped bar chart with pandas

I have a dataset that looks like this:
df = pd.DataFrame({
'Vintage': ['2016Q1','2016Q1', '2016Q2','2016Q3','2016Q4','2016Q1', '2016Q2','2016Q2','2016Q2','2016Q3','2016Q4'],
'Model': ['A','A','A','A','A','B','B','B','B','B','B',],
'Count': [1,1,1,1,1,1,1,1,1,1,1],
'Case':[0,1,1,0,1,1,0,0,1,1,0],
})
Vintage Model Count Case
0 2016Q1 A 1 0
1 2016Q1 A 1 1
2 2016Q2 A 1 1
3 2016Q3 A 1 0
4 2016Q4 A 1 1
5 2016Q1 B 1 1
6 2016Q2 B 1 0
7 2016Q2 B 1 0
8 2016Q2 B 1 1
9 2016Q3 B 1 1
10 2016Q4 B 1 0
What I need to do is:
Plot grouped bar chart, where vintage is the groups and model is the hue/color
Two line plots in the same chart that show the percentage of case over count, aka plot the division of case over count for each model and vintage.
I figured out how to do the first task with a pivot table but haven't been able to add the percentage from the same pivot.
This is the solution for point 1:
# column labels are case-sensitive: the frame defines 'Vintage', 'Model' and 'Count'
# sum Count for every (Vintage, Model) pair; Vintage is the index, Model the columns
dfp = df.pivot_table(index='Vintage', columns='Model', values='Count', aggfunc='sum')
# grouped bars: one group per Vintage, one bar per Model
dfp.plot(kind='bar', figsize=(8, 4), rot=45, ylabel='Frequency', title="Vintages")
I tried dividing between columns in the pivot table but it's not the right format to plot.
How can I do the percentage calculation and line plots so without creating a different table?
Could the whole task be done with groupby instead? (as I find it easier to use in general)
Here's a solution using the seaborn plotting library, not sure if it's ok for you to use it for your problem
import matplotlib.pyplot as plt
import seaborn as sns
# sample data from the question
df = pd.DataFrame({
    'Vintage': ['2016Q1', '2016Q1', '2016Q2', '2016Q3', '2016Q4',
                '2016Q1', '2016Q2', '2016Q2', '2016Q2', '2016Q3', '2016Q4'],
    'Model': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B'],
    'Count': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'Case': [0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0],
})
# total Count and Case for every (Vintage, Model) pair, with the keys kept as columns
agg_df = df.groupby(['Vintage', 'Model'], as_index=False).sum()
# Case divided by Count for each pair
agg_df = agg_df.assign(Fraction=agg_df['Case'] / agg_df['Count'])
sns.barplot(
x = 'Vintage',
y = 'Count',
hue = 'Model',
alpha = 0.5,
data = agg_df,
)
sns.lineplot(
x = 'Vintage',
y = 'Fraction',
hue = 'Model',
marker = 'o',
legend = False,
data = agg_df,
)
plt.show()
plt.close()
IIUC you want the lines to be drawn on the same plot. I'd recommend creating a new y-axis after computing the division from the original df. Then you can plot the lines with seaborn:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.DataFrame({
'Vintage': ['2016Q1','2016Q1', '2016Q2','2016Q3','2016Q4','2016Q1', '2016Q2','2016Q2','2016Q2','2016Q3','2016Q4'],
'Model': ['A','A','A','A','A','B','B','B','B','B','B',],
'Count': [1,1,1,1,1,1,1,1,1,1,1],
'Case':[0,1,1,0,1,1,0,0,1,1,0],
})
dfp = df.pivot_table(index='Vintage', columns='Model', values='Count', aggfunc='sum')
ax1 = dfp.plot(kind='bar', figsize=(8, 4), rot=45, ylabel='Frequency', title="Vintages")
dfd = df.groupby(["Vintage", "Model"]).sum() \
.assign(div_pct=lambda x:100*x["Case"]/x["Count"]) \
.reset_index()
ax2 = ax1.twinx() # creating a second y axis
sns.lineplot(data=dfd, x="Vintage", y="div_pct", hue="Model", style="Model", ax=ax2, markers=True, dashes=False)
plt.show()
Output:

Scatter Plot Binary Data Color Coded Points from Data Labels

I'd like to use matplotlib.pyplot.scatter to create a scatter plot similar to the picture below from data in a dataframe with a header that is formatted similar to the table here where all the points for a given sample are colored based on the label in the first column of the data and a point is only plotted for each gene with a value of 1 - no point for the genes with a 0 value:
label
gene a
gene b
gene c
gene d
1
0
1
0
0
0
1
1
0
1
0
0
0
1
0
1
0
0
0
0
1
0
1
0
0
Note: my sample data does not match my sample scatter plot output.
After melting your dataframe to a long format you can draw a matrix with seaborn's sns.relplot
import pandas as pd
import seaborn as sns
sns.set_style("ticks")
df = pd.read_html('https://stackoverflow.com/q/70856944/14277722')[0]
df['sample'] = df.index
df = df.melt(['label','sample'])
g = sns.relplot(
data=df,
x="variable", y="sample", hue="label", size="value",
hue_norm=(-1, 1), palette='tab10',
height=6, sizes=(10, 300), size_norm=(0, 1)
)
g.set(xlabel="Genes", ylabel="Samples",
# ylim=[df['sample'].max()+.5, df['sample'].min()-.5] # uncomment to invert the y-axis
);
With the melted dataframe you can access plt.scatter directly from pandas but I think you have to add your own custom legend for the labels.
df.plot(x='variable', y='sample', s=(df.value+0.1) * 300, kind='scatter',
ylim=[df['sample'].max()+.5, df['sample'].min()-.5], # uncomment to flip y-axis
figsize=(7,6), c='label', cmap='coolwarm', colorbar=False
);

Plotting multiple groups from a dataframe with datashader as lines

I am trying to make plots with datashader. The data itself is a time series of points in polar coordinates. I managed to transform them to Cartesian coordinates (to have equally spaced pixels) and I can plot them with datashader.
The point where I am stuck is that if I plot them with line() instead of points(), it just connects the whole dataframe as a single line. I would like to plot the data of the dataframe group by group (the groups are the names in list_of_names) onto the canvas as lines.
data can be found here
i get this kind of image with datashader
This is a zoomed in view of the plot generated with points() instead of line() the goal is to produce the same plot but with connected lines instead of points
import datashader as ds, pandas as pd, colorcet
import numby as np
df = pd.read_csv('file.csv')
print(df)
starlink_name = df.loc[:,'Name']
starlink_alt = df.loc[:,'starlink_alt']
starlink_az = df.loc[:,'starlink_az']
name = starlink_name.values
alt = starlink_alt.values
az = starlink_az.values
print(name)
print(df['Name'].nunique())
df['Date'] = pd.to_datetime(df['Date'])
for name, df_name in df.groupby('Name'):
print(name)
df_grouped = df.groupby('Name')
list_of_names = list(df_grouped.groups)
print(len(list_of_names))
#########################################################################################
#i want this kind of plot with connected lines with datashader
#########################################################################################
fig = plt.figure()
ax = fig.add_axes([0.1,0.1,0.8,0.8], polar=True)
# ax.invert_yaxis()
ax.set_theta_zero_location('N')
ax.set_rlim(90, 60, 1)
# Note: you must set the end of arange to be slightly larger than 90 or it won't include 90
ax.set_yticks(np.arange(0, 91, 15))
ax.set_rlim(bottom=90, top=0)
for name in list_of_names:
df2 = df_grouped.get_group(name)
ax.plot(np.deg2rad(df2['starlink_az']), df2['starlink_alt'], linestyle='solid', marker='.',linewidth=0.5, markersize=0.1)
plt.show()
print(df)
#########################################################################################
#transformation to cartasian coordiantes
#########################################################################################
df['starlink_alt'] = 90 - df['starlink_alt']
df['x'] = df.apply(lambda row: np.deg2rad(row.starlink_alt) * np.cos(np.deg2rad(row.starlink_az)), axis=1)
df['y'] = df.apply(lambda row: -1 * np.deg2rad(row.starlink_alt) * np.sin(np.deg2rad(row.starlink_az)), axis=1)
#########################################################################################
# this is what i want but as lines group per group
#########################################################################################
cvs = ds.Canvas(plot_width=2000, plot_height=2000)
agg = cvs.points(df, 'y', 'x')
img = ds.tf.shade(agg, cmap=colorcet.fire, how='eq_hist')
#########################################################################################
#here i am stuck
#########################################################################################
for name in list_of_names:
df2 = df_grouped.get_group(name)
cvs = ds.Canvas(plot_width=2000, plot_height=2000)
agg = cvs.line(df2, 'y', 'x')
img = ds.tf.shade(agg, cmap=colorcet.fire, how='eq_hist')
#plt.imshow(img)
plt.show()
To do this, you have a couple options. One is inserting NaN rows as a breakpoint into your dataframe when using cvs.line. You need DataShader to "pick up the pen" as it were, by inserting a row of NaNs after each group. It's not the slickest, but that's a current recommended solution.
Really simple, hacky example:
In [17]: df = pd.DataFrame({
...: 'name': list('AABBCCDD'),
...: 'x': np.arange(8),
...: 'y': np.arange(10, 18),
...: })
In [18]: df
Out[18]:
name x y
0 A 0 10
1 A 1 11
2 B 2 12
3 B 3 13
4 C 4 14
5 C 5 15
6 D 6 16
7 D 7 17
This block groups on the 'name' column, then reindexes each group to be one element longer than the original data:
In [20]: res = df.set_index('name').groupby('name').apply(
...: lambda x: x.reset_index(drop=True).reindex(np.arange(len(x) + 1))
...: )
In [21]: res
Out[21]:
x y
name
A 0 0.0 10.0
1 1.0 11.0
2 NaN NaN
B 0 2.0 12.0
1 3.0 13.0
2 NaN NaN
C 0 4.0 14.0
1 5.0 15.0
2 NaN NaN
D 0 6.0 16.0
1 7.0 17.0
2 NaN NaN
You can plug this reindexed dataframe into datashader to have multiple disconnected lines in the result.
This is a still-open issue on the datashader repo, including additional examples and boilerplate code: https://github.com/holoviz/datashader/issues/257
Other options include restructuring your data to accommodate one of cvs.line's other formats. From the Canvas.line docstring:
def line(self, source, x=None, y=None, agg=None, axis=0, geometry=None,
antialias=False):
Parameters
----------
source : pandas.DataFrame, dask.DataFrame, or xarray.DataArray/Dataset
The input datasource.
x, y : str or number or list or tuple or np.ndarray
Specification of the x and y coordinates of each vertex
* str or number: Column labels in source
* list or tuple: List or tuple of column labels in source
* np.ndarray: When axis=1, a literal array of the
coordinates to be used for every row
agg : Reduction, optional
Reduction to compute. Default is ``any()``.
axis : 0 or 1, default 0
Axis in source to draw lines along
* 0: Draw lines using data from the specified columns across
all rows in source
* 1: Draw one line per row in source using data from the
specified columns
There are a number of additional examples in the cvs.line docstring. You can pass arrays as the x, y arguments giving multiple columns to use in forming lines when axis=1, or you can pass a dataframe with ragged array values.
See this pull request adding the line options (h/t to @James-a-bednar in the comments) for a discussion of their use.

Generating an histogram with Matplotlib using a dataframe for x and y

I'm generating a simple line chart with Matplotlib, here is my code:
fig = plt.figure(facecolor='#131722',dpi=155, figsize=(8, 4))
ax1 = plt.subplot2grid((1,2), (0,0), facecolor='#131722')
for x in OrderedList:
rate_buy = []
total_buy = []
for y in x['data']['bids']:
rate_buy.append(y[0])
total_buy.append(y[1])
rBuys = pd.DataFrame({'buy': rate_buy})
tBuys = pd.DataFrame({'total': total_buy})
ax1.plot(rBuys.buy, tBuys.total, color='#0400ff', linewidth=0.5, alpha=1)
ax1.fill_between(rBuys.buy, 0, tBuys.total, facecolor='#0400ff', alpha=1)
Which gives me the following output:
And here is the data i used in the dataframe:
buy
0 9611
1 9610
2 9609
3 9608
4 9607
5 9606
6 9605
7 9604
8 9603
9 9602
10 9601
11 9600
12 9599
total
0 3.033661
1 3.295753
2 3.599813
3 22.305765
4 22.987476
5 30.975145
6 39.492845
7 42.828580
8 46.677708
9 49.533740
10 50.925840
11 61.396243
12 61.921523
I want to get the same output as in the image, but with a histogram chart or something similar, where the height of each column on the y axis is taken from the total dataframe and its x-axis position is taken from the buy dataframe. So the first element will be at position x=9611 with y=3.033661.
Is it possible to do that with Matplotlib? I tried to use hist, but it doesn't allow me to set both the x and the y axis.
Pandas uses matplotlib as well, and the API is very easy once you have the dataframe.
Here is an example.
# sample data from the question: x positions ('buy') and bar heights ('total')
d = {
    'buy': [
        9611, 9610, 9609, 9608, 9607, 9606, 9605,
        9604, 9603, 9602, 9601, 9600, 9599,
    ],
    'total': [
        3.033661, 3.295753, 3.599813, 22.305765, 22.987476,
        30.975145, 39.492845, 42.828580, 46.677708, 49.533740,
        50.925840, 61.396243, 61.921523,
    ],
}
# sort ascending by the x values so the bars are drawn in increasing 'buy' order
df = pd.DataFrame(d).sort_values(by='buy')
df.plot(kind='bar', x='buy', y='total', width=1)
plt.show()

Grouping boxplots in seaborn when input is a DataFrame

I intend to plot multiple columns in a pandas dataframe, all grouped by another column using groupby inside seaborn.boxplot. There is a nice answer here, for a similar problem in matplotlib matplotlib: Group boxplots but given the fact that seaborn.boxplot comes with groupby option I thought it could be much easier to do this in seaborn.
Here we go with a reproducible example that fails:
import seaborn as sns
import pandas as pd
df = pd.DataFrame([[2, 4, 5, 6, 1], [4, 5, 6, 7, 2], [5, 4, 5, 5, 1],
[10, 4, 7, 8, 2], [9, 3, 4, 6, 2], [3, 3, 4, 4, 1]],
columns=['a1', 'a2', 'a3', 'a4', 'b'])
# display(df)
a1 a2 a3 a4 b
0 2 4 5 6 1
1 4 5 6 7 2
2 5 4 5 5 1
3 10 4 7 8 2
4 9 3 4 6 2
5 3 3 4 4 1
#Plotting by seaborn
sns.boxplot(df[['a1','a2', 'a3', 'a4']], groupby=df.b)
What I get is something that completely ignores groupby option:
Whereas if I do this with one column it works thanks to another SO question Seaborn groupby pandas Series :
sns.boxplot(df.a1, groupby=df.b)
So I would like to get all my columns in one plot (all columns come in a similar scale).
EDIT:
The above SO question was edited and now includes a 'not clean' answer to this problem, but it would be nice if someone has a better idea for this problem.
As the other answers note, the boxplot function is limited to plotting a single "layer" of boxplots, and the groupby parameter only has an effect when the input is a Series and you have a second variable you want to use to bin the observations into each box.
However, you can accomplish what I think you're hoping for with the factorplot function, using kind="box". But, you'll first have to "melt" the sample dataframe into what is called long-form or "tidy" format where each column is a variable and each row is an observation:
df_long = pd.melt(df, "b", var_name="a", value_name="c")
Then it's very simple to plot:
sns.factorplot("a", hue="b", y="c", data=df_long, kind="box")
You can use directly boxplot (I imagine when the question was asked, that was not possible, but with seaborn version > 0.6 it is).
As explained by @mwaskom, you have to "melt" the sample dataframe into its "long-form" where each column is a variable and each row is an observation:
df_long = pd.melt(df, "b", var_name="a", value_name="c")
# display(df_long.head())
b a c
0 1 a1 2
1 2 a1 4
2 1 a1 5
3 2 a1 10
4 2 a1 9
Then you just plot it:
sns.boxplot(x="a", hue="b", y="c", data=df_long)
Seaborn's groupby function takes Series not DataFrames, that's why it's not working.
As a work around, you can do this :
fig, ax = plt.subplots(1,2, sharey=True)
for i, grp in enumerate(df.filter(regex="a").groupby(by=df.b)):
sns.boxplot(grp[1], ax=ax[i])
it gives :
Note that df.filter(regex="a") is equivalent to df[['a1','a2', 'a3', 'a4']]
a1 a2 a3 a4
0 2 4 5 6
1 4 5 6 7
2 5 4 5 5
3 10 4 7 8
4 9 3 4 6
5 3 3 4 4
Hope this helps
It isn't really any better than the answer you linked, but I think the way to achieve this in seaborn is using the FacetGrid feature, as the groupby parameter is only defined for Series passed to the boxplot function.
Here's some code - the pd.melt is necessary because (as best I can tell) the facet mapping can only take individual columns as parameters, so the data need to be turned into a 'long' format.
g = sns.FacetGrid(pd.melt(df, id_vars='b'), col='b')
g.map(sns.boxplot, 'value', 'variable')
It's not adding a lot to this conversation, but after struggling with this for longer than warranted (the actual clusters are unusable), I thought I would add my implementation as another example. It's got a superimposed scatterplot (because of how annoying my dataset is), shows melt using indices, and some aesthetic tweaks. I hope this is useful for someone.
output_graph
Here it is without using column headers (I saw a different thread that wanted to know how to do this using indices):
combined_array: ndarray = np.concatenate([dbscan_output.data, dbscan_output.labels.reshape(-1, 1)], axis=1)
cluster_data_df: DataFrame = DataFrame(combined_array)
if you want to use labelled columns:
# label the columns with the outcome variable names, followed by 'cluster' for the last column
column_names: List[str] = [*outcome_variable_names, 'cluster']
# set_axis no longer accepts inplace (removed in pandas 2.0); it returns a new
# frame, so rebind the name instead of mutating in place
cluster_data_df = cluster_data_df.set_axis(column_names, axis='columns')
graph_data: DataFrame = pd.melt(
frame=cluster_data_df,
id_vars=['cluster'],
# value_vars is an optional param - by default it uses columns except the id vars, but I've included it as an example
# value_vars=['outcome_var_1', 'outcome_var_2', 'outcome_var_3', 'outcome_var_4', 'outcome_var_5', 'outcome_var_6']
var_name='psychometric_test',
value_name='standard deviations from the mean'
)
The resulting dataframe (rows = sample_n x variable_n (in my case 1626 x 6 = 9756)):
index
cluster
psychometric_tst
standard deviations from the mean
0
0.0
outcome_var_1
-1.276182
1
0.0
outcome_var_1
-1.118813
2
0.0
outcome_var_1
-1.276182
9754
0.0
outcome_var_6
0.892548
9755
0.0
outcome_var_6
1.420480
If you want to use indices with melt:
graph_data: DataFrame = pd.melt(
frame=cluster_data_df,
id_vars=cluster_data_df.columns[-1],
# value_vars=cluster_data_df.columns[:-1],
var_name='psychometric_test',
value_name='standard deviations from the mean'
)
And here's the graphing code:
(Done with column headings - just note that y-axis=value_name, x-axis = var_name, hue = id_vars):
# plot graph grouped by cluster
sns.set_theme(style="ticks")
fig = plt.figure(figsize=(10, 10))
fig.set(font_scale=1.2)
fig.set_style("white")
# create boxplot
fig.ax = sns.boxplot(y='standard deviations from the mean', x='psychometric_test', hue='cluster', showfliers=False,
data=graph_data)
# set box alpha:
for patch in fig.ax.artists:
r, g, b, a = patch.get_facecolor()
patch.set_facecolor((r, g, b, .2))
# create scatterplot
fig.ax = sns.stripplot(y='standard deviations from the mean', x='psychometric_test', hue='cluster', data=graph_data,
dodge=True, alpha=.25, zorder=1)
# customise legend:
cluster_n: int = dbscan_output.n_clusters
## create list with legend text
i = 0
cluster_info: Dict[int, int] = dbscan_output.cluster_sizes # custom method
legend_labels: List[str] = []
while i < cluster_n:
label: str = f"cluster {i+1}, n = {cluster_info[i]}"
legend_labels.append(label)
i += 1
if -1 in cluster_info.keys():
cluster_n += 1
label: str = f"Unclustered, n = {cluster_info[-1]}"
legend_labels.insert(0, label)
## fetch existing handles and legends (each tuple will have 2*cluster number -> 1 for each boxplot cluster, 1 for each scatterplot cluster, so I will remove the first half)
handles, labels = fig.ax.get_legend_handles_labels()
index: int = int(cluster_n*(-1))
labels = legend_labels
plt.legend(handles[index:], labels[0:])
plt.xticks(rotation=45)
plt.show()
asds
Just a note: Most of my time was spent debugging the melt function. I predominantly got the error "*only integer scalar arrays can be converted to a scalar index with 1D numpy indices array*". My output required me to concatenate my outcome variable value table and the clusters (DBSCAN), and I'd put extra square brackets around the cluster array in the concat method. So I had a column where each value was an invisible List[int], rather than a plain int. It's pretty niche, but maybe it'll help someone.
List item

Categories

Resources