How to do multi-row layout using matplotlib subplots - python

This is a question about how to properly organize subplots, not how to create stacked bars.
I have the following dataframe:
corpus group mono p non p plus p minus p
0 fairview all 49 51 49 0
1 i2b2 all 46 54 46 0
2 mipacq all 44 56 43 1
and want to arrange the output as given in the two attached figures so that I get ncolumns and 2-rows, instead of two separate subplots with 1 row each (so in this case, there would be 2-rows, 3-columns on a single subplot instead of 1-row, 3-columns on 2 subplots):
I am generating these two figures as separate subplots using the following code:
data = <above dataframe>
semgroups = ['all']
corpus = ['fairview', 'i2b2', 'mipacq']
for sg in semgroups:
i = semgroups.index(sg)
ix = i + 7
ncols = len(set(data.corpus.tolist()))
nrows = len(set(data.group.tolist()))
fig = plt.figure()
fig, axs = plt.subplots(1, ncols, sharey=True)
for ax,(idx,row) in zip(axs.flat, data.iterrows()):
# I WANT TO PLOT BOTH ROWS on same subplot
#row[['mono p', 'non p']].plot.bar(ax=ax, color=['C0','C1'])
row[['plus p', 'minus p']].plot.bar(ax=ax, color=['C0','C1'])
if row['corpus'] == 'fairview':
corpus = 'Fairview'
label = '(d) '
elif row['corpus'] == 'mipacq':
corpus = 'MiPACQ'
if ncols == 3:
label = '(f) '
else:
label = '(b) '
else:
corpus = 'i2b2'
label = '(e) '
ax.set_title(label + corpus)
ax.tick_params(axis='x', labelrotation = 45)
if sg == 'all':
sg = 'All groups'
# Defining custom 'xlim' and 'ylim' values.
custom_ylim = (0, 60)
# Setting the values for all axes.
plt.setp(axs, ylim=custom_ylim)
fig.suptitle('Figure ' + str(ix) + ' ' + sg)
In the code above, I iterate through my df grabbing the following rows to generate both separate subplots:
# BUT, I WANT TO PLOT BOTH ROWS ON SAME SUBPLOT
row[['mono p', 'non p']].plot.bar(ax=ax, color=['C0','C1'])
row[['plus p', 'minus p']].plot.bar(ax=ax, color=['C0','C1'])
No matter how I do this, I cannot get the desired two rows in a single subplot (I always get an empty row of plots with no data on the second row).

See inline comments
Tested in python 3.8.12, pandas 1.3.3, matplotlib 3.4.3, seaborn 0.11.2
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # seaborn is a high-level api for matplotlib
# sample dataframe
data = {'corpus': ['fairview', 'i2b2', 'mipacq'], 'group': ['all', 'all', 'all'], 'mono p': [49, 46, 44], 'non p': [51, 54, 56], 'plus p': [49, 46, 43], 'minus p': [0, 0, 1]}
df = pd.DataFrame(data)
semgroups = df.group.unique() # unique groups
corpus = df.corpus.unique() # unique corpus
rows = [['mono p', 'non p'], ['plus p', 'minus p']] # columns for each row of plots
# One figure per semantic group. Figure numbering starts at 7, matching the
# original `ix = i + 7`. `semgroups` comes from `Series.unique()` (an array),
# which has no `.index()` method, so the position is taken from enumerate().
for ix, sg in enumerate(semgroups, start=7):
    ncols = len(corpus)  # 3 columns for the example
    nrows = len(rows)  # 2 rows for the example
    # create a figure with 2 rows of 3 columns; squeeze=False keeps `axes`
    # two-dimensional even when there is only one corpus or one row
    fig, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(12, 10), squeeze=False)
    # iterate through each plot row combined with a list from rows
    for axe, row in zip(axes, rows):
        # iterate through each plot column of the current row
        for col, ax in enumerate(axe):
            # select the data for each plot
            subset = df.loc[df.group.eq(sg) & df.corpus.eq(corpus[col]), row]
            # plot the dataframe, but setting the bar color is more difficult
            # subset.T.plot(kind='bar', legend=False, ax=ax)
            # plot the data with seaborn, which is easier to color the bars
            sns.barplot(data=subset, ax=ax)
            if corpus[col] == 'fairview':
                l2 = 'Fairview'
                l1 = '(d) '
            elif corpus[col] == 'mipacq':
                l2 = 'MiPACQ'
                l1 = '(f) ' if ncols == 3 else '(b) '
            else:
                l2 = 'i2b2'
                l1 = '(e) '
            ax.set_title(l1 + l2)
            ax.tick_params(axis='x', labelrotation=45)
    # Use a separate name for the display text so the loop variable is not rebound.
    group_title = 'All groups' if sg == 'all' else sg
    # Defining custom 'ylim' values.
    custom_ylim = (0, 60)
    # Setting the values for all axes.
    plt.setp(axes, ylim=custom_ylim)
    fig.suptitle('Figure ' + str(ix) + ' ' + group_title)
    fig.tight_layout()
plt.show()

Related

Creating box plots by looping multiple columns

I am trying to create multiple box plot charts for about 5 columns in my dataframe (df_summ):
columns = ['dimension_a','dimension_b']
for i in columns:
sns.set(style = "ticks", palette = "pastel")
box_plot = sns.boxplot(y="measure", x=i,
palette=["m","g"],
data=df_summ_1500_delta)
sns.despine(offset=10, trim=True)
medians = df_summ_1500_delta.groupby([i])['measure'].median()
vertical_offset=df_summ_1500_delta['measure'].median()*-0.5
for xtick in box_plot.get_xticks():
box_plot.text(xtick,medians[xtick] + vertical_offset,medians[xtick],
horizontalalignment='center',size='small',color='blue',weight='semibold')
My only issue is that they aren't being separated into different facets, but are instead drawn on top of each other.
I would appreciate any help on how to put each one on its own separate chart, with the x axis of the first chart being 'dimension a' and the x axis of the second chart being 'dimension b'.
To draw two boxplots next to each other at each x-position, you can use a hue for dimension_a and dimension_b separately. These two columns need to be transformed (with pd.melt()) to "long form".
Here is some example code starting from generated test data. Note that the order of both the x-values and the hue-values needs to be enforced to be sure of their exact positions. The individual box plots are distributed over a width of 0.8.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
df = pd.DataFrame({'dimension_a': np.random.choice(['hot', 'cold'], 100),
'dimension_b': np.random.choice(['hot', 'cold'], 100),
'measure': np.random.uniform(100, 500, 100)})
df.loc[df['dimension_a'] == 'hot', 'measure'] += 100
df.loc[df['dimension_a'] == 'cold', 'measure'] -= 100
x_order = ['hot', 'cold']
columns = ['dimension_a', 'dimension_b']
df1 = df.melt(value_vars=columns, var_name='dimension', value_name='value', id_vars='measure')
sns.set(style="ticks", palette="pastel")
ax = sns.boxplot(data=df1, x='value', order=x_order, y='measure',
hue='dimension', hue_order=columns, palette=["m", "g"], dodge=True)
ax.set_xlabel('')
sns.despine(offset=10, trim=True)
# One dodge offset per hue level (i.e. per entry of `columns`): the centres of
# equal sub-slots inside the 0.8-wide group drawn at each x position.
dodge_offsets = np.linspace(-0.4, 0.4, 2 * len(columns) + 1)[1::2]
# The label offset does not depend on the column, so compute it once.
vertical_offset = df['measure'].median() * -0.5
for col, dodge_dist in zip(columns, dodge_offsets):
    medians = df.groupby(col)['measure'].median()
    for x_ind, xtick in enumerate(x_order):
        ax.text(x_ind + dodge_dist, medians[xtick] + vertical_offset, f'{medians[xtick]:.2f}',
                horizontalalignment='center', size='small', color='blue', weight='semibold')
plt.show()

matplotlib barh: how to make a visual gap between two groups of bars?

I have some sorted data of which I only show the highest and lowest values in a figure. This is a minimal version of what currently I have:
import matplotlib.pyplot as plt
# some dummy data (real data contains about 250 entries)
x_data = list(range(98, 72, -1))
labels = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
ranks = list(range(1, 27))
fig, ax = plt.subplots()
# plot 3 highest entries
bars_top = ax.barh(labels[:3], x_data[:3])
# plot 3 lowest entries
bars_bottom = ax.barh(labels[-3:], x_data[-3:])
ax.invert_yaxis()
# print values and ranks
for bar, value, rank in zip(bars_top + bars_bottom,
x_data[:3] + x_data[-3:],
ranks[:3] + ranks[-3:]):
y_pos = bar.get_y() + 0.5
ax.text(value - 4, y_pos, value, ha='right')
ax.text(4, y_pos, f'$rank:\ {rank}$')
ax.set_title('Comparison of Top 3 and Bottom 3')
plt.show()
Result:
I'd like to make an additional gap to this figure to make it more visually clear that the majority of data is in fact not displayed in this plot. For example, something very simple like the following would be sufficient:
Is this possible in matplotlib?
Here is a flexible approach that just plots a dummy bar in-between. The yaxis-transform together with the dummy bar's position is used to plot 3 black dots.
If multiple separations are needed, they all need a different dummy label, for example repeating the space character.
import matplotlib.pyplot as plt
import numpy as np
# some dummy data (real data contains about 250 entries)
x_data = list(range(98, 72, -1))
labels = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
ranks = list(range(1, 27))
fig, ax = plt.subplots()
# plot 3 highest entries
bars_top = ax.barh(labels[:3], x_data[:3])
# dummy bar in between: invisible, zero width, only reserves a slot on the y axis
dummy_bar = ax.barh(" ", 0, color='none')
# plot 3 lowest entries
bars_bottom = ax.barh(labels[-3:], x_data[-3:])
ax.invert_yaxis()
# print values and ranks
for bar, value, rank in zip(bars_top + bars_bottom,
                            x_data[:3] + x_data[-3:],
                            ranks[:3] + ranks[-3:]):
    y_pos = bar.get_y() + 0.5
    ax.text(value - 4, y_pos, value, ha='right')
    # raw f-string: "\ " is a mathtext space, not a Python escape sequence
    ax.text(4, y_pos, rf'$rank:\ {rank}$')
# add three dots using the dummy bar's position
ax.scatter([0.05] * 3, dummy_bar[0].get_y() + np.linspace(0, dummy_bar[0].get_height(), 3),
           marker='o', s=5, color='black', transform=ax.get_yaxis_transform())
ax.set_title('Comparison of Top 3 and Bottom 3')
ax.tick_params(axis='y', length=0)  # hide the tick marks
ax.margins(y=0.02)  # less empty space at top and bottom
plt.show()
The following function,
def top_bottom(x, l, n, ax=None, gap=1):
    """Draw the first and last ``n`` entries of ``x`` as two groups of horizontal bars.

    Parameters
    ----------
    x : sequence
        Bar lengths; the first ``n`` form the top group, the last ``n`` the bottom group.
    l : sequence
        Tick labels, sliced the same way as ``x``.
    n : int
        Number of bars in each group; must satisfy ``0 < n <= len(x)``.
    ax : optional
        Axes-like object to draw on; the current pyplot axes is used when omitted.
    gap : int
        Number of empty y slots left between the two groups.

    Returns
    -------
    tuple
        ``(top_bars, bot_bars)`` exactly as returned by the two ``ax.barh`` calls.

    Raises
    ------
    ValueError
        If ``n`` is not positive or exceeds ``len(x)``.
    """
    if n <= 0:
        raise ValueError('No. of top/bottom values must be positive')
    if n > len(x):
        raise ValueError('No. of top/bottom values should be not greater than data length')
    if n + n > len(x):
        print('Warning: no. of top/bottom values is larger than one'
              ' half of data length, OVERLAPPING')
    if gap < 0:
        print('Warning: some bar will be overlapped')
    if ax is None:
        # Imported lazily so callers that pass their own axes do not need pyplot.
        from matplotlib.pyplot import gca
        ax = gca()
    top_x = x[:n]
    bot_x = x[-n:]
    top_y = list(range(n + n, n, -1))
    bot_y = list(range(n - gap, -gap, -1))
    # Coerce to lists so that "+" concatenates label sequences of any type
    # (a str would otherwise merge into one string, an array would add element-wise).
    top_l = list(l[:n])  # A B C
    bot_l = list(l[-n:])  # X Y Z
    top_bars = ax.barh(top_y, top_x)
    bot_bars = ax.barh(bot_y, bot_x)
    ax.set_yticks(top_y + bot_y)
    ax.set_yticklabels(top_l + bot_l)
    return top_bars, bot_bars
when invoked with your data and n=4, gap=4
bars_top, bars_bottom = top_bottom(x_data, labels, 4, gap=4)
produces
Later, you'll be able to customize the appearance of the bars as you like using the Artists returned by the function.

How to plot 2 x-axis label in Matplotlib? [duplicate]

I have some data where I've manipulated the dataframe using the following code:
import pandas as pd
import numpy as np
data = pd.DataFrame([[0,0,0,3,6,5,6,1],[1,1,1,3,4,5,2,0],[2,1,0,3,6,5,6,1],[3,0,0,2,9,4,2,1],[4,0,1,3,4,8,1,1],[5,1,1,3,3,5,9,1],[6,1,0,3,3,5,6,1],[7,0,1,3,4,8,9,1]], columns=["id", "sex", "split", "group0Low", "group0High", "group1Low", "group1High", "trim"])
data
#remove all where trim == 0
trimmed = data[(data.trim == 1)]
trimmed
#create df with columns to be split
columns = ['group0Low', 'group0High', 'group1Low', 'group1High']
to_split = trimmed[columns]
to_split
level_group = np.where(to_split.columns.str.contains('0'), 0, 1)
# output: array([0, 0, 1, 1])
level_low_high = np.where(to_split.columns.str.contains('Low'), 'low', 'high')
# output: array(['low', 'high', 'low', 'high'], dtype='<U4')
multi_level_columns = pd.MultiIndex.from_arrays([level_group, level_low_high], names=['group', 'val'])
to_split.columns = multi_level_columns
to_split.stack(level='group')
sex = trimmed['sex']
split = trimmed['split']
horizontalStack = pd.concat([sex, split, to_split], axis=1)
horizontalStack
finalData = horizontalStack.groupby(['split', 'sex', 'group'])
finalData.mean()
My question is, how do I plot the mean data using ggplot or seaborn such that for each "split" level I get a graph that looks like this:
At the bottom of the code you can see I've tried to split up the group factor so I can separate the bars, but that resulted in an error (KeyError: 'group') and I think that is related to the way I used multi indexing
I would use a factor plot from seaborn.
Say you have data like this:
import numpy as np
import pandas
import seaborn
seaborn.set(style='ticks')
np.random.seed(0)
groups = ('Group 1', 'Group 2')
sexes = ('Male', 'Female')
means = ('Low', 'High')
index = pandas.MultiIndex.from_product(
[groups, sexes, means],
names=['Group', 'Sex', 'Mean']
)
values = np.random.randint(low=20, high=100, size=len(index))
data = pandas.DataFrame(data={'val': values}, index=index).reset_index()
print(data)
Group Sex Mean val
0 Group 1 Male Low 64
1 Group 1 Male High 67
2 Group 1 Female Low 84
3 Group 1 Female High 87
4 Group 2 Male Low 87
5 Group 2 Male High 29
6 Group 2 Female Low 41
7 Group 2 Female High 56
You can then create the factor plot with one command, plus an extra line to remove some x-labels that are redundant for your data:
# `factorplot` was renamed to `catplot` in seaborn 0.9; kind='bar' is given
# explicitly, so the result does not depend on the function's default kind.
fg = seaborn.catplot(x='Group', y='val', hue='Mean',
                     col='Sex', data=data, kind='bar')
fg.set_xlabels('')
Which gives me:
In a related question I found an alternative solution by @Stein that encodes the multi-index levels as separate rows of labels. Here is what it looks like for your example:
import pandas as pd
import matplotlib.pyplot as plt
from itertools import groupby
import numpy as np
%matplotlib inline
groups = ('Group 1', 'Group 2')
sexes = ('Male', 'Female')
means = ('Low', 'High')
index = pd.MultiIndex.from_product(
[groups, sexes, means],
names=['Group', 'Sex', 'Mean']
)
values = np.random.randint(low=20, high=100, size=len(index))
data = pd.DataFrame(data={'val': values}, index=index)
# unstack last level to plot two separate columns
data = data.unstack(level=-1)
def add_line(ax, xpos, ypos):
    """Draw a short gray vertical separator at ``xpos``, from ``ypos`` up to ``ypos + 0.1``.

    Coordinates are in axes fractions (``ax.transAxes``). Clipping is disabled
    on the line so it stays visible when placed outside the axes rectangle.
    """
    line = plt.Line2D([xpos, xpos], [ypos + .1, ypos],
                      transform=ax.transAxes, color='gray')
    line.set_clip_on(False)
    ax.add_line(line)
def label_len(my_index, level):
    """Return ``(label, run_length)`` pairs for consecutive equal labels of one index level.

    Runs are taken in index order, so a label that reappears after a different
    label yields a separate pair.
    """
    labels = my_index.get_level_values(level)
    return [(key, len(list(run))) for key, run in groupby(labels)]
def label_group_bar_table(ax, df):
    """Write one row of grouped labels below ``ax`` for every level of ``df.index``.

    Levels are processed from the innermost to the outermost; each level is
    placed 0.1 (axes fraction) lower than the previous one. Every run of equal
    labels gets a centred text plus a separator line at its left edge, and one
    closing separator is drawn at the right end of each level.
    """
    n_rows = df.index.size
    if n_rows == 0:
        # Nothing to label; also avoids dividing by zero below.
        return
    ypos = -.1
    scale = 1. / n_rows  # width of one index entry in axes fractions
    for level in reversed(range(df.index.nlevels)):
        pos = 0
        for label, rpos in label_len(df.index, level):
            lxpos = (pos + .5 * rpos) * scale
            ax.text(lxpos, ypos, label, ha='center', transform=ax.transAxes)
            add_line(ax, pos * scale, ypos)
            pos += rpos
        add_line(ax, pos * scale, ypos)
        ypos -= .1
ax = data['val'].plot(kind='bar')
#Below 2 lines remove default labels
ax.set_xticklabels('')
ax.set_xlabel('')
label_group_bar_table(ax, data)
This gives:

How do I plot bar graphs with error bars using python?

I am using Python 3.5. Also, I am a beginner (3 weeks experience) Python attempter and somehow I haven't given up in trying to analyze my data.
Data Description: My data is in a csv file (fev.csv). I've included it here if you want to see the full data set. It has 5 columns:
age (years)
fev (liters)
ht (inches)
sex (female=0, male=1)
smoke (non-smoker=0, smoker=1)
Task: I am trying to write a program to generate a bar graph of average FEVs with error bars indicating standard deviation. I'm trying to get 2 side by side bars (smokers/non-smokers) at 4 different age categories (11-12, 13-14, 15-16, 17 or older).
Code so far (please excuse all my #notes, it helps me know what I'm trying to do):
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data = pd.read_csv('fev.csv')
nonsmokers = data[data.smoke==0]
smokers = data[data.smoke==1]
nonsmokers1 = nonsmokers[(nonsmokers.age==11) | (nonsmokers.age==12)]
nonsmokers2 = nonsmokers[(nonsmokers.age==13) | (nonsmokers.age==14)]
nonsmokers3 = nonsmokers[(nonsmokers.age==15) | (nonsmokers.age==16)]
nonsmokers4 = nonsmokers[(nonsmokers.age>=17)]
smokers1 = smokers[(smokers.age==11) | (smokers.age==12)]
smokers2 = smokers[(smokers.age==13) | (smokers.age==14)]
smokers3 = smokers[(smokers.age==15) | (smokers.age==16)]
smokers4 = smokers[(smokers.age>=17)]
nonsmMean = [nonsmokers1.fev.mean(), nonsmokers2.fev.mean(), nonsmokers3.fev.mean(), nonsmokers4.fev.mean()]
nonsmSd = [nonsmokers1.fev.std(), nonsmokers2.fev.std(), nonsmokers3.fev.std(), nonsmokers4.fev.std()]
smMean = [smokers1.fev.mean(), smokers2.fev.mean(), smokers3.fev.mean(), smokers4.fev.mean()]
smSd = [smokers1.fev.std(), smokers2.fev.std(), smokers3.fev.std(), smokers4.fev.std()]
# data to be plotted
nonsmoker = np.array(nonsmMean)
sdNonsmoker = np.array(nonsmSd)
smoker = np.array(smMean)
sdSmoker = np.array(smSd)
# parameters
bar_width = 0.35
x = np.arange(len(nonsmoker))
# plotting bars
plt.bar(x, nonsmoker, bar_width, yerr=sdNonsmoker, ecolor='k', color='b', label='Nonsmokers')
plt.bar(x+bar_width, smoker, bar_width, yerr=sdSmoker, ecolor='k', color='m', label='Smokers')
# formatting and labeling the axes and title
plt.xlabel('Age')
plt.ylabel('FEV')
plt.title('Mean FEV by Age and Smoking Status')
plt.xticks(x+0.35, ['11 to 12', '13 to 14', '15 to 16', '17+'])
# adding the legend
plt.legend()
plt.axis([-0.5,4.2,0,7])
plt.savefig('FEVgraph.png', dpi=300)
# and we are done!
plt.show()
Is there a more efficient way of doing this?
Thanks!
Possible solution is the following:
# pip install pandas
# pip install matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# read csv file and create pandas dataframe
df = pd.read_csv('https://raw.githubusercontent.com/benkeser/halplus/master/inst/extdata/fev.csv')
# assign age bins to data
bins = [df['age'].min() - 1, 10, 12, 14, 16, df['age'].max()]
bins_labels = ['<11', '11 to 12', '13 to 14', '15 to 16', '17+']
df['age_bins'] = pd.cut(df['age'], bins, labels=bins_labels)
# aggregate data; `observed=False` is spelled out because `age_bins` is
# categorical and the plot below expects one entry per bin for each smoke
# value -- the implicit default for `observed` differs between pandas versions
result = df.groupby(['smoke', 'age_bins'], as_index=False, observed=False).agg({'fev': ['mean', 'std']})
# flatten the two-level column names, e.g. ('fev', 'mean') -> 'fev_mean'
result.columns = ['_'.join(col).strip('_') for col in result.columns.values]
result = result.round(1)
# prepare data for plot
nonsmokers = result[result['smoke'] == 0]
smokers = result[result['smoke'] == 1]
x = np.arange(len(bins_labels))
width = 0.35
# set plot figure size
plt.rcParams["figure.figsize"] = [8, 6]
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, nonsmokers['fev_mean'], width, yerr=nonsmokers['fev_std'], color='b', label='Nonsmokers')
rects2 = ax.bar(x + width/2, smokers['fev_mean'], width, yerr=smokers['fev_std'], color='m', label='Smokers')
ax.set_xlabel('Age')
ax.set_ylabel('FEV')
ax.set_title('Mean FEV by Age and Smoking Status')
# positions first, then labels: works on Matplotlib versions that do not
# accept labels as the second argument of set_xticks
ax.set_xticks(x)
ax.set_xticklabels(bins_labels)
ax.legend(loc=2)
fig.tight_layout()
plt.savefig('FEVgraph.png', dpi=300)
plt.show()
Returns

matplotlib: Group boxplots

Is there a way to group boxplots in matplotlib?
Assume we have three groups "A", "B", and "C" and for each we want to create a boxplot for both "apples" and "oranges". If a grouping is not possible directly, we can create all six combinations and place them linearly side by side. What would be the simplest way to visualize the groupings? I'm trying to avoid setting the tick labels to something like "A + apples" since my scenario involves much longer names than "A".
How about using colors to differentiate between "apples" and "oranges" and spacing to separate "A", "B" and "C"?
Something like this:
# `hold` is no longer imported: it was removed from Matplotlib (axes always
# accumulate artists now), so importing or calling it fails.
from pylab import plot, show, savefig, xlim, figure, \
    ylim, legend, boxplot, setp, axes


# function for setting the colors of the box plots pairs
def setBoxColors(bp):
    """Colour the first box of a boxplot pair blue and the second red.

    ``bp`` is the dict returned by ``boxplot``. Each box owns one entry in
    'boxes' and 'medians' and two consecutive entries in 'caps' and 'whiskers'.
    The number of 'fliers' artists per box is derived from the list length
    instead of being assumed to be two, so both layouts are handled.
    """
    n_boxes = len(bp['boxes'])
    fliers_per_box = len(bp['fliers']) // n_boxes if n_boxes else 0
    for i, color in zip(range(n_boxes), ('blue', 'red')):
        setp(bp['boxes'][i], color=color)
        setp(bp['caps'][2 * i:2 * i + 2], color=color)
        setp(bp['whiskers'][2 * i:2 * i + 2], color=color)
        box_fliers = bp['fliers'][fliers_per_box * i:fliers_per_box * (i + 1)]
        if box_fliers:
            setp(box_fliers, color=color)
        setp(bp['medians'][i], color=color)


# Some fake data to plot
A = [[1, 2, 5], [7, 2]]
B = [[5, 7, 2, 2, 5], [7, 2, 5]]
C = [[3, 2, 5, 7], [6, 7, 3]]
fig = figure()
ax = axes()
# first boxplot pair
bp = boxplot(A, positions=[1, 2], widths=0.6)
setBoxColors(bp)
# second boxplot pair
bp = boxplot(B, positions=[4, 5], widths=0.6)
setBoxColors(bp)
# third boxplot pair
bp = boxplot(C, positions=[7, 8], widths=0.6)
setBoxColors(bp)
# set axes limits and labels
xlim(0, 9)
ylim(0, 9)
# fix the tick positions first, then attach the labels to them
ax.set_xticks([1.5, 4.5, 7.5])
ax.set_xticklabels(['A', 'B', 'C'])
# draw temporary red and blue lines and use them to create a legend
hB, = plot([1, 1], 'b-')
hR, = plot([1, 1], 'r-')
legend((hB, hR), ('Apples', 'Oranges'))
hB.set_visible(False)
hR.set_visible(False)
savefig('boxcompare.png')
show()
Here is my version. It stores data based on categories.
import matplotlib.pyplot as plt
import numpy as np
data_a = [[1,2,5], [5,7,2,2,5], [7,2,5]]
data_b = [[6,4,2], [1,2,5,3,2], [2,3,5,1]]
ticks = ['A', 'B', 'C']
def set_box_color(bp, color):
    """Apply ``color`` to the boxes, whiskers, caps and medians of a boxplot result dict."""
    for part in ('boxes', 'whiskers', 'caps', 'medians'):
        plt.setp(bp[part], color=color)
plt.figure()
# Each category occupies a slot 2.0 wide; the two series sit 0.4 to either side of its centre.
# `range`/`np.arange` replace the Python-2-only `xrange`.
bpl = plt.boxplot(data_a, positions=np.arange(len(data_a)) * 2.0 - 0.4, sym='', widths=0.6)
bpr = plt.boxplot(data_b, positions=np.arange(len(data_b)) * 2.0 + 0.4, sym='', widths=0.6)
set_box_color(bpl, '#D7191C')  # colors are from http://colorbrewer2.org/
set_box_color(bpr, '#2C7BB6')
# draw temporary red and blue lines and use them to create a legend
plt.plot([], c='#D7191C', label='Apples')
plt.plot([], c='#2C7BB6', label='Oranges')
plt.legend()
plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, len(ticks) * 2)
plt.ylim(0, 8)
plt.tight_layout()
plt.savefig('boxcompare.png')
I am short of reputation, so I cannot post an image here.
You can run it and see the result. Basically it's very similar to what Molly did.
Note that, depending on the version of python you are using, you may need to replace xrange with range
A simple way would be to use pandas.
I adapted an example from the plotting documentation:
In [1]: import pandas as pd, numpy as np
In [2]: df = pd.DataFrame(np.random.rand(12,2), columns=['Apples', 'Oranges'] )
In [3]: df['Categories'] = pd.Series(list('AAAABBBBCCCC'))
In [4]: pd.options.display.mpl_style = 'default'
In [5]: df.boxplot(by='Categories')
Out[5]:
array([<matplotlib.axes.AxesSubplot object at 0x51a5190>,
<matplotlib.axes.AxesSubplot object at 0x53fddd0>], dtype=object)
Mock data:
df = pd.DataFrame({'Group':['A','A','A','B','C','B','B','C','A','C'],\
'Apple':np.random.rand(10),'Orange':np.random.rand(10)})
df = df[['Group','Apple','Orange']]
Group Apple Orange
0 A 0.465636 0.537723
1 A 0.560537 0.727238
2 A 0.268154 0.648927
3 B 0.722644 0.115550
4 C 0.586346 0.042896
5 B 0.562881 0.369686
6 B 0.395236 0.672477
7 C 0.577949 0.358801
8 A 0.764069 0.642724
9 C 0.731076 0.302369
You can use the Seaborn library for these plots. First melt the dataframe to format data and then create the boxplot of your choice.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
dd=pd.melt(df,id_vars=['Group'],value_vars=['Apple','Orange'],var_name='fruits')
sns.boxplot(x='Group',y='value',data=dd,hue='fruits')
The accepted answer uses pylab and works for 2 groups. What if we have more?
Here is the flexible generic solution with matplotlib
import matplotlib.pyplot as pl
# there are 4 individuals, each one tested under 3 different settings
# --- Random data, e.g. results per algorithm:
# Individual 1
d1_1 = [1,1,2,2,3,3]
d1_2 = [3,3,4,4,5,5]
d1_3 = [5,5,6,6,7,7]
# Individual 2
d2_1 = [7,7,8,8,9,9]
d2_2 = [9,9,10,10,11,11]
d2_3 = [11,11,12,12,13,13]
# Individual 3
d3_1 = [1,2,3,4,5,6]
d3_2 = [4,5,6,7,8,9]
d3_3 = [10,11,12,13,14,15]
# Individual 4
d4_1 = [1,1,2,2,3,3]
d4_2 = [9,9,10,10,11,11]
d4_3 = [10,11,12,13,14,15]
# --- Combining your data:
data_group1 = [d1_1, d1_2, d1_3]
data_group2 = [d2_1, d2_2, d2_3]
data_group3 = [d3_1, d3_2, d3_3]
data_group4 = [d4_1, d4_2, d4_3]
colors = ['pink', 'lightblue', 'lightgreen', 'violet']
# we compare the performances of the 4 individuals within the same set of 3 settings
data_groups = [data_group1, data_group2, data_group3, data_group4]
# --- Labels for your data:
labels_list = ['a','b', 'c']
width = 1/len(labels_list)
xlocations = [ x*((1+ len(data_groups))*width) for x in range(len(data_group1)) ]
symbol = 'r+'
ymin = min ( [ val for dg in data_groups for data in dg for val in data ] )
ymax = max ( [ val for dg in data_groups for data in dg for val in data ])
ax = pl.gca()
ax.set_ylim(ymin,ymax)
ax.grid(True, linestyle='dotted')
ax.set_axisbelow(True)
pl.xlabel('X axis label')
pl.ylabel('Y axis label')
pl.title('title')
space = len(data_groups)/2
offset = len(data_groups)/2
# --- Offset the positions per group:
# Group `num` is shifted by (num + 0.5 - space) box widths, which centres the
# whole set of groups on each x location.
group_positions = []
for num in range(len(data_groups)):
    _off = (0 - space + (0.5 + num))
    group_positions.append([x + _off * (width + 0.01) for x in xlocations])
for dg, pos, c in zip(data_groups, group_positions, colors):
    boxes = ax.boxplot(dg,
                       sym=symbol,
                       # blank per-box labels; the shared tick labels are set after the loop
                       labels=[''] * len(labels_list),
                       # labels=labels_list,
                       positions=pos,
                       widths=width,
                       boxprops=dict(facecolor=c),
                       # capprops=dict(color=c),
                       # whiskerprops=dict(color=c),
                       # flierprops=dict(color=c, markeredgecolor=c),
                       medianprops=dict(color='grey'),
                       # notch=False,
                       # vert=True,
                       # whis=1.5,
                       # bootstrap=None,
                       # usermedians=None,
                       # conf_intervals=None,
                       patch_artist=True,
                       )
ax.set_xticks(xlocations)
ax.set_xticklabels(labels_list, rotation=0)
pl.show()
Just to add to the conversation, I have found a more elegant way to change the color of the box plot by iterating over the dictionary of the object itself
import numpy as np
import matplotlib.pyplot as plt
def color_box(bp, color):
    """Set ``color`` on the boxes, caps and whiskers of a boxplot result dict."""
    # Define the elements to color. You can also add medians, fliers and means
    elements = ['boxes', 'caps', 'whiskers']
    # setp accepts a list of artists, so no per-index loop (or Python-2 xrange) is needed
    for elem in elements:
        plt.setp(bp[elem], color=color)
    return
a = np.random.uniform(0,10,[100,5])
bp = plt.boxplot(a)
color_box(bp, 'red')
Cheers!
Here's a function I wrote that takes Molly's code and some other code I've found on the internet to make slightly fancier grouped boxplots:
import numpy as np
import matplotlib.pyplot as plt
def custom_legend(colors, labels, linestyles=None):
    """ Creates a list of matplotlib Patch objects that can be passed to the legend(...) function to create a custom
        legend.

    :param colors: A list of colors, one for each entry in the legend.
    :param labels: A list of labels, one for each entry in the legend.
    :param linestyles: (Optional) A list of line styles, one per color; 'solid' is used when omitted.
    :return: A list of Patch objects, one per (color, label) pair.
    """
    # Imported here because the snippet references `patches` without importing it anywhere.
    from matplotlib import patches

    if linestyles is not None:
        assert len(linestyles) == len(colors), "Length of linestyles must match length of colors."

    handles = []
    for k, (clr, lbl) in enumerate(zip(colors, labels)):
        ls = 'solid' if linestyles is None else linestyles[k]
        handles.append(patches.Patch(color=clr, label=lbl, linestyle=ls))
    return handles
def grouped_boxplot(data, group_names=None, subgroup_names=None, ax=None, subgroup_colors=None,
                    box_width=0.6, box_spacing=1.0):
    """ Draws a grouped boxplot. The data should be organized in a hierarchy, where there are multiple
        subgroups for each main group.

    :param data: A dictionary of length equal to the number of the groups. The key should be the
                group name, the value should be a list of arrays. The length of the list should be
                equal to the number of subgroups.
    :param group_names: (Optional) The group names, should be the same as data.keys(), but can be ordered.
    :param subgroup_names: (Optional) Names of the subgroups; when given, a legend is added.
    :param subgroup_colors: A list specifying the plot color for each subgroup; random colors when omitted.
    :param ax: (Optional) The axis to plot on; it is made the current axes.
    :param box_width: Width of each box.
    :param box_spacing: Extra horizontal space inserted between consecutive groups.
    """
    if group_names is None:
        # materialise the keys view so it can be used as a tick-label sequence
        group_names = list(data.keys())

    if ax is None:
        ax = plt.gca()
    plt.sca(ax)

    subgroup_counts = np.array([len(v) for v in data.values()])
    assert len(np.unique(subgroup_counts)) == 1, "Number of subgroups for each property differ!"
    nsubgroups = subgroup_counts[0]

    if subgroup_colors is None:
        subgroup_colors = [np.random.rand(3) for _ in range(nsubgroups)]
    else:
        assert len(subgroup_colors) == nsubgroups, "subgroup_colors length must match number of subgroups (%d)" % nsubgroups

    def _decorate_box(_bp, _d):
        """Fill each box with its subgroup color and overdraw the median line and a mean marker."""
        plt.setp(_bp['boxes'], lw=0, color='k')
        plt.setp(_bp['whiskers'], lw=3.0, color='k')

        # fill in each box with a color
        assert len(_bp['boxes']) == nsubgroups
        for _k, _box in enumerate(_bp['boxes']):
            # first five vertices of the box outline, as an (N, 2) array;
            # a bare zip() iterator is not a valid vertex array on Python 3
            _boxCoords = np.column_stack([_box.get_xdata()[:5], _box.get_ydata()[:5]])
            _boxPolygon = plt.Polygon(_boxCoords, facecolor=subgroup_colors[_k])
            ax.add_patch(_boxPolygon)

        # draw a black line for the median
        for _k, _med in enumerate(_bp['medians']):
            _medianX = list(_med.get_xdata()[:2])
            _medianY = list(_med.get_ydata()[:2])
            plt.plot(_medianX, _medianY, 'k', linewidth=3.0)
            # draw a white, black-edged asterisk for the mean
            plt.plot([np.mean(_med.get_xdata())], [np.mean(_d[_k])], color='w', marker='*',
                     markeredgecolor='k', markersize=12)

    cpos = 1
    label_pos = []
    for k in group_names:
        d = data[k]
        n_boxes = len(d)
        pos = np.arange(n_boxes) + cpos
        label_pos.append(pos.mean())
        bp = plt.boxplot(d, positions=pos, widths=box_width)
        _decorate_box(bp, d)
        cpos += n_boxes + box_spacing

    plt.xlim(0, cpos - 1)
    plt.xticks(label_pos, group_names)

    if subgroup_names is not None:
        leg = custom_legend(subgroup_colors, subgroup_names)
        plt.legend(handles=leg)
You can use the function(s) like this:
data = { 'A':[np.random.randn(100), np.random.randn(100) + 5],
'B':[np.random.randn(100)+1, np.random.randn(100) + 9],
'C':[np.random.randn(100)-3, np.random.randn(100) -5]
}
grouped_boxplot(data, group_names=['A', 'B', 'C'], subgroup_names=['Apples', 'Oranges'], subgroup_colors=['#D02D2E', '#D67700'])
plt.show()
Grouped boxplots, towards subtle academic publication styling... (source)
(Left) Python 2.7.12 Matplotlib v1.5.3. (Right) Python 3.7.3. Matplotlib v3.1.0.
Code:
import numpy as np
import matplotlib.pyplot as plt
# --- Your data, e.g. results per algorithm:
data1 = [5,5,4,3,3,5]
data2 = [6,6,4,6,8,5]
data3 = [7,8,4,5,8,2]
data4 = [6,9,3,6,8,4]
# --- Combining your data:
data_group1 = [data1, data2]
data_group2 = [data3, data4]
# --- Labels for your data:
labels_list = ['a','b']
xlocations = range(len(data_group1))
width = 0.3
symbol = 'r+'
ymin = 0
ymax = 10
ax = plt.gca()
ax.set_ylim(ymin, ymax)
ax.grid(True, linestyle='dotted')
ax.set_axisbelow(True)
# fix the tick positions first, then attach the labels to them
ax.set_xticks(xlocations)
ax.set_xticklabels(labels_list, rotation=0)
plt.xlabel('X axis label')
plt.ylabel('Y axis label')
plt.title('title')
# --- Offset the positions per group:
positions_group1 = [x-(width+0.01) for x in xlocations]
positions_group2 = xlocations
plt.boxplot(data_group1,
sym=symbol,
labels=['']*len(labels_list),
positions=positions_group1,
widths=width,
# notch=False,
# vert=True,
# whis=1.5,
# bootstrap=None,
# usermedians=None,
# conf_intervals=None,
# patch_artist=False,
)
plt.boxplot(data_group2,
labels=labels_list,
sym=symbol,
positions=positions_group2,
widths=width,
# notch=False,
# vert=True,
# whis=1.5,
# bootstrap=None,
# usermedians=None,
# conf_intervals=None,
# patch_artist=False,
)
plt.savefig('boxplot_grouped.png')
plt.savefig('boxplot_grouped.pdf') # when publishing, use high quality PDFs
#plt.show() # uncomment to show the plot.
I used the code given by Kuzeko and it worked well, but I found that the boxes in each group were being drawn in the reverse order. I changed ...x-_off... to ...x+_off... in the following line (just above the last for loop) which fixes it for me:
group_positions.append([x+_off*(width+0.01) for x in xlocations])
A boxplot above was modified to obtain group boxplots with 3 data types.
import matplotlib.pyplot as plt
import numpy as np
ord = [[16.9423,
4.0410,
19.1185],
[18.5134,
17.8048,
19.2669],
[18.7286,
18.0576,
19.1717],
[18.8998,
18.8469,
19.0005],
[18.8126,
18.7870,
18.8393],
[18.7770,
18.7511,
18.8022],
[18.7409,
18.7075,
18.7747],
[18.6866,
18.6624,
18.7093
],
[18.6748],
[18.9069,
18.6752,
19.0769],
[19.0012,
18.9783,
19.0202
],
[18.9448,
18.9134,
18.9813],
[19.1242,
18.8256,
19.3185],
[19.2118,
19.1661,
19.2580],
[19.2505,
19.1231,
19.3526]]
seq = [[17.8092,
4.0410,
19.6653],
[18.7266,
18.2556,
19.3739],
[18.6051,
18.0589,
19.0557],
[18.6467,
18.5629,
18.7566],
[18.5307,
18.4999,
18.5684],
[18.4732,
18.4484,
18.4985],
[18.5234,
18.5027,
18.4797,
18.4573],
[18.3987,
18.3636,
18.4544],
[18.3593],
[18.7234,
18.7092,
18.7598],
[18.7438,
18.7224,
18.7677],
[18.7304,
18.7111,
18.6880,
18.6913,
18.6678],
[18.8926,
18.5902,
19.2003],
[19.1059,
19.0835,
19.0601,
19.0373,
19.0147],
[19.1925,
19.0177,
19.2588]]
apd=[[17.0331,
4.0410,
18.5670],
[17.6124,
17.1975,
18.0755],
[17.3956,
17.1572,
17.9140],
[17.8295,
17.6514,
18.1466],
[18.0665,
17.9144,
18.2157],
[18.1518,
18.0382,
18.2722],
[18.1975,
18.0956,
18.2987],
[18.2219,
18.1293,
18.3062],
[18.2870,
18.2215,
18.3513],
[18.3047,
18.2363,
18.3950],
[18.3580,
18.2923,
18.4205],
[18.3830,
18.3250,
18.4381],
[18.4135,
18.3645,
18.4753],
[18.4580,
18.4095,
18.5170],
[18.4900,
18.4430,
18.5435]
]
ticks = [120,
240,
360,
516,
662,
740,
874,
1022,
1081,
1201,
1320,
1451,
1562,
1680,
1863]
def set_box_color(bp, color):
    """Apply ``color`` to the boxes, whiskers, caps and medians of a boxplot result dict."""
    for part in ('boxes', 'whiskers', 'caps', 'medians'):
        plt.setp(bp[part], color=color)
plt.figure()
# Each tick owns a slot 3.0 wide; the three series are placed at -0.3, +0.3 and +0.9 within it.
bpl = plt.boxplot(ord, positions=np.array(range(len(ord)))*3.0-0.3, sym='', widths=0.6)
bpr = plt.boxplot(seq, positions=np.array(range(len(seq)))*3.0+0.3, sym='', widths=0.6)
bpg = plt.boxplot(apd, positions=np.array(range(len(apd)))*3.0+0.9, sym='', widths=0.6)
set_box_color(bpl, '#D7191C')  # colors are from http://colorbrewer2.org/
set_box_color(bpr, '#2C7BB6')
set_box_color(bpg, '#99d8c9')
# draw empty lines in the three series colors and use them to create a legend
plt.plot([], c='#D7191C', label='ORD')
plt.plot([], c='#2C7BB6', label='SEQ')
plt.plot([], c='#99d8c9', label='APD')
plt.legend()
plt.xticks(range(0, len(ticks) * 3, 3), ticks)
plt.xlim(-2, len(ticks)*3)
plt.ylim(0, 20)
plt.tight_layout()
# Save before show(): once the window opened by show() is closed, savefig()
# would write out a new, empty figure.
plt.savefig('boxcompare.png')
plt.show()

Categories

Resources