matplotlib: Group boxplots

matplotlib: Group boxplots - python

Is there a way to group boxplots in matplotlib?
Assume we have three groups "A", "B", and "C" and for each we want to create a boxplot for both "apples" and "oranges". If a grouping is not possible directly, we can create all six combinations and place them linearly side by side. What would be to simplest way to visualize the groupings? I'm trying to avoid setting the tick labels to something like "A + apples" since my scenario involves much longer names than "A".

How about using colors to differentiate between "apples" and "oranges" and spacing to separate "A", "B" and "C"?
Something like this:
from pylab import plot, show, savefig, xlim, figure, \
hold, ylim, legend, boxplot, setp, axes
# function for setting the colors of the box plots pairs
def setBoxColors(bp):
setp(bp['boxes'][0], color='blue')
setp(bp['caps'][0], color='blue')
setp(bp['caps'][1], color='blue')
setp(bp['whiskers'][0], color='blue')
setp(bp['whiskers'][1], color='blue')
setp(bp['fliers'][0], color='blue')
setp(bp['fliers'][1], color='blue')
setp(bp['medians'][0], color='blue')
setp(bp['boxes'][1], color='red')
setp(bp['caps'][2], color='red')
setp(bp['caps'][3], color='red')
setp(bp['whiskers'][2], color='red')
setp(bp['whiskers'][3], color='red')
setp(bp['fliers'][2], color='red')
setp(bp['fliers'][3], color='red')
setp(bp['medians'][1], color='red')
# Some fake data to plot
A= [[1, 2, 5,], [7, 2]]
B = [[5, 7, 2, 2, 5], [7, 2, 5]]
C = [[3,2,5,7], [6, 7, 3]]
fig = figure()
ax = axes()
hold(True)
# first boxplot pair
bp = boxplot(A, positions = [1, 2], widths = 0.6)
setBoxColors(bp)
# second boxplot pair
bp = boxplot(B, positions = [4, 5], widths = 0.6)
setBoxColors(bp)
# thrid boxplot pair
bp = boxplot(C, positions = [7, 8], widths = 0.6)
setBoxColors(bp)
# set axes limits and labels
xlim(0,9)
ylim(0,9)
ax.set_xticklabels(['A', 'B', 'C'])
ax.set_xticks([1.5, 4.5, 7.5])
# draw temporary red and blue lines and use them to create a legend
hB, = plot([1,1],'b-')
hR, = plot([1,1],'r-')
legend((hB, hR),('Apples', 'Oranges'))
hB.set_visible(False)
hR.set_visible(False)
savefig('boxcompare.png')
show()

Here is my version. It stores data based on categories.
import matplotlib.pyplot as plt
import numpy as np
data_a = [[1,2,5], [5,7,2,2,5], [7,2,5]]
data_b = [[6,4,2], [1,2,5,3,2], [2,3,5,1]]
ticks = ['A', 'B', 'C']
def set_box_color(bp, color):
plt.setp(bp['boxes'], color=color)
plt.setp(bp['whiskers'], color=color)
plt.setp(bp['caps'], color=color)
plt.setp(bp['medians'], color=color)
plt.figure()
bpl = plt.boxplot(data_a, positions=np.array(xrange(len(data_a)))*2.0-0.4, sym='', widths=0.6)
bpr = plt.boxplot(data_b, positions=np.array(xrange(len(data_b)))*2.0+0.4, sym='', widths=0.6)
set_box_color(bpl, '#D7191C') # colors are from http://colorbrewer2.org/
set_box_color(bpr, '#2C7BB6')
# draw temporary red and blue lines and use them to create a legend
plt.plot([], c='#D7191C', label='Apples')
plt.plot([], c='#2C7BB6', label='Oranges')
plt.legend()
plt.xticks(xrange(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, len(ticks)*2)
plt.ylim(0, 8)
plt.tight_layout()
plt.savefig('boxcompare.png')
I am short of reputation so I cannot post an image to here.
You can run it and see the result. Basically it's very similar to what Molly did.
Note that, depending on the version of python you are using, you may need to replace xrange with range

A simple way would be to use pandas.
I adapted an example from the plotting documentation:
In [1]: import pandas as pd, numpy as np
In [2]: df = pd.DataFrame(np.random.rand(12,2), columns=['Apples', 'Oranges'] )
In [3]: df['Categories'] = pd.Series(list('AAAABBBBCCCC'))
In [4]: pd.options.display.mpl_style = 'default'
In [5]: df.boxplot(by='Categories')
Out[5]:
array([<matplotlib.axes.AxesSubplot object at 0x51a5190>,
<matplotlib.axes.AxesSubplot object at 0x53fddd0>], dtype=object)

Mock data:
df = pd.DataFrame({'Group':['A','A','A','B','C','B','B','C','A','C'],\
'Apple':np.random.rand(10),'Orange':np.random.rand(10)})
df = df[['Group','Apple','Orange']]
Group Apple Orange
0 A 0.465636 0.537723
1 A 0.560537 0.727238
2 A 0.268154 0.648927
3 B 0.722644 0.115550
4 C 0.586346 0.042896
5 B 0.562881 0.369686
6 B 0.395236 0.672477
7 C 0.577949 0.358801
8 A 0.764069 0.642724
9 C 0.731076 0.302369
You can use the Seaborn library for these plots. First melt the dataframe to format data and then create the boxplot of your choice.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
dd=pd.melt(df,id_vars=['Group'],value_vars=['Apple','Orange'],var_name='fruits')
sns.boxplot(x='Group',y='value',data=dd,hue='fruits')

The accepted answer uses pylab and works for 2 groups. What if we have more?
Here is the flexible generic solution with matplotlib
import matplotlib.pyplot as pl
# there are 4 individuals, each one tested under 3 different settings
# --- Random data, e.g. results per algorithm:
# Invidual 1
d1_1 = [1,1,2,2,3,3]
d1_2 = [3,3,4,4,5,5]
d1_3 = [5,5,6,6,7,7]
# Individual 2
d2_1 = [7,7,8,8,9,9]
d2_2 = [9,9,10,10,11,11]
d2_3 = [11,11,12,12,13,13]
# Individual 3
d3_1 = [1,2,3,4,5,6]
d3_2 = [4,5,6,7,8,9]
d3_3 = [10,11,12,13,14,15]
# Individual 4
d4_1 = [1,1,2,2,3,3]
d4_2 = [9,9,10,10,11,11]
d4_3 = [10,11,12,13,14,15]
# --- Combining your data:
data_group1 = [d1_1, d1_2, d1_3]
data_group2 = [d2_1, d2_2, d2_3]
data_group3 = [d3_1, d3_2, d3_3]
data_group4 = [d4_1, d4_2, d4_3]
colors = ['pink', 'lightblue', 'lightgreen', 'violet']
# we compare the performances of the 4 individuals within the same set of 3 settings
data_groups = [data_group1, data_group2, data_group3, data_group4]
# --- Labels for your data:
labels_list = ['a','b', 'c']
width = 1/len(labels_list)
xlocations = [ x*((1+ len(data_groups))*width) for x in range(len(data_group1)) ]
symbol = 'r+'
ymin = min ( [ val for dg in data_groups for data in dg for val in data ] )
ymax = max ( [ val for dg in data_groups for data in dg for val in data ])
ax = pl.gca()
ax.set_ylim(ymin,ymax)
ax.grid(True, linestyle='dotted')
ax.set_axisbelow(True)
pl.xlabel('X axis label')
pl.ylabel('Y axis label')
pl.title('title')
space = len(data_groups)/2
offset = len(data_groups)/2
# --- Offset the positions per group:
group_positions = []
for num, dg in enumerate(data_groups):
_off = (0 - space + (0.5+num))
print(_off)
group_positions.append([x+_off*(width+0.01) for x in xlocations])
for dg, pos, c in zip(data_groups, group_positions, colors):
boxes = ax.boxplot(dg,
sym=symbol,
labels=['']*len(labels_list),
# labels=labels_list,
positions=pos,
widths=width,
boxprops=dict(facecolor=c),
# capprops=dict(color=c),
# whiskerprops=dict(color=c),
# flierprops=dict(color=c, markeredgecolor=c),
medianprops=dict(color='grey'),
# notch=False,
# vert=True,
# whis=1.5,
# bootstrap=None,
# usermedians=None,
# conf_intervals=None,
patch_artist=True,
)
ax.set_xticks( xlocations )
ax.set_xticklabels( labels_list, rotation=0 )
pl.show()

Just to add to the conversation, I have found a more elegant way to change the color of the box plot by iterating over the dictionary of the object itself
import numpy as np
import matplotlib.pyplot as plt
def color_box(bp, color):
# Define the elements to color. You can also add medians, fliers and means
elements = ['boxes','caps','whiskers']
# Iterate over each of the elements changing the color
for elem in elements:
[plt.setp(bp[elem][idx], color=color) for idx in xrange(len(bp[elem]))]
return
a = np.random.uniform(0,10,[100,5])
bp = plt.boxplot(a)
color_box(bp, 'red')
Cheers!

Here's a function I wrote that takes Molly's code and some other code I've found on the internet to make slightly fancier grouped boxplots:
import numpy as np
import matplotlib.pyplot as plt
def custom_legend(colors, labels, linestyles=None):
""" Creates a list of matplotlib Patch objects that can be passed to the legend(...) function to create a custom
legend.
:param colors: A list of colors, one for each entry in the legend. You can also include a linestyle, for example: 'k--'
:param labels: A list of labels, one for each entry in the legend.
"""
if linestyles is not None:
assert len(linestyles) == len(colors), "Length of linestyles must match length of colors."
h = list()
for k,(c,l) in enumerate(zip(colors, labels)):
clr = c
ls = 'solid'
if linestyles is not None:
ls = linestyles[k]
patch = patches.Patch(color=clr, label=l, linestyle=ls)
h.append(patch)
return h
def grouped_boxplot(data, group_names=None, subgroup_names=None, ax=None, subgroup_colors=None,
box_width=0.6, box_spacing=1.0):
""" Draws a grouped boxplot. The data should be organized in a hierarchy, where there are multiple
subgroups for each main group.
:param data: A dictionary of length equal to the number of the groups. The key should be the
group name, the value should be a list of arrays. The length of the list should be
equal to the number of subgroups.
:param group_names: (Optional) The group names, should be the same as data.keys(), but can be ordered.
:param subgroup_names: (Optional) Names of the subgroups.
:param subgroup_colors: A list specifying the plot color for each subgroup.
:param ax: (Optional) The axis to plot on.
"""
if group_names is None:
group_names = data.keys()
if ax is None:
ax = plt.gca()
plt.sca(ax)
nsubgroups = np.array([len(v) for v in data.values()])
assert len(np.unique(nsubgroups)) == 1, "Number of subgroups for each property differ!"
nsubgroups = nsubgroups[0]
if subgroup_colors is None:
subgroup_colors = list()
for k in range(nsubgroups):
subgroup_colors.append(np.random.rand(3))
else:
assert len(subgroup_colors) == nsubgroups, "subgroup_colors length must match number of subgroups (%d)" % nsubgroups
def _decorate_box(_bp, _d):
plt.setp(_bp['boxes'], lw=0, color='k')
plt.setp(_bp['whiskers'], lw=3.0, color='k')
# fill in each box with a color
assert len(_bp['boxes']) == nsubgroups
for _k,_box in enumerate(_bp['boxes']):
_boxX = list()
_boxY = list()
for _j in range(5):
_boxX.append(_box.get_xdata()[_j])
_boxY.append(_box.get_ydata()[_j])
_boxCoords = zip(_boxX, _boxY)
_boxPolygon = plt.Polygon(_boxCoords, facecolor=subgroup_colors[_k])
ax.add_patch(_boxPolygon)
# draw a black line for the median
for _k,_med in enumerate(_bp['medians']):
_medianX = list()
_medianY = list()
for _j in range(2):
_medianX.append(_med.get_xdata()[_j])
_medianY.append(_med.get_ydata()[_j])
plt.plot(_medianX, _medianY, 'k', linewidth=3.0)
# draw a black asterisk for the mean
plt.plot([np.mean(_med.get_xdata())], [np.mean(_d[_k])], color='w', marker='*',
markeredgecolor='k', markersize=12)
cpos = 1
label_pos = list()
for k in group_names:
d = data[k]
nsubgroups = len(d)
pos = np.arange(nsubgroups) + cpos
label_pos.append(pos.mean())
bp = plt.boxplot(d, positions=pos, widths=box_width)
_decorate_box(bp, d)
cpos += nsubgroups + box_spacing
plt.xlim(0, cpos-1)
plt.xticks(label_pos, group_names)
if subgroup_names is not None:
leg = custom_legend(subgroup_colors, subgroup_names)
plt.legend(handles=leg)
You can use the function(s) like this:
data = { 'A':[np.random.randn(100), np.random.randn(100) + 5],
'B':[np.random.randn(100)+1, np.random.randn(100) + 9],
'C':[np.random.randn(100)-3, np.random.randn(100) -5]
}
grouped_boxplot(data, group_names=['A', 'B', 'C'], subgroup_names=['Apples', 'Oranges'], subgroup_colors=['#D02D2E', '#D67700'])
plt.show()

Grouped boxplots, towards subtle academic publication styling... (source)
(Left) Python 2.7.12 Matplotlib v1.5.3. (Right) Python 3.7.3. Matplotlib v3.1.0.
Code:
import numpy as np
import matplotlib.pyplot as plt
# --- Your data, e.g. results per algorithm:
data1 = [5,5,4,3,3,5]
data2 = [6,6,4,6,8,5]
data3 = [7,8,4,5,8,2]
data4 = [6,9,3,6,8,4]
# --- Combining your data:
data_group1 = [data1, data2]
data_group2 = [data3, data4]
# --- Labels for your data:
labels_list = ['a','b']
xlocations = range(len(data_group1))
width = 0.3
symbol = 'r+'
ymin = 0
ymax = 10
ax = plt.gca()
ax.set_ylim(ymin,ymax)
ax.set_xticklabels( labels_list, rotation=0 )
ax.grid(True, linestyle='dotted')
ax.set_axisbelow(True)
ax.set_xticks(xlocations)
plt.xlabel('X axis label')
plt.ylabel('Y axis label')
plt.title('title')
# --- Offset the positions per group:
positions_group1 = [x-(width+0.01) for x in xlocations]
positions_group2 = xlocations
plt.boxplot(data_group1,
sym=symbol,
labels=['']*len(labels_list),
positions=positions_group1,
widths=width,
# notch=False,
# vert=True,
# whis=1.5,
# bootstrap=None,
# usermedians=None,
# conf_intervals=None,
# patch_artist=False,
)
plt.boxplot(data_group2,
labels=labels_list,
sym=symbol,
positions=positions_group2,
widths=width,
# notch=False,
# vert=True,
# whis=1.5,
# bootstrap=None,
# usermedians=None,
# conf_intervals=None,
# patch_artist=False,
)
plt.savefig('boxplot_grouped.png')
plt.savefig('boxplot_grouped.pdf') # when publishing, use high quality PDFs
#plt.show() # uncomment to show the plot.

I used the code given by Kuzeko and it worked well, but I found that the boxes in each group were being drawn in the reverse order. I changed ...x-_off... to ...x+_off... in the following line (just above the last for loop) which fixes it for me:
group_positions.append([x+_off*(width+0.01) for x in xlocations])

A boxplot above was modified to obtain group boxplots with 3 data types.
import matplotlib.pyplot as plt
import numpy as np
ord = [[16.9423,
4.0410,
19.1185],
[18.5134,
17.8048,
19.2669],
[18.7286,
18.0576,
19.1717],
[18.8998,
18.8469,
19.0005],
[18.8126,
18.7870,
18.8393],
[18.7770,
18.7511,
18.8022],
[18.7409,
18.7075,
18.7747],
[18.6866,
18.6624,
18.7093
],
[18.6748],
[18.9069,
18.6752,
19.0769],
[19.0012,
18.9783,
19.0202
],
[18.9448,
18.9134,
18.9813],
[19.1242,
18.8256,
19.3185],
[19.2118,
19.1661,
19.2580],
[19.2505,
19.1231,
19.3526]]
seq = [[17.8092,
4.0410,
19.6653],
[18.7266,
18.2556,
19.3739],
[18.6051,
18.0589,
19.0557],
[18.6467,
18.5629,
18.7566],
[18.5307,
18.4999,
18.5684],
[18.4732,
18.4484,
18.4985],
[18.5234,
18.5027,
18.4797,
18.4573],
[18.3987,
18.3636,
18.4544],
[18.3593],
[18.7234,
18.7092,
18.7598],
[18.7438,
18.7224,
18.7677],
[18.7304,
18.7111,
18.6880,
18.6913,
18.6678],
[18.8926,
18.5902,
19.2003],
[19.1059,
19.0835,
19.0601,
19.0373,
19.0147],
[19.1925,
19.0177,
19.2588]]
apd=[[17.0331,
4.0410,
18.5670],
[17.6124,
17.1975,
18.0755],
[17.3956,
17.1572,
17.9140],
[17.8295,
17.6514,
18.1466],
[18.0665,
17.9144,
18.2157],
[18.1518,
18.0382,
18.2722],
[18.1975,
18.0956,
18.2987],
[18.2219,
18.1293,
18.3062],
[18.2870,
18.2215,
18.3513],
[18.3047,
18.2363,
18.3950],
[18.3580,
18.2923,
18.4205],
[18.3830,
18.3250,
18.4381],
[18.4135,
18.3645,
18.4753],
[18.4580,
18.4095,
18.5170],
[18.4900,
18.4430,
18.5435]
]
ticks = [120,
240,
360,
516,
662,
740,
874,
1022,
1081,
1201,
1320,
1451,
1562,
1680,
1863]
def set_box_color(bp, color):
plt.setp(bp['boxes'], color=color)
plt.setp(bp['whiskers'], color=color)
plt.setp(bp['caps'], color=color)
plt.setp(bp['medians'], color=color)
plt.figure()
bpl = plt.boxplot(ord, positions=np.array(range(len(ord)))*3.0-0.3, sym='', widths=0.6)
bpr = plt.boxplot(seq, positions=np.array(range(len(seq)))*3.0+0.3, sym='', widths=0.6)
bpg = plt.boxplot(apd, positions=np.array(range(len(apd)))*3.0+0.9, sym='', widths=0.6)
set_box_color(bpl, '#D7191C') # colors are from http://colorbrewer2.org/
set_box_color(bpr, '#2C7BB6')
set_box_color(bpg, '#99d8c9')
# draw temporary red and blue lines and use them to create a legend
plt.plot([], c='#D7191C', label='ORD')
plt.plot([], c='#2C7BB6', label='SEQ')
plt.plot([], c='#99d8c9', label='APD')
plt.legend()
plt.xticks(range(0, len(ticks) * 3, 3), ticks)
plt.xlim(-2, len(ticks)*3)
plt.ylim(0, 20)
plt.tight_layout()
plt.show()
plt.savefig('boxcompare.png')

Related

matplotlib barh: how to make a visual gap between two groups of bars?

I have some sorted data of which I only show the highest and lowest values in a figure. This is a minimal version of what currently I have:
import matplotlib.pyplot as plt
# some dummy data (real data contains about 250 entries)
x_data = list(range(98, 72, -1))
labels = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
ranks = list(range(1, 27))
fig, ax = plt.subplots()
# plot 3 highest entries
bars_top = ax.barh(labels[:3], x_data[:3])
# plot 3 lowest entries
bars_bottom = ax.barh(labels[-3:], x_data[-3:])
ax.invert_yaxis()
# print values and ranks
for bar, value, rank in zip(bars_top + bars_bottom,
x_data[:3] + x_data[-3:],
ranks[:3] + ranks[-3:]):
y_pos = bar.get_y() + 0.5
ax.text(value - 4, y_pos, value, ha='right')
ax.text(4, y_pos, f'$rank:\ {rank}$')
ax.set_title('Comparison of Top 3 and Bottom 3')
plt.show()
Result:
I'd like to make an additional gap to this figure to make it more visually clear that the majority of data is in fact not displayed in this plot. For example, something very simple like the following would be sufficient:
Is this possible in matplotlib?

Here is a flexible approach that just plots a dummy bar in-between. The yaxis-transform together with the dummy bar's position is used to plot 3 black dots.
If multiple separations are needed, they all need a different dummy label, for example repeating the space character.
import matplotlib.pyplot as plt
import numpy as np
# some dummy data (real data contains about 250 entries)
x_data = list(range(98, 72, -1))
labels = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
ranks = list(range(1, 27))
fig, ax = plt.subplots()
# plot 3 highest entries
bars_top = ax.barh(labels[:3], x_data[:3])
# dummy bar inbetween
dummy_bar = ax.barh(" ", 0, color='none')
# plot 3 lowest entries
bars_bottom = ax.barh(labels[-3:], x_data[-3:])
ax.invert_yaxis()
# print values and ranks
for bar, value, rank in zip(bars_top + bars_bottom,
x_data[:3] + x_data[-3:],
ranks[:3] + ranks[-3:]):
y_pos = bar.get_y() + 0.5
ax.text(value - 4, y_pos, value, ha='right')
ax.text(4, y_pos, f'$rank:\ {rank}$')
# add three dots using the dummy bar's position
ax.scatter([0.05] * 3, dummy_bar[0].get_y() + np.linspace(0, dummy_bar[0].get_height(), 3),
marker='o', s=5, color='black', transform=ax.get_yaxis_transform())
ax.set_title('Comparison of Top 3 and Bottom 3')
ax.tick_params(axis='y', length=0) # hide the tick marks
ax.margins(y=0.02) # less empty space at top and bottom
plt.show()

The following function,
def top_bottom(x, l, n, ax=None, gap=1):
from matplotlib.pyplot import gca
if n <= 0 : raise ValueError('No. of top/bottom values must be positive')
if n > len(x) : raise ValueError('No. of top/bottom values should be not greater than data length')
if n+n > len(x):
print('Warning: no. of top/bottom values is larger than one'
' half of data length, OVERLAPPING')
if gap < 0 : print('Warning: some bar will be overlapped')
ax = ax if ax else gca()
top_x = x[:+n]
bot_x = x[-n:]
top_y = list(range(n+n, n, -1))
bot_y = list(range(n-gap, -gap, -1))
top_l = l[:+n] # A B C
bot_l = l[-n:] # X Y Z
top_bars = ax.barh(top_y, top_x)
bot_bars = ax.barh(bot_y, bot_x)
ax.set_yticks(top_y+bot_y)
ax.set_yticklabels(top_l+bot_l)
return top_bars, bot_bars
when invoked with your data and n=4, gap=4
bars_top, bars_bottom = top_bottom(x_data, labels, 4, gap=4)
produces
Later, you'll be able to customize the appearance of the bars as you like using the Artists returned by the function.

Doing a custom legend of marker sizes in matplotlib using a lambda function?

I am playing with the third example of "Scatter plots with a legend" in the matplotlib manual.
I have tweaked the marker sizes to:
s = (50 / price) ** 2
And as an input to legend_elements I am using:
func=lambda s: 50 / np.sqrt(s)
I get the output below. The marker sizes of the legend are wrong. Why is that?
Here is the code:
import numpy as np
import matplotlib.pyplot as plt
volume = np.random.rayleigh(27, size=40)
amount = np.random.poisson(10, size=40)
ranking = np.random.normal(size=40)
price = np.random.uniform(1, 10, size=40)
fig, ax = plt.subplots()
s = (50 / price) ** 2
scatter = ax.scatter(volume, amount, c=ranking, s=s,
vmin=-3, vmax=3, cmap="Spectral", label=price)
legend1 = ax.legend(*scatter.legend_elements(num=5),
loc="upper left", title="Ranking")
ax.add_artist(legend1)
kw = dict(prop="sizes", num=5, color=scatter.cmap(0.7), fmt="$ {x:.2f}",
func=lambda s: 50 / np.sqrt(s),
)
legend2 = ax.legend(*scatter.legend_elements(**kw),
loc="lower right", title="Price")
for p, v, a in zip(price, volume, amount):
ax.annotate(round(p, 0), (v, a))
plt.show()

The issue appears to be related to the inverse relationship between price and marker size. The way the data is calculated in legend_elements doesn't account for this, and the calculation doesn't quite work. I've submitted a pull request.
The problem is in np.interp that expects increasing input for the second argument. Here is a work around for now that sorts the input first:
legend2 = ax.legend(*legend_elements(scatter, **kw),
loc="lower right", title="Price")
Run this after defining legend_elements as:
def legend_elements(self, prop="colors", num="auto",
fmt=None, func=lambda x: x, **kwargs):
"""
Creates legend handles and labels for a PathCollection. This is useful
for obtaining a legend for a :meth:`~.Axes.scatter` plot. E.g.::
scatter = plt.scatter([1, 2, 3], [4, 5, 6], c=[7, 2, 3])
plt.legend(*scatter.legend_elements())
Also see the :ref:`automatedlegendcreation` example.
Parameters
----------
prop : string, optional, default *"colors"*
Can be *"colors"* or *"sizes"*. In case of *"colors"*, the legend
handles will show the different colors of the collection. In case
of "sizes", the legend will show the different sizes.
num : int, None, "auto" (default), array-like, or `~.ticker.Locator`,
optional
Target number of elements to create.
If None, use all unique elements of the mappable array. If an
integer, target to use *num* elements in the normed range.
If *"auto"*, try to determine which option better suits the nature
of the data.
The number of created elements may slightly deviate from *num* due
to a `~.ticker.Locator` being used to find useful locations.
If a list or array, use exactly those elements for the legend.
Finally, a `~.ticker.Locator` can be provided.
fmt : str, `~matplotlib.ticker.Formatter`, or None (default)
The format or formatter to use for the labels. If a string must be
a valid input for a `~.StrMethodFormatter`. If None (the default),
use a `~.ScalarFormatter`.
func : function, default *lambda x: x*
Function to calculate the labels. Often the size (or color)
argument to :meth:`~.Axes.scatter` will have been pre-processed
by the user using a function *s = f(x)* to make the markers
visible; e.g. *size = np.log10(x)*. Providing the inverse of this
function here allows that pre-processing to be inverted, so that
the legend labels have the correct values;
e.g. *func = np.exp(x, 10)*.
kwargs : further parameters
Allowed keyword arguments are *color* and *size*. E.g. it may be
useful to set the color of the markers if *prop="sizes"* is used;
similarly to set the size of the markers if *prop="colors"* is
used. Any further parameters are passed onto the `.Line2D`
instance. This may be useful to e.g. specify a different
*markeredgecolor* or *alpha* for the legend handles.
Returns
-------
tuple (handles, labels)
with *handles* being a list of `.Line2D` objects
and *labels* a matching list of strings.
"""
handles = []
labels = []
hasarray = self.get_array() is not None
if fmt is None:
fmt = mpl.ticker.ScalarFormatter(useOffset=False, useMathText=True)
elif isinstance(fmt, str):
fmt = mpl.ticker.StrMethodFormatter(fmt)
fmt.create_dummy_axis()
if prop == "colors":
if not hasarray:
warnings.warn("Collection without array used. Make sure to "
"specify the values to be colormapped via the "
"`c` argument.")
return handles, labels
u = np.unique(self.get_array())
size = kwargs.pop("size", mpl.rcParams["lines.markersize"])
elif prop == "sizes":
u = np.unique(self.get_sizes())
color = kwargs.pop("color", "k")
else:
raise ValueError("Valid values for `prop` are 'colors' or "
f"'sizes'. You supplied '{prop}' instead.")
fmt.set_bounds(func(u).min(), func(u).max())
if num == "auto":
num = 9
if len(u) <= num:
num = None
if num is None:
values = u
label_values = func(values)
else:
if prop == "colors":
arr = self.get_array()
elif prop == "sizes":
arr = self.get_sizes()
if isinstance(num, mpl.ticker.Locator):
loc = num
elif np.iterable(num):
loc = mpl.ticker.FixedLocator(num)
else:
num = int(num)
loc = mpl.ticker.MaxNLocator(nbins=num, min_n_ticks=num-1,
steps=[1, 2, 2.5, 3, 5, 6, 8, 10])
label_values = loc.tick_values(func(arr).min(), func(arr).max())
cond = ((label_values >= func(arr).min()) &
(label_values <= func(arr).max()))
label_values = label_values[cond]
yarr = np.linspace(arr.min(), arr.max(), 256)
xarr = func(yarr)
ix = np.argsort(xarr)
values = np.interp(label_values, xarr[ix], yarr[ix])
kw = dict(markeredgewidth=self.get_linewidths()[0],
alpha=self.get_alpha())
kw.update(kwargs)
for val, lab in zip(values, label_values):
if prop == "colors":
color = self.cmap(self.norm(val))
elif prop == "sizes":
size = np.sqrt(val)
if np.isclose(size, 0.0):
continue
h = mlines.Line2D([0], [0], ls="", color=color, ms=size,
marker=self.get_paths()[0], **kw)
handles.append(h)
if hasattr(fmt, "set_locs"):
fmt.set_locs(label_values)
l = fmt(lab)
labels.append(l)
return handles, labels

You can also manually create your own legend. The trick here is that you have to apply np.sqrt to sizes in the legend for some reason I don't quite follow but #busybear has in her snippet.
import numpy as np
import matplotlib.pyplot as plt
volume = np.random.rayleigh(27, size=40)
amount = np.random.poisson(10, size=40)
ranking = np.random.normal(size=40)
price = np.random.uniform(1, 10, size=40)
fig, ax = plt.subplots()
s = (50 / price) ** 2
scatter = ax.scatter(volume, amount, c=ranking, s=s,
vmin=-3, vmax=3, cmap="Spectral", label=price)
legend1 = ax.legend(*scatter.legend_elements(num=5),
loc="upper left", title="Ranking")
ax.add_artist(legend1)
# # easy legend
# kw = dict(prop="sizes", num=5, color=scatter.cmap(0.7), fmt="$ {x:.2f}",
# func=lambda s: 50 / np.sqrt(s),
# )
# legend2 = ax.legend(*scatter.legend_elements(**kw),
# loc="lower right", title="Price")
# ax.add_artist(legend2)
# manual legend
legend_values = np.array([2,4,6,8])
legend_sizes = (50 / legend_values) ** 2
# IMPORTANT: for some reason the square root needs to be applied to sizes in the legend
legend_sizes_sqrt = np.sqrt(legend_sizes)
elements3 = [Line2D([0], [0], color=scatter.cmap(0.7), lw=0, marker="o", linestyle=None, markersize=s) for s in legend_sizes_sqrt]
legend3 = ax.legend(elements3, [f"$ {p:.2f}" for p in legend_values], loc='lower right', title="Price")
ax.add_artist(legend3)
for p, v, a in zip(price, volume, amount):
ax.annotate(round(p, 0), (v, a))
plt.show()

how to create a discrete colorbar by strings as time and height plot?

The Output of my algorithm gives me a certain string. I need to visualize these in a Time-Height Plot with colors defined by those strings. So far, so good. I convert the strings to categorical and am able to choose my colors freely.
num_hydrometeor = 8
ncar_cmap = cm.get_cmap('gist_ncar_r', num_hydrometeor)
colors = {'AG':'chocolate','IC':'orange','DN':'yellowgreen','OT':'grey','WS':'r','FZ':'rosybrown','RN':'teal','IP':'cyan',np.nan:'white'}
a = np.linspace(0,18,400)
beam_height_test = beam_height_test = np.sort(np.random.choice(a,size=180))
times = pd.date_range('1/1/2020', periods = 288, freq ='5min')
C = np.array(['WS', 'OT', 'FZ', np.nan, 'AG', 'IC'],dtype=object)
test_dist_hca = np.random.choice(C,size=(len(beam_height_test),len(times)))
test_dist_hca_cat = pd.Series(data=test_dist_hca.flatten()).astype('category')
test_dist_hca_cat = test_dist_hca_cat.cat.codes
test_dist_hca_cat = test_dist_hca_cat.values
test_dist_hca_cat = test_dist_hca_cat.reshape((len(beam_height_test),len(times)))
cols = []
a = pd.Series(data=test_dist_hca.flatten()).sort_values().unique()
for hc in a:
cols.append(colors[hc])
ncar_cmap = cm.colors.ListedColormap(cols)
levels = np.unique(test_dist_hca_cat)
plt.figure(figsize=(40,10))
plt.pcolormesh(times,beam_height_test,test_dist_hca_cat,cmap=ncar_cmap,norm = cm.colors.BoundaryNorm(levels, ncolors=ncar_cmap.N, clip=False))
plt.colorbar()
plt.savefig("hmc_daily_test.png")
If applying to my real output it looks like this:
Does anyone has an idea what I am doing wrong? The Algorithm output comes from an pandas DataFrame and goes the same way as the pandas.Series in the minimal example.

To find out what's happening, I reduced the sizes. I also created a scatter plot where the colors are decided directly from the dictionary without the route via .astype('category').
It seems the nan complicates things somewhat, because it gets category number -1. Therefore, it needs to be treated separated from the rest, and we need the ranges for the colors starting with -1.
To get the ticks for the colorbar exactly in the center of each color, its range (-1 to 4 in this case) is divided into 12 equal parts, after which every even tick is skipped.
Here is how the final test code looks like:
from matplotlib import pyplot as plt
from matplotlib import cm
import pandas as pd
import numpy as np
colors = {'AG': 'chocolate', 'IC': 'orange', 'DN': 'yellowgreen', 'OT': 'grey', 'WS': 'r', 'FZ': 'rosybrown',
'RN': 'teal', 'IP': 'cyan', np.nan: 'white'}
a = np.linspace(0, 18, 25)
beam_height_test = np.sort(np.random.choice(a, replace=False, size=10))
times = pd.date_range('1/1/2020', periods=12, freq='5min')
C = np.array(['WS', 'OT', 'FZ', np.nan, 'AG', 'IC'], dtype=object)
test_dist_hca = np.random.choice(C, size=(len(beam_height_test), len(times)))
plt.figure(figsize=(14, 7))
plt.scatter(np.tile(times, len(beam_height_test)),
np.repeat(beam_height_test, len(times)),
c=[colors[h] for h in test_dist_hca.flatten()])
for i, x in enumerate(times):
for j, y in enumerate(beam_height_test):
plt.text(x, y, test_dist_hca[j][i])
plt.show()
test_dist_hca_cat = pd.Series(data=test_dist_hca.flatten()).astype('category')
test_dist_hca_cat = test_dist_hca_cat.cat.codes
test_dist_hca_cat = test_dist_hca_cat.values
test_dist_hca_cat = test_dist_hca_cat.reshape((len(beam_height_test), len(times)))
used_colors = [colors[np.nan]]
a = pd.Series(data=test_dist_hca.flatten()).sort_values().unique()
for hc in a:
if type(hc) == str:
used_colors.append(colors[hc])
cmap = cm.colors.ListedColormap(used_colors)
plt.figure(figsize=(14, 7))
plt.pcolormesh(times, beam_height_test, test_dist_hca_cat,
cmap=cmap,
norm=plt.Normalize(vmin=-1, vmax=len(a) - 2))
cbar = plt.colorbar(ticks=np.linspace(-1, len(a) - 2, 2 * len(a), endpoint=False)[1::2])
cbar.ax.set_yticklabels(['nan'] + list(a[:-1]))
plt.show()
Here is how the pcolormesh with the color bar look like:
And the corresponding scatter plot with the text annotations:
Note that the colors and the names correspond. As explained in the pcolormesh docs, pcolormesh ignores the last row and column when the X and Y sizes aren't 1 larger than the mesh.

Pandas plot: Assign Colors

I have many data frames that I am plotting for a presentation. These all have different columns, but all contain the same additional column foobar. At the moment, I am plotting these different data frames using
df.plot(secondary_y='foobar')
Unfortunately, since these data frames all have different additional columns with different ordering, the color of foobar is always different. This makes the presentation slides unnecessary complicated. I would like, throughout the different plots, assign that foobar is plotted bold and black.
Looking at the docs, the only thing coming close appears to be the parameter colormap - I would need to ensure that the xth color in the color map is always black, where x is the order of foobar in the data frame. Seems to be more complicated than it should be, also this wouldn't make it bold.
Is there a (better) approach?

I would suggest using matplotlib directly rather than the dataframe plotting methods. If df.plot returned the artists it added instead of an Axes object it wouldn't be too bad to change the color of the line after it was plotted.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def pandas_plot(ax, df, callout_key):
"""
Parameters
----------
ax : mpl.Axes
The axes to draw to
df : DataFrame
Data to plot
callout_key : str
key to highlight
"""
artists = {}
x = df.index.values
for k, v in df.iteritems():
style_kwargs = {}
if k == callout_key:
style_kwargs['c'] = 'k'
style_kwargs['lw'] = 2
ln, = ax.plot(x, v.values, **style_kwargs)
artists[k] = ln
ax.legend()
ax.set_xlim(np.min(x), np.max(x))
return artists
Usage:
fig, ax = plt.subplots()
ax2 = ax.twinx()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'sin': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'sin': -np.sin(th)}, index=th)
pandas_plot(ax, df, 'sin')
pandas_plot(ax2, df2, 'sin')

Perhaps you could define a function which handles the special column in a separate plot call:
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
df[columns].plot(ax=ax)
df[col].plot(ax=ax, **emphargs)
Using code from tcaswell's example,
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
df[columns].plot(ax=ax)
df[col].plot(ax=ax, **emphargs)
fig, ax = plt.subplots()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'foobar': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'foobar': -np.sin(th)}, index=th)
emphasize_plot(ax, df, 'foobar', lw=2, c='k')
emphasize_plot(ax, df2, 'foobar', lw=2, c='k')
plt.show()
yields

I used #unutbut's answer and extended it to allow for a secondary y axis and correct legends:
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
ax2 = ax.twinx()
df[columns].plot(ax=ax)
df[col].plot(ax=ax2, **emphargs)
lines, labels = ax.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc=0)

Stop matplotlib repeating labels in legend

Here is a very simplified example:
xvalues = [2,3,4,6]
for x in xvalues:
plt.axvline(x,color='b',label='xvalues')
plt.legend()
The legend will now show 'xvalues' as a blue line 4 times in the legend.
Is there a more elegant way of fixing this than the following?
for i,x in enumerate(xvalues):
if not i:
plt.axvline(x,color='b',label='xvalues')
else:
plt.axvline(x,color='b')

plt.legend takes as parameters
A list of axis handles which are Artist objects
A list of labels which are strings
These parameters are both optional defaulting to plt.gca().get_legend_handles_labels().
You can remove duplicate labels by putting them in a dictionary before calling legend. This is because dicts can't have duplicate keys.
For example:
For Python versions < 3.7
from collections import OrderedDict
import matplotlib.pyplot as plt
handles, labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys())
For Python versions > 3.7
As of Python 3.7, dictionaries retain input order by default. Thus, there is no need for OrderedDict form the collections module.
import matplotlib.pyplot as plt
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys())
Docs for plt.legend

handles, labels = ax.get_legend_handles_labels()
handle_list, label_list = [], []
for handle, label in zip(handles, labels):
if label not in label_list:
handle_list.append(handle)
label_list.append(label)
plt.legend(handle_list, label_list)

I don't know if this can be considered "elegant", but you can have your label a variable that gets set to "_nolegend_" after first usage:
my_label = "xvalues"
xvalues = [2,3,4,6]
for x in xvalues:
plt.axvline(x, color='b', label=my_label)
my_label = "_nolegend_"
plt.legend()
This can be generalized using a dictionary of labels if you have to put several labels:
my_labels = {"x1" : "x1values", "x2" : "x2values"}
x1values = [1, 3, 5]
x2values = [2, 4, 6]
for x in x1values:
plt.axvline(x, color='b', label=my_labels["x1"])
my_labels["x1"] = "_nolegend_"
for x in x2values:
plt.axvline(x, color='r', label=my_labels["x2"])
my_labels["x2"] = "_nolegend_"
plt.legend()
(Answer inspired by https://stackoverflow.com/a/19386045/1878788)

Problem - 3D Array
Questions: Nov 2012, Oct 2013
import numpy as np
a = np.random.random((2, 100, 4))
b = np.random.random((2, 100, 4))
c = np.random.random((2, 100, 4))
Solution - dict uniqueness
For my case _nolegend_ (bli and DSM) would not work, nor would label if i==0. ecatmur's answer uses get_legend_handles_labels and reduces the legend down with collections.OrderedDict. Fons demonstrates this is possible without an import.
Inline with these answers, I suggest using dict for unique labels.
# Step-by-step
ax = plt.gca() # Get the axes you need
a = ax.get_legend_handles_labels() # a = [(h1 ... h2) (l1 ... l2)] non unique
b = {l:h for h,l in zip(*a)} # b = {l1:h1, l2:h2} unique
c = [*zip(*b.items())] # c = [(l1 l2) (h1 h2)]
d = c[::-1] # d = [(h1 h2) (l1 l2)]
plt.legend(*d)
Or
plt.legend(*[*zip(*{l:h for h,l in zip(*ax.get_legend_handles_labels())}.items())][::-1])
Maybe less legible and memorable than Matthew Bourque's solution. Code golf welcome.
Example
import numpy as np
a = np.random.random((2, 100, 4))
b = np.random.random((2, 100, 4))
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1)
ax.plot(*a, 'C0', label='a')
ax.plot(*b, 'C1', label='b')
ax.legend(*[*zip(*{l:h for h,l in zip(*ax.get_legend_handles_labels())}.items())][::-1])
# ax.legend() # Old, ^ New
plt.show()

Based on answer https://stackoverflow.com/a/13589144/9132798 and https://stackoverflow.com/a/19386045/9132798
plt.gca().get_legend_handles_labels()[1] gives a list of names, it is possible to check if the label is already in the list while in the loop plotting (label= name[i] if name[i] not in plt.gca().get_legend_handles_labels()[1] else '').
For the given example this solution would look like:
import matplotlib.pyplot as plt
xvalues = [2,3,4,6]
for x in xvalues:
plt.axvline(x,color='b',\
label= 'xvalues' if 'xvalues' \
not in plt.gca().get_legend_handles_labels()[1] else '')
plt.legend()
Which is much shorter than https://stackoverflow.com/a/13589144/9132798 and more flexible than https://stackoverflow.com/a/19386045/9132798 as it could be use for any kind of loop any plot function in the loop individually.
However, for many cycles it probably slower than https://stackoverflow.com/a/13589144/9132798.

These code snippets didn't work for me personally. I was plotting two different groups in two different colors. The legend would show two red markers and two blue markers, when I only wanted to see one per color. I'll paste a simplified version of what did work for me:
Import statements
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D
Plot data
points_grp, = plt.plot(x[grp_idx], y[grp_idx], color=c.c[1], marker=m, ms=4, lw=0, label=leglab[1])
points_ctrl, = plt.plot(x[ctrl_idx], y[ctrl_idx], color=c.c[0], marker=m, ms=4, lw=0, label=leglab[0])
Add legend
points_dict = {points_grp: HandlerLine2D(numpoints=1),points_ctrl: HandlerLine2D(numpoints=1)}
leg = ax.legend(fontsize=12, loc='upper left', bbox_to_anchor=(1, 1.03),handler_map=points_dict)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

matplotlib: Group boxplots - python

Related

matplotlib barh: how to make a visual gap between two groups of bars?

Doing a custom legend of marker sizes in matplotlib using a lambda function?

how to create a discrete colorbar by strings as time and height plot?

Pandas plot: Assign Colors

Stop matplotlib repeating labels in legend

Categories

Resources