Add hue to Seaborn Histogram annotation - python

I have a snippet of code that produces 2 seaborn.histogram plots on the same axes, split by hue, and annotated:
The two histograms are appropriately colored differently using the hue parameter, and the count of data points in each bin is also annotated correctly. However, can I also color the annotations (the counts shown for each bin) to match?
Current MRE:
np.random.seed(8)
t = pd.DataFrame(
{
'Value': np.random.uniform(low=100000, high=500000, size=(50,)),
'Type': ['B' if x < 6 else 'R' for x in np.random.uniform(low=1, high=10, size=(50,))]
}
)
ax = sns.histplot(data=t, x='Value', bins=5, hue='Type', palette="dark")
ax.set(title="R against B")
ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
for p in ax.patches:
ax.annotate(f'{p.get_height():.0f}\n',
(p.get_x() + p.get_width() / 2, p.get_height()), ha='center', va='center', color='crimson')
plt.show()

You're looking for the matplotlib.patches.Patch.get_facecolor method (each element of ax.patches is a Patch).
This way, you can match the color of each annotation with the color of the corresponding histogram bar.
# Label every histogram bar with its count, drawn in the bar's own colour.
for p in ax.patches:
    # Reuse the bar's fill colour so each count matches its hue level.
    color = p.get_facecolor()
    # The label ends with "\n": with va="center" that leaves the number
    # sitting just above the top of the bar instead of straddling it.
    ax.annotate(f"{p.get_height():.0f}\n", (p.get_x() + p.get_width() / 2, p.get_height()),
                ha="center", va="center", color=color, fontweight="bold")
Output :

Related

plot textboxes and fill colors between vertical lines in matplotlib python

based on another thread i got this code:
data = np.random.normal(loc=0.0, scale=1.0, size=2000)
df_data = pd.DataFrame(data)
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
def _plot(df):
for col in df.columns:
n_bins = 50
fig, axes = plt.subplots(figsize=(12,6))
n, bins, patches = axes.hist(df[col], n_bins, density=True, alpha=.1, edgecolor='black' )
mu = df[col].mean()
sigma = df[col].std()
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
#probability density function
axes.plot(bins, pdf, color='green', alpha=.6)
#dashed lines
plt.axvline(np.mean(df_data[0]),color='b', linestyle='-.')
plt.axvline(np.mean(df_data[0]-sigma),color='b', linestyle='-.')
plt.axvline(np.mean(df_data[0]-2*sigma),color='b', linestyle='-.')
plt.axvline(np.mean(df_data[0]-3*sigma),color='b', linestyle='-.')
plt.axvline(min(df_data[0]),color='r', linestyle='-.')
plt.axvline(np.mean(df_data[0]+sigma),color='b', linestyle='-.')
plt.axvline(np.mean(df_data[0]+2*sigma),color='b', linestyle='-.')
plt.axvline(np.mean(df_data[0]+3*sigma),color='b', linestyle='-.')
plt.axvline(max(df_data[0]),color='r', linestyle='-.')
plt.ylabel('Probability Density')
plt.xlabel('Values')
print(mu)
print(sigma)
_plot(df_data)
Which returns me this nice plot:
As you can see the blue vertical lines indicate borders set by multiples of standard deviations. I would like to add the following information and color coding, which I now quickly placed in powerpoint:
I tried to mess with the plt.fill_between function but didn't really get anything useful. Also, I do not know how to write something, like the mu+l*sigma here, above the plot. How can I achieve the second picture based on what I have?
EDIT:
solved by @Trenton McKinney
Putting new boxes inside the colored boxes:
for i, (x, c) in enumerate(locs[:-1]):
axes.axvspan(x, locs[i + 1][0], alpha=0.2, color=c)
tx = (x + locs[i + 1][0]) / 2
axes.text(tx, y1/2, f'Zustand {i + 1}', {'ha': 'center', 'va': 'center'}, rotation=90)
if i<4:
axes.text(tx, y1/1.25, r"$\mu$" + "-" + f"{4-i}"+ "$\cdot$" + "$\sigma$" , {'ha': 'center', 'va': 'center'}, rotation=90, bbox=dict(facecolor='white', alpha=0.8, edgecolor='black'))
else:
axes.text(tx, y1/1.25, r"$\mu$" + "+" + f"{i-4 + 1}"+ "$\cdot$" + "$\sigma$" , {'ha': 'center', 'va': 'center'}, rotation=90, bbox=dict(facecolor='white', alpha=0.8, edgecolor='black'))
It will be easier to create a container with all of the values for the vertical lines because those values will be reused for placing the lines, and determining the axvspan and text placement. In this case, a dictionary is used.
See the inline comments for explanations
Use .Axes.axvspan to fill between vertical positions
How to highlight specific x-value ranges
See How do I merge two dictionaries in a single expression (take union of dictionaries)?
Add text to the plot with .Axes.text
Tested in python 3.10, matplotlib 3.5.1
# extra imports
from collections import OrderedDict
from itertools import zip_longest
np.random.seed(2022)
data = np.random.normal(loc=0.0, scale=1.0, size=2000)
df_data = pd.DataFrame(data)
def _plot(df):
    """Plot a histogram, normal curve and shaded sigma bands for each column.

    For every column of ``df`` a new figure is created containing:

    * a density histogram of the column,
    * the normal pdf evaluated at the bin edges, using the column's mean
      and standard deviation,
    * dash-dotted vertical lines at the mean and at 1, 2 and 3 standard
      deviations either side of it (blue), and at the column's minimum and
      maximum (red),
    * alternating blue/brown shaded spans between consecutive lines, each
      labelled ``Stage <n>``.

    The mean and standard deviation of each column are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted one figure at a time.
    """
    for col in df.columns:
        # Use the column being plotted for every statistic, not the
        # module-level ``df_data``, so the function works for any frame.
        series = df[col]
        n_bins = 50
        fig, axes = plt.subplots(figsize=(12, 6))
        n, bins, patches = axes.hist(series, n_bins, density=True, alpha=.1, edgecolor='black')
        mu = series.mean()
        sigma = series.std()
        pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
        # probability density function
        axes.plot(bins, pdf, color='green', alpha=.6)
        # get ylim to position the text
        y0, y1 = axes.get_ylim()
        # create a dict for all the x values for vertical lines with the line color
        muu = {mu: 'b'}
        mm = {series.min(): 'r', series.max(): 'r'}
        mun = {mu - v*sigma: 'b' for v in range(1, 4)}
        mup = {mu + v*sigma: 'b' for v in range(1, 4)}
        # combine the dicts: | requires python 3.9+. See linked SO answer for additional options to combine the dicts
        vals = muu | mm | mun | mup
        # order the keys (x values) from smallest to largest
        vals = OrderedDict(sorted(vals.items()))
        # plot the dashed lines
        for x, c in vals.items():
            axes.axvline(x, color=c, linestyle='-.')
        # combine the x values with colors of the stages; there is one more
        # x value than colors, so zip_longest pads the last color with None
        locs = list(zip_longest(vals.keys(), ['blue', 'brown']*4))
        # iterate through all but the last value, and add the vspan and the text
        for i, (x, c) in enumerate(locs[:-1]):
            x_next = locs[i + 1][0]
            axes.axvspan(x, x_next, alpha=0.2, color=c)
            tx = (x + x_next) / 2
            axes.text(tx, y1/2, f'Stage {i + 1}', {'ha': 'center', 'va': 'center'}, rotation=90)
        axes.set_ylabel('Probability Density')
        axes.set_xlabel('Values')
        print(mu)
        print(sigma)
_plot(df_data)
Update for additional annotations
# extra annotations
sign = [f'µ - {v}σ' for v in range(4, 0, -1)]
sigp = [f'µ + {v}σ' for v in range(1, 5)]
anno = sign + sigp
# iterate through all but the last value and add the vspan and the text
for i, (x, c) in enumerate(locs[:-1]):
axes.axvspan(x, locs[i + 1][0], alpha=0.2, color=c)
tx = (x + locs[i + 1][0]) / 2
axes.text(tx, y1/2, f'Stage {i + 1}: {anno[i]}', {'ha': 'center', 'va': 'center'}, rotation=90)

Flexible placement of labels in seaborn barplots

I would like to place labels on bars in Seaborn depending on how much space there is available.
For example in the example below I would like to use outside labels if they fit on the figure and inside labels when they don't. I would like to do this automatically for many plots, so I am looking for a smart way to do this flexibly. Any ideas?
import seaborn as sns
import matplotlib.pyplot as plt
titanic=sns.load_dataset('titanic')
titanic.head()
fig, ax = plt.subplots()
sns.countplot(
y='sex',
data=titanic,
orient="h",
ax=ax
)
for p in ax.patches:
perc = "{:.1f}%".format(100 * p.get_width() / titanic.shape[0])
x = p.get_width()
y = p.get_y() + p.get_height() / 2
# inside labels
ax.annotate(perc, (x*0.8, y), color="white")
# outside labels
ax.annotate(perc, (x*1.1, y), color="black")
Bad quality example of what I would like to achieve:
Does this help?
# Draw a horizontal count plot and label each bar with its share of all rows.
# Bars at least 80% as long as the longest bar get a white label inside the
# bar; shorter bars get a black label just past their end.
fig, ax = plt.subplots()
sns.countplot(
    y='sex',
    data=titanic,
    orient="h",
    ax=ax
)

# determine the largest bar size (the bar width is the count)
x_max = max(p.get_width() for p in ax.patches)
# adjust percentage of the maximum bar size where inside/outside annotation occurs
cut_off = 0.8 * x_max

for p in ax.patches:
    x = p.get_width()
    y = p.get_y() + p.get_height() / 2  # vertical centre of the bar
    perc = "{:.1f}%".format(100 * x / titanic.shape[0])  # the label for the bar
    if x >= cut_off:
        # long bar: place the label inside the bar
        ax.annotate(perc, (x*0.8, y), color="white", va="center")
    else:
        # short bar: place the label outside the bar
        ax.annotate(perc, (x*1.1, y), color="black", va="center")

plotting bar graph from pair of values from list of arrays

I have a list of arrays each with 1 row and 2 cols (list=array([1.3, 4.5]), array([1.4, 6.8]), array([2.5, 2.88]).
I want to plot bar graph, where each col values of each array(say they are array([x, y]) are plotted side by side, x has same color for all values and y has same color for all values.
How can I go about it?
like this:
Thanks!
Actually, there is a similar answer here.
I took it and made some changes for your problem:
import matplotlib.pyplot as plt
import numpy as np

# Lists to plot
a = [1.3, 1.4, 2.5]
b = [4.5, 6.8, 2.88]

# Some properties for the plot
alpha = 0.7
bar_width = 0.35

# Labels for both axes
plt.xlabel('x-label')
plt.ylabel('y-label')

# bar-plot: the a-bars are shifted right by one bar width so that each one
# sits directly beside the b-bar of the same index
bar1 = plt.bar(np.arange(len(a)) + bar_width, a, bar_width, align='center', alpha=alpha, color='b', label='a-List')
bar2 = plt.bar(range(len(b)), b, bar_width, align='center', alpha=alpha, color='g', label='b-List')

# Add numbers above the two bar graphs
for rect in bar1 + bar2:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2.0, height, f'{height}', ha='center', va='bottom')

plt.legend()
plt.tight_layout()
plt.show()

How to center the names of the bar in a seaborn barplot and add a zoom option to the bar?

I want to center the x labels (e.g. Pichincha, Guayas, Manabi) in the middle of each bar. Also, I want to know if it is possible to give this figure a dynamic zoom and increase the bar size so that it shows all the provinces and their colors. Please check the image of my seaborn bar plot below.
new_index = (ecuador['Confirmed cases'].sort_values(ascending=False)).index.values
sorted_data = ecuador.reindex(new_index)
sns.set_theme(style="whitegrid")
plt.figure(figsize=(50,20)) # I want to make this parameter dynamic
plt.xticks(rotation= 90)
# Creating the sea barplot
ax = sns.barplot( x="Provinces", y="Confirmed cases", data=sorted_data, orient='v', hue="Provinces", dodge= True, ci=None, palette=("RdYlGn")) #RdYlGn Red is the most critical value.
#Changin the bar size function
def change_width(ax, new_value) :
for patch in ax.patches :
current_width = patch.get_width()
diff = current_width - new_value
# we change the bar width
patch.set_width(new_value)
# we recenter the bar
patch.set_x(patch.get_x() + diff * .5)
for p in ax.patches:
ax.annotate(format(p.get_height()), #, '.1f'
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
plt.title("COVID in Ecuador", fontsize=20)
change_width(ax, .35)
ax.autoscale_view()
Please help me, I am just learning.
Solution:
new_index = (ecuador_df['Confirmed cases'].sort_values(ascending=False)).index.values
sorted_data = ecuador_df.reindex(new_index)
sns.set_theme(style="whitegrid")
plt.figure(figsize=(50,20)) # I want to make this parameter dynamic
plt.xticks(rotation= 90)
plt.title("COVID in Ecuador", fontsize=20)
# Creating the sea barplot
ax = sns.barplot( x="Provinces", y="Confirmed cases", data=sorted_data, orient='v', hue=data_x, dodge= False, ci=None,palette=("RdYlGn")) #RdYlGn Red is the most critical value.
#Changin the bar size function
def change_width(ax, new_value) :
for patch in ax.patches :
current_width = patch.get_width()
diff = current_width - new_value
# we change the bar width
patch.set_width(new_value)
# we recenter the bar
patch.set_x(patch.get_x() + diff * .5)
for p in ax.patches:
ax.annotate(format(p.get_height()), #, '.1f'
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
change_width(ax, .35)
ax.autoscale_view()
remove hue='Provinces' in your call to barplot().
hue-nesting is supposed to be in addition to x. So that you can have several categories per x-value. Here you pass the same column to x= and hue= so seaborn is trying to fit N provinces for each x value. But since each x-value is already a province, that does not work.

Annotating subplots in matplotlib scales the figure to the largest axes

When I make figure with 5 subplots and annotate the bars in each subplot, matplotlib appears to scale the figure so that the maximum from the largest y-axis scales to the smallest y-axis.
I can't describe the problem too well, but see this image:
where there's tons of white-space above where the figure should begin.
However, the figure would ideally look like this
When I set the 4 smallest axes to have the same upper y-limit as the largest axis, then the figure scales correctly, but for the purpose of the visualization, I would prefer not to do that.
Why does this happen? Is there any way to control the figure so that it's not automatically scaled as in the first image? Or otherwise, a more appropriate way of plotting what I hope to achieve?
The code I'm using to generate the figure:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Patch
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
department = ["100", "1,000", "10,000", \
"100,000", "1,000,000"]
quarter = ["Serial", "MPI", "CUDA", "Hybrid"]
budgets = np.array([[0.049979, 0.43584, 2.787366, 19.75062, 201.6935],\
[2.184624, 0.175213, 0.677837, 5.265575, 46.33678],\
[0.050294, 0.068537, 0.23739, 1.93778, 18.55734],\
[3.714284, 3.9917, 4.977599, 6.174967, 37.732232]])
budgets = np.transpose(budgets)
em = np.zeros((len(department), len(quarter)))
# set up barchart
x = np.arange(len(department)) # label locations
width = 0.8 # width of all the bars
# set up figure
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(1, 5)
axes = [ax1, ax2, ax3, ax4, ax5]
# generate bars
rects = []
color = ["tomato", "royalblue", "limegreen", "orange"]
n = len(quarter)
for i in range(n):
bar_x = x - width/2.0 + i/float(n)*width + width/(n*2)
m = len(budgets[:,i])
for j in range(m):
bar_x = x[j] - width/2.0 + i/float(n)*width + width/(n*2)
e = budgets[j,i]
#bar_x = x - width/2.0 + i/float(n)*width + width/(n*2)
rects.append(axes[j].bar(bar_x, e, width=width/float(n), \
label=quarter[i], color=color[i]))
# set figure properties
fig.set_size_inches(12, 2.5)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
nAx = len(axes)
for i in range(nAx):
#axes[i].set_aspect("auto")
axes[i].tick_params(axis='x', which='both', bottom=False, top=False,
labelbottom=False)
ax1.set_ylabel("Time (ms)")
for i in range(nAx):
axes[i].yaxis.grid(which="major", color="white", lw=0.75)
ax1.set_ylim([0, 4])
fig.suptitle("Time per iteration for differing dataset sizes") # title
for i in range(nAx):
axes[i].set_xlabel(department[i])
# annotate bars
for i in range(nAx):
for rect in rects:
j = 0;
for bar in rect:
y_bottom, y_top = axes[i].get_ylim() # axis limits
height = bar.get_height() # bar's height
va = 'bottom'
offset = 3
color = 'k'
fg = 'w'
# keep label within plot
if (y_top < 1.1 * height):
offset = -3
va = 'top'
color='w'
fg = 'k'
# annotate the bar
axes[i].annotate('{:.2f}'.format(height),
xy=(bar.get_x() + bar.get_width()/2, height),
xytext=(0,offset),
textcoords="offset points",
ha='center', va=va, color=color)
# set custom legend
legend_elements = [Patch(facecolor='tomato', label='Serial'),
Patch(facecolor='royalblue', label='MPI'),
Patch(facecolor='limegreen', label='CUDA'),
Patch(facecolor='orange', label='Hybrid')]
plt.legend(handles=legend_elements, loc="upper center", fancybox=False,
edgecolor='k', ncol=4, bbox_to_anchor=(-2, -0.1))
plt.show()
This is a partial answer.
This might be a bug, since I couldn't reproduce the problem until I switched to a Jupyter notebook in a Debian system (different hardware too). Your figure gets drawn correctly in my macOS Jupyter notebook, and in Debian when displayed from a .py script.
The problem appears to be with your annotations. If you make the tight_layout call after annotation, you might get a warning like this:
<ipython-input-80-f9f592f5efc5>:88: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
It seems like the annotate function is calculating some totally wacky coordinates for your annotations, though the text ends up in the right spot. If you remove them, the white space disappears. You can try calculating the xy coordinates for your annotations a different way. This might get you started:
axes[i].annotate('{:.2f}'.format(height),
xy=(bar.get_x() + bar.get_width()/2, height),
xytext=(0,offset),
textcoords="offset points",
xycoords="axes points", # change
ha='center', va=va, color=color)
Output:
To correctly calculate the points, you can try using the appropriate axis transformation, though again, I couldn't get it to work and it might be related to a bug.
try putting the fig.tight_layout(rect=[0, 0.03, 1, 0.95]) after all the plotting commands, as below.
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Patch
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
department = ["100", "1,000", "10,000", \
"100,000", "1,000,000"]
quarter = ["Serial", "MPI", "CUDA", "Hybrid"]
budgets = np.array([[0.049979, 0.43584, 2.787366, 19.75062, 201.6935],\
[2.184624, 0.175213, 0.677837, 5.265575, 46.33678],\
[0.050294, 0.068537, 0.23739, 1.93778, 18.55734],\
[3.714284, 3.9917, 4.977599, 6.174967, 37.732232]])
budgets = np.transpose(budgets)
em = np.zeros((len(department), len(quarter)))
# set up barchart
x = np.arange(len(department)) # label locations
width = 0.8 # width of all the bars
# set up figure
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(1, 5)
axes = [ax1, ax2, ax3, ax4, ax5]
# generate bars
rects = []
color = ["tomato", "royalblue", "limegreen", "orange"]
n = len(quarter)
for i in range(n):
bar_x = x - width/2.0 + i/float(n)*width + width/(n*2)
m = len(budgets[:,i])
for j in range(m):
bar_x = x[j] - width/2.0 + i/float(n)*width + width/(n*2)
e = budgets[j,i]
#bar_x = x - width/2.0 + i/float(n)*width + width/(n*2)
rects.append(axes[j].bar(bar_x, e, width=width/float(n), \
label=quarter[i], color=color[i]))
# set figure properties
fig.set_size_inches(12, 2.5)
#fig.tight_layout(rect=[0, 0.03, 1, 0.95])
nAx = len(axes)
for i in range(nAx):
#axes[i].set_aspect("auto")
axes[i].tick_params(axis='x', which='both', bottom=False, top=False,
labelbottom=False)
ax1.set_ylabel("Time (ms)")
for i in range(nAx):
axes[i].yaxis.grid(which="major", color="white", lw=0.75)
ax1.set_ylim([0, 4])
fig.suptitle("Time per iteration for differing dataset sizes") # title
for i in range(nAx):
axes[i].set_xlabel(department[i])
# annotate bars
# Each bar is labelled only on the axes that owns it. Looping over every
# axes for every bar would also place labels for the tall bars far above the
# limits of the smaller axes, which appears to be what inflates the layout.
for rect in rects:
    for bar in rect:
        bar_ax = bar.axes  # the subplot this bar was drawn on
        y_bottom, y_top = bar_ax.get_ylim()  # axis limits
        height = bar.get_height()  # bar's height
        if y_top < 1.1 * height:
            # keep label within plot: draw it inside the bar, in white
            offset, va, text_color = -3, 'top', 'w'
        else:
            # enough headroom: draw it above the bar, in black
            offset, va, text_color = 3, 'bottom', 'k'
        # annotate the bar
        bar_ax.annotate('{:.2f}'.format(height),
                        xy=(bar.get_x() + bar.get_width()/2, height),
                        xytext=(0, offset),
                        textcoords="offset points",
                        ha='center', va=va, color=text_color)
# set custom legend
legend_elements = [Patch(facecolor='tomato', label='Serial'),
Patch(facecolor='royalblue', label='MPI'),
Patch(facecolor='limegreen', label='CUDA'),
Patch(facecolor='orange', label='Hybrid')]
plt.legend(handles=legend_elements, loc="upper center", fancybox=False,
edgecolor='k', ncol=4, bbox_to_anchor=(-2, -0.1))
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

Categories

Resources