Plotting annotated gaussian curve

Plotting annotated gaussian curve - python

u am trying to recreate the image below. Most problematic is putting the percentages and labels on the plot.
So far I did manage to plot the gaussian, although be it a bit steep... Any idea how to get the exact percentages there?
from matplotlib import pyplot as mp
import numpy as np
def gaussian(x, mu, sig):
return np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))
for mu, sig in [(0, 3)]:
mp.plot(gaussian(np.linspace(-10, 10, 120), mu, sig))
mp.show()
UPDATE:
What if I would use a trick approach such as the one below, but somehow hide the histogram blocks?
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
from matplotlib.mlab import normpdf
# dummy data
mu = 0
sigma = 1
n_bins = 50
s = np.random.normal(mu, sigma, 1000000)
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
#histogram
n, bins, patches = axes[1].hist(s, n_bins, normed=True, alpha=.1, edgecolor='black' )
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75)
print(q1, median, q3)
#probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)
#to ensure pdf and bins line up to use fill_between.
bins_1 = bins[(bins >= q1-1.5*(q3-q1)) & (bins <= q1)] # to ensure fill starts from Q1-1.5*IQR
bins_2 = bins[(bins <= q3+1.5*(q3-q1)) & (bins >= q3)]
pdf_1 = pdf[:int(len(pdf)/2)]
pdf_2 = pdf[int(len(pdf)/2):]
pdf_1 = pdf_1[(pdf_1 >= norm(mu,sigma).pdf(q1-1.5*(q3-q1))) & (pdf_1 <= norm(mu,sigma).pdf(q1))]
pdf_2 = pdf_2[(pdf_2 >= norm(mu,sigma).pdf(q3+1.5*(q3-q1))) & (pdf_2 <= norm(mu,sigma).pdf(q3))]
#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
axes[1].fill_between(bins_1, pdf_1, 0, alpha=.6, color='orange')
axes[1].fill_between(bins_2, pdf_2, 0, alpha=.6, color='orange')
print(norm(mu, sigma).cdf(median))
print(norm(mu, sigma).pdf(median))
#add text to bottom graph.
axes[1].annotate("{:.1f}%".format(100*norm(mu, sigma).cdf(q1)), xy=((q1-1.5*(q3-q1)+q1)/2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)-norm(mu, sigma).cdf(q1))), xy=(median, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+1.5*(q3-q1)-q3)-norm(mu, sigma).cdf(q3))), xy=((q3+1.5*(q3-q1)+q3)/2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')
axes[1].set_ylabel('probability')
#top boxplot
axes[0].boxplot(s, 0, 'gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')
plt.subplots_adjust(hspace=0)
plt.show()

Related

Matplotlib, plot a vector of numbers as a rectangle filled with numbers

So let's say I have a vector of numbers.
np.random.randn(5).round(2).tolist()
[2.05, -1.57, 1.07, 1.37, 0.32]
I want a draw a rectangle that shows this elements as numbers in a rectangle.
Something like this:
Is there an easy way to do this in matplotlib?

A bit convoluted but you could take advantage of seaborn.heatmap, creating a white colormap:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
data = np.random.randn(5).round(2).tolist()
linewidth = 2
ax = sns.heatmap([data], annot=True, cmap=LinearSegmentedColormap.from_list('', ['w', 'w'], N=1),
linewidths=linewidth, linecolor='black', square=True,
cbar=False, xticklabels=False, yticklabels=False)
plt.tight_layout()
plt.show()
In this case, the external lines won't be as thick as the internal ones. If needed, this can be fixed with:
ax.axhline(y=0, color='black', lw=linewidth*2)
ax.axhline(y=1, color='black', lw=linewidth*2)
ax.axvline(x=0, color='black', lw=linewidth*2)
ax.axvline(x=len(data), color='black', lw=linewidth*2)
Edit: avoid these lines and add clip_on=False to sns.heatmap (thanks/credit #JohanC)
Output:

We can add rectangles , and annotate them in a for loop.
from matplotlib import pyplot as plt
import numpy as np
# Our numbers
nums = np.random.randn(5).round(2).tolist()
# rectangle_size
rectangle_size = 2
# We want rectangles look squared, you can change if you want
plt.rcParams["figure.figsize"] = [rectangle_size * len(nums), rectangle_size]
plt.rcParams["figure.autolayout"] = True
fig = plt.figure()
ax = fig.add_subplot(111)
for i in range(len(nums)):
# We are adding rectangles
# You can change colors as you wish
plt.broken_barh([(rectangle_size * i, rectangle_size)], (0, rectangle_size), facecolors='white', edgecolor='black'
,linewidth = 1)
# We are calculating where to annotate numbers
cy = rectangle_size / 2.0
cx = rectangle_size * i + cy
# Annotation You can change color,font, etc ..
ax.annotate(str(nums[i]), (cx, cy), color='black', weight='bold', fontsize=20, ha='center', va='center')
# For squared look
plt.xlim([0, rectangle_size*len(nums)])
plt.ylim([0, rectangle_size])
# We dont want to show ticks
plt.axis('off')
plt.show()

One way using the Rectangle patch is:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
x = np.random.randn(5).round(2).tolist()
fig, ax = plt.subplots(figsize=(9, 2)) # make figure
dx = 0.15 # edge size of box
buf = dx / 10 # buffer around edges
# set x and y limits
ax.set_xlim([0 - buf, len(x) * dx + buf])
ax.set_ylim([0 - buf, dx + buf])
# set axes as equal and turn off axis lines
ax.set_aspect("equal")
ax.axis("off")
# draw plot
for i in range(len(x)):
# create rectangle with linewidth=4
rect = Rectangle((dx * i, 0), dx, dx, facecolor="none", edgecolor="black", lw=4)
ax.add_patch(rect)
# get text position
x0, y0 = dx * i + dx / 2, dx / 2
# add text
ax.text(
x0, y0, f"{x[i]}", color="black", ha="center", va="center", fontsize=28, fontweight="bold"
)
fig.tight_layout()
fig.show()
which gives:

normal distribution curve doesn't fit well over histogram in subplots using matplotlib

I am using "plt.subplots(2, 2, sharex=True, sharey=True)" to draw a 2*2 subplots. Each subplot has two Y axis and contains normal distribution curve over a histogram. Noting I particularly set "sharex=True, sharey=True" here in order to make all subplots share the same X axis and Y axis.
After running my code, everything is fine except the second, three, and fourth subplots where the normal distribution curve doesn't fit the histogram very well (please see the figure here)
I did googling but failed to get this issue solved. However, if I set "sharex=True, sharey=False" in my code, then the figure looks correct, but all subplots use their own Y axix which isn't what I want. Please see the figure here
Hope this issue can be fixed by experts in StackOverflow. Many thanks in advance!
Below is my code:
import matplotlib.pyplot as plt
from scipy.stats import norm
def align_yaxis(ax1, v1, ax2, v2):
#adjust ax2 ylimit so that v2 in ax2 is aligned to v1 in ax1
_, y1 = ax1.transData.transform((0, v1))
_, y2 = ax2.transData.transform((0, v2))
inv = ax2.transData.inverted()
_, dy = inv.transform((0, 0)) - inv.transform((0, y1-y2))
miny, maxy = ax2.get_ylim()
ax2.set_ylim(miny+dy, maxy+dy)
def drawSingle(myax, mydf , title, offset):
num_bins = 200
xs = mydf["gap"]
x = np.linspace(-1,1,1000)
mu =np.mean(x)
sigma =np.std(xs)
n, bins, patche = myax.hist(xs, num_bins, alpha=0.8, facecolor='blue', density=False)
myax.set_ylabel('frequency',color="black",fontsize=12, weight = "bold")
myax.set_xlabel('X', fontsize=12, weight = "bold",horizontalalignment='center')
ax_twin = myax.twinx()
y_normcurve = norm.pdf(bins, mu, sigma)
ax_twin.plot(bins, y_normcurve, 'r--')
align_yaxis(myax,0,ax_twin,0)
peakpoint = norm.pdf(mu,loc=mu,scale=sigma)
plt.vlines(mu, 0, peakpoint, 'y', '--', label='example')
ax_twin.set_ylabel("probablility dense",color="black",fontsize=12, weight = "bold")
def drawSubplots(mydf1,mydf2,mydf3,mydf4, pos1,pos2,pos3,pos4, title, filename):
plt.rcParams['figure.figsize'] = (18,15 )
my_x_ticks = np.arange(-0.8, 0.8,0.1)
rows, cols = 2, 2
fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
drawSingle(ax[0][0], mydf1, "Subplot1", pos1)
drawSingle(ax[0][1], mydf2, "Subplot2", pos2)
drawSingle(ax[1][0], mydf3, "Subplot3", pos3)
drawSingle(ax[1][1], mydf4, "Subplot4", pos4)
plt.text(-1, -1, title, horizontalalignment='center', fontsize=18)
plt.show()
drawSubplots(df1, df2,df3,df4,3.2,3.1,2.7,2.85,"test9", "test9")

Here is an attempt to:
have the left y-axes being "frequency" (which is very uninformative in the case of the current bin widths) and shared among the 4 subplots
have the right y-axes be a "probability density"; note how the top of all gaussians is around y=0.02 (the twin axes can only be set at the end because the shared y axes can be updated via later subplots)
have the histogram and the normal curve aligned
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import norm
def drawSingle(myax, mydf, title):
num_bins = 200
xs = mydf["gap"]
x = np.linspace(-1, 1, 1000)
mu = np.mean(x)
sigma = np.std(xs)
n, bins, patches = myax.hist(xs, num_bins, alpha=0.8, facecolor='blue', density=False)
myax.set_ylabel('frequency', color="black", fontsize=12, weight="bold")
myax.set_xlabel('X', fontsize=12, weight="bold", horizontalalignment='center')
normalization_factor = len(xs) * (bins[1] - bins[0])
y_normcurve = norm.pdf(x, mu, sigma) * normalization_factor
myax.plot(x, y_normcurve, 'r--')
myax.vlines(mu, 0, y_normcurve.max(), 'y', '--', color='lime', label='example')
return normalization_factor
def drawSubplots(mydf1, mydf2, mydf3, mydf4, title):
plt.rcParams['figure.figsize'] = (18, 15)
fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
dfs = [mydf1, mydf2, mydf3, mydf4]
norm_factors = [drawSingle(ax_i, df, title)
for ax_i, df, title in zip(ax.ravel(), dfs, ["Subplot1", "Subplot2", "Subplot3", "Subplot4"])]
for ax_i, norm_factor in zip(ax.ravel(), norm_factors):
ax_twin = ax_i.twinx()
ymax = ax_i.get_ylim()[1]
ax_twin.set_ylim(0, ymax / norm_factor)
plt.suptitle(title, fontsize=18)
plt.tight_layout()
plt.show()
df1, df2, df3, df4 = [pd.DataFrame({"gap": np.random.normal(0, 0.2, n)}) for n in [6000, 4000, 1800, 1200]]
drawSubplots(df1, df2, df3, df4, "Title")

Many thanks JohanC, you are amazing.
Based on your code, I just added a few lines of code within drawSubplots function in order to make 95% of the Gaussian curve area shaded between the lower bound and upper bound for each subplot. The following is my try. It seems that ax_twin.fill_between doesn't work normally here. As you could see from the figure that the shaded area is out of the Gaussian curve enter image description here. What I want is only to shade the area under the Gaussian curve between the lower bound and upper bound. If you don't mind, would you please check it out my mistake? Thank you very much!
import matplotlib.pyplot as plt
import math
from scipy.stats import norm
def align_yaxis(ax1, v1, ax2, v2):
#adjust ax2 ylimit so that v2 in ax2 is aligned to v1 in ax1
_, y1 = ax1.transData.transform((0, v1))
_, y2 = ax2.transData.transform((0, v2))
inv = ax2.transData.inverted()
_, dy = inv.transform((0, 0)) - inv.transform((0, y1-y2))
miny, maxy = ax2.get_ylim()
ax2.set_ylim(miny+dy, maxy+dy)
def drawSingle(myax, mydf , title):
num_bins = 200
xs = mydf["gap"]
x = np.linspace(-1,1,1000)
mu =np.mean(xs)
sigma =np.std(xs)
n, bins, patches = myax.hist(xs, num_bins, alpha=0.8, facecolor='blue', density=False)
myax.set_ylabel('Frequency', color="black", fontsize=12, weight="bold")
myax.set_xlabel(title, fontsize=12, weight="bold", horizontalalignment='center')
normalization_factor = len(xs) * (bins[1] - bins[0])
y_normcurve = norm.pdf(x, mu, sigma) * normalization_factor
myax.plot(x, y_normcurve, 'r--')
myax.vlines(mu, 0, y_normcurve.max(), 'y', '--', color='lime', label='example')
plt.xlim(-0.8,0.8)
my_x_ticks = np.arange(-0.8, 0.8,0.1)
plt.xticks(my_x_ticks)
return normalization_factor, mu, sigma
def drawSubplots(mydf1,mydf2,mydf3,mydf4, title):
plt.rcParams['figure.figsize'] = (18,15 )
norm_factors = []
mus = []
sigmas = []
my_x_ticks = np.arange(-0.8, 0.8,0.1)
rows, cols = 2, 2
fig, ax = plt.subplots(nrows=rows, ncols=cols, sharex=True, sharey=True)
dfs = [mydf1, mydf2, mydf3, mydf4]
#norm_factors = [drawSingle(ax_i, df, title)
#for ax_i, df, title in zip(ax.ravel(), dfs, ["Subplot1", "Subplot2", "Subplot3", "Subplot4"])]
for ax_i, df, title in zip(ax.ravel(), dfs, ["Subplot1", "Subplot2", "Subplot3", "Subplot4"]):
norm_factor, mu, sigma = drawSingle(ax_i, df, title)
norm_factors.append(norm_factor)
mus.append(mu)
sigmas.append(sigma)
for ax_i, norm_factor, mu, sigma in zip(ax.ravel(), norm_factors, mus, sigmas ):
ax_twin = ax_i.twinx()
xmax = ax_i.get_xlim()[1]
ax_twin.set_ylim(0, xmax / norm_factor)
ax_twin.set_ylabel("probablility dense",color="black",fontsize=12, weight = "bold")
CI_95_lower = mu - (1.96*sigma)
CI_95_upper = mu + (1.96*sigma)
px_shaded = np.arange(CI_95_lower,CI_95_upper,0.1)
ax_twin.fill_between(px_shaded,norm.pdf(px_shaded,loc=mu,scale=sigma) * norm_factor,alpha=0.75, color='pink')
area_shaded_95_CI = norm.cdf(x=CI_95_upper, loc=mu, scale=sigma)-norm.cdf(x=CI_95_lower, loc=mu, scale=sigma)
ax_twin.text(-0.06,0.01,str(round(area_shaded_95_CI*100,1))+"%", fontsize=20)
ax_twin.annotate(s=f'lower bound= {CI_95_lower:.3f}',xy=(CI_95_lower,norm.pdf(CI_95_lower,loc=mu,scale=sigma)),xytext=(-0.75,0.01),weight='bold',color='blue',\
arrowprops=dict(arrowstyle='-|>',connectionstyle='arc3',color='green'),\
fontsize=12
)
ax_twin.annotate(s=f'upper bound= {CI_95_upper:.3f}',xy=(CI_95_upper,norm.pdf(CI_95_upper,loc=mu,scale=sigma)),xytext=(0.28,0.01),weight='bold',color='blue',\
arrowprops=dict(arrowstyle='-|>',connectionstyle='arc3',color='green'),\
fontsize=12
)
ax_twin.text(0.05, 0.03, r"$\mu=" + f'{mu:.6f}' + ", \sigma=" + f'{sigma:.6f}' + "$" + ", confidence interval=95%" ,
horizontalalignment='center', fontsize=15)
plt.suptitle(title, fontsize=18)
plt.tight_layout()
plt.show()
df1, df2, df3, df4 = [pd.DataFrame({"gap": np.random.normal(0, 0.2, n)}) for n in [6000, 4000, 1800, 1200]]
drawSubplots(df1, df2, df3, df4, "Title")

Matplotlib fill_between edge

I need to create a plot as close to this picture as possible (given the generated dataframe code below):
And here's the output plot of my code:
What I am having problems with is:
The edge of fill_between is not sharp as in the picture. What I have is some kind of white shadow. How do I change the line between the fillings to match a target picture?
How do I aling legend color lines to the center, but not to the left border which my code does?
Here's my code:
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import numpy as np
import pandas as pd
ncols = 10
figsize = (20, 5)
fontsize = 14
dti = pd.date_range('2013-01-01', '2020-12-31', freq='2W')
probabilities_in_time = np.random.random((ncols, len(dti)))
probabilities_in_time = probabilities_in_time / \
probabilities_in_time.sum(axis=0)
probabilities_in_time = pd.DataFrame(probabilities_in_time).T
probabilities_in_time.index = dti
cm_subsection = np.linspace(0, 1, ncols)
colors = [cm.coolwarm(x) for x in cm_subsection]
def plot_time_probabilities(probabilities_in_time, figsize):
plt.figure(figsize=figsize)
plt.yticks(np.arange(0, 1.2, 0.2), fontsize=fontsize)
plt.xticks(fontsize=fontsize)
draw_stack_plot(colors, probabilities_in_time)
set_grid()
set_legend()
plt.show()
def draw_stack_plot(colors, probabilities_in_time):
for i, color in enumerate(colors):
if i == 0:
plt.plot(probabilities_in_time[i], color=color)
plt.fill_between(probabilities_in_time.index,
probabilities_in_time[0], color=color)
else:
probabilities_in_time[i] += probabilities_in_time[i-1]
plt.fill_between(probabilities_in_time.index,
probabilities_in_time[i], probabilities_in_time[i-1],
color=color)
plt.plot(probabilities_in_time[i], label=' Probability: {}'.format(
i), color=color)
def set_grid():
ax = plt.gca()
ax.set_axisbelow(False)
ax.xaxis.grid(True, linestyle='-', lw=1)
def set_legend():
leg = plt.legend(loc='lower left', fontsize=14, handlelength=1.3)
for i in leg.legendHandles:
i.set_linewidth(12)
plot_time_probabilities(probabilities_in_time, figsize)

To set the legend in the center, you can set loc='center', or you can put the legend outside. To avoid that the legend handles grow to larger, you can leave out .set_linewidth(12) (this sets a very wide edge width of 12 points).
Shifting the colors by one position can help to show the fill borders more pronounced. To still have a correct legend, the label should then be added to fill_between.
The code below also tries to simplify part of the calls:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
ncols = 10
figsize = (20, 5)
fontsize = 14
dti = pd.date_range('2013-01-01', '2020-12-31', freq='2W')
probabilities_in_time = np.random.random((ncols, len(dti)))
probabilities_in_time = probabilities_in_time / probabilities_in_time.sum(axis=0)
probabilities_in_time = pd.DataFrame(probabilities_in_time).T
probabilities_in_time.index = dti
cm_subsection = np.linspace(0, 1, ncols)
colors = cm.coolwarm(cm_subsection)
def plot_time_probabilities(probabilities_in_time, figsize):
plt.figure(figsize=figsize)
plt.yticks(np.arange(0, 1.2, 0.2), fontsize=fontsize)
plt.xticks(fontsize=fontsize)
draw_stack_plot(colors, probabilities_in_time)
set_grid()
set_legend()
# plt.margins(x=0, y=0)
plt.margins(x=0.02)
plt.tight_layout()
plt.show()
def draw_stack_plot(colors, probabilities_in_time):
current_probabilities = 0
for i, color in enumerate(colors):
plt.fill_between(probabilities_in_time.index,
probabilities_in_time[i] + current_probabilities, current_probabilities,
color=color, label=f' Probability: {i}')
current_probabilities += probabilities_in_time[i]
plt.plot(current_probabilities,
color=colors[0] if i <= 1 else colors[-1] if i >= 8 else colors[i - 1] if i < 5 else colors[i + 1])
def set_grid():
ax = plt.gca()
ax.set_axisbelow(False)
ax.xaxis.grid(True, linestyle='-', lw=1)
def set_legend():
leg = plt.legend(loc='lower left', fontsize=14, handlelength=1.3)
# leg = plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1), fontsize=14, handlelength=1.3)
# for i in leg.legendHandles:
# i.set_linewidth(12)
plot_time_probabilities(probabilities_in_time, figsize)

How to plot normal distribution with percentage of data as label in each band/bin?

While plotting normal distribution graph of data, how can we put labels like in image below for percentage of data in each bin where each band has a width of 1 standard deviation using matplotlib/seaborn or plotly ?
Currently, im plotting like this:
hmean = np.mean(data)
hstd = np.std(data)
pdf = stats.norm.pdf(data, hmean, hstd)
plt.plot(data, pdf)

Although I've labelled the percentages between the quartiles, this bit of code may be helpful to do the same for the standard deviations.
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
from matplotlib.mlab import normpdf
# dummy data
mu = 0
sigma = 1
n_bins = 50
s = np.random.normal(mu, sigma, 1000)
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
#histogram
n, bins, patches = axes[1].hist(s, n_bins, normed=True, alpha=.1, edgecolor='black' )
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75)
print(q1, median, q3)
#probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)
#to ensure pdf and bins line up to use fill_between.
bins_1 = bins[(bins >= q1-1.5*(q3-q1)) & (bins <= q1)] # to ensure fill starts from Q1-1.5*IQR
bins_2 = bins[(bins <= q3+1.5*(q3-q1)) & (bins >= q3)]
pdf_1 = pdf[:int(len(pdf)/2)]
pdf_2 = pdf[int(len(pdf)/2):]
pdf_1 = pdf_1[(pdf_1 >= norm(mu,sigma).pdf(q1-1.5*(q3-q1))) & (pdf_1 <= norm(mu,sigma).pdf(q1))]
pdf_2 = pdf_2[(pdf_2 >= norm(mu,sigma).pdf(q3+1.5*(q3-q1))) & (pdf_2 <= norm(mu,sigma).pdf(q3))]
#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
axes[1].fill_between(bins_1, pdf_1, 0, alpha=.6, color='orange')
axes[1].fill_between(bins_2, pdf_2, 0, alpha=.6, color='orange')
print(norm(mu, sigma).cdf(median))
print(norm(mu, sigma).pdf(median))
#add text to bottom graph.
axes[1].annotate("{:.1f}%".format(100*norm(mu, sigma).cdf(q1)), xy=((q1-1.5*(q3-q1)+q1)/2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)-norm(mu, sigma).cdf(q1))), xy=(median, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+1.5*(q3-q1)-q3)-norm(mu, sigma).cdf(q3))), xy=((q3+1.5*(q3-q1)+q3)/2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')
axes[1].set_ylabel('probability')
#top boxplot
axes[0].boxplot(s, 0, 'gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')
plt.subplots_adjust(hspace=0)
plt.show()

Since I unfortunately can't comment. Here is an alternative for #MaMo and #Chipmunk_da.
The problem is that the arrays 'bins_1, pdf_1' and 'bins_2, pdf_2' have different sizes. I solved it a bit rudimentary with the code lines written below, but it worked. Since now all arrays have the same size and map the variables of the Gaussian distribution.
The bounds are now no longer solved by the function of the comparison characters, as by #Chris, but with the definition of two variables 'bins_1, bins_2' and the NumPy function 'np.linspace'.
bins_1 = np.linspace(q1-1.5*(q3-q1), q1, n_bins, dtype=float)
pdf_1 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins_1-mu)**2/(2*sigma**2))
bins_2 = np.linspace(q3+1.5*(q3-q1), q3, n_bins, dtype=float)
pdf_2 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins_2-mu)**2/(2*sigma**2))

Annotate the quartiles with Matplotlib in a normal distribution plot

I'm working with a data-set, so far i have made a histogram with a overlayed normal distribution curve.
I want to mark out the quartiles as in this image (the box plot is for reference).
This is the code i'm working with:
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
depDelay.sort()
plt.hist(depDelay, bins=100, normed=True)
hmean = np.mean(depDelay)
hstd = np.std(depDelay)
pdf = stats.norm.pdf(depDelay, hmean, hstd)
markers = [np.percentile(depDelay,50)]
plt.plot(DepDelay, pdf,'-o',markevery=markers)
plt.title('Distribution of Departure Delay')
plt.xlabel('Departure Delay (in mins)')
plt.ylabel('Frequency')
plt.savefig('depDelayNormDist.png')
plt.show()
How can i plot the same using matplotlib ?

I've tried to replicate the referenced image somewhat. Not sure what precisely you meant by marking the quartiles, but I've put in labels for Q1 and Q3 at the pdf and percentages in between the quartiles.
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
from matplotlib.mlab import normpdf
# dummy data
mu = 0
sigma = 1
n_bins = 50
s = np.random.normal(mu, sigma, 1000)
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
#histogram
n, bins, patches = axes[1].hist(s, n_bins, normed=True, alpha=.1, edgecolor='black' )
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75)
print(q1, median, q3)
#probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)
#to ensure pdf and bins line up to use fill_between.
bins_1 = bins[(bins >= q1-1.5*(q3-q1)) & (bins <= q1)] # to ensure fill starts from Q1-1.5*IQR
bins_2 = bins[(bins <= q3+1.5*(q3-q1)) & (bins >= q3)]
pdf_1 = pdf[:int(len(pdf)/2)]
pdf_2 = pdf[int(len(pdf)/2):]
pdf_1 = pdf_1[(pdf_1 >= norm(mu,sigma).pdf(q1-1.5*(q3-q1))) & (pdf_1 <= norm(mu,sigma).pdf(q1))]
pdf_2 = pdf_2[(pdf_2 >= norm(mu,sigma).pdf(q3+1.5*(q3-q1))) & (pdf_2 <= norm(mu,sigma).pdf(q3))]
#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
axes[1].fill_between(bins_1, pdf_1, 0, alpha=.6, color='orange')
axes[1].fill_between(bins_2, pdf_2, 0, alpha=.6, color='orange')
print(norm(mu, sigma).cdf(median))
print(norm(mu, sigma).pdf(median))
#add text to bottom graph.
axes[1].annotate("{:.1f}%".format(100*norm(mu, sigma).cdf(q1)), xy=((q1-1.5*(q3-q1)+q1)/2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)-norm(mu, sigma).cdf(q1))), xy=(median, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+1.5*(q3-q1)-q3)-norm(mu, sigma).cdf(q3))), xy=((q3+1.5*(q3-q1)+q3)/2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')
axes[1].set_ylabel('probability')
#top boxplot
axes[0].boxplot(s, 0, 'gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')
plt.subplots_adjust(hspace=0)
plt.show()
FYI, I've answered this similar question as well.

I updated the answer in form of a function following similar posts including creating dashed lines on KDE plot having quantiles:
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
#from matplotlib.mlab import normpdf #check this: https://github.com/materialsproject/pymatgen/issues/1657
def KDE_hist_plot(df):
for col in df.columns:
n_bins = 50
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10,5))
#histogram
n, bins, patches = axes[1].hist(df[col], n_bins, density=True, alpha=.1, edgecolor='black' )
#data = pd.Series(s)
mu = df[col].mean()
sigma = df[col].std()
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
median, q1, q3 = np.percentile(df[col], 50), np.percentile(df[col], 25), np.percentile(df[col], 75)
#probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)
#axes[1].figsize=(10,20)
#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
iqr = 1.5 * (q3-q1)
x1 = np.linspace(q1 - iqr, q1)
x2 = np.linspace(q3, q3 + iqr)
pdf1 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x1-mu)**2/(2*sigma**2))
pdf2 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x2-mu)**2/(2*sigma**2))
axes[1].fill_between(x1, pdf1, 0, alpha=.6, color='orange')
axes[1].fill_between(x2, pdf2, 0, alpha=.6, color='orange')
#add text to bottom graph.
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q1) -norm(mu, sigma).cdf(q1-iqr))), xy=(q1-iqr/2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3) -norm(mu, sigma).cdf(q1) )), xy=(median , 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+iqr)-norm(mu, sigma).cdf(q3) )), xy=(q3+iqr/2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')
#dashed lines
plt.axvline(df[col].quantile(0),color='b', linestyle='-.')
plt.axvline(df[col].quantile(0.25),color='g', linestyle='--')
plt.axvline(df[col].quantile(0.50),color='g', linestyle='--')
plt.axvline(df[col].quantile(0.75),color='b', linestyle='--')
plt.axvline(df[col].quantile(1),color='r', linestyle='-.')
axes[1].set_ylabel('Probability Density')
#top boxplot
axes[0].boxplot(df[col], 0, 'gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')
Please see the results below for df with 2 columns/attributes and working function in colab notebook:
KDE_hist_plot(df)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Plotting annotated gaussian curve - python

Related

Matplotlib, plot a vector of numbers as a rectangle filled with numbers

normal distribution curve doesn't fit well over histogram in subplots using matplotlib

Matplotlib fill_between edge

How to plot normal distribution with percentage of data as label in each band/bin?

Annotate the quartiles with Matplotlib in a normal distribution plot

Categories

Resources