I attempted to plot the kernel density (Gaussian) curve along with the histograms of two data sets in Python.
However, in my script, estimating the 95% point (data1: marked by the red vertical line) and the 5% point (data2: marked by the black vertical line) is very time-consuming: I have to test different limits (explained in detail in the code, where I change the upper limit) until I hit the 95% and 5% probabilities of the kernel density curve.
Could someone help me out here and suggest a way to fix this issue, or another approach to plot the kernel density curve along with its 95% and 5% points?
Thank you!
My script is below.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = result['95_24']  # data 1
data2 = result['5_24']   # data 2
def plot_prob_density(data1, data2, x_start1, x_end1):
    fig, ax1 = plt.subplots(1, 1, figsize=(6, 5), sharey=False)
    unit = 1.5
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]
    # Histogram plot of the data
    ax1.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)
    ax1.hist(data2, bins=np.linspace(-20, 20, 40), density=True, color='k', alpha=0.4)
    # Kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
    kd_data2 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data2)
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    kd_vals_data2 = np.exp(kd_data2.score_samples(x))
    # Density plot
    ax1.plot(x, kd_vals_data1, color='r', label='$Na$', linewidth=2)
    ax1.plot(x, kd_vals_data2, color='k', label='$Λ$', linewidth=2)
    # Vertical lines at the 5% and 95% values found with get_probability below
    ax1.axvline(x=x_end1, color='red', linestyle='dashed', linewidth=3, label='$β_{95\%}$')
    ax1.axvline(x=x_start1, color='k', linestyle='dashed', linewidth=3, label='$β_{5\%}$')
    # Axis labels, limits, ticks and legend
    ax1.set_ylabel('Probability density', fontsize=12)
    ax1.set_xlabel('Beta', fontsize=12)
    ax1.set_xlim([-20, 20])
    ax1.set_ylim(0, 0.3)
    ax1.set_yticks([0, 0.1, 0.2, 0.3])
    ax1.set_xticks([-20, -10, 0, 10, 20])
    ax1.legend(fontsize=12, loc='upper left', frameon=False)
    fig.tight_layout()
    gc.collect()
    return kd_data1, kd_data2
# Calculation of the 95% and 5% points for the data1 and data2 kernel density curves
def get_probability(start_value, end_value, eval_points, kd):
    # Number of evaluation points
    N = eval_points
    step = (end_value - start_value) / (N - 1)  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]  # Generate values in the range
    kd_vals = np.exp(kd.score_samples(x))  # Get PDF values for each x
    probability = np.sum(kd_vals * step)  # Approximate the integral of the PDF
    return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
data2 = np.array(data2).reshape(-1, 1)
kd_data1, kd_data2= plot_prob_density(data1, data2, x_start1=-2.2, x_end1=5.3)
# ##############################
print('Beta-95%: {}'.format(get_probability(start_value=-20,
                                            end_value=5.3,
                                            eval_points=1000,
                                            kd=kd_data1)))
# Here I modify the end_value every time and check the output; when it reaches 95% I take
# that value as the 95% limit. However, this is very tedious. I want to compute this 95%
# point directly, and the same for the 5% probability, computed below:
print('Beta-5%: {}\n'.format(get_probability(start_value=-20,
                                             end_value=-2.2,
                                             eval_points=1000,
                                             kd=kd_data2)))
####################################################################
plt.savefig("Ev_test.png")
The resulting figure is attached here.
Histogram and kernel density plot with the 95% and 5% probability limits highlighted by bold red and black vertical lines:
Here is a possible way to work around this issue. However, the method below has an error in the percentile calculation, so I recommend not using it:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
fig = plt.figure(figsize=(4, 4), dpi=300)
ax = fig.add_subplot(111)
# Plot the histograms
ax.hist(data8, bins=20, zorder=1, color="r", density=True, alpha=0.6)
ax.hist(data7, bins=20, zorder=1, color="black", density=True, alpha=0.6)
# Kernel density estimation
kde = sm.nonparametric.KDEUnivariate(data8)
kde1 = sm.nonparametric.KDEUnivariate(data7)
# Plot the KDE for various bandwidths
for bandwidth in [1.8]:
    kde.fit(bw=bandwidth)   # estimate the densities
    kde1.fit(bw=bandwidth)
    ax.plot(kde.support, kde.density, "-", lw=2, color="r", zorder=10, alpha=0.6, label="Data1")
    ax.plot(kde1.support, kde1.density, "-", lw=2, color="black", zorder=10, alpha=0.6, label="Data2")
ax.legend(loc="best")
ax.set_xlim([-20, 40])
ax.set_ylim([0, 0.3])
ax.grid(False)
# Probabilities calculation via the inverse CDF of the KDE
quantiles_mesh = np.linspace(0, 1, len(kde.density))
fig = plt.figure(figsize=(2, 2), dpi=300)
plt.plot(quantiles_mesh, kde.icdf)
data_1_95 = np.percentile(kde.icdf, 95)   # kde was fitted on data8 (Data1); the original used kde1 here
data_2_5 = np.percentile(kde1.icdf, 5)    # the original referenced an undefined kde2; kde1 is assumed
ax.axvline(x=data_1_95, color='red', linestyle='dashed', linewidth=2)
ax.axvline(x=data_2_5, color='k', linestyle='dashed', linewidth=2)
# plt.savefig("KDE_Plot.png")
Summary of Question:
Why is the density estimated from my sample so different from the pmf, and how can I perform this simulation so that the pmf and the sample estimates are similar?
Question:
I have simulated a sample of independent Bernoulli trials using scipy. I am now trying to take a density histogram of the sample I created and compare it to the pmf (probability mass function). I would expect the density histogram to show two bars, each hovering near the pmf values, but instead the two bars sit at a height of about 5, well above the pmf. Could someone please show me how to create a density histogram that does not do this for the Bernoulli? I tried a similar simulation with a few other distributions and it seemed to work fine. What am I missing here, and could you show me how to adjust my code to make this work?
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
trials = 10**3
p = 0.5
sample_bernoulli = stats.bernoulli.rvs(p, size=trials) # Generate Bernoulli RVs
plt.plot((0,1), stats.bernoulli.pmf((0,1), p), 'bo', ms=8, label='bernoulli pmf')
# Density histogram of generated values
plt.hist(sample_bernoulli, density=True, alpha=0.5, color='steelblue', edgecolor='none')
plt.show()
I must apologize if this is a simple or trivial question but I couldn't find a solution online and found the issue interesting. Any help at all would be appreciated.
The reason is that plt.hist is primarily meant to work with continuous distributions. If you don't provide explicit bin boundaries, plt.hist just creates 10 equally spaced bins between the minimum and maximum value. Most of these bins will be empty. With only two possible data values, there should be just two bins, so 3 boundaries:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
trials = 10**3
p = 0.5
sample_bernoulli = stats.bernoulli.rvs(p, size=trials) # Generate Bernoulli RVs
plt.plot((0,1), stats.bernoulli.pmf((0,1), p), 'bo', ms=8, label='bernoulli pmf')
# Density histogram of generated values
plt.hist(sample_bernoulli, density=True, alpha=0.5, color='steelblue', edgecolor='none', bins=np.linspace(-0.5, 1.5, 3))
plt.show()
Here is a visualization of the default bin boundaries and how the samples fit into the bins. Note that with density=True, the histogram is normalized such that the area of all the bars sums to 1. In this case two bars are 0.1 wide and about 5.0 high, while 8 others have height zero. So, the total area is 2*0.1*5 + 8*0.0 = 1.
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
trials = 10 ** 3
p = 0.5
sample_bernoulli = stats.bernoulli.rvs(p, size=trials) # Generate Bernoulli RVs
# Density histogram of generated values with default bins
values, binbounds, bars = plt.hist(sample_bernoulli, density=True, alpha=0.2, color='steelblue', edgecolor='none')
# show the bin boundaries
plt.vlines(binbounds, 0, max(values) * 1.05, color='crimson', ls=':')
# show the sample values with a random displacement
plt.scatter(sample_bernoulli * 0.9 + np.random.uniform(0, 0.1, trials),
np.random.uniform(0, max(values), trials), color='lime')
# show the index of each bin
for i in range(len(binbounds) - 1):
    plt.text((binbounds[i] + binbounds[i + 1]) / 2, max(values) / 2, i, ha='center', va='center', fontsize=20, color='crimson')
plt.show()
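As a quick numerical check (an addition, not in the original answer), the claim that the bar areas sum to 1 with density=True can be verified directly:
values, binbounds = np.histogram(sample_bernoulli, density=True)
print(np.sum(values * np.diff(binbounds)))  # -> 1.0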
In Python, I have estimated the parameters of a density model for my distribution, and I would like to plot the density function on top of the histogram of the data. In R this is similar to using the option prob=TRUE.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# initialization of the list "data"
# estimation of the parameter, in my case, mean and variance of a normal distribution
plt.hist(data, bins="auto") # data is the list of data
# here I would like to draw the density above the histogram
plt.show()
I guess the trickiest part is to make it fit.
Edit: I have tried this according to the first answer:
mean = np.mean(logdata)
var = np.var(logdata)
std = np.sqrt(var)  # standard deviation; normpdf expects the standard deviation, not the variance
plt.hist(logdata, bins="auto", alpha=0.5, label="empirical data")
x = np.linspace(min(logdata), max(logdata), 100)
plt.plot(x, mlab.normpdf(x, mean, std))
plt.xlabel("log(file size)")
plt.ylabel("number of files")
plt.legend(loc='upper right')
plt.grid(True)
plt.show()
But it doesn't fit the graph, here is how it looks:
Edit 2: Works with the option normed=True in the histogram function.
If I understand you correctly, you have the mean and standard deviation of some data. You have plotted a histogram of this and would like to plot the normal distribution line over the histogram. This line can be generated using matplotlib.mlab.normpdf(); the documentation can be found here.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
mean = 100
sigma = 5
data = np.random.normal(mean,sigma,1000) # generate fake data
x = np.linspace(min(data), max(data), 100)
plt.hist(data, bins="auto",normed=True)
plt.plot(x, mlab.normpdf(x, mean, sigma))
plt.show()
Which gives the following figure:
Edit: The above only works with normed = True. If this is not an option, we can define our own function:
def gauss_function(x, a, x0, sigma):
    return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2))
mean = 100
sigma = 5
data = np.random.normal(mean,sigma,1000) # generate fake data
x = np.linspace(min(data), max(data), 1000)
test = gauss_function(x, max(data), mean, sigma)
plt.hist(data, bins="auto")
plt.plot(x, test)
plt.show()
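As a side note (not part of the original answers, and assuming a newer matplotlib in which mlab.normpdf and the normed argument have been removed): the same overlay can be sketched with scipy.stats.norm.pdf, either normalising the histogram with density=True, or scaling the PDF by the sample size times the bin width so that it matches raw counts.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
mean, sigma = 100, 5
data = np.random.normal(mean, sigma, 1000)  # fake data, as in the answer above
x = np.linspace(min(data), max(data), 100)
# Option 1: normalised histogram, the PDF overlays directly
counts, bins, _ = plt.hist(data, bins="auto", density=True, alpha=0.5)
plt.plot(x, norm.pdf(x, mean, sigma))
plt.show()
# Option 2: raw counts, scale the PDF by N * bin width
counts, bins, _ = plt.hist(data, bins="auto", alpha=0.5)
bin_width = bins[1] - bins[0]
plt.plot(x, norm.pdf(x, mean, sigma) * len(data) * bin_width)
plt.show()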
Everything you are looking for is already in seaborn.
You just have to use distplot:
import seaborn as sns
import numpy as np
data = np.random.normal(5, 2, size=1000)
sns.distplot(data)
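As an aside not in the original answer: distplot is deprecated in newer seaborn releases (0.11 and later); assuming such a version, a roughly equivalent plot is:
import seaborn as sns
import numpy as np
data = np.random.normal(5, 2, size=1000)
# histogram with a KDE overlay, the axes-level replacement for distplot
sns.histplot(data, kde=True, stat="density")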
I'm trying to do a little bit of distribution plotting and fitting in Python using SciPy for stats and matplotlib for the plotting. I'm having good luck with some things like creating a histogram:
seed(2)
alpha=5
loc=100
beta=22
data=ss.gamma.rvs(alpha,loc=loc,scale=beta,size=5000)
myHist = hist(data, 100, normed=True)
Brilliant!
I can even take the same gamma parameters and plot the line function of the probability distribution function (after some googling):
rv = ss.gamma(5,100,22)
x = np.linspace(0,600)
h = plt.plot(x, rv.pdf(x))
How would I go about plotting the histogram myHist with the PDF line h superimposed on top of the histogram? I'm hoping this is trivial, but I have been unable to figure it out.
Just put both pieces together:
import scipy.stats as ss
import numpy as np
import matplotlib.pyplot as plt
alpha, loc, beta=5, 100, 22
data=ss.gamma.rvs(alpha,loc=loc,scale=beta,size=5000)
myHist = plt.hist(data, 100, normed=True)
rv = ss.gamma(alpha,loc,beta)
x = np.linspace(0,600)
h = plt.plot(x, rv.pdf(x), lw=2)
plt.show()
To make sure you get what you want in any specific plot instance, try creating a figure object first:
import scipy.stats as ss
import numpy as np
import matplotlib.pyplot as plt
# setting up the axes
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)
# now plot
alpha, loc, beta=5, 100, 22
data=ss.gamma.rvs(alpha,loc=loc,scale=beta,size=5000)
myHist = ax.hist(data, 100, normed=True)
rv = ss.gamma(alpha,loc,beta)
x = np.linspace(0,600)
h = ax.plot(x, rv.pdf(x), lw=2)
# show
plt.show()
One might also be interested in plotting the distribution function of any histogram.
This can be done using seaborn's kdeplot function:
import numpy as np # for random data
import pandas as pd # for convenience
import matplotlib.pyplot as plt # for graphics
import seaborn as sns # for nicer graphics
v1 = pd.Series(np.random.normal(0,10,1000), name='v1')
v2 = pd.Series(2*v1 + np.random.normal(60,15,1000), name='v2')
# plot a kernel density estimation over a stacked barchart
plt.figure()
plt.hist([v1, v2], histtype='barstacked', normed=True);
v3 = np.concatenate((v1,v2))
sns.kdeplot(v3);
plt.show()
From a Coursera course on data visualization with Python.
Expanding on Malik's answer, and trying to stick with vanilla NumPy, SciPy and Matplotlib. I've pulled in Seaborn, but it's only used to provide nicer defaults and small visual tweaks:
import numpy as np
import scipy.stats as sps
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks')
# parameterise our distributions
d1 = sps.norm(0, 10)
d2 = sps.norm(60, 15)
# sample values from above distributions
y1 = d1.rvs(300)
y2 = d2.rvs(200)
# combine mixture
ys = np.concatenate([y1, y2])
# create new figure with size given explicitly
plt.figure(figsize=(10, 6))
# add histogram showing individual components
plt.hist([y1, y2], 31, histtype='barstacked', density=True, alpha=0.4, edgecolor='none')
# get X limits and fix them
mn, mx = plt.xlim()
plt.xlim(mn, mx)
# add our distributions to figure
x = np.linspace(mn, mx, 301)
plt.plot(x, d1.pdf(x) * (len(y1) / len(ys)), color='C0', ls='--', label='d1')
plt.plot(x, d2.pdf(x) * (len(y2) / len(ys)), color='C1', ls='--', label='d2')
# estimate Kernel Density and plot
kde = sps.gaussian_kde(ys)
plt.plot(x, kde.pdf(x), label='KDE')
# finish up
plt.legend()
plt.ylabel('Probability density')
sns.despine()
This gives us the following plot:
I've tried to stick with a minimal feature set while producing relatively nice output, notably using SciPy to estimate the KDE is very easy.
I have data as a list of floats and I want to plot it as a histogram. The hist() function does the job perfectly for plotting the absolute histogram. However, I cannot figure out how to represent it in a relative frequency format: I would like to have it as a fraction, or ideally as a percentage, on the y-axis.
Here is the code:
fig = plt.figure()
ax = fig.add_subplot(111)
n, bins, patches = ax.hist(mydata, bins=100, normed=1, cumulative=0)
ax.set_xlabel('Bins', size=20)
ax.set_ylabel('Frequency', size=20)
ax.legend
plt.show()
I thought normed=1 argument would do it, but it gives fractions that are too high and sometimes are greater than 1. They also seem to depend on the bin size, as if they are not normalized by the bin size or something. Nevertheless, when I set cumulative=1, it nicely sums up to 1. So, where is the catch? By the way, when I feed the same data into Origin and plot it, it gives me perfectly correct fractions. Thank you!
That is because the normed option of hist returns the density of points, i.e. dN/dx.
What you need is something like this:
# assuming that mydata is an numpy array
ax.hist(mydata, weights=np.zeros_like(mydata) + 1. / mydata.size)
# this will give you fractions
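(A small aside, not in the original answer: scaling the same weights by 100 gives percentages instead of fractions.)
# percentages instead of fractions
ax.hist(mydata, weights=np.zeros_like(mydata) + 100. / mydata.size)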
Or you can use set_major_formatter to adjust the scale of the y-axis, as follows:
from matplotlib import ticker as tick
def adjust_y_axis(x, pos):
    return x / (len(mydata) * 1.0)
ax.yaxis.set_major_formatter(tick.FuncFormatter(adjust_y_axis))
Just register the formatter as above before plt.show().
For relative frequency format set the option density=True. The figure below shows a histogram for 1000 samples taken from a normal distribution with mean 5 and standard deviation 2.0.
The code is
import numpy as np
import matplotlib.pyplot as plt
# Generate data from a normal distribution
mu, sigma = 5, 2.0 # mean and standard deviation
mydata = np.random.normal(mu, sigma, 1000)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(mydata,bins=100,density=True);
plt.show()
If you want % on the y-axis you can use PercentFormatter as shown below
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
# Generate data from a normal distribution
mu, sigma = 5, 2.0 # mean and standard deviation
mydata = np.random.normal(mu, sigma, 1000)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(mydata,bins=100,density=False);
ax.yaxis.set_major_formatter(PercentFormatter(xmax=len(mydata)))  # 100% corresponds to all samples
plt.show()
You can use numpy.histogram to get the histogram values and bins, then calculate the frequency yourself. Finally, use a bar plot to draw the frequency histogram.
hist, edges = np.histogram(p_hat)
freq = hist / float(hist.sum())
width = np.diff(edges)  # edges are the bin boundaries
plt.bar(edges[:-1], freq, width=width, align="edge", ec="k")  # bars start at the left edge of each bin
plt.xlabel('x')
plt.ylabel('frequency')