Draw the density curve exactly on the Histogram without normalizing - python

I need to draw the density curve on the Histogram with the actual height of the bars (actual frequency) as the y-axis.
Try1:
I found a related answer here but, it has normalized the Histogram to the range of the curve.
Below is my code and the output.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from scipy.stats import norm
data = [125.36, 126.66, 130.28, 133.74, 126.92, 120.85, 119.42, 128.61, 123.53, 130.15, 126.02, 116.65, 125.24, 126.84,
125.95, 114.41, 138.62, 127.4, 127.59, 123.57, 133.76, 124.6, 113.48, 128.6, 121.04, 119.42, 120.83, 136.53, 120.4,
136.58, 121.73, 132.72, 109.25, 125.42, 117.67, 124.01, 118.74, 128.99, 131.11, 112.27, 118.76, 119.15, 122.42,
122.22, 134.71, 126.22, 130.33, 120.52, 126.88, 117.4]
(mu, sigma) = norm.fit(data)
x = np.linspace(min(data), max(data), 100)
plt.hist(data, bins=12, normed=True)
plt.plot(x, mlab.normpdf(x, mu, sigma))
plt.show()
Try2:
There #DavidG has given an option, a user defined function even it doesn't cover the density of the Histogram accurately.
def gauss_function(x, a, x0, sigma):
return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2))
test = gauss_function(x, max(data), mu, sigma)
plt.hist(data, bins=12)
plt.plot(x, test)
plt.show()
The result for this was,
But the actual Histogram is below, where Y-axis ranges from 0 to 8,
And I want to draw the density curve exactly on that. Any help this regards will be really appreciated.

Is this what you're looking for? I'm multiplying the pdf by the area of the histogram.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
data = [125.36, 126.66, 130.28, 133.74, 126.92, 120.85, 119.42, 128.61, 123.53, 130.15, 126.02, 116.65, 125.24, 126.84,
125.95, 114.41, 138.62, 127.4, 127.59, 123.57, 133.76, 124.6, 113.48, 128.6, 121.04, 119.42, 120.83, 136.53, 120.4,
136.58, 121.73, 132.72, 109.25, 125.42, 117.67, 124.01, 118.74, 128.99, 131.11, 112.27, 118.76, 119.15, 122.42,
122.22, 134.71, 126.22, 130.33, 120.52, 126.88, 117.4]
(mu, sigma) = norm.fit(data)
x = np.linspace(min(data), max(data), 100)
values, bins, _ = plt.hist(data, bins=12)
area = sum(np.diff(bins) * values)
plt.plot(x, norm.pdf(x, mu, sigma) * area, 'r')
plt.show()
Result:

Related

Fitting a log-normal function on array values using Python

I am trying to fit log-normal pdf on the matrix generated using the inbuilt log-normal function but it doesn't fit. I was wondering why it is off. The plot is attached for reference.
import numpy as np
import matplotlib.pyplot as plt
mu, sigma = 0.2, 0.5 # mean and standard deviation
A=np.random.lognormal(mean=0.2, sigma=0.5, size=(10, 10))
count, bins, ignored = plt.hist(A, 100, density=True, align='mid')
x = np.linspace(min(bins), max(bins), 10000)
pdf = (np.exp(-(np.log(x) - mu)**2 / (2 * sigma**2))/ (x * sigma * np.sqrt(2 * np.pi)))
plt.plot(x, pdf, linewidth=2, color='r')
plt.axis('tight')
plt.show()

Fitting the statistical distribution cdf function

can anyone explain me why the random points generated for the specific statistical distribution doesn't fit that distribution well. And why the distribution parameters that are obtained by cfd fitting of the random points look much better?
I'm experiencing the same thing also by using the scipy.stats..fit on the any data. It looks worse than cdf fitting...
I must be doing somewhere really bad assumption...
Here is an example:
import numpy as np
from scipy import stats
from scipy.optimize import curve_fit
from matplotlib import pyplot as plt
# fit by cdf
#########################################################
def fit_lognorm_cfd(x_data, p_data):
def lndist(x, sigma, loc, scale):
return stats.lognorm.cdf(x, sigma, loc, scale)
opt, pcov = curve_fit(lndist, x_data, p_data)
return stats.lognorm(*opt)
# create some random points
#########################################################
# define some parameters
shape = 0.8 # sigma
loc = 0
mue = -0.5
scale = np.exp(mue)
# orig distribution
lndist = stats.lognorm(shape, loc=loc, scale=scale)
# get some random points
vals = stats.lognorm.rvs(shape, size=100)
vals = np.sort(vals)
pvals = np.cumsum(np.full(vals.size, 1/vals.size))
# fit by cfd function
lndist2=fit_lognorm_cfd(vals, pvals)
x = np.linspace(lndist.ppf(0.001),
lndist.ppf(0.999), 100)
xlg = np.logspace(np.log10(lndist.ppf(0.001)),
np.log10(lndist.ppf(0.999)), 100)
# plot the results
#########################################################
fig, (ax1, ax2) = plt.subplots(figsize=(10,5), ncols=2)
hist=ax1.hist(vals, bins=30,density=True, histtype='stepfilled', alpha=0.2)
pdf=ax1.plot(x, lndist.pdf(x), 'k-', lw=2, label="orig")
cdf=ax2.plot(xlg, lndist.cdf(xlg), 'k-', lw=2, label="orig")
pdf2=ax1.plot(x, lndist2.pdf(x), 'm-', lw=2, label="curve_fit on cfd")
pts=ax2.plot(vals, pvals, "bo")
cdf2=ax2.plot(xlg, lndist2.cdf(xlg), 'm-', lw=2, label="curve_fit on cfd")
ax1.grid(which="both")
ax1.legend()
ax1.set_title("pdf")
ax2.set_xscale("log")
ax2.grid(which="both")
ax2.set_title("cdf")
ax2.set_ylim((0,1))
ax2.legend()
fig.tight_layout()
fig.savefig("example.png", dpi=150)
plot_from_the_code_above
Thanks a lot!!!

Python Generate a random Maxwell distribution from a normal distribution

I have a set of data that follows a normal distribution in which I can fit the histogram and obtain the mean and sigma.
For the sake of example, I will approximate it by generating a random normal distribution as follows:
from scipy.stats import maxwell
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.optimize import curve_fit
from IPython import embed # put embed() where you want to stop
import matplotlib.ticker as ticker
data = random.gauss(307, 16)
N, bins, patches = plt.hist(data, bins=40, density=True, alpha=0.5, histtype='bar', ec='black')
mu, std = norm.fit(data)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2, label= r'$\mu$ = '+'{:0.1f}'.format(mu)+r' $\pm$ '+'{:0.1f}'.format(std))
What I would like to do next is to generate a Maxwell distribution from this "normal" distribution and be able to fit
I have read scipy.stats.maxwell webpage and several other related questions but was not able to generate such a distribution from "a gauss distribution" and fit it. Any help would much appreciate it.
Well, knowing that each Maxwell is distribution of the absolute value of the molecule velocity, where each component is normally distributed, you could make sampling like code below
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import maxwell
def maxw(size = None):
"""Generates size samples of maxwell"""
vx = np.random.normal(size=size)
vy = np.random.normal(size=size)
vz = np.random.normal(size=size)
return np.sqrt(vx*vx + vy*vy + vz*vz)
mdata = maxw(100000)
h, bins = np.histogram(mdata, bins = 101, range=(0.0, 10.0))
x = np.linspace(0.0, 10.0, 100)
rv = maxwell()
fig, ax = plt.subplots(1, 1)
ax.hist(mdata, bins = bins, density=True)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='Maxwell pdf')
plt.title("Maxwell")
plt.show()
And here is the picture with sampling and Maxwell PDF overlapped

Plot a density function above a histogram

In Python, I have estimated the parameters for the density of a model of my distribution and I would like to plot the density function above the histogram of the distribution. In R it is similar to using the option prop=TRUE.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# initialization of the list "data"
# estimation of the parameter, in my case, mean and variance of a normal distribution
plt.hist(data, bins="auto") # data is the list of data
# here I would like to draw the density above the histogram
plt.show()
I guess the trickiest part is to make it fit.
Edit: I have tried this according to the first answer:
mean = np.mean(logdata)
var = np.var(logdata)
std = np.sqrt(var) # standard deviation, used by numpy as a replacement of the variance
plt.hist(logdata, bins="auto", alpha=0.5, label="données empiriques")
x = np.linspace(min(logdata), max(logdata), 100)
plt.plot(x, mlab.normpdf(x, mean, std))
plt.xlabel("log(taille des fichiers)")
plt.ylabel("nombre de fichiers")
plt.legend(loc='upper right')
plt.grid(True)
plt.show()
But it doesn't fit the graph, here is how it looks:
** Edit 2 ** Works with the option normed=True in the histogram function.
If I understand you correctly you have the mean and standard deviation of some data. You have plotted a histogram of this and would like to plot the normal distribution line over the histogram. This line can be generated using matplotlib.mlab.normpdf(), the documentation can be found here.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
mean = 100
sigma = 5
data = np.random.normal(mean,sigma,1000) # generate fake data
x = np.linspace(min(data), max(data), 100)
plt.hist(data, bins="auto",normed=True)
plt.plot(x, mlab.normpdf(x, mean, sigma))
plt.show()
Which gives the following figure:
Edit: The above only works with normed = True. If this is not an option, we can define our own function:
def gauss_function(x, a, x0, sigma):
return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2))
mean = 100
sigma = 5
data = np.random.normal(mean,sigma,1000) # generate fake data
x = np.linspace(min(data), max(data), 1000)
test = gauss_function(x, max(data), mean, sigma)
plt.hist(data, bins="auto")
plt.plot(x, test)
plt.show()
All what you are looking for, already are in seaborn.
You just have to use distplot
import seaborn as sns
import numpy as np
data = np.random.normal(5, 2, size=1000)
sns.distplot(data)

non-random sampling versions of np.random.normal

I'm trying to generate a single array that follows an exact gaussian distribution. np.random.normal sort of does this by randomly sampling from a gaussian, but how can I reproduce and exact gaussian given some mean and sigma. So the array would produce a histogram that follows an exact gaussian, not just an approximate gaussian as shown below.
mu, sigma = 10, 1
s = np.random.normal(mu, sigma, 1000)
fig = figure()
ax = plt.axes()
totaln, bbins, patches = ax.hist(s, 10, normed = 1, histtype = 'stepfilled', linewidth = 1.2)
plt.show()
If you'd like an exact gaussian histogram, don't generate points. You can never get an "exact" gaussian distribution from observed points, simply because you can't have a fraction of a point within a histogram bin.
Instead, plot the curve in the form of a bar graph.
import numpy as np
import matplotlib.pyplot as plt
def gaussian(x, mean, std):
scale = 1.0 / (std * np.sqrt(2 * np.pi))
return scale * np.exp(-(x - mean)**2 / (2 * std**2))
mean, std = 2.0, 5.0
nbins = 30
npoints = 1000
x = np.linspace(mean - 3 * std, mean + 3 * std, nbins + 1)
centers = np.vstack([x[:-1], x[1:]]).mean(axis=0)
y = npoints * gaussian(centers, mean, std)
fig, ax = plt.subplots()
ax.bar(x[:-1], y, width=np.diff(x), color='lightblue')
# Optional...
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()

Categories

Resources