Log-normal distribution, log x-axis, scaling pdf - python

I want to fit a lognormal distribution, plotted on a logarithmic x-axis. However, I can't scale the probability density function correctly. I found one post on the forum, but a full, complete answer was not there (Scaling and fitting to a log-normal distribution using a logarithmic axis in python). When the parameter "s" changes, the graph takes on a different shape. Can an unambiguously correct distribution shape be obtained? Thank you for your help.
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
from math import log
# Import data
data = pd.read_excel('data2018.xlsx')
# Create bins (log)
classes = 16
s=10
bins_log10 = np.logspace(np.log10(data['diff'].min()), np.log10(data['diff'].max()), classes + 1)
bins_log10_s = np.logspace(np.log10(data['diff'].min()), np.log10(data['diff'].max()), (classes + 1) * s)
# Plot histogram
plt.style.use('ggplot')
counts, bins, _ = plt.hist(data['diff'], bins=bins_log10, edgecolor='black', linewidth=1,
                           label="Histogram")
# Compute bin centers and repeat each center according to its bin count
restored = [[d] * int(counts[n]) for n, d in enumerate((bins[1:] + bins[:-1]) / 2)]
# Flatten the result
restored = [item for sublist in restored for item in sublist]
# Fit the distribution parameters. shape = sigma, log(scale) = mu
shape, loc, scale = stats.lognorm.fit(restored, floc=0)
# Compute centers and widths of the log bins
cen_log_bins = (bins_log10[1:] + bins_log10[:-1]) / 2
len_log_bins = (bins_log10[1:] - bins_log10[:-1])
samples_fit_log_cntr = stats.lognorm.pdf(cen_log_bins, shape, loc=loc, scale=scale)
plt.plot(cen_log_bins, samples_fit_log_cntr * len_log_bins * counts.sum(), ls='dashed', label='PDF with centers', linewidth=2)
# Smooth pdf
bins_log10_cntr_s = (bins_log10_s[1:] + bins_log10_s[:-1]) / 2
samples_fit_log_cntr = stats.lognorm.pdf(bins_log10_cntr_s, shape, loc=loc, scale=scale)
bins_log_cntr = bins_log10_s[1:] - bins_log10_s[:-1]
plt.plot(bins_log10_cntr_s, samples_fit_log_cntr * bins_log_cntr * counts.sum() * s, color='blue', label='Smooth PDF with centers', linewidth=2)
plt.title("Fit results: $\mu = %.2f, \sigma$ = %.2f" % (log(scale), shape))
plt.xscale('log')
plt.legend()
plt.tight_layout()
plt.show()
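As a side note (a sketch of my own, not from the original question): fitting directly to the raw data with stats.lognorm.fit(data['diff'], floc=0) avoids the information loss of re-binning, and overlaying the expected counts using the fact that log-spaced bins have widths proportional to x gives a smooth curve whose shape does not depend on s at all. Continuing from the script above:
ratio = (bins_log10[-1] / bins_log10[0]) ** (1.0 / classes)  # common ratio between consecutive edges
x_smooth = np.logspace(np.log10(bins_log10[0]), np.log10(bins_log10[-1]), 500)
local_width = x_smooth * 2 * (ratio - 1) / (ratio + 1)  # width of the log bin centered at each x
expected = stats.lognorm.pdf(x_smooth, shape, loc=loc, scale=scale) * local_width * counts.sum()
plt.plot(x_smooth, expected, label='Expected counts')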

Related

Problem to fit a poisson histogram in python

I'm trying to fit some data with a Poisson distribution, but it doesn't work.
x = [46,71,106,126,40,27,19,103,46,89,31,70,35,43,82,128,185,47,18,36,96,30,135,36,40,72,32,86,76,116,51,23,40,121,22,107,65,93,25,74,73,73,111,56,34,28,87,14,70,54,63,50,89,62,35,59,71,39,23,46,32,56,15,68,30,69,37,41,43,106,20,35,63,44,40,32,102,28,54,32,42,19,69,31,36,86,41,57,39,53,48,121,35,51,10,68,14,140,57,50,178,37,121,35,206,26,54,5,53,17,139,49,122,110,62,81,43,83,47,62,2,50,36,190,32,124,89,60,39,156,89,26,57,34,58,29,22,96,132,59,34,43,50,58,48,56,43,54,22,26,60,43,69,58,100,122,48,55,29,55,57,36,42,51,24,81,66,73,112,34,54,45,29,53,43,60,72,13,72,85,49,80,47,40,28,43,37,48,31,60,33,75,53,71,49,142,47,28,51,80,50,33,67,28,101,80,60,80,98,39,69,27,32,11,32,62,32,77,110,45,61,22,23,73,25,27,41,42,65,23,127,128,42,44,10,50,56,73,42,63,70,148,18,109,111,54,34,18,32,50,100,41,39,58,93,42,86,70,41,27,24,57,77,81,101,48,52,146,59,87,86,120,28,23,76,52,59,31,60,32,65,49,27,106,136,23,15,77,44,96,62,66,26,41,70,13,64,124,49,44,55,68,54,58,72,41,21,80,3,49,54,35,48,38,83,59,36,80,47,32,38,16,43,196,19,80,28,56,23,81,103,45,25,42,44,34,106,23,47,53,119,56,54,108,35,20,34,39,70,61,40,35,51,104,63,55,93,22,32,48,20,121,55,76,36,32,121,58,42,101,32,49,77,23,95,32,75,53,106,194,54,31,104,69,58,66,29,66,37,28,59,60,70,95,63,103,173,47,59,27] #geiger count
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson
from scipy.optimize import curve_fit

bins = np.histogram_bin_edges(x)
n, bins_edges, patches = plt.hist(x, bins, density=1, facecolor='darkblue', ec='white', log=0)
print(n)
bin_middles = 0.5 * (bins_edges[1:] + bins_edges[:-1])

def fit_function(k, lamb):
    return poisson.pmf(k, lamb)

parameters, cov_matrix = curve_fit(fit_function, bin_middles, n)
x_plot = np.arange(0, max(x))
plt.plot(x_plot, fit_function(x_plot, *parameters), label='Poisson')
plt.show()
I'm getting this as a result, but as we can see it's not right.
You are using functions such as np.histogram_bin_edges meant for continuous distributions, while the Poisson distribution is discrete.
According to Wikipedia, lambda can be estimated by just taking the mean of the samples:
from scipy.stats import poisson
import numpy as np
from matplotlib import pyplot as plt
x = [46,71,106,126,40,27,19,103,46,89,31,70,35,43,82,128,185,47,18,36,96,30,135,36,40,72,32,86,76,116,51,23,40,121,22,107,65,93,25,74,73,73,111,56,34,28,87,14,70,54,63,50,89,62,35,59,71,39,23,46,32,56,15,68,30,69,37,41,43,106,20,35,63,44,40,32,102,28,54,32,42,19,69,31,36,86,41,57,39,53,48,121,35,51,10,68,14,140,57,50,178,37,121,35,206,26,54,5,53,17,139,49,122,110,62,81,43,83,47,62,2,50,36,190,32,124,89,60,39,156,89,26,57,34,58,29,22,96,132,59,34,43,50,58,48,56,43,54,22,26,60,43,69,58,100,122,48,55,29,55,57,36,42,51,24,81,66,73,112,34,54,45,29,53,43,60,72,13,72,85,49,80,47,40,28,43,37,48,31,60,33,75,53,71,49,142,47,28,51,80,50,33,67,28,101,80,60,80,98,39,69,27,32,11,32,62,32,77,110,45,61,22,23,73,25,27,41,42,65,23,127,128,42,44,10,50,56,73,42,63,70,148,18,109,111,54,34,18,32,50,100,41,39,58,93,42,86,70,41,27,24,57,77,81,101,48,52,146,59,87,86,120,28,23,76,52,59,31,60,32,65,49,27,106,136,23,15,77,44,96,62,66,26,41,70,13,64,124,49,44,55,68,54,58,72,41,21,80,3,49,54,35,48,38,83,59,36,80,47,32,38,16,43,196,19,80,28,56,23,81,103,45,25,42,44,34,106,23,47,53,119,56,54,108,35,20,34,39,70,61,40,35,51,104,63,55,93,22,32,48,20,121,55,76,36,32,121,58,42,101,32,49,77,23,95,32,75,53,106,194,54,31,104,69,58,66,29,66,37,28,59,60,70,95,63,103,173,47,59,27]
bins = np.histogram_bin_edges(x)
n, bins_edges, patches = plt.hist(x, bins, density=1, facecolor='darkblue', ec='white', log=0)
lamd = np.mean(x)
x_plot = np.arange(0, max(x) + 1)
plt.plot(x_plot, poisson.pmf(x_plot, lamd), label='Poisson')
plt.show()
The calculated lambda is about 60. The plot seems to indicate that the Poisson distribution isn't a very close fit for the given samples.
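As a further sketch of my own (not from the original answer): since the data are discrete counts, integer-aligned bin edges avoid the artifacts of the automatic, continuous-oriented binning:
bins = np.arange(min(x), max(x) + 2) - 0.5  # one bin per integer value, centered on the integers
n, edges, _ = plt.hist(x, bins, density=True, facecolor='darkblue', ec='white')
plt.plot(x_plot, poisson.pmf(x_plot, np.mean(x)), label='Poisson')
plt.legend()
plt.show()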

Calculating PDF given a histogram

I have a heavily right-skewed histogram and would like to calculate the probabilities for a range of Lifetime values (the area under the PDF). For instance, the probability that the Lifetime value is in (0, 0.01).
A dataframe consisting of LTV, calculated as cumulative revenue / cumulative installs:
df['LTV'] is
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I have tried utilizing sklearn's KernelDensity; however, after fitting it to the histogram, it does not capture the over-represented 0s.
import gc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

def plot_prob_density(df_lunch, field, x_start, x_end):
    plt.figure(figsize=(10, 7))
    unit = 0
    x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]
    # Plot the data using a normalized histogram
    plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)
    # Do kernel density estimation
    kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch)
    # Plot the estimated density
    kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
    plt.plot(x, kd_vals_lunch, color='orange')
    plt.axvline(x=x_start, color='red', linestyle='dashed')
    plt.axvline(x=x_end, color='red', linestyle='dashed')
    # Show the plots
    plt.xlabel(field, fontsize=15)
    plt.ylabel('Probability Density', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()
    gc.collect()
    return kd_lunch

kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1, 1), 'LTV', x_start=0, x_end=0.01)
Then finding the probabilities like this:
def get_probability(start_value, end_value, eval_points, kd):
    # Number of evaluation points
    N = eval_points
    step = (end_value - start_value) / (N - 1)  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]  # Generate values in the range
    kd_vals = np.exp(kd.score_samples(x))  # Get PDF values for each x
    probability = np.sum(kd_vals * step)  # Approximate the integral of the PDF
    return probability.round(4)

print('Probability of LTV 0-3 tips during LUNCH time: {}\n'
      .format(get_probability(start_value=0,
                              end_value=0.01,
                              eval_points=100,
                              kd=kd_lunch)))
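An equivalent sketch (my own, not from the original post) using numpy's trapezoidal rule instead of the manual Riemann sum:
def get_probability_trapz(start_value, end_value, eval_points, kd):
    x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]
    kd_vals = np.exp(kd.score_samples(x))  # PDF values from the fitted KDE
    return np.trapz(kd_vals, x.ravel()).round(4)  # integrate the PDF over the range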
However, this method does not yield the appropriate PDF values we were aiming for.
Any suggestions for alternative methods would be appreciated.
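One possible way to handle the over-represented zeros (a sketch of my own, not from the original thread): treat the zeros as a separate point mass and fit the KDE only to the positive values, then combine the two parts:
import numpy as np
from sklearn.neighbors import KernelDensity
ltv = final_df['LTV'].values  # assumes the same dataframe as above
p_zero = np.mean(ltv == 0)  # probability mass at exactly zero
kd_pos = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(ltv[ltv > 0].reshape(-1, 1))
def prob_interval(a, b, n=1000):
    # P(a <= LTV <= b) = point mass at 0 (if 0 lies in [a, b]) plus the scaled KDE integral
    x = np.linspace(a, b, n)[:, np.newaxis]
    cont = (1 - p_zero) * np.trapz(np.exp(kd_pos.score_samples(x)), x.ravel())
    return cont + (p_zero if a <= 0 <= b else 0.0)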
I have used a more or less similar script for my work; here is my script, maybe it will be helpful for you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats

data1 = beta_95[0]  # beta_95 comes from my own data

def plot_prob_density(data1, x_start, x_end):
    plt.figure(figsize=(4, 3.5))
    unit = 1.5
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]
    # Plot the data using a normalized histogram
    plt.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)
    # Do kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
    # Plot the estimated density
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth=2)
    plt.axvline(x=9.95, color='green', linestyle='dashed', linewidth=2.0, label='$β_o$')
    plt.axvline(x=1.9, color='black', linestyle='dashed', linewidth=2.0, label='$β_b$')
    plt.axvline(x=x_end, color='red', linestyle='dashed', linewidth=2, label=r'$β_{95\%}$')
    # Show the plots
    plt.xlabel('Beta', fontsize=10)
    plt.ylabel('Probability Density', fontsize=10)
    plt.title('02 hours window', fontsize=12)
    plt.xlim(-20, 20)
    plt.ylim(0, 0.3)
    plt.yticks([0, 0.1, 0.2, 0.3])
    plt.legend(fontsize=12, loc='upper left', frameon=False)
    plt.show()
    gc.collect()
    return kd_data1

def get_probability(start_value, end_value, eval_points, kd):
    # Number of evaluation points
    N = eval_points
    step = (end_value - start_value) / (N - 1)  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]  # Generate values in the range
    kd_vals = np.exp(kd.score_samples(x))  # Get PDF values for each x
    probability = np.sum(kd_vals * step)  # Approximate the integral of the PDF
    return probability.round(4)

data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
      .format(get_probability(start_value=-10,
                              end_value=13,
                              eval_points=1000,
                              kd=kd_data1)))

Plot a density function above a histogram

In Python, I have estimated the parameters for the density of a model of my distribution and I would like to plot the density function above the histogram of the distribution. In R it is similar to using the option prop=TRUE.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# initialization of the list "data"
# estimation of the parameter, in my case, mean and variance of a normal distribution
plt.hist(data, bins="auto") # data is the list of data
# here I would like to draw the density above the histogram
plt.show()
I guess the trickiest part is to make it fit.
Edit: I have tried this according to the first answer:
mean = np.mean(logdata)
var = np.var(logdata)
std = np.sqrt(var)  # standard deviation; normpdf expects the std, not the variance
plt.hist(logdata, bins="auto", alpha=0.5, label="empirical data")
x = np.linspace(min(logdata), max(logdata), 100)
plt.plot(x, mlab.normpdf(x, mean, std))
plt.xlabel("log(file size)")
plt.ylabel("number of files")
plt.legend(loc='upper right')
plt.grid(True)
plt.show()
But it doesn't fit the graph, here is how it looks:
Edit 2: It works with the option normed=True in the histogram function.
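If normed=True is not an option, the density can instead be scaled up to the count axis (a sketch of my own, assuming logdata as above and the equal-width bins produced by bins="auto"):
import scipy.stats as stats
counts, bin_edges, _ = plt.hist(logdata, bins="auto", alpha=0.5, label="empirical data")
x = np.linspace(min(logdata), max(logdata), 100)
bin_width = bin_edges[1] - bin_edges[0]  # "auto" bins all have the same width
plt.plot(x, counts.sum() * bin_width * stats.norm.pdf(x, mean, std))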
If I understand you correctly, you have the mean and standard deviation of some data, have plotted a histogram of it, and would like to plot the normal distribution line over the histogram. This line can be generated using matplotlib.mlab.normpdf(); see the matplotlib documentation for details.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
mean = 100
sigma = 5
data = np.random.normal(mean, sigma, 1000)  # generate fake data
x = np.linspace(min(data), max(data), 100)
plt.hist(data, bins="auto", normed=True)
plt.plot(x, mlab.normpdf(x, mean, sigma))
plt.show()
Which gives the following figure:
Edit: The above only works with normed = True. If this is not an option, we can define our own function:
def gauss_function(x, a, x0, sigma):
    return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2))
mean = 100
sigma = 5
data = np.random.normal(mean,sigma,1000) # generate fake data
x = np.linspace(min(data), max(data), 1000)
test = gauss_function(x, max(data), mean, sigma)
plt.hist(data, bins="auto")
plt.plot(x, test)
plt.show()
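Note that matplotlib.mlab.normpdf was removed in later matplotlib releases; scipy.stats.norm.pdf is a drop-in replacement:
from scipy.stats import norm
plt.plot(x, norm.pdf(x, mean, sigma))  # equivalent to mlab.normpdf(x, mean, sigma)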
Everything you are looking for is already in seaborn. You just have to use distplot:
import seaborn as sns
import numpy as np
data = np.random.normal(5, 2, size=1000)
sns.distplot(data)
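Note that distplot has since been deprecated in seaborn; histplot is the current equivalent:
sns.histplot(data, kde=True)  # histogram plus a KDE overlay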

non-random sampling versions of np.random.normal

I'm trying to generate a single array that follows an exact gaussian distribution. np.random.normal sort of does this by randomly sampling from a gaussian, but how can I reproduce an exact gaussian for a given mean and sigma? The array should produce a histogram that follows an exact gaussian, not just an approximate one as shown below.
import numpy as np
import matplotlib.pyplot as plt

mu, sigma = 10, 1
s = np.random.normal(mu, sigma, 1000)
fig = plt.figure()
ax = plt.axes()
totaln, bbins, patches = ax.hist(s, 10, normed=1, histtype='stepfilled', linewidth=1.2)
plt.show()
If you'd like an exact gaussian histogram, don't generate points. You can never get an "exact" gaussian distribution from observed points, simply because you can't have a fraction of a point within a histogram bin.
Instead, plot the curve in the form of a bar graph.
import numpy as np
import matplotlib.pyplot as plt
def gaussian(x, mean, std):
    scale = 1.0 / (std * np.sqrt(2 * np.pi))
    return scale * np.exp(-(x - mean)**2 / (2 * std**2))
mean, std = 2.0, 5.0
nbins = 30
npoints = 1000
x = np.linspace(mean - 3 * std, mean + 3 * std, nbins + 1)
centers = np.vstack([x[:-1], x[1:]]).mean(axis=0)
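# Expected count per bin = npoints * pdf * bin_width; here the bin width is
# exactly (6 * std) / nbins = 1.0, so it is omitted from the next line.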
y = npoints * gaussian(centers, mean, std)
fig, ax = plt.subplots()
ax.bar(x[:-1], y, width=np.diff(x), color='lightblue')
# Optional...
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()
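Alternatively, if an array of points really is required, deterministic quantiles from the inverse CDF come as close to an "exact" gaussian sample as possible (a sketch of my own, not from the original answer):
from scipy import stats
n = 1000
quantiles = np.linspace(0.5 / n, 1 - 0.5 / n, n)  # evenly spaced probabilities
exact_sample = stats.norm.ppf(quantiles, loc=10, scale=1)  # mu = 10, sigma = 1 as in the question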

How to fit a double Gaussian distribution in Python?

I am trying to obtain a double Gaussian distribution for data (link) using Python. The raw data is of the form:
For the given data, I would like to obtain two Gaussian profiles for the peaks seen in the figure. I tried it with the following code (source):
from sklearn import mixture
import matplotlib.pyplot
import matplotlib.mlab
import numpy as np
from pylab import *
data = np.genfromtxt('gaussian_fit.dat', skip_header=1)
x = data[:, 0]
y = data[:, 1]
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit((y, x))
m1, m2 = clf.means_
w1, w2 = clf.weights_
c1, c2 = clf.covars_
fig = plt.figure(figsize = (5, 5))
plt.subplot(111)
plotgauss1 = lambda x: plot(x,w1*matplotlib.mlab.normpdf(x,m1,np.sqrt(c1))[0], linewidth=3)
plotgauss2 = lambda x: plot(x,w2*matplotlib.mlab.normpdf(x,m2,np.sqrt(c2))[0], linewidth=3)
fig.savefig('gaussian_fit.pdf')
But I am not able to get the desired output. So, how can a double Gaussian distribution be obtained in Python?
Update
I was able to fit a single Gaussian distribution with the following code:
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import numpy as np

data = np.genfromtxt('gaussian_fit.dat', skip_header=1)
x = data[:, 0]
y = data[:, 1]
n = len(x)
mean = sum(x * y) / n
sigma = sum(y * (x - mean)**2) / n

def gaus(x, a, x0, sigma):
    return a * np.exp(-(x - x0)**2 / (2 * sigma**2))

popt, pcov = curve_fit(gaus, x, y, p0=[1, mean, sigma])
fig = plt.figure(figsize = (5, 5))
plt.subplot(111)
plt.plot(x, y, label='Raw')
plt.plot(x, gaus(x, *popt), 'o', markersize = 4, label='Gaussian fit')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
fig.savefig('gaussian_fit.pdf')
You can't use scikit-learn for this, because you are not dealing with a set of samples whose distribution you want to estimate. You could of course transform your curve to a PDF, sample it and then try to fit it using a Gaussian mixture model, but that seems like overkill to me.
Here's a solution using simple least-squares curve fitting. To get it to work I had to remove the background, i.e. ignore all data points with y < 5, and also provide a good starting vector for leastsq, which can be estimated from a plot of the data.
Finding the Starting Vector
The parameter vector found by the least-squares method is
params = [c1, mu1, sigma1, c2, mu2, sigma2]
Here, c1 and c2 are scaling factors for the two Gaussians, i.e. their heights; mu1 and mu2 are the means, i.e. the horizontal positions of the peaks; and sigma1 and sigma2 are the standard deviations that determine the widths of the Gaussians. To find a starting vector I just looked at a plot of the data and estimated the heights of the two peaks (= c1 and c2, respectively) and their horizontal positions (= mu1 and mu2, respectively). sigma1 and sigma2 were simply set to 1.0.
Code
import numpy as np
from pylab import *
from scipy.optimize import leastsq

data = np.genfromtxt('gaussian_fit.dat', skip_header=1)
x = data[:, 0]
y = data[:, 1]

def double_gaussian(x, params):
    (c1, mu1, sigma1, c2, mu2, sigma2) = params
    res = c1 * np.exp(-(x - mu1)**2.0 / (2.0 * sigma1**2.0)) \
        + c2 * np.exp(-(x - mu2)**2.0 / (2.0 * sigma2**2.0))
    return res

def double_gaussian_fit(params):
    fit = double_gaussian(x, params)
    return (fit - y_proc)

# Remove background.
y_proc = np.copy(y)
y_proc[y_proc < 5] = 0.0

# Least squares fit. Starting values found by inspection.
fit = leastsq(double_gaussian_fit, [13.0, -13.0, 1.0, 60.0, 3.0, 1.0])
plot(x, y, c='b')
plot(x, double_gaussian(x, fit[0]), c='r')
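A variant (my own sketch, not from the original answer) using curve_fit instead of leastsq; it also returns a covariance matrix from which one-sigma parameter uncertainties can be read off:
from scipy.optimize import curve_fit
def double_gaussian_cf(x, c1, mu1, sigma1, c2, mu2, sigma2):
    return (c1 * np.exp(-(x - mu1)**2 / (2.0 * sigma1**2))
            + c2 * np.exp(-(x - mu2)**2 / (2.0 * sigma2**2)))
popt, pcov = curve_fit(double_gaussian_cf, x, y_proc, p0=[13.0, -13.0, 1.0, 60.0, 3.0, 1.0])
perr = np.sqrt(np.diag(pcov))  # one-sigma uncertainties of the six parameters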
