Log-normal distribution, log x-axis, scaling pdf - python
I want to fit a log-normal distribution to my data and plot it on a logarithmic x-axis, but I can't scale the probability density function correctly. I found one post on the forum (Scaling and fitting to a log-normal distribution using a logarithmic axis in python), but it did not give me a complete answer: when I change the parameter "s", the plotted curve changes shape. Can an unambiguously correct distribution shape be obtained? Thank you for your help.
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
from math import log
# Import data
data = pd.read_excel('data2018.xlsx')
# Create logarithmically spaced bins; s is a resolution factor for the smooth curve
classes = 16
s = 10
bins_log10 = np.logspace(np.log10(data['diff'].min()), np.log10(data['diff'].max()), classes + 1)
bins_log10_s = np.logspace(np.log10(data['diff'].min()), np.log10(data['diff'].max()), (classes + 1) * s)
# Plot histogram
plt.style.use('ggplot')
counts, bins, _ = plt.hist(data['diff'], bins=bins_log10, edgecolor='black', linewidth=1,
                           label="Histogram")
# Reconstruct data points: repeat each bin center by its count
restored = [[d] * int(counts[n]) for n, d in enumerate((bins[1:] + bins[:-1]) / 2)]
# Flatten the result
restored = [item for sublist in restored for item in sublist]
# Estimate fit parameters: shape = sigma, log(scale) = mu
shape, loc, scale = stats.lognorm.fit(restored, floc=0)
# Centers and widths of the histogram bins
cen_log_bins = (bins_log10[1:] + bins_log10[:-1]) / 2
len_log_bins = bins_log10[1:] - bins_log10[:-1]
samples_fit_log_cntr = stats.lognorm.pdf(cen_log_bins, shape, loc=loc, scale=scale)
plt.plot(cen_log_bins, samples_fit_log_cntr * len_log_bins * counts.sum(),
         ls='dashed', label='PDF with centers', linewidth=2)
# Smooth PDF on the finer grid
bins_log10_cntr_s = (bins_log10_s[1:] + bins_log10_s[:-1]) / 2
samples_fit_log_cntr = stats.lognorm.pdf(bins_log10_cntr_s, shape, loc=loc, scale=scale)
bins_log_cntr = bins_log10_s[1:] - bins_log10_s[:-1]
plt.plot(bins_log10_cntr_s, samples_fit_log_cntr * bins_log_cntr * counts.sum() * s,
         color='blue', label='Smooth PDF with centers', linewidth=2)
plt.title(r"Fit results: $\mu = %.2f$, $\sigma = %.2f$" % (log(scale), shape))
plt.xscale('log')
plt.legend()
plt.tight_layout()
plt.show()
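One way to make the smooth curve independent of s — a sketch, under the assumption that the goal is a curve matching the dashed one at the bin centers: with log-spaced bins, every bin spans the same width in log10-space, so the bin containing any point x has width of roughly x * (10**dlog - 1), where dlog is the log10 step of the coarse bins. Scaling the fitted PDF by that local width times the total count gives a shape that does not depend on the resolution factor at all (add before plt.show()):

# Sketch (assumption): resolution-independent scaling of the fitted PDF on a log axis.
# dlog is the constant log10 width of the coarse histogram bins.
dlog = (np.log10(bins_log10[-1]) - np.log10(bins_log10[0])) / classes
x_smooth = np.logspace(np.log10(bins_log10[0]), np.log10(bins_log10[-1]), 500)
y_smooth = stats.lognorm.pdf(x_smooth, shape, loc=loc, scale=scale) * x_smooth * (10**dlog - 1) * counts.sum()
plt.plot(x_smooth, y_smooth, ls='dotted', label='Smooth PDF (s-independent)')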
Related
Problem fitting a Poisson histogram in Python
I'm trying to fit some data with a Poisson distribution, but it doesn't work.

import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import poisson
from scipy.optimize import curve_fit

x = [46,71,106,126,40,27,19,103,46,89,31,70,35,43,82,128,185,47,18,36,96,30,135,36,40,72,32,86,76,116,51,23,40,121,22,107,65,93,25,74,73,73,111,56,34,28,87,14,70,54,63,50,89,62,35,59,71,39,23,46,32,56,15,68,30,69,37,41,43,106,20,35,63,44,40,32,102,28,54,32,42,19,69,31,36,86,41,57,39,53,48,121,35,51,10,68,14,140,57,50,178,37,121,35,206,26,54,5,53,17,139,49,122,110,62,81,43,83,47,62,2,50,36,190,32,124,89,60,39,156,89,26,57,34,58,29,22,96,132,59,34,43,50,58,48,56,43,54,22,26,60,43,69,58,100,122,48,55,29,55,57,36,42,51,24,81,66,73,112,34,54,45,29,53,43,60,72,13,72,85,49,80,47,40,28,43,37,48,31,60,33,75,53,71,49,142,47,28,51,80,50,33,67,28,101,80,60,80,98,39,69,27,32,11,32,62,32,77,110,45,61,22,23,73,25,27,41,42,65,23,127,128,42,44,10,50,56,73,42,63,70,148,18,109,111,54,34,18,32,50,100,41,39,58,93,42,86,70,41,27,24,57,77,81,101,48,52,146,59,87,86,120,28,23,76,52,59,31,60,32,65,49,27,106,136,23,15,77,44,96,62,66,26,41,70,13,64,124,49,44,55,68,54,58,72,41,21,80,3,49,54,35,48,38,83,59,36,80,47,32,38,16,43,196,19,80,28,56,23,81,103,45,25,42,44,34,106,23,47,53,119,56,54,108,35,20,34,39,70,61,40,35,51,104,63,55,93,22,32,48,20,121,55,76,36,32,121,58,42,101,32,49,77,23,95,32,75,53,106,194,54,31,104,69,58,66,29,66,37,28,59,60,70,95,63,103,173,47,59,27]  # geiger counts
bins = np.histogram_bin_edges(x)
n, bins_edges, patches = plt.hist(x, bins, density=1, facecolor='darkblue', ec='white', log=0)
print(n)
bin_middles = 0.5 * (bins_edges[1:] + bins_edges[:-1])

def fit_function(k, lamb):
    return poisson.pmf(k, lamb)

parameters, cov_matrix = curve_fit(fit_function, bin_middles, n)
x_plot = np.arange(0, max(x))
plt.plot(x_plot, fit_function(x_plot, *parameters), label='Poisson')
plt.show()

I'm getting this as a result, but as we can see it's not right.
You are using functions such as np.histogram_bin_edges that are meant for continuous distributions, while the Poisson distribution is discrete. According to Wikipedia, lambda can be estimated by just taking the mean of the samples:

from scipy.stats import poisson
import numpy as np
from matplotlib import pyplot as plt

x = [46, 71, 106, ...]  # the same geiger-count data as in the question above
bins = np.histogram_bin_edges(x)
n, bins_edges, patches = plt.hist(x, bins, density=1, facecolor='darkblue', ec='white', log=0)
lamd = np.mean(x)
x_plot = np.arange(0, max(x) + 1)
plt.plot(x_plot, poisson.pmf(x_plot, lamd), label='Poisson')
plt.show()

The calculated lambda is about 60. The plot seems to indicate that the Poisson distribution isn't a very close fit for the given samples.
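A small additional sketch (my assumption, not part of the original answer): since the counts are integers, integer-aligned bin edges make the normalized histogram directly comparable to the PMF, bar for bar:

# Sketch: one histogram bin per integer count value.
edges = np.arange(min(x) - 0.5, max(x) + 1.5)
plt.hist(x, bins=edges, density=True, facecolor='darkblue', ec='white')
x_plot = np.arange(0, max(x) + 1)
plt.plot(x_plot, poisson.pmf(x_plot, np.mean(x)), label='Poisson')
plt.show()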
Calculating PDF given a histogram
I have a heavily right-skewed histogram and would like to calculate the probabilities for a range of Lifetime values (the area under the curve, i.e. the PDF) — for instance, the probability that the Lifetime value lies in (0, 0.01). The dataframe consists of LTV calculated by cumulative revenue / cumulative installs; df['LTV'] is

(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)

I have tried utilizing sklearn's KernelDensity; however, after fitting it to the histogram it does not capture the over-represented 0s.

import gc
from sklearn.neighbors import KernelDensity

def plot_prob_density(df_lunch, field, x_start, x_end):
    plt.figure(figsize=(10, 7))
    unit = 0
    x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]
    # Plot the data using a normalized histogram
    plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)
    # Do kernel density estimation
    kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch)
    # Plot the estimated density
    kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
    plt.plot(x, kd_vals_lunch, color='orange')
    plt.axvline(x=x_start, color='red', linestyle='dashed')
    plt.axvline(x=x_end, color='red', linestyle='dashed')
    # Show the plots
    plt.xlabel(field, fontsize=15)
    plt.ylabel('Probability Density', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()
    gc.collect()
    return kd_lunch

kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1, 1), 'LTV', x_start=0, x_end=0.01)

Then finding the probabilities like this:

def get_probability(start_value, end_value, eval_points, kd):
    N = eval_points                                             # Number of evaluation points
    step = (end_value - start_value) / (N - 1)                  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]   # Generate values in the range
    kd_vals = np.exp(kd.score_samples(x))                       # Get PDF values for each x
    probability = np.sum(kd_vals * step)                        # Approximate the integral of the PDF
    return probability.round(4)

print('Probability of LTV 0-3 tips during LUNCH time: {}\n'
      .format(get_probability(start_value=0, end_value=0.01, eval_points=100, kd=kd_lunch)))

However, this method does not yield the appropriate PDF values we were aiming for. Any suggestions for alternative methods would be appreciated.
I have used a more or less similar script for my work; here is my script, maybe it will be helpful for you.

import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats

data1 = beta_95[0]

def plot_prob_density(data1, x_start, x_end):
    plt.figure(figsize=(4, 3.5))
    unit = 1.5
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]
    # Plot the data using a normalized histogram
    plt.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)
    # Do kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
    # Plot the estimated density
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth=2)
    plt.axvline(x=9.95, color='green', linestyle='dashed', linewidth=2.0, label='$β_o$')
    plt.axvline(x=1.9, color='black', linestyle='dashed', linewidth=2.0, label='$β_b$')
    plt.axvline(x=x_end, color='red', linestyle='dashed', linewidth=2, label=r'$β_{95\%}$')
    # Show the plots
    plt.xlabel('Beta', fontsize=10)
    plt.ylabel('Probability Density', fontsize=10)
    plt.title('02 hours window', fontsize=12)
    plt.xlim(-20, 20)
    plt.ylim(0, 0.3)
    plt.yticks([0, 0.1, 0.2, 0.3])
    plt.legend(fontsize=12, loc='upper left', frameon=False)
    plt.show()
    gc.collect()
    return kd_data1

def get_probability(start_value, end_value, eval_points, kd):
    N = eval_points                                             # Number of evaluation points
    step = (end_value - start_value) / (N - 1)                  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]   # Generate values in the range
    kd_vals = np.exp(kd.score_samples(x))                       # Get PDF values for each x
    probability = np.sum(kd_vals * step)                        # Approximate the integral of the PDF
    return probability.round(4)

data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
      .format(get_probability(start_value=-10, end_value=13, eval_points=1000, kd=kd_data1)))
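As a side note, the Riemann sum inside get_probability can also be written with numpy's trapezoidal rule — a minimal sketch, assuming kd is a fitted KernelDensity and the same start_value, end_value, and eval_points as above:

# Sketch: approximate the same integral with the trapezoidal rule.
x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]
probability = np.trapz(np.exp(kd.score_samples(x)), x.ravel())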
Plot a density function above a histogram
In Python, I have estimated the parameters for the density of a model of my distribution and I would like to plot the density function above the histogram of the distribution. In R it is similar to using the option prop=TRUE.

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# initialization of the list "data"
# estimation of the parameters, in my case the mean and variance of a normal distribution
plt.hist(data, bins="auto")  # data is the list of data
# here I would like to draw the density above the histogram
plt.show()

I guess the trickiest part is to make it fit.

Edit: I have tried this according to the first answer:

mean = np.mean(logdata)
var = np.var(logdata)
std = np.sqrt(var)  # standard deviation, used by numpy instead of the variance
plt.hist(logdata, bins="auto", alpha=0.5, label="empirical data")
x = np.linspace(min(logdata), max(logdata), 100)
plt.plot(x, mlab.normpdf(x, mean, std))
plt.xlabel("log(file size)")
plt.ylabel("number of files")
plt.legend(loc='upper right')
plt.grid(True)
plt.show()

But it doesn't fit the graph.

Edit 2: It works with the option normed=True in the histogram function.
If I understand you correctly, you have the mean and standard deviation of some data, have plotted a histogram of it, and would like to plot the normal distribution line over the histogram. This line can be generated using matplotlib.mlab.normpdf() (see the matplotlib documentation).

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

mean = 100
sigma = 5
data = np.random.normal(mean, sigma, 1000)  # generate fake data
x = np.linspace(min(data), max(data), 100)
plt.hist(data, bins="auto", normed=True)
plt.plot(x, mlab.normpdf(x, mean, sigma))
plt.show()

Edit: The above only works with normed=True. If this is not an option, we can define our own function:

def gauss_function(x, a, x0, sigma):
    return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2))

mean = 100
sigma = 5
data = np.random.normal(mean, sigma, 1000)  # generate fake data
x = np.linspace(min(data), max(data), 1000)
test = gauss_function(x, max(data), mean, sigma)
plt.hist(data, bins="auto")
plt.plot(x, test)
plt.show()
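If normed=True is not an option, an alternative to a hand-tuned amplitude is to scale the PDF analytically — a sketch, assuming equal-width bins, based on expected counts per bin = N * bin_width * pdf(x):

# Sketch (assumption): overlay the normal PDF on a raw-count histogram.
counts, edges, _ = plt.hist(data, bins=30)
bin_width = edges[1] - edges[0]
x = np.linspace(min(data), max(data), 1000)
pdf = np.exp(-(x - mean)**2 / (2 * sigma**2)) / (sigma * np.sqrt(2 * np.pi))
plt.plot(x, len(data) * bin_width * pdf)
plt.show()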
Everything you are looking for is already in seaborn; you just have to use distplot:

import seaborn as sns
import numpy as np

data = np.random.normal(5, 2, size=1000)
sns.distplot(data)
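Note that distplot has since been deprecated (seaborn 0.11 and later); in current versions the equivalent histogram-plus-KDE one-liner is:

sns.histplot(data, kde=True)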
non-random sampling versions of np.random.normal
I'm trying to generate a single array that follows an exact Gaussian distribution. np.random.normal sort of does this by randomly sampling from a Gaussian, but how can I reproduce an exact Gaussian given some mean and sigma? The array should produce a histogram that follows an exact Gaussian, not just an approximate Gaussian as produced below.

import numpy as np
import matplotlib.pyplot as plt

mu, sigma = 10, 1
s = np.random.normal(mu, sigma, 1000)
fig = plt.figure()
ax = plt.axes()
totaln, bbins, patches = ax.hist(s, 10, normed=1, histtype='stepfilled', linewidth=1.2)
plt.show()
If you'd like an exact Gaussian histogram, don't generate points. You can never get an "exact" Gaussian distribution from observed points, simply because you can't have a fraction of a point within a histogram bin. Instead, plot the curve in the form of a bar graph.

import numpy as np
import matplotlib.pyplot as plt

def gaussian(x, mean, std):
    scale = 1.0 / (std * np.sqrt(2 * np.pi))
    return scale * np.exp(-(x - mean)**2 / (2 * std**2))

mean, std = 2.0, 5.0
nbins = 30
npoints = 1000

x = np.linspace(mean - 3 * std, mean + 3 * std, nbins + 1)
centers = np.vstack([x[:-1], x[1:]]).mean(axis=0)
y = npoints * gaussian(centers, mean, std)

fig, ax = plt.subplots()
ax.bar(x[:-1], y, width=np.diff(x), color='lightblue')

# Optional...
ax.margins(0.05)
ax.set_ylim(bottom=0)

plt.show()
How to fit a double Gaussian distribution in Python?
I am trying to obtain a double Gaussian distribution for data (link) using Python. For the given data, I would like to obtain two Gaussian profiles for the two peaks seen in the plot of the raw data. I tried it with the following code (source):

from sklearn import mixture
import matplotlib.pyplot
import matplotlib.mlab
import numpy as np
from pylab import *

data = np.genfromtxt('gaussian_fit.dat', skiprows=1)
x = data[:, 0]
y = data[:, 1]
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit((y, x))
m1, m2 = clf.means_
w1, w2 = clf.weights_
c1, c2 = clf.covars_
fig = plt.figure(figsize=(5, 5))
plt.subplot(111)
plotgauss1 = lambda x: plot(x, w1 * matplotlib.mlab.normpdf(x, m1, np.sqrt(c1))[0], linewidth=3)
plotgauss2 = lambda x: plot(x, w2 * matplotlib.mlab.normpdf(x, m2, np.sqrt(c2))[0], linewidth=3)
fig.savefig('gaussian_fit.pdf')

But I am not able to get the desired output. So, how can a double Gaussian distribution be obtained in Python?

Update: I was able to fit a single Gaussian distribution with the following code:

import pylab as plb
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy import asarray as ar, exp
import numpy as np

data = np.genfromtxt('gaussian_fit.dat', skiprows=1)
x = data[:, 0]
y = data[:, 1]
n = len(x)
mean = sum(x * y) / n
sigma = sum(y * (x - mean)**2) / n

def gaus(x, a, x0, sigma):
    return a * exp(-(x - x0)**2 / (2 * sigma**2))

popt, pcov = curve_fit(gaus, x, y, p0=[1, mean, sigma])
fig = plt.figure(figsize=(5, 5))
plt.subplot(111)
plt.plot(x, y, label='Raw')
plt.plot(x, gaus(x, *popt), 'o', markersize=4, label='Gaussian fit')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
fig.savefig('gaussian_fit.pdf')
You can't use scikit-learn for this, because you are not dealing with a set of samples whose distribution you want to estimate. You could of course transform your curve to a PDF, sample it, and then try to fit it using a Gaussian mixture model, but that seems to be a bit of an overkill to me. Here's a solution using simple least-squares curve fitting. To get it to work I had to remove the background, i.e. ignore all data points with y < 5, and also provide a good starting vector for leastsq, which can be estimated from a plot of the data.

Finding the starting vector

The parameter vector found by the least-squares method is the vector

params = [c1, mu1, sigma1, c2, mu2, sigma2]

Here, c1 and c2 are scaling factors for the two Gaussians, i.e. their heights; mu1 and mu2 are the means, i.e. the horizontal positions of the peaks; and sigma1 and sigma2 are the standard deviations that determine the widths of the Gaussians. To find a starting vector I just looked at a plot of the data and estimated the heights of the two peaks (= c1 and c2, respectively) and their horizontal positions (= mu1 and mu2, respectively). sigma1 and sigma2 were simply set to 1.0.

Code

import numpy as np
from pylab import *
from scipy.optimize import leastsq

data = np.genfromtxt('gaussian_fit.dat', skiprows=1)
x = data[:, 0]
y = data[:, 1]

def double_gaussian(x, params):
    (c1, mu1, sigma1, c2, mu2, sigma2) = params
    res = c1 * np.exp(-(x - mu1)**2.0 / (2.0 * sigma1**2.0)) \
        + c2 * np.exp(-(x - mu2)**2.0 / (2.0 * sigma2**2.0))
    return res

def double_gaussian_fit(params):
    fit = double_gaussian(x, params)
    return (fit - y_proc)

# Remove background.
y_proc = np.copy(y)
y_proc[y_proc < 5] = 0.0

# Least-squares fit. Starting values found by inspection.
fit = leastsq(double_gaussian_fit, [13.0, -13.0, 1.0, 60.0, 3.0, 1.0])
plot(x, y, c='b')
plot(x, double_gaussian(x, fit[0]), c='r')
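For comparison, the same model can be fitted with scipy.optimize.curve_fit, which builds the residual function internally — a sketch using the starting values and the background-removed y_proc from the answer above:

# Sketch: double-Gaussian fit via curve_fit instead of leastsq.
from scipy.optimize import curve_fit

def double_gaussian_cf(x, c1, mu1, sigma1, c2, mu2, sigma2):
    return (c1 * np.exp(-(x - mu1)**2 / (2 * sigma1**2))
            + c2 * np.exp(-(x - mu2)**2 / (2 * sigma2**2)))

popt, pcov = curve_fit(double_gaussian_cf, x, y_proc, p0=[13.0, -13.0, 1.0, 60.0, 3.0, 1.0])
plot(x, double_gaussian_cf(x, *popt), c='g')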