Plot a density function above a histogram - python
In Python, I have estimated the parameters of a density model for my data, and I would like to plot that density function on top of the histogram of the data. In R, this is similar to calling hist() with the option prob=TRUE.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# Placeholder: the list `data` (the observations) is initialised here.
# Placeholder: the model parameters are estimated here -- in my case the mean
# and variance of a normal distribution.
plt.hist(data, bins="auto")  # histogram of `data`; bins="auto" leaves the binning to the library
# TODO: draw the fitted density curve on top of the histogram here.
plt.show()
I guess the trickiest part is to make it fit.
Edit: I have tried this according to the first answer:
# Sample moments of `logdata` (defined earlier in the asker's script).
mean = np.mean(logdata)
var = np.var(logdata)
std = np.sqrt(var)  # normpdf is given the standard deviation, not the variance

# Evaluation grid: 100 evenly spaced points across the observed range.
x = np.linspace(min(logdata), max(logdata), 100)

# Draw through the current Axes object rather than the pyplot wrappers.
ax = plt.gca()
# No normalisation is requested here, so the bars show raw counts.
ax.hist(logdata, bins="auto", alpha=0.5, label="données empiriques")
# NOTE: mlab.normpdf was removed in Matplotlib 3.1.
ax.plot(x, mlab.normpdf(x, mean, std))

# Axis labelling, legend and grid.
ax.set_xlabel("log(taille des fichiers)")
ax.set_ylabel("nombre de fichiers")
ax.legend(loc='upper right')
ax.grid(True)
plt.show()
But it doesn't fit the graph, here is how it looks:
**Edit 2:** It works with the option normed=True in the histogram function (this option is spelled density=True since Matplotlib 2.1; normed was removed in Matplotlib 3.1).
If I understand you correctly, you have the mean and standard deviation of some data. You have plotted a histogram of the data and would like to plot the normal distribution curve over the histogram. This curve can be generated using matplotlib.mlab.normpdf(), which is described in the matplotlib.mlab documentation. (Note: normpdf was removed in Matplotlib 3.1; scipy.stats.norm.pdf() is the equivalent replacement.)
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# Local import: scipy.stats.norm.pdf replaces matplotlib.mlab.normpdf, which
# was removed in Matplotlib 3.1.
from scipy.stats import norm

# Parameters of the normal distribution used both to generate the sample and
# to draw the overlaid curve.
mean = 100
sigma = 5
data = np.random.normal(mean, sigma, 1000)  # generate fake data

# 100 evenly spaced evaluation points spanning the observed data range.
x = np.linspace(min(data), max(data), 100)

# density=True rescales the bars so that the histogram has unit area, which
# puts it on the same scale as a probability density function.  It replaces
# the `normed=True` keyword, which was removed in Matplotlib 3.1.
plt.hist(data, bins="auto", density=True)
plt.plot(x, norm.pdf(x, mean, sigma))
plt.show()
Which gives the following figure:
Edit: The above only works when the histogram is normalised (normed=True, spelled density=True in current Matplotlib). If this is not an option, we can define our own function:
def gauss_function(x, a, x0, sigma):
    """Evaluate an unnormalised Gaussian bell curve.

    Computes ``a * exp(-(x - x0)**2 / (2 * sigma**2))``.

    Parameters
    ----------
    x : scalar or numpy array
        Point(s) at which the curve is evaluated.
    a : scalar
        Height of the curve at its peak (the value returned at ``x == x0``).
        This is the peak height, not the area under the curve.
    x0 : scalar
        Position of the peak.
    sigma : scalar
        Width parameter; must be non-zero because it appears in a
        denominator.

    Returns
    -------
    The curve value(s), with the same shape as ``x``.
    """
    return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2))
# Parameters of the normal distribution used to generate the sample.
mean = 100
sigma = 5
data = np.random.normal(mean, sigma, 1000)  # generate fake data
x = np.linspace(min(data), max(data), 1000)

# Plot the un-normalised histogram first: its bar heights (raw counts) and
# bin edges are needed to put the curve on the same vertical scale.
counts, bin_edges, _ = plt.hist(data, bins="auto")

# Total area covered by the bars.  A unit-area normal pdf peaks at
# 1 / (sigma * sqrt(2 * pi)), so a curve enclosing the same area as the
# histogram must peak at area / (sigma * sqrt(2 * pi)).  The previous
# amplitude, max(data), was the largest sample value and had no relation to
# the bar heights.
area = np.sum(counts * np.diff(bin_edges))
test = gauss_function(x, area / (sigma * np.sqrt(2 * np.pi)), mean, sigma)
plt.plot(x, test)
plt.show()
Everything you are looking for is already in seaborn.
You just have to use distplot (deprecated since seaborn 0.11 in favour of histplot and displot):
import seaborn as sns
import numpy as np
# 1000 draws from a normal distribution with mean 5 and standard deviation 2.
data = np.random.normal(5, 2, size=1000)
# histplot(..., kde=True, stat="density") draws a density-normalised histogram
# with a kernel density estimate on top.  It replaces sns.distplot, which has
# been deprecated since seaborn 0.11.
sns.histplot(data, kde=True, stat="density")
Related
Problem to fit a poisson histogram in python
I'm trying to fit some data with a poisson distribution, but it doesn't work. x = [46,71,106,126,40,27,19,103,46,89,31,70,35,43,82,128,185,47,18,36,96,30,135,36,40,72,32,86,76,116,51,23,40,121,22,107,65,93,25,74,73,73,111,56,34,28,87,14,70,54,63,50,89,62,35,59,71,39,23,46,32,56,15,68,30,69,37,41,43,106,20,35,63,44,40,32,102,28,54,32,42,19,69,31,36,86,41,57,39,53,48,121,35,51,10,68,14,140,57,50,178,37,121,35,206,26,54,5,53,17,139,49,122,110,62,81,43,83,47,62,2,50,36,190,32,124,89,60,39,156,89,26,57,34,58,29,22,96,132,59,34,43,50,58,48,56,43,54,22,26,60,43,69,58,100,122,48,55,29,55,57,36,42,51,24,81,66,73,112,34,54,45,29,53,43,60,72,13,72,85,49,80,47,40,28,43,37,48,31,60,33,75,53,71,49,142,47,28,51,80,50,33,67,28,101,80,60,80,98,39,69,27,32,11,32,62,32,77,110,45,61,22,23,73,25,27,41,42,65,23,127,128,42,44,10,50,56,73,42,63,70,148,18,109,111,54,34,18,32,50,100,41,39,58,93,42,86,70,41,27,24,57,77,81,101,48,52,146,59,87,86,120,28,23,76,52,59,31,60,32,65,49,27,106,136,23,15,77,44,96,62,66,26,41,70,13,64,124,49,44,55,68,54,58,72,41,21,80,3,49,54,35,48,38,83,59,36,80,47,32,38,16,43,196,19,80,28,56,23,81,103,45,25,42,44,34,106,23,47,53,119,56,54,108,35,20,34,39,70,61,40,35,51,104,63,55,93,22,32,48,20,121,55,76,36,32,121,58,42,101,32,49,77,23,95,32,75,53,106,194,54,31,104,69,58,66,29,66,37,28,59,60,70,95,63,103,173,47,59,27] #geiger count bins = np.histogram_bin_edges(x) n, bins_edges, patches = plt.hist(x,bins, density=1, facecolor='darkblue',ec='white', log=0) print(n) bin_middles = 0.5*(bins_edges[1:] + bins_edges[:-1]) def fit_function(k, lamb): return poisson.pmf(k, lamb) parameters, cov_matrix = curve_fit(fit_function, bin_middles,n) x_plot = np.arange(0,max(x)) plt.plot(x_plot,fit_function(x_plot, *parameters),label='Poisson') plt.show() I'm getting this as result but as we can see it's not right
You are using functions such as np.histogram_bin_edges meant for continuous distributions, while the Poisson distribution is discrete. According to wikipedia, lambda can be estimated by just taking the mean of the samples: from scipy.stats import poisson import numpy as np from matplotlib import pyplot as plt x = [46,71,106,126,40,27,19,103,46,89,31,70,35,43,82,128,185,47,18,36,96,30,135,36,40,72,32,86,76,116,51,23,40,121,22,107,65,93,25,74,73,73,111,56,34,28,87,14,70,54,63,50,89,62,35,59,71,39,23,46,32,56,15,68,30,69,37,41,43,106,20,35,63,44,40,32,102,28,54,32,42,19,69,31,36,86,41,57,39,53,48,121,35,51,10,68,14,140,57,50,178,37,121,35,206,26,54,5,53,17,139,49,122,110,62,81,43,83,47,62,2,50,36,190,32,124,89,60,39,156,89,26,57,34,58,29,22,96,132,59,34,43,50,58,48,56,43,54,22,26,60,43,69,58,100,122,48,55,29,55,57,36,42,51,24,81,66,73,112,34,54,45,29,53,43,60,72,13,72,85,49,80,47,40,28,43,37,48,31,60,33,75,53,71,49,142,47,28,51,80,50,33,67,28,101,80,60,80,98,39,69,27,32,11,32,62,32,77,110,45,61,22,23,73,25,27,41,42,65,23,127,128,42,44,10,50,56,73,42,63,70,148,18,109,111,54,34,18,32,50,100,41,39,58,93,42,86,70,41,27,24,57,77,81,101,48,52,146,59,87,86,120,28,23,76,52,59,31,60,32,65,49,27,106,136,23,15,77,44,96,62,66,26,41,70,13,64,124,49,44,55,68,54,58,72,41,21,80,3,49,54,35,48,38,83,59,36,80,47,32,38,16,43,196,19,80,28,56,23,81,103,45,25,42,44,34,106,23,47,53,119,56,54,108,35,20,34,39,70,61,40,35,51,104,63,55,93,22,32,48,20,121,55,76,36,32,121,58,42,101,32,49,77,23,95,32,75,53,106,194,54,31,104,69,58,66,29,66,37,28,59,60,70,95,63,103,173,47,59,27] bins = np.histogram_bin_edges(x) n, bins_edges, patches = plt.hist(x, bins, density=1, facecolor='darkblue', ec='white', log=0) lamd = np.mean(x) x_plot = np.arange(0, max(x) + 1) plt.plot(x_plot, poisson.pmf(x_plot, lamd), label='Poisson') plt.show() The calculated lambda is about 60. The plot seems to indicate that the Poisson distribution isn't a very close fit for the given samples.
Draw the density curve exactly on the Histogram without normalizing
I need to draw the density curve on the Histogram with the actual height of the bars (actual frequency) as the y-axis. Try1: I found a related answer here but, it has normalized the Histogram to the range of the curve. Below is my code and the output. import numpy as np import matplotlib.mlab as mlab import matplotlib.pyplot as plt from scipy.stats import norm data = [125.36, 126.66, 130.28, 133.74, 126.92, 120.85, 119.42, 128.61, 123.53, 130.15, 126.02, 116.65, 125.24, 126.84, 125.95, 114.41, 138.62, 127.4, 127.59, 123.57, 133.76, 124.6, 113.48, 128.6, 121.04, 119.42, 120.83, 136.53, 120.4, 136.58, 121.73, 132.72, 109.25, 125.42, 117.67, 124.01, 118.74, 128.99, 131.11, 112.27, 118.76, 119.15, 122.42, 122.22, 134.71, 126.22, 130.33, 120.52, 126.88, 117.4] (mu, sigma) = norm.fit(data) x = np.linspace(min(data), max(data), 100) plt.hist(data, bins=12, normed=True) plt.plot(x, mlab.normpdf(x, mu, sigma)) plt.show() Try2: There #DavidG has given an option, a user defined function even it doesn't cover the density of the Histogram accurately. def gauss_function(x, a, x0, sigma): return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2)) test = gauss_function(x, max(data), mu, sigma) plt.hist(data, bins=12) plt.plot(x, test) plt.show() The result for this was, But the actual Histogram is below, where Y-axis ranges from 0 to 8, And I want to draw the density curve exactly on that. Any help this regards will be really appreciated.
Is this what you're looking for? I'm multiplying the pdf by the area of the histogram. import numpy as np import matplotlib.pyplot as plt from scipy.stats import norm data = [125.36, 126.66, 130.28, 133.74, 126.92, 120.85, 119.42, 128.61, 123.53, 130.15, 126.02, 116.65, 125.24, 126.84, 125.95, 114.41, 138.62, 127.4, 127.59, 123.57, 133.76, 124.6, 113.48, 128.6, 121.04, 119.42, 120.83, 136.53, 120.4, 136.58, 121.73, 132.72, 109.25, 125.42, 117.67, 124.01, 118.74, 128.99, 131.11, 112.27, 118.76, 119.15, 122.42, 122.22, 134.71, 126.22, 130.33, 120.52, 126.88, 117.4] (mu, sigma) = norm.fit(data) x = np.linspace(min(data), max(data), 100) values, bins, _ = plt.hist(data, bins=12) area = sum(np.diff(bins) * values) plt.plot(x, norm.pdf(x, mu, sigma) * area, 'r') plt.show() Result:
Fit histogram log scale python
I need to fit a curve with my histogram in python. I did this before with normal histograms, this time I am trying to do the same with a logarithmic plot in x. This is my code: import numpy as np import matplotlib.pyplot as plt //radius is my np.array Rmin = min(radius) Rmax = max(radius) logmin = np.log(Rmin) logmax = np.log(Rmax) bins = 10**(np.arange(logmin,logmax,0.1)) plt.figure() plt.xscale("log") plt.hist(radius, bins, color = 'red') plt.show() This is showing a gaussian distribution. I am trying to fit a curve with it and what I did is computing the following before the show() command. (mu, sigma) = np.log(norm.fit((radius))) y = (mlab.normpdf(np.log(bins), mu, sigma)) plt.plot(bins, y, 'b--', linewidth=2) My result is a very flattened curve with respect to my distribution. Can someone help me? I can not add the whole array r(50000 points), therefore I have added a picture showing my result. See image
Fitting Maxwell-Boltzmann distribution in Python
Is it possible to make a fit to Maxwell-Boltzmann like data in matplotlib or similar module in python?
scipy.stats has support for the maxwell distribution. import scipy.stats as stats import matplotlib.pyplot as plt import numpy as np maxwell = stats.maxwell data = maxwell.rvs(loc=0, scale=5, size=10000) params = maxwell.fit(data, floc=0) print(params) # (0, 4.9808603062591041) plt.hist(data, bins=20, normed=True) x = np.linspace(0, 25, 100) plt.plot(x, maxwell.pdf(x, *params), lw=3) plt.show() The first parameter is the location or shift away from zero. The second parameter is the scaling parameter, denoted by a on the wikipedia page. To generate random variates (random data) with this distribution, use its rvs method: newdata = maxwell.rvs(*params, size=100)
python: plotting a histogram with a function line on top
I'm trying to do a little bit of distribution plotting and fitting in Python using SciPy for stats and matplotlib for the plotting. I'm having good luck with some things like creating a histogram: seed(2) alpha=5 loc=100 beta=22 data=ss.gamma.rvs(alpha,loc=loc,scale=beta,size=5000) myHist = hist(data, 100, normed=True) Brilliant! I can even take the same gamma parameters and plot the line function of the probability distribution function (after some googling): rv = ss.gamma(5,100,22) x = np.linspace(0,600) h = plt.plot(x, rv.pdf(x)) How would I go about plotting the histogram myHist with the PDF line h superimposed on top of the histogram? I'm hoping this is trivial, but I have been unable to figure it out.
just put both pieces together. import scipy.stats as ss import numpy as np import matplotlib.pyplot as plt alpha, loc, beta=5, 100, 22 data=ss.gamma.rvs(alpha,loc=loc,scale=beta,size=5000) myHist = plt.hist(data, 100, normed=True) rv = ss.gamma(alpha,loc,beta) x = np.linspace(0,600) h = plt.plot(x, rv.pdf(x), lw=2) plt.show() to make sure you get what you want in any specific plot instance, try to create a figure object first import scipy.stats as ss import numpy as np import matplotlib.pyplot as plt # setting up the axes fig = plt.figure(figsize=(8,8)) ax = fig.add_subplot(111) # now plot alpha, loc, beta=5, 100, 22 data=ss.gamma.rvs(alpha,loc=loc,scale=beta,size=5000) myHist = ax.hist(data, 100, normed=True) rv = ss.gamma(alpha,loc,beta) x = np.linspace(0,600) h = ax.plot(x, rv.pdf(x), lw=2) # show plt.show()
One could be interested in plotting the distribution function of any histogram. This can be done using seaborn's kdeplot function: import numpy as np # for random data import pandas as pd # for convinience import matplotlib.pyplot as plt # for graphics import seaborn as sns # for nicer graphics v1 = pd.Series(np.random.normal(0,10,1000), name='v1') v2 = pd.Series(2*v1 + np.random.normal(60,15,1000), name='v2') # plot a kernel density estimation over a stacked barchart plt.figure() plt.hist([v1, v2], histtype='barstacked', normed=True); v3 = np.concatenate((v1,v2)) sns.kdeplot(v3); plt.show() (Taken from a Coursera course on data visualization with Python.)
Expanding on Malik's answer, and trying to stick with vanilla NumPy, SciPy and Matplotlib. I've pulled in Seaborn, but it's only used to provide nicer defaults and small visual tweaks: import numpy as np import scipy.stats as sps import matplotlib.pyplot as plt import seaborn as sns sns.set(style='ticks') # parameterise our distributions d1 = sps.norm(0, 10) d2 = sps.norm(60, 15) # sample values from above distributions y1 = d1.rvs(300) y2 = d2.rvs(200) # combine mixture ys = np.concatenate([y1, y2]) # create new figure with size given explicitly plt.figure(figsize=(10, 6)) # add histogram showing individual components plt.hist([y1, y2], 31, histtype='barstacked', density=True, alpha=0.4, edgecolor='none') # get X limits and fix them mn, mx = plt.xlim() plt.xlim(mn, mx) # add our distributions to figure x = np.linspace(mn, mx, 301) plt.plot(x, d1.pdf(x) * (len(y1) / len(ys)), color='C0', ls='--', label='d1') plt.plot(x, d2.pdf(x) * (len(y2) / len(ys)), color='C1', ls='--', label='d2') # estimate Kernel Density and plot kde = sps.gaussian_kde(ys) plt.plot(x, kde.pdf(x), label='KDE') # finish up plt.legend() plt.ylabel('Probability density') sns.despine() gives us the following plot: I've tried to stick with a minimal feature set while producing relatively nice output, notably using SciPy to estimate the KDE is very easy.