Gaussian fit to histogram turns out to be a flat line - python

from sklearn.mixture import GaussianMixture
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from astropy.io import ascii
from scipy.stats import norm
# Load only the second CSV column (usecols=[1]); it is accessed below as 'Theta'.
data= pd.read_csv("P, Theta all stars.csv",usecols=[1])
names = data.columns
df = pd.DataFrame(data, columns=names)
df.head()
# Normal fit over every value in the frame; returns the mean and standard deviation.
mu,std=norm.fit(df)
x=df['Theta']
num_bins = 20
# No density=True is passed, so the bar heights are raw counts per bin.
n, bins, patches = plt.hist(x, num_bins, color ='green',alpha = 0.7)
# norm.pdf is a probability density (unit area), so it looks flat next to counts.
# NOTE(review): to overlay it on counts, scale p by len(x) * (bins[1] - bins[0]).
p = norm.pdf(bins, mu, std)
plt.gcf().set_size_inches((10, 8))
plt.plot(bins, p, '--', color ='black')
plt.title("Mean: {:.3f} and Standard Deviation: {:.3f}".format(mu, std))
plt.xlabel('Theta(curve fit)')
#plt.savefig("{0}.png", dpi=300)
plt.show()
Gaussian Fit with density parameter
Gaussian Fit with no Density parameter
I intend to have a Gaussian fit while keeping the original y-axis, without normalising it. But when I do that, the Gaussian gets flattened (because the data is not normalised). In short, how can I keep both the data and the Gaussian unnormalised?

Related

Why a norm distribution does not plot a line on stats.probplot()?

The problem is with the resultant graph of function scipy.stats.probplot().
Samples from a normal distribution don't produce a line as expected.
I am trying to normalize some data using graphs as guidance.
However, after some strange results showing that zscore and log transformations were having no effect, I started looking for something wrong.
So, I built a graph using synthetic values that has a norm distribution and the resultant graph seems very awkward.
Here are the steps to reproduce the array and the graph:
import math
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
mu = 0
variance = 1
sigma = math.sqrt(variance)
# 100 evenly spaced grid points covering +/- 3 sigma around the mean.
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
# These are pdf heights evaluated on the grid, not random draws from the distribution.
norm = stats.norm.pdf(x, mu, sigma)
plt.plot(x, norm)
plt.show()
# probplot treats its first argument as a sample; here it receives the pdf heights.
_ = stats.probplot(norm, plot=plt, sparams=(0, 1))
plt.show()
Distribution curve:
Probability plot:
Your synthesized data aren't normally distributed, they are uniformly distributed, this is what numpy.linspace() does. You can visualize this by adding seaborn.distplot(x, fit=scipy.stats.norm).
import math
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
# Evaluate the standard normal pdf on an evenly spaced grid; `y` holds curve
# heights, not samples drawn from the distribution.
mu, variance = 0, 1
sigma = math.sqrt(variance)
x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
y = stats.norm.pdf(x, mu, sigma)

# Histogram of the pdf heights with a fitted normal curve for comparison.
sns.distplot(y, fit=stats.norm)

# Probability plot of the same values, drawn on a fresh figure.
fig = plt.figure()
res = stats.probplot(y, plot=plt, sparams=(0, 1))
plt.show()
Try synthesizing your data with numpy.random.normal(). This will give you normally distributed data.
import math
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
# Draw 100 samples from a normal distribution with the given mean and spread.
mu, variance = 0, 1
sigma = math.sqrt(variance)
x = np.random.normal(loc=mu, scale=sigma, size=100)

# Histogram of the samples with a fitted normal curve for comparison.
sns.distplot(x, fit=stats.norm)

# Probability plot of the samples, drawn on a fresh figure.
fig = plt.figure()
res = stats.probplot(x, plot=plt, sparams=(0, 1))
plt.show()

How to plot contour levels based on the standard deviation levels in a multivariate normal distribution?

I am using EllipticEnvelope, which estimates my dataset's mean and covariance matrix. Now, I want to plot the multivariate normal distribution using a contour plot but I want to add a contour levels parameter based on different levels of standard deviation similar to this post
but I have this plot (note that the dataset is different):
I also read this post and this one but the answer doesn't work for me and I would like to use levels parameter of the contour plot.
Here is my code:
import numpy as np
import pandas as pd
import scipy.linalg
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
%matplotlib inline
from sklearn.datasets import load_iris
iris = load_iris(as_frame=True)
X = iris.data
from sklearn.covariance import EllipticEnvelope
# Fit on all columns of X; location_ and covariance_ are read from the fitted estimator below.
cov = EllipticEnvelope(random_state=42)
cov.fit(X)
# Indices of the two features shown on the x and y axes.
i = 0
j = 1
# 2-D mean and 2x2 covariance restricted to features i and j.
mean = [cov.location_[i], cov.location_[j]]
covariance = [[cov.covariance_[i, i], cov.covariance_[i, j]], [cov.covariance_[j, i], cov.covariance_[j, j]]]
x_list = X[X.columns[i]].values
y_list = X[X.columns[j]].values
# Regular grid with step 0.01 spanning the observed range of both features.
x, y = np.mgrid[x_list.min():x_list.max():.01, y_list.min():y_list.max():.01]
pos = np.dstack((x, y))
rv = multivariate_normal(mean, covariance)
z = rv.pdf(pos)
plt.figure()
# No `levels` argument is given, so matplotlib chooses the contour levels itself.
# NOTE(review): the pdf at Mahalanobis distance k equals rv.pdf(mean) * exp(-k**2 / 2);
# passing those values (in ascending order) as `levels` would give k-sigma contours.
plt.contour(x, y, z, cmap='RdYlGn')
plt.scatter(x_list, y_list)
plt.xlabel(X.columns[i])
plt.ylabel(X.columns[j])
plt.show()

Problems with unpacking Matplotlib hist2d outputs

I'm using Matplotlib's function hist2d() and I want to unpack the output in order to further use it. Here's what I do: I simply load with numpy a 2-column file containing my data and use the following code
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
# Two-column text file; column 0 is used as x and column 1 as y.
traj = np.loadtxt('trajectory.txt')
x = traj[:,0]
y = traj[:,1]
# hist2d both computes the histogram and draws it on the current axes.
M, xe, ye, img = plt.hist2d(x, y, bins = 80, norm = LogNorm())
# Draws M a second time, as an image, on the same axes.
# NOTE(review): without `extent`, imshow places the image in bin-index coordinates.
plt.imshow(M)
plt.show()
The result I get is the following:
Instead, if I try to directly plot the hist2d results without unpacking them:
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
# Read the trajectory and split it into its two coordinate columns.
traj = np.loadtxt('trajectory.txt')
x, y = traj[:, 0], traj[:, 1]

# hist2d computes and draws the log-scaled 2-D histogram in a single call.
plt.hist2d(x, y, bins=80, norm=LogNorm())
plt.show()
I get the whole plot without the strange blue box. What am I doing wrong?
You can create a histogram plot directly with plt.hist2d. This calculates the histogram and plots it to the current axes. There is no need to show it yet another time using imshow.
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np

# Fixed seed so the example draws the same samples on every run.
np.random.seed(9)
x = np.random.rayleigh(size=9900)
y = np.random.rayleigh(size=9900)

# hist2d draws the histogram and also returns the counts, bin edges and image.
hist_result = plt.hist2d(x, y, bins=80, norm=LogNorm())
M, xe, ye, img = hist_result
plt.show()
Or, you may first calculate the histogram and afterwards plot the result as an image to the current axes. Note that the histogram produced by numpy is transposed, see Matplotlib 2D histogram seems transposed, making it necessary to call imshow(M.T). Also note that in order to obtain the correct axes labeling, you need to set the imshow's extent to the extremal values of the xe and ye edge arrays.
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np

# Fixed seed so the example draws the same samples on every run.
np.random.seed(9)
x = np.random.rayleigh(size=9900)
y = np.random.rayleigh(size=9900)

# histogram2d only computes the counts and edges; nothing is drawn yet.
M, xe, ye = np.histogram2d(x, y, bins=80)

# Outer bin edges, so the image axes carry data coordinates.
extent = [*xe[[0, -1]], *ye[[0, -1]]]

# Transpose so that x runs along the horizontal axis of the image.
plt.imshow(
    M.T,
    extent=extent,
    norm=LogNorm(),
    origin="lower",
)
plt.show()

Plot a density function above a histogram

In Python, I have estimated the parameters for the density of a model of my distribution and I would like to plot the density function above the histogram of the distribution. In R it is similar to using the option prop=TRUE.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# Placeholder: the list "data" is initialised here.
# Placeholder: the parameters are estimated here (in my case, the mean and
# variance of a normal distribution).
plt.hist(data, bins="auto") # data is the list of observations
# The density curve should be drawn here, on top of the histogram.
plt.show()
I guess the trickiest part is to make it fit.
Edit: I have tried this according to the first answer:
mean = np.mean(logdata)
var = np.var(logdata)
std = np.sqrt(var) # standard deviation, i.e. the square root of the variance
# Histogram of raw counts: no normalisation is requested here.
plt.hist(logdata, bins="auto", alpha=0.5, label="données empiriques")
x = np.linspace(min(logdata), max(logdata), 100)
# The normal pdf has unit area, so its scale differs from the count-valued bars.
# NOTE(review): mlab.normpdf was removed in Matplotlib 3.1 — confirm the installed version.
plt.plot(x, mlab.normpdf(x, mean, std))
plt.xlabel("log(taille des fichiers)")
plt.ylabel("nombre de fichiers")
plt.legend(loc='upper right')
plt.grid(True)
plt.show()
But it doesn't fit the graph, here is how it looks:
** Edit 2 ** Works with the option normed=True in the histogram function.
If I understand you correctly, you have the mean and standard deviation of some data. You have plotted a histogram of this and would like to plot the normal distribution line over the histogram. This line can be generated using matplotlib.mlab.normpdf() (removed in Matplotlib 3.1; scipy.stats.norm.pdf() takes the same arguments); the documentation can be found here.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# scipy.stats.norm.pdf replaces matplotlib.mlab.normpdf, which was removed in
# Matplotlib 3.1; it takes the same (x, mean, sigma) arguments.
from scipy import stats

mean = 100
sigma = 5
data = np.random.normal(mean, sigma, 1000)  # generate fake data
x = np.linspace(data.min(), data.max(), 100)
# density=True rescales the bars to unit area so they share a scale with the
# pdf. (The former `normed` keyword was removed in Matplotlib 3.1.)
plt.hist(data, bins="auto", density=True)
plt.plot(x, stats.norm.pdf(x, mean, sigma))
plt.show()
Which gives the following figure:
Edit: The above only works with normed=True (spelled density=True in current Matplotlib). If this is not an option, we can define our own function:
def gauss_function(x, a, x0, sigma):
    """Return an unnormalised Gaussian evaluated at *x*.

    Args:
        x: Point or NumPy array at which to evaluate the curve.
        a: Peak height, i.e. the value returned at ``x == x0``.
        x0: Position of the peak.
        sigma: Width parameter (standard deviation) of the curve.

    Returns:
        ``a * exp(-(x - x0)**2 / (2 * sigma**2))``, with the same shape as *x*.
    """
    return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2))
mean = 100
sigma = 5
data = np.random.normal(mean, sigma, 1000)  # generate fake data

# Draw the histogram first: its bin edges are needed to scale the curve.
_, bin_edges, _ = plt.hist(data, bins="auto")
bin_width = bin_edges[1] - bin_edges[0]

# Expected count per bin is N * bin_width * pdf(x), so the peak height of the
# matching Gaussian is N * bin_width / (sigma * sqrt(2 * pi)). Using max(data)
# here would be the largest sample value, which is unrelated to bar heights.
amplitude = data.size * bin_width / (sigma * np.sqrt(2 * np.pi))

x = np.linspace(data.min(), data.max(), 1000)
test = gauss_function(x, amplitude, mean, sigma)
plt.plot(x, test)
plt.show()
Everything you are looking for is already in seaborn.
You just have to use distplot:
import seaborn as sns
import numpy as np
# 1000 draws from a normal distribution with mean 5 and standard deviation 2.
data = np.random.normal(5, 2, size=1000)
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm against the installed version.
sns.distplot(data)

Fitting Maxwell-Boltzman distribution in Python

Is it possible to make a fit to Maxwell-Boltzmann like data in matplotlib or similar module in python?
scipy.stats has support for the maxwell distribution.
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
maxwell = stats.maxwell
# Synthetic sample with a known scale of 5 so the fit can be checked by eye.
data = maxwell.rvs(loc=0, scale=5, size=10000)
# floc=0 pins the location at zero, so only the scale is estimated.
params = maxwell.fit(data, floc=0)
print(params)
# e.g. (0, 4.9808603062591041) -- the fitted scale varies from run to run

# density=True rescales the bars to unit area so they share a scale with the
# pdf. (The former `normed` keyword was removed in Matplotlib 3.1.)
plt.hist(data, bins=20, density=True)
x = np.linspace(0, 25, 100)
plt.plot(x, maxwell.pdf(x, *params), lw=3)
plt.show()
The first parameter is the location or shift away from zero.
The second parameter is the scaling parameter, denoted by a on the wikipedia page.
To generate random variates (random data) with this distribution, use its rvs method:
# Draw 100 new variates, passing the fitted parameters positionally.
newdata = maxwell.rvs(*params, size=100)

Categories

Resources