I have a problem fitting some data with a Gaussian function. I tried to do it in multiple different ways, but none of them worked. I need some ideas, please. The data is attached (columns 2 and 3).
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from numpy import asarray as ar,exp
# Bin indices 0..18 are used as the x coordinates.
x = ar(range(19))
# asarray() takes a single array-like argument, so the samples go in one list.
y = ar([0, 0, 0, 0, 0, 0, 0.01955, 1.163025, 19.7159833333333, 81.3119708333334,
        80.0329166666667, 19.3835833333333, 0.03378, 0, 0, 0, 0, 0, 0])
#y = ar(007, 0.04, .175, .628, 1.89, 4.78,10.034,17.542, 25.589, 31.1, 31.544, 26.65, 18.74, 11.01, 5.39, 2.209, 0.74, 0.215. 0.049)
n = len(x)
# NOTE: both estimates divide by the number of samples rather than by sum(y);
# they are kept as posted because the answer below discusses exactly this.
mean = sum(x*y)/n
sigma = sum(y*(x-mean)**2)/n
def gaus(x,a,x0,sigma):
    """Evaluate an unnormalised Gaussian.

    Returns ``a * exp(-(x - x0)**2 / (2 * sigma**2))``: ``a`` is the peak
    height, ``x0`` the centre and ``sigma`` the width. ``x`` may be a scalar
    or an array; the result has the same shape.
    """
    return a*exp(-(x-x0)**2/(2*sigma**2))
# Fit the Gaussian model to the data; no initial guess (p0) is passed here.
popt,pcov = curve_fit(gaus,x,y)
# Alternative call that seeds the fit with the mean/sigma computed above:
#popt,pcov = curve_fit(gaus,x,y,p0=[1,mean,sigma])
# Raw data: blue markers plus a solid blue connecting line.
plt.scatter(x,y, color='blue')
plt.plot(x,y,label='data', marker='', color='blue', linestyle='-', linewidth=2)
# NOTE(review): the fitted model is evaluated at y rather than x in the two
# calls below -- presumably gaus(x, *popt) was intended; confirm.
plt.scatter(x,gaus(y,*popt), color='red')
plt.plot(x,gaus(y,*popt),label='fit', marker='', color='Red', linestyle='--', linewidth=2)
# Diagnostics: number of samples and the two moment estimates.
print(len(x))
print(mean,sigma)
# legend() is called again further down with an explicit location.
plt.legend()
plt.xlabel('No of Resets', fontsize=20)
plt.ylabel('Frequency', fontsize=20)
plt.legend(loc='upper right')
plt.title('Gaussian Fit', fontsize=20)
plt.show()
I agree with #ddejohn.
However, you are calculating the mean and the standard deviation incorrectly. You could use the following approximation of the integrals, which weights each x by its normalised y value:
import numpy as np
# Treat y / y.sum() as probability weights: the mean is the weighted average
# of x, and sigma is the square root of the weighted variance about that mean.
mean = np.sum(x * (y / y.sum()))
sigma = np.sqrt(np.sum((y / y.sum()) * (x - mean) ** 2))
These should be used as initial guess for the fit as in your commented line, where you can also add a0 = y.max() for the amplitude.
# The tallest sample is used as the starting amplitude; mean and sigma are
# the weighted estimates computed above.
a0 = y.max()
popt,pcov = curve_fit(gaus,x,y,p0=[a0,mean,sigma])
Then plot the fit as #ddejohn said, perhaps with more sample points so that the curve looks smooth:
# Evaluate the fitted model on 100 evenly spaced points spanning the data
# range instead of only at the original sample positions.
xx = np.linspace(x[0], x[-1], num=100)
plt.plot(
    xx,
    gaus(xx, *popt),
    color='Red',
    linestyle='--',
    linewidth=2,
    marker='',
    label='fit',
)
Related
I am having the same problem as this post. I followed #B.M.'s code. However, I am not able to figure out how to fit two gaussian distributions in my data. When I hist-plot it and superimpose it with the KDE, I obtain the second peak like this:
Second peak detected by the slight bump at ~0.07 units
Here are my efforts so far:
from pylab import *
from scipy.optimize import curve_fit
# Synthetic two-peak sample that can be swapped in for the CSV data:
#data=concatenate((normal(1,.2,5000),normal(2,.2,2500)))
data = pd.read_csv('gmm_outdata_ngc2173.csv')
gmm_3 = data['col3']
# Histogram with 22 bins; y receives the first returned array and x the second.
y, x, _ = hist(gmm_3, bins=22, label='data', alpha=.3)
# Average neighbouring entries of x so that len(x) == len(y).
x = (x[:-1] + x[1:]) / 2
def gauss(x,mu,sigma,A):
    """Evaluate an unnormalised Gaussian.

    Returns ``A * exp(-(x - mu)**2 / (2 * sigma**2))``: ``mu`` is the centre,
    ``sigma`` the width and ``A`` the peak height.
    """
    return A*exp(-(x-mu)**2/2/sigma**2)
def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
    """Sum of two Gaussians.

    The first component uses ``(mu1, sigma1, A1)`` and the second
    ``(mu2, sigma2, A2)``; both are evaluated with ``gauss`` at ``x``.
    """
    return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)
# Starting values in the order (mu1, sigma1, A1, mu2, sigma2, A2).
expected = (-0.15, .02, 22, 0.15, .02, 22)
params, cov = curve_fit(bimodal, x, y, p0=expected)
# Square roots of the diagonal entries of the covariance matrix.
sigma = sqrt(diag(cov))
plot(x, bimodal(x, *params), label='model', linewidth=3, color='red')
legend()
print(params, '\n', sigma)
Here is the output plot:
Output plot of fitting two Gaussian Distributions within the same dataset
Here's the behaviour I'm expecting, a bimodal behaviour from emcee and mpfit implementation.
Plot with emcee and mpfit
Any idea why that might be happening?
Thanks in advance.
Edit:
Based on Mr. T's helpful suggestions, I have the following result:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
# Fixed seed; it only matters when the synthetic sample below is enabled.
np.random.seed(123)
#data=np.concatenate((np.random.normal(1, .2, 5000), np.random.normal(1.6, .3, 2500)))
data = pd.read_csv('gmm_outdata_ngc2173.csv')
gmm_3 = data['col3']
# Histogram with 22 bins; y receives the first returned array and x the second.
y, x, _ = plt.hist(gmm_3, bins=22, label='data', alpha=.3)
# Average neighbouring entries of x so that len(x) == len(y).
x = (x[:-1] + x[1:]) / 2
#%%
def gauss(x, mu, sigma, A):
    """Evaluate an unnormalised Gaussian.

    Returns ``A * exp(-(x - mu)**2 / (2 * sigma**2))``: ``mu`` is the centre,
    ``sigma`` the width and ``A`` the peak height.
    """
    return A*np.exp(-(x-mu)**2/2/sigma**2)
def bimodal(x, mu1, sigma1, A1, mu2, sigma2, A2):
    """Sum of two Gaussians.

    The first component uses ``(mu1, sigma1, A1)`` and the second
    ``(mu2, sigma2, A2)``; both are evaluated with ``gauss`` at ``x``.
    """
    return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)
# Starting values in the order (mu1, sigma1, A1, mu2, sigma2, A2).
#expected = (1, .2, 250, 2, .2, 125)
expected = (-0.15, .02, 22, 0.15, .02, 22)
params, cov = curve_fit(bimodal, x, y, p0=expected)
# Square roots of the diagonal entries of the covariance matrix.
sigma = np.sqrt(np.diag(cov))

# 500 evenly spaced points across the data range for drawing the curves.
x_fit = np.linspace(x.min(), x.max(), num=500)
plt.plot(x_fit, bimodal(x_fit, *params), label='model', linewidth=3, color='red')
# The first three fitted parameters describe component 1, the last three component 2.
plt.plot(x_fit, gauss(x_fit, *params[0:3]), label='distribution 1',
         linestyle="--", linewidth=1, color='red')
plt.plot(x_fit, gauss(x_fit, *params[3:6]), label='distribution 2',
         linestyle=":", linewidth=1, color='red')
plt.legend()
#print(pd.DataFrame(data={'params': params, 'sigma': sigma}, index=bimodal.__code__.co_varnames[1:]))
plt.show()
When I try to change "*params[3:]", I get an error gauss() missing 1 required positional argument: 'A'.
Mr. T's code's output.
Edit 2:
I am uploading the data I am working on. Please have a look here.
Curve fitting is often sensitive to changes in start values. In a histogram, you get a good idea of the mean values and amplitudes of the two peaks, and we should make use of it. We should also restrict sigma values and amplitudes to nonnegative values:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
# Read the samples from test.csv, skipping its header.
data = np.genfromtxt("test.csv", skip_header=True)
# Histogram with 22 bins; y receives the first returned array and x the second.
y, x, _ = plt.hist(data, bins=22, label='data', alpha=.3)
# Average neighbouring entries of x so that it lines up with y.
x = (x[:-1] + x[1:]) / 2
def gauss(x, mu, sigma, A):
    """Evaluate an unnormalised Gaussian.

    Returns ``A * exp(-(x - mu)**2 / (2 * sigma**2))``: ``mu`` is the centre,
    ``sigma`` the width and ``A`` the peak height.
    """
    return A*np.exp(-(x-mu)**2/2/sigma**2)
def bimodal(x, mu1, sigma1, A1, mu2, sigma2, A2):
    """Sum of two Gaussians.

    The first component uses ``(mu1, sigma1, A1)`` and the second
    ``(mu2, sigma2, A2)``; both are evaluated with ``gauss`` at ``x``.
    """
    return gauss(x, mu1, sigma1, A1)+gauss(x, mu2, sigma2, A2)
# Starting values in the order (mu1, sigma1, A1, mu2, sigma2, A2).
expected = (0, .01, 20, .1, .01, 5)
params, cov = curve_fit(
    bimodal, x, y, p0=expected,
    # (lower, upper) bounds for mu1, sigma1, A1, mu2, sigma2, A2:
    # the widths and amplitudes may not go below zero, the means are free.
    bounds=(
        [-np.inf, 0, 0, -np.inf, 0, 0],
        [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf],
    ),
)
# Square roots of the diagonal entries of the covariance matrix.
sigma = np.sqrt(np.diag(cov))

# 500 evenly spaced points across the data range for drawing the curves.
x_fit = np.linspace(x.min(), x.max(), num=500)
plt.plot(x_fit, bimodal(x_fit, *params), label='model', linewidth=3, color='red')
plt.plot(x_fit, gauss(x_fit, *params[0:3]), label='distribution 1',
         linestyle="--", linewidth=1, color='red')
plt.plot(x_fit, gauss(x_fit, *params[3:6]), label='distribution 2',
         linestyle=":", linewidth=1, color='red')
plt.legend()
# Table of fitted values and their sigma, indexed by bimodal's parameter names.
print(pd.DataFrame(data={'params': params, 'sigma': sigma},
                   index=bimodal.__code__.co_varnames[1:]))
plt.show()
Output:
params sigma
mu1 0.003198 0.005179
sigma1 0.042710 0.005658
A1 18.119693 1.840331
mu2 0.097203 0.009387
sigma2 0.010044 0.007779
A2 5.346416 3.519740
I have 2 clusters plotted in a scatter plot, and I need to find their standard deviations and the distance from the center of one cluster to the center of the other. I was not able to find any guide or documentation that simplifies the process of finding the centers of 2 clusters in a scatter plot. The reason is that I need to compare the scatter of each cluster with the distance between the centers of the clusters. My actual scatter plot looks like this:
import matplotlib.pyplot as plt
import numpy as np
# First cluster: 20 size measurements, listed in index order.
vector1 = [
    2.8238, 3.0284, 5.9333, 2.0156, 2.2467,
    2.0092, 4.7983, 4.3554, 3.6372, 1.3159,
    2.6174, 2.2336, 0.9625, 5.6285, 5.4040,
    2.7887, 0, 3.4632, 0, 2.7370,
]
# Second cluster: 20 size measurements, listed in index order.
vector5 = [
    1.2994, 7.4469, 3.6503, 2.1667, 4.1975,
    3.3006, 10.4082, 3.4112, 2.2395, 1.5653,
    4.3237, 1.8679, 1.2622, 14.1372, 6.1686,
    3.8903, 2.2873, 6.2559, 0.2132, 7.2303,
]
plt.rcParams['figure.figsize'] = (16.0, 10.0)
plt.style.use('ggplot')

# The original referenced an undefined name here (NameError); the two vectors
# plotted below are the only clusters defined in this script.
data = [vector1, vector5]

# Each vector is drawn as markers only, with the list index on the x axis.
plt.plot(vector1, marker='.', linestyle='none', markersize=20, label='Vector 1')
plt.plot(vector5, marker='.', linestyle='none', markersize=20, label='Vector 5')

plt.xticks(range(1, 20, 1))
plt.yticks(range(1, 20, 1))
plt.ylabel('Sizes')
plt.xlabel('Index')
plt.legend()
plt.show()
For the sake of pre-visualization:
You can compute the mean of each vector by converting the lists to NumPy arrays:
# The [...] placeholders stand for the value lists from the question.
vector1 = np.array([...])
vector5 = np.array([...])

# One mean per vector, drawn below as horizontal lines.
mean1 = vector1.mean()
mean5 = vector5.mean()

# Rest of the code

# Element-wise average of the two vectors, drawn as 'x' markers.
plt.plot((vector1 + vector5) / 2, label='Mean', markersize=12,
         linestyle='none', marker='x')
plt.axhline(mean1)
plt.axhline(mean5, color='b')
I'm trying to add a slider to the plot, similar to the slider demo example.
I'm plotting with fill_between, which gives a PolyCollection object.
I also tried plot, which gives a Line2D object as shown in the picture below, but the plot doesn't update as expected, unlike in the demo.
code
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import matplotlib.widgets as widgets
def get_pdf(mu, sigma=1, offset=4):
    """Sample the normal pdf with mean ``mu`` and standard deviation ``sigma``.

    Returns ``(x, pdf)``: ``x`` holds 100 evenly spaced points covering
    ``mu - offset * sigma`` to ``mu + offset * sigma``, and ``pdf`` is the
    density evaluated at those points.
    """
    o = sigma * offset
    x = np.linspace(mu - o, mu + o, 100)
    rv = ss.norm(mu, sigma)
    return x, rv.pdf(x)
fig, ax = plt.subplots()
# Leave room at the bottom of the figure for the slider axes.
plt.subplots_adjust(bottom=0.25)

# Filled reference curve for mu=0, sigma=1.
ax.fill_between(*get_pdf(mu=0, sigma=1), alpha=0.7)
# t = plt.fill_between(*get_pdf(2, 1), alpha=0.7) # this gives ployCollection
# ax.plot returns a list of lines; t[0] is the curve the slider acts on.
t = ax.plot(*get_pdf(mu=2, sigma=1), alpha=0.7, label='treatment')

# Slider from 0 to 10 in steps of 1, starting at 2.
a = plt.axes([0.25, 0.1, 0.5, 0.03])
slider = widgets.Slider(a, "shift", valmin=0, valmax=10, valinit=2, valstep=1)
def update(val):
    """Slider callback: recompute the pdf for mean ``val`` and redraw."""
    x, y = get_pdf(val)
    # NOTE: only the y data of the line is replaced here; the x values
    # returned by get_pdf are not applied to the line.
    t[0].set_ydata(y)
    fig.canvas.draw_idle()


# Call update() whenever the slider value changes, then show the figure.
slider.on_changed(update)
plt.show()
To update the line plot, t[0].set_xdata(x) needs to be set, as it is different for each call. In this particular case, get_pdf each time returns the same y.
Updating the coordinates of the polyCollection generated by fill_between doesn't seem to be possible. However, you can delete and recreate it at every update. Note that this is slower than just updating the coordinates.
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import matplotlib.widgets as widgets
def get_pdf(mu, sigma=1, offset=4):
    """Sample the normal pdf with mean ``mu`` and standard deviation ``sigma``.

    Returns ``(x, pdf)``: ``x`` holds 100 evenly spaced points covering
    ``mu - offset * sigma`` to ``mu + offset * sigma``, and ``pdf`` is the
    density evaluated at those points.
    """
    o = sigma * offset
    x = np.linspace(mu - o, mu + o, 100)
    rv = ss.norm(mu, sigma)
    return x, rv.pdf(x)
fig, ax = plt.subplots()
# Leave room at the bottom of the figure for the slider axes.
plt.subplots_adjust(bottom=0.25)

# Filled reference curve for mu=0, sigma=1.
ax.fill_between(*get_pdf(mu=0, sigma=1), alpha=0.7)
# Filled curve that the slider replaces; starts at mu=2.
t = ax.fill_between(*get_pdf(mu=2), alpha=0.7, color='crimson')

# Slider from 0 to 10 in steps of 1, starting at 2.
a = plt.axes([0.25, 0.1, 0.5, 0.03])
slider = widgets.Slider(a, "shift", valmin=0, valmax=10, valinit=2, valstep=1)
def update(val):
    """Slider callback: replace the filled curve with one centred on ``val``."""
    # t is rebound below, so it has to be declared global.
    global t
    # Remove the previous fill and draw a fresh one for the new mean.
    t.remove()
    t = ax.fill_between(*get_pdf(val), color='crimson', alpha=0.7)
    fig.canvas.draw_idle()


# Call update() whenever the slider value changes, then show the figure.
slider.on_changed(update)
plt.show()
My code can be seen here:
import numpy as np
from matplotlib import cm
import mpl_toolkits.mplot3d.axes3d as axes3d
from matplotlib.ticker import LinearLocator, FormatStrFormatter
# Grid axes: 7 x values and 5 y values.
xlist = [+30,+20,+10,0,-10,-20,-30]
ylist = [0.0008,0.0009, 0.001, 0.0012, 0.0013]
# One row per ylist entry, one column per xlist entry.
total_costs = [
    [2084.8771849999903, 17314.19051000003, 26026.73173, 65340.709810000015, 108130.0746, 143560.64033000002, 188387.24033],
    [2129.155209999997, 17314.301310000024, 26026.996729999984, 65341.17821, 108130.792, 143561.44293000002, 188388.11793],
    [6637.1766100000095, 17314.412110000034, 26027.26173, 65341.646609999996, 108131.5094, 143562.24553000001, 188388.99553],
    [6623.21941000002, 17314.63371000004, 26027.791729999997, 65342.58341000001, 108132.9442, 150322.81264000002, 191661.16901],
    [6637.240810000003, 17314.744510000033, 26028.05673000001, 65343.05181000002, 110971.15911000001, 146393.01711000002, 191661.93621],
]
Z = np.array(total_costs)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
X, Y = np.meshgrid(xlist, ylist)
# The stray backticks that wrapped the last argument line were a syntax error.
ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                linewidth=0, antialiased=False, alpha=0.5,
                rstride=1, cstride=1, label='skata')
ax.set_xlabel('System-1 imbalance')
ax.set_ylabel('Penalization factor [€/MWh]')
ax.set_zlabel('Total balancing costs [€]')
#ax.set_legend('upper left', fontsize=15)
#ax.tick_params(axis='both', labelsize=15)
plt.show()
When I run this, I get a figure like this:
What I would like is to get a figure like this:
I guess it has something to do with my result being a list within a list with discrete values. Does anyone have an idea?
Thank you in advance.
I guess you want a more gradual tone change on the graph - the way I know how to do it is to "simply" increase the number of points being plotted using interpolation:
import numpy as np
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d
import scipy.interpolate as interp
# Grid axes: 7 x values and 5 y values.
xlist = np.array([+30, +20, +10, 0, -10, -20, -30])
ylist = np.array([0.0008, 0.0009, 0.001, 0.0012, 0.0013])
# One row per ylist entry, one column per xlist entry.
total_costs = [[2084.8771849999903, 17314.19051000003, 26026.73173,
                65340.709810000015, 108130.0746, 143560.64033000002,
                188387.24033],
               [2129.155209999997, 17314.301310000024, 26026.996729999984,
                65341.17821, 108130.792, 143561.44293000002, 188388.11793],
               [6637.1766100000095, 17314.412110000034, 26027.26173,
                65341.646609999996, 108131.5094, 143562.24553000001,
                188388.99553],
               [6623.21941000002, 17314.63371000004, 26027.791729999997,
                65342.58341000001, 108132.9442, 150322.81264000002,
                191661.16901],
               [6637.240810000003, 17314.744510000033, 26028.05673000001,
                65343.05181000002, 110971.15911000001, 146393.01711000002,
                191661.93621]]
X, Y = np.meshgrid(xlist, ylist)
Z = np.asarray(total_costs)

# interp2d was removed in SciPy 1.14. RectBivariateSpline is the replacement
# for data on a regular grid; it needs strictly ascending axes, so the
# descending x axis and the matching columns of Z are sorted first. The first
# axis passed in is y because the rows of Z follow ylist.
x_order = np.argsort(xlist)
Zfunc = interp.RectBivariateSpline(ylist, xlist[x_order], Z[:, x_order],
                                   kx=3, ky=3)

n_points = 100  # change this to change the "resolution"
xnew = np.linspace(start=min(xlist), stop=max(xlist), num=n_points)
ynew = np.linspace(start=min(ylist), stop=max(ylist), num=n_points)
Xnew, Ynew = np.meshgrid(xnew, ynew)
# Evaluated on the (ynew, xnew) grid, giving rows along y and columns along x
# -- the same layout as Xnew and Ynew.
Znew = Zfunc(ynew, xnew)

fig = plt.figure(figsize=(11, 8))
ax = plt.axes([0.05, 0.05, 0.9, 0.9], projection='3d')
surface = ax.plot_surface(Xnew, Ynew, Znew, rstride=1, cstride=1,
                          cmap='coolwarm', linewidth=0.25)
fig.colorbar(surface, shrink=0.75, aspect=9)
plt.show()
Linear interpolation:
Cubic interpolation:
The faces of the surface plot are colorized according to the Z value.
To get mixed or random colors on the faces you can supply a color array with the facecolors argument instead of a colormap.
# One random RGB triple per face: a grid of shape (rows, cols) has
# (rows - 1) x (cols - 1) faces.
colors = np.random.rand(X.shape[0] - 1, X.shape[1] - 1, 3)
ax.plot_surface(
    X, Y, Z,
    facecolors=colors,
    rstride=1, cstride=1,
    alpha=0.5, antialiased=False, linewidth=0,
    label='skata',
)
produces
To make the colors appear closer to each other, the solution is not to use the complete range of the colormap. For example, you could set vmin=0.5*Z.min(), vmax=2*Z.max() in your call to plot_surface to map the colors to a range much larger than the one shown in the image, so that the actual values cover only part of the colormap.
# vmin/vmax extend beyond the data range, so the surface values cover only
# part of the colormap.
ax.plot_surface(
    X, Y, Z,
    cmap=cm.coolwarm,
    vmin=0.5 * Z.min(), vmax=2 * Z.max(),
    rstride=1, cstride=1,
    alpha=0.5, antialiased=False, linewidth=0,
    label='skata',
)
Is that what you mean?
def stackQuestion():
    """Draw the cost table as a 3D surface with a colour bar and show it.

    Takes no arguments and returns nothing; the imports are local so the
    function is self-contained.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import mpl_toolkits.mplot3d

    # Grid axes: 7 x values and 5 y values.
    xlist = np.array([+30,+20,+10,0,-10,-20,-30])
    ylist = np.array([0.0008,0.0009, 0.001, 0.0012, 0.0013])
    # One row per ylist entry, one column per xlist entry.
    total_costs=[[2084.8771849999903, 17314.19051000003, 26026.73173, 65340.709810000015, 108130.0746, 143560.64033000002, 188387.24033],
                 [2129.155209999997, 17314.301310000024, 26026.996729999984, 65341.17821, 108130.792, 143561.44293000002, 188388.11793],
                 [6637.1766100000095, 17314.412110000034, 26027.26173, 65341.646609999996, 108131.5094, 143562.24553000001, 188388.99553],
                 [6623.21941000002, 17314.63371000004, 26027.791729999997, 65342.58341000001, 108132.9442, 150322.81264000002, 191661.16901],
                 [6637.240810000003, 17314.744510000033, 26028.05673000001, 65343.05181000002, 110971.15911000001, 146393.01711000002, 191661.93621]]
    X, Y = np.meshgrid(xlist, ylist)
    Z = np.array(total_costs)

    fig = plt.figure(figsize = (11, 8))
    ax = plt.axes([0.05, 0.05, 0.9, 0.9], projection = '3d')
    # No backslash is needed: the open parenthesis already continues the line.
    surface = ax.plot_surface(X, Y, Z, rstride = 1, cstride = 1,
                              cmap = 'coolwarm', linewidth = 0.25)
    fig.colorbar(surface, shrink = 0.75, aspect = 9)
    plt.show()
I am making this bar plot:
... using this code segment:
my_cmap = plt.get_cmap('copper')
plt.figure()
plt.set_cmap(my_cmap)
plt.pcolormesh(xx, yy, Z)
labels = ['Negative', 'Negative (doubtful)', 'Positive (doubtful)', 'Positive' ]
# One scatter call per class i, coloured with the colormap sampled at i / 3.
for i in [0, 1, 2, 3] :
    plt.scatter(clustered_training_data[y==i, 0], clustered_training_data[y==i, 1], c=my_cmap(i / 3.0), label=labels[i], s=50, marker='o', edgecolor='white', alpha=0.7)
# NOTE(review): placed outside the loop on the assumption that the patient
# marker is meant to be drawn once -- the source lost its indentation; confirm.
plt.scatter(lda_trans_eval[q == -1, 0], lda_trans_eval[q == -1, 1], c='green', label='Your patient', s=80, marker='h', edgecolor='white')
plt.legend(prop={'size':8})
Only one (second) color is always blue, regardless of chosen color map. Corresponding data points are correctly colored in the plot and I can't see the reason why pyplot colors the second label differently.
I can't reproduce it with dummy data. Does this have the problem when you run it?
import matplotlib.pyplot as plt
import numpy as np
my_cmap = plt.get_cmap('copper')
fig = plt.figure(figsize=(5,5))
plt.set_cmap(my_cmap)

# Dummy background: squared distance from the origin, truncated to integers,
# plus 1 wherever the squared distance is below 0.5.
X = np.linspace(-1,5,100)
Y = np.linspace(-1,5,100)
X,Y = np.meshgrid(X,Y)
Z = (X**2 + Y**2)
Z = Z.astype(int)
Z += (X**2 + Y**2) < .5
ax = plt.pcolormesh(X, Y, Z)

# One single-point scatter per i, coloured with the colormap sampled at i / 3.
for i in [0,1,2,3]:
    plt.scatter([i],[i],c=my_cmap(i / 3.0),label='i=%s'%str(i),
                edgecolor='white', alpha=0.7)
# An empty scatter and a named-colour scatter, for comparison in the legend.
plt.scatter([],[],c=my_cmap(1/3.0), label='empty data')
plt.scatter([3],[1],c='green',label='Force color')
plt.legend(loc=2, prop={'size':8})

# Save the figure under this script's file name without its extension.
from os.path import realpath, basename
s = basename(realpath(__file__))
fig.savefig(s.split('.')[0])
plt.show()
This happened to me. I fixed it by using color instead of c.
# Same scatter call as in the question's loop, with the colormap colour passed
# through `color=` instead of `c=`.
plt.scatter(clustered_training_data[y==i, 0], clustered_training_data[y==i, 1], color=my_cmap(i / 3.0), label=labels[i], s=50, marker='o', edgecolor='white', alpha=0.7)