I'm working on an astrophysics project where I need to measure the density(ne) of the gas in the center of the galaxy by two methods(A and S). I made a plot of ne_s x ne_a and I want to try an exponential fit in this plot. The problems are the following:
the errors in the data are asymmetrical and, apparently, scipy.odr does not accept this type of error. When the erros are included 'ValueError: could not convert we to a suitable array' is raised.
even if I do not include the errors the fit still does not work.
The code used(errors in the fit not included):
import numpy as np
import matplotlib.pyplot as plt
ne_s = np.array([ 134.70722125, 316.27850769, 403.37221974, 579.91067991,
1103.06258335, 1147.23685549, 115.00820933, 476.42659337,
667.61690967, 403.30988606, 282.08007264, 479.98058352,
897.64247885, 214.75999934, 213.22512064, 491.81749573,
743.68513419, 374.37957281, 362.136037 , 893.88595455])
dne_s_max = np.array([23.6619623 , 5.85802097, 12.02456923, 1.50211648, 5.15987014,
10.3830146 , 10.5274528 , 0.82928872, 2.18586603, 31.95014727,
6.53134179, 2.38392559, 32.2838402 , 5.43629034, 1.02316579,
6.60281602, 14.53943481, 9.16809221, 6.84052648, 12.87655997])
dne_s_min = np.array([21.94513608, 5.80578938, 11.8303456 , 1.49856527, 5.1265976 ,
10.2523836 , 10.12663739, 0.82824884, 2.17914616, 30.55846643,
6.45691351, 2.37446669, 30.87025015, 5.37271061, 1.02087355,
6.5358395 , 14.21332643, 9.0523711 , 6.77187898, 12.64596461])
ne_a = np.array([ 890.61498788, 2872.03715706, 10222.33463389, 1946.48193766,
6695.25304235, 2107.36471192, 891.72010662, 3988.87511761,
11328.9670489 , 1097.38904905, 2896.62668843, 4849.57809801,
5615.96780935, 1415.18564794, 1204.00022768, 3616.05423907,
15638.52683391, 3300.6039601 , 775.28841051, 12325.54379524])
dne_a_max = np.array([1082.33639266, 571.57094375, 2396.39839075, 458.32058555,
796.79916236, 665.95370946, 2262.73423374, 1006.65192577,
1761.9251987 , 1718.78400914, 579.65477159, 245.54811362,
1652.50314639, 401.37677822, 178.03620792, 725.26490794,
6625.62353545, 908.21490446, 719.01117673, 2098.24809312])
dne_a_min = np.array([ 865.33019015, 518.08880981, 1877.85283954, 412.91242092,
724.38681574, 582.52644162, 870.14392196, 866.63643893,
1478.1792513 , 1076.64135559, 521.08794554, 236.2457763 ,
1349.36104495, 362.72343267, 169.23314057, 646.39803115,
4139.5768453 , 789.04878324, 620.55523654, 1720.06369942])
dne_a = [dne_a_min, dne_a_max]
dne_s = [dne_s_min, dne_s_max]
fig, ax = plt.subplots(1,1)
ax.errorbar(ne_s, ne_a, xerr = dne_s, yerr = dne_a,
linestyle = 'none', linewidth = 0.7, capsize = 5, color = 'crimson')
ax.scatter(ne_s, ne_a, s = 15, color = 'black')
ax.set_ylabel('$n_e(A)$'), ax.set_xlabel('$n_e(S)$')
from scipy.odr import Data, RealData, Model, ODR
def f(B, x):
return B[0] + B[1] * np.exp(B[2] * x)
exponential = Model(f)
data = RealData(ne_s, ne_a)
odr = ODR(data, exponential, beta0=[1, 200, 3e-3])
out = odr.run()
ax.plot(ne_s, f(out.beta, ne_s), linewidth = 0.7)
Which results in:
And the actual plot is:
So what am I missing here? Did I applied the odr routine erroneously? What should I do to make the fit work properly? And how to make scipy.odr accept asymmetrical error?
Important to add that I don't know too much about scipy.odr, I just adapted the documentation example to an exponential function.
Appreciate the help. Let me know if more information is necessary.
Related
I want to perform a probability Weibull fit with 0.95% confidence bounds by means of Python. As test data, I use fail cycles of a measurement which are plotted against the reliability R(t).
So far, I found a way to perform the Weibull fit, however, I still do not manage to get the confidence bounds. The Weibull plot with the same test data set was already performed with origin, therfore I know which shape I would "expect" for the confidence interval. But I do not understand how to get there.
I found information about Weibull confidence intervals on reliawiki(cf. Bounds on Reliability based on Fisher Matrix confidence bounds) and used the description there to calculate the variance and the upper and lower confidence bound (R_U and R_L).
Here is a working code example for my Weibull fit and my confidence bounds with the test data set based on the discription of reliawiki (cf. Bounds on Reliability). For the fit, I used a OLS model fit.
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy.optimize import curve_fit
import math
import statsmodels.api as sm
def weibull_ticks(y, pos):
return "{:.0f}%".format(100 * (1 - np.exp(-np.exp(y))))
def loglog(x):
return np.log(-np.log(1 - np.asarray(x)))
class weibull_example(object):
def __init__(self, dat):
self.fits = {}
dat.index = np.arange(1, len(dat) + 1)
dat.sort_values('data', inplace=True)
#define yaxis-values
dat['percentile'] = dat.index*1/len(dat)
self.data = dat
self.fit()
self.plot_data()
def fit(self):
#fit the data points with a the OLS model
self.data=self.data[:-1]
x0 = np.log(self.data.dropna()['data'].values)
Y = loglog(self.data.dropna()['percentile'])
Yx = sm.add_constant(Y)
model = sm.OLS(x0, Yx)
results = model.fit()
yy = loglog(np.linspace(.001, .999, 100))
YY = sm.add_constant(yy)
XX = np.exp(results.predict(YY))
self.eta = np.exp(results.params[0])
self.beta = 1 / results.params[1]
self.fits['syx'] = {'results': results, 'model': model,
'line': np.row_stack([XX, yy]),
'beta': self.beta,
'eta': self.eta}
cov = results.cov_params()
#get variance and covariance
self.beta_var = cov[1, 1]
self.eta_var = cov[0, 0]
self.cov = cov[1, 0]
def plot_data(self, fit='yx'):
dat = self.data
#plot data points
plt.semilogx(dat['data'], loglog(dat['percentile']), 'o')
fit = 's' + fit
self.plot_fit(fit)
ax = plt.gca()
formatter = mpl.ticker.FuncFormatter(weibull_ticks)
ax.yaxis.set_major_formatter(formatter)
yt_F = np.array([0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 0.95, 0.99])
yt_lnF = loglog(yt_F)
plt.yticks(yt_lnF)
plt.ylim(loglog([.01, .99]))
def plot_fit(self, fit='syx'):
dat = self.fits[fit]['line']
plt.plot(dat[0], dat[1])
#calculate variance to get confidence bound
def variance(x):
return (math.log(x) - math.log(self.eta)) ** 2 * self.beta_var + \
(self.beta/self.eta) ** 2 * self.eta_var - \
2 * (math.log(x) - math.log(self.eta)) * (-self.beta/self.eta) * self.cov
#calculate confidence bounds
def confidence_upper(x):
return 1-np.exp(-np.exp(self.beta*(math.log(x)-math.log(self.eta)) - 0.95*np.sqrt(variance(x))))
def confidence_lower(x):
return 1-np.exp(-np.exp(self.beta*(math.log(x)-math.log(self.eta)) + 0.95*np.sqrt(variance(x))))
yvals_1 = list(map(confidence_upper, dat[0]))
yvals_2 = list(map(confidence_lower, dat[0]))
#plot confidence bounds
plt.semilogx(dat[0], loglog(yvals_1), linestyle="solid", color="black", linewidth=2,
label="fit_u_1", alpha=0.8)
plt.semilogx(dat[0], loglog(yvals_2), linestyle="solid", color="green", linewidth=2,
label="fit_u_1", alpha=0.8)
def main():
fig, ax1 = plt.subplots()
ax1.set_xlabel("$Cycles\ til\ Failure$")
ax1.set_ylabel("$Weibull\ Percentile$")
#my data points
data = pd.DataFrame({'data': [1556, 2595, 11531, 38079, 46046, 57357]})
weibull_example(data)
plt.savefig("Weibull.png")
plt.close(fig)
if __name__ == "__main__":
main()
The confidence bounds in my plot look not like I expected. I tried a lot of different 'variances', just to understand the function and to check, if the problem is just a typing error. Meanwhile, I am convinced that the problem is more general and that I understood something false from the description on reliawiki. Unfortunately, I really do not get what's the problem and I do not know anyone else I can ask. In the internet and on different forums, I did not find an appropriate answer.
That's why I decided to ask this question here. It's the first time I ask a question in a forum. Therefore, I hope that I explained everything sufficiently and that the code example is useful.
Thank you very much :)
Apologies for the very late answer, but I'll provide it for any future readers.
Rather than try implementing this yourself, you may want to consider using a package designed for exactly this called reliability.
Here is the example for your use case.
Remember to upvote this answer if it helps you :)
Edit:
Modeling and fitting with this approach work fine, the data in here is not good.-------------------
I want to do a curve-fitting on a complex dataset. After thorough reading and searching, I found that i can use a couple of methods (e.g. lmfit optimize, scipy leastsq).
But none gives me a good fit at all.
here is the fit equation:
here is the data to be fitted (list of y values):
[(0.00011342104914066835+8.448890220616275e-07j),
(0.00011340386404065371+7.379293582429708e-07j),
(0.0001133540327309949+6.389834505824625e-07j),
(0.00011332170913939336+5.244566142401774e-07j),
(0.00011331311156154074+4.3841061618015007e-07j),
(0.00011329383047059048+3.6163513508002877e-07j),
(0.00011328700094846502+3.0542249453666894e-07j),
(0.00011327650033983806+2.548725558622188e-07j),
(0.00011327702539337786+2.2508174567697671e-07j),
(0.00011327342238146558+1.9607648998100523e-07j),
(0.0001132710747364799+1.721721661949941e-07j),
(0.00011326933241850936+1.5246061350710235e-07j),
(0.00011326798040984542+1.3614817802178457e-07j),
(0.00011326752037650585+1.233483784504962e-07j),
(0.00011326758290166552+1.1258801448459512e-07j),
(0.00011326813100914905+1.0284749122099354e-07j),
(0.0001132684076390416+9.45791423595816e-08j),
(0.00011326982474882009+8.733105218572698e-08j),
(0.00011327158639135678+8.212191452217794e-08j),
(0.00011327366823516856+7.747920115589205e-08j),
(0.00011327694366034208+7.227069986108343e-08j),
(0.00011327915327873038+6.819405851172907e-08j),
(0.00011328181165961218+6.468392148750885e-08j),
(0.00011328531688122571+6.151393311227958e-08j),
(0.00011328857849500441+5.811704586613896e-08j),
(0.00011329241716561626+5.596645863242474e-08j),
(0.0001132970129528527+5.4722461511610696e-08j),
(0.0001133002881788021+5.064523218904898e-08j),
(0.00011330507671740223+5.0307457368330284e-08j),
(0.00011331106068787993+4.7703959367963307e-08j),
(0.00011331577350707601+4.634615394867111e-08j),
(0.00011332064001939156+4.6914747648361504e-08j),
(0.00011333034985824086+4.4992151257444304e-08j),
(0.00011334188526870483+4.363662798446445e-08j),
(0.00011335491299924776+4.364164366097129e-08j),
(0.00011337451201475147+4.262881852644385e-08j),
(0.00011339778209066752+4.275096587356569e-08j),
(0.00011342832992628646+4.4463907608604945e-08j),
(0.00011346526768580432+4.35706649329342e-08j),
(0.00011351108008292451+4.4155812379491554e-08j),
(0.00011356967192325835+4.327004709646922e-08j),
(0.00011364164970635006+4.420660396556604e-08j),
(0.00011373150199883139+4.3672898914161596e-08j),
(0.00011384660942003356+4.326171366194325e-08j),
(0.00011399193321804955+4.1493065523925126e-08j),
(0.00011418043916260295+4.0762418512759096e-08j),
(0.00011443271767970721+3.91359909722939e-08j),
(0.00011479600563688605+3.845666332695652e-08j),
(0.0001153652105925112+3.6224677316584614e-08j),
(0.00011638635682516399+3.386843079212692e-08j),
(0.00011836223959714231+3.6692295450490655e-08j)]
here is the list of x values:
[999.9999960000001,
794.328231,
630.957342,
501.18723099999994,
398.107168,
316.22776400000004,
251.188642,
199.52623,
158.489318,
125.89254,
99.999999,
79.432823,
63.095734,
50.118722999999996,
39.810717,
31.622776,
25.118864000000002,
19.952623000000003,
15.848932000000001,
12.589253999999999,
10.0,
7.943282000000001,
6.309573,
5.011872,
3.981072,
3.1622779999999997,
2.511886,
1.9952619999999999,
1.584893,
1.258925,
1.0,
0.7943279999999999,
0.630957,
0.5011869999999999,
0.398107,
0.316228,
0.251189,
0.199526,
0.15848900000000002,
0.125893,
0.1,
0.079433,
0.063096,
0.050119,
0.039811,
0.031623000000000005,
0.025119,
0.019953,
0.015849000000000002,
0.012589,
0.01]
and here is the code which works but not the way I want:
import numpy as np
import matplotlib.pyplot as plt
from lmfit import minimize, Parameters
#%% the equation
def ColeCole(params, fr): #fr is x values array and params are the fitting parameters
sig0 = params['sig0']
m = params['m']
tau = params['tau']
c = params['c']
w = fr*2*np.pi
num = 1
denom = 1+(1j*w*tau)**c
sigComplex = sig0*(1.0+(m/(1-m))*(1-num/denom))
return sigComplex
def res(params, fr, data): #calculating reseduals of fit
resedual = ColeCole(params, fr) - data
return resedual.view(np.float)
#%% Adding model parameters and fitting
params = Parameters()
params.add('sig0', value=0.00166)
params.add('m', value=0.19,)
params.add('tau', value=0.05386)
params.add('c', value=0.80)
params['tau'].min = 0 # these conditions must be met but even if I remove them the fit is ugly!!
params['m'].min = 0
out= minimize(res, params , args= (np.array(fr2), np.array(data)))
#%%plotting Imaginary part
fig, ax = plt.subplots()
plotX = fr2
plotY = data.imag
fitplot = ColeCole(out.params, fr2)
ax.semilogx(plotX,plotY,'o',label='imc')
ax.semilogx(plotX,fitplot.imag,label='fit')
#%%plotting real part
fig2, ax2 = plt.subplots()
plotX2 = fr2
plotY2 = data.real
fitplot2 = ColeCole(out.params, fr2)
ax2.semilogx(plotX2,plotY2,'o',label='imc')
ax2.semilogx(plotX2,fitplot2.real,label='fit')
I might be doing it completely wrong, please help me if you know the proper solution to do a curve fitting on complex data.
I would suggest first converting the complex data to numpy arrays and get real, imag pairs separately and then using lmfit Model to model that same sort of data. Perhaps something like this:
cdata = np.array((0.00011342104914066835+8.448890220616275e-07j,
0.00011340386404065371+7.379293582429708e-07j,
0.0001133540327309949+6.389834505824625e-07j,
0.00011332170913939336+5.244566142401774e-07j,
0.00011331311156154074+4.3841061618015007e-07j,
0.00011329383047059048+3.6163513508002877e-07j,
0.00011328700094846502+3.0542249453666894e-07j,
0.00011327650033983806+2.548725558622188e-07j,
0.00011327702539337786+2.2508174567697671e-07j,
0.00011327342238146558+1.9607648998100523e-07j,
0.0001132710747364799+1.721721661949941e-07j,
0.00011326933241850936+1.5246061350710235e-07j,
0.00011326798040984542+1.3614817802178457e-07j,
0.00011326752037650585+1.233483784504962e-07j,
0.00011326758290166552+1.1258801448459512e-07j,
0.00011326813100914905+1.0284749122099354e-07j,
0.0001132684076390416+9.45791423595816e-08j,
0.00011326982474882009+8.733105218572698e-08j,
0.00011327158639135678+8.212191452217794e-08j,
0.00011327366823516856+7.747920115589205e-08j,
0.00011327694366034208+7.227069986108343e-08j,
0.00011327915327873038+6.819405851172907e-08j,
0.00011328181165961218+6.468392148750885e-08j,
0.00011328531688122571+6.151393311227958e-08j,
0.00011328857849500441+5.811704586613896e-08j,
0.00011329241716561626+5.596645863242474e-08j,
0.0001132970129528527+5.4722461511610696e-08j,
0.0001133002881788021+5.064523218904898e-08j,
0.00011330507671740223+5.0307457368330284e-08j,
0.00011331106068787993+4.7703959367963307e-08j,
0.00011331577350707601+4.634615394867111e-08j,
0.00011332064001939156+4.6914747648361504e-08j,
0.00011333034985824086+4.4992151257444304e-08j,
0.00011334188526870483+4.363662798446445e-08j,
0.00011335491299924776+4.364164366097129e-08j,
0.00011337451201475147+4.262881852644385e-08j,
0.00011339778209066752+4.275096587356569e-08j,
0.00011342832992628646+4.4463907608604945e-08j,
0.00011346526768580432+4.35706649329342e-08j,
0.00011351108008292451+4.4155812379491554e-08j,
0.00011356967192325835+4.327004709646922e-08j,
0.00011364164970635006+4.420660396556604e-08j,
0.00011373150199883139+4.3672898914161596e-08j,
0.00011384660942003356+4.326171366194325e-08j,
0.00011399193321804955+4.1493065523925126e-08j,
0.00011418043916260295+4.0762418512759096e-08j,
0.00011443271767970721+3.91359909722939e-08j,
0.00011479600563688605+3.845666332695652e-08j,
0.0001153652105925112+3.6224677316584614e-08j,
0.00011638635682516399+3.386843079212692e-08j,
0.00011836223959714231+3.6692295450490655e-08j))
fr = np.array((999.9999960000001, 794.328231, 630.957342,
501.18723099999994, 398.107168, 316.22776400000004,
251.188642, 199.52623, 158.489318, 125.89254, 99.999999,
79.432823, 63.095734, 50.118722999999996, 39.810717,
31.622776, 25.118864000000002, 19.952623000000003,
15.848932000000001, 12.589253999999999, 10.0,
7.943282000000001, 6.309573, 5.011872, 3.981072,
3.1622779999999997, 2.511886, 1.9952619999999999, 1.584893,
1.258925, 1.0, 0.7943279999999999, 0.630957,
0.5011869999999999, 0.398107, 0.316228, 0.251189, 0.199526,
0.15848900000000002, 0.125893, 0.1, 0.079433, 0.063096,
0.050119, 0.039811, 0.031623000000000005, 0.025119, 0.019953,
0.015849000000000002, 0.012589, 0.01))
data = np.concatenate((cdata.real, cdata.imag))
# model function for lmfit
def colecole_function(x, sig0, m, tau, c):
w = x*2*np.pi
denom = 1+(1j*w*tau)**c
sig = sig0*(1.0+(m/(1.0-m))*(1-1.0/denom))
return np.concatenate((sig.real, sig.imag))
mod = Model(colecole_function)
params = mod.make_params(sig0=0.002, m=-0.19, tau=0.05, c=0.8)
params['tau'].min = 0
result = mod.fit(data, params, x=fr)
print(result.fit_report())
You would then want to plot the results like
nf = len(fr)
plt.plot(fr, data[:nf], label='data(real)')
plt.plot(fr, result.best_fit[:nf], label='fit(real)')
and similarly
plt.plot(fr, data[nf:], label='data(imag)')
plt.plot(fr, result.best_fit[nf:], label='fit(imag)')
Note that I think you're going to want to allow m to be negative (or maybe I misuderstand your model). I did not work carefully on getting a great fit, but I think this should get you started.
I would like to fit an hysteresis curve, with a superparamagnetic behavior, using a magnetic model which includes a Langevin function and a pair distribution function [1]. To fit with this equation I must solve a definite integral. I was trying to use scipy.integrate.quad for this purpose and the features of lmfit, but I do not get -at least- to simulate a reasonable curve (see the code below). Some real physical parameters that can be used to simulate this equation are: Dm = 3.2E-9 m, w = 0.26, NT = 1.7E12, kB=1.38E-23 J/K and T=300K. Simulating with this values it must result in a superparamagnetic curve as the contained in the link below. I would appreciate any suggestion to make this code work and to improve it.
Langevin equation with PDF
www.dropbox.com/pri/get/superpara.dat?_subject_uid=197016565&w=AADkHqW1w-gE9pQkG0oLoE7tNG1J-rWxN0lcIM9ioXWiLA
from lmfit import minimize, Minimizer, Parameters, Parameter, report_fit
from numpy import loadtxt, vectorize, sqrt, log, log10, inf, exp, pi, tanh
from scipy.integrate import quad
import matplotlib.pyplot as plt
dat = loadtxt('superpara.dat')
u0_H = dat[:, 0]
dat1 = dat[:, 1]
def PDF(Dmag, u0_H, params):
v = params.valuesdict()
pdf = (1/(v['w']*sqrt(2*pi)*Dmag))*exp(-(log(Dmag/v['Dm']))**2 / (2*v['w']**2))
x = (pi/(6*v['kB']*v['T']))*v['Ms']*(u0_H*1e-4)*Dmag**3
return (pi/6)*v['Ms']*(Dmag**3)*( (1/tanh(x))-(1/x) )*pdf
def curve(u0_H, params):
return params['NT']*quad(PDF, 0.0, inf, args=(u0_H, params))[0]
vcurve = vectorize(curve, excluded=set([1]))
def fit_function(params, u0_H, dat1):
model1 = vcurve(u0_H, params)
resid1 = dat1 - model1
return resid1.flatten()
params = Parameters()
params.add('Dm' , value= 3.2e-9 , vary= True)
params.add('w' , value= 0.26 , vary= True)
params.add('Ms' , value= 0.1 , vary= True)
params.add('T' , value= 300 , vary= False)
params.add('kB' , value= 1.38e-23, vary= False)
params.add('NT' , value= 1.7e12 , vary= True)
minner = Minimizer(fit_function, params, fcn_args=(u0_H, dat1))
result = minner.minimize()
report_fit(result)
y1_fit = vcurve(u0_H, result.params)
y1_init = vcurve(u0_H, params)
plt.plot(u0_H, dat1, 'k+', u0_H, y1_init, 'b-', u0_H, y1_fit, 'r-')
plt.show()
I'm new to python and trying to plot a gaussian distribution having the function defined as
I plotted normal distribution P(x,y) and it's giving correct output. code and output are below.
Code :
Output :
Now I need to plot a conditional distribution and the output should like . to do this I need to define a boundary condition for the equation. I tried to define a boundary condition but it's not working. the code which I tried is but it's giving wrong output
please help me how to plot the same.
Thanks,
You used the boundary condition on the wrong parameter, try to do it after creating the grid points.
R = np.arange(-4, 4, 0.1)
X, Y = np.meshgrid(R, R)
then validate X and Y based on the condition
valid_xy = np.sqrt(X**2+Y**2) >= 1
X = X[valid_xy]
Y = Y[valid_xy]
Then continue with the rest of the code.
Update
If you want just to reset values around the peak to zero, you can use the following code:
import numpy as np
import matplotlib.pyplot as plt
R = np.arange(-4, 4, 0.1)
X, Y = np.meshgrid(R, R)
Z = np.sum(np.exp(-0.5*(X**2+Y**2)))
P = (1/Z)*np.exp(-0.5*(X**2+Y**2))
# reset the peak
invalid_xy = (X**2+Y**2)<1
P[invalid_xy] = 0
# plot the result
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X, Y, P, s=0.5, alpha=0.5)
plt.show()
You can't use np.meshgrid anymore because it will output a matrix where the coordinates of X and Y form a grid (hence its name) and not a custom shape (a grid minus a disc like you want):
However you can create your custom grid the following way:
R = np.arange(-,4,0.1)
xy_coord = np.array(((x,y) for x in R for y in R if (x*x + y*y) > 1))
X,Y = xy_coord.transpose()
X
# array([ 0. , 0. , 0. , ..., 3.9, 3.9, 3.9])
Y
# array([ 1.1, 1.2, 1.3, ..., 3.7, 3.8, 3.9])
I am trying to fit some data with a Gaussian (and more complex) function(s). I have created a small example below.
My first question is, am I doing it right?
My second question is, how do I add an error in the x-direction, i.e. in the x-position of the observations/data?
It is very hard to find nice guides on how to do this kind of regression in pyMC. Perhaps because its easier to use some least squares, or similar approach, I however have many parameters in the end and need to see how well we can constrain them and compare different models, pyMC seemed like the good choice for that.
import pymc
import numpy as np
import matplotlib.pyplot as plt; plt.ion()
x = np.arange(5,400,10)*1e3
# Parameters for gaussian
amp_true = 0.2
size_true = 1.8
ps_true = 0.1
# Gaussian function
gauss = lambda x,amp,size,ps: amp*np.exp(-1*(np.pi**2/(3600.*180.)*size*x)**2/(4.*np.log(2.)))+ps
f_true = gauss(x=x,amp=amp_true, size=size_true, ps=ps_true )
# add noise to the data points
noise = np.random.normal(size=len(x)) * .02
f = f_true + noise
f_error = np.ones_like(f_true)*0.05*f.max()
# define the model/function to be fitted.
def model(x, f):
amp = pymc.Uniform('amp', 0.05, 0.4, value= 0.15)
size = pymc.Uniform('size', 0.5, 2.5, value= 1.0)
ps = pymc.Normal('ps', 0.13, 40, value=0.15)
#pymc.deterministic(plot=False)
def gauss(x=x, amp=amp, size=size, ps=ps):
e = -1*(np.pi**2*size*x/(3600.*180.))**2/(4.*np.log(2.))
return amp*np.exp(e)+ps
y = pymc.Normal('y', mu=gauss, tau=1.0/f_error**2, value=f, observed=True)
return locals()
MDL = pymc.MCMC(model(x,f))
MDL.sample(1e4)
# extract and plot results
y_min = MDL.stats()['gauss']['quantiles'][2.5]
y_max = MDL.stats()['gauss']['quantiles'][97.5]
y_fit = MDL.stats()['gauss']['mean']
plt.plot(x,f_true,'b', marker='None', ls='-', lw=1, label='True')
plt.errorbar(x,f,yerr=f_error, color='r', marker='.', ls='None', label='Observed')
plt.plot(x,y_fit,'k', marker='+', ls='None', ms=5, mew=2, label='Fit')
plt.fill_between(x, y_min, y_max, color='0.5', alpha=0.5)
plt.legend()
I realize that I might have to run more iterations, use burn in and thinning in the end. The figure plotting the data and the fit is seen here below.
The pymc.Matplot.plot(MDL) figures looks like this, showing nicely peaked distributions. This is good, right?
My first question is, am I doing it right?
Yes! You need to include a burn-in period, which you know. I like to throw out the first half of my samples. You don't need to do any thinning, but sometimes it will make your post-MCMC work faster to process and smaller to store.
The only other thing I advise is to set a random seed, so that your results are "reproducible": np.random.seed(12345) will do the trick.
Oh, and if I was really giving too much advice, I'd say import seaborn to make the matplotlib results a little more beautiful.
My second question is, how do I add an error in the x-direction, i.e. in the x-position of the observations/data?
One way is to include a latent variable for each error. This works in your example, but will not be feasible if you have many more observations. I'll give a little example to get you started down this road:
# add noise to observed x values
x_obs = pm.rnormal(mu=x, tau=(1e4)**-2)
# define the model/function to be fitted.
def model(x_obs, f):
amp = pm.Uniform('amp', 0.05, 0.4, value= 0.15)
size = pm.Uniform('size', 0.5, 2.5, value= 1.0)
ps = pm.Normal('ps', 0.13, 40, value=0.15)
x_pred = pm.Normal('x', mu=x_obs, tau=(1e4)**-2) # this allows error in x_obs
#pm.deterministic(plot=False)
def gauss(x=x_pred, amp=amp, size=size, ps=ps):
e = -1*(np.pi**2*size*x/(3600.*180.))**2/(4.*np.log(2.))
return amp*np.exp(e)+ps
y = pm.Normal('y', mu=gauss, tau=1.0/f_error**2, value=f, observed=True)
return locals()
MDL = pm.MCMC(model(x_obs, f))
MDL.use_step_method(pm.AdaptiveMetropolis, MDL.x_pred) # use AdaptiveMetropolis to "learn" how to step
MDL.sample(200000, 100000, 10) # run chain longer since there are more dimensions
It looks like it may be hard to get good answers if you have noise in x and y:
Here is a notebook collecting this all up.
EDIT: Important note
This has been bothering me for a while now. The answers given by myself and Abraham here are correct in the sense that they add variability to x. HOWEVER: Note that you cannot simply add uncertainty in this way to cancel out the errors you have in your x-values, so that you regress against "true x". The methods in this answer can show you how adding errors to x affects your regression if you have the true x. If you have a mismeasured x, these answers will not help you. Having errors in the x-values is a very tricky problem to solve, as it leads to "attenuation" and an "errors-in-variables effect". The short version is: having unbiased, random errors in x leads to bias in your regression estimates. If you have this problem, check out Carroll, R.J., Ruppert, D., Crainiceanu, C.M. and Stefanski, L.A., 2006. Measurement error in nonlinear models: a modern perspective. Chapman and Hall/CRC., or for a Bayesian approach, Gustafson, P., 2003. Measurement error and misclassification in statistics and epidemiology: impacts and Bayesian adjustments. CRC Press. I ended up solving my specific problem using Carroll et al.'s SIMEX method along with PyMC3. The details are in Carstens, H., Xia, X. and Yadavalli, S., 2017. Low-cost energy meter calibration method for measurement and verification. Applied energy, 188, pp.563-575. It is also available on ArXiv
I converted Abraham Flaxman's answer above into PyMC3, in case someone needs it. Some very minor changes, but can be confusing nevertheless.
The first is that the deterministic decorator #Deterministic is replaced by a distribution-like call function var=pymc3.Deterministic(). Second, when generating a vector of normally distributed random variables,
rvs = pymc2.rnormal(mu=mu, tau=tau)
is replaced by
rvs = pymc3.Normal('var_name', mu=mu, tau=tau,shape=size(var)).random()
The complete code is as follows:
import numpy as np
from pymc3 import *
import matplotlib.pyplot as plt
# set random seed for reproducibility
np.random.seed(12345)
x = np.arange(5,400,10)*1e3
# Parameters for gaussian
amp_true = 0.2
size_true = 1.8
ps_true = 0.1
#Gaussian function
gauss = lambda x,amp,size,ps: amp*np.exp(-1*(np.pi**2/(3600.*180.)*size*x)**2/(4.*np.log(2.)))+ps
f_true = gauss(x=x,amp=amp_true, size=size_true, ps=ps_true )
# add noise to the data points
noise = np.random.normal(size=len(x)) * .02
f = f_true + noise
f_error = np.ones_like(f_true)*0.05*f.max()
with Model() as model3:
amp = Uniform('amp', 0.05, 0.4, testval= 0.15)
size = Uniform('size', 0.5, 2.5, testval= 1.0)
ps = Normal('ps', 0.13, 40, testval=0.15)
gauss=Deterministic('gauss',amp*np.exp(-1*(np.pi**2*size*x/(3600.*180.))**2/(4.*np.log(2.)))+ps)
y =Normal('y', mu=gauss, tau=1.0/f_error**2, observed=f)
start=find_MAP()
step=NUTS()
trace=sample(2000,start=start)
# extract and plot results
y_min = np.percentile(trace.gauss,2.5,axis=0)
y_max = np.percentile(trace.gauss,97.5,axis=0)
y_fit = np.percentile(trace.gauss,50,axis=0)
plt.plot(x,f_true,'b', marker='None', ls='-', lw=1, label='True')
plt.errorbar(x,f,yerr=f_error, color='r', marker='.', ls='None', label='Observed')
plt.plot(x,y_fit,'k', marker='+', ls='None', ms=5, mew=1, label='Fit')
plt.fill_between(x, y_min, y_max, color='0.5', alpha=0.5)
plt.legend()
Which results in
y_error
For errors in x (note the 'x' suffix to variables):
# define the model/function to be fitted in PyMC3:
with Model() as modelx:
x_obsx = pm3.Normal('x_obsx',mu=x, tau=(1e4)**-2, shape=40)
ampx = Uniform('ampx', 0.05, 0.4, testval=0.15)
sizex = Uniform('sizex', 0.5, 2.5, testval=1.0)
psx = Normal('psx', 0.13, 40, testval=0.15)
x_pred = Normal('x_pred', mu=x_obsx, tau=(1e4)**-2*np.ones_like(x_obsx),testval=5*np.ones_like(x_obsx),shape=40) # this allows error in x_obs
gauss=Deterministic('gauss',ampx*np.exp(-1*(np.pi**2*sizex*x_pred/(3600.*180.))**2/(4.*np.log(2.)))+psx)
y = Normal('y', mu=gauss, tau=1.0/f_error**2, observed=f)
start=find_MAP()
step=NUTS()
tracex=sample(20000,start=start)
Which results in:
x_error_graph
the last observation is that when doing
traceplot(tracex[100:])
plt.tight_layout();
(result not shown), we can see that sizex seems to be suffering from 'attenuation' or 'regression dilution' due to the error in the measurement of x.