I'm trying to fit the distribution of some experimental values with a custom probability density function. Obviously, the integral of the resulting function should always be equal to 1, but the results of simple scipy.optimize.curve_fit(function, dataBincenters, dataCounts) never satisfy this condition.
What is the best way to solve this problem?
You can define your own residuals function, including a penalization parameter, like detailed in the code below, where it is known beforehand that the integral along the interval must be 2.. If you test without the penalization you will see that what your are getting is the conventional curve_fit:
import matplotlib.pyplot as plt
import scipy
from scipy.optimize import curve_fit, minimize, leastsq
from scipy.integrate import quad
from scipy import pi, sin
x = scipy.linspace(0, pi, 100)
y = scipy.sin(x) + (0. + scipy.rand(len(x))*0.4)
def func1(x, a0, a1, a2, a3):
return a0 + a1*x + a2*x**2 + a3*x**3
# here you include the penalization factor
def residuals(p, x, y):
integral = quad(func1, 0, pi, args=(p[0], p[1], p[2], p[3]))[0]
penalization = abs(2.-integral)*10000
return y - func1(x, p[0], p[1], p[2], p[3]) - penalization
popt1, pcov1 = curve_fit(func1, x, y)
popt2, pcov2 = leastsq(func=residuals, x0=(1., 1., 1., 1.), args=(x, y))
y_fit1 = func1(x, *popt1)
y_fit2 = func1(x, *popt2)
plt.scatter(x, y, marker='.')
plt.plot(x, y_fit1, color='g', label='curve_fit')
plt.plot(x, y_fit2, color='y', label='constrained')
plt.legend()
plt.xlim(-0.1, 3.5)
plt.ylim(0, 1.4)
print('Exact integral:', quad(sin, 0, pi)[0])
print('Approx integral1:', quad(func1, 0, pi, args=(popt1[0], popt1[1], popt1[2], popt1[3]))[0])
print('Approx integral2:', quad(func1, 0, pi, args=(popt2[0], popt2[1], popt2[2], popt2[3]))[0])
plt.show()
#Exact integral: 2.0
#Approx integral1: 2.60068579748
#Approx integral2: 2.00001911981
Other related questions:
SciPy LeastSq Goodness of Fit Estimator
Here is an almost-identical snippet which makes only use of curve_fit.
import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize as opt
import scipy.integrate as integr
x = np.linspace(0, np.pi, 100)
y = np.sin(x) + (0. + np.random.rand(len(x))*0.4)
def Func(x, a0, a1, a2, a3):
return a0 + a1*x + a2*x**2 + a3*x**3
# modified function definition with Penalization
def FuncPen(x, a0, a1, a2, a3):
integral = integr.quad( Func, 0, np.pi, args=(a0,a1,a2,a3))[0]
penalization = abs(2.-integral)*10000
return a0 + a1*x + a2*x**2 + a3*x**3 + penalization
popt1, pcov1 = opt.curve_fit( Func, x, y )
popt2, pcov2 = opt.curve_fit( FuncPen, x, y )
y_fit1 = Func(x, *popt1)
y_fit2 = Func(x, *popt2)
plt.scatter(x,y, marker='.')
plt.plot(x,y_fit2, color='y', label='constrained')
plt.plot(x,y_fit1, color='g', label='curve_fit')
plt.legend(); plt.xlim(-0.1,3.5); plt.ylim(0,1.4)
print 'Exact integral:',integr.quad(np.sin ,0,np.pi)[0]
print 'Approx integral1:',integr.quad(Func,0,np.pi,args=(popt1[0],popt1[1],
popt1[2],popt1[3]))[0]
print 'Approx integral2:',integr.quad(Func,0,np.pi,args=(popt2[0],popt2[1],
popt2[2],popt2[3]))[0]
plt.show()
#Exact integral: 2.0
#Approx integral1: 2.66485028754
#Approx integral2: 2.00002116217
Following the example above here is more general way to add any constraints:
from scipy.optimize import minimize
from scipy.integrate import quad
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0, np.pi, 100)
y = np.sin(x) + (0. + np.random.rand(len(x))*0.4)
def func_to_fit(x, params):
return params[0] + params[1] * x + params[2] * x ** 2 + params[3] * x ** 3
def constr_fun(params):
intgrl, _ = quad(func_to_fit, 0, np.pi, args=(params,))
return intgrl - 2
def func_to_minimise(params, x, y):
y_pred = func_to_fit(x, params)
return np.sum((y_pred - y) ** 2)
# Do the parameter fitting
#without constraints
res1 = minimize(func_to_minimise, x0=np.random.rand(4), args=(x, y))
params1 = res1.x
# with constraints
cons = {'type': 'eq', 'fun': constr_fun}
res2 = minimize(func_to_minimise, x0=np.random.rand(4), args=(x, y), constraints=cons)
params2 = res2.x
y_fit1 = func_to_fit(x, params1)
y_fit2 = func_to_fit(x, params2)
plt.scatter(x,y, marker='.')
plt.plot(x, y_fit2, color='y', label='constrained')
plt.plot(x, y_fit1, color='g', label='curve_fit')
plt.legend(); plt.xlim(-0.1,3.5); plt.ylim(0,1.4)
plt.show()
print(f"Constrant violation: {constr_fun(params1)}")
Constraint violation: -2.9179325622408214e-10
If you are able normalise your probability fitting function in advance then you can use this information to constrain your fit. A very simple example of this would be fitting a Gaussian to data. If one were to fit the following three-parameter (A, mu, sigma) Gaussian then it would be unnormalised in general:
however, if one instead enforces the normalisation condition on A:
then the Gaussian is only two parameter and is automatically normalised.
You could ensure that your fitted probability distribution is normalised via a numerical integration. For example, assuming that you have data x and y and that you have defined an unnormalised_function(x, a, b) with parameters a and b for your probability distribution, which is defined on the interval x1 to x2 (which could be infinite):
from scipy.optimize import curve_fit
from scipy.integrate import quad
# Define a numerically normalised function
def normalised_function(x, a, b):
normalisation, _ = quad(lambda x: unnormalised_function(x, a, b), x1, x2)
return unnormalised_function(x, a, b)/normalisation
# Do the parameter fitting
fitted_parameters, _ = curve_fit(normalised_function, x, y)
Related
I'm trying to make a piecewise linear fit consisting of 3 pieces whereof the first and last pieces are constant. As you can see in this figure
don't get the expected fit, since the fit doesn't capture the 3 linear pieces clearly visual from the original data points.
I've tried following this question and expanded it to the case of 3 pieces with the two constant pieces, but I must have done something wrong.
Here is my code:
from scipy import optimize
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 6]
x = np.arange(0, 50, dtype=float)
y = np.array([50 for i in range(10)]
+ [50 - (50-5)/31 * i for i in range(1, 31)]
+ [5 for i in range(10)],
dtype=float)
def piecewise_linear(x, x0, y0, x1, y1):
return np.piecewise(x,
[x < x0, (x >= x0) & (x < x1), x >= x1],
[lambda x:y0, lambda x:(y1-y0)/(x1-x0)*(x-x0)+y0, lambda x:y1])
p , e = optimize.curve_fit(piecewise_linear, x, y)
xd = np.linspace(0, 50, 101)
plt.plot(x, y, "o", label='original data')
plt.plot(xd, piecewise_linear(xd, *p), label='piecewise linear fit')
plt.legend()
The accepted answer to the previous mentioned question suggest looking at segments_fit.ipynb for the case of N parts, but following that it doesn't seem that I can specify, that the first and last pieces should be constant.
Furthermore I do get the following warning:
OptimizeWarning: Covariance of the parameters could not be estimated
What do I do wrong?
You could directly copy the segments_fit implementation
from scipy import optimize
def segments_fit(X, Y, count):
xmin = X.min()
xmax = X.max()
seg = np.full(count - 1, (xmax - xmin) / count)
px_init = np.r_[np.r_[xmin, seg].cumsum(), xmax]
py_init = np.array([Y[np.abs(X - x) < (xmax - xmin) * 0.01].mean() for x in px_init])
def func(p):
seg = p[:count - 1]
py = p[count - 1:]
px = np.r_[np.r_[xmin, seg].cumsum(), xmax]
return px, py
def err(p):
px, py = func(p)
Y2 = np.interp(X, px, py)
return np.mean((Y - Y2)**2)
r = optimize.minimize(err, x0=np.r_[seg, py_init], method='Nelder-Mead')
return func(r.x)
Then you apply it as follows
import numpy as np;
# mimic your data
x = np.linspace(0, 50)
y = 50 - np.clip(x, 10, 40)
# apply the segment fit
fx, fy = segments_fit(x, y, 3)
This will give you (fx,fy) the corners your piecewise fit, let's plot it
import matplotlib.pyplot as plt
# show the results
plt.figure(figsize=(8, 3))
plt.plot(fx, fy, 'o-')
plt.plot(x, y, '.')
plt.legend(['fitted line', 'given points'])
EDIT: Introducing constant segments
As mentioned in the comments the above example doesn't guarantee that the output will be constant in the end segments.
Based on this implementation the easier way I can think is to restrict func(p) to do that, a simple way to ensure a segment is constant, is to set y[i+1]==y[i]. Thus I added xanchor and yanchor. If you give an array with repeated numbers you can bind multiple points to the same value.
from scipy import optimize
def segments_fit(X, Y, count, xanchors=slice(None), yanchors=slice(None)):
xmin = X.min()
xmax = X.max()
seg = np.full(count - 1, (xmax - xmin) / count)
px_init = np.r_[np.r_[xmin, seg].cumsum(), xmax]
py_init = np.array([Y[np.abs(X - x) < (xmax - xmin) * 0.01].mean() for x in px_init])
def func(p):
seg = p[:count - 1]
py = p[count - 1:]
px = np.r_[np.r_[xmin, seg].cumsum(), xmax]
py = py[yanchors]
px = px[xanchors]
return px, py
def err(p):
px, py = func(p)
Y2 = np.interp(X, px, py)
return np.mean((Y - Y2)**2)
r = optimize.minimize(err, x0=np.r_[seg, py_init], method='Nelder-Mead')
return func(r.x)
I modified a little the data generation to make it more clear the effect of the change
import matplotlib.pyplot as plt
import numpy as np;
# mimic your data
x = np.linspace(0, 50)
y = 50 - np.clip(x, 10, 40) + np.random.randn(len(x)) + 0.25 * x
# apply the segment fit
fx, fy = segments_fit(x, y, 3)
plt.plot(fx, fy, 'o-')
plt.plot(x, y, '.k')
# apply the segment fit with some consecutive points having the
# same anchor
fx, fy = segments_fit(x, y, 3, yanchors=[1,1,2,2])
plt.plot(fx, fy, 'o--r')
plt.legend(['fitted line', 'given points', 'with const segments'])
You can get a one line solution (not counting the import) using univariate splines of degree one. Like this
from scipy.interpolate import UnivariateSpline
f = UnivariateSpline(x,y,k=1,s=0)
Here k=1 means we interpolate using polynomials of degree one aka lines. s is the smoothing parameter. It decides how much you want to compromise on the fit to avoid using too many segments. Setting it to zero means no compromises i.e. the line HAS to go threw all points. See the documentation.
Then
plt.plot(x, y, "o", label='original data')
plt.plot(x, f(x), label='linear interpolation')
plt.legend()
plt.savefig("out.png", dpi=300)
gives
This i consider a funny non-linear approach that works quite well.
Note that even though this is highly non-linear it approximates the linear behavior very well. Moreover, the fit parameters provide the linear results. Only for the offset b a little transformation and according error propagation is required. (Also, I don't care about the value of p as long as it is somewhat larger than 5)
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit
np.set_printoptions( linewidth=250, precision=4)
np.set_printoptions( linewidth=250, precision=4)
### piecewise linear function for data generation
def pwl( x, m, b, a1, a2 ):
if x < a1:
out = pwl( a1, m, b, a1, a2 )
elif x > a2:
out = pwl( a2, m, b, a1, a2 )
else:
out = m * x + b
return out
### non-linear approximation
def func( x, m, b, a1, a2, p ):
out = b + np.log(
1 / ( 1 + np.exp( -m *( x - a1 ) )**p )
) / p - np.log(
1 / ( 1 + np.exp( -m * ( x - a2 ) )**p )
) / p
return out
### some data
nn = 36
xdata = np.linspace( -5, 19, nn )
ydata = np.fromiter( (pwl( x, -2.1, 11.6, -1.1, 12.7 ) for x in xdata ), float)
ydata += np.random.normal( size=nn, scale=0.2)
### dense grid for printing
xth = np.linspace( -5, 19, 150 )
###fitting
popt, cov = curve_fit( func, xdata, ydata, p0=[-2, 11, -1, 10, 1])
mF, betaF, a1F, a2F, pF = popt
bF = betaF - mF * a1F
sol=( mF, bF, a1F, a2F, pF )
### transforming the covariance due to the b' -> b mapping
J1 = np.identity(5)
J1[1,0] = -popt[2]
J1[1,2] = -popt[0]
cov2 = np.dot( J1, np.dot( cov, np.transpose( J1 ) ) )
### results
print( cov2 )
for i, v in enumerate( ("m", "b", "a1", "a2", "p" ) ):
print( "{:>2} = {:+2.4e} ± {:0.4e}".format( v, sol[i], np.sqrt( cov2[i,i] ) ) )
### plotting
fig = plt.figure()
ax = fig.add_subplot( 1, 1, 1 )
ax.plot( xdata, ydata, ls='', marker='+' )
ax.plot( xth, func( xth, -2, 11, -1, 10, 1 ) )
ax.plot( xth, func( xth, *popt ) )
plt.show()
Providing
[[ 1.3553e-04 -7.6291e-04 -4.3488e-04 4.5624e-04 1.2619e-01]
[-7.6291e-04 6.4126e-03 3.4560e-03 -1.5573e-03 -7.4983e-01]
[-4.3488e-04 3.4560e-03 3.4741e-03 -9.8284e-04 -4.2344e-01]
[ 4.5624e-04 -1.5573e-03 -9.8284e-04 3.0842e-03 -5.2739e+00]
[ 1.2619e-01 -7.4983e-01 -4.2344e-01 -5.2739e+00 3.1583e+05]]
m = -2.0810e+00 ± 9.7718e-03
b = +1.1463e+01 ± 6.7217e-02
a1 = -1.2545e+00 ± 5.0384e-02
a2 = +1.2739e+01 ± 4.7176e-02
p = +1.6840e+01 ± 2.9872e+02
and
I was wondering if it is possible to set bounds for the parameters in curve_fit() such that the bounds are dependent on another parameter. For example, say if I wanted to set the slope of a line to be greater than the intercept.
def linear(x, m, b):
return lambda x: (m*x) + b
def plot_linear(x, y):
B = ([b, -np.inf], [np.inf, np.inf])
p, v = curve_fit(linear, x, y, bounds = B)
xs = np.linspace(min(x), max(x), 1000)
plt.plot(x,y,'.')
plt.plot(xs, linear(xs, *p), '-')
I know that this doesn't work because the parameter b is not defined before it is called in the bounds, but I am not sure if there is a way to make this work?
We can always re-parameterize w.r.t. the specific curve-fitting problem. For example, if you wanted to fit y=mx+b s.t. m >= b, it can be re-written as m=b+k*k with another parameter k and we can optimize with the parameters b, k now as follows:
def linear(x, m, b):
return m*x + b
def linear2(x, k, b): # constrained fit, m = b + k**2 >= b
return (b+k**2)*x + b
def plot_linear(x, y):
p, v = curve_fit(linear, x, y)
print(p)
# [3.1675609 6.01025041]
p2, v2 = curve_fit(linear2, x, y)
print(p2)
# [2.13980283e-05 4.99368661e+00]
xs = np.linspace(min(x), max(x), 1000)
plt.plot(x,y,'.')
plt.plot(xs, linear(xs, *p), 'r-', label='unconstrained fit')
plt.plot(xs, linear2(xs, *p2), 'b-', label='constrained (m>b) fit')
plt.legend()
Now let's fit the curves on following data, using both the constrained and unconstrained fit functions (note the unconstrained optimal fit will have slope less than intercept)
x = np.linspace(0,1,100)
y = 3*x + 5 + 2*np.random.rand(len(x))
plot_linear(x, y)
To my understanding, Statsmodel WLS (weighted least squared) and curve_fit (which uses least-squares by default) should provide the same outcome.
When using non weighted values, it does (code below #1). However, when I used weights, the output values become totally different (code below #2).
Why is this?
_ #1 non weighted
import numpy as np
import statsmodels.api as sm
import pandas as pd
from scipy.optimize import curve_fit
Y = [-5,2,3,4,5,6,7]
X = [1,2,3,4,5,6,7]
W = [1,1,1,1,1,1,1]
def equation_func (x, m, c):
return m * x + c
popt, pcov = curve_fit(equation_func,X, Y, p0=[-1.6, 1.28], sigma=[1 / w for w in W],
absolute_sigma=True)
curve_results = [equation_func(x, popt[0], popt[1]) for x in X]
wls_model = sm.WLS(Y,sm.add_constant(X), weights=W)
results = wls_model.fit()
print(results.params)
print([popt[1], popt[0]])
#WLS - [-3.42857143 1.64285714]
#Curve_fit - [-3.42857143, 1.64285714]
_#2 Weighted
Y = [-5,2,3,4,5,6,7]
X = [1,2,3,4,5,6,7]
W = [1,2,3,4,5,6,7]
def equation_func (x, m, c):
return m * x + c
popt, pcov = curve_fit(equation_func,X, Y, p0=[-1.6, 1.28], sigma=[1 / w for w in W],
absolute_sigma=True)
curve_results = [equation_func(x, popt[0], popt[1]) for x in X]
wls_model = sm.WLS(Y,sm.add_constant(X), weights=W)
results = wls_model.fit()
print(results.params)
print([popt[1], popt[0]])
#WLS - [-1.64285714 1.28571429]
#Curve_fit - [-0.5840336134466501, 1.0966386554667582]
I want to solve a non linear ordinary differential equation of the form
Theta2 = (C + j(Theta2))**-1 * (f(t) – g(Theta1) -h(Theta0))
Where f(), g(), h(), and j() are functions already defined that take Theta2, Theta1, Theta0 or t as an input. Theta2 and Theta1 are the second and first derivative of Theta0 with time t.
I have been solving the equation without the j(Theta2) term using the SciPy.odeint function using the following code:
from scipy.integrate import odeint
def ODE():
def g(Theta, t):
Theta0 = Theta[0]
Theta1 = Theta[1]
Theta2 = (1/C)*( f(t) - g(Theta1) - h(Theta0))
return Theta1, Theta2
init = 0, 0 # Initial conditions on theta0 and theta1 (velocity) at t=0
sol=odeint(g, init, t)
A = sol[:,1]
B = sol[:,0]
return(A, B)
The equation could be re-written as:
F(t, theta, theta')
theta'' = -------------------
a + b*theta''
where a and b are constants, and F corresponds to (f(t) – g(Theta1) -h(Theta0)).
It is a second order polynomial function of theta'', with 2 solutions (considering b!=0 and a^2 + 4*b*F>0) :
theta'' = -( sqrt(a^2 + 4*b*F) +/- a )/(2*b)
This new equation is of the form y' = f(t, y) which could be solved using regular ODE solver.
Here is an example using solve_ivp which is the replacement for odeint:
import numpy as np
from scipy.integrate import solve_ivp
import matplotlib.pyplot as plt
a = 20
b = 1
def f(t, y, dydt):
return t + y**2
def ode_function_plus(t, Y):
y = Y[0]
dydt = Y[1]
d2y_dt2 = -(np.sqrt(a**2 + 4*b*f(t, y, dydt)) + a )/(2*b)
return [dydt, d2y_dt2]
def ode_function_minus(t, Y):
y = Y[0]
dydt = Y[1]
d2y_dt2 = -(np.sqrt(a**2 + 4*b*f(t, y, dydt)) - a )/(2*b)
return [dydt, d2y_dt2]
# Solve
t_span = [0, 4]
Y0 = [10, 1]
sol_plus = solve_ivp(ode_function_plus, t_span, Y0)
sol_minus = solve_ivp(ode_function_minus, t_span, Y0)
print(sol_plus.message)
# Graph
plt.plot(sol_plus.t, sol_plus.y[0, :], label='solution +a');
plt.plot(sol_minus.t, sol_minus.y[0, :], label='solution -a');
plt.xlabel('time'); plt.ylabel('y'); plt.legend();
I'm working on the analysis of some data. Theoretically, there should be two gaussians, that overlap more or less. I found out that fitting the data works best if you don't fit the two gaussians but their distribution function. Actually, this fit seems to work rather well, but if i go back to the density representation of the data it looks somehow wired. The pictures attached speak for them self. Any idea what goes wrong there?
Here is my code:`
import numpy as np
import scipy
import scipy.special
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import leastsq
from scipy.special import erf
def fitfunction(params,Bins):
amp_ratio, sigma1, sigma2, mu, Delta = params
return amp_ratio * 0.5 * (1 + erf((Bins -mu)/np.sqrt(2*sigma1**2))) + (1-amp_ratio)* 0.5 * (1 + erf((Bins - (mu + Delta))/np.sqrt(2*sigma2**2)))
def errorfunction(params, Reale_werte, Bins):
amp_ratio, sigma1, sigma2, mu, Delta = params
if(amp_ratio > 0) and (amp_ratio < 1):
return (Reale_werte - fitfunction(params, Bins))
else:
return (Reale_werte - fitfunction(params, Bins))*100
def Gaussians(params, Bins):
amp_ratio, sigma1, sigma2, mu, Delta = params
return amp_ratio/np.sqrt(2*np.pi*sigma1*sigma1) * np.exp(-((Bins-mu)**2) / np.sqrt(2*np.pi*sigma1*sigma1)) + (1-amp_ratio)/np.sqrt(2*np.pi*sigma2*sigma2) * np.exp(-((Bins-(mu + Delta))**2) / np.sqrt(2*np.pi*sigma2*sigma2))
def Gaussian1(params, Bins):
amp_ratio, sigma1, sigma2, mu, Delta = params
return amp_ratio/np.sqrt(2*np.pi*sigma1*sigma1) * np.exp(-((Bins-mu)**2) / np.sqrt(2*np.pi*sigma1*sigma1))
def Gaussian2(params, Bins):
amp_ratio, sigma1, sigma2, mu, Delta = params
return (1-amp_ratio)/np.sqrt(2*np.pi*sigma2*sigma2) * np.exp(-((Bins-(mu + Delta))**2) / np.sqrt(2*np.pi*sigma2*sigma2))
params = 0.25,1,10,1,5
params_init = 0.75, 0.8, 2.5, 1.2, 4
Bins = np.linspace(-4,18,1024)
data = Gaussians(params, Bins)
summe = np.zeros_like(Bins)
for i in range(Bins.shape[0]-1):
summe[i+1] = summe[i] + data[i]
summe = summe/summe[Bins.shape[0]-1]
params_result = leastsq(errorfunction, params_init, args=(summe, Bins))
plt.plot(Bins,fitfunction(params_result[0], Bins))
plt.plot(Bins, summe, 'x')
plt.savefig("Distribution.png")
plt.show()
print params_result[0]
plt.plot(Bins,Gaussians(params_result[0], Bins))
plt.plot(Bins, data, 'x')
plt.savefig("Gaussian.png")
plt.show()`
I was wondering whether kernel density estimation works in your case:
from scipy.stats import kde
import matplotlib.pyplot as plt
density = kde.gaussian_kde(x) # your data
xgrid = np.linspace(x.min(), x.max(), 1024)
plt.plot(xgrid, density(xgrid))
plt.show()