I am trying to reproduce the confidence limits of the mean value shown in the figure above.
But something is wrong with my code: the result is far from the figure. I used the confidence intervals of the slope and intercept to draw the limits. Could someone give me a hint? Thanks
Here is my code.
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import t
from scipy import stats
x = np.array([3,7,11,15,18,27,29,30,30,31,31, 32,33,33,34,36,36,36,37,38,39,39,39,40,41,42,42,43,44,45,46,47,50])
y = np.array([5,11,21,16,16,28,27,25,35,30,40,32,34,32,34,37,38,34,36,38,37,36,45,39,41,40,44,37,44,46,46,49,51])
n = len(x)
res = stats.linregress(x, y)
tinv = lambda p, df: abs(t.ppf(p/2, df))  # two-sided inverse t quantile
ts = tinv(0.05, n - 2)
plt.plot(x, y, 'o', label='Data Points')
plt.plot(x, res.intercept + res.slope*x, 'c', label='fitted line')
plt.plot(x, (res.intercept + ts*res.intercept_stderr) + (res.slope + ts*res.stderr)*x, 'b', label='Upper Limit')
plt.plot(x, (res.intercept - ts*res.intercept_stderr) + (res.slope - ts*res.stderr)*x, 'g', label='Lower Limit')
plt.legend()
plt.show()
There are lots of ways to estimate the error of your model. As an example, here is one based on a linear fit of the absolute error:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import t
from scipy import stats
x = np.array([3,7,11,15,18,27,29,30,30,31,31, 32,33,33,34,36,36,36,37,38,39,39,39,40,41,42,42,43,44,45,46,47,50])
y = np.array([5,11,21,16,16,28,27,25,35,30,40,32,34,32,34,37,38,34,36,38,37,36,45,39,41,40,44,37,44,46,46,49,51])
n = len(x)
res = stats.linregress(x, y)
tinv = lambda p, df: abs(t.ppf(p/2, df))
ts = tinv(0.05, n - 2)
pred = res.intercept + res.slope * x
errors = np.abs(y - pred)              # absolute residuals
plt.scatter(x, y, label='Data Points')
plt.plot(x, pred, 'c', label='fitted line')
# fit a line to the absolute residuals and use it as an error envelope
error_res = stats.linregress(x, errors)
pred_err = error_res.intercept + error_res.slope * x
plt.plot(x, pred + pred_err, label='Upper confidence')
plt.plot(x, pred - pred_err, label='Lower confidence')
plt.legend()
plt.savefig('hey.png')
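If you want the textbook confidence band for the mean response instead (which appears to be what the figure shows), it follows from the t-distribution: yhat ± ts * s_err * sqrt(1/n + (x - mean(x))^2 / sum((xi - mean(x))^2)). A minimal sketch reusing the data above (the names pred, s_err, xs, mean_line and half are mine):
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import t
from scipy import stats
x = np.array([3,7,11,15,18,27,29,30,30,31,31,32,33,33,34,36,36,36,37,38,39,39,39,40,41,42,42,43,44,45,46,47,50])
y = np.array([5,11,21,16,16,28,27,25,35,30,40,32,34,32,34,37,38,34,36,38,37,36,45,39,41,40,44,37,44,46,46,49,51])
n = len(x)
res = stats.linregress(x, y)
ts = abs(t.ppf(0.05/2, n - 2))                     # two-sided 95% t quantile
pred = res.intercept + res.slope*x
s_err = np.sqrt(np.sum((y - pred)**2) / (n - 2))   # residual standard error
xs = np.linspace(x.min(), x.max(), 100)
mean_line = res.intercept + res.slope*xs
# half-width of the confidence band for the mean response
half = ts * s_err * np.sqrt(1/n + (xs - x.mean())**2 / np.sum((x - x.mean())**2))
plt.plot(x, y, 'o', label='Data Points')
plt.plot(xs, mean_line, 'c', label='fitted line')
plt.fill_between(xs, mean_line - half, mean_line + half, alpha=0.3, label='95% CI of mean')
plt.legend()
plt.show()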
I have a problem with my plot. The exact solution here should be correct, but the Euler method gives a curve with the same shape that sits much lower.
import numpy as np
import matplotlib.pyplot as plt
# Define parameters
f = lambda x, y: 2*x
h = 0.1
x = np.arange(-10, 10, h)
x0 = 0
y0 = 2
# Explicit Euler Method
y = np.zeros(len(x))
y[x0] = y0   # note: this applies the initial condition at the first grid point, x[0] = -10
for i in range(len(x) - 1):
    y[i + 1] = y[i] + h*f(x[i], y[i])
plt.figure(figsize=(12, 8))
plt.plot(x, y, 'b--', label='Euler')
plt.plot(x, 2+x**2, 'g', label='Exact')
plt.title('Numerical integration methods')
plt.xlabel('x')
plt.ylabel('y')
plt.grid()
plt.legend()
plt.show()
That's because your "exact solution" is not correct.
When you integrate y' = 2x you get y = x^2 + C, and the constant must match your initial condition: the code applies y0 = 2 at the first grid point, x[0] = -10, not at x = 0, so C = y0 - x[0]^2 = -98 and the exact solution is y = x^2 - 98.
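A minimal sketch of the corrected comparison (same setup as the question):
import numpy as np
import matplotlib.pyplot as plt
f = lambda x, y: 2*x   # dy/dx = 2x
h = 0.1
x = np.arange(-10, 10, h)
y0 = 2                 # initial value at the first grid point, x[0] = -10
# Explicit Euler method
y = np.zeros(len(x))
y[0] = y0
for i in range(len(x) - 1):
    y[i + 1] = y[i] + h*f(x[i], y[i])
# Exact solution: y' = 2x integrates to y = x**2 + C, and y(x[0]) = y0 fixes C = y0 - x[0]**2
plt.plot(x, y, 'b--', label='Euler')
plt.plot(x, x**2 + (y0 - x[0]**2), 'g', label='Exact')
plt.legend()
plt.show()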
I'd like to smooth a scatter plot shown below (the points are very dense), and the data is here.
There is large noise in the middle of the curve, and I'd like to smooth it; the y values should also increase monotonically.
Since there are lots of curves like this, it is hard to know in advance where the noise sits in each curve.
I tried scipy.signal.savgol_filter, but it didn't work.
The code I used is:
from scipy.signal import savgol_filter
from scipy import interpolate
import numpy as np
import matplotlib.pyplot as plt
s = np.loadtxt('data.csv', delimiter=',')
x = s[:, 0]
y = s[:, 1]
yhat = savgol_filter(y, 551, 3)
plt.plot(x, y, 'r')
plt.plot(x, yhat, 'b')
plt.show()
Suggestions are really appreciated. Thanks!
Update:
Following Colin's method, I get the results I want. Here is the code:
from scipy.signal import savgol_filter, medfilt
from scipy import interpolate
import numpy as np
import matplotlib.pyplot as plt
s = np.loadtxt('data.csv', delimiter=',')
x = s[:, 0]
y = s[:, 1]
yhat = savgol_filter(y, 551, 3)
tolerance = 0.2
increased_span = 150
filter_size = 11
first_pass = medfilt(y,filter_size)
diff = (y-first_pass)**2
first = np.argmax(diff>tolerance) - increased_span
last = len(y) - np.argmax(diff[::-1]>tolerance) + increased_span
print (first, last)
#interpolate between increased span
yhat[first:last] = np.interp(x[first:last], [x[first], x[last]], [y[first], y[last]])
f = interpolate.interp1d(x, yhat, kind='slinear')
x_inter = np.linspace(x[0], x[-1], 1000)
y_inter = f(x_inter)
y_inter = savgol_filter(y_inter, 41, 3)
plt.plot(x, y, 'r')
plt.plot(x_inter, y_inter, 'b')
plt.show()
If we first isolate the trouble area, there are many ways to remove the noise. Here is an example:
from scipy.signal import medfilt

tolerance = 0.2
increased_span = 150
filter_size = 11
# find the noisy region: large squared difference between the
# Savitzky-Golay result (yhat) and a median-filtered first pass
first_pass = medfilt(y, filter_size)
diff = (yhat - first_pass)**2
first = np.argmax(diff > tolerance) - increased_span
last = len(y) - np.argmax(diff[::-1] > tolerance) + increased_span
# interpolate linearly across the widened noisy span
yhat[first:last] = np.interp(x[first:last], [x[first], x[last]], [y[first], y[last]])
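The question also asks that the y values increase monotonically; one simple post-processing option (my suggestion, not part of the original answer) is a running maximum over the smoothed curve:
yhat = np.maximum.accumulate(yhat)   # clamp the smoothed values to be non-decreasing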
When attempting to fit an exponential curve to a set of data:
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit

x = np.array([30, 40, 50, 60])
y = np.array([0.027679854, 0.055639098, 0.114814815, 0.240740741])

def exponenial_func(x, a, b, c):
    return a*np.exp(-b*x) + c

popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, 1e-6, 1))
xx = np.linspace(10, 60, 1000)
yy = exponenial_func(xx, *popt)

plt.plot(x, y, 'o', xx, yy)
plt.title('Exponential Fit')
plt.xlabel(r'Temperature, C')
plt.ylabel(r'1/Time, $s^{-1}$')
plt.show()
Graph for the above code:
However, when I add the data point x = 20, y = 0.015162344:
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit

x = np.array([20, 30, 40, 50, 60])
y = np.array([0.015162344, 0.027679854, 0.055639098, 0.114814815, 0.240740741])

def exponenial_func(x, a, b, c):
    return a*np.exp(-b*x) + c

popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, 1e-6, 1))
xx = np.linspace(20, 60, 1000)
yy = exponenial_func(xx, *popt)

plt.plot(x, y, 'o', xx, yy)
plt.title('Exponential Fit')
plt.xlabel(r'Temperature, C')
plt.ylabel(r'1/Time, $s^{-1}$')
plt.show()
The above code generates the error:
RuntimeError: Optimal parameters not found: Number of calls to function has reached maxfev = 800.
If maxfev is increased to 1300:
popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, 1e-6, 1),maxfev=1300)
The graph is plotted, but it does not fit the curve correctly. Graph from the above code change, maxfev = 1300:
I think this is because the points at 20 and 30 are too close to each other? For comparison, Excel plots the data like this:
How can I plot this curve correctly?
From your data it is obvious that you need a positive exponent; therefore, b needs to be negative, since you use a*np.exp(-b*x) + c as the underlying model. However, you start with a positive initial value for b, which most likely causes the issue.
If you change
popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, 1e-6, 1))
to
popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, -1e-6, 1))
it works fine and gives the expected outcome.
Alternatively, you could also change your equation to
return a*np.exp(b*x) + c
and start with the same initial values as you had.
Here is the entire code:
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit
def exponenial_func(x, a, b, c):
    return a*np.exp(b*x) + c
x = np.array([20, 30, 40, 50, 60])
y = np.array([0.015162344, 0.027679854, 0.055639098, 0.114814815, 0.240740741])
popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, 1e-6, 1))
xx = np.linspace(20, 60, 1000)
yy = exponenial_func(xx, *popt)
# please check whether that is correct
r2 = 1. - sum((exponenial_func(x, *popt) - y) ** 2) / sum((y - np.mean(y)) ** 2)
plt.plot(x, y, 'o', xx, yy)
plt.title('Exponential Fit')
plt.xlabel(r'Temperature, C')
plt.ylabel(r'1/Time, $s^-$$^1$')
plt.text(30, 0.15, "equation:\n{:.4f} exp({:.4f} x) + {:.4f}".format(*popt))
plt.text(30, 0.1, "R^2:\n {}".format(r2))
plt.show()
I have data with 4 X values and one common Y value for all X. Y is a function of X (observed from graphs), but the function itself is unknown; that is, Y = f1(X1), Y = f2(X2), Y = f3(X3) and Y = f4(X4). I am trying to determine the functions f1, f2, f3 and f4 with Python. The variation of Y with X is shown in the figure.
MWE
import numpy as np
import matplotlib.pyplot as plt
data = np.genfromtxt('unknown_function.dat', delimiter='\t', skip_header=1)
y = data[:, 0]
x1 = data[:, 1]
x2 = data[:, 2]
x3 = data[:, 3]
x4 = data[:, 4]
fig = plt.figure()
fig.clf()
plot = plt.subplot(111)
plt.plot(x1, y, color = 'k')
plt.plot(x2, y, color = 'r')
plt.plot(x3, y, color = 'b')
plt.plot(x4, y, color = 'g')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
Are there any methods to determine this unknown function with Python?
The data in each column appear to be well-represented by the function f(x) = a0 + a1/x + a2/x^2 + a3/x^3. The scipy.optimize package (good example at http://www2.mpia-hd.mpg.de/~robitaille/PY4SCI_SS_2014/_static/15.%20Fitting%20models%20to%20data.html) can do the function fitting relatively easily and return the fitted parameters. I've added the relevant code to your MWE below for the 'x1' column of data:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
data = np.genfromtxt('unknown_function.dat', delimiter='\t', skip_header=1)
y = data[:, 0]
x1 = data[:, 1]
x2 = data[:, 2]
x3 = data[:, 3]
x4 = data[:, 4]
def f1(x, a0, a1, a2, a3):
    return a0 + a1/x + a2/x**2 + a3/x**3
popt, pcov = curve_fit(f1, x1, y)
print "Function fit to x1 data column: a0 + a1/x + a2/x**2 + a3/x**3"
print "Parameters from least-squares fit:"
print "a0 =", popt[0], "+/-", pcov[0,0]**0.5
print "a1 =", popt[1], "+/-", pcov[1,1]**0.5
print "a2 =", popt[2], "+/-", pcov[2,2]**0.5
print "a3 =", popt[3], "+/-", pcov[3,3]**0.5
fig = plt.figure()
fig.clf()
plot = plt.subplot(111)
plt.plot(x1, y, color = 'k')
xfine = np.linspace(min(x1), max(x1), 100)
plt.plot(xfine, f1(xfine, popt[0], popt[1], popt[2], popt[3]), 'r-')
plt.plot(x2, y, color = 'r')
plt.plot(x3, y, color = 'b')
plt.plot(x4, y, color = 'g')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
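The remaining columns can be fitted with the same call. A sketch under the same assumptions (reusing f1 and curve_fit from above, inserted before plt.show(); the names popt_i and xfine_i are mine):
for xi in (x2, x3, x4):
    popt_i, pcov_i = curve_fit(f1, xi, y)           # fit each column with the same basis
    xfine_i = np.linspace(min(xi), max(xi), 100)
    plt.plot(xfine_i, f1(xfine_i, *popt_i), '--')   # overplot each fit as a dashed line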
@davmat gave an excellent solution. This is a scikit-learn based version of his solution, using a linear model with his basis functions.
Load the data using Pandas
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression

df = pd.read_csv('unknown_function.dat', delimiter='\t')
Define a function to fit the data using a linear model with @davmat's basis.
def fit_function(x, y):
    linmodel = LinearRegression(fit_intercept=False)
    x_vectors = np.column_stack([x**-i for i in range(4)])  # basis: 1, 1/x, 1/x**2, 1/x**3
    linmodel.fit(x_vectors, y)
    return linmodel.coef_
coef1 = fit_function(df['X1'], df['Y'])
coef2 = fit_function(df['X2'], df['Y'])
coef3 = fit_function(df['X3'], df['Y'])
coef4 = fit_function(df['X4'], df['Y'])
Evaluate function with each set of fitted coefficients.
def f1(x, coefs):
    return coefs[0] + coefs[1]/x + coefs[2]/x**2 + coefs[3]/x**3
%matplotlib inline
df['y1hat'] = df['X1'].apply(lambda x: f1(x, coef1))
df['y2hat'] = df['X2'].apply(lambda x: f1(x, coef2))
df['y3hat'] = df['X3'].apply(lambda x: f1(x, coef3))
df['y4hat'] = df['X4'].apply(lambda x: f1(x, coef4))
Plot the results
df.plot(x='X1', y=['Y','y1hat'])
df.plot(x='X2', y=['Y','y2hat'])
df.plot(x='X3', y=['Y','y3hat'])
df.plot(x='X4', y=['Y','y4hat'])
You can do it with polynomial regression:
Y = F(x) = c0 + c1*x + c2*x^2 + c3*x^3 + ... + cn*x^n
The idea is to choose the degree n and then compute the coefficients c0, c1, ..., cn.
But keep in mind that this is only an approximation.
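For example, a minimal sketch with NumPy (assuming the x1 and y arrays from the MWE above; the degree n is a choice you must make, and a large n risks overfitting):
import numpy as np

n = 3
coeffs = np.polyfit(x1, y, n)      # returns cn, ..., c1, c0 (highest power first)
y_poly = np.polyval(coeffs, x1)    # evaluate the fitted polynomial at x1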
I have a question that I have been fighting with for days now.
How do I calculate the (95%) confidence band of a fit?
Fitting curves to data is the everyday job of every physicist, so I think this should be implemented somewhere, but I can't find an implementation, nor do I know how to do it mathematically.
The only thing I found is seaborn, which does a nice job for linear least squares:
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
x = np.linspace(0,10)
y = 3*np.random.randn(50) + x
data = {'x':x, 'y':y}
frame = pd.DataFrame(data, columns=['x', 'y'])
sns.lmplot(x='x', y='y', data=frame, ci=95)
plt.savefig("confidence_band.pdf")
But this is just linear least squares. When I want to fit e.g. a saturation curve like a*(1 - e^(b*x)), I'm screwed.
Sure, I can calculate the t-distribution from the standard errors of a least-squares method like scipy.optimize.curve_fit, but that is not what I'm searching for.
Thanks for any help!!
You can achieve this easily using StatsModels module.
Also see this example and this answer.
Here is an answer for your question:
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table
x = np.linspace(0,10)
y = 3*np.random.randn(50) + x
X = sm.add_constant(x)
res = sm.OLS(y, X).fit()
st, data, ss2 = summary_table(res, alpha=0.05)
fittedvalues = data[:,2]
predict_mean_se = data[:,3]
predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
predict_ci_low, predict_ci_upp = data[:,6:8].T
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label="data")
ax.plot(x, fittedvalues, 'r-', label='OLS')
ax.plot(x, predict_ci_low, 'b--')       # prediction interval
ax.plot(x, predict_ci_upp, 'b--')
ax.plot(x, predict_mean_ci_low, 'g--')  # confidence band of the mean
ax.plot(x, predict_mean_ci_upp, 'g--')
ax.legend(loc='best');
plt.show()
kmpfit's confidence_band() calculates the confidence band for non-linear least squares. Here for your saturation curve:
from pylab import *
from kapteyn import kmpfit
def model(p, x):
    a, b = p
    return a*(1 - np.exp(b*x))
x = np.linspace(0, 10, 100)
y = .1*np.random.randn(x.size) + model([1, -.4], x)
fit = kmpfit.simplefit(model, [.1, -.1], x, y)
a, b = fit.params
dfdp = [1-np.exp(b*x), -a*x*np.exp(b*x)]
yhat, upper, lower = fit.confidence_band(x, dfdp, 0.95, model)
scatter(x, y, marker='.', color='#0000ba')
for i, l in enumerate((upper, lower, yhat)):
    plot(x, l, c='g' if i == 2 else 'r', lw=2)
savefig('kmpfit confidence bands.png', bbox_inches='tight')
The dfdp are the partial derivatives ∂f/∂p of the model f = a*(1 - e^(b*x)) with respect to each parameter p (i.e., a and b): ∂f/∂a = 1 - e^(b*x) and ∂f/∂b = -a*x*e^(b*x), matching the two entries in the list above. See my answer to a similar question for background links. And here is the output: