Fitting function with curve_fit, but the fitted curve is wrong - python

def gaus(x, a, x0, sigma):
    """Evaluate a Gaussian peak of height a, centre x0 and width sigma at x.

    The model has no constant term, so it tends to zero far away from x0.
    """
    deviation = x - x0
    return a*np.exp(-deviation**2/(2*sigma**2))
# Load the measured series (openFile is defined elsewhere).
times, amplitudes = openFile("../datafiles/number_of_counts.txt")
t_arr = np.array(times)
amp_arr = np.array(amplitudes)
amp_total = sum(amplitudes)

# Amplitude-weighted mean and spread of the times, used as starting guesses.
mean = sum(t_arr*amp_arr)/amp_total
sigma = np.sqrt(sum(amp_arr*(t_arr-mean)**2)/amp_total)

start_guess = [max(amplitudes), mean, sigma]
params, pcov = curve_fit(gaus, times, amplitudes, p0=start_guess)

# Raw data first, then the fitted model in red on top of it.
plt.plot(times, amplitudes)
plt.plot(times, gaus(t_arr, *params), 'r', label="fitted curve")
plt.ylabel("Coincidents")
plt.title("Coincident plot")
plt.legend()
plt.show()
My Gaussian fit doesn't work properly: it looks like a soft curve instead of fitting the sharp peak. I assume I have some super silly error in my script, but I'm not sure what. Can someone see it?

Your data has a constant offset of about 3750, but your gaus model function cannot account for that, so you are fitting a normal distribution with offset 0.
It needs one more parameter:
def gaus(x, a, x0, sigma, c):
    """Gaussian peak (height a, centre x0, width sigma) on a constant baseline c."""
    peak = a * np.exp(-(x - x0)**2 / (2 * sigma**2))
    return peak + c
Then:
# Approximate constant offset of the data; it could be computed from the
# data instead of being hard-coded.
offset_guess = 3750
initial_guess = [max(amplitudes), mean, sigma, offset_guess]
params, pcov = curve_fit(gaus, times, amplitudes, p0=initial_guess)
plt.plot(times, gaus(np.array(times), *params), ...)
Result:
>>> print(params)
[1545.00193331 -20.45639132 -43.28484495 3792.41050636]

I extracted data from the plot for analysis, and found that with the extracted data a Weibull peak plus offset gave me a better fit than a Gaussian peak with offset. Here is a graphical Python fitter with the extracted data and a Weibull peak plus offset equation, you should be able to substitute in the actual data and run it directly. Note the simple determination of initial parameter estimates.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([4.914e+03, 5.600e+03, 7.886e+03, 8.571e+03, 1.063e+04, 1.154e+04, 1.269e+04, 1.566e+04, 1.634e+04, 1.817e+04, 1.886e+04, 2.114e+04, 2.389e+04, 2.526e+04, 2.754e+04, 3.051e+04, 3.257e+04, 3.417e+04, 3.554e+04, 3.669e+04, 4.011e+04, 4.240e+04, 4.491e+04, 4.583e+04, 4.697e+04, 4.880e+04, 4.994e+04, 5.154e+04, 5.246e+04, 5.474e+04, 5.634e+04, 5.886e+04, 6.091e+04, 6.366e+04, 6.731e+04, 7.051e+04, 7.257e+04, 7.394e+04, 7.691e+04, 7.851e+04, 7.966e+04, 8.103e+04, 8.240e+04, 8.514e+04, 8.720e+04, 8.834e+04, 8.949e+04, 9.109e+04, 9.223e+04, 9.360e+04, 9.566e+04, 9.726e+04, 9.909e+04, 1.005e+05, 1.014e+05, 1.030e+05, 1.059e+05, 1.073e+05, 1.089e+05, 1.101e+05, 1.119e+05, 1.130e+05, 1.139e+05, 1.162e+05, 1.178e+05, 1.190e+05, 1.203e+05, 1.222e+05, 1.233e+05, 1.247e+05, 1.281e+05, 1.299e+05, 1.309e+05, 1.329e+05, 1.341e+05, 1.357e+05, 1.370e+05, 1.382e+05, 1.395e+05, 1.407e+05, 1.430e+05, 1.439e+05])
yData = numpy.array([3.300e+03, 8.100e+03, 6.100e+03, 1.010e+04, 9.700e+03, 7.300e+03, 7.500e+03, 6.900e+03, 8.100e+03, 3.900e+03, 5.700e+03, 4.900e+03, 4.500e+03, 8.300e+03, 4.100e+03, 8.100e+03, 5.300e+03, 8.100e+03, 6.700e+03, 1.130e+04, 9.300e+03, 6.300e+03, 9.500e+03, 8.900e+03, 1.490e+04, 6.300e+03, 1.190e+04, 6.300e+03, 7.700e+03, 1.310e+04, 9.500e+03, 1.590e+04, 1.050e+04, 1.930e+04, 4.890e+04, 7.350e+04, 5.230e+04, 5.130e+04, 2.350e+04, 1.950e+04, 1.010e+04, 1.510e+04, 9.500e+03, 9.500e+03, 6.900e+03, 6.900e+03, 1.210e+04, 6.300e+03, 7.700e+03, 5.700e+03, 1.410e+04, 8.700e+03, 1.390e+04, 4.900e+03, 7.500e+03, 4.900e+03, 9.500e+03, 5.300e+03, 9.300e+03, 6.300e+03, 1.250e+04, 4.300e+03, 7.700e+03, 6.900e+03, 9.700e+03, 8.500e+03, 1.130e+04, 5.300e+03, 5.100e+03, 1.700e+03, 8.700e+03, 7.300e+03, 6.300e+03, 2.100e+03, 3.100e+03, 7.100e+03, 4.900e+03, 6.100e+03, 3.700e+03, 9.300e+03, 5.500e+03, 5.700e+03])
def func(x, a, b, c, offset): # Weibull peak with offset from zunzun.com
    """Return a * exp(-0.5 * (ln(x / b) / c)**2) + offset."""
    scaled_log = numpy.log(x/b) / c
    return a * numpy.exp(-0.5 * numpy.power(scaled_log, 2.0)) + offset
# Initial parameter estimates derived directly from the data.
a_est = max(yData) # amplitude: tallest observed value
# b divides x inside the logarithm in func, so the exponent is zero (the top
# of the peak) at x == b. Estimate it from the X position of the tallest
# point rather than from a Y value.
b_est = xData[numpy.argmax(yData)]
c_est = 1.0
offset_est = min(yData) # baseline: lowest observed value
initialParameters = numpy.array([a_est, b_est, c_est, offset_est])

# curve fit the test data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)

modelPredictions = func(xData, *fittedParameters)

absError = modelPredictions - yData

SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))

print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Show the raw data as diamond markers with the fitted curve drawn over it.

    graphWidth and graphHeight are divided by 100 to give the figure size
    passed to matplotlib, together with dpi=100.
    """
    figureSize = (graphWidth/100.0, graphHeight/100.0)
    f = plt.figure(figsize=figureSize, dpi=100)
    axes = f.add_subplot(111)

    # raw data points
    axes.plot(xData, yData, 'D')

    # fitted equation sampled at 500 evenly spaced X values over the data range
    xModel = numpy.linspace(min(xData), max(xData), 500)
    axes.plot(xModel, func(xModel, *fittedParameters))

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot


graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)

Related

How do I code dose-response (4PL) curve fitting with optimize.minimize()

I want to optimize a dose-response curve (4 parameter logistic) using a data set. I need to use the Powell algorithm, therefore, I have to use optimize.minimize() instead of curve_fit or least square.
I wrote the following code:
import numpy as np
from scipy.optimize import minimize
# Measured responses (ydata) and the matching x values (xdata, presumably the doses).
ydata = np.array([0.1879, 0.4257, 0.80975, 1.3038, 1.64305, 1.94055, 2.21605, 2.3917])
xdata = np.array([40, 100, 250, 400, 600, 800, 1150, 1400])
# Starting values handed to minimize(); unpacked below as A, B, C, D.
initParams = [2.4, 0.2, 600.0, 1.0]
# Objective passed to minimize(): evaluates the 4-parameter logistic at xdata
# and computes the sum of squared errors against ydata.
def logistic(params):
A = params[0]
B = params[1]
C = params[2]
D = params[3]
logistic4 = ((A-D)/(1.0+((xdata/C)**B))) + D
sse = np.sum(np.square(ydata-logistic4))
# NOTE(review): sse is only printed (Python 2 print statement); the function
# has no return statement, so each call evaluates to None.
print sse
# Run the Powell method from initParams and print the result (Python 2 print statement).
results = minimize(logistic, initParams, method='Powell')
print results
Theoretically, this minimizes the sse of the experimental and theoretical data sets iterating the 4 parameters initially entered using the Powell algorithm.
Practically, it does not work: it starts and the last error, in a fairly long list, is
TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'.
Any ideas on how to code this?
Here is a graphical Python solver for your data and equation, it uses minimize() with 'Powell' and also has a commented-out call to curve_fit. I could not get a good fit with the initial parameter estimates that you supplied, so those are commented out here and replaced with my own values. My equation search confirms that this is an excellent equation to use in modeling this data set.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import minimize
xData = numpy.array([40, 100, 250, 400, 600, 800, 1150, 1400], dtype=float)
yData = numpy.array([0.1879, 0.4257, 0.80975, 1.3038, 1.64305, 1.94055, 2.21605, 2.3917], dtype=float)
def func(xdata, A, B, C, D):
    """Four-parameter logistic: (A - D) / (1 + (xdata / C)**B) + D."""
    denominator = 1.0 + (xdata/C)**B
    return (A-D)/denominator + D
# unlike curve_fit(), minimize() needs a scalar function to drive towards a minimum
def SSE(inParameters): # function to minimize, here sum of squared errors
    """Return the sum of squared differences between func(xData, ...) and yData."""
    residuals = func(xData, *inParameters) - yData
    return numpy.sum(numpy.square(residuals))
# starting values from the question, kept for reference but not used:
#initialParameters = numpy.array([2.4, 0.2, 600.0, 1.0])
initialParameters = numpy.array([3.0, -1.5, 500.0, 0.1])

# alternative: fit the data with curve_fit()
#fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)

# fit the data by minimizing SSE with the Powell method
resultObject = minimize(SSE, initialParameters, method='Powell')
fittedParameters = resultObject.x

# goodness-of-fit statistics on the fitted model
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData

SE = numpy.square(absError) # squared errors
MSE = SE.mean() # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (absError.var() / yData.var())

print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Show the raw data as diamond markers with the fitted curve drawn over it.

    graphWidth and graphHeight are divided by 100 to give the figure size
    passed to matplotlib, together with dpi=100.
    """
    figureSize = (graphWidth/100.0, graphHeight/100.0)
    f = plt.figure(figsize=figureSize, dpi=100)
    axes = f.add_subplot(111)

    # raw data points
    axes.plot(xData, yData, 'D')

    # fitted equation sampled across the X range of the data
    xModel = numpy.linspace(min(xData), max(xData))
    axes.plot(xModel, func(xModel, *fittedParameters))

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot


graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
Shouldn't the correct Hill equation used in the function "func" use the term (C/x)**B rather than (x/C)**B where x=dose, C=IC50, and B is Hill coefficient?

Reduce Number of Measurements in Calibration

For calibration purposes I am making N measurements of water flow, each of which is time-intensive. I want to reduce the number of measurements. It sounds like this is part of feature selection as I am reducing the number of columns I have. BUT - I need to predict the measurements I will be dropping.
Here is a sample of the data:
SerialNumber val speed
0 193604048 1.350254 105.0
1 193604048 1.507517 3125.0
2 193604048 1.455142 525.0
6 193604048 1.211184 12.8
7 193604048 1.238835 20.0
For each serial number I have a complete set of speed-val measurements. Ideally I would like a model whose output is the vector of all N val measurements, but it seems the options are all neural networks, which I am trying to avoid for now. Are there any other options?
If I feed this data into a regression model, how do I differentiate between each serialNumber dataset?
To make sure my goal is clear - I want to learn the historical measurements I have of N measurements and find which speed-val I can drop to still accurately predict all N output values.
Thank you!
I tried to find the simplest equation that would give a good fit to the example data you posted, and from my equation search the Harris Yield Density equation, "y = 1.0 / (a + b * pow(x, c))", is a good candidate. Here is a graphical Python fitter using that equation and your data, with initial parameter estimates for the non-linear fitter calculated directly from the data max and min values. Note that SerialNumber itself is unrelated to the data and would not be used in regressions.
My hope is that you might find this equation generally useful in your work, and it might be possible that after performing similar regressions on several different data sets that parameters a, b, and c are very similar in all cases - that is the best outcome. If your measurement accuracy is high, I personally would expect that with this three-parameter equation it should be possible to use a minimum of four data points per calibration, with max, min and two other well-spaced points along the expected calibration curve.
Note that here the fitted parameters a = -1.91719091e-03, b = 1.11357103e+00, and c = -1.51294798e+01 yield RMSE = 3.191 and R-squared = 0.9999
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([1.350254, 1.507517, 1.455142, 1.211184, 1.238835])
yData = numpy.array([105.0, 3125.0, 525.0, 12.8, 20.0])
def func(x, a, b, c): # Harris yield density equation
    """Return 1 / (a + b * x**c)."""
    denominator = a + b*numpy.power(x, c)
    return 1.0 / denominator
# starting values computed from the smallest and largest X values
b_start = min(xData)
c_start = -10.0 * max(xData)
initialParameters = numpy.array([0.0, b_start, c_start])

# non-linear least-squares fit of func to the data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)

# goodness-of-fit statistics on the fitted model
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData

SE = numpy.square(absError) # squared errors
MSE = SE.mean() # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (absError.var() / yData.var())

print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Show the raw data as diamond markers with the fitted Harris curve over it.

    graphWidth and graphHeight are divided by 100 to give the figure size
    passed to matplotlib, together with dpi=100.
    """
    figureSize = (graphWidth/100.0, graphHeight/100.0)
    f = plt.figure(figsize=figureSize, dpi=100)
    axes = f.add_subplot(111)

    # raw data points
    axes.plot(xData, yData, 'D')

    # fitted equation sampled across the X range of the data
    xModel = numpy.linspace(min(xData), max(xData))
    axes.plot(xModel, func(xModel, *fittedParameters))

    axes.set_title('Harris Yield Density Equation') # title
    axes.set_xlabel('Val') # X axis data label
    axes.set_ylabel('Speed') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot


graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
UPDATE using reversed X and Y
Per the comments, here is a graphical fitter for the three-parameter Mixed Power and Exponential equation "a * pow(x, b) * exp(c * x)", with X and Y reversed from the previous code. Here the fitted parameters a = 1.05910664e+00, b = 5.26304345e-02, and c = -2.25604946e-05 yield RMSE = 0.0003602 and R-squared = 0.9999
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([105.0, 3125.0, 525.0, 12.8, 20.0])
yData = numpy.array([1.350254, 1.507517, 1.455142, 1.211184, 1.238835])
def func(x, a, b, c): # mixed power and exponential equation
    """Return a * x**b * exp(c * x)."""
    powerTerm = numpy.power(x, b)
    expTerm = numpy.exp(c * x)
    return a * powerTerm * expTerm
# starting values for a, b and c
initialParameters = [1.0, 0.01, -0.01]

# non-linear least-squares fit of func to the data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)

# goodness-of-fit statistics on the fitted model
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData

SE = numpy.square(absError) # squared errors
MSE = SE.mean() # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (absError.var() / yData.var())

print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Show the raw data as diamond markers with the fitted curve drawn over it.

    graphWidth and graphHeight are divided by 100 to give the figure size
    passed to matplotlib, together with dpi=100.
    """
    figureSize = (graphWidth/100.0, graphHeight/100.0)
    f = plt.figure(figsize=figureSize, dpi=100)
    axes = f.add_subplot(111)

    # raw data points
    axes.plot(xData, yData, 'D')

    # fitted equation sampled across the X range of the data
    xModel = numpy.linspace(min(xData), max(xData))
    axes.plot(xModel, func(xModel, *fittedParameters))

    axes.set_title('Mixed Power and Exponential Equation') # title
    axes.set_xlabel('Speed') # X axis data label
    axes.set_ylabel('Val') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot


graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)

Fitting a curve to only a few data points

I have a scatter plot with only 5 data points, which I would like to fit a curve to. I have tried both polyfit and the following code, but neither are able to produce a curve with so few data points
def func(x, a, b, c):
    """Exponential decay a * exp(-b * x) shifted by the constant c."""
    decay = np.exp(-b * x)
    return a * decay + c
# measured points as dots
plt.plot(xdata, ydata, ".", label="Data")

# fit func with curve_fit's default starting values, then draw the model at the data's x values
optimizedParameters, pcov = opt.curve_fit(func, xdata, ydata)
fitValues = func(xdata, *optimizedParameters)
plt.plot(xdata, fitValues, label="fit")
Attached is an example of the plot, along with an example of the kind of curve I am trying to produce (apologies for the bad drawing). Thanks!
Here is an example graphical Python fitter using the data in your comment, fitting to a Polytrope type of equation. In this example there is no need to take logs of the data. Here the X axis is plotted on a decade logarithmic scale. Please note that the data in the example code is in the form of floating point numbers.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([7e-09, 9e-09, 1e-08, 2e-8, 1e-6])
yData = numpy.array([790.0, 870.0, 2400.0, 2450.0, 3100.0])
def func(x, a, b, offset): # polytrope equation from zunzun.com
    """Return a / x**b + offset."""
    scaled = a / numpy.power(x, b)
    return scaled + offset
# all ones, i.e. the values scipy would use by default
initialParameters = numpy.array([1.0, 1.0, 1.0])

# non-linear least-squares fit of func to the data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)

# goodness-of-fit statistics on the fitted model
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData

SE = numpy.square(absError) # squared errors
MSE = SE.mean() # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (absError.var() / yData.var())

print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight, logX=True):
    """Plot the raw data and the fitted curve, by default on a logarithmic X axis.

    Parameters
    ----------
    graphWidth, graphHeight : number
        Divided by 100 to give the figure size passed to matplotlib,
        together with dpi=100.
    logX : bool, optional
        When True (the default, which keeps the logarithmic axis this
        function always used) the X axis is logarithmic and the model curve
        is sampled at geometrically spaced points. Pass False for a linear
        axis with evenly spaced samples.
    """
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    if logX:
        # Evenly spaced samples leave the lowest decades of a log axis with
        # only a handful of points; geometric spacing is uniform on a log
        # axis. geomspace needs non-zero end points of the same sign.
        xModel = numpy.geomspace(min(xData), max(xData), 1000)
    else:
        xModel = numpy.linspace(min(xData), max(xData), 1000)
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    if logX:
        axes.set_xscale('log')

    plt.show()
    plt.close('all') # clean up after using pyplot


graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
You would have to choose what you want to fit the curve to. From the look of your drawing, it seems you are trying to shape it to be somewhat logarithmic.
Here's a picture of a logarithmic regression:
A logarithmic regression would follow the form y = A + B ln(x).
This is essentially a linear regression fit where, instead of fitting y vs. x,
we are trying to fit y vs. ln(x).
So you may just take the natural log of the x-values of the points in your dataset and execute a linear regression algorithm on it. The yielding coefficients are then A and B for y=A + B ln(x).
Picture Credits:
http://mathworld.wolfram.com/LeastSquaresFittingLogarithmic.html
Edit: As James Phillips pointed out in his answer, it is also possible to model the curve in the form y=Ax^(-B) + C, since with so few points it cannot be determined whether the graph has a horizontal asymptote or is always growing but decelerating. A lot of curves are possible (for instance, y=A* B^(-x) +C could be another one), but you would need to choose what to model the data after.
The exponential function does not fit your data well. Consider another modeling function.
Given
import numpy as np
import scipy.optimize as opt
import matplotlib.pyplot as plt
%matplotlib inline
x_samp = np.array([7e-09, 9e-09, 1e-08, 2e-8, 1e-6])
y_samp = np.array([790, 870, 2400, 2450, 3100])
def func(x, a, b):
    """Return a logarithmic result, a + b*ln(x)."""
    return a + b*np.log(x)
def func2(x, a, b, c):
    """Return a 'power law' result, a / x**b + c."""
    scaled = a/np.power(x, b)
    return scaled + c
Code
From #Allan Lago's logarithmic model:
# REGRESSION ------------------------------------------------------------------
x_lo, x_hi = x_samp.min(), x_samp.max()
x_lin = np.linspace(x_lo, x_hi, 50)

# Fit func to the samples with curve_fit's default starting values
w, _ = opt.curve_fit(func, x_samp, y_samp)
print("Estimated Parameters", w)

# Model evaluated on the evenly spaced grid
y_model = func(x_lin, *w)

# PLOT ------------------------------------------------------------------------
# Samples as black circles, fitted curve as a dashed black line
plt.plot(x_samp, y_samp, "ko", label="Data")
plt.plot(x_lin, y_model, "k--", label="Fit")
plt.xticks(np.arange(0, x_hi, x_hi/2))
plt.title("Least squares regression")
plt.legend(loc="upper left")
Estimated Parameters [8339.61062739 367.6992259 ]
Using #James Phillips' "Polytrope" model:
# REGRESSION ------------------------------------------------------------------
# Explicit starting values for a, b and c
p0 = [1, 1, 1]
w, _ = opt.curve_fit(func2, x_samp, y_samp, p0=p0)
print("Estimated Parameters", w)

# Model evaluated on the x_lin grid built earlier
y_model = func2(x_lin, *w)

# PLOT ------------------------------------------------------------------------
# Samples as black circles, fitted curve as a dashed black line
plt.plot(x_samp, y_samp, "ko", label="Data")
plt.plot(x_lin, y_model, "k--", label="Fit")
x_top = x_samp.max()
plt.xticks(np.arange(0, x_top, x_top/2))
plt.title("Least squares regression")
plt.legend()
Estimated Parameters [-3.49305043e-10 1.57259788e+00 3.05801283e+03]

Python Linear Regression, best fit line with residuals

I have done my linear regression and plotted the best-fit line, but I would also like to have a line connecting each real point (the ones in blue) to its predicted point (the ones marked with a red x), representing the prediction errors, or so-called residuals. The plot should look similar to this:
And what I have until now is:
# draw the plot
# Append a new trailing axis to X and y (presumably 1-D arrays defined
# earlier -- TODO confirm) before handing them to the regressor.
xx=X[:,np.newaxis]
yy=y[:,np.newaxis]
# Fit the regressor on the reshaped data and predict at the same inputs.
slr=LinearRegression()
slr.fit(xx,yy)
y_pred=slr.predict(xx)
# Scatter of the data, the predictions joined as a red line, and the same
# predictions again as red 'x' markers.
plt.scatter(xx,yy)
plt.plot(xx,y_pred,'r')
plt.plot(X,y_pred,'rx') #add the prediction points
plt.show()
Thank you very much in advance!
Here is example code with the vertical lines
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7])
yData = numpy.array([1.1, 20.2, 30.3, 60.4, 50.0, 60.6, 70.7])
def func(x, a, b): # simple linear example
    """Straight line with slope a and intercept b."""
    slopeTerm = a * x
    return slopeTerm + b
# starting values for slope and intercept
initialParameters = numpy.array([1.0, 1.0])

# least-squares fit of func to the data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)

# goodness-of-fit statistics on the fitted model
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData

SE = numpy.square(absError) # squared errors
MSE = SE.mean() # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (absError.var() / yData.var())

print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Plot the data, the fitted line, and one vertical residual line per point.

    graphWidth and graphHeight are divided by 100 to give the figure size
    passed to matplotlib, together with dpi=100.
    """
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData))
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    # now add an individual line for each point: same X at both ends, running
    # from the measured Y to the predicted Y. Drawn on the same axes object
    # as everything else rather than through the pyplot "current axes".
    for xValue, yValue, yPredicted in zip(xData, yData, modelPredictions):
        axes.plot((xValue, xValue), (yValue, yPredicted))

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot


graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)

Scatterplot for multiple regression in matplotlib

I have four features and a dependent(X). I want to plot a graph with the predicted regression line and the feature values. I went through the documentation but I can't figure out how to represent everything in a scatter plot.
Here is some example code to get you started, it fits a simple quadratic and scatterplots the raw data and fitted curve along with calculation of RMSE and R-squared. The example uses a non-linear fit in case you would like to try fitting non-linear equations.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy.stats
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7])
yData = numpy.array([1.1, 20.2, 30.3, 40.4, 50.0, 60.6, 70.7])
def func(x, a, b, c): # simple quadratic example
    """Quadratic a*x**2 + b*x + c."""
    quadraticTerm = a * numpy.square(x)
    linearTerm = b * x
    return quadraticTerm + linearTerm + c
# starting values for a, b and c
initialParameters = numpy.array([1.0, 1.0, 1.0])

# least-squares fit of func to the data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)

# goodness-of-fit statistics on the fitted model
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData

SE = numpy.square(absError) # squared errors
MSE = SE.mean() # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (absError.var() / yData.var())

print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Show the raw data as diamond markers with the fitted curve drawn over it.

    graphWidth and graphHeight are divided by 100 to give the figure size
    passed to matplotlib, together with dpi=100.
    """
    figureSize = (graphWidth/100.0, graphHeight/100.0)
    f = plt.figure(figsize=figureSize, dpi=100)
    axes = f.add_subplot(111)

    # raw data points
    axes.plot(xData, yData, 'D')

    # fitted equation sampled across the X range of the data
    xModel = numpy.linspace(min(xData), max(xData))
    axes.plot(xModel, func(xModel, *fittedParameters))

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot


graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)

Categories

Resources