I have four features and a dependent(X). I want to plot a graph with the predicted regression line and the feature values. I went through the documentation but I can't figure out how to represent everything in a scatter plot.
Here is some example code to get you started, it fits a simple quadratic and scatterplots the raw data and fitted curve along with calculation of RMSE and R-squared. The example uses a non-linear fit in case you would like to try fitting non-linear equations.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy.stats
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7])
yData = numpy.array([1.1, 20.2, 30.3, 40.4, 50.0, 60.6, 70.7])
def func(x, a, b, c): # simple quadratic example
return (a * numpy.square(x)) + b * x + c
initialParameters = numpy.array([1.0, 1.0, 1.0])
# curve fit the test data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
Related
I want to optimize a dose-response curve (4 parameter logistic) using a data set. I need to use the Powell algorithm, therefore, I have to use optimize.minimize() instead of curve_fit or least square.
I wrote the following code:
import numpy as np
from scipy.optimize import minimize
ydata = np.array([0.1879, 0.4257, 0.80975, 1.3038, 1.64305, 1.94055, 2.21605, 2.3917])
xdata = np.array([40, 100, 250, 400, 600, 800, 1150, 1400])
initParams = [2.4, 0.2, 600.0, 1.0]
def logistic(params):
A = params[0]
B = params[1]
C = params[2]
D = params[3]
logistic4 = ((A-D)/(1.0+((xdata/C)**B))) + D
sse = np.sum(np.square(ydata-logistic4))
print sse
results = minimize(logistic, initParams, method='Powell')
print results
Theoretically, this minimizes the sse of the experimental and theoretical data sets iterating the 4 parameters initially entered using the Powell algorithm.
Practically, it does not work: it starts and the last error, in a fairly long list, is
TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'.
Any ideas on how to code this?
Here is a graphical Python solver for your data and equation, it uses minimize() with 'Powell' and also has a commented-out call to curve_fit. I could not get a good fit with the initial parameter estimates that you supplied, so those are commented out here and replaced with my own values. My equation search confirms that this is an excellent equation to use in modeling this data set.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import minimize
xData = numpy.array([40, 100, 250, 400, 600, 800, 1150, 1400], dtype=float)
yData = numpy.array([0.1879, 0.4257, 0.80975, 1.3038, 1.64305, 1.94055, 2.21605, 2.3917], dtype=float)
def func(xdata, A, B, C, D):
return ((A-D)/(1.0+((xdata/C)**B))) + D
# minimize() requires a function to be minimized, unlike curve_fit()
def SSE(inParameters): # function to minimize, here sum of squared errors
predictions = func(xData, *inParameters)
errors = predictions - yData
return numpy.sum(numpy.square(errors))
#initialParameters = numpy.array([2.4, 0.2, 600.0, 1.0])
initialParameters = numpy.array([3.0, -1.5, 500.0, 0.1])
# curve fit the data with curve_fit()
#fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)
# curve fit the data with minimize()
resultObject = minimize(SSE, initialParameters, method='Powell')
fittedParameters = resultObject.x
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
Shouldn't the correct Hill equation used in the function "func" use the term (C/x)**B rather than (x/C)**B where x=dose, C=IC50, and B is Hill coefficient?
I currently have a scatter plot of data points, and I want to draw a line that captures the general pattern of the data. I believe that this is also known as an ordinary least squares regression method, but I may be wrong as I'm not completely familiar with the literature.
For example, if I had a plot like the following:
I just want a line that goes through the data points, that captures the most general trend.
I've tried methods like using Scikit-Learn's LinearRegression module, but I'll have to split my data into train and test sets and perform regression. Is there a way that I can just capture the general trend without having to do this?
Thank you.
Here is an example polynomial fitter that does this, if you convert your date format to a numeric type such as "elapsed days" you can directly substitute your data into the example. Here I use a curved second-order polynomial (quadratic) equation, set at the top of the code, because to my eye the trend of your data appears to have some curvature rather than a straight line.
import numpy, matplotlib
import matplotlib.pyplot as plt
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7, 0.0])
yData = numpy.array([1.1, 20.2, 30.3, 40.4, 50.0, 60.6, 70.7, 0.1])
polynomialOrder = 2 # example quadratic
# curve fit the test data
fittedParameters = numpy.polyfit(xData, yData, polynomialOrder)
print('Fitted Parameters:', fittedParameters)
modelPredictions = numpy.polyval(fittedParameters, xData)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = numpy.polyval(fittedParameters, xModel)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
I would like to see the regression equation for a polynomial regression in python.
I am new to python, in R the analogous command I am looking for is "summary." I have tried the print function in python.
x = (LIST)
y = (LIST)
x = x[:, np.newaxis]
y = y[:, np.newaxis]
poly = PolynomialFeatures(degree=2)
x_poly = poly.fit_transform(x)
poly.fit(x_poly,y)
lin = LinearRegression()
lin.fit(x_poly,y)
y_poly_pred = lin.predict(x_poly)
print(lin)
print(poly)
print(lin.predict)
print(poly.fit_transform)
I would like the output to give me the ax^2 + bx + c equation, or at least the info to figure out that equation. Instead, I get (below) for my 4 print statements.
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
order='C')
<bound method LinearModel.predict of LinearRegression(copy_X=True,
fit_intercept=True, n_jobs=None, normalize=False)>
<bound method TransformerMixin.fit_transform of
PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
order='C')>
Here is an example graphical polynomial fitter using numpy.polyfit for fitting and numpy.polyval for evaluation. This example has eight data points, and making polynomialOrder = 7 shows Runge's phenomenon rather nicely.
import numpy, matplotlib
import matplotlib.pyplot as plt
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7, 0.0])
yData = numpy.array([1.1, 20.2, 30.3, 40.4, 50.0, 60.6, 70.7, 0.1])
polynomialOrder = 2 # example quadratic
# curve fit the test data
fittedParameters = numpy.polyfit(xData, yData, polynomialOrder)
print('Fitted Parameters:', fittedParameters)
modelPredictions = numpy.polyval(fittedParameters, xData)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = numpy.polyval(fittedParameters, xModel)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
I'm interesting in mimic Excel's LOGEST function in Python but have no idea where to start.
Here is a graphical fitter using LOGEST as described in
https://support.office.com/en-us/article/logest-function-f27462d8-3657-4030-866b-a272c1d18b4b
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7])
yData = numpy.array([1.1, 20.2, 30.3, 60.4, 50.0, 60.6, 70.7])
# LOGEST from https://support.office.com/en-us/article/logest-function-f27462d8-3657-4030-866b-a272c1d18b4b
def func(x, b, m):
y = b * m**x
return y
# these are the same as the scipy defaults
initialParameters = numpy.array([1.0, 1.0])
# curve fit the test data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
You could do this taking the log and doing a linear regression or you could fit to an exponential function. Here I show both solutions using scipy.stats.linregress and scipy.optimize.curve_fit, respectively.
Here is the example from the documentation on the function LOGEST in Excel from Microsoft:
Method using linregress:
from scipy.stats import linregress
import math
x = months = [11, 12, 13, 14, 15, 16]
y = units = [33100, 47300, 69000, 102000, 150000, 220000]
slope, intercept, r_value, p_value, std_err = linregress(
x,
list(map(math.log, y)),
)
print('m', math.exp(slope))
print('b', math.exp(intercept))
Output:
m 1.4632756281161756
b 495.3047701587278
Method using curve_fit:
from scipy.optimize import curve_fit
def f(x, b, m):
return b * m ** x
popt, pcov = curve_fit(f, x, y)
print('m', popt[1])
print('b', popt[0])
Output:
m 1.4678382448967822
b 473.717820465515
I have done my linear regression and the best fit line, but would like to have also a line connecting the real points (the ones in blue) to the predicted points (the ones i red x) representing the predictions error, or the so called residuals. The plot should look in a similar way:
And what I have until now is:
# draw the plot
xx=X[:,np.newaxis]
yy=y[:,np.newaxis]
slr=LinearRegression()
slr.fit(xx,yy)
y_pred=slr.predict(xx)
plt.scatter(xx,yy)
plt.plot(xx,y_pred,'r')
plt.plot(X,y_pred,'rx') #add the prediction points
plt.show()
Thank you very much in advance!
Here is example code with the vertical lines
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7])
yData = numpy.array([1.1, 20.2, 30.3, 60.4, 50.0, 60.6, 70.7])
def func(x, a, b): # simple linear example
return a * x + b
initialParameters = numpy.array([1.0, 1.0])
# curve fit the test data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
# now add individual line for each point
for i in range(len(xData)):
lineXdata = (xData[i], xData[i]) # same X
lineYdata = (yData[i], modelPredictions[i]) # different Y
plt.plot(lineXdata, lineYdata)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)