I am trying to build a multiple linear regression on a dummy data and I keep getting overflow error.
Assume this as a dummy data.
print(x_train)
col1 col2 target
0.18 0.89 109.85
1.0 0.26 155.72
0.92 0.11 137.66
0.07 0.37 76.17
0.85 0.16 139.75
0.99 0.41 162.6
0.87 0.47 151.77
print(x_test)
0.49 0.18
0.57 0.83
0.56 0.64
0.76 0.18
This is the code I wrote for implementation of linear regression for multiple features. Can anyone let me know if my implementation of LINEAR REGRESSION is correct? If it's correct then why am I keep getting overflow error.
import numpy as np
def data():
# prepare data
x_train = np.array(train_data)[:, :-1]
y_train = np.array(train_data)[:, -1]
x_test = np.array(test_data)
return x_train, y_train, x_test
def normalize(y):
return (y - y.min()) / (y.max() - y.min())
def linear_regression(x_train, y_train, epochs=300):
y_train = normalize(y_train)
rows, columns = x_train.shape
weights = np.zeros((columns))
intercept = 0
for x in range(epochs):
for i in range(len(x_train)):
prev_weights = weights
weights += intercept + prev_weights * x_train[i] - y_train[i]
intercept += (intercept+(prev_weights*x_train[i])-y_train[i]).dot(x_train[i])
return weights, intercept
def predict(x_test, weights, intercept):
y_pred = []
for i in range(len(x_test)):
y_pred.append(weights.dot(x_test[i]) + intercept)
return y_pred
def main():
x_train, y_train, x_test = data()
weights, intercept = linear_regression(x_train, y_train, epochs=300)
y_pred = predict(x_test, weights, intercept)
for i in y_pred:
print(str(i))
if __name__=='__main__':
main()
Results:
-inf
-inf
-inf
-inf
/srv/conda/lib/python3.6/site-packages/ipykernel_launcher.py:25: RuntimeWarning: overflow encountered in add
Here is a different approach, a Python 3D surface fitter using your data with 3D scatter plot, 3D surface plot, and contour plot. You should be able to click-drag and rotate the 3D plots in 3-space for visual inspection. Here the fitted surface is a flat plane, and there is no need for test and train split as the RMSE and R-squared are given directly and you can see the surface. Just re-fit with all data.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
# x, y, z = col1, col2, target
xData = numpy.array([0.18, 1.0, 0.92, 0.07, 0.85, 0.99, 0.87])
yData = numpy.array([0.89, 0.26, 0.11, 0.37, 0.16, 0.41, 0.47])
zData = numpy.array([109.85, 155.72, 137.66, 76.17, 139.75, 162.6, 151.77])
def func(data, a, b, c):
x = data[0]
y = data[1]
return (a * x) + (y * b) + c
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
if __name__ == "__main__":
data = [xData, yData, zData]
initialParameters = [1.0, 1.0, 1.0] # these are the same as scipy default values in this example
# here a non-linear surface fit is made with scipy's curve_fit()
fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted prameters', fittedParameters)
modelPredictions = func(data, *fittedParameters)
absError = modelPredictions - zData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
Related
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 3 years ago.
Improve this question
I am trying to implement multiple linear regression using the
sklearn.linear_model.LinearRegression function.
The equation for regression is:
y = c + a1x1 + a2x2
with an additional condition that:
c always takes a value between (1,10)
a1 takes values between (0,1).
How can I principally solve such equations using Python?
Here is an example bounded graphical 3D surface fitter in Python using your equation and bounds on c with 3D scatter plot, 3D surface plot, and contour plot. You should be able to click-drag with the mouse and rotate the 3D plots in 3-space for examination. You can of course change or add bounds as you see fit.
Note that this example is using scipy's curve_fit() which allows bounds on the fitted parameters, and that the estimated initial parameters must be within the bounds so that curve_fit() can begin. In this example, the fitted parameter values for a1, a2, and c are:
fitted parameters [ 9.71206053e-01 3.57603742e-02 1.63260453e-16]
with parameter "c" effectively at the lower bound of zero. If you remove the bounds from the call to curve_fit() then parameter "c" will be negative in this example.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def func(data, a1, a2, c):
x1 = data[0]
x2 = data[1]
return c + (a1 * x1) + (x2 * a2)
if __name__ == "__main__":
xData = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
yData = numpy.array([11.0, 12.1, 13.0, 14.1, 15.0, 16.1, 17.0, 18.1, 90.0])
zData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.0, 12.0])
data = [xData, yData, zData]
# initial parameter estimates must be within bounds
initialParameters = [1.0, 1.0, 0.5]
# bounds on parameters - initial parameters must be within these
# note that +/- infinity means "no bound" on that parameter
lowerBounds = (-numpy.Inf, -numpy.Inf, 0.0)
upperBounds = (numpy.Inf, numpy.Inf, 1.0)
parameterBounds = [lowerBounds, upperBounds]
# now call curve_fit passing in parameter bounds
fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters, bounds = parameterBounds)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted parameters', fittedParameters)
modelPredictions = func(data, *fittedParameters)
absError = modelPredictions - zData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
I would like to see the regression equation for a polynomial regression in python.
I am new to python, in R the analogous command I am looking for is "summary." I have tried the print function in python.
x = (LIST)
y = (LIST)
x = x[:, np.newaxis]
y = y[:, np.newaxis]
poly = PolynomialFeatures(degree=2)
x_poly = poly.fit_transform(x)
poly.fit(x_poly,y)
lin = LinearRegression()
lin.fit(x_poly,y)
y_poly_pred = lin.predict(x_poly)
print(lin)
print(poly)
print(lin.predict)
print(poly.fit_transform)
I would like the output to give me the ax^2 + bx + c equation, or at least the info to figure out that equation. Instead, I get (below) for my 4 print statements.
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
order='C')
<bound method LinearModel.predict of LinearRegression(copy_X=True,
fit_intercept=True, n_jobs=None, normalize=False)>
<bound method TransformerMixin.fit_transform of
PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
order='C')>
Here is an example graphical polynomial fitter using numpy.polyfit for fitting and numpy.polyval for evaluation. This example has eight data points, and making polynomialOrder = 7 shows Runge's phenomenon rather nicely.
import numpy, matplotlib
import matplotlib.pyplot as plt
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7, 0.0])
yData = numpy.array([1.1, 20.2, 30.3, 40.4, 50.0, 60.6, 70.7, 0.1])
polynomialOrder = 2 # example quadratic
# curve fit the test data
fittedParameters = numpy.polyfit(xData, yData, polynomialOrder)
print('Fitted Parameters:', fittedParameters)
modelPredictions = numpy.polyval(fittedParameters, xData)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = numpy.polyval(fittedParameters, xModel)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
I have a DataFrame (df) with two columns and three rows.
Column X = [137,270,344]
Column Y = [51, 121, 136]
I want to get the slope of the linear regression considering the intercept = 0.
I have tried to add a point (0,0) but it doesn´t work.
EX.
Column X = [0, 137,270,344]
Column Y = [0, 51, 121, 136]
The code that I am using.
Code:
X= df [“Column X”].astype(float)
Y = df [“Column Y”].astype(float)
slope, intercept, r_value, p_value, std_err = stats.linregress(X, Y)
intercept_desv = slope
coef_desv = intercept
I expected intercept = 0 but is less than 0.
In standard linear regression, all data points implicitly have a weight of 1.0. In any software that allows linear regression using weights, the regression can effectively be made to pass through any single point - such as the origin - by assigning that data point an extremely large weight. Numpy's polyfit() allows weights. Here is a graphing example with your data using this technique to make the fitted line pass through the 0,0 point.
import numpy, matplotlib
import matplotlib.pyplot as plt
xData = numpy.array( [0.0, 137.0, 270.0, 344.0])
yData = numpy.array([0.0, 51.0, 121.0, 136.0])
weights = numpy.array([1.0E10, 1.0, 1.0, 1.0]) # heavily weight the 0,0 point
#weights = None # use this for "no weights"
polynomialOrder = 1 # example straight line
# curve fit the test data
fittedParameters = numpy.polyfit(xData, yData, polynomialOrder, w=weights)
print('Fitted Parameters:', fittedParameters)
modelPredictions = numpy.polyval(fittedParameters, xData)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
print('Predicted value at x=0:', modelPredictions[0])
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = numpy.polyval(fittedParameters, xModel)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
I am developing a code to analyze the relation of two variables. I am using a DataFrame to save the variables in two columns as it follows:
column A = 132.54672, 201.3845717, 323.2654551
column B = 51.54671995, 96.38457166, 131.2654551
I have tried to use statsmodels but it says that I do not have enough samples.
Can anyone help me? I need to define the coefficient and the intercept in order to calculate other variables.
y = coefficient * x + intercept
Ok, here is a solution using DataFrame. I am skipping the import commands and showing only the relevant part. In case you wonder what they are, drop me a comment.
I am using NumPy's polyfit for linear regression of order 1. You can print the fit (fit) to get the slope and the intercept. fit[0] is the intercept and fit[1] is the slope (or coefficient, as you call it)
column_A= [132.54672, 201.3845717, 323.2654551]
column_B= [51.54671995, 96.38457166, 131.2654551]
df = pd.DataFrame({'A': column_A, 'B': column_B})
fit = np.poly1d(np.polyfit(df['A'], df['B'], 1))
A_mesh = np.linspace(min(df['A']), max(df['A']), 100)
plt.plot(df['A'], df['B'], 'bx', label='Data', ms=10)
plt.plot(A_mesh, fit(A_mesh), '-b', label='Linear fit')
print (fit)
# 0.4028 x + 4.833
You can do this with curve_fit:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
x = np.array([132.54672, 201.3845717, 323.2654551])
y = np.array([51.54671995, 96.38457166, 131.2654551])
linear = lambda x, a, b: a * x + b
popt, pcov = curve_fit(linear, x, y, p0=[1, 1])
plt.plot(x, y, "rx")
plt.plot(x, linear(x, *popt), "b-")
plt.title("f(x)=a*x+b, a={:.2f}, b={:.2f}".format(*popt))
plt.show()
Plot:
Using scipy.stats:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
column_A= [132.54672, 201.3845717, 323.2654551]
column_B= [51.54671995, 96.38457166, 131.2654551]
df = pd.DataFrame({'A': column_A, 'B': column_B})
reg = stats.linregress(df.A, df.B)
plt.plot(df.A, df.B, 'bo', label='Data')
plt.plot(df.A, reg.intercept + reg.slope * df.A, 'k-', label='Linear Regression')
plt.xlabel('A')
plt.ylabel('B')
plt.legend()
plt.show()
You can also find useful methods from dir(reg), which include
.intercept
.pvalue
.rvalue
.slope
.stderr
See here.
In addition to the previous excellent answers, here is a graphical fitter that has a 3D scatterplot, 3D surface plot, and a contour plot.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def func(data, a, alpha, beta):
t = data[0]
p_p = data[1]
return a * (t**alpha) * (p_p**beta)
if __name__ == "__main__":
xData = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
yData = numpy.array([11.0, 12.1, 13.0, 14.1, 15.0, 16.1, 17.0, 18.1, 90.0])
zData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.0, 9.9])
data = [xData, yData, zData]
initialParameters = [1.0, 1.0, 1.0] # these are the same as scipy default values in this example
# here a non-linear surface fit is made with scipy's curve_fit()
fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted prameters', fittedParameters)
modelPredictions = func(data, *fittedParameters)
absError = modelPredictions - zData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
I have done my linear regression and the best fit line, but would like to have also a line connecting the real points (the ones in blue) to the predicted points (the ones i red x) representing the predictions error, or the so called residuals. The plot should look in a similar way:
And what I have until now is:
# draw the plot
xx=X[:,np.newaxis]
yy=y[:,np.newaxis]
slr=LinearRegression()
slr.fit(xx,yy)
y_pred=slr.predict(xx)
plt.scatter(xx,yy)
plt.plot(xx,y_pred,'r')
plt.plot(X,y_pred,'rx') #add the prediction points
plt.show()
Thank you very much in advance!
Here is example code with the vertical lines
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7])
yData = numpy.array([1.1, 20.2, 30.3, 60.4, 50.0, 60.6, 70.7])
def func(x, a, b): # simple linear example
return a * x + b
initialParameters = numpy.array([1.0, 1.0])
# curve fit the test data
fittedParameters, pcov = curve_fit(func, xData, yData, initialParameters)
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
# now add individual line for each point
for i in range(len(xData)):
lineXdata = (xData[i], xData[i]) # same X
lineYdata = (yData[i], modelPredictions[i]) # different Y
plt.plot(lineXdata, lineYdata)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)