I am developing a code to analyze the relation of two variables. I am using a DataFrame to save the variables in two columns as it follows:
column A = 132.54672, 201.3845717, 323.2654551
column B = 51.54671995, 96.38457166, 131.2654551
I have tried to use statsmodels but it says that I do not have enough samples.
Can anyone help me? I need to define the coefficient and the intercept in order to calculate other variables.
y = coefficient * x + intercept
Ok, here is a solution using DataFrame. I am skipping the import commands and showing only the relevant part. In case you wonder what they are, drop me a comment.
I am using NumPy's polyfit for linear regression of order 1. You can print the fit (fit) to get the slope and the intercept. fit[0] is the intercept and fit[1] is the slope (or coefficient, as you call it)
column_A= [132.54672, 201.3845717, 323.2654551]
column_B= [51.54671995, 96.38457166, 131.2654551]
df = pd.DataFrame({'A': column_A, 'B': column_B})
fit = np.poly1d(np.polyfit(df['A'], df['B'], 1))
A_mesh = np.linspace(min(df['A']), max(df['A']), 100)
plt.plot(df['A'], df['B'], 'bx', label='Data', ms=10)
plt.plot(A_mesh, fit(A_mesh), '-b', label='Linear fit')
print (fit)
# 0.4028 x + 4.833
You can do this with curve_fit:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
x = np.array([132.54672, 201.3845717, 323.2654551])
y = np.array([51.54671995, 96.38457166, 131.2654551])
linear = lambda x, a, b: a * x + b
popt, pcov = curve_fit(linear, x, y, p0=[1, 1])
plt.plot(x, y, "rx")
plt.plot(x, linear(x, *popt), "b-")
plt.title("f(x)=a*x+b, a={:.2f}, b={:.2f}".format(*popt))
plt.show()
Plot:
Using scipy.stats:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
column_A= [132.54672, 201.3845717, 323.2654551]
column_B= [51.54671995, 96.38457166, 131.2654551]
df = pd.DataFrame({'A': column_A, 'B': column_B})
reg = stats.linregress(df.A, df.B)
plt.plot(df.A, df.B, 'bo', label='Data')
plt.plot(df.A, reg.intercept + reg.slope * df.A, 'k-', label='Linear Regression')
plt.xlabel('A')
plt.ylabel('B')
plt.legend()
plt.show()
You can also find useful methods from dir(reg), which include
.intercept
.pvalue
.rvalue
.slope
.stderr
See here.
In addition to the previous excellent answers, here is a graphical fitter that has a 3D scatterplot, 3D surface plot, and a contour plot.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def func(data, a, alpha, beta):
t = data[0]
p_p = data[1]
return a * (t**alpha) * (p_p**beta)
if __name__ == "__main__":
xData = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
yData = numpy.array([11.0, 12.1, 13.0, 14.1, 15.0, 16.1, 17.0, 18.1, 90.0])
zData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.0, 9.9])
data = [xData, yData, zData]
initialParameters = [1.0, 1.0, 1.0] # these are the same as scipy default values in this example
# here a non-linear surface fit is made with scipy's curve_fit()
fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted prameters', fittedParameters)
modelPredictions = func(data, *fittedParameters)
absError = modelPredictions - zData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
Related
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 3 years ago.
Improve this question
I am trying to implement multiple linear regression using the
sklearn.linear_model.LinearRegression function.
The equation for regression is:
y = c + a1x1 + a2x2
with an additional condition that:
c always takes a value between (1,10)
a1 takes values between (0,1).
How can I principally solve such equations using Python?
Here is an example bounded graphical 3D surface fitter in Python using your equation and bounds on c with 3D scatter plot, 3D surface plot, and contour plot. You should be able to click-drag with the mouse and rotate the 3D plots in 3-space for examination. You can of course change or add bounds as you see fit.
Note that this example is using scipy's curve_fit() which allows bounds on the fitted parameters, and that the estimated initial parameters must be within the bounds so that curve_fit() can begin. In this example, the fitted parameter values for a1, a2, and c are:
fitted parameters [ 9.71206053e-01 3.57603742e-02 1.63260453e-16]
with parameter "c" effectively at the lower bound of zero. If you remove the bounds from the call to curve_fit() then parameter "c" will be negative in this example.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def func(data, a1, a2, c):
x1 = data[0]
x2 = data[1]
return c + (a1 * x1) + (x2 * a2)
if __name__ == "__main__":
xData = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
yData = numpy.array([11.0, 12.1, 13.0, 14.1, 15.0, 16.1, 17.0, 18.1, 90.0])
zData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.0, 12.0])
data = [xData, yData, zData]
# initial parameter estimates must be within bounds
initialParameters = [1.0, 1.0, 0.5]
# bounds on parameters - initial parameters must be within these
# note that +/- infinity means "no bound" on that parameter
lowerBounds = (-numpy.Inf, -numpy.Inf, 0.0)
upperBounds = (numpy.Inf, numpy.Inf, 1.0)
parameterBounds = [lowerBounds, upperBounds]
# now call curve_fit passing in parameter bounds
fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters, bounds = parameterBounds)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted parameters', fittedParameters)
modelPredictions = func(data, *fittedParameters)
absError = modelPredictions - zData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
I am trying to build a multiple linear regression on a dummy data and I keep getting overflow error.
Assume this as a dummy data.
print(x_train)
col1 col2 target
0.18 0.89 109.85
1.0 0.26 155.72
0.92 0.11 137.66
0.07 0.37 76.17
0.85 0.16 139.75
0.99 0.41 162.6
0.87 0.47 151.77
print(x_test)
0.49 0.18
0.57 0.83
0.56 0.64
0.76 0.18
This is the code I wrote for implementation of linear regression for multiple features. Can anyone let me know if my implementation of LINEAR REGRESSION is correct? If it's correct then why am I keep getting overflow error.
import numpy as np
def data():
# prepare data
x_train = np.array(train_data)[:, :-1]
y_train = np.array(train_data)[:, -1]
x_test = np.array(test_data)
return x_train, y_train, x_test
def normalize(y):
return (y - y.min()) / (y.max() - y.min())
def linear_regression(x_train, y_train, epochs=300):
y_train = normalize(y_train)
rows, columns = x_train.shape
weights = np.zeros((columns))
intercept = 0
for x in range(epochs):
for i in range(len(x_train)):
prev_weights = weights
weights += intercept + prev_weights * x_train[i] - y_train[i]
intercept += (intercept+(prev_weights*x_train[i])-y_train[i]).dot(x_train[i])
return weights, intercept
def predict(x_test, weights, intercept):
y_pred = []
for i in range(len(x_test)):
y_pred.append(weights.dot(x_test[i]) + intercept)
return y_pred
def main():
x_train, y_train, x_test = data()
weights, intercept = linear_regression(x_train, y_train, epochs=300)
y_pred = predict(x_test, weights, intercept)
for i in y_pred:
print(str(i))
if __name__=='__main__':
main()
Results:
-inf
-inf
-inf
-inf
/srv/conda/lib/python3.6/site-packages/ipykernel_launcher.py:25: RuntimeWarning: overflow encountered in add
Here is a different approach, a Python 3D surface fitter using your data with 3D scatter plot, 3D surface plot, and contour plot. You should be able to click-drag and rotate the 3D plots in 3-space for visual inspection. Here the fitted surface is a flat plane, and there is no need for test and train split as the RMSE and R-squared are given directly and you can see the surface. Just re-fit with all data.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
# x, y, z = col1, col2, target
xData = numpy.array([0.18, 1.0, 0.92, 0.07, 0.85, 0.99, 0.87])
yData = numpy.array([0.89, 0.26, 0.11, 0.37, 0.16, 0.41, 0.47])
zData = numpy.array([109.85, 155.72, 137.66, 76.17, 139.75, 162.6, 151.77])
def func(data, a, b, c):
x = data[0]
y = data[1]
return (a * x) + (y * b) + c
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
if __name__ == "__main__":
data = [xData, yData, zData]
initialParameters = [1.0, 1.0, 1.0] # these are the same as scipy default values in this example
# here a non-linear surface fit is made with scipy's curve_fit()
fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted prameters', fittedParameters)
modelPredictions = func(data, *fittedParameters)
absError = modelPredictions - zData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
I am trying to fit this X, Y, Z datasets to an unknown surface.
Unfortunately, linear fitting is not good enough to show the surface data. I think the polynomial fitting might fit in this case. In addition, The problem is that I do not know how to build the polynomial fitting function to make the surface fitting done.
Any help would be great.
Thank you
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
X = [[2, 2, 2], [1.5, 1.5, 1.5], [0.5, 0.5, 0.5]]
Y = [[3, 2, 1], [3, 2, 1], [3, 2, 1]]
Z = [[2.4, 2.5, 2.2], [2.4, 3, 2.5], [4, 3.3, 8]]
# ================= Plot figure ================= ##
Fontsize_set = {'size': 20}
fig = plt.figure(figsize=[8, 5], dpi=140, facecolor='w')
ax = fig.gca(projection='3d')
ax.grid(color='y', linestyle='--', linewidth=0.5)
ax.tick_params(labelsize=20)
ax.set_xlim3d(0, 3)
ax.set_ylim3d(0, 6)
ax.set_zlim3d(0, 10)
ax.view_init(30, 45)
ax.scatter(X, Y, Z, s=50, color='k', marker='o', linewidth=None, alpha=1)
# ax.plot_surface(X, Y, Z)
fig.tight_layout()
plt.show()
Here you go
=^..^=
Description in code:
import numpy as np
from scipy.optimize import curve_fit
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
# test function
def function(data, a, b, c):
x = data[0]
y = data[1]
return a * (x**b) * (y**c)
# setup test data
raw_data = [2.0, 2.0, 2.0], [1.5, 1.5, 1.5], [0.5, 0.5, 0.5],[3.0, 2.0, 1.0], [3.0, 2.0, 1.0],\
[3.0, 2.0, 1.0], [2.4, 2.5, 2.2], [2.4, 3.0, 2.5], [4.0, 3.3, 8.0]
# convert data into proper format
x_data = []
y_data = []
z_data = []
for item in raw_data:
x_data.append(item[0])
y_data.append(item[1])
z_data.append(item[2])
# get fit parameters from scipy curve fit
parameters, covariance = curve_fit(function, [x_data, y_data], z_data)
# create surface function model
# setup data points for calculating surface model
model_x_data = np.linspace(min(x_data), max(x_data), 30)
model_y_data = np.linspace(min(y_data), max(y_data), 30)
# create coordinate arrays for vectorized evaluations
X, Y = np.meshgrid(model_x_data, model_y_data)
# calculate Z coordinate array
Z = function(np.array([X, Y]), *parameters)
# setup figure object
fig = plt.figure()
# setup 3d object
ax = Axes3D(fig)
# plot surface
ax.plot_surface(X, Y, Z)
# plot input data
ax.scatter(x_data, y_data, z_data, color='red')
# set plot descriptions
ax.set_xlabel('X data')
ax.set_ylabel('Y data')
ax.set_zlabel('Z data')
plt.show()
Here is an additional graphics example with scatterplot, surface plot, and contour plot. You should be able to hold down the mouse button and rotate the 3D plots.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else there can be memory and process problems
def func(data, a, b, c):
x = data[0]
y = data[1]
return (a * x) + (y * b) + c
if __name__ == "__main__":
xData = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
yData = numpy.array([11.0, 12.1, 13.0, 14.1, 15.0, 16.1, 17.0, 18.1, 90.0])
zData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.0, 9.9])
data = [xData, yData, zData]
initialParameters = [1.0, 1.0, 1.0] # these are the same as scipy default values in this example
# here a non-linear surface fit is made with scipy's curve_fit()
fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted prameters', fittedParameters)
modelPredictions = func(data, *fittedParameters)
absError = modelPredictions - zData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
I am trying to find a way to fit a linear regression. However I would like to force the coefficient of some drivers to be positive.
As far as I understood, scipy.optimize.nnls can do non-negative least squares but for all drivers.
Is there a way to do it automatically?
Thanks a lot.
Here is a graphical fitter that has a "brick wall" in the fitting function that forces one of the fitted parameters to be positive. Note that in this example, the fit is quite poor - if you remove the "brick wall" the example fit improves greatly. This example uses the default scipy curve_fit() initial parameter estimates of all 1.0, and does not use scipy's genetic algorithm to help find initial parameter estimates. When using this technique the initial parameter estimates must be outside the "brick wall" conditions so that the non-linear fitter can begin normally.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
xData = numpy.array([19.1647, 18.0189, 16.9550, 15.7683, 14.7044, 13.6269, 12.6040, 11.4309, 10.2987, 9.23465, 8.18440, 7.89789, 7.62498, 7.36571, 7.01106, 6.71094, 6.46548, 6.27436, 6.16543, 6.05569, 5.91904, 5.78247, 5.53661, 4.85425, 4.29468, 3.74888, 3.16206, 2.58882, 1.93371, 1.52426, 1.14211, 0.719035, 0.377708, 0.0226971, -0.223181, -0.537231, -0.878491, -1.27484, -1.45266, -1.57583, -1.61717])
yData = numpy.array([0.644557, 0.641059, 0.637555, 0.634059, 0.634135, 0.631825, 0.631899, 0.627209, 0.622516, 0.617818, 0.616103, 0.613736, 0.610175, 0.606613, 0.605445, 0.603676, 0.604887, 0.600127, 0.604909, 0.588207, 0.581056, 0.576292, 0.566761, 0.555472, 0.545367, 0.538842, 0.529336, 0.518635, 0.506747, 0.499018, 0.491885, 0.484754, 0.475230, 0.464514, 0.454387, 0.444861, 0.437128, 0.415076, 0.401363, 0.390034, 0.378698])
def func(x, a, b, offset): #exponential curve fitting function
# force a to be positive by using "brick wall" that
# returns a large value, and therefore a large error,
# if parameter a is not positive
if a <= 0.0:
return 1.0E10
return a * numpy.exp(-b*x) + offset
fittedParameters, pcov = curve_fit(func, xData, yData)
print(fittedParameters)
print()
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print()
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
Per the comment regarding constrained multiple regression, here is a graphical 3D surface fitter that also has a "brick wall" to force one of the fitted parameters to be positive. The call to curve_fit can be made with either the constrained or unconstrained function versions for comparison.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def func(data, a, b, c):
# extract the individual data arrays used in the equation
x = data[0]
y = data[1]
return a*x + b*y + c
def constrainedFunction(data, a, b, c):
# use a "brick wall" to ensure parameter c is positive
# return a large value and therefor large error
if c <= 0.0:
return 1.0E10
else:
return func(data, a, b, c) # call the unconstrained function
if __name__ == "__main__":
xData = numpy.array([-10.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
yData = numpy.array([-10.0, 11.0, 12.1, 13.0, 14.1, 15.0, 16.1, 17.0, 18.1, 19.0])
zData = numpy.array([-30.0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.0, 9.9])
data = [xData, yData, zData]
initialParameters = [1.0, 1.0, 1.0] # these are the same as scipy default values in this example
# here a non-linear surface fit is made with scipy's curve_fit()
fittedParameters, pcov = scipy.optimize.curve_fit(constrainedFunction, [xData, yData], zData, p0 = initialParameters)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted prameters', fittedParameters)
Firstly, I am not familiar with Python and I still barely understand the mechanism of Python code. But I need to do some statistical analysis through Python.
I have tried many many ways to figure out but I failed.
Basically, I have 3 arrays of data (assume these arrays are X, Y, Z).
I did some analysis with (X, Y) and (Z, Y) by making the scatter plot and put the best fit with the data to see the correlation.
№1 and №2 are quite easy enough.
Now I need to see the edge on view from the graph which is the one with combined X and Z. So, I made the equation (see below).
import pylab as pl
import numpy as np
from pylab import *
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
s = np.loadtxt('New_list3.txt')
s = s.T
x = s[1]
y = s[2]
z = s[4]
upper_error = s[5]
lower_error = s[6]
asymmetric_error = [lower_error, upper_error]
def func(X, a1, a2, a3):
x1, y1 = X
return a1 * x1 + a2 * y1 + a3
popt, pcov = curve_fit(func,(x,y),z)
new_x=func((x,y),popt[0],popt[1],0)
new_y=z
new_z = np.polyfit(new_x,new_y,1)
p = np.poly1d(new_z)
plt.plot(func((x, y), popt[0], popt[1], 0), z, '.k')
pl.plot(new_x, p(new_x), "r-")
plt.errorbar(new_x, z ,yerr=asymmetric_error, ecolor='b', capsize=3, marker ='o', fmt='none')
print popt
plt.show()
Now I cannot figure out to find the errors from that equation which is y = a1 * x + a2 * z + a3. I found the best values of a1, a2, a3. However, not the errors.
How can I find the "errors" of a1, a2, a3?
Here is example Python code using scipy.optimize.curve_fit to fit a surface, and it makes a 3D scatterplot of the raw data, a 3D scatterplot of the errors, a surface plot, and a contour plot. Change this to use your own data and function, and you should be done.
import numpy, scipy
import scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
def SurfacePlot(equationFunc, data, params):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = equationFunc(numpy.array([X, Y]), *params)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ContourPlot(equationFunc, data, params):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = equationFunc(numpy.array([X, Y]), *params)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ScatterPlot(data, title):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data, depthshade=False, color='k')
axes.set_title(title)
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def EquationFunc(data, *params):
p0 = params[0]
p1 = params[1]
return p0 + numpy.sqrt(data[0]) + numpy.cos(data[1] / p1)
if __name__ == "__main__":
# raw data
xData = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
yData = numpy.array([11.0, 12.1, 13.0, 14.1, 15.0, 16.1, 17.0, 18.1, 90.0])
zData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.0, 9.9])
pInitial = (1.0, 1.0)
popt, pcov = scipy.optimize.curve_fit(EquationFunc,(xData,yData),zData, p0=pInitial)
dataForPlotting = [xData, yData, zData]
ScatterPlot([xData, yData, zData], 'Data Scatter Plot (click-drag with mouse)')
SurfacePlot(EquationFunc, [xData, yData, zData], popt)
ContourPlot(EquationFunc, [xData, yData, zData], popt)
absError = zData - EquationFunc((xData,yData), *popt)
ScatterPlot([xData, yData, absError], 'Error Scatter Plot (click-drag with mouse)')