Python: Unexpected predicted vs actual plot for regression models

I have 3 regression models: Linear regression, Random Forest, and ANN.
To compare them, I am making two plots for each: residuals and actual vs predicted.
Overall, the residual plots suggest that the models predict the data well: the residuals are symmetric and follow the horizontal line. However, in the actual vs predicted plots, none of the models come anywhere near the 45-degree line (which would indicate perfect prediction). What am I doing wrong? The relevant code is below:
# Separate features from the label (add .values) to use arrays instead of dataframes
X = dataset2.iloc[:, 0:-1]
y = dataset2.iloc[:, -1]
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Scale the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train OLS ####################################
ols_regressor = sm.OLS(y_train, X_train).fit()
# Predictions
y_pred_ols_test = ols_regressor.predict(X_test)
y_pred_ols_train = ols_regressor.predict(X_train)
# Train RF ####################################
rf = RandomForestRegressor(bootstrap=False,
                           max_depth=100,
                           max_features=3,
                           min_samples_leaf=1,
                           min_samples_split=5,
                           n_estimators=800)
rf.fit(X_train, y_train)
# Predictions
y_pred_rf_test = rf.predict(X_test)
y_pred_rf_train = rf.predict(X_train)
# Train NN ####################################
# ... TRAIN NN WITH KERAS AND TENSORFLOW ...
y_pred_nn_test = ann.predict(X_test)
y_pred_nn_train = ann.predict(X_train)
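(Side note on the OLS fit above: unlike scikit-learn, sm.OLS does not add an intercept automatically, so the linear model is fit without a constant term unless one is added explicitly. A minimal sketch of the usual pattern, in case an intercept is intended; whether you actually want one is a modelling choice:)
import statsmodels.api as sm

# sm.add_constant prepends a column of ones so the OLS fit includes an intercept
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)
ols_regressor = sm.OLS(y_train, X_train_const).fit()
y_pred_ols_test = ols_regressor.predict(X_test_const)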
Here I am plotting the residuals:
fig, axes = plt.subplots(3, 1, figsize=(8, 12))
fig.suptitle('Residuals', fontweight="bold", fontsize=15)
# OLS #######################################
axes[0] = sns.residplot(ax=axes[0], x=y_pred_ols_test, y=y_test, data=dataset,
                        lowess=True,
                        color='darkcyan',
                        scatter_kws={'alpha': 0.5},
                        line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
axes[0].set_xlim([-0.0050, 0.0005])
axes[0].set_ylim([-0.03, 0.03])
axes[0].set_title('Linear Regression')
axes[0].set_xlabel(' ')
axes[0].set_ylabel('Residuals')
# Random Forest ###############################
axes[1] = sns.residplot(ax=axes[1], x=y_pred_rf_test, y=y_test, data=dataset,
                        lowess=True,
                        color='darkcyan',
                        scatter_kws={'alpha': 0.5},
                        line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
axes[1].set_xlim([-0.016, 0.006])
axes[1].set_ylim([-0.03, 0.03])
axes[1].set_title('Random Forest')
axes[1].set_xlabel(' ')
axes[1].set_ylabel('Residuals')
# Neural Network ###############################
axes[2] = sns.residplot(ax=axes[2], x=y_pred_nn_test, y=y_test, data=dataset,
                        lowess=True,
                        color='darkcyan',
                        scatter_kws={'alpha': 0.5},
                        line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
axes[2].set_xlim([-0.018, -0.009])
axes[2].set_ylim([-0.03, 0.03])
axes[2].set_title('Artificial Neural Network')
axes[2].set_xlabel('Predicted values')
axes[2].set_ylabel('Residuals')
plt.show()
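One thing worth noting about this plot: sns.residplot(x=..., y=...) fits its own regression of y on x and plots the residuals of that internal fit, so with x=predictions and y=true values it is not quite the same as plotting the models' own residuals (true minus predicted). A minimal way to plot the raw model residuals directly, sketched here only for the OLS panel (the other panels would be analogous):
residuals_ols = y_test - y_pred_ols_test            # the model's own residuals
axes[0].scatter(y_pred_ols_test, residuals_ols, alpha=0.5, color='darkcyan')
axes[0].axhline(0, color='red', lw=1)               # reference line at zero residual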
Then I plot the predicted vs actual plot:
fig, axes = plt.subplots(3, 1, figsize=(8, 12))
fig.suptitle('Predicted vs Actual', fontweight="bold", fontsize=15)
# OLS #######################################
axes[0] = sns.regplot(ax=axes[0], x=y_pred_ols_test, y=y_test,
                      color='darkcyan',
                      scatter_kws={'alpha': 0.5},
                      line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
x0 = -0.00415
x1 = 0.00025
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
axes[0].plot([x0, x1], [y0, y1], ls="--", c="grey")
# Axes Scale
axes[0].set_xlim([x0, x1])
axes[0].set_ylim([y0, y1])
# Labels
axes[0].set_title('Linear Regression')
axes[0].set_xlabel('')
axes[0].set_ylabel('True values')
# Random Forest ###############################
axes[1] = sns.regplot(ax=axes[1], x=y_pred_rf_test, y=y_test,
                      color='darkcyan',
                      scatter_kws={'alpha': 0.5},
                      line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
x0 = -0.0125
x1 = 0.0045
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
axes[1].plot([x0, x1], [y0, y1], ls="--", c="grey")
# Axes Scale
axes[1].set_xlim([x0, x1])
axes[1].set_ylim([y0, y1])
axes[1].set_title('Random Forest')
axes[1].set_xlabel(' ')
axes[1].set_ylabel('True values')
# NN #########################################
axes[2] = sns.regplot(ax=axes[2], x=y_pred_nn_test, y=y_test,
                      color='darkcyan',
                      scatter_kws={'alpha': 0.5},
                      line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
x0 = -0.0175
x1 = -0.0092
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
axes[2].plot([x0, x1], [y0, y1], ls="--", c="grey")
# Axes Scale
axes[2].set_xlim([x0, x1])
axes[2].set_ylim([y0, y1])
axes[2].set_title('Artificial Neural Network')
axes[2].set_xlabel('Predicted values')
axes[2].set_ylabel('True values')
plt.show()
In theory, if the residual plots are symmetric and hug the horizontal line, shouldn't the actual vs predicted plots look good as well? What am I missing here?
For the actual vs predicted plots I also tried sns.scatterplot(), but the result was essentially the same.
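It is also worth noting that the dashed "45 degree" line in the code above runs between different x- and y-ranges (e.g. x from -0.00415 to 0.00025 but y from -0.03 to 0.03), so it is the plot diagonal rather than the y = x line. A minimal sketch of one way to overlay a true identity line, shown for the OLS panel only (the other panels would be analogous):
# A true identity (y = x) reference line: use the same limits on both axes
lims = [min(y_pred_ols_test.min(), y_test.min()),
        max(y_pred_ols_test.max(), y_test.max())]
axes[0].plot(lims, lims, ls="--", c="grey")   # points where predicted == actual
axes[0].set_xlim(lims)
axes[0].set_ylim(lims)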

Related

Scikit classification comparison

I compared the test-set results of 6 classifiers and got Random Forest with the highest accuracy and NB with the lowest.
But when I look at the evaluation metrics (F1, precision and recall), NB has the highest values among them all.
Is this normal, or am I doing something wrong?
I used the same code to calculate the metrics for each classifier:
RFclf=RandomForestClassifier(n_estimators=20)
# Train the model using the training sets
RFclf.fit(X_train,y_train)
train_pred=RFclf.predict(X_train)
test_pred=RFclf.predict(X_test)
rftrain = metrics.accuracy_score(y_train, train_pred)
rftest = metrics.accuracy_score(y_test, test_pred)
print("Accuracy for Random Forest Test data on the test set: ",rftest)
frftest = f1_score(y_test, test_pred)
prftest = precision_score(y_test, test_pred)
rrftest = recall_score(y_test, test_pred)
and used this code to do the plot:
plt.figure(figsize=(14, 7))
ax = plt.subplot(111)
models = ['SVM ','NB','Decision Tree ','LR','KNN','RF']
values = [pSVMtest ,pGAcc ,pdesicionT ,pLRAcc ,pKNAcc ,prftest]
model = np.arange(len(models))
plt.bar(model+0.15, values, align='center', width=0.15, alpha=0.7, color='blue', label='precision')
plt.xticks(model, models)
ax = plt.subplot(111)
models = ['SVM ','NB','Decision Tree ','LR','KNN','RF']
values = [rSVMtest ,rGAcc ,rdesicionT ,rLRAcc ,rKNAcc ,rrftest]
model = np.arange(len(models))
plt.bar(model+0.3, values, align='center', width=0.15, alpha=0.7, color='green', label='recall')
plt.xticks(model, models)
ax = plt.subplot(111)
models = ['SVM ','NB','Decision Tree ','LR','KNN','RF']
values = [fSVMtest ,fGAcc ,fdesicionT ,fLRAcc ,fKNAcc ,frftest]
model = np.arange(len(models))
plt.bar(model+0.45, values, align='center', width=0.15, alpha=0.7, color='red', label='F1 score')
plt.xticks(model, models)
plt.ylabel('Performance Metrics for Different models')
plt.title('Model')
# removing the axis on the top and right of the plot window
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()
plt.show()
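As an aside on the metric comparison itself: accuracy and F1/precision/recall can legitimately rank models differently (for example under class imbalance), so a different ordering is not by itself a sign of a bug. One way to compute all four test-set metrics side by side for each fitted model; the model dictionary below is a hypothetical placeholder for whichever classifiers you trained:
from sklearn import metrics

# hypothetical dict of already-fitted classifiers; substitute your own model objects
fitted_models = {'RF': RFclf, 'NB': NBclf}

for name, clf in fitted_models.items():
    test_pred = clf.predict(X_test)
    print(name,
          "acc=%.3f" % metrics.accuracy_score(y_test, test_pred),
          "precision=%.3f" % metrics.precision_score(y_test, test_pred),
          "recall=%.3f" % metrics.recall_score(y_test, test_pred),
          "F1=%.3f" % metrics.f1_score(y_test, test_pred))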

Plotting and Interpreting TensorFlow Results

The Problem:
I'm having trouble plotting and interpreting the results from my TensorFlow model. I've created my own CSV of [x, y, color], where the dots are randomly scattered but there is a clear pattern in how they are colored. I'm able to feed all the data into the model and train the neural network, but I can't seem to put it all together. I'm a bit new to this as a hobbyist.
Essentially, I want the ML algorithm to pick up the pattern from 100 data points and use it on a test dataset of points to plot an approximation of the pattern.
The Code:
LABEL_COLUMN = "Color"
LABELS = [0, 1]

def get_dataset(data_url, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        data_url,
        batch_size=5,
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset

project_data = get_dataset(data_url)
project_test_data = get_dataset(test_data_url)

def pack(features, label):
    return tf.stack(list(features.values()), axis=-1), label

packed_data = project_data.map(pack)
packed_test_data = project_test_data.map(pack)

model2 = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(1),
])

model2.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["accuracy"]
)

model2.fit(packed_data, epochs=100)
model_output = model2.predict(packed_test_data)
model_output.plot()
Gives the below error:
AttributeError: 'numpy.ndarray' object has no attribute 'plot'
Perhaps this function can be adapted to solve your problem?
(From https://jonchar.net/notebooks/Artificial-Neural-Network-with-Keras/)
import matplotlib.pyplot as plt
import numpy as np

def plot_decision_boundary(X, y, model, steps=1000, cmap='Paired'):
    """
    Function to plot the decision boundary and data points of a model.
    Data points are colored based on their actual label.
    """
    cmap = plt.get_cmap(cmap)
    # Define region of interest by data limits
    xmin, xmax = X[:, 0].min() - 1, X[:, 0].max() + 1
    ymin, ymax = X[:, 1].min() - 1, X[:, 1].max() + 1
    x_span = np.linspace(xmin, xmax, steps)
    y_span = np.linspace(ymin, ymax, steps)
    xx, yy = np.meshgrid(x_span, y_span)
    # Make predictions across region of interest
    labels = model.predict(np.c_[xx.ravel(), yy.ravel()])
    # Plot decision boundary in region of interest
    z = labels.reshape(xx.shape)
    fig, ax = plt.subplots()
    ax.contourf(xx, yy, z, cmap=cmap, alpha=0.5)
    # Get predicted labels on training data and plot
    train_labels = model.predict(X)
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, lw=0)
    return fig, ax

plot_decision_boundary(X, y, model, cmap='RdBu')
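One caveat when adapting that helper to the model above: because model2 was compiled with from_logits=True, model2.predict() returns raw logits rather than class labels, so the output may need to be squashed and thresholded before it is treated as a color/label. A minimal sketch of that step; the 0.5 cut-off is my assumption for the two classes:
import numpy as np
import tensorflow as tf

logits = model2.predict(packed_test_data)      # shape (n_samples, 1), raw scores from Dense(1)
probs = tf.sigmoid(logits).numpy().ravel()     # convert logits to probabilities in [0, 1]
pred_colors = (probs >= 0.5).astype(int)       # assumed 0.5 threshold for the two color classes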

Multiple Linear Regression in Python?

I am trying to build a multiple linear regression on dummy data and I keep getting an overflow error.
Assume this as a dummy data.
print(x_train)
col1 col2 target
0.18 0.89 109.85
1.0 0.26 155.72
0.92 0.11 137.66
0.07 0.37 76.17
0.85 0.16 139.75
0.99 0.41 162.6
0.87 0.47 151.77
print(x_test)
0.49 0.18
0.57 0.83
0.56 0.64
0.76 0.18
This is the code I wrote to implement linear regression with multiple features. Can anyone tell me whether my implementation of linear regression is correct? If it is correct, then why do I keep getting an overflow error?
import numpy as np

def data():
    # prepare data
    x_train = np.array(train_data)[:, :-1]
    y_train = np.array(train_data)[:, -1]
    x_test = np.array(test_data)
    return x_train, y_train, x_test

def normalize(y):
    return (y - y.min()) / (y.max() - y.min())

def linear_regression(x_train, y_train, epochs=300):
    y_train = normalize(y_train)
    rows, columns = x_train.shape
    weights = np.zeros((columns))
    intercept = 0
    for x in range(epochs):
        for i in range(len(x_train)):
            prev_weights = weights
            weights += intercept + prev_weights * x_train[i] - y_train[i]
            intercept += (intercept + (prev_weights * x_train[i]) - y_train[i]).dot(x_train[i])
    return weights, intercept

def predict(x_test, weights, intercept):
    y_pred = []
    for i in range(len(x_test)):
        y_pred.append(weights.dot(x_test[i]) + intercept)
    return y_pred

def main():
    x_train, y_train, x_test = data()
    weights, intercept = linear_regression(x_train, y_train, epochs=300)
    y_pred = predict(x_test, weights, intercept)
    for i in y_pred:
        print(str(i))

if __name__ == '__main__':
    main()
Results:
-inf
-inf
-inf
-inf
/srv/conda/lib/python3.6/site-packages/ipykernel_launcher.py:25: RuntimeWarning: overflow encountered in add
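For comparison, the divergence to -inf usually comes from updating the weights with the raw error term instead of stepping against the gradient with a small learning rate. Below is a minimal batch gradient-descent sketch for the same setup; the learning rate lr and the function name are my additions, not part of the original code:
import numpy as np

def linear_regression_gd(x_train, y_train, epochs=300, lr=0.1):
    """Plain batch gradient descent for y ~ X.w + b; a sketch, not the exact algorithm above."""
    # same min-max normalization of the target as in the original code
    y_train = (y_train - y_train.min()) / (y_train.max() - y_train.min())
    n_samples, n_features = x_train.shape
    weights = np.zeros(n_features)
    intercept = 0.0
    for _ in range(epochs):
        y_hat = x_train.dot(weights) + intercept            # current predictions
        error = y_hat - y_train                             # residuals
        weights -= lr * x_train.T.dot(error) / n_samples    # gradient step for the weights
        intercept -= lr * error.mean()                      # gradient step for the intercept
    return weights, intercept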
Here is a different approach: a Python 3D surface fitter using your data, with a 3D scatter plot, a 3D surface plot, and a contour plot. You should be able to click-drag and rotate the 3D plots in 3-space for visual inspection. Here the fitted surface is a flat plane, so there is no need for a train/test split: the RMSE and R-squared are reported directly and you can inspect the surface. Just re-fit with all of the data.
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm  # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt

graphWidth = 800   # units are pixels
graphHeight = 600  # units are pixels

# 3D contour plot lines
numberOfContourLines = 16

# x, y, z = col1, col2, target
xData = numpy.array([0.18, 1.0, 0.92, 0.07, 0.85, 0.99, 0.87])
yData = numpy.array([0.89, 0.26, 0.11, 0.37, 0.16, 0.41, 0.47])
zData = numpy.array([109.85, 155.72, 137.66, 76.17, 139.75, 162.6, 151.77])

def func(data, a, b, c):
    x = data[0]
    y = data[1]
    return (a * x) + (y * b) + c

def SurfacePlot(func, data, fittedParameters):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    matplotlib.pyplot.grid(True)
    axes = Axes3D(f)
    x_data = data[0]
    y_data = data[1]
    z_data = data[2]
    xModel = numpy.linspace(min(x_data), max(x_data), 20)
    yModel = numpy.linspace(min(y_data), max(y_data), 20)
    X, Y = numpy.meshgrid(xModel, yModel)
    Z = func(numpy.array([X, Y]), *fittedParameters)
    axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
    axes.scatter(x_data, y_data, z_data)  # show data along with plotted surface
    axes.set_title('Surface Plot (click-drag with mouse)')  # add a title for surface plot
    axes.set_xlabel('X Data')  # X axis data label
    axes.set_ylabel('Y Data')  # Y axis data label
    axes.set_zlabel('Z Data')  # Z axis data label
    plt.show()
    plt.close('all')  # clean up after using pyplot or else there can be memory and process problems

def ContourPlot(func, data, fittedParameters):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)
    x_data = data[0]
    y_data = data[1]
    z_data = data[2]
    xModel = numpy.linspace(min(x_data), max(x_data), 20)
    yModel = numpy.linspace(min(y_data), max(y_data), 20)
    X, Y = numpy.meshgrid(xModel, yModel)
    Z = func(numpy.array([X, Y]), *fittedParameters)
    axes.plot(x_data, y_data, 'o')
    axes.set_title('Contour Plot')  # add a title for contour plot
    axes.set_xlabel('X Data')  # X axis data label
    axes.set_ylabel('Y Data')  # Y axis data label
    CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
    matplotlib.pyplot.clabel(CS, inline=1, fontsize=10)  # labels for contours
    plt.show()
    plt.close('all')  # clean up after using pyplot or else there can be memory and process problems

def ScatterPlot(data):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    matplotlib.pyplot.grid(True)
    axes = Axes3D(f)
    x_data = data[0]
    y_data = data[1]
    z_data = data[2]
    axes.scatter(x_data, y_data, z_data)
    axes.set_title('Scatter Plot (click-drag with mouse)')
    axes.set_xlabel('X Data')
    axes.set_ylabel('Y Data')
    axes.set_zlabel('Z Data')
    plt.show()
    plt.close('all')  # clean up after using pyplot or else there can be memory and process problems

if __name__ == "__main__":
    data = [xData, yData, zData]
    initialParameters = [1.0, 1.0, 1.0]  # these are the same as scipy default values in this example

    # here a non-linear surface fit is made with scipy's curve_fit()
    fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0=initialParameters)

    ScatterPlot(data)
    SurfacePlot(func, data, fittedParameters)
    ContourPlot(func, data, fittedParameters)

    print('fitted parameters', fittedParameters)

    modelPredictions = func(data, *fittedParameters)
    absError = modelPredictions - zData
    SE = numpy.square(absError)  # squared errors
    MSE = numpy.mean(SE)         # mean squared errors
    RMSE = numpy.sqrt(MSE)       # Root Mean Squared Error, RMSE
    Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
    print('RMSE:', RMSE)
    print('R-squared:', Rsquared)
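As a usage note, once curve_fit has returned fittedParameters, the same func() can be evaluated on the test rows from the question to obtain predictions; the column values below are copied from the question's x_test:
# columns copied from the question's x_test
x_test_col1 = numpy.array([0.49, 0.57, 0.56, 0.76])
x_test_col2 = numpy.array([0.18, 0.83, 0.64, 0.18])
testPredictions = func(numpy.array([x_test_col1, x_test_col2]), *fittedParameters)
print('test predictions', testPredictions)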

colorbar within a loop with matshow

I basically want to add a colorbar to each of the subplots in the code below (link to code). My attempts add all the colorbars at the end of the loop, in the last subplot.
print(__doc__)

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_mldata
from sklearn.neural_network import MLPClassifier

mnist = fetch_mldata("MNIST original")
# rescale the data, use the traditional train/test split
X, y = mnist.data / 255., mnist.target
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
#                     solver='sgd', verbose=10, tol=1e-4, random_state=1)
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1)
mlp.fit(X_train, y_train)
print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))

fig, axes = plt.subplots(4, 4)
# use global min / max to ensure all weights are shown on the same scale
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,
               vmax=.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())

plt.show()
UPDATE:
Based on the link in the comment below, here is the code that adds a single colorbar to the right of the figure:
fig, axes = plt.subplots(4, 4)
# use global min / max to ensure all weights are shown on the same scale
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    im = ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin, vmax=.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())

fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im, cax=cbar_ax)
plt.show()
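If the goal is still one colorbar per subplot rather than a single shared one, a minimal variation is to call fig.colorbar(im, ax=ax) inside the loop, so each colorbar is attached to its own axes (a sketch; the figsize is just a suggestion to leave room for the extra bars):
fig, axes = plt.subplots(4, 4, figsize=(12, 12))
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    im = ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin, vmax=.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())
    fig.colorbar(im, ax=ax)  # one colorbar per subplot, taking space from that axes
plt.show()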

Using StatsModels to plot quantile regression for 2nd order polynomial

I am following the StatsModels example here to plot quantile regression lines. With only slight modification for my data, the example works great, producing this plot (note that I have modified the code to only plot the 0.05, 0.25, 0.5, 0.75, and 0.95 quantiles):
However, I would like to plot the OLS fit and corresponding quantiles for a 2nd order polynomial fit (instead of linear). For example, here is the 2nd-order OLS line for the same data:
How can I modify the code in the linked example to produce non-linear quantiles?
Here is my relevant code modified from the linked example to produce the 1st plot:
d = {'temp': x, 'dens': y}
df = pd.DataFrame(data=d)
# Least Absolute Deviation
#
# The LAD model is a special case of quantile regression where q=0.5
mod = smf.quantreg('dens ~ temp', df)
res = mod.fit(q=.5)
print(res.summary())
# Prepare data for plotting
#
# For convenience, we place the quantile regression results in a Pandas DataFrame, and the OLS results in a dictionary.
quantiles = [.05, .25, .50, .75, .95]
def fit_model(q):
    res = mod.fit(q=q)
    return [q, res.params['Intercept'], res.params['temp']] + res.conf_int().ix['temp'].tolist()
models = [fit_model(x) for x in quantiles]
models = pd.DataFrame(models, columns=['q', 'a', 'b','lb','ub'])
ols = smf.ols('dens ~ temp', df).fit()
ols_ci = ols.conf_int().ix['temp'].tolist()
ols = dict(a=ols.params['Intercept'],
           b=ols.params['temp'],
           lb=ols_ci[0],
           ub=ols_ci[1])
print(models)
print(ols)
x = np.arange(df.temp.min(), df.temp.max(), 50)
get_y = lambda a, b: a + b * x
for i in range(models.shape[0]):
    y = get_y(models.a[i], models.b[i])
    plt.plot(x, y, linestyle='dotted', color='grey')
y = get_y(ols['a'], ols['b'])
plt.plot(x, y, color='red', label='OLS')
plt.scatter(df.temp, df.dens, alpha=.2)
plt.xlim((-10, 40))
plt.ylim((0, 0.4))
plt.legend()
plt.xlabel('temp')
plt.ylabel('dens')
plt.show()
After a day of looking into this, I came up with a solution, so I am posting my own answer. Much credit to Josef Perktold at StatsModels for the assistance.
Here is the relevant code and plot:
d = {'temp': x, 'dens': y}
df = pd.DataFrame(data=d)
x1 = pd.DataFrame({'temp': np.linspace(df.temp.min(), df.temp.max(), 200)})
poly_2 = smf.ols(formula='dens ~ 1 + temp + I(temp ** 2.0)', data=df).fit()
plt.plot(x, y, 'o', alpha=0.2)
plt.plot(x1.temp, poly_2.predict(x1), 'r-',
         label='2nd order poly fit, $R^2$=%.2f' % poly_2.rsquared,
         alpha=0.9)
plt.xlim((-10, 50))
plt.ylim((0, 0.25))
plt.xlabel('mean air temp')
plt.ylabel('density')
plt.legend(loc="upper left")
# with quantile regression
# Least Absolute Deviation
# The LAD model is a special case of quantile regression where q=0.5
mod = smf.quantreg('dens ~ temp + I(temp ** 2.0)', df)
res = mod.fit(q=.5)
print(res.summary())
# Quantile regression for 5 quantiles
quantiles = [.05, .25, .50, .75, .95]
# get all result instances in a list
res_all = [mod.fit(q=q) for q in quantiles]
res_ols = smf.ols('dens ~ temp + I(temp ** 2.0)', df).fit()
plt.figure()
# create x for prediction
x_p = np.linspace(df.temp.min(), df.temp.max(), 50)
df_p = pd.DataFrame({'temp': x_p})
for qm, res in zip(quantiles, res_all):
    # get prediction for the model and plot
    # here we use a dict which works the same way as the df in ols
    plt.plot(x_p, res.predict({'temp': x_p}), linestyle='--', lw=1,
             color='k', label='q=%.2F' % qm, zorder=2)
y_ols_predicted = res_ols.predict(df_p)
plt.plot(x_p, y_ols_predicted, color='red', zorder=1)
#plt.scatter(df.temp, df.dens, alpha=.2)
plt.plot(df.temp, df.dens, 'o', alpha=.2, zorder=0)
plt.xlim((-10, 50))
plt.ylim((0, 0.25))
#plt.legend(loc="upper center")
plt.xlabel('mean air temp')
plt.ylabel('density')
plt.title('')
plt.show()
red line: 2nd order polynomial fit
black dashed lines: 5th, 25th, 50th, 75th, 95th percentiles
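If you also want the fitted coefficients for each quantile (analogous to the models DataFrame in the linear example), they can be collected from the result instances; the column names come from the patsy formula terms, so 'I(temp ** 2.0)' is the quadratic coefficient. A small sketch:
import pandas as pd

# one row of coefficients per fitted quantile; columns follow the formula terms
coef_table = pd.DataFrame([res.params for res in res_all], index=quantiles)
coef_table.index.name = 'q'
print(coef_table)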
