Representation of a training and validation metric in a pipeline - python

I want to plot my RMSE values. However, I now use a pipeline because, besides cross-validation, I also use other steps such as feature selection.
My question is: is there a way to produce this plot through the pipeline, without training the model a second time? In other words, how can I display the training and validation RMSE nicely in a diagram when working with a pipeline?
Pipeline
dfListingsFeature_regression = pd.read_csv(r"https://raw.githubusercontent.com/Coderanker3/dataset4/main/listings_cleaned.csv")
d = {True: 1, False: 0, np.nan: np.nan}
dfListingsFeature_regression['host_is_superhost'] = dfListingsFeature_regression['host_is_superhost'].map(d).astype('int')

X = dfListingsFeature_regression.drop(columns=['host_id', 'id', 'price'])  # Features
y = dfListingsFeature_regression['price']                                  # Target variable
print(dfListingsFeature_regression.shape)

steps = [('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=10000))),
         ('lasso', Lasso(alpha=0.4))]

pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

parameters = {}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5)
grid.fit(X_train, y_train)

print("score = %3.2f" % (grid.score(X_test, y_test)))
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))

y_pred = grid.predict(X_test)
print("RMSE Val:", metrics.mean_squared_error(y_test, y_pred, squared=False))
y_train_predict = grid.predict(X_train)
print("Train:", metrics.mean_squared_error(y_train, y_train_predict, squared=False))

r2 = metrics.r2_score(y_test, y_pred)
print(r2)
Plot
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, 500 + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    plt.figure(figsize=(10, 10))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
%%time
lin_reg = Lasso(alpha=0.1)
plot_learning_curves(lin_reg, X, y)
#plt.axis([0, 80, 0, 3])
plt.show()

You don't have to fit() your model again in plot_learning_curves. You can simply use your fitted pipeline to predict values for both the train and validation sets and then plot your learning curve.
Your function should look as follows, without the model.fit() call:
def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, 500 + 1):
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    plt.figure(figsize=(10, 10))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
Then call this function with your fitted model (or fitted pipeline) as the parameter.
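For example, a minimal usage sketch, assuming grid is the fitted GridSearchCV from the question (grid.best_estimator_ is the refitted pipeline):

# `grid` is already fitted; its best_estimator_ is the fitted pipeline,
# so predictions pass through feature selection and the Lasso step.
plot_learning_curves(grid.best_estimator_, X, y)
plt.show()

Passing grid itself would also work, since GridSearchCV delegates predict() to the best estimator.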

Related

Learning-curve for classification in python

To plot the learning curve in a regression problem, we can use RMSE as the evaluation metric, for example:
def plot_learning_curves(model, X_train, y_train, X_val, y_val):
    plt.figure(figsize=(15, 5))
    train_errors, val_errors = [], []
    for m in range(5, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(metrics.mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(metrics.mean_squared_error(y_val, y_val_predict))

    plt.plot(np.sqrt(train_errors), "r-+", linewidth=1, label="training data")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=1, label="validation data")
    plt.legend(loc="upper right", fontsize=10)
    plt.xlabel("Size", fontsize=10)
    plt.ylabel("RMSE", fontsize=10)
    plt.title("Learning Curves")
    plt.show()
However, I would like a learning curve for a classification problem, and I know we should use accuracy as the metric instead of RMSE.
So far, I've only found code snippets that use sklearn's learning_curve, like:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, X, y, ax=None, cv=None, n_jobs=4, train_sizes=np.linspace(.1, 1.0, 5)):
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot learning curve
    if ax is None:
        _, ax = plt.subplots()
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    ax.legend(loc="best")
    return plt
However, as you can see in this solution, we need to pass the full X and y datasets, and it uses cross-validation to split them into train and test sets.
But I've already split my dataset and applied a lot of preprocessing to X_train and X_test.
So I intend to use my existing train and test datasets, just like in the code I wrote for the regression problem based on the RMSE metric (i.e. without using sklearn's learning_curve).
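A minimal sketch of such an accuracy-based curve on an existing train/validation split could follow the pattern of the RMSE version above (the function name plot_learning_curves_clf is illustrative; accuracy_score is sklearn's standard accuracy metric):

from sklearn.metrics import accuracy_score

def plot_learning_curves_clf(model, X_train, y_train, X_val, y_val):
    train_scores, val_scores = [], []
    # start at a size large enough that the first slice contains both classes
    sizes = range(10, len(X_train))
    for m in sizes:
        model.fit(X_train[:m], y_train[:m])
        train_scores.append(accuracy_score(y_train[:m], model.predict(X_train[:m])))
        val_scores.append(accuracy_score(y_val, model.predict(X_val)))

    plt.plot(list(sizes), train_scores, "r-+", linewidth=1, label="training data")
    plt.plot(list(sizes), val_scores, "b-", linewidth=1, label="validation data")
    plt.legend(loc="lower right", fontsize=10)
    plt.xlabel("Size", fontsize=10)
    plt.ylabel("Accuracy", fontsize=10)
    plt.title("Learning Curves")
    plt.show()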

How to draw a best-fit plane for multivariate regression in scikit-learn?

I don't come from a software background, but I am learning regression techniques to predict motor data.
I have 3-D data for which I have used multivariate regression.
The result is fine, but now I want to visualize the best-fit plane for this data.
Below is the code, which I copied and pasted from different sites while trying to visualize my data.
X_final = df3[['Ampere', 'Voltage']]
y_final = df3[['ReactivePower']].copy()  # copy column data into y_final
X_final = X_final.dropna()
y_final = y_final.dropna()

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=0)

lr = LinearRegression().fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# print scores
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))
print('lr train score %.3f, lr test score: %.3f' % (
      lr.score(X_train, y_train),
      lr.score(X_test, y_test)))

# Visualize the data for multiple linear regression
x_surf, y_surf = np.meshgrid(np.linspace(df3.Voltage.min(), df3.Voltage.max()),
                             np.linspace(df3.Ampere.min(), df3.Ampere.max()))
y_train_pred_random = y_train_pred[np.random.choice(y_train_pred.shape[0], 2500, replace=False), :]
y_train_pred_random = np.array(y_train_pred_random)
y_train_pred1 = y_train_pred_random.reshape(x_surf.shape)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df3['Voltage'], df3['Ampere'], df3['ReactivePower'], c='red', marker='o', alpha=0.5)
ax.plot_surface(x_surf, y_surf, y_train_pred1, rstride=1, cstride=1, color='b', alpha=0.3)
ax.set_xlabel('Voltage')
ax.set_ylabel('Ampere')
ax.set_zlabel('Reactive Power')
plt.show()
When I run the visualization code, I get the following graph.
Please help.
I solved it myself with some references online; here is the code:
# Train/test split, multivariate
X_final = df3[['Ampere', 'Voltage']]
y_final = df3[['ReactivePower']].copy()  # copy column data into y_final
X_final = X_final.dropna()
y_final = y_final.dropna()

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=0)

lr = LinearRegression().fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# print scores
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))
print('lr train score %.3f, lr test score: %.3f' % (
      lr.score(X_train, y_train),
      lr.score(X_test, y_test)))

# Visualize the data for multiple linear regression:
# evaluate the fitted plane z = c0*Ampere + c1*Voltage + intercept on a grid
x_surf, y_surf = np.meshgrid(np.linspace(df3.Ampere.min(), df3.Ampere.max()),
                             np.linspace(df3.Voltage.min(), df3.Voltage.max()))
z_surf = lr.coef_[0, 0] * x_surf + lr.coef_[0, 1] * y_surf + lr.intercept_

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df3['Ampere'], df3['Voltage'], df3['ReactivePower'], c='red', marker='o', alpha=0.5)
ax.plot_surface(x_surf, y_surf, z_surf, rstride=1, cstride=1, color='b', alpha=0.3)
ax.set_xlabel('Ampere')
ax.set_ylabel('Voltage')
ax.set_zlabel('Reactive Power')
plt.show()
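As a side note, an equivalent way to obtain the same surface, assuming the same fitted lr, is to let the model predict on the flattened grid instead of multiplying the coefficients by hand (grid_points is an illustrative name):

# Alternative sketch: predict the plane directly from the fitted model.
grid_points = pd.DataFrame(np.c_[x_surf.ravel(), y_surf.ravel()],
                           columns=['Ampere', 'Voltage'])  # same column order as X_final
z_surf = lr.predict(grid_points).reshape(x_surf.shape)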
Here is the plot,
Thanks,

display a plot with linear regression sklearn

I have a dataset where X = ['x', 'y'] are the first two columns and the target is data['class'].
But I don't know how to display a plot of the linear regression in this case, because I get the error "x and y must be the same size".
So how can I plot a linear regression and make predictions with a dataset where X is the first two columns and the target is the last column?
Thanks so much for the help; here is my code below:
data = pd.read_csv('data.csv')
X = data[['x', 'y']]
data['class'] = np.where(data['class']=='P', 1, 0)
Y = data['class']
plt.scatter(X, Y, color='blue')
plt.xlabel('x')
plt.ylabel('y')
plt.plot(X, Y, color='red', linewidth=2)
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
Based on the official documentation:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test) #adding your prediction, this was missing
import matplotlib.pyplot as plt
import numpy as np
# Plot outputs
plt.scatter(X_test, y_test, color='black') #plot scatters
plt.plot(X_test, y_pred, color='red', linewidth=2) #plot line
plt.xticks(())
plt.yticks(())
plt.show()
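Note that this documentation-style plot assumes X has a single feature. With the two columns 'x' and 'y' from the question, a rough sketch would be to plot against one chosen column (the choice of the 'x' column and the sorting step are assumptions here):

# Sketch: plot predictions against a single feature column.
x_col = X_test['x'].to_numpy()
order = np.argsort(x_col)                      # sort so the line is drawn left to right
plt.scatter(x_col, y_test, color='black')
plt.plot(x_col[order], y_pred[order], color='red', linewidth=2)
plt.xlabel('x')
plt.ylabel('class')
plt.show()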

Precision Recall curve with n-fold cross validation showing standard deviation

I want to generate a precision-recall curve with 5-fold cross-validation, showing the standard deviation, as in the example ROC curve code here.
The code below (adapted from How to Plot PR-Curve Over 10 folds of Cross Validation in Scikit-Learn) gives a PR curve for each fold of cross-validation along with the mean PR curve. I also wanted to show the region of one standard deviation above and below the mean PR curve in grey, but it gives the following error (details in the link below the code):
ValueError: operands could not be broadcast together with shapes (91,) (78,)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0,
                  random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []
precisions, recalls = [], []

for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision, recall, _ = precision_recall_curve(ytest, pred_proba[:, 1])
    lab = 'Fold %d AUC=%.4f' % (i + 1, auc(recall, precision))
    plt.plot(recall, precision, alpha=0.3, label=lab)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])
    precisions.append(precision)
    recalls.append(recall)

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)

# The part below is the attempt that fails: the per-fold precision arrays
# have different lengths, and `median`, `recall_overall`, `upper_precision`
# and `lower_precision` are undefined.
std_precision = np.std(precisions, axis=0)
tprs_upper = np.minimum(precisions[median] + std_precision, 1)
tprs_lower = np.maximum(precisions[median] - std_precision, 0)
plt.fill_between(recall_overall, upper_precision, lower_precision, alpha=0.5, linewidth=0, color='grey')
Error reported and Plot generated
Can you please suggest how I can extend the code above to also show one standard deviation around the mean PR curve?
I have a working solution, but it would be helpful if anybody could comment on whether it is doing the right thing.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from numpy import interp

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0,
                  random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []
precision_array = []
threshold_array = []
recall_array = np.linspace(0, 1, 100)

for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision_fold, recall_fold, thresh = precision_recall_curve(ytest, pred_proba[:, 1])
    precision_fold, recall_fold, thresh = precision_fold[::-1], recall_fold[::-1], thresh[::-1]  # reverse order of results
    thresh = np.insert(thresh, 0, 1.0)
    precision_array = interp(recall_array, recall_fold, precision_fold)
    threshold_array = interp(recall_array, recall_fold, thresh)
    pr_auc = auc(recall_array, precision_array)
    lab_fold = 'Fold %d AUC=%.4f' % (i + 1, pr_auc)
    plt.plot(recall_fold, precision_fold, alpha=0.3, label=lab_fold)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)
plt.legend(loc='lower left', fontsize='small')

mean_precision = np.mean(precision_array)
std_precision = np.std(precision_array)
plt.fill_between(recall, precision + std_precision, precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.show()
Although in the right direction, the answer from #user1886130 is not entirely correct, since the variable precision_array is overwritten at each iteration inside the loop. Because precision_recall_curve returns arrays of different lengths on each fold, each fold's precision must be interpolated onto a common recall grid and collected in a list, so that the mean and standard deviation can be taken element-wise across folds.
A cleaner and correct version is:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0, random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []
precision_array = []
recall_array = np.linspace(0, 1, 100)

for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision_fold, recall_fold, _ = precision_recall_curve(ytest, pred_proba[:, 1])
    # reverse order of results: np.interp requires the x-coordinates to be increasing
    precision_fold, recall_fold = precision_fold[::-1], recall_fold[::-1]
    prec_array = np.interp(recall_array, recall_fold, precision_fold)
    pr_auc = auc(recall_array, prec_array)
    precision_array.append(prec_array)
    lab_fold = 'Fold %d AUPR=%.4f' % (i + 1, pr_auc)
    plt.plot(recall_fold, precision_fold, alpha=0.3, label=lab_fold)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUPR=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)
plt.legend(loc='lower left', fontsize='small')

mean_precision = np.mean(precision_array, axis=0)
std_precision = np.std(precision_array, axis=0)
plt.fill_between(recall_array, mean_precision + std_precision, mean_precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.title("PR curves; {} folds".format(k_fold.n_splits), weight="bold", fontsize=15)
plt.xlabel("Recall (Sensitivity)", fontsize=12)
plt.ylabel("Precision (PPV)", fontsize=12)
plt.show()

Plot validation curve for class-weight in sklearn

I would greatly appreciate it if you could let me know how to plot a validation curve for class_weight. I tried the following code to do this task:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, validation_curve, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt

def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()

if __name__ == '__main__':
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1], n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    st = StandardScaler()
    rg = LogisticRegression(class_weight={0: 1, 1: 6.5}, random_state=42, solver='saga', max_iter=100, n_jobs=-1)

    param_grid = {'clf__C': [0.001, 0.01, 0.1],
                  'clf__class_weight': [{0: 1, 1: 6}, {0: 1, 1: 5.5}]}

    pipeline = Pipeline(steps=[('scaler', st),
                               ('clf', rg)])

    cv = StratifiedKFold(n_splits=5, random_state=42)
    rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
    rg_cv.fit(X_train, y_train)

    plt.figure(figsize=(9, 6))
    param_range2 = [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}]
    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__class_weight", param_range=param_range2,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range2, train_scores, test_scores, title="Validation Curve for class_weight", alpha=0.1)
However, this error is reported, which is related to the param_range2 in the last line:
TypeError: float() argument must be a string or a number, not 'dict'
Thanks in advance.
Best regards,
You have to decide what you want to plot on your x-axis. You do a parameter sweep over the different class weights. Since you fixed the weight for the first class to 1 and only modified the weight for class 2, I decided to plot the scores against the weight for class two.
I then sorted the weights in the plot function in ascending order so that you get nicely connected lines.
def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    param_range = [x[1] for x in param_range]
    sort_idx = np.argsort(param_range)
    param_range = np.array(param_range)[sort_idx]
    train_mean = np.mean(train_scores, axis=1)[sort_idx]
    train_std = np.std(train_scores, axis=1)[sort_idx]
    test_mean = np.mean(test_scores, axis=1)[sort_idx]
    test_std = np.std(test_scores, axis=1)[sort_idx]
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Weight of class 2')
    plt.ylabel('Average values and standard deviation for F1-Score')
    plt.legend(loc='best')
    plt.show()
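You can then call it exactly as in the question:

plot_validation_curve(param_range2, train_scores, test_scores, title="Validation Curve for class_weight", alpha=0.1)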
This results in the following plot
