I would greatly appreciate it if you could let me know how to plot a validation curve for class_weight. In fact, I tried the following code to do this task:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, validation_curve, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()
if __name__ == '__main__':
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1], n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    st = StandardScaler()
    rg = LogisticRegression(class_weight={0: 1, 1: 6.5}, random_state=42, solver='saga', max_iter=100, n_jobs=-1)
    param_grid = {'clf__C': [0.001, 0.01, 0.1],
                  'clf__class_weight': [{0: 1, 1: 6}, {0: 1, 1: 5.5}]}
    pipeline = Pipeline(steps=[('scaler', st),
                               ('clf', rg)])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # shuffle=True is required when random_state is set
    rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
    rg_cv.fit(X_train, y_train)
    plt.figure(figsize=(9, 6))
    param_range2 = [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}]
    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__class_weight", param_range=param_range2,
        cv=cv, scoring="f1", n_jobs=-1)
    plot_validation_curve(param_range2, train_scores, test_scores, title="Validation Curve for class_weight", alpha=0.1)
However, this error is reported, which is related to param_range2 in the last line:
TypeError: float() argument must be a string or a number, not 'dict'
Thanks in advance.
Best regards,
You have to decide what you want to plot on your x-axis. You are doing a parameter sweep over the different class weights. Since you fixed the weight for the first class at 1 and only varied the weight for the second class, I decided to plot the scores against the weight of the second class.
I then sorted the weights in the plot function in ascending order so that you get nicely connected lines.
def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    param_range = [x[1] for x in param_range]  # extract the weight of the second class from each dict
    sort_idx = np.argsort(param_range)
    param_range = np.array(param_range)[sort_idx]
    train_mean = np.mean(train_scores, axis=1)[sort_idx]
    train_std = np.std(train_scores, axis=1)[sort_idx]
    test_mean = np.mean(test_scores, axis=1)[sort_idx]
    test_std = np.std(test_scores, axis=1)[sort_idx]
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Weight of class 2')
    plt.ylabel('Average values and standard deviation for F1-Score')
    plt.legend(loc='best')
    plt.show()
This results in the following plot.
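If you prefer the x-values to be numeric from the start, you can also build the weight dicts from a numeric range; the sorting then comes for free. A minimal sketch of that idea, reusing rg_cv, X_train, y_train, and cv from the question (the weight range here is just an illustrative choice):

weights = np.arange(4.0, 7.0, 0.5)  # numeric weights for the second class
param_range2 = [{0: 1, 1: w} for w in weights]  # dicts, as validation_curve expects
train_scores, test_scores = validation_curve(
    estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
    param_name="clf__class_weight", param_range=param_range2,
    cv=cv, scoring="f1", n_jobs=-1)
# the plot function above extracts the numeric weight from each dict before plotting
plot_validation_curve(param_range2, train_scores, test_scores,
                      title="Validation Curve for class_weight")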
Related
To plot the learning curve in a regression problem, we should use RMSE as the evaluation metric, like:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

def plot_learning_curves(model, X_train, y_train, X_val, y_val):
    plt.figure(figsize=(15, 5))
    train_errors, val_errors = [], []
    for m in range(5, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(metrics.mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(metrics.mean_squared_error(y_val, y_val_predict))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=1, label="training data")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=1, label="validation data")
    plt.legend(loc="upper right", fontsize=10)
    plt.xlabel("Size", fontsize=10)
    plt.ylabel("RMSE", fontsize=10)
    plt.title("Learning Curves")
    plt.show()
However, I would like to have a learning curve for a classification problem, and I know that we should use accuracy as the metric instead of RMSE.
So far, I've only found code snippets for this problem that use the learning_curve function in sklearn, like:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, X, y, ax=None, cv=None, n_jobs=4, train_sizes=np.linspace(.1, 1.0, 5)):
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Plot learning curve
    if ax is None:
        ax = plt.gca()  # fall back to the current axes if none is supplied
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    ax.legend(loc="best")
    return plt
However, as you can see in this solution, we need to pass the X and y datasets, and it uses cross-validation to split them into train and test sets.
But I've already split my dataset and applied a lot of preprocessing to X_train and X_test.
So, I intend to use my train and test datasets just like in the code I wrote for the regression problem, which is based on the RMSE metric (without using sklearn's learning_curve).
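A minimal sketch of that idea, mirroring the RMSE loop above but scoring with accuracy (it assumes pre-split X_train/X_val arrays and any classifier exposing fit/predict):

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

def plot_learning_curves_clf(model, X_train, y_train, X_val, y_val):
    # same incremental-fit loop as the RMSE version, but scoring with accuracy
    plt.figure(figsize=(15, 5))
    train_scores, val_scores = [], []
    for m in range(5, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        train_scores.append(accuracy_score(y_train[:m], model.predict(X_train[:m])))
        val_scores.append(accuracy_score(y_val, model.predict(X_val)))
    plt.plot(train_scores, "r-+", linewidth=1, label="training data")
    plt.plot(val_scores, "b-", linewidth=1, label="validation data")
    plt.legend(loc="lower right", fontsize=10)
    plt.xlabel("Size", fontsize=10)
    plt.ylabel("Accuracy", fontsize=10)
    plt.title("Learning Curves")
    plt.show()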
My goal is to produce precision-recall curves comparing Logistic Regression and Random Forest, plotting them in one graph. I wanted to know whether I used the right steps to create a plot that compares both classifiers.
I appreciate all the help!
Code:
from sklearn.preprocessing import MultiLabelBinarizer as mlb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib import pyplot
import matplotlib.pyplot as plt  # plt is used alongside pyplot below
X = df[["DIAGNOSIS_CD_Dummy"]]
y = df[["TEST_RESULT_Dummy"]]
# X = pd.DataFrame(df.iloc[:, -1])
# y = pd.DataFrame(df.iloc[:, :-1])
# raw confusion matrix
df = pd.DataFrame(df, columns=["DIAGNOSIS_CD_Dummy", "TEST_RESULT_Dummy"])
raw_confusion_matrix = pd.crosstab(
    df["TEST_RESULT_Dummy"],
    df["DIAGNOSIS_CD_Dummy"],
    rownames=["Test Result"],
    colnames=["Diagnosis"],
)  # renamed so it does not shadow sklearn's confusion_matrix function
print(raw_confusion_matrix)
# Logistic Regression Confusion Matrix
# split into training and test using scikit-learn
X_train, X_test, y_train, y_test = train_test_split(
    X, y.values.ravel(), test_size=0.3, random_state=1, stratify=y
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
# use logistic regression model to make predictions
y_score = log_model.predict_proba(X_test)[:, 1]
y_pred = log_model.predict(X_test)
y_pred = np.round(y_pred)
cm = confusion_matrix(y_test, y_pred)
print("\n")
print(cm)
print("\n")
print(classification_report(y_test, y_pred, zero_division=0))
# calculate precision and recall
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
# create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color="purple")
# add axis labels to plot
ax.set_title("Precision-Recall Curve")
ax.set_ylabel("Precision")
ax.set_xlabel("Recall")
# display plot
plt.show()
# precision-recall curve
# generate 2 class dataset
X = df[["DIAGNOSIS_CD_Dummy"]]
y = df[["TEST_RESULT_Dummy"]]
# X = pd.DataFrame(df.iloc[:, :-1])
# y = pd.DataFrame(df.iloc[:, -1])
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(
    X, y.values.ravel(), test_size=0.3, random_state=2
)
# fit a model
model = LogisticRegression(solver="lbfgs")
model.fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
# probs_rf = model_rf.predict_proba(testX)[:, 1]
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = model.predict(testX)
lr_precision, lr_recall, _ = precision_recall_curve(testy, lr_probs)
lr_f1, lr_auc = f1_score(testy, yhat), auc(lr_recall, lr_precision)
# precision_rf, recall_rf, _ = precision_recall_curve(testy, probs_rf)
# f1_rf, auc_rf = f1_score(testy, yhat), auc(recall_rf, precision_rf)
# auc_rf = auc(recall_rf, precision_rf)
# summarize scores
print("Logistic: f1=%.3f auc=%.3f" % (lr_f1, lr_auc))
# plot the precision-recall curves
no_skill = len(testy[testy == 1]) / len(testy)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.plot(lr_recall, lr_precision, marker=".", label="Logistic")
plt.plot(lr_recall, lr_precision, label=f"AUC (Logistic Regression) = {lr_auc:.2f}")  # recall on the x-axis, precision on the y-axis
# axis labels
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
# Random Forest
model_rf = RandomForestClassifier()
model_rf.fit(trainX, trainy)
# model_rf = RandomForestClassifier().fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
probs_rf = model_rf.predict_proba(testX)
# keep probabilities for the positive outcome only
probs_rf = probs_rf[:, 1]
# predict class values
yhat = model_rf.predict(testX)  # use the random forest model here, not the logistic one
precision_rf, recall_rf, _ = precision_recall_curve(testy, probs_rf)
f1_rf, auc_rf = f1_score(testy, yhat), auc(recall_rf, precision_rf)
auc_rf = auc(recall_rf, precision_rf)
print("Random Forest: f1=%.3f auc=%.3f" % (f1_rf, auc_rf))
# plot the precision-recall curves
no_skill = len(testy[testy == 1]) / len(testy)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.plot(lr_recall, lr_precision, marker=".", label="Logistic")  # logistic curve shown for comparison
plt.plot(recall_rf, precision_rf, label=f"AUC (Random Forests) = {auc_rf:.2f}")
# axis labels
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Output:
Diagnosis 0 1
Test Result
0 18385 32
1 1268 165
[[5514 11]
[ 374 56]]
precision recall f1-score support
0 0.94 1.00 0.97 5525
1 0.84 0.13 0.23 430
accuracy 0.94 5955
macro avg 0.89 0.56 0.60 5955
weighted avg 0.93 0.94 0.91 5955
Logistic: f1=0.193 auc=0.488
Random Forest: f1=0.193 auc=0.488
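For the stated goal of comparing both classifiers in one graph, a minimal sketch that overlays the two PR curves on a single axes might look like this (it reuses testX, testy, and the fitted model and model_rf from the code above):

from sklearn.metrics import precision_recall_curve, auc
from matplotlib import pyplot

for name, clf in [("Logistic", model), ("Random Forest", model_rf)]:
    probs = clf.predict_proba(testX)[:, 1]  # positive-class probabilities
    prec, rec, _ = precision_recall_curve(testy, probs)
    pyplot.plot(rec, prec, label="%s AUC=%.3f" % (name, auc(rec, prec)))
no_skill = len(testy[testy == 1]) / len(testy)  # baseline: positive-class rate
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
pyplot.legend()
pyplot.show()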
This is my attempt to plot it.
import pathlib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from itertools import cycle
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize
def __plot_binary_precision_recall_curve(X_test, y_test, *args, **kwargs):
    """
    Private function to be used by plot_precision_recall_curve for binary applications.
    """
    if 'fig_size' in kwargs and 'dpi' in kwargs:
        fig, ax = plt.subplots(figsize=kwargs['fig_size'], dpi=kwargs['dpi'])
    else:
        fig, ax = plt.subplots()
    plt.rcParams["figure.facecolor"] = 'white'
    plt.rcParams["axes.facecolor"] = 'white'
    plt.rcParams["savefig.facecolor"] = 'white'
    ax.xaxis.set_major_locator(MultipleLocator(0.1))
    ax.xaxis.set_major_formatter('{x:.1f}')
    ax.yaxis.set_major_locator(MultipleLocator(0.1))
    ax.yaxis.set_major_formatter('{x:.1f}')
    ax.xaxis.set_minor_locator(MultipleLocator(0.05))
    ax.yaxis.set_minor_locator(MultipleLocator(0.05))
    ax.tick_params(which='both', width=2)
    ax.tick_params(which='major', length=7)
    ax.tick_params(which='minor', length=4, color='black')
    plt.grid(True, zorder=0)
    # random prediction curve
    plt.plot([0, 1], [1, 0], linestyle='--', lw=1, color='k', label='Luck', alpha=.8, zorder=1)
    # perfect model prediction curve: precision = 1 everywhere, with a vertical drop at recall = 1
    plt.plot([0, 1], [1, 1], c='k', linestyle='dashdot', zorder=2, label="Perfect model")
    plt.plot([1, 1], [1, 0], c='k', linestyle='dashdot', zorder=2)
    # iso-f1 curves: all (recall, precision) points that share the same F1 score
    f_scores = np.linspace(0.2, 0.8, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(0.01, 1)
        y = f_score * x / (2 * x - f_score)
        (l,) = plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
        plt.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))
    zorder = 3
    for classifier in args:
        display = PrecisionRecallDisplay.from_estimator(classifier, X_test, y_test, ax=ax, zorder=zorder)
        zorder += 1
    # add the legend for the iso-f1 curves
    handles, labels = display.ax_.get_legend_handles_labels()
    handles.extend([l])
    labels.extend(["iso-f1 curves"])
    # set the legend and the axes
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.legend(handles=handles, labels=labels, loc="best")
    plt.xlabel('Recall', fontsize=18)
    plt.ylabel('Precision', fontsize=18)
    if 'title' in kwargs:
        ax.set_title(kwargs['title'], fontsize=18)
    else:
        ax.set_title("Precision-Recall Curve", fontsize=18)
    if 'save_fig_path' in kwargs:
        path = pathlib.Path(kwargs['save_fig_path'])
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(kwargs['save_fig_path'], dpi=kwargs['dpi'], facecolor=fig.get_facecolor(), edgecolor='none')
    return fig, ax
def __plot_multiclass_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs):
    """
    Private function designed to be used by plot_precision_recall_curve for multiclass applications.
    """
    my_vals = y_test.unique().tolist()
    my_vals.sort()
    # binarize the y_test series
    y_test = label_binarize(y_test, classes=my_vals)
    n_classes = y_test.shape[1]
    # setup plot details
    colors = cycle(["navy", "turquoise", "darkorange", "cornflowerblue", "teal"])
    # note: plt.subplots(n) returns an array of Axes only for n >= 2,
    # so this function assumes at least two estimators are passed in
    if 'fig_size' in kwargs and 'dpi' in kwargs:
        fig, ax = plt.subplots(len(args), figsize=kwargs['fig_size'], dpi=kwargs['dpi'], facecolor='white')
    else:
        fig, ax = plt.subplots(len(args), facecolor='white')
    for count, clfs in enumerate(args):
        ax[count].xaxis.set_major_locator(MultipleLocator(0.1))
        ax[count].xaxis.set_major_formatter('{x:.1f}')
        ax[count].yaxis.set_major_locator(MultipleLocator(0.1))
        ax[count].yaxis.set_major_formatter('{x:.1f}')
        ax[count].xaxis.set_minor_locator(MultipleLocator(0.05))
        ax[count].yaxis.set_minor_locator(MultipleLocator(0.05))
        ax[count].tick_params(which='both', width=2)
        ax[count].tick_params(which='major', length=7)
        ax[count].tick_params(which='minor', length=4, color='black')
        ax[count].grid(True, zorder=0)
        # random prediction curve
        ax[count].plot([0, 1], [1, 0], linestyle='--', lw=1, color='k', label='Luck', alpha=.8, zorder=1)
        # perfect model prediction curve
        ax[count].plot([0, 1], [1, 1], c='k', linestyle='dashdot', zorder=2, label="Perfect model")
        ax[count].plot([1, 1], [1, 0], c='k', linestyle='dashdot', zorder=2)
        # set up the model, wrapped by the OneVsRestClassifier
        classifier = OneVsRestClassifier(clfs)
        classifier.fit(X_train, y_train)  # train the model
        # produce the predictions (as probabilities)
        y_score = classifier.predict_proba(X_test)
        # For each class
        precision = dict()
        recall = dict()
        average_precision = dict()
        for i in range(n_classes):
            precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])
            average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])
        # A "micro-average": quantifying score on all classes jointly
        precision["micro"], recall["micro"], _ = precision_recall_curve(
            y_test.ravel(), y_score.ravel()
        )
        average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")
        # iso-f1 curves
        f_scores = np.linspace(0.2, 0.8, num=4)
        lines, labels = [], []
        for f_score in f_scores:
            x = np.linspace(0.01, 1)
            y = f_score * x / (2 * x - f_score)
            (l,) = ax[count].plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
            ax[count].annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))
        display = PrecisionRecallDisplay(
            recall=recall["micro"],
            precision=precision["micro"],
            average_precision=average_precision["micro"],
        )
        display.plot(ax=ax[count], name="Micro-average precision-recall", color="gold")
        for i, color in zip(range(n_classes), colors):
            display = PrecisionRecallDisplay(
                recall=recall[i],
                precision=precision[i],
                average_precision=average_precision[i],
            )
            display.plot(ax=ax[count], name=f"Precision-recall for class {i}", color=color)
        # add the legend for the iso-f1 curves
        handles, labels = display.ax_.get_legend_handles_labels()
        handles.extend([l])
        labels.extend(["iso-f1 curves"])
        # set the legend and the axes
        ax[count].set_xlim([0.0, 1.0])
        ax[count].set_ylim([0.0, 1.05])
        ax[count].legend(handles=handles, labels=labels, loc="best")
        if type(clfs) == Pipeline:
            estimator_name = str(type(clfs['clf'])).split(".")[-1][:-2]
        else:
            estimator_name = str(type(clfs)).split(".")[-1][:-2]
        if 'title' in kwargs:
            ax[count].set_title(kwargs['title'] + " - " + estimator_name, fontsize=18)
        else:
            ax[count].set_title("Precision-Recall Curve" + " - " + estimator_name, fontsize=18)
        ax[count].set_xlabel('Recall', fontsize=18)
        ax[count].set_ylabel('Precision', fontsize=18)
    if 'save_fig_path' in kwargs:
        path = pathlib.Path(kwargs['save_fig_path'])
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(kwargs['save_fig_path'], dpi=kwargs['dpi'], facecolor=fig.get_facecolor(), edgecolor='none')
    return fig, ax
def plot_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs):
    """
    Plots precision-recall curves for the given models.

    Parameters
    ----------
    X_train : pandas.DataFrame of shape (n_samples, n_features)
        Training values (used in the multiclass case to fit the OneVsRestClassifier).
    y_train : pandas.Series of shape (n_samples,)
        Training target values.
    X_test : pandas.DataFrame of shape (n_samples, n_features)
        Test values.
    y_test : pandas.Series of shape (n_samples,)
        Target values.
    *args : estimators to plot precision and recall curves.
        Estimator instances (sklearn.Pipeline, imblearn.Pipeline or a classifier):
        a PRE-FITTED classifier or a PRE-FITTED Pipeline in which the last estimator is a classifier.
    **kwargs : the following options are available with kwargs
        fig_size : tuple
            Size (inches) of the plot.
        dpi : int, default = 100
            Image DPI.
        title : str
            The title of the plot.
        save_fig_path : str
            Full path where to save the plot. Will generate the folders if they don't exist already.

    Returns
    -------
    fig : matplotlib.pyplot.Figure
        Figure from matplotlib.
    ax : matplotlib.pyplot.Axes
        Axes object from matplotlib.

    Example Syntax
    --------------
    fig, ax = reporting.plot_precision_recall_curve(X_train, y_train, X_test, y_test,
                                                    rf_pipe, catboost_classifier,
                                                    fig_size=(10,16), dpi=100,
                                                    title="Precision-Recall Curve",
                                                    save_fig_path="dir1/dir2/precision_recall_curve.png")
    """
    if len(y_test.unique()) == 2:
        fig, ax = __plot_binary_precision_recall_curve(X_test, y_test, *args, **kwargs)
    else:
        fig, ax = __plot_multiclass_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs)
    return fig, ax
Syntax and Output for Binary Classification
fig, ax = plot_precision_recall_curve(X_train, y_train, X_test, y_test,
                                      rf_pipe, xgboost_classifier,
                                      fig_size=(10,8), dpi=100,
                                      title="Precision-Recall Curve",
                                      save_fig_path="dir1/dir2/precision_recall_curve.png")
Syntax and Output for Multi-class Classification
fig, ax = plot_precision_recall_curve(X_train, y_train, X_test, y_test,
                                      rf_pipe, catboost_classifier,
                                      fig_size=(10,16), dpi=100,
                                      title="Precision-Recall Curve",
                                      save_fig_path="dir1/dir2/precision_recall_curve.png")
I have a problem. I want to plot my RMSE values. However, I now use a pipeline, because I use cross-validation along with other steps such as feature selection.
My question is: is there a way to get this plot through the pipeline (without training the model a second time)? How can I display the training and validation RMSE values nicely in a diagram from the pipeline?
Pipeline
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

dfListingsFeature_regression = pd.read_csv(r"https://raw.githubusercontent.com/Coderanker3/dataset4/main/listings_cleaned.csv")
d = {True: 1, False: 0, np.nan: np.nan}
dfListingsFeature_regression['host_is_superhost'] = dfListingsFeature_regression[
    'host_is_superhost'].map(d).astype('int')
X = dfListingsFeature_regression.drop(columns=['host_id', 'id', 'price'])  # Features
y = dfListingsFeature_regression['price']  # Target variable
print(dfListingsFeature_regression.shape)
steps = [('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=10000))),
         ('lasso', Lasso(alpha=0.4))]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
parameteres = {}
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print("score = %3.2f" % (grid.score(X_test, y_test)))
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))
y_pred = grid.predict(X_test)
print("RMSE Val:", metrics.mean_squared_error(y_test, y_pred, squared=False))
y_train_predict = grid.predict(X_train)
print("Train:", metrics.mean_squared_error(y_train, y_train_predict, squared=False))
r2 = metrics.r2_score(y_test, y_pred)
print(r2)
Plot
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, 500 + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize=(10, 10))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
%%time
lin_reg = Lasso(alpha=0.1)
plot_learning_curves(lin_reg, X, y)
#plt.axis([0, 80, 0, 3])
plt.show()
You don't have to fit() your model again in plot_learning_curves. You can simply use your fitted pipeline to predict values for both the train and validation sets and then plot your learning curve.
Your function should look as follows, without the model.fit():
def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, 500 + 1):
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize=(10, 10))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
Then you should call this function using your fitted model as the parameter.
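For example, a minimal sketch reusing the already-fitted grid search from the pipeline code above (so nothing is trained a second time):

# grid was fitted once with grid.fit(X_train, y_train) above
plot_learning_curves(grid.best_estimator_, X, y)
plt.show()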
I have a dataset where X = ['x', 'y'] are the first two columns, and the target is data['class'].
But I don't know how to display a plot of linear regression in this case, because I get the error "x and y must be the same size".
So how can I plot a linear regression and predict with this dataset? Or should I take X as the first two columns of my dataset and the last column as the target?
Thanks so much for the help; here is my code below:
data = pd.read_csv('data.csv')
X = data[['x', 'y']]
data['class'] = np.where(data['class']=='P', 1, 0)
Y = data['class']
plt.scatter(X, Y, color='blue')
plt.xlabel('x')
plt.ylabel('y')
plt.plot(X, Y, color='red', linewidth=2)
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
Based on the official documentation:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)  # adding your prediction, this was missing
import matplotlib.pyplot as plt
import numpy as np
# Plot outputs (note: this kind of 2-D plot only works when X has a single feature column)
plt.scatter(X_test, y_test, color='black')  # plot scatters
plt.plot(X_test, y_pred, color='red', linewidth=2)  # plot line
plt.xticks(())
plt.yticks(())
plt.show()
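If your X really has two columns ('x' and 'y'), that is exactly what triggers "x and y must be the same size": a 2-D scatter needs one array per axis. A minimal sketch, assuming you want to regress the class on a single feature (the column names follow the question's data.csv):

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data = pd.read_csv('data.csv')
data['class'] = np.where(data['class'] == 'P', 1, 0)
X = data[['x']]  # a single feature column keeps the plot 2-D
Y = data['class']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='red', linewidth=2)
plt.xlabel('x')
plt.ylabel('class')
plt.show()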
I want to generate a Precision-Recall curve with 5-fold cross-validation showing the standard deviation, as in the example ROC curve code here.
The code below (adapted from How to Plot PR-Curve Over 10 folds of Cross Validation in Scikit-Learn) gives a PR curve for each fold of cross-validation along with the mean PR curve. I wanted to also show the region of one standard deviation above and below the mean PR curve in grey. But it gives the following error (details in the link below the code):
ValueError: operands could not be broadcast together with shapes (91,) (78,)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0,
                  random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)
y_real = []
y_proba = []
precisions, recalls = [], []
for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision, recall, _ = precision_recall_curve(ytest, pred_proba[:, 1])
    lab = 'Fold %d AUC=%.4f' % (i + 1, auc(recall, precision))
    plt.plot(recall, precision, alpha=0.3, label=lab)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])
    precisions.append(precision)
    recalls.append(recall)
y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)
std_precision = np.std(precisions, axis=0)
tprs_upper = np.minimum(precisions[median] + std_precision, 1)
tprs_lower = np.maximum(precisions[median] - std_precision, 0)
plt.fill_between(recall_overall, upper_precision, lower_precision, alpha=0.5, linewidth=0, color='grey')
Error reported and Plot generated
Can you please suggest how I add to the following code to also show one standard deviation around the mean PR curve?
I have got a working solution, but it would be helpful if anybody could comment on whether it is doing the right thing.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from numpy import interp

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0,
                  random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)
y_real = []
y_proba = []
precision_array = []
threshold_array = []
recall_array = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision_fold, recall_fold, thresh = precision_recall_curve(ytest, pred_proba[:, 1])
    precision_fold, recall_fold, thresh = precision_fold[::-1], recall_fold[::-1], thresh[::-1]  # reverse order of results
    thresh = np.insert(thresh, 0, 1.0)
    precision_array = interp(recall_array, recall_fold, precision_fold)
    threshold_array = interp(recall_array, recall_fold, thresh)
    pr_auc = auc(recall_array, precision_array)
    lab_fold = 'Fold %d AUC=%.4f' % (i + 1, pr_auc)
    plt.plot(recall_fold, precision_fold, alpha=0.3, label=lab_fold)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])
y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)
plt.legend(loc='lower left', fontsize='small')
mean_precision = np.mean(precision_array)
std_precision = np.std(precision_array)
plt.fill_between(recall, precision + std_precision, precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.show()
Although in the right direction, the answer from @user1886130 is not entirely correct, since the variable precision_array is overwritten at each iteration inside the loop.
A cleaner and correct version is:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0, random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)
y_real = []
y_proba = []
precision_array = []
recall_array = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision_fold, recall_fold, _ = precision_recall_curve(ytest, pred_proba[:, 1])
    precision_fold, recall_fold = precision_fold[::-1], recall_fold[::-1]  # reverse order of results
    prec_array = np.interp(recall_array, recall_fold, precision_fold)
    pr_auc = auc(recall_array, prec_array)
    precision_array.append(prec_array)
    lab_fold = 'Fold %d AUPR=%.4f' % (i + 1, pr_auc)
    plt.plot(recall_fold, precision_fold, alpha=0.3, label=lab_fold)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])
y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUPR=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)
plt.legend(loc='lower left', fontsize='small')
mean_precision = np.mean(precision_array, axis=0)
std_precision = np.std(precision_array, axis=0)
plt.fill_between(recall_array, mean_precision + std_precision, mean_precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.title("PR curves; {} folds".format(k_fold.n_splits), weight="bold", fontsize=15)
plt.xlabel("Recall (Sensitivity)", fontsize=12)
plt.ylabel("Precision (PPV)", fontsize=12)
plt.show()