Precision Recall curve with n-fold cross validation showing standard deviation - python

I want to generate a Precision-Recall curve with 5-fold cross-validation showing standard deviation as in the example ROC curve code here.
The code below (adapted from How to Plot PR-Curve Over 10 folds of Cross Validation in Scikit-Learn) gives a PR curve for each fold of cross-validation along with the mean PR curve. I also wanted to show the region of one standard deviation above and below the mean PR curve in grey, but it gives the following error (details in the link below the code):
ValueError: operands could not be broadcast together with shapes (91,) (78,)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0,
                  random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []
precisions, recalls = [], []
for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision, recall, _ = precision_recall_curve(ytest, pred_proba[:, 1])
    lab = 'Fold %d AUC=%.4f' % (i+1, auc(recall, precision))
    plt.plot(recall, precision, alpha=0.3, label=lab)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])
    precisions.append(precision)
    recalls.append(recall)

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)

std_precision = np.std(precisions, axis=0)  # per-fold precision arrays have different lengths, which triggers the broadcast ValueError
tprs_upper = np.minimum(precisions[median] + std_precision, 1)
tprs_lower = np.maximum(precisions[median] - std_precision, 0)
plt.fill_between(recall_overall, upper_precision, lower_precision, alpha=0.5, linewidth=0, color='grey')
Error reported and Plot generated
Can you please suggest how I can add to the code above so that it also shows one standard deviation around the mean PR curve?

I have got a working solution, but it would be helpful if anybody could comment on whether it is doing the right thing.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from numpy import interp

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0,
                  random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []
precision_array = []
threshold_array = []
recall_array = np.linspace(0, 1, 100)

for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision_fold, recall_fold, thresh = precision_recall_curve(ytest, pred_proba[:, 1])
    precision_fold, recall_fold, thresh = precision_fold[::-1], recall_fold[::-1], thresh[::-1]  # reverse order of results
    thresh = np.insert(thresh, 0, 1.0)
    precision_array = interp(recall_array, recall_fold, precision_fold)
    threshold_array = interp(recall_array, recall_fold, thresh)
    pr_auc = auc(recall_array, precision_array)
    lab_fold = 'Fold %d AUC=%.4f' % (i+1, pr_auc)
    plt.plot(recall_fold, precision_fold, alpha=0.3, label=lab_fold)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)
plt.legend(loc='lower left', fontsize='small')

mean_precision = np.mean(precision_array)
std_precision = np.std(precision_array)
plt.fill_between(recall, precision + std_precision, precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.show()

Although in the right direction, the answer from #user1886130 is not entirely correct since the variable precision_array is overwritten at each iteration inside the loop.
A cleaner and correct version is:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0, random_state=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []
precision_array = []
recall_array = np.linspace(0, 1, 100)

for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision_fold, recall_fold, _ = precision_recall_curve(ytest, pred_proba[:, 1])
    precision_fold, recall_fold = precision_fold[::-1], recall_fold[::-1]  # reverse order of results
    prec_array = np.interp(recall_array, recall_fold, precision_fold)
    pr_auc = auc(recall_array, prec_array)
    precision_array.append(prec_array)
    lab_fold = 'Fold %d AUPR=%.4f' % (i+1, pr_auc)
    plt.plot(recall_fold, precision_fold, alpha=0.3, label=lab_fold)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUPR=%.4f' % (auc(recall, precision))
plt.plot(recall, precision, lw=2, color='red', label=lab)
plt.legend(loc='lower left', fontsize='small')

mean_precision = np.mean(precision_array, axis=0)
std_precision = np.std(precision_array, axis=0)
plt.fill_between(recall_array, mean_precision + std_precision, mean_precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.title("PR curves; {} folds".format(k_fold.n_splits), weight="bold", fontsize=15)
plt.xlabel("Recall (Sensitivity)", fontsize=12)
plt.ylabel("Precision (PPV)", fontsize=12)
plt.show()

Related

How to solve ValueError: x and y must be the same size issue on Python?

I'm trying to do a linear regression, but I keep running into the same problem: "ValueError: x and y must be the same size". I'm very confused and have been on every website there is trying to fix it. If anyone knows what's wrong, that would be a massive help.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
# load dataset
df = pd.read_csv('Real_estate.csv')
X = df[['transaction date', 'house age', 'distance to the nearest MRT station', 'number of convenience stores', 'latitude', 'longitude']]
y = df['house price of unit area']
x = df.iloc[:, 0:-7].values
y = df.iloc[:, 1:].values
x, y = np.array(x), np.array(y)
model = LinearRegression()
model.fit(x, y)
model = LinearRegression().fit(x, y)
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size = 0.4)
sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)
regr = linear_model.LinearRegression()
regr.fit(x_train_std, y_train)
y_pred = regr.predict(x_test)
r_sq = model.score(x, y)
print("Intercept: ", regr.intercept_)
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
##Model evaluation
print("Mean absolute error: %.2f" % mean_absolute_error(y_test,y_pred))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
y_pred = model.predict(x)
print('predicted response:', y_pred, sep='\n')
plt.scatter(x_test,y_test, color="black")
plt.plot(x_test, y_pred, color="blue", linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
This is my code, but I don't understand what's going wrong. I'm trying to use 7 columns, including the y value. I'm a beginner to Python, so I apologize if this is a very silly question. Thank you.
plt.plot(x_test, y_pred, color="blue", linewidth=3)
Both arguments need to have the same shape, but y_pred is the prediction over the entire x rather than over x_test.
change
y_pred = model.predict(x)
to
y_pred = model.predict(x_test)

Representation of a training and validation metric in a pipeline

I have a problem. I want to plot my RMSE value. However, I now use a pipeline because I use cross-validation and also use other steps like feature selection.
My question is: is there a way to get this plot from the pipeline, without training the model a second time? In other words, how can I display the training and validation RMSE values nicely in a diagram when using the pipeline?
Pipeline
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

dfListingsFeature_regression = pd.read_csv(r"https://raw.githubusercontent.com/Coderanker3/dataset4/main/listings_cleaned.csv")
d = {True: 1, False: 0, np.nan: np.nan}
dfListingsFeature_regression['host_is_superhost'] = dfListingsFeature_regression[
    'host_is_superhost'].map(d).astype('int')

X = dfListingsFeature_regression.drop(columns=['host_id', 'id', 'price'])  # Features
y = dfListingsFeature_regression['price']                                  # Target variable
print(dfListingsFeature_regression.shape)

steps = [('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=10000))),
         ('lasso', Lasso(alpha=0.4))]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
parameteres = {}
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)

print("score = %3.2f" % (grid.score(X_test, y_test)))
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))

y_pred = grid.predict(X_test)
print("RMSE Val:", metrics.mean_squared_error(y_test, y_pred, squared=False))
y_train_predict = grid.predict(X_train)
print("Train:", metrics.mean_squared_error(y_train, y_train_predict, squared=False))

r2 = metrics.r2_score(y_test, y_pred)
print(r2)
Plot
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, 500 + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize=(10, 10))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)

%%time
lin_reg = Lasso(alpha=0.1)
plot_learning_curves(lin_reg, X, y)
#plt.axis([0, 80, 0, 3])
plt.show()
You don't have to fit() your model again in plot_learning_curves. You can simply use your fitted pipeline to predict values for both the training and validation sets and then plot your learning curve.
Your function should look as follows, without the model.fit():
def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, 500 + 1):
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize=(10, 10))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
Then call this function with your fitted model as the parameter.
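For example, with the grid search from the question already fitted (a minimal sketch; grid.best_estimator_ is the refit pipeline returned by GridSearchCV):
plot_learning_curves(grid.best_estimator_, X, y)  # reuses the fitted pipeline, no refitting inside
plt.show()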

Shap plot crops/truncates the feature names

import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot
import shap
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
df1=pd.read_csv("./wine.data",sep=",",encoding='utf_8_sig')
X_train = df1
le = preprocessing.LabelEncoder()
X_train['alc_class'] = le.fit_transform(X_train.alc_class.values)
print(X_train.columns)
print(X_train.describe())
y = X_train['alc_class']
X = X_train.drop(columns='alc_class')
import xgboost as xgb
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 2100, stratify = y)
# import XGBClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)
xgb_param_grid = {
    'colsample_bytree': np.linspace(0.5, 0.9, 2),
    'n_estimators': [30],
    'max_depth': [5],
    'learning_rate': [0.01],
    'alpha': [10],
    'objective': ['binary:logistic'],
    'tree_method': ['hist'],
    'min_child_weight': [1],
    'gamma': [0.5],
    'subsample': [0.6],
}
# instantiate the classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="auc")
# perform 5 fold cross-validation using mean square error as a scoring method
grid_mse = GridSearchCV(estimator = xgb_clf, param_grid = xgb_param_grid, scoring = 'neg_mean_squared_error', cv = 5, verbose = 1)
# Fit grid_mse to the data, get best parameters and best score (lowest RMSE)
grid_mse.fit(X_train, y_train)
print("Best parameters found: ",grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
#Predict using the test data
y_pred = grid_mse.predict(X_test)
y_pred_prob = grid_mse.predict_proba(X_test)
print("Root mean square error for test dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)))
from sklearn.metrics import accuracy_score, roc_curve, auc,recall_score,precision_score, precision_recall_curve,f1_score, classification_report, confusion_matrix,roc_auc_score
print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
print('XGBoost model F1 score: {0:0.4f}'. format(f1_score(y_test, y_pred, average='weighted')))
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
area = auc(recall, precision)
print("----------------")
print("\n\n Evaluation Metrics \n\n")
aucroc_score = roc_auc_score(y_test, y_pred_prob[:,1])
print("Area Under ROC Curve: ",aucroc_score)
# roc curve for models
fpr, tpr, thresh = roc_curve(y_test, y_pred_prob[:,1], pos_label=1)
# roc curve for tpr = fpr
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)
print("confusion_matrix ", confusion_matrix(y_test,y_pred))
print("classification_report ", classification_report(y_test,y_pred))
explainer = shap.TreeExplainer(grid_mse.best_estimator_)
shap_values = explainer(X_train)
shap.plots.beeswarm(shap_values, plot_size = 1.8, max_display = 13)
print(grid_mse.best_estimator_.feature_importances_)
for col, score in zip(X_train.columns, grid_mse.best_estimator_.feature_importances_):
    print('%s, %0.3f' % (col, score))
I have long feature names, and when I plot the beeswarm Shapley plot the feature names get truncated. I would like the full feature names to be displayed on the y-axis. Any help would be greatly appreciated.
I have tried changing the plot size, but it did not work.
Add a flag to hide the plot. Then save to output with tight bbox layout:
path = 'save_path_here.png'
shap.plots.beeswarm(shap_values, plot_size = 1.8, max_display = 13, show=False)
plt.savefig(path, bbox_inches='tight', dpi=300)
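If you want to keep showing the plot interactively instead of only saving it, a possible alternative (a sketch, not part of the original answer) is to enlarge the figure and reserve a wider left margin for the long tick labels:
import matplotlib.pyplot as plt

shap.plots.beeswarm(shap_values, max_display=13, show=False)
plt.gcf().set_size_inches(12, 8)   # make the figure wider
plt.subplots_adjust(left=0.4)      # reserve room on the left for long feature names
plt.show()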

Python Sklearn Linear Regression Yields Incorrect Coefficient Values

I'm trying to find the slope and y-intercept coefficients for a linear equation. I created a test domain and range to make sure the numbers I was receiving were correct. The equation should be y = 2x + 1, but the model is saying the slope is 24 and the y-intercept is 40.3125. The model accurately predicts every value I give it, but I'm questioning how I can get the proper values.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = np.arange(0, 40)
y = (2 * X) + 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)
X_train = [[i] for i in X_train]
X_test = [[i] for i in X_test]
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print('Coefficients: \n', regr.coef_)
print('Y-intercept: \n', regr.intercept_)
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
print(X_test)
plt.xticks()
plt.yticks()
plt.show()
This is happening because you scaled your training and testing data. So even though you generated y as a linear function of X, you converted X_train and X_test onto another scale by standardizing them (subtracting the mean and dividing by the standard deviation).
If we run your code but omit the lines where you scale the data, you get the expected results.
X = np.arange(0, 40)
y = (2 * X) + 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)
X_train = [[i] for i in X_train]
X_test = [[i] for i in X_test]
# Skip the scaling of X_train and X_test
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print('Coefficients: \n', regr.coef_)
> Coefficients:
[2.]
print('Y-intercept: \n', regr.intercept_)
> Y-intercept:
1.0
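If you do want to keep the StandardScaler step, you can also recover the original-scale slope and intercept from the fitted model, since standardization is only a linear transform of X. A minimal sketch, continuing from the scaled fit in the question (sc is the fitted StandardScaler):
# The model learned y = w_std * (x - mean) / scale + b_std, so in original units:
#   slope     = w_std / scale
#   intercept = b_std - w_std * mean / scale
slope_original = regr.coef_ / sc.scale_
intercept_original = regr.intercept_ - np.sum(regr.coef_ * sc.mean_ / sc.scale_)
print(slope_original)      # approximately [2.]
print(intercept_original)  # approximately 1.0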

modelselection.Kfold gives different results than kf.split

I am working on a dataset TelcoSigtel which has 5k observations, 21 features, and an imbalanced target with 86% non-churner and 16% churner.
Sorry, I wanted to give an extract of the dataframe, but it is way too big, and when I try to take a small sample there are not enough churners.
My problem is the following: the two methods below should give the same results, but they are dramatically different for some algorithms, while for others they give exactly the same results.
Information about the dataset:
models = [('logit',
           LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                              intercept_scaling=1, l1_ratio=None, max_iter=600,
                              multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
                              solver='liblinear', tol=0.0001, verbose=0, warm_start=False)), ....]
# Method 1:
from sklearn import model_selection
from sklearn.model_selection import KFold

X = telcom.drop("churn", axis=1)
Y = telcom["churn"]

results = []
names = []
seed = 0
scoring = "roc_auc"

for name, model in models:
    kfold = model_selection.KFold(n_splits=5, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
# Method 2:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

kf = KFold(n_splits=5, random_state=0)
X = telcom.drop("churn", axis=1)
Y = telcom["churn"]

results = []
names = []
to_store1 = list()
seed = 0
scoring = "roc_auc"
cv_results = np.array([])

for name, model in models:
    for train_index, test_index in kf.split(X):
        # split the data
        X_train, X_test = X.loc[train_index, :].values, X.loc[test_index, :].values
        y_train, y_test = np.ravel(Y[train_index]), np.ravel(Y[test_index])
        model = model  # Choose a model here
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        to_store1.append(train_index)
        # store fold results
        result = roc_auc_score(y_test, y_pred)
        cv_results = np.append(cv_results, result)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    cv_results = np.array([])

# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
The short answer is that you should use model.predict_proba(X_test)[:, 1] or model.decision_function(X_test) to get identical results, since the ROC AUC scorer needs class probabilities (or decision scores) rather than hard class predictions. The long answer is that you can reproduce the same behavior with a toy example:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score, make_scorer

def assert_equal_scores(rnd_seed, needs_threshold):
    """Assert that two different scorings return equal results."""
    X, y, *_ = load_breast_cancer().values()
    kfold = KFold(random_state=rnd_seed)
    lr = LogisticRegression(random_state=rnd_seed + 10)
    roc_auc_scorer = make_scorer(roc_auc_score, needs_threshold=needs_threshold)
    cv_scores1 = cross_val_score(lr, X, y, cv=kfold, scoring=roc_auc_scorer)
    cv_scores2 = cross_val_score(lr, X, y, cv=kfold, scoring='roc_auc')
    np.testing.assert_equal(cv_scores1, cv_scores2)
Try assert_equal_scores(10, False) and assert_equal_scores(10, True) (or any other random seed). The first one raises an AssertionError. The difference is that roc auc scorer requires the needs_threshold parameter to be True.
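Applied to the question's Method 2, that means scoring each fold with class probabilities (or decision scores) instead of the hard labels from model.predict. A minimal sketch, assuming the same telcom dataframe, models list, and kf from above:
import numpy as np
from sklearn.metrics import roc_auc_score

for name, model in models:
    fold_scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index].values, X.iloc[test_index].values
        y_train, y_test = np.ravel(Y.iloc[train_index]), np.ravel(Y.iloc[test_index])
        model.fit(X_train, y_train)
        # probabilities for the positive class, not hard 0/1 predictions
        y_score = model.predict_proba(X_test)[:, 1]
        fold_scores.append(roc_auc_score(y_test, y_score))
    print("%s: %f (%f)" % (name, np.mean(fold_scores), np.std(fold_scores)))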
