I am trying to apply the idea of sklearn's ROC extension to multiclass to my dataset. Each of my per-class ROC curves looks like a straight line, unlike sklearn's example, where the curves fluctuate.
I give an MWE below to show what I mean:
# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# dummy dataset
X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04])
train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42)
# random forest model
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict(test)
The following function then plots the ROC curve:
def plot_roc_curve(y_test, y_pred):
n_classes = len(np.unique(y_test))
y_test = label_binarize(y_test, classes=np.arange(n_classes))
y_pred = label_binarize(y_pred, classes=np.arange(n_classes))
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
#plt.figure(figsize=(10,5))
plt.figure(dpi=600)
lw = 2
plt.plot(fpr["micro"], tpr["micro"],
label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
color="deeppink", linestyle=":", linewidth=4,)
plt.plot(fpr["macro"], tpr["macro"],
label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
color="navy", linestyle=":", linewidth=4,)
colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),)
plt.plot([0, 1], [0, 1], "k--", lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) curve")
plt.legend()
Output:
plot_roc_curve(ytest, yhat)
Each curve is basically a straight line that bends once. I would like to see the model performance at different thresholds, not just one, in a figure similar to sklearn's illustration for 3 classes shown below:
The point is that you're using predict() rather than predict_proba()/decision_function() to define your y_hat. Since the threshold vector is determined by the number of distinct values in y_hat (see here for reference), you'll only have a few thresholds per class at which tpr and fpr are computed, which in turn means your curves are evaluated at only a few points.
Indeed, the docs say that y_score in roc_curve() should be either probability estimates or decision values. In sklearn's example, decision values are used to compute the scores. Given that you're using a RandomForestClassifier(), probability estimates are the way to go for your y_hat.
What's the point, then, of label-binarizing the output? The standard definition of ROC is for binary classification. To move to a multiclass problem, you convert it into binary via a One-vs-Rest approach, so that you end up with n_classes ROC curves. (Note that because SVC() handles multiclass problems in an OvO fashion by default, the sklearn example had to force OvR by wrapping the estimator in OneVsRestClassifier; with a RandomForestClassifier you don't have that problem, as it is inherently multiclass, see here for reference.) In these terms, once you switch to predict_proba() you'll see there's not much sense in label-binarizing the predictions.
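As a quick sanity check before the full fix below, you can compare how many thresholds roc_curve() produces for hard predictions versus probability estimates. This is only a sketch reusing model, test and ytest from the MWE above; the exact counts will vary with the data.
# Minimal sketch: thresholds from hard predictions vs. probability estimates
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.preprocessing import label_binarize

ytest_bin = label_binarize(ytest, classes=np.arange(5))
yhat_labels_bin = label_binarize(model.predict(test), classes=np.arange(5))
yhat_proba = model.predict_proba(test)

# Hard labels: only a handful of thresholds per class -> piecewise-straight curve
_, _, thr_hard = roc_curve(ytest_bin[:, 0], yhat_labels_bin[:, 0])
# Probabilities: one threshold per distinct score -> a properly sampled curve
_, _, thr_proba = roc_curve(ytest_bin[:, 0], yhat_proba[:, 0])
print(len(thr_hard), len(thr_proba))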
# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# dummy dataset
X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04])
train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42)
# random forest model
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict_proba(test)
def plot_roc_curve(y_test, y_pred):
n_classes = len(np.unique(y_test))
y_test = label_binarize(y_test, classes=np.arange(n_classes))
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
thresholds = dict()
for i in range(n_classes):
fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], y_pred[:, i], drop_intermediate=False)
roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
#plt.figure(figsize=(10,5))
plt.figure(dpi=600)
lw = 2
plt.plot(fpr["micro"], tpr["micro"],
label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
color="deeppink", linestyle=":", linewidth=4,)
plt.plot(fpr["macro"], tpr["macro"],
label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
color="navy", linestyle=":", linewidth=4,)
colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),)
plt.plot([0, 1], [0, 1], "k--", lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) curve")
plt.legend()
Finally, note that roc_curve() also has a drop_intermediate parameter meant for dropping suboptimal thresholds (which might be useful to know).
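A tiny, self-contained illustration of that parameter (synthetic labels and scores, not the data above): with the default drop_intermediate=True you typically get fewer thresholds back than with drop_intermediate=False.
import numpy as np
from sklearn.metrics import roc_curve

rng = np.random.RandomState(0)
y_true = np.array([0, 1] * 100)
y_score = rng.rand(200)
_, _, thr_default = roc_curve(y_true, y_score)                        # drops suboptimal thresholds
_, _, thr_full = roc_curve(y_true, y_score, drop_intermediate=False)  # keeps one threshold per distinct score
print(len(thr_default), "<=", len(thr_full))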
Just to update on @amiola's answer: I had an issue with non-monotonic class labels, which led to very strange, fuzzy results. In this case a small modification to the function above works very well:
classes = sorted(list(y_test['label'].unique()))
Use this in the label_binarize line:
y_test = label_binarize(y_test, classes=classes)
And then when you need a range in the function, just use:
range(len(classes))
Thanks to @dx2-66 for the answer. You can check here for more details.
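Putting those pieces together, a consolidated sketch of the modification might look like this (assuming y_test is a pandas DataFrame with a 'label' column, as implied above, y_pred holds the predict_proba output, and the same imports as in the function above):
classes = sorted(list(y_test['label'].unique()))
y_test_bin = label_binarize(y_test['label'], classes=classes)

fpr, tpr, thresholds, roc_auc = dict(), dict(), dict(), dict()
for i in range(len(classes)):
    fpr[i], tpr[i], thresholds[i] = roc_curve(y_test_bin[:, i], y_pred[:, i], drop_intermediate=False)
    roc_auc[i] = auc(fpr[i], tpr[i])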
Related
I want to plot a ROC curve of a classifier using leave-one-out cross validation.
It seems that a similar question has been asked here but without any answer.
In another question here it was stated:
In order to obtain a meaningful ROC AUC with LeaveOneOut, you need to
calculate probability estimates for each fold (each consisting of just
one observation), then calculate the ROC AUC on the set of all these
probability estimates.
Additionally, in the official scikit-learn website there is a similar example but using KFold cross validation (https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py).
So for the leave-one-out cross-validation case, I am thinking of gathering all the probability predictions on the test sets (one sample at a time) and, after having the predicted probabilities for all my folds, computing and plotting the ROC curve.
Does this seem okay? I do not see any other way to achieve my goal.
Here is my code:
from sklearn.svm import SVC
import numpy as np, matplotlib.pyplot as plt, pandas as pd
from sklearn.model_selection import cross_val_score,cross_val_predict, KFold, LeaveOneOut, StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn import datasets
# Import some data to play with
iris = datasets.load_iris()
X_svc = iris.data
y = iris.target
X_svc, y = X_svc[y != 2], y[y != 2]
clf = SVC(kernel='linear', class_weight='balanced', probability=True, random_state=0)
kf = LeaveOneOut()
all_y = []
all_probs=[]
for train, test in kf.split(X_svc, y):
all_y.append(y[test])
all_probs.append(clf.fit(X_svc[train], y[train]).predict_proba(X_svc[test])[:,1])
all_y = np.array(all_y)
all_probs = np.array(all_probs)
fpr, tpr, thresholds = roc_curve(all_y,all_probs)
roc_auc = auc(fpr, tpr)
plt.figure(1, figsize=(12,6))
plt.plot(fpr, tpr, lw=2, alpha=0.5, label='LOOCV ROC (AUC = %0.2f)' % (roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='Chance level', alpha=.8)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.grid()
plt.show()
I believe the code is correct and the splitting too. I've added a few lines to validate both the implementation and the results:
from sklearn.model_selection import cross_val_score,cross_val_predict, KFold, LeaveOneOut, StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn import datasets
# Import some data to play with
iris = datasets.load_iris()
X_svc = iris.data
y = iris.target
X_svc, y = X_svc[y != 2], y[y != 2]
clf = SVC(kernel='linear', class_weight='balanced', probability=True, random_state=0)
kf = LeaveOneOut()
if kf.get_n_splits(X_svc) == len(X_svc):
print("They are the same length, splitting correct")
else:
print("Something is wrong")
all_y = []
all_probs=[]
for train, test in kf.split(X_svc, y):
all_y.append(y[test])
all_probs.append(clf.fit(X_svc[train], y[train]).predict_proba(X_svc[test])[:,1])
all_y = np.array(all_y)
all_probs = np.array(all_probs)
#print(all_y) #For validation
#print(all_probs) #For validation
fpr, tpr, thresholds = roc_curve(all_y,all_probs)
print(fpr, tpr, thresholds) #For validation
roc_auc = auc(fpr, tpr)
plt.figure(1, figsize=(12,6))
plt.plot(fpr, tpr, lw=2, alpha=0.5, label='LOOCV ROC (AUC = %0.2f)' % (roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='Chance level', alpha=.8)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.grid()
plt.show()
The if block is only meant to make sure that the split is performed n times, where n is the number of observations in the given dataset. This is because, as the documentation states, LeaveOneOut is equivalent to KFold(n_splits=n) and LeavePOut(p=1).
Also, when printing the predicted probability values, they looked good and made sense of the curve. Congrats on your 1.00 AUC!
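For what it's worth, the same gathering of per-fold probabilities can also be written more compactly with cross_val_predict; this is just a sketch using the clf, X_svc and y defined above:
from sklearn.model_selection import cross_val_predict, LeaveOneOut
from sklearn.metrics import roc_curve, auc

probs = cross_val_predict(clf, X_svc, y, cv=LeaveOneOut(), method='predict_proba')[:, 1]
fpr, tpr, thresholds = roc_curve(y, probs)
print(auc(fpr, tpr))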
I'm using scikit-learn, and I want to plot precision and recall curves. The classifier I'm using is RandomForestClassifier. All the resources in the scikit-learn documentation use binary classification. Also, can I plot a ROC curve for multiclass?
Also, I only found an example for SVM with multilabel, and it uses a decision_function, which RandomForest doesn't have.
From scikit-learn documentation:
Precision-Recall:
Precision-recall curves are typically used in binary classification to
study the output of a classifier. In order to extend the
precision-recall curve and average precision to multi-class or
multi-label classification, it is necessary to binarize the output.
One curve can be drawn per label, but one can also draw a
precision-recall curve by considering each element of the label
indicator matrix as a binary prediction (micro-averaging).
Receiver Operating Characteristic (ROC):
ROC curves are typically used in binary classification to study the
output of a classifier. In order to extend ROC curve and ROC area to
multi-class or multi-label classification, it is necessary to binarize
the output. One ROC curve can be drawn per label, but one can also
draw a ROC curve by considering each element of the label indicator
matrix as a binary prediction (micro-averaging).
Therefore, you should binarize the output and consider precision-recall and ROC curves for each class. Moreover, you are going to use predict_proba to get class probabilities.
I divide the code into three parts:
general settings, learning and prediction
precision-recall curve
ROC curve
1. general settings, learning and prediction
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.preprocessing import label_binarize
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
mnist = fetch_openml("mnist_784")
y = mnist.target
y = y.astype(np.uint8)
n_classes = len(set(y))
Y = label_binarize(y, classes=[*range(n_classes)])  # binarize the integer labels, not the raw string targets
X_train, X_test, y_train, y_test = train_test_split(mnist.data,
Y,
random_state = 42)
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=50,
max_depth=3,
random_state=0))
clf.fit(X_train, y_train)
y_score = clf.predict_proba(X_test)
2. precision-recall curve
# precision recall curve
precision = dict()
recall = dict()
for i in range(n_classes):
precision[i], recall[i], _ = precision_recall_curve(y_test[:, i],
y_score[:, i])
plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))
plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()
3. ROC curve
# roc curve
fpr = dict()
tpr = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i],
                                  y_score[:, i])
plt.plot(fpr[i], tpr[i], lw=2, label='class {}'.format(i))
plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.legend(loc="best")
plt.title("ROC curve")
plt.show()
I would like to compare different binary classifiers in Python. For that, I want to calculate the ROC AUC scores, measure the 95% confidence interval (CI), and compute the p-value to assess statistical significance.
Below is a minimal example in scikit-learn which trains three different models on a binary classification dataset, plots the ROC curves and calculates the AUC scores.
Here are my specific questions:
How to calculate the 95% confidence interval (CI) of the ROC AUC scores on the test set? (e.g. with bootstrapping).
How to compare the AUC scores (on test set) and measure the p-value to assess statistical significance? (The null hypothesis is that the models are not different. Rejecting the null hypothesis means the difference in AUC scores is statistically significant.)
import numpy as np
np.random.seed(2018)
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib
import matplotlib.pyplot as plt
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=17)
# Naive Bayes Classifier
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
nb_prediction_proba = nb_clf.predict_proba(X_test)[:, 1]
# Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=20)
rf_clf.fit(X_train, y_train)
rf_prediction_proba = rf_clf.predict_proba(X_test)[:, 1]
# Multi-layer Perceptron Classifier
mlp_clf = MLPClassifier(alpha=1, hidden_layer_sizes=150)
mlp_clf.fit(X_train, y_train)
mlp_prediction_proba = mlp_clf.predict_proba(X_test)[:, 1]
def roc_curve_and_score(y_test, pred_proba):
fpr, tpr, _ = roc_curve(y_test.ravel(), pred_proba.ravel())
roc_auc = roc_auc_score(y_test.ravel(), pred_proba.ravel())
return fpr, tpr, roc_auc
plt.figure(figsize=(8, 6))
matplotlib.rcParams.update({'font.size': 14})
plt.grid()
fpr, tpr, roc_auc = roc_curve_and_score(y_test, rf_prediction_proba)
plt.plot(fpr, tpr, color='darkorange', lw=2,
label='ROC AUC={0:.3f}'.format(roc_auc))
fpr, tpr, roc_auc = roc_curve_and_score(y_test, nb_prediction_proba)
plt.plot(fpr, tpr, color='green', lw=2,
label='ROC AUC={0:.3f}'.format(roc_auc))
fpr, tpr, roc_auc = roc_curve_and_score(y_test, mlp_prediction_proba)
plt.plot(fpr, tpr, color='crimson', lw=2,
label='ROC AUC={0:.3f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.legend(loc="lower right")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity')
plt.ylabel('Sensitivity')
plt.show()
Bootstrap for 95% confidence interval
You want to repeat your analysis on multiple resamplings of your data. In the general case, assume you have a function f(x) that computes whatever statistic you need from data x; then you can bootstrap like this:
def bootstrap(x, f, nsamples=1000):
stats = [f(x[np.random.randint(x.shape[0], size=x.shape[0])]) for _ in range(nsamples)]
return np.percentile(stats, (2.5, 97.5))
This gives you so-called plug-in estimates of the 95% confidence interval (i.e. you just take the percentiles of the bootstrap distribution).
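For example, applied to the question's test-set AUC, a usage sketch of this generic helper could resample (label, score) pairs; this assumes y_test and rf_prediction_proba from the code above:
import numpy as np
from sklearn.metrics import roc_auc_score

data = np.column_stack([y_test, rf_prediction_proba])
ci = bootstrap(data, lambda d: roc_auc_score(d[:, 0].astype(int), d[:, 1]))
print(ci)  # lower and upper bounds of the 95% confidence interval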
In your case, you can write a more specific function like this
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=1000):
auc_values = []
for b in range(nsamples):
idx = np.random.randint(X_train.shape[0], size=X_train.shape[0])
clf.fit(X_train[idx], y_train[idx])
pred = clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
auc_values.append(roc_auc)
return np.percentile(auc_values, (2.5, 97.5))
Here, clf is the classifier for which you want to test the performance and X_train, y_train, X_test, y_test are like in your code.
This gives me the following confidence intervals (rounded to three digits, 1000 bootstrap samples):
Naive Bayes: 0.986 [0.980 0.988] (estimate, lower and upper limit of confidence interval)
Random Forest: 0.983 [0.974 0.989]
Multilayer Perceptron: 0.974 [0.223 0.98]
Permutation tests to test against chance performance
A permutation test would technically go over all permutations of your observation sequence and evaluate your ROC curve with the permuted target values (features are not permuted). This is fine if you have a few observations, but it becomes very costly if you have more observations. It is therefore common to subsample the number of permutations and simply do a number of random permutations. Here, the implementation depends a bit more on the specific thing you want to test. The following function does that for your roc_auc values:
def permutation_test(clf, X_train, y_train, X_test, y_test, nsamples=1000):
idx1 = np.arange(X_train.shape[0])
idx2 = np.arange(X_test.shape[0])
auc_values = np.empty(nsamples)
for b in range(nsamples):
np.random.shuffle(idx1) # Shuffles in-place
np.random.shuffle(idx2)
clf.fit(X_train, y_train[idx1])
pred = clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test[idx2].ravel(), pred.ravel())
auc_values[b] = roc_auc
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
return roc_auc, np.mean(auc_values >= roc_auc)
This function again takes your classifier as clf and returns the AUC value on the unshuffled data and the p-value (i.e. the probability of observing an AUC value larger than or equal to what you have on the unshuffled data).
Running this with 1000 samples gives p-values of 0 for all three classifiers. Note that these are not exact because of the sampling, but they indicate that all of these classifiers perform better than chance.
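A usage sketch with the classifiers and splits defined in the question (1000 permutations):
for name, clf in [("Naive Bayes", nb_clf), ("Random Forest", rf_clf), ("MLP", mlp_clf)]:
    auc_value, p_value = permutation_test(clf, X_train, y_train, X_test, y_test, nsamples=1000)
    print("{}: AUC={:.3f}, p={:.3f}".format(name, auc_value, p_value))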
Permutation test for differences between classifiers
This is much easier. Given two classifiers, you have prediction for every observation. You just shuffle the assignment between predictions and classifiers like this
def permutation_test_between_clfs(y_test, pred_proba_1, pred_proba_2, nsamples=1000):
auc_differences = []
auc1 = roc_auc_score(y_test.ravel(), pred_proba_1.ravel())
auc2 = roc_auc_score(y_test.ravel(), pred_proba_2.ravel())
observed_difference = auc1 - auc2
for _ in range(nsamples):
mask = np.random.randint(2, size=len(pred_proba_1.ravel()))
p1 = np.where(mask, pred_proba_1.ravel(), pred_proba_2.ravel())
p2 = np.where(mask, pred_proba_2.ravel(), pred_proba_1.ravel())
auc1 = roc_auc_score(y_test.ravel(), p1)
auc2 = roc_auc_score(y_test.ravel(), p2)
auc_differences.append(auc1 - auc2)
return observed_difference, np.mean(auc_differences >= observed_difference)
With this test and 1000 samples, I find no significant differences between the three classifiers:
Naive bayes vs random forest: diff=0.0029, p(diff>)=0.311
Naive bayes vs MLP: diff=0.0117, p(diff>)=0.186
random forest vs MLP: diff=0.0088, p(diff>)=0.203
Here diff denotes the difference in ROC AUC scores between the two classifiers, and p(diff>) is the empirical probability of observing a larger difference on a shuffled data set.
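A usage sketch reproducing those comparisons with the prediction arrays from the question's code (exact values will differ from run to run):
pairs = [
    ("Naive Bayes vs Random Forest", nb_prediction_proba, rf_prediction_proba),
    ("Naive Bayes vs MLP", nb_prediction_proba, mlp_prediction_proba),
    ("Random Forest vs MLP", rf_prediction_proba, mlp_prediction_proba),
]
for name, p1, p2 in pairs:
    diff, p = permutation_test_between_clfs(y_test, p1, p2)
    print("{}: diff={:.4f}, p(diff>)={:.3f}".format(name, diff, p))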
One can use the code given below (from the TensorFlow 1.x tf.contrib module) to compute the AUC and an asymptotic, normally distributed confidence interval for neural nets.
tf.contrib.metrics.auc_with_confidence_intervals(
labels,
predictions,
weights=None,
alpha=0.95,
logit_transformation=True,
metrics_collections=(),
updates_collections=(),
name=None)
I am trying to plot a ROC curve to evaluate the accuracy of a prediction model I developed in Python using logistic regression packages. I have computed the true positive rate as well as the false positive rate; however, I am unable to figure out how to plot these correctly using matplotlib and calculate the AUC value. How could I do that?
Here are two ways you may try, assuming your model is an sklearn predictor:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = model.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# method II: ggplot
from ggplot import *
import pandas as pd

df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x = 'fpr', y = 'tpr')) + geom_line() + geom_abline(linetype = 'dashed')
or try
ggplot(df, aes(x = 'fpr', ymin = 0, ymax = 'tpr')) + geom_line(aes(y = 'tpr')) + geom_area(alpha = 0.2) + ggtitle("ROC Curve w/ AUC = %s" % str(roc_auc))
This is the simplest way to plot an ROC curve, given a set of ground truth labels and predicted probabilities. The best part is, it plots the ROC curves for ALL classes, so you get multiple neat-looking curves as well.
import scikitplot as skplt
import matplotlib.pyplot as plt
y_true = # ground truth labels
y_probas = # predicted probabilities generated by sklearn classifier
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()
Here's a sample curve generated by plot_roc_curve. I used the sample digits dataset from scikit-learn so there are 10 classes. Notice that one ROC curve is plotted for each class.
Disclaimer: Note that this uses the scikit-plot library, which I built.
AUC curve For Binary Classification using matplotlib
from sklearn import svm, datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
Load Breast Cancer Dataset
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=44)
Model
clf = LogisticRegression(penalty='l2', C=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
Accuracy
print("Accuracy", metrics.accuracy_score(y_test, y_pred))
AUC Curve
y_pred_proba = clf.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
It is not at all clear what the problem is here, but if you have an array true_positive_rate and an array false_positive_rate, then plotting the ROC curve and getting the AUC is as simple as:
import matplotlib.pyplot as plt
import numpy as np
x = # false_positive_rate
y = # true_positive_rate
# This is the ROC curve
plt.plot(x,y)
plt.show()
# This is the AUC
auc = np.trapz(y,x)
Here is Python code for computing the ROC curve (as a scatter plot):
import matplotlib.pyplot as plt
import numpy as np
score = np.array([0.9, 0.8, 0.7, 0.6, 0.55, 0.54, 0.53, 0.52, 0.51, 0.505, 0.4, 0.39, 0.38, 0.37, 0.36, 0.35, 0.34, 0.33, 0.30, 0.1])
y = np.array([1,1,0, 1, 1, 1, 0, 0, 1, 0, 1,0, 1, 0, 0, 0, 1 , 0, 1, 0])
# false positive rate
fpr = []
# true positive rate
tpr = []
# Iterate thresholds from 0.0, 0.01, ... 1.0
thresholds = np.arange(0.0, 1.01, .01)
# get number of positive and negative examples in the dataset
P = sum(y)
N = len(y) - P
# iterate through all thresholds and determine fraction of true positives
# and false positives found at this threshold
for thresh in thresholds:
FP=0
TP=0
for i in range(len(score)):
if (score[i] > thresh):
if y[i] == 1:
TP = TP + 1
if y[i] == 0:
FP = FP + 1
fpr.append(FP/float(N))
tpr.append(TP/float(P))
plt.scatter(fpr, tpr)
plt.show()
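For comparison, a minimal sketch of the same curve computed with sklearn's roc_curve, using the score and y arrays defined above:
from sklearn.metrics import roc_curve, auc

fpr_sk, tpr_sk, thresholds_sk = roc_curve(y, score)
plt.plot(fpr_sk, tpr_sk)
plt.show()
print("AUC:", auc(fpr_sk, tpr_sk))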
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
y_true = # true labels
y_probas = # predicted results
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_probas, pos_label=0)
# Print ROC curve
plt.plot(fpr,tpr)
plt.show()
# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)
Based on multiple comments from Stack Overflow, the scikit-learn documentation and some others, I made a Python package to plot ROC curves (and other metrics) in a really simple way.
To install the package: pip install plot-metric (more info at the end of the post).
To plot a ROC curve (the example comes from the documentation):
Binary classification
Let's load a simple dataset and make train and test sets:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X, y = make_classification(n_samples=1000, n_classes=2, weights=[1,1], random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)
Train a classifier and predict on the test set:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50, random_state=23)
model = clf.fit(X_train, y_train)
# Use predict_proba to predict probability of the class
y_pred = clf.predict_proba(X_test)[:,1]
You can now use plot_metric to plot ROC Curve :
from plot_metric.functions import BinaryClassification
# Visualisation with plot_metric
bc = BinaryClassification(y_test, y_pred, labels=["Class 1", "Class 2"])
# Figures
plt.figure(figsize=(5,5))
bc.plot_roc_curve()
plt.show()
Result:
You can find more examples on the GitHub page and in the documentation of the package:
Github : https://github.com/yohann84L/plot_metric
Documentation : https://plot-metric.readthedocs.io/en/latest/
The previous answers assume that you indeed calculated TP/sensitivity yourself. It's a bad idea to do this manually; it's easy to make mistakes in the calculations, so use a library function for all of this instead.
The plot_roc example in scikit-learn does exactly what you need:
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
The essential part of the code is:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
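For context, here is a self-contained sketch of how to prepare the y_test and y_score that loop expects, using the iris data purely as an example and any classifier with predict_proba (such as a RandomForestClassifier):
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

X, y_labels = load_iris(return_X_y=True)
X_train, X_test, y_train_labels, y_test_labels = train_test_split(X, y_labels, random_state=0)
n_classes = 3
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train_labels)

y_test = label_binarize(y_test_labels, classes=range(n_classes))  # shape (n_samples, n_classes)
y_score = clf.predict_proba(X_test)                               # shape (n_samples, n_classes)
fpr, tpr, roc_auc = dict(), dict(), dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])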
There is a library called metriculous that will do that for you:
$ pip install metriculous
Let's first mock some data; this would usually come from the test dataset and the model(s):
import numpy as np
def normalize(array2d: np.ndarray) -> np.ndarray:
return array2d / array2d.sum(axis=1, keepdims=True)
class_names = ["Cat", "Dog", "Pig"]
num_classes = len(class_names)
num_samples = 500
# Mock ground truth
ground_truth = np.random.choice(range(num_classes), size=num_samples, p=[0.5, 0.4, 0.1])
# Mock model predictions
perfect_model = np.eye(num_classes)[ground_truth]
noisy_model = normalize(
perfect_model + 2 * np.random.random((num_samples, num_classes))
)
random_model = normalize(np.random.random((num_samples, num_classes)))
Now we can use metriculous to generate a table with various metrics and diagrams, including ROC curves:
import metriculous
metriculous.compare_classifiers(
ground_truth=ground_truth,
model_predictions=[perfect_model, noisy_model, random_model],
model_names=["Perfect Model", "Noisy Model", "Random Model"],
class_names=class_names,
one_vs_all_figures=True, # This line is important to include ROC curves in the output
).save_html("model_comparison.html").display()
The ROC curves in the output:
The plots are zoomable and draggable, and you get further details when hovering with your mouse over the plot:
You can also follow the official documentation from scikit-learn:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
I have made a simple function, included in a package, for the ROC curve. I just started practicing machine learning, so please let me know if this code has any problems!
Have a look at the github readme file for more details! :)
https://github.com/bc123456/ROC
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
def plot_ROC(y_train_true, y_train_prob, y_test_true, y_test_prob):
'''
    a function to plot the ROC curve for train labels and test labels.
Use the best threshold found in train set to classify items in test set.
'''
fpr_train, tpr_train, thresholds_train = roc_curve(y_train_true, y_train_prob, pos_label =True)
sum_sensitivity_specificity_train = tpr_train + (1-fpr_train)
best_threshold_id_train = np.argmax(sum_sensitivity_specificity_train)
best_threshold = thresholds_train[best_threshold_id_train]
best_fpr_train = fpr_train[best_threshold_id_train]
best_tpr_train = tpr_train[best_threshold_id_train]
y_train = y_train_prob > best_threshold
cm_train = confusion_matrix(y_train_true, y_train)
acc_train = accuracy_score(y_train_true, y_train)
    # use the probabilities, not the thresholded labels, so the AUC matches the plotted curve
    auc_train = roc_auc_score(y_train_true, y_train_prob)
    print('Train Accuracy: %s ' % acc_train)
    print('Train AUC: %s ' % auc_train)
    print('Train Confusion Matrix:')
    print(cm_train)
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(121)
curve1 = ax.plot(fpr_train, tpr_train)
curve2 = ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
dot = ax.plot(best_fpr_train, best_tpr_train, marker='o', color='black')
ax.text(best_fpr_train, best_tpr_train, s = '(%.3f,%.3f)' %(best_fpr_train, best_tpr_train))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve (Train), AUC = %.4f'%auc_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test_true, y_test_prob, pos_label =True)
y_test = y_test_prob > best_threshold
cm_test = confusion_matrix(y_test_true, y_test)
acc_test = accuracy_score(y_test_true, y_test)
    auc_test = roc_auc_score(y_test_true, y_test_prob)
    print('Test Accuracy: %s ' % acc_test)
    print('Test AUC: %s ' % auc_test)
    print('Test Confusion Matrix:')
    print(cm_test)
tpr_score = float(cm_test[1][1])/(cm_test[1][1] + cm_test[1][0])
fpr_score = float(cm_test[0][1])/(cm_test[0][0]+ cm_test[0][1])
ax2 = fig.add_subplot(122)
curve1 = ax2.plot(fpr_test, tpr_test)
curve2 = ax2.plot([0, 1], [0, 1], color='navy', linestyle='--')
dot = ax2.plot(fpr_score, tpr_score, marker='o', color='black')
ax2.text(fpr_score, tpr_score, s = '(%.3f,%.3f)' %(fpr_score, tpr_score))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve (Test), AUC = %.4f'%auc_test)
plt.savefig('ROC', dpi = 500)
plt.show()
return best_threshold
A sample roc graph produced by this code
When you need the probabilities as well... The following gets the AUC value and plots it all in one shot.
from sklearn.metrics import plot_roc_curve  # removed in scikit-learn 1.2; use RocCurveDisplay.from_estimator there
plot_roc_curve(m, xs, y)
When you already have the probabilities... you can't get the AUC value and the plot in one shot. Do the following:
from sklearn.metrics import roc_curve, roc_auc_score
fpr, tpr, _ = roc_curve(y, y_probas)
plt.plot(fpr, tpr, label='AUC = ' + str(round(roc_auc_score(y, m.oob_decision_function_[:, 1]), 2)))
plt.legend(loc='lower right')
In my code, I have X_train and y_train and classes are 0 and 1. The clf.predict_proba() method computes probabilities for both classes for every data point. I compare the probability of class1 with different values of threshold.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

probability = clf.predict_proba(X_train)
def plot_roc(y_train, probability):
threshold_values = np.linspace(0,1,100) #Threshold values range from 0 to 1
FPR_list = []
TPR_list = []
for threshold in threshold_values: #For every value of threshold
y_pred = [] #Classify every data point in the test set
#prob is an array consisting of 2 values - Probability of datapoint in Class0 and Class1.
for prob in probability:
if ((prob[1])<threshold): #Prob of class1 (positive class)
y_pred.append(0)
continue
elif ((prob[1])>=threshold): y_pred.append(1)
#Plot Confusion Matrix and Obtain values of TP, FP, TN, FN
        c_m = confusion_matrix(y_train, y_pred)
TN = c_m[0][0]
FP = c_m[0][1]
FN = c_m[1][0]
TP = c_m[1][1]
FPR = FP/(FP + TN) #Obtain False Positive Rate
TPR = TP/(TP + FN) #Obtain True Positive Rate
FPR_list.append(FPR)
TPR_list.append(TPR)
fig = plt.figure()
plt.plot(FPR_list, TPR_list)
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.show()
A new open-source package I help maintain has many ways to test model performance. To see a ROC curve you can do:
from deepchecks.checks import RocReport
from deepchecks import Dataset
RocReport().run(Dataset(df, label='target'), model)
And the result looks like this:
A more elaborate example of RocReport can be found here.
As the ROC curve is only defined for binary classification, binarize and ravel your data:
# Binarize data for getting AUC
y_test_bin = label_binarize(y_test, classes=range(y_train.min(), y_train.max() + 1))
y_pred_bin = label_binarize(Predicted_result, classes=range(y_train.min(), y_train.max() + 1))
# Calculate FP , TP rate
fpr, tpr, _ = roc_curve(y_test_bin.ravel(), y_pred_bin.ravel() )
# Get AUC ,
auc = roc_auc_score(y_test_bin, y_pred_bin, average='micro', multi_class='ovr')
#create ROC curve
plt.plot(fpr,tpr , label= f"AUC = {auc}" , )
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC')
plt.legend(loc=7)
plt.show()
I want to evaluate my classification models with a ROC curve. I'm struggling with computing a multiclass ROC curve for a cross-validated data set. There is no division into train and test sets, because of the cross-validation.
Underneath, you can see the code I already tried.
scaler = StandardScaler(with_mean=False)
enc = LabelEncoder()
y = enc.fit_transform(labels)
vec = DictVectorizer()
feat_sel = SelectKBest(mutual_info_classif, k=200)
n_classes = 3
# Pipeline for computing of ROC curves
clf = OneVsRestClassifier(LogisticRegression(solver='newton-cg', multi_class='multinomial'))
clf = clf.label_binarizer_
pipe = Pipeline([('vectorizer', vec),
('scaler', scaler),
('Logreg', clf),
('mutual_info',feat_sel)])
y_pred = model_selection.cross_val_predict(pipe, instances, y, cv=10)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y[:, i], y_pred[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Plot of a ROC curve for a specific class
for i in range(n_classes):
plt.figure()
plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
I thought I could binarize my y_pred by using the attribute label_binarizer_ of the OneVsRestClassifier, as mentioned here: sklearn.multiclass.OneVsRestClassifier.
However, I get the following error: AttributeError: 'OneVsRestClassifier' object has no attribute 'label_binarizer_'. I don't understand this error, because the documentation tells me that it is an attribute of this classifier.
When I add instances = DataFrame(instances) and clf.fit(instances, y), I get the error: ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). Instances is a list of feature-vector dictionaries. I tried adding instances = np.array(instances) instead, but this gives me the error: TypeError: float() argument must be a string or a number, not 'dict'.
What am I doing wrong?
You can use label_binarize this way and get the desired plot as output.
Example using Iris data:
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y_bin = label_binarize(y, classes=[0, 1, 2])
n_classes = y_bin.shape[1]
pipe= Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])
# or
#clf = OneVsRestClassifier(LogisticRegression())
#pipe= Pipeline([('scaler', StandardScaler()), ('clf', clf)])
y_score = cross_val_predict(pipe, X, y, cv=10 ,method='predict_proba')
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
lw = 2
colors = cycle(['blue', 'red', 'green'])
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label='ROC curve of class {0} (area = {1:0.2f})'
''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()