Python Machine Learning SGD Classification Error

# This set will get us started, but you will need to add
# others.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
# We can now access the dataset
wine = load_wine()
X = wine.data
y = wine.target
nclasses = len(wine.target_names)
colors = "bry"
h = .02 # step size in the mesh
# Split the data into a training set and a test set - this is
# where your implementation will need to start. Maybe you
# will need to work with the train_test_split( ... ) function
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Standardize
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
X_train = (X_train - mean) / std
# Next we can look at the cross validation. Remember we are
# selecting the two best features through this process.
# Take a look at tutorial 4 for an example implementation
best_performance = 0
best_mean_f1 = 0
best_f1 = 0
best_f2 = 0
# We will be testing every combination of pairs of features
for f1 in range(0, 13):
    for f2 in range(0, 13):
        # We want 2 features, not 1
        if f1 == f2:
            continue
        features_idx_to_use = [f1, f2]
        clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
        clf.fit(X_train[:, features_idx_to_use], y_train)
        # Return the predictions for the 3-fold cross-validation
        y_predicted = cross_val_predict(clf, X_train[:, features_idx_to_use], y_train, cv=3)
        # Construct the confusion matrices
        conf_mat_train = confusion_matrix(y_train, y_predicted)
        # Print out the recall, precision and F1 scores
        # There will be a value for each class
        # CV Train
        print("CV Train:", f1, ":", f2, " - ", recall_score(y_train, y_predicted, average=None))
        print("CV Train:", f1, ":", f2, " - ", precision_score(y_train, y_predicted, average=None))
        print("CV Train:", f1, ":", f2, " - ", f1_score(y_train, y_predicted, average=None))
        current_f1 = np.mean(f1_score(y_train, y_predicted, average=None))
        if current_f1 > best_mean_f1:
            best_f1 = f1
            best_f2 = f2
            best_mean_f1 = current_f1
            best_clf = clf
fig, ax = plt.subplots()
disp = plot_confusion_matrix(best_clf, X_test[:, [best_f1, best_f2]], y_test,
                             display_labels=wine.target_names,
                             cmap=plt.cm.Blues, ax=ax)
ax.set_title('Testing')
# Once you have selected the best performing set of features
# in the cross-validation, we can test the best performing
# classifier
# Now we need to test our classifier using the test set.
# Recall that we standardised the data - we need to do the same with the stored
# Mean and standard deviation from the training set.
X_test = (X_test - mean) / std
y_test_predicted = best_clf.predict(X_test[:,[best_f1, best_f2]])
conf_mat_test = confusion_matrix(y_test, y_test_predicted)
# Test
print("Test:",recall_score(y_test,y_test_predicted,average=None))
print("Test:",precision_score(y_test,y_test_predicted,average=None))
print("Test:",f1_score(y_test,y_test_predicted,average=None))
y_score = best_clf.decision_function(X_test[:,[best_f1, best_f2]])
# Now we can plot a ROC curve and calculate the AUC
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(nclasses):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
    roc_auc[i] = auc(fpr, tpr)
# Once we have finished on the performance, we can plot the
# classifier boundaries and test points. This one is
# left to you ;)
h = .02 # step size in the mesh
# create a mesh to plot in
x_min, x_max = X_test[:, best_f1].min() - 1, X_test[:, best_f1].max() + 1
y_min, y_max = X_test[:, best_f2].min() - 1, X_test[:, best_f2].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
fig, ax = plt.subplots(figsize=(10,10))
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = best_clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z,cmap=plt.get_cmap('plasma'))
colors = ['cyan','orange']
# Plot also the training points
for i, color in zip(best_clf.classes_, colors):
    idx = np.where(y_test == i)
    ax.scatter(X_test[idx, best_f1], X_test[idx, best_f2], c=color, label=wine.target_names[i],
               cmap=plt.get_cmap('plasma'), edgecolor='black', s=20)
ax.set_title("Decision surface of Binary SGD")
ax.axis('tight')
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = best_clf.coef_
intercept = best_clf.intercept_
# Let's make a function to plot the hyperplanes used by the SGD classifier
# for classification.
def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color)
plot_hyperplane(0, "red")
My error is
Traceback (most recent call last):
File "C:\Users\Doug.spyder-py3\temp.py", line 120, in
fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\metrics_ranking.py", line 913, in roc_curve
fps, tps, thresholds = _binary_clf_curve(
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\metrics_ranking.py", line 691, in _binary_clf_curve
raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported
And the problem line is fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
Can someone explain what it is I am missing please? I know it's got something to do with the kind of data that I am inputting but I am not sure what?

First, as you do not use fpr and tpr later, you can merge roc_curve and auc into roc_auc_score which you already imported.
See the docs.
If you are performing multiclass classification (more than two classes), you need to provide the multi_class parameter.
ovo and ovr give different results; see here.
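A minimal sketch of that on the wine data (hedged: loss='log_loss' is swapped in here so that predict_proba is available, since the hinge-loss SGDClassifier in the question has no probability estimates and the multiclass roc_auc_score expects probabilities; on older scikit-learn versions the loss is spelled 'log'):
from sklearn.datasets import load_wine
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
scaler = StandardScaler().fit(X_train)
# loss='log_loss' is an assumption (not the question's hinge loss); it provides predict_proba
clf = SGDClassifier(loss='log_loss', alpha=0.001, max_iter=100, random_state=42)
clf.fit(scaler.transform(X_train), y_train)
proba = clf.predict_proba(scaler.transform(X_test))  # shape (n_samples, n_classes), rows sum to 1
print("OvR AUC:", roc_auc_score(y_test, proba, multi_class='ovr'))
print("OvO AUC:", roc_auc_score(y_test, proba, multi_class='ovo'))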
Maybe relevant:
ROC for multiclass classification
How to fix ValueError: multiclass format is not supported


Why do some contour lines have two different labels?

Does my code have a bug or something else?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
file = 'https://aegis4048.github.io/downloads/notebooks/sample_data/unconv_MV_v5.csv'
myDF = pd.read_csv(file)
# Split the data into features and target
feature1 = "Brittle"
feature2 = "Por"
X = myDF[[feature1, feature2]] #.iloc[:, :-1].values # A NumPy array!
print("X.info():", X.info())
y = myDF["Prod"] #.iloc[:, -1].values
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Create a linear regression object
reg = LinearRegression()
# Fit the model to the training data
reg.fit(X_train, y_train)
# Predict the target variable using the test data
y_pred = reg.predict(X_test)
# Evaluate the model using mean squared error (MSE)
mse = np.mean((y_test - y_pred)**2)
print("Mean Squared Error: ", mse)
print("R2 Score:", reg.score(X_test, y_test))
#define figure size in (width, height) for all plots
plt.rcParams['figure.figsize'] = [10, 7]
# Create a mesh of values for the features
print(X_train.shape) # NumPy array
x1_min, x1_max = X_train[feature1].min(), X_train[feature1].max()
x2_min, x2_max = X_train[feature2].min(), X_train[feature2].max()
x1, x2 = np.meshgrid(np.linspace(x1_min, x1_max, 100), np.linspace(x2_min, x2_max, 100))
X_mesh = np.c_[x1.ravel(), x2.ravel()]
# Compute the predictions for the mesh of values
y_pred_mesh = reg.predict(X_mesh).reshape(x1.shape)
# Plot the predictions as a surface. Request 10 contour lines.
contours = plt.contourf(x1, x2, y_pred_mesh, 10, cmap='coolwarm', alpha=0.8) # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.contourf.html
# Scatter plot of the training data.
# The colors of the points don't mean much except to stand out from the background
plt.scatter(X_train[feature1], X_train[feature2], c=y_train, cmap='coolwarm', s=20)
# Label the contour lines
plt.clabel(contours, inline=1, fontsize=12, colors = "black")
# Label the plot
plt.xlabel(feature1)
plt.ylabel(feature2)
plt.title('Multivariate Linear Regression Contour Plot')
# Show the plot
plt.show()
The output:

How to create a precision-recall curve plot to compare 2 classifiers in Python?

My goal is to find the precision-recall curve, comparing Logistic Regression and Random Forest, and to plot them in one graph. I wanted to know if I used the right steps to create a plot to compare both classifiers.
I appreciate all the help!
Code:
from sklearn.preprocessing import MultiLabelBinarizer as mlb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
import matplotlib.pyplot as plt  # plt is used alongside pyplot further down
X = df[["DIAGNOSIS_CD_Dummy"]]
y = df[["TEST_RESULT_Dummy"]]
# X = pd.DataFrame(df.iloc[:, -1])
# y = pd.DataFrame(df.iloc[:, :-1])
# raw confusion matrix
df = pd.DataFrame(df, columns=["DIAGNOSIS_CD_Dummy", "TEST_RESULT_Dummy"])
confusion_matrix = pd.crosstab(
    df["TEST_RESULT_Dummy"],
    df["DIAGNOSIS_CD_Dummy"],
    rownames=["Test Result"],
    colnames=["Diagnosis"],
)
print(confusion_matrix)
# Logistic Regression Confusion Matrix
from sklearn.preprocessing import MultiLabelBinarizer as mlb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from sklearn import metrics
# split into training and test using scikit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y.values.ravel(), test_size=0.3, random_state=1, stratify=y
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
# use logistic regression model to make predictions
y_score = log_model.predict_proba(X_test)[:, 1]
y_pred = log_model.predict(X_test)
y_pred = np.round(y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)
print("\n")
print(confusion_matrix)
print("\n")
print(classification_report(y_test, y_pred, zero_division=0))
# calculate precision and recall
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
# create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color="purple")
# add axis labels to plot
ax.set_title("Precision-Recall Curve")
ax.set_ylabel("Precision")
ax.set_xlabel("Recall")
# display plot
plt.show()
# precision-recall curve
# generate 2 class dataset
X = df[["DIAGNOSIS_CD_Dummy"]]
y = df[["TEST_RESULT_Dummy"]]
# X = pd.DataFrame(df.iloc[:, :-1])
# y = pd.DataFrame(df.iloc[:, -1])
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(
    X, y.values.ravel(), test_size=0.3, random_state=2
)
# fit a model
model = LogisticRegression(solver="lbfgs")
model.fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
# probs_rf = model_rf.predict_proba(testX)[:, 1]
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = model.predict(testX)
lr_precision, lr_recall, _ = precision_recall_curve(testy, lr_probs)
lr_f1, lr_auc = f1_score(testy, yhat), auc(lr_recall, lr_precision)
# precision_rf, recall_rf, _ = precision_recall_curve(testy, probs_rf)
# f1_rf, auc_rf = f1_score(testy, yhat), auc(recall_rf, precision_rf)
# auc_rf = auc(recall_rf, precision_rf)
# summarize scores
print("Logistic: f1=%.3f auc=%.3f" % (lr_f1, lr_auc))
# plot the precision-recall curves
no_skill = len(testy[testy == 1]) / len(testy)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.plot(lr_recall, lr_precision, marker=".", label="Logistic")
plt.plot(lr_precision, lr_recall, label=f"AUC (Logistic Regression) = {lr_auc:.2f}")
# axis labels
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
# Random Forest
model_rf = RandomForestClassifier()
model_rf.fit(trainX, trainy)
# model_rf = RandomForestClassifier().fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
probs_rf = model_rf.predict_proba(testX)
# keep probabilities for the positive outcome only
probs_rf = probs_rf[:, 1]
# predict class values
yhat = model.predict(testX)
precision_rf, recall_rf, _ = precision_recall_curve(testy, probs_rf)
f1_rf, auc_rf = f1_score(testy, yhat), auc(recall_rf, precision_rf)
auc_rf = auc(recall_rf, precision_rf)
print("Random Forest: f1=%.3f auc=%.3f" % (f1_rf, auc_rf))
# plot the precision-recall curves
no_skill = len(testy[testy == 1]) / len(testy)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.plot(lr_recall, lr_precision, marker=".", label="Random Forest")
plt.plot(recall_rf, precision_rf, label=f"AUC (Random Forests) = {auc_rf:.2f}")
# axis labels
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Output:
Diagnosis        0    1
Test Result
0            18385   32
1             1268  165

[[5514   11]
 [ 374   56]]

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5525
           1       0.84      0.13      0.23       430

    accuracy                           0.94      5955
   macro avg       0.89      0.56      0.60      5955
weighted avg       0.93      0.94      0.91      5955
Logistic: f1=0.193 auc=0.488
Random Forest: f1=0.193 auc=0.488
This is my attempt to plot it.
import pathlib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from itertools import cycle
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize
def __plot_binary_precision_recall_curve(X_test, y_test, *args, **kwargs):
    """
    Private function to be used by plot_precision_recall_curve for binary applications.
    """
    if 'fig_size' in kwargs and 'dpi' in kwargs:
        fig, ax = plt.subplots(figsize=kwargs['fig_size'], dpi=kwargs['dpi'])
    else:
        fig, ax = plt.subplots()
    plt.rcParams["figure.facecolor"] = 'white'
    plt.rcParams["axes.facecolor"] = 'white'
    plt.rcParams["savefig.facecolor"] = 'white'
    ax.xaxis.set_major_locator(MultipleLocator(0.1))
    ax.xaxis.set_major_formatter('{x:.1f}')
    ax.yaxis.set_major_locator(MultipleLocator(0.1))
    ax.yaxis.set_major_formatter('{x:.1f}')
    ax.xaxis.set_minor_locator(MultipleLocator(0.05))
    ax.yaxis.set_minor_locator(MultipleLocator(0.05))
    ax.tick_params(which='both', width=2)
    ax.tick_params(which='major', length=7)
    ax.tick_params(which='minor', length=4, color='black')
    plt.grid(True, zorder=0)
    plt.plot([0, 1], [1, 0], linestyle='--', lw=1, color='k',
             label='Luck', alpha=.8, zorder=1)  # random prediction curve
    plt.plot([1, 1], [1, 0], c='k', linestyle='dashdot'), plt.plot([1, 1], c='k', linestyle='dashdot', zorder=2, label="Perfect model")  # perfect model prediction curve
    f_scores = np.linspace(0.2, 0.8, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(0.01, 1)
        y = f_score * x / (2 * x - f_score)
        (l,) = plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
        plt.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))
    zorder = 3
    for classifier in args:
        display = PrecisionRecallDisplay.from_estimator(classifier, X_test, y_test, ax=ax, zorder=zorder)
        zorder += 1
    # add the legend for the iso-f1 curves
    handles, labels = display.ax_.get_legend_handles_labels()
    handles.extend([l])
    labels.extend(["iso-f1 curves"])
    # set the legend and the axes
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.legend(handles=handles, labels=labels, loc="best")
    plt.xlabel('Recall', fontsize=18)
    plt.ylabel('Precision', fontsize=18)
    if 'title' in kwargs:
        ax.set_title(kwargs['title'], fontsize=18)
    else:
        ax.set_title("Precision-Recall Curve", fontsize=18)
    if 'save_fig_path' in kwargs:
        path = pathlib.Path(kwargs['save_fig_path'])
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(kwargs['save_fig_path'], dpi=kwargs['dpi'], facecolor=fig.get_facecolor(), edgecolor='none')
    return fig, ax
def __plot_multiclass_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs):
    """
    Private function designed to be used by plot_precision_recall_curve for multiclass applications.
    """
    my_vals = y_test.unique().tolist()
    my_vals.sort()
    # binarize the y_test series
    y_test = label_binarize(y_test, classes=my_vals)
    n_classes = y_test.shape[1]
    # setup plot details
    colors = cycle(["navy", "turquoise", "darkorange", "cornflowerblue", "teal"])
    if 'fig_size' in kwargs and 'dpi' in kwargs:
        fig, ax = plt.subplots(len(args), figsize=kwargs['fig_size'], dpi=kwargs['dpi'], facecolor='white')
    else:
        fig, ax = plt.subplots(len(args), facecolor='white')
    for count, clfs in enumerate(args):
        ax[count].xaxis.set_major_locator(MultipleLocator(0.1))
        ax[count].xaxis.set_major_formatter('{x:.1f}')
        ax[count].yaxis.set_major_locator(MultipleLocator(0.1))
        ax[count].yaxis.set_major_formatter('{x:.1f}')
        ax[count].xaxis.set_minor_locator(MultipleLocator(0.05))
        ax[count].yaxis.set_minor_locator(MultipleLocator(0.05))
        ax[count].tick_params(which='both', width=2)
        ax[count].tick_params(which='major', length=7)
        ax[count].tick_params(which='minor', length=4, color='black')
        ax[count].grid(True, zorder=0)
        ax[count].plot([0, 1], [1, 0], linestyle='--', lw=1, color='k',
                       label='Luck', alpha=.8, zorder=1)  # random prediction curve
        ax[count].plot([1, 1], [1, 0], c='k', linestyle='dashdot'), ax[count].plot([1, 1], c='k', linestyle='dashdot', zorder=2, label="Perfect model")  # perfect model prediction curve
        # set up the model, wrapped by the OneVsRestClassifier
        classifier = OneVsRestClassifier(clfs)
        classifier.fit(X_train, y_train)  # train the model
        # produce the predictions (as probabilities)
        y_score = classifier.predict_proba(X_test)
        # For each class
        precision = dict()
        recall = dict()
        average_precision = dict()
        for i in range(n_classes):
            precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])
            average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])
        # A "micro-average": quantifying score on all classes jointly
        precision["micro"], recall["micro"], _ = precision_recall_curve(
            y_test.ravel(), y_score.ravel()
        )
        average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")
        f_scores = np.linspace(0.2, 0.8, num=4)
        lines, labels = [], []
        for f_score in f_scores:
            x = np.linspace(0.01, 1)
            y = f_score * x / (2 * x - f_score)
            (l,) = ax[count].plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
            ax[count].annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))
        display = PrecisionRecallDisplay(
            recall=recall["micro"],
            precision=precision["micro"],
            average_precision=average_precision["micro"],
        )
        display.plot(ax=ax[count], name="Micro-average precision-recall", color="gold")
        for i, color in zip(range(n_classes), colors):
            display = PrecisionRecallDisplay(
                recall=recall[i],
                precision=precision[i],
                average_precision=average_precision[i],
            )
            display.plot(ax=ax[count], name=f"Precision-recall for class {i}", color=color)
        # add the legend for the iso-f1 curves
        handles, labels = display.ax_.get_legend_handles_labels()
        handles.extend([l])
        labels.extend(["iso-f1 curves"])
        # set the legend and the axes
        ax[count].set_xlim([0.0, 1.0])
        ax[count].set_ylim([0.0, 1.05])
        ax[count].legend(handles=handles, labels=labels, loc="best")
        if type(clfs) == Pipeline:
            estimator_name = str(type(clfs['clf'])).split(".")[-1][:-2]
        else:
            estimator_name = str(type(clfs)).split(".")[-1][:-2]
        if 'title' in kwargs:
            ax[count].set_title(kwargs['title'] + " - " + estimator_name, fontsize=18)
        else:
            ax[count].set_title("Precision-Recall Curve" + " - " + estimator_name, fontsize=18)
        ax[count].set_xlabel('Recall', fontsize=18)
        ax[count].set_ylabel('Precision', fontsize=18)
    if 'save_fig_path' in kwargs:
        path = pathlib.Path(kwargs['save_fig_path'])
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(kwargs['save_fig_path'], dpi=kwargs['dpi'], facecolor=fig.get_facecolor(), edgecolor='none')
    return fig, ax
def plot_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs):
    """
    Plots precision recall curves for the given models

    Parameters
    ----------
    X_test : pandas.DataFrame of shape (n_samples, n_features)
        Test values.
    y_test : pandas.Series of shape (n_samples,)
        Target values.
    *args : estimators to plot precision and recall curves
        estimator instance (either sklearn.Pipeline, imblearn.Pipeline or a classifier)
        PRE-FITTED classifier or a PRE-FITTED Pipeline in which the last estimator is a classifier.
    **kwargs : The following options are available with kwargs
        fig_size : tuple
            Size (inches) of the plot.
        dpi : int, default = 100
            Image DPI.
        title : str
            The title of the plot.
        save_fig_path : str
            Full path where to save the plot. Will generate the folders if they don't exist already.

    Returns
    -------
    fig : Matplotlib.pyplot.Figure
        Figure from matplotlib
    ax : Matplotlib.pyplot.Axe
        Axe object from matplotlib

    Example Syntax
    --------------
    fig, ax = reporting.plot_precision_recall_curve(X_train, y_train, X_test, y_test,
                                                    rf_pipe, catboost_classifier,
                                                    fig_size=(10,16), dpi=100,
                                                    title="Precision-Recall Curve",
                                                    save_fig_path="dir1/dir2/precision_recall_curve.png")
    """
    if len(y_test.unique()) == 2:
        fig, ax = __plot_binary_precision_recall_curve(X_test, y_test, *args, **kwargs)
    else:
        fig, ax = __plot_multiclass_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs)
    return fig, ax
Syntax and Output for Binary Classification
fig, ax = plot_precision_recall_curve(X_train, y_train, X_test, y_test,
                                      rf_pipe, xgboost_classifier,
                                      fig_size=(10,8), dpi=100,
                                      title="Precision-Recall Curve",
                                      save_fig_path="dir1/dir2/precision_recall_curve.png")
Syntax and Output for Multi-class Classification
fig, ax = plot_precision_recall_curve(X_train, y_train, X_test, y_test,
                                      rf_pipe, catboost_classifier,
                                      fig_size=(10,16), dpi=100,
                                      title="Precision-Recall Curve",
                                      save_fig_path="dir1/dir2/precision_recall_curve.png")

Plotting the ROC curve for a multiclass problem

I am trying to apply the idea of sklearn's ROC extension to multiclass to my dataset. My per-class ROC curves each look kind of like a straight line, unlike the sklearn example where the curves fluctuate.
I give an MWE below to show what I mean:
# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# dummy dataset
X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04])
train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42)
# random forest model
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict(test)
The following function then plots the ROC curve:
def plot_roc_curve(y_test, y_pred):
    n_classes = len(np.unique(y_test))
    y_test = label_binarize(y_test, classes=np.arange(n_classes))
    y_pred = label_binarize(y_pred, classes=np.arange(n_classes))
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    # Plot all ROC curves
    # plt.figure(figsize=(10,5))
    plt.figure(dpi=600)
    lw = 2
    plt.plot(fpr["micro"], tpr["micro"],
             label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
             color="deeppink", linestyle=":", linewidth=4,)
    plt.plot(fpr["macro"], tpr["macro"],
             label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
             color="navy", linestyle=":", linewidth=4,)
    colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),)
    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()
Output:
plot_roc_curve(ytest, yhat)
It is kind of a straight line bending once. I would like to see the model performance at different thresholds, not just one; a figure similar to sklearn's illustration for 3 classes, shown below:
The point is that you're using predict() rather than predict_proba()/decision_function() to define your y_hat. Since the threshold vector is defined by the number of distinct values in y_hat (see here for reference), you'll have only a few thresholds per class on which tpr and fpr are computed, which in turn implies that your curves are evaluated at only a few points.
Indeed, consider what the doc says to pass to y_scores in roc_curve(): either probability estimates or decision values. In the example from sklearn, decision values are used to compute the scores. Given that you're considering a RandomForestClassifier(), using probability estimates in your y_hat should be the way to go.
What's the point then of label-binarizing the output? The standard definition of ROC is in terms of binary classification. To pass to a multiclass problem, you have to convert your problem into binary by using the OneVsAll approach, so that you'll have n_class ROC curves. (Observe, indeed, that as SVC() handles multiclass problems in an OvO fashion by default, in the example they had to force OvA by applying the OneVsRestClassifier constructor; with a RandomForestClassifier you don't have such a problem, as it is inherently multiclass, see here for reference.) In these terms, once you switch to predict_proba() you'll see there's not much sense in label-binarizing the predictions.
# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# dummy dataset
X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04])
train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42)
# random forest model
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict_proba(test)
def plot_roc_curve(y_test, y_pred):
    n_classes = len(np.unique(y_test))
    y_test = label_binarize(y_test, classes=np.arange(n_classes))
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    thresholds = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], y_pred[:, i], drop_intermediate=False)
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    # Plot all ROC curves
    # plt.figure(figsize=(10,5))
    plt.figure(dpi=600)
    lw = 2
    plt.plot(fpr["micro"], tpr["micro"],
             label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
             color="deeppink", linestyle=":", linewidth=4,)
    plt.plot(fpr["macro"], tpr["macro"],
             label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
             color="navy", linestyle=":", linewidth=4,)
    colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),)
    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()
Finally, note that roc_curve() also has a drop_intermediate parameter meant for dropping suboptimal thresholds (it might be useful to know about).
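For instance, a small self-contained illustration (toy data assumed here) of what drop_intermediate changes:
import numpy as np
from sklearn.metrics import roc_curve
rng = np.random.RandomState(0)
y_true = rng.randint(0, 2, size=200)  # toy binary labels
scores = rng.rand(200)                # toy scores in [0, 1]
# Keep every threshold vs. let roc_curve drop the suboptimal ones (the default)
fpr_full, tpr_full, thr_full = roc_curve(y_true, scores, drop_intermediate=False)
fpr_thin, tpr_thin, thr_thin = roc_curve(y_true, scores)
print(len(thr_full), "thresholds kept vs", len(thr_thin), "after dropping")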
Just to update on @amiola's answer: I had an issue with non-monotonic classes which led to very strange fuzzy results. In this case a little modification to the function above will work very well:
classes = sorted(list(y_test['label'].unique()))
Use this in the label_binarize line:
y_test = label_binarize(y_test, classes=classes)
And then when you need a range in the function, just use:
range(len(classes))
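A small self-contained sketch of how those pieces fit together (the toy labels below are made up, just to show that the class list is taken from the data itself rather than assumed to be 0..n_classes-1):
import pandas as pd
from sklearn.preprocessing import label_binarize
y_test = pd.DataFrame({'label': [2, 5, 9, 5, 2, 9, 9, 2]})  # non-contiguous class ids
classes = sorted(list(y_test['label'].unique()))
y_test_bin = label_binarize(y_test['label'], classes=classes)
print(classes)           # [2, 5, 9]
print(y_test_bin.shape)  # (8, 3) - one column per class, in sorted order
# ...and loops over classes then use range(len(classes))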
Thanks to @dx2-66 for the answer. You can check for more details here.

Scatter Plot of predicted vs actual value with regression curve

I am trying to use scatter plots with regression curves using the following code. I am using different algorithms like linear regression, SVM and Gaussian Process, etc. I have tried different options for plotting the data, as shown below.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
df = pd.read_excel('coded.xlsx')
dfnew=df[['FL','FW','TL','LL','KH']]
Y = df['KH']
X = df[['FL']]
X=X.values.reshape(len(X),1)
Y=Y.values.reshape(len(Y),1)
# Split the data into training/testing sets
X_train = X[:-270]
X_test = X[-270:]
# Split the targets into training/testing sets
Y_train = Y[:-270]
Y_test = Y[-270:]
#regressor = SVR(kernel = 'rbf')
#regressor.fit(X_train, np.ravel(Y_train))
#training the algorithm
regressor = GaussianProcessRegressor(random_state=42)
regressor.fit(X_train, Y_train)
y_pred = regressor.predict(X_test)
mse = np.sum((y_pred - Y_test)**2)
# root mean squared error
# m is the number of training examples
rmse = np.sqrt(mse/270)
print(rmse)
#X_grid = np.arange(min(X), max(X), 0.01) #this step required because data is feature scaled.
#X_grid = np.arange(0, 15, 0.01) #this step required because data is feature scaled.
#X_grid = X_grid.reshape((len(X_grid), 1))
#plt.scatter(X, Y, color = 'red')
print('size of Y_train = {0}'.format(Y_train.size))
print('size of y_pred = {0}'.format(y_pred.size))
#plt.scatter(Y_train, y_pred, color = 'red')
#plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
#plt.title('GPR')
#plt.xlabel('Measured')
#plt.ylabel('Predicted')
#plt.show()
fig, ax = plt.subplots(1, figsize=(12, 6))
plt.plot(X[:, 0], Y_train, marker='o', color='black', linewidth=0)
plt.plot(X[:, 0], y_pred, marker='x', color='steelblue')
plt.suptitle("$GaussianProcessRegressor(kernel=RBF)$ [default]", fontsize=20)
plt.axis('off')
pass
But I am getting an error like:
ValueError: x and y must have same first dimension, but have shapes (540,) and (270, 1)
What is the possible solution?
This code splits X and Y into training/testing sets, but then tries to plot a column from all of X with Y_train and y_pred, which have only half as many values as X. Try creating plots with X_train and X_test instead.
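For example, a hedged sketch of that fix (variable names follow the question's code, so this is not a standalone script):
# Plot the test-set feature against the measured and predicted targets,
# so x and y share the same first dimension (270 each).
fig, ax = plt.subplots(1, figsize=(12, 6))
ax.plot(X_test[:, 0], Y_test, marker='o', color='black', linewidth=0, label='measured')
ax.plot(X_test[:, 0], y_pred, marker='x', color='steelblue', linewidth=0, label='predicted')
ax.legend()
plt.show()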

How to plot ROC curve in Python

I am trying to plot a ROC curve to evaluate the accuracy of a prediction model I developed in Python using logistic regression packages. I have computed the true positive rate as well as the false positive rate; however, I am unable to figure out how to plot these correctly using matplotlib and calculate the AUC value. How could I do that?
Here are two ways you may try, assuming your model is an sklearn predictor:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = model.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# method II: ggplot
from ggplot import *
import pandas as pd  # pd is needed for the DataFrame below
df = pd.DataFrame(dict(fpr = fpr, tpr = tpr))
ggplot(df, aes(x = 'fpr', y = 'tpr')) + geom_line() + geom_abline(linetype = 'dashed')
or try
ggplot(df, aes(x = 'fpr', ymin = 0, ymax = 'tpr')) + geom_line(aes(y = 'tpr')) + geom_area(alpha = 0.2) + ggtitle("ROC Curve w/ AUC = %s" % str(roc_auc))
This is the simplest way to plot an ROC curve, given a set of ground truth labels and predicted probabilities. Best part is, it plots the ROC curve for ALL classes, so you get multiple neat-looking curves as well
import scikitplot as skplt
import matplotlib.pyplot as plt
y_true = # ground truth labels
y_probas = # predicted probabilities generated by sklearn classifier
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()
Here's a sample curve generated by plot_roc_curve. I used the sample digits dataset from scikit-learn so there are 10 classes. Notice that one ROC curve is plotted for each class.
Disclaimer: Note that this uses the scikit-plot library, which I built.
AUC curve For Binary Classification using matplotlib
from sklearn import svm, datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
Load Breast Cancer Dataset
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=44)
Model
clf = LogisticRegression(penalty='l2', C=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
Accuracy
print("Accuracy", metrics.accuracy_score(y_test, y_pred))
AUC Curve
y_pred_proba = clf.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
It is not at all clear what the problem is here, but if you have an array true_positive_rate and an array false_positive_rate, then plotting the ROC curve and getting the AUC is as simple as:
import matplotlib.pyplot as plt
import numpy as np
x = # false_positive_rate
y = # true_positive_rate
# This is the ROC curve
plt.plot(x,y)
plt.show()
# This is the AUC
auc = np.trapz(y,x)
Here is python code for computing the ROC curve (as a scatter plot):
import matplotlib.pyplot as plt
import numpy as np
score = np.array([0.9, 0.8, 0.7, 0.6, 0.55, 0.54, 0.53, 0.52, 0.51, 0.505, 0.4, 0.39, 0.38, 0.37, 0.36, 0.35, 0.34, 0.33, 0.30, 0.1])
y = np.array([1,1,0, 1, 1, 1, 0, 0, 1, 0, 1,0, 1, 0, 0, 0, 1 , 0, 1, 0])
# false positive rate
fpr = []
# true positive rate
tpr = []
# Iterate thresholds from 0.0, 0.01, ... 1.0
thresholds = np.arange(0.0, 1.01, .01)
# get number of positive and negative examples in the dataset
P = sum(y)
N = len(y) - P
# iterate through all thresholds and determine fraction of true positives
# and false positives found at this threshold
for thresh in thresholds:
    FP = 0
    TP = 0
    for i in range(len(score)):
        if (score[i] > thresh):
            if y[i] == 1:
                TP = TP + 1
            if y[i] == 0:
                FP = FP + 1
    fpr.append(FP/float(N))
    tpr.append(TP/float(P))
plt.scatter(fpr, tpr)
plt.show()
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
y_true = # true labels
y_probas = # predicted results
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_probas, pos_label=0)
# Print ROC curve
plt.plot(fpr,tpr)
plt.show()
# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)
Based on multiple comments from Stack Overflow, the scikit-learn documentation and some others, I made a Python package to plot ROC curves (and other metrics) in a really simple way.
To install the package: pip install plot-metric (more info at the end of the post)
To plot a ROC curve (the example comes from the documentation):
Binary classification
Let's load a simple dataset and make a train & test set :
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X, y = make_classification(n_samples=1000, n_classes=2, weights=[1,1], random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)
Train a classifier and predict test set :
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50, random_state=23)
model = clf.fit(X_train, y_train)
# Use predict_proba to predict probability of the class
y_pred = clf.predict_proba(X_test)[:,1]
You can now use plot_metric to plot ROC Curve :
from plot_metric.functions import BinaryClassification
# Visualisation with plot_metric
bc = BinaryClassification(y_test, y_pred, labels=["Class 1", "Class 2"])
# Figures
plt.figure(figsize=(5,5))
bc.plot_roc_curve()
plt.show()
Result :
You can find more examples on the GitHub page and in the documentation of the package:
Github : https://github.com/yohann84L/plot_metric
Documentation : https://plot-metric.readthedocs.io/en/latest/
The previous answers assume that you indeed calculated TP/Sens yourself. It's a bad idea to do this manually; it's easy to make mistakes with the calculations, so rather use a library function for all of this.
The plot_roc function in scikit-learn does exactly what you need:
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
The essential part of the code is:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
There is a library called metriculous that will do that for you:
$ pip install metriculous
Let's first mock some data, this would usually come from the test dataset and the model(s):
import numpy as np
def normalize(array2d: np.ndarray) -> np.ndarray:
    return array2d / array2d.sum(axis=1, keepdims=True)
class_names = ["Cat", "Dog", "Pig"]
num_classes = len(class_names)
num_samples = 500
# Mock ground truth
ground_truth = np.random.choice(range(num_classes), size=num_samples, p=[0.5, 0.4, 0.1])
# Mock model predictions
perfect_model = np.eye(num_classes)[ground_truth]
noisy_model = normalize(
    perfect_model + 2 * np.random.random((num_samples, num_classes))
)
random_model = normalize(np.random.random((num_samples, num_classes)))
Now we can use metriculous to generate a table with various metrics and diagrams, including ROC curves:
import metriculous
metriculous.compare_classifiers(
    ground_truth=ground_truth,
    model_predictions=[perfect_model, noisy_model, random_model],
    model_names=["Perfect Model", "Noisy Model", "Random Model"],
    class_names=class_names,
    one_vs_all_figures=True,  # This line is important to include ROC curves in the output
).save_html("model_comparison.html").display()
The ROC curves in the output:
The plots are zoomable and draggable, and you get further details when hovering with your mouse over the plot:
You can also follow the official documentation from scikit-learn:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
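On current scikit-learn versions, RocCurveDisplay wraps most of this up; a minimal sketch on binary toy data (the dataset and classifier here are assumptions for illustration):
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split
X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
RocCurveDisplay.from_estimator(clf, X_test, y_test)  # plots the ROC curve with the AUC in the legend
plt.show()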
I have made a simple function included in a package for the ROC curve. I just started practicing machine learning so please also let me know if this code has any problem!
Have a look at the github readme file for more details! :)
https://github.com/bc123456/ROC
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
def plot_ROC(y_train_true, y_train_prob, y_test_true, y_test_prob):
    '''
    A function to plot the ROC curve for train labels and test labels.
    Use the best threshold found in the train set to classify items in the test set.
    '''
    fpr_train, tpr_train, thresholds_train = roc_curve(y_train_true, y_train_prob, pos_label=True)
    sum_sensitivity_specificity_train = tpr_train + (1 - fpr_train)
    best_threshold_id_train = np.argmax(sum_sensitivity_specificity_train)
    best_threshold = thresholds_train[best_threshold_id_train]
    best_fpr_train = fpr_train[best_threshold_id_train]
    best_tpr_train = tpr_train[best_threshold_id_train]
    y_train = y_train_prob > best_threshold
    cm_train = confusion_matrix(y_train_true, y_train)
    acc_train = accuracy_score(y_train_true, y_train)
    auc_train = roc_auc_score(y_train_true, y_train)
    print('Train Accuracy: %s ' % acc_train)
    print('Train AUC: %s ' % auc_train)
    print('Train Confusion Matrix:')
    print(cm_train)
    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(121)
    curve1 = ax.plot(fpr_train, tpr_train)
    curve2 = ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
    dot = ax.plot(best_fpr_train, best_tpr_train, marker='o', color='black')
    ax.text(best_fpr_train, best_tpr_train, s='(%.3f,%.3f)' % (best_fpr_train, best_tpr_train))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve (Train), AUC = %.4f' % auc_train)
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test_true, y_test_prob, pos_label=True)
    y_test = y_test_prob > best_threshold
    cm_test = confusion_matrix(y_test_true, y_test)
    acc_test = accuracy_score(y_test_true, y_test)
    auc_test = roc_auc_score(y_test_true, y_test)
    print('Test Accuracy: %s ' % acc_test)
    print('Test AUC: %s ' % auc_test)
    print('Test Confusion Matrix:')
    print(cm_test)
    tpr_score = float(cm_test[1][1]) / (cm_test[1][1] + cm_test[1][0])
    fpr_score = float(cm_test[0][1]) / (cm_test[0][0] + cm_test[0][1])
    ax2 = fig.add_subplot(122)
    curve1 = ax2.plot(fpr_test, tpr_test)
    curve2 = ax2.plot([0, 1], [0, 1], color='navy', linestyle='--')
    dot = ax2.plot(fpr_score, tpr_score, marker='o', color='black')
    ax2.text(fpr_score, tpr_score, s='(%.3f,%.3f)' % (fpr_score, tpr_score))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve (Test), AUC = %.4f' % auc_test)
    plt.savefig('ROC', dpi=500)
    plt.show()
    return best_threshold
A sample roc graph produced by this code
When you need the probabilities as well... The following gets the AUC value and plots it all in one shot.
from sklearn.metrics import plot_roc_curve
plot_roc_curve(m,xs,y)
When you have the probabilities... you can't get the AUC value and the plot in one shot. Do the following:
from sklearn.metrics import roc_curve
fpr,tpr,_ = roc_curve(y,y_probas)
plt.plot(fpr,tpr, label='AUC = ' + str(round(roc_auc_score(y,m.oob_decision_function_[:,1]), 2)))
plt.legend(loc='lower right')
In my code, I have X_train and y_train and classes are 0 and 1. The clf.predict_proba() method computes probabilities for both classes for every data point. I compare the probability of class1 with different values of threshold.
probability = clf.predict_proba(X_train)
def plot_roc(y_train, probability):
    threshold_values = np.linspace(0, 1, 100)  # Threshold values range from 0 to 1
    FPR_list = []
    TPR_list = []
    for threshold in threshold_values:  # For every value of threshold
        y_pred = []  # Classify every data point in the test set
        # prob is an array of 2 values - the probability of the datapoint being in Class0 and in Class1.
        for prob in probability:
            if (prob[1]) < threshold:  # Prob of class1 (positive class)
                y_pred.append(0)
                continue
            elif (prob[1]) >= threshold:
                y_pred.append(1)
        # Compute the confusion matrix and obtain values of TP, FP, TN, FN
        c_m = confusion_matrix(y_train, y_pred)
        TN = c_m[0][0]
        FP = c_m[0][1]
        FN = c_m[1][0]
        TP = c_m[1][1]
        FPR = FP / (FP + TN)  # Obtain False Positive Rate
        TPR = TP / (TP + FN)  # Obtain True Positive Rate
        FPR_list.append(FPR)
        TPR_list.append(TPR)
    fig = plt.figure()
    plt.plot(FPR_list, TPR_list)
    plt.ylabel('TPR')
    plt.xlabel('FPR')
    plt.show()
A new open-source package I help maintain has many ways to test model performance. To see the ROC curve you can do:
from deepchecks.checks import RocReport
from deepchecks import Dataset
RocReport().run(Dataset(df, label='target'), model)
And the result looks like this:
A more elaborate example of RocReport can be found here
As the ROC curve is only for binary classification, binarize and ravel your data first:
# Binarize data for getting AUC
y_test_bin = label_binarize(y_test, classes=range(y_train.min() , y_train.max()))
y_pred_bin = label_binarize(Predicted_result, classes=range(y_train.min() , y_train.max()))
# Calculate FP , TP rate
fpr, tpr, _ = roc_curve(y_test_bin.ravel(), y_pred_bin.ravel() )
# Get AUC ,
auc = roc_auc_score(y_test_bin, y_pred_bin, average='micro', multi_class='ovr')
#create ROC curve
plt.plot(fpr,tpr , label= f"AUC = {auc}" , )
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC')
plt.legend(loc=7)
plt.show()
