Does my code have a bug or something else?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
file = 'https://aegis4048.github.io/downloads/notebooks/sample_data/unconv_MV_v5.csv'
myDF = pd.read_csv(file)
# Split the data into features and target
feature1 = "Brittle"
feature2 = "Por"
X = myDF[[feature1, feature2]]  # a pandas DataFrame with the two feature columns, not a NumPy array
X.info()  # note: info() prints its report and returns None, so don't wrap it in print()
y = myDF["Prod"]  # target column
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # consider random_state=... for a reproducible split
# Create a linear regression object
reg = LinearRegression()
# Fit the model to the training data
reg.fit(X_train, y_train)
# Predict the target variable using the test data
y_pred = reg.predict(X_test)
# Evaluate the model using mean squared error (MSE)
mse = np.mean((y_test - y_pred)**2)
print("Mean Squared Error: ", mse)
print("R2 Score:", reg.score(X_test, y_test))
#define figure size in (width, height) for all plots
plt.rcParams['figure.figsize'] = [10, 7]
# Create a mesh of values for the features
print(X_train.shape)  # X_train is a pandas DataFrame here, not a NumPy array
x1_min, x1_max = X_train[feature1].min(), X_train[feature1].max()
x2_min, x2_max = X_train[feature2].min(), X_train[feature2].max()
x1, x2 = np.meshgrid(np.linspace(x1_min, x1_max, 100), np.linspace(x2_min, x2_max, 100))
X_mesh = np.c_[x1.ravel(), x2.ravel()]
# Compute the predictions for the mesh of values
y_pred_mesh = reg.predict(X_mesh).reshape(x1.shape)
# Plot the predictions as a surface. Request 10 contour lines.
contours = plt.contourf(x1, x2, y_pred_mesh, 10, cmap='coolwarm', alpha=0.8) # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.contourf.html
# Scatter plot of the training data.
# The colors of the points don't mean much except to stand out from the background
plt.scatter(X_train[feature1], X_train[feature2], c=y_train, cmap='coolwarm', s=20)
# Label the contour lines
plt.clabel(contours, inline=1, fontsize=12, colors = "black")
# Label the plot
plt.xlabel(feature1)
plt.ylabel(feature2)
plt.title('Multivariate Linear Regression Contour Plot')
# Show the plot
plt.show()
The output: [contour plot image not reproduced here]
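As a sanity check (not part of the original code), the hand-rolled MSE can be compared against scikit-learn's own metrics; a minimal sketch, assuming reg, y_test, and y_pred from above:

from sklearn.metrics import mean_squared_error, r2_score

print("MSE:", mean_squared_error(y_test, y_pred))  # should match np.mean((y_test - y_pred)**2)
print("R2 :", r2_score(y_test, y_pred))            # should match reg.score(X_test, y_test)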
My goal is to find the precision-recall curve, comparing with Logistic Regression and Random Forest and plotting them in one graph. I wanted to know if I used the right steps to create a plot to compare both classifiers.
I appreciate all the help!
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # needed for the plt.* calls below
from matplotlib import pyplot    # the code below uses both the plt and pyplot aliases
from sklearn.preprocessing import MultiLabelBinarizer as mlb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
# df is assumed to be a pre-loaded DataFrame containing the dummy-coded columns
X = df[["DIAGNOSIS_CD_Dummy"]]
y = df[["TEST_RESULT_Dummy"]]
# X = pd.DataFrame(df.iloc[:, -1])
# y = pd.DataFrame(df.iloc[:, :-1])
# raw confusion matrix
df = df[["DIAGNOSIS_CD_Dummy", "TEST_RESULT_Dummy"]]  # keep only the two columns of interest
raw_cm = pd.crosstab(
    df["TEST_RESULT_Dummy"],
    df["DIAGNOSIS_CD_Dummy"],
    rownames=["Test Result"],
    colnames=["Diagnosis"],
)  # named raw_cm so it does not shadow sklearn's confusion_matrix function
print(raw_cm)
# Logistic Regression Confusion Matrix
# (all required imports were already done above)
# split into training and test using scikit
X_train, X_test, y_train, y_test = train_test_split(
X, y.values.ravel(), test_size=0.3, random_state=1, stratify=y
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
# use logistic regression model to make predictions
y_score = log_model.predict_proba(X_test)[:, 1]
y_pred = log_model.predict(X_test)  # predict() already returns class labels; no rounding needed
cm = confusion_matrix(y_test, y_pred)  # avoid rebinding the confusion_matrix function name
print("\n")
print(cm)
print("\n")
print(classification_report(y_test, y_pred, zero_division=0))
# calculate precision and recall
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
# create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color="purple")
# add axis labels to plot
ax.set_title("Precision-Recall Curve")
ax.set_ylabel("Precision")
ax.set_xlabel("Recall")
# display plot
plt.show()
# precision-recall curve
# re-use the two dummy-coded columns from above
X = df[["DIAGNOSIS_CD_Dummy"]]
y = df[["TEST_RESULT_Dummy"]]
# X = pd.DataFrame(df.iloc[:, :-1])
# y = pd.DataFrame(df.iloc[:, -1])
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(
X, y.values.ravel(), test_size=0.3, random_state=2
)
# fit a model
model = LogisticRegression(solver="lbfgs")
model.fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
# probs_rf = model_rf.predict_proba(testX)[:, 1]
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = model.predict(testX)
lr_precision, lr_recall, _ = precision_recall_curve(testy, lr_probs)
lr_f1, lr_auc = f1_score(testy, yhat), auc(lr_recall, lr_precision)
# precision_rf, recall_rf, _ = precision_recall_curve(testy, probs_rf)
# f1_rf, auc_rf = f1_score(testy, yhat), auc(recall_rf, precision_rf)
# auc_rf = auc(recall_rf, precision_rf)
# summarize scores
print("Logistic: f1=%.3f auc=%.3f" % (lr_f1, lr_auc))
# plot the precision-recall curves
no_skill = len(testy[testy == 1]) / len(testy)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.plot(lr_recall, lr_precision, marker=".", label=f"Logistic (AUC = {lr_auc:.2f})")
# axis labels
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
# Random Forest
model_rf = RandomForestClassifier()
model_rf.fit(trainX, trainy)
# model_rf = RandomForestClassifier().fit(trainX, trainy)
# predict probabilities and keep those for the positive outcome only
probs_rf = model_rf.predict_proba(testX)[:, 1]
# predict class values with the random forest (not the logistic regression model)
yhat_rf = model_rf.predict(testX)
precision_rf, recall_rf, _ = precision_recall_curve(testy, probs_rf)
f1_rf, auc_rf = f1_score(testy, yhat_rf), auc(recall_rf, precision_rf)
print("Random Forest: f1=%.3f auc=%.3f" % (f1_rf, auc_rf))
# plot the random forest precision-recall curve
no_skill = len(testy[testy == 1]) / len(testy)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.plot(recall_rf, precision_rf, marker=".", label=f"Random Forest (AUC = {auc_rf:.2f})")
# axis labels
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Output:
Diagnosis        0    1
Test Result
0            18385   32
1             1268  165

[[5514   11]
 [ 374   56]]

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5525
           1       0.84      0.13      0.23       430

    accuracy                           0.94      5955
   macro avg       0.89      0.56      0.60      5955
weighted avg       0.93      0.94      0.91      5955
Logistic: f1=0.193 auc=0.488
Random Forest: f1=0.193 auc=0.488
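To address the stated goal directly, both curves can be drawn on the same axes before a single show() call. A minimal sketch, assuming the fitted model (logistic regression), model_rf, and the testX/testy split from the code above:

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc

lr_precision, lr_recall, _ = precision_recall_curve(testy, model.predict_proba(testX)[:, 1])
rf_precision, rf_recall, _ = precision_recall_curve(testy, model_rf.predict_proba(testX)[:, 1])
no_skill = len(testy[testy == 1]) / len(testy)  # baseline: positive-class prevalence
plt.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
plt.plot(lr_recall, lr_precision, label=f"Logistic (AUC = {auc(lr_recall, lr_precision):.2f})")
plt.plot(rf_recall, rf_precision, label=f"Random Forest (AUC = {auc(rf_recall, rf_precision):.2f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.show()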
This is my attempt to plot it.
import pathlib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from itertools import cycle
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize
def __plot_binary_precision_recall_curve(X_test, y_test, *args, **kwargs):
"""
Private function to be used by plot_precision_recall_curve for binary applications.
"""
if 'fig_size' in kwargs and 'dpi' in kwargs:
fig, ax = plt.subplots(figsize=kwargs['fig_size'], dpi=kwargs['dpi'])
else:
fig, ax = plt.subplots()
plt.rcParams["figure.facecolor"] = 'white'
plt.rcParams["axes.facecolor"] = 'white'
plt.rcParams["savefig.facecolor"] = 'white'
ax.xaxis.set_major_locator(MultipleLocator(0.1))
ax.xaxis.set_major_formatter('{x:.1f}')
ax.yaxis.set_major_locator(MultipleLocator(0.1))
ax.yaxis.set_major_formatter('{x:.1f}')
ax.xaxis.set_minor_locator(MultipleLocator(0.05))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))
ax.tick_params(which='both', width=2)
ax.tick_params(which='major', length=7)
ax.tick_params(which='minor', length=4, color='black')
plt.grid(True, zorder=0)
    # "Luck" is drawn here as the (0,1)-(1,0) diagonal; note that the true
    # no-skill baseline in PR space is a horizontal line at the positive-class prevalence
    plt.plot([0, 1], [1, 0], linestyle='--', lw=1, color='k',
             label='Luck', alpha=.8, zorder=1)  # random prediction curve
    # perfect model: precision stays at 1.0 up to recall 1.0, then drops
    plt.plot([0, 1], [1, 1], c='k', linestyle='dashdot', zorder=2, label="Perfect model")
    plt.plot([1, 1], [1, 0], c='k', linestyle='dashdot', zorder=2)
f_scores = np.linspace(0.2, 0.8, num=4)
lines, labels = [], []
for f_score in f_scores:
x = np.linspace(0.01, 1)
y = f_score * x / (2 * x - f_score)
(l,) = plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
plt.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))
zorder = 3
for classifier in args:
display = PrecisionRecallDisplay.from_estimator(classifier, X_test, y_test, ax=ax, zorder=zorder)
zorder +=1
# add the legend for the iso-f1 curves
handles, labels = display.ax_.get_legend_handles_labels()
handles.extend([l])
labels.extend(["iso-f1 curves"])
# set the legend and the axes
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.legend(handles=handles, labels=labels, loc="best")
plt.xlabel('Recall', fontsize=18)
plt.ylabel('Precision', fontsize=18)
if 'title' in kwargs:
ax.set_title(kwargs['title'], fontsize=18)
else:
ax.set_title("Precision-Recall Curve", fontsize=18)
if 'save_fig_path' in kwargs:
path = pathlib.Path(kwargs['save_fig_path'])
path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(kwargs['save_fig_path'], dpi=kwargs.get('dpi', 100), facecolor=fig.get_facecolor(), edgecolor='none')  # .get avoids a KeyError when dpi isn't passed
return fig, ax
def __plot_multiclass_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs):
"""
Private function designed to be used by plot_precision_recall_curve for multiclass applications.
"""
my_vals = y_test.unique().tolist()
my_vals.sort()
# binarize the y_test series
y_test = label_binarize(y_test, classes=my_vals)
n_classes = y_test.shape[1]
# setup plot details
colors = cycle(["navy", "turquoise", "darkorange", "cornflowerblue", "teal"])
    if 'fig_size' in kwargs and 'dpi' in kwargs:
        fig, ax = plt.subplots(len(args), figsize=kwargs['fig_size'], dpi=kwargs['dpi'], facecolor='white')
    else:
        fig, ax = plt.subplots(len(args), facecolor='white')
    ax = np.atleast_1d(ax)  # with a single estimator, subplots() returns a bare Axes; make it indexable
for count, clfs in enumerate(args):
ax[count].xaxis.set_major_locator(MultipleLocator(0.1))
ax[count].xaxis.set_major_formatter('{x:.1f}')
ax[count].yaxis.set_major_locator(MultipleLocator(0.1))
ax[count].yaxis.set_major_formatter('{x:.1f}')
ax[count].xaxis.set_minor_locator(MultipleLocator(0.05))
ax[count].yaxis.set_minor_locator(MultipleLocator(0.05))
ax[count].tick_params(which='both', width=2)
ax[count].tick_params(which='major', length=7)
ax[count].tick_params(which='minor', length=4, color='black')
ax[count].grid(True, zorder=0)
        # "Luck" is drawn here as the (0,1)-(1,0) diagonal; the true no-skill PR
        # baseline is a horizontal line at the positive-class prevalence
        ax[count].plot([0, 1], [1, 0], linestyle='--', lw=1, color='k',
                       label='Luck', alpha=.8, zorder=1)  # random prediction curve
        # perfect model: precision stays at 1.0 up to recall 1.0, then drops
        ax[count].plot([0, 1], [1, 1], c='k', linestyle='dashdot', zorder=2, label="Perfect model")
        ax[count].plot([1, 1], [1, 0], c='k', linestyle='dashdot', zorder=2)
# set up the model, wrapped by the OneVsRestClassifier
classifier = OneVsRestClassifier(clfs)
classifier.fit(X_train, y_train) # train the model
# produce the predictions (as probabilities)
y_score = classifier.predict_proba(X_test)
# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])
average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])
# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(
y_test.ravel(), y_score.ravel()
)
average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")
f_scores = np.linspace(0.2, 0.8, num=4)
lines, labels = [], []
for f_score in f_scores:
x = np.linspace(0.01, 1)
y = f_score * x / (2 * x - f_score)
(l,) = ax[count].plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
ax[count].annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))
display = PrecisionRecallDisplay(
recall=recall["micro"],
precision=precision["micro"],
average_precision=average_precision["micro"],
)
display.plot(ax=ax[count], name="Micro-average precision-recall", color="gold")
for i, color in zip(range(n_classes), colors):
display = PrecisionRecallDisplay(
recall=recall[i],
precision=precision[i],
average_precision=average_precision[i],
)
display.plot(ax=ax[count], name=f"Precision-recall for class {i}", color=color)
# add the legend for the iso-f1 curves
handles, labels = display.ax_.get_legend_handles_labels()
handles.extend([l])
labels.extend(["iso-f1 curves"])
# set the legend and the axes
ax[count].set_xlim([0.0, 1.0])
ax[count].set_ylim([0.0, 1.05])
ax[count].legend(handles=handles, labels=labels, loc="best")
        if isinstance(clfs, Pipeline):
            estimator_name = type(clfs[-1]).__name__  # the last pipeline step is the classifier
        else:
            estimator_name = type(clfs).__name__
if 'title' in kwargs:
ax[count].set_title(kwargs['title'] + " - " + estimator_name, fontsize=18)
else:
ax[count].set_title("Precision-Recall Curve" + " - " + estimator_name, fontsize=18)
ax[count].set_xlabel('Recall', fontsize=18)
ax[count].set_ylabel('Precision', fontsize=18)
if 'save_fig_path' in kwargs:
path = pathlib.Path(kwargs['save_fig_path'])
path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(kwargs['save_fig_path'], dpi=kwargs.get('dpi', 100), facecolor=fig.get_facecolor(), edgecolor='none')  # .get avoids a KeyError when dpi isn't passed
return fig, ax
def plot_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs):
"""
Plots precision recall curves for the given models
    Parameters
    ----------
    X_train : pandas.DataFrame of shape (n_samples, n_features)
        Training values (used only in the multiclass case, where each estimator is wrapped in a OneVsRestClassifier and re-fitted).
    y_train : pandas.Series of shape (n_samples,)
        Training target values.
    X_test : pandas.DataFrame of shape (n_samples, n_features)
        Test values.
    y_test : pandas.Series of shape (n_samples,)
        Target values.
*args : estimators to plot precision and recall curves
estimator instance (either sklearn.Pipeline, imblearn.Pipeline or a classifier)
PRE-FITTED classifier or a PRE-FITTED Pipeline in which the last estimator is a classifier.
**kwargs : The following options are available with kwargs
fig_size : tuple
Size (inches) of the plot.
dpi : int, default = 100
Image DPI.
title : str
The title of the plot.
save_fig_path : str
Full path where to save the plot. Will generate the folders if they don't exist already.
    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure from matplotlib
    ax : matplotlib.axes.Axes
        Axes object from matplotlib
Example Syntax
--------------
fig, ax = reporting.plot_precision_recall_curve(X_train, y_train, X_test, y_test,
rf_pipe, catboost_classifier,
fig_size=(10,16), dpi=100,
title="Precision-Recall Curve",
save_fig_path="dir1/dir2/precision_recall_curve.png")
"""
if (len(y_test.unique()) == 2):
fig, ax = __plot_binary_precision_recall_curve(X_test, y_test, *args, **kwargs)
else:
fig, ax = __plot_multiclass_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs)
return fig, ax
Syntax and Output for Binary Classification
fig, ax = plot_precision_recall_curve(X_train, y_train, X_test, y_test,
rf_pipe, xgboost_classifier,
fig_size=(10,8), dpi=100,
title="Precision-Recall Curve",
save_fig_path="dir1/dir2/precision_recall_curve.png")
Syntax and Output for Multi-class Classification
fig, ax = plot_precision_recall_curve(X_train, y_train, X_test, y_test,
rf_pipe, catboost_classifier,
fig_size=(10,16), dpi=100,
title="Precision-Recall Curve",
save_fig_path="dir1/dir2/precision_recall_curve.png")
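As an aside, for the binary case scikit-learn (1.0+) can draw the same overlay with far less code; a sketch, assuming two pre-fitted binary classifiers clf_a and clf_b (hypothetical names) and the X_test/y_test split:

import matplotlib.pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay

fig, ax = plt.subplots()
# clf_a and clf_b are hypothetical pre-fitted binary classifiers
PrecisionRecallDisplay.from_estimator(clf_a, X_test, y_test, ax=ax, name="Model A")
PrecisionRecallDisplay.from_estimator(clf_b, X_test, y_test, ax=ax, name="Model B")
ax.set_title("Precision-Recall Curve")
plt.show()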
# This set will get us started, but you will need to add
# others.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
# We can now access the dataset
wine = load_wine()
X = wine.data
y = wine.target
nclasses = len(wine.target_names)
colors = "bry"
h = .02 # step size in the mesh
# Split the data into a training set and a test set - this is
# where your implementation will need to start. Maybe you
# will need to work with the train_test_split( ... ) function
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Standardize
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
X_train = (X_train - mean) / std
# Next we can look at the cross validation. Remember we are
# selecting the two best features through this process.
# Take a look at tutorial 4 for an example implementation
best_performance = 0
best_mean_f1 = 0
best_f1 = 0
best_f2 = 0
# We will be testing every combination of pairs of features
for f1 in range(0,13):
for f2 in range(0,13):
# We want 2 features, not 1
if f1 == f2:
continue
features_idx_to_use = [f1,f2]
clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_train[:,features_idx_to_use], y_train)
        # Return the predictions from 3-fold cross-validation
        y_predicted = cross_val_predict(clf, X_train[:,features_idx_to_use], y_train, cv=3)
        # Construct the confusion matrices
        conf_mat_train = confusion_matrix(y_train, y_predicted)
# Print out the recall, precision and F1 scores
# There will be a value for each class
# CV Train
print("CV Train:",f1,":",f2," - ", recall_score(y_train,y_predicted,average=None))
print("CV Train:",f1,":",f2," - ",precision_score(y_train,y_predicted,average=None))
print("CV Train:",f1,":",f2," - ",f1_score(y_train,y_predicted,average=None))
current_f1 = np.mean(f1_score(y_train,y_predicted,average=None))
if current_f1 > best_mean_f1:
best_f1 = f1
best_f2 = f2
best_mean_f1 = current_f1
best_clf = clf
# Standardise the test set with the training mean and std BEFORE evaluating on it
X_test = (X_test - mean) / std
fig, ax = plt.subplots()
disp = plot_confusion_matrix(best_clf, X_test[:,[best_f1, best_f2]], y_test,
                             display_labels=wine.target_names,
                             cmap=plt.cm.Blues, ax=ax)
ax.set_title('Testing')
# Once you have selected the best performing set of features
# in the cross-validation, we can test the best performing
# classifier
# Now we need to test our classifier using the test set.
# Recall that we standardised the data - X_test was already scaled above
# with the stored mean and standard deviation from the training set.
y_test_predicted = best_clf.predict(X_test[:,[best_f1, best_f2]])
conf_mat_test = confusion_matrix(y_test, y_test_predicted)
# Test
print("Test:",recall_score(y_test,y_test_predicted,average=None))
print("Test:",precision_score(y_test,y_test_predicted,average=None))
print("Test:",f1_score(y_test,y_test_predicted,average=None))
y_score = best_clf.decision_function(X_test[:,[best_f1, best_f2]])
# Now we can plot a ROC curve and calculate the AUC
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(nclasses):
fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
    roc_auc[i] = auc(fpr[i], tpr[i])
# Once we have finished with the performance, we can plot the
# classifier boundaries and test points. This one is
# left to you ;)
h = .02 # step size in the mesh
# create a mesh to plot in
x_min, x_max = X_test[:, best_f1].min() - 1, X_test[:, best_f1].max() + 1
y_min, y_max = X_test[:, best_f2].min() - 1, X_test[:, best_f2].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
fig, ax = plt.subplots(figsize=(10,10))
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = best_clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z,cmap=plt.get_cmap('plasma'))
colors = ['cyan','orange']
# Plot also the training points
for i, color in zip(best_clf.classes_, colors):
idx = np.where(y_test == i)
ax.scatter(X_test[idx, best_f1], X_test[idx, best_f2], c=color, label=wine.target_names[i],
cmap=plt.get_cmap('plasma'), edgecolor='black', s=20)
ax.set_title("Decision surface of Binary SGD")
ax.axis('tight')
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = best_clf.coef_
intercept = best_clf.intercept_
# Lets make a function to plot the hyperplanes used by the SVM for
# Classification.
def plot_hyperplane(c, color):
def line(x0):
return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
plt.plot([xmin, xmax], [line(xmin), line(xmax)],
ls="--", color=color)
plot_hyperplane(0, "red")
My error is
Traceback (most recent call last):
File "C:\Users\Doug.spyder-py3\temp.py", line 120, in
fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\metrics_ranking.py", line 913, in roc_curve
fps, tps, thresholds = _binary_clf_curve(
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\metrics_ranking.py", line 691, in _binary_clf_curve
raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported
And the problem line is fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
Can someone explain what I am missing, please? I know it has something to do with the kind of data I am passing in, but I am not sure what.
First, since you do not use fpr and tpr later, you can merge roc_curve and auc into roc_auc_score, which you already imported.
See the docs.
If you are performing multiclass classification (more than two classes), you need to provide the multi_class parameter.
ovo and ovr give different results; see here.
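A minimal sketch of that replacement (note that for more than two classes, roc_auc_score expects class probabilities, e.g. from predict_proba, rather than raw decision_function scores):

from sklearn.metrics import roc_auc_score

# y_score: shape (n_samples, n_classes) of probabilities, e.g. clf.predict_proba(X_test)
# the multi_class parameter must be set explicitly for more than two classes
print(roc_auc_score(y_test, y_score, multi_class="ovr"))  # or multi_class="ovo"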
Maybe relevant:
ROC for multiclass classification
How to fix ValueError: multiclass format is not supported
I want to plot figures with different values of k for a k-NN classifier.
My problem is that the figures all seem to use the same value of k.
What I have tried so far is to change the value of k on each pass through the loop:
clf = KNeighborsClassifier(n_neighbors=counter+1)
But all the figures seem to be for k=1.
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
import numpy as np
from sklearn.model_selection import train_test_split
c = np.array([1 if y > np.median(data['target']) else 0 for y in data['target']])
X_train, X_test, c_train, c_test = train_test_split(data['data'], c, random_state=0)
from sklearn.neighbors import KNeighborsClassifier
import mglearn
import matplotlib.pyplot as plt
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
for counter in range(3):
clf = KNeighborsClassifier(n_neighbors=counter+1)
clf.fit(X_test, c_test)
plt.tight_layout() # this will help create proper spacing between the plots.
mglearn.discrete_scatter(X_test[:,0], X_test[:,1], c_test, ax=ax[counter])
plt.legend(["Class 0", "Class 1"], loc=4)
plt.xlabel("First feature")
plt.ylabel("Second feature")
#plt.figure()
The reason why all the plots look the same is that you are simply plotting the test set every time instead of plotting the model predictions on the test set. You probably meant to do the following for each value of k:
Fit the model to the training set, in which case you should replace clf.fit(X_test, c_test) with clf.fit(X_train, c_train).
Generate the model predictions on the test set, in which case you should add c_pred = clf.predict(X_test).
Plot the model predictions on the test set, in which case you should replace c_test with c_pred in the scatter plot, i.e. use mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], c_pred, ax=ax[counter]) instead of mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], c_test, ax=ax[counter]).
Updated code:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import mglearn
import matplotlib.pyplot as plt
data = fetch_california_housing()
c = np.array([1 if y > np.median(data['target']) else 0 for y in data['target']])
X_train, X_test, c_train, c_test = train_test_split(data['data'], c, random_state=0)
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
for counter in range(3):
clf = KNeighborsClassifier(n_neighbors=counter+1)
# fit the model to the training set
clf.fit(X_train, c_train)
# extract the model predictions on the test set
c_pred = clf.predict(X_test)
# plot the model predictions
plt.tight_layout()
mglearn.discrete_scatter(X_test[:,0], X_test[:,1], c_pred, ax=ax[counter])
plt.legend(["Class 0", "Class 1"], loc=4)
plt.xlabel("First feature")
plt.ylabel("Second feature")
I have made a simple linear regression model:
LR = LinearRegression()
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=12)  # random_state requires shuffle=True in recent scikit-learn
result_kfold = model_selection.cross_val_score(LR, X_train, Y_train, cv=kfold, scoring = 'r2')
print("Accuracy: %.2f%%" % (result_kfold.mean()*100.0))
LR.fit(X_train,Y_train)
Y_pred = LR.predict(X_test)
print("Y_pred:", Y_pred)
I want to plot the residual errors. I've used 'residplot' for this, but I'm not sure if I've passed the right arguments. According to the documentation, we have to pass the predictor variable and the response variable.
Here's the code:
sns.set(style="whitegrid")
sns.residplot(Y_test, Y_pred, lowess=True, color="g")
Can anyone please tell me if this is right? Also, what should the labels of the X and Y axes be?
Thank you in advance for the help!
You are plotting something very weird, so let's use an example dataset:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
iris = sns.load_dataset('iris')
X_train, X_test, Y_train, Y_test = train_test_split(iris.iloc[:,:3], iris.iloc[:,3],random_state=11)
LR = LinearRegression()
LR.fit(X_train,Y_train)
Y_pred = LR.predict(X_test)
If you just want to plot the residuals, you can do:
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize =(5,5))
sns.regplot(x=Y_pred,y=Y_test-Y_pred,ax=ax,lowess=True)
ax.set(ylabel='residuals',xlabel='fitted values')
What you are getting with sns.residplot() is the y variable regressed onto the x variable, with the residuals of that internal fit being plotted. That makes no sense in your case. I illustrate below how the plot is obtained: first you fit the prediction (y variable) to the actual values (x variable) and take the residuals:
plotfit = LinearRegression()
plotfit.fit(Y_test.to_numpy().reshape(-1,1),Y_pred)
residual = Y_pred - plotfit.predict(Y_test.to_numpy().reshape(-1,1))
Then plotting it gives you exactly the same thing as your sns.residplot:
sns.set(style="whitegrid")
fig, ax = plt.subplots(1,2,figsize =(10,5))
sns.residplot(Y_test,Y_pred,lowess=True, color="g",ax=ax[0])
ax[0].set_xlim(0,2.5)
sns.regplot(x=Y_test, y=residual, lowess=True, ax=ax[1])
ax[1].set_xlim(0,2.5)
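One version-dependent caveat: recent seaborn releases (0.12+) require the data arguments to be passed by keyword, so a residplot call like the one above would need to read, for example:

sns.residplot(x=Y_test, y=Y_pred, lowess=True, color="g", ax=ax[0])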