This question already has answers here:
Plot multiple confusion matrices with plot_confusion_matrix
(2 answers)
Closed 1 year ago.
I am new here. This is my first question that I hope to get an answer from experts. I have 5 classifier models that I am trying to plot their confusion matrix.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
classifiers = {
"Naive Bayes": GaussianNB(),
"LogisiticRegression": LogisticRegression(),
"KNearest": KNeighborsClassifier(),
"Support Vector Classifier": SVC(),
"DecisionTreeClassifier": DecisionTreeClassifier(),
}
and then
from sklearn.metrics import confusion_matrix
for key, classifier in classifiers.items():
y_pred = classifier.fit(X_train, y_train).predict(X_test)
cf_matrix=confusion_matrix(y_test, y_pred)
print(cf_matrix)
which gives me
now I am trying to plot them with below code but no data is shown on the plots.
fig, axn = plt.subplots(1,5, sharex=True, sharey=True)
cbar_ax = fig.add_axes([.91, .3, .03, .4])
for i, ax in enumerate(axn.flat):
sns.heatmap(cf_matrix, ax=ax,
cbar=i == 0,
vmin=0, vmax=1,
cbar_ax=None if i else cbar_ax)
fig.tight_layout(rect=[0, 0, .9, 1])
Can someone please help me get this done?
sklearn provides plotting capability on confusion_matrix.
There are two ways to do it,
plot_confusion_matrix
ConfusionMatrixDisplay
I used the second way here, because removing colorbar was quite verbose in first way (having multiple colorbars looks very cluttered).
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
classifiers = {
"Naive Bayes": GaussianNB(),
"LogisiticRegression": LogisticRegression(),
"KNearest": KNeighborsClassifier(),
"Support Vector Classifier": SVC(),
"DecisionTreeClassifier": DecisionTreeClassifier(),
}
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
f, axes = plt.subplots(1, 5, figsize=(20, 5), sharey='row')
for i, (key, classifier) in enumerate(classifiers.items()):
y_pred = classifier.fit(X_train, y_train).predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cf_matrix,
display_labels=iris.target_names)
disp.plot(ax=axes[i], xticks_rotation=45)
disp.ax_.set_title(key)
disp.im_.colorbar.remove()
disp.ax_.set_xlabel('')
if i!=0:
disp.ax_.set_ylabel('')
f.text(0.4, 0.1, 'Predicted label', ha='left')
plt.subplots_adjust(wspace=0.40, hspace=0.1)
f.colorbar(disp.im_, ax=axes)
plt.show()
You need to store the confusion matrix somewhere, so for if I use an example dataset:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
data = load_breast_cancer()
scaler = StandardScaler()
X_df = pd.DataFrame(data.data, columns=data.feature_names)
X_df = scaler.fit_transform(X_df)
y_df = pd.DataFrame(data.target, columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=11)
And store it in a similar dictionary:
from sklearn.metrics import confusion_matrix
cf_matrix = dict.fromkeys(classifiers.keys())
for key, classifier in classifiers.items():
y_pred = classifier.fit(X_train, y_train.values.ravel()).predict(X_test)
cf_matrix[key]=confusion_matrix(y_test, y_pred)
Then you can plot it:
fig, axn = plt.subplots(1,5, sharex=True, sharey=True,figsize=(12,2))
for i, ax in enumerate(axn.flat):
k = list(cf_matrix)[i]
sns.heatmap(cf_matrix[k], ax=ax,cbar=i==4)
ax.set_title(k,fontsize=8)
Related
I have a problem with plotting decision regions for Logistic Regression classification model. Can somebody help me and explain something how to do that? I put the colab link to this project here -> https://colab.research.google.com/drive/1JqFyoAk0zithy4esfjiyo6MdB12iBndi?usp=sharing
Dataset from Kaggle -> https://www.kaggle.com/datasets/muratkokludataset/date-fruit-datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions
np.set_printoptions(suppress=True, edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: f'{x:.8f}'))
np.random.seed(42)
sns.set()
desired_width = 320
pd.options.display.float_format = '{:,.8f}'.format
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 12)
raw_data = pd.read_excel(io='/content/Date_Fruit_Datasets.xlsx',
sheet_name='Date_Fruit_Datasets')
data = raw_data.copy()
data.head(n=10)
data.describe().transpose()
data.info()
data.shape
# Creating data and target
X = data.drop(columns='Class')
y = data['Class']
X.shape
y.shape
# Encoding target
encoder = LabelEncoder()
y = encoder.fit_transform(y=y)
# Creating train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Scalling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X=X_train)
X_test = scaler.transform(X=X_test)
# Creating classifier, fitting and predicting
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train, y=y_train)
y_pred = classifier.predict(X=X_test)
y_pred_proba = classifier.predict_proba(X=X_test)
# Checking finally reports and scores
score = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred, target_names=encoder.classes_)
confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Compare y_true and y_pred in DataFrame
results = pd.DataFrame(data={
'y_true': y_test,
'y_pred': y_pred
})
# Creating Data Frame with predict proba
predict_proba = pd.DataFrame(data=classifier.predict_proba(X=X_test), columns=encoder.classes_)
# Saving results to csv
results.to_csv(path_or_buf='/content/data_fruit_predictions.csv')
predict_proba.to_csv(path_or_buf='/content/data_fruit_predict_proba.csv')
# Plotting decision regions
value = 1.5
width = 0.75
plt.figure(figsize=(10, 8))
plot_decision_regions(X=X.values, y=y, clf=classifier,
filler_feature_values={i: value for i in range(1, 34)},
filler_feature_ranges={i: width for i in range(1, 34)}, legend=2)
plt.show()
After using function plot_decision_regions PyCharm shows me error like:
UserWarning: No contour levels were found within the data range.
ax.contour(xx, yy, Z, cset.levels,
and
UserWarning: You passed a edgecolor/edgecolors ('black') for an unfilled marker ('x'). Matplotlib is ignoring the edgecolor in favor of the facecolor. This behavior may change in the future.
ax.scatter(x=x_data,
Let's take data
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
data = load_breast_cancer()
X = data.data
y = data.target
I want to create model using only first principal component and calculate AUC for it.
My work so far
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X)
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents
, columns = ['principal component 1'])
clf = LogisticRegression()
clf = clf.fit(principalDf, y)
pred = clf.predict_proba(principalDf)
But while I'm trying to use
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
Following error occurs :
y should be a 1d array, got an array of shape (569, 2) instead.
I tried to reshape my data
fpr, tpr, thresholds = metrics.roc_curve(y.reshape(1,-1), pred, pos_label=2)
But it didn't solve the issue (it outputs) :
multilabel-indicator format is not supported
Do you have any idea how can I perform AUC on this first principal component?
You may wish to try:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
X,y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)
scaler = StandardScaler()
pca = PCA(2)
clf = LogisticRegression()
ppl = Pipeline([("scaler",scaler),("pca",pca),("clf",clf)])
ppl.fit(X_train, y_train)
preds = ppl.predict(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=1)
metrics.plot_roc_curve(ppl, X_test, y_test)
The problem is that predict_proba returns a column for each class. Generally with binary classification, your classes are 0 and 1, so you want the probability of the second class, so it's quite common to slice as follows (replacing the last line in your code block):
pred = clf.predict_proba(principalDf)[:, 1]
I want to plot a confusion matrix to visualize the classifer's performance, but it accuracy and recall does not show
Accuracy Screenshot
I don't see any data here, or any code either. Anyway, this works for me.
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
X, y = make_classification(n_samples=1000, n_features=30,
n_informative=12,
n_clusters_per_class=1, n_classes=10,
class_sep=2.0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, stratify=y)
clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
df = pd.DataFrame(classification_report(clf.predict(X_test),
y_test, digits=2,
output_dict=True)).T
df['support'] = df.support.apply(int)
df.style.background_gradient(cmap='viridis',subset=pd.IndexSlice['0':'9', :'f1-score'])
import seaborn as sns
sns.heatmap(df, annot=True)
I am using plot_confusion_matrix from sklearn.metrics. I want to represent those confusion matrices next to each other like subplots, how could I do this?
Let's use the good'ol iris dataset to reproduce this, and fit several classifiers to plot their respective confusion matrices with plot_confusion_matrix:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics import plot_confusion_matrix
data = load_iris()
X = data.data
y = data.target
Set up -
X_train, X_test, y_train, y_test = train_test_split(X, y)
classifiers = [LogisticRegression(solver='lbfgs'),
AdaBoostClassifier(),
GradientBoostingClassifier(),
SVC()]
for cls in classifiers:
cls.fit(X_train, y_train)
So the way you could compare all matrices at simple sight, is by creating a set of subplots with plt.subplots. Then iterate both over the axes objects and the trained classifiers (plot_confusion_matrix expects the as input) and plot the individual confusion matrices:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15,10))
for cls, ax in zip(classifiers, axes.flatten()):
plot_confusion_matrix(cls,
X_test,
y_test,
ax=ax,
cmap='Blues',
display_labels=data.target_names)
ax.title.set_text(type(cls).__name__)
plt.tight_layout()
plt.show()
if your desired output is that This is my way to see multiple confusion matrices (confusion_matrix) side by side with ConfusionMatrixDisplay.
note: paste your own test and train data names in "metrics.confusion_matrix()" function.
fig, ax = plt.subplots(1,2)
ax[0].set_title("test")
ax[1].set_title("train")
metrics.ConfusionMatrixDisplay(
confusion_matrix = metrics.confusion_matrix(y_test, y_pred),
display_labels = [False, True]).plot(ax=ax[0])
metrics.ConfusionMatrixDisplay(
confusion_matrix = metrics.confusion_matrix(y_train, y_train_pred),
display_labels = [False, True]).plot(ax=ax[1]);
This is a question regarding best practices for sklearn.
While experimenting with SVMs using the iris dataset provided in the sklearn library. While using train_test_split, I was wondering which parameter to adjust to avoid overfitting. I was taught to adjust test_size (roughly to ~0.3), but there is a train_size parameter. Would it not make sense to adjust the train_size to avoid overfitting, or am I misunderstanding something here?
I get similar results regardless of which parameter I adjust, but I don't know if that's always the case.
Appreciate any help. Thanks!
Here is the code I am currently working with:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
scaler = StandardScaler()
scaler.fit(df)
scaled_df = scaler.transform(df)
df = pd.DataFrame(data=scaled_df, columns=iris.feature_names)
x = df
y = iris.target
#test_size is used here, but is swapped with train_size to experiment
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.33)
c_param = np.arange(1, 100, 10)
gamma_param = np.arange(0.0001, 1, 0.001)
params = {'C':c_param, 'gamma':gamma_param}
grid = GridSearchCV(estimator=SVC(), param_grid=params, verbose=0)
grid_fit = grid.fit(x_train, y_train)
grid_pred = grid.predict(x_test)
print(grid.best_params_)
print('\n')
print("Number of training records: ", len(x_train))
print("Number of test records: ", len(x_test))
print('\n')
print(classification_report(y_true=y_test, y_pred=grid_pred))
print('\n')
print(confusion_matrix(y_true=y_test, y_pred=grid_pred))