I am new here, and this is my first question; I hope to get an answer from the experts. I have 5 classifier models whose confusion matrices I am trying to plot.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
# Candidate models, keyed by the label used when reporting each one.
# NOTE(review): GaussianNB is not among the imports shown with this snippet;
# it needs `from sklearn.naive_bayes import GaussianNB`.
classifiers = {
    "Naive Bayes": GaussianNB(),
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
}
and then
from sklearn.metrics import confusion_matrix
# Fit every model on the training split, predict the test split and print
# the resulting confusion matrix.
# NOTE: `cf_matrix` is overwritten on each pass, so only the last model's
# matrix survives the loop.
for key, classifier in classifiers.items():
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    print(cf_matrix)
which gives me
now I am trying to plot them with below code but no data is shown on the plots.
# One row of five heatmaps sharing both axes, plus a dedicated colorbar axis
# placed to the right of the panels.
fig, axn = plt.subplots(1, 5, sharex=True, sharey=True)
cbar_ax = fig.add_axes([.91, .3, .03, .4])

# NOTE(review): as posted, every panel draws the same `cf_matrix` and the
# color scale is clamped to [0, 1]; this is the behavior the question is
# asking about, so it is left unchanged here.
for i, ax in enumerate(axn.flat):
    sns.heatmap(cf_matrix, ax=ax,
                cbar=i == 0,  # only the first panel owns the colorbar
                vmin=0, vmax=1,
                cbar_ax=None if i else cbar_ax)

# Keep the right-hand strip of the figure free for the colorbar axis.
fig.tight_layout(rect=[0, 0, .9, 1])
Can someone please help me get this done?
sklearn provides plotting capability on confusion_matrix.
There are two ways to do it: `plot_confusion_matrix` and `ConfusionMatrixDisplay`.
I used the second way here, because removing colorbar was quite verbose in first way (having multiple colorbars looks very cluttered).
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Models to compare, keyed by the title shown above each confusion matrix.
classifiers = {
    "Naive Bayes": GaussianNB(),
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
}
# Load the iris data set and split it into training and test sets.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
# One panel per classifier, all in a single row with a shared y axis.
f, axes = plt.subplots(1, 5, figsize=(20, 5), sharey='row')

for i, (key, classifier) in enumerate(classifiers.items()):
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cf_matrix,
                                  display_labels=iris.target_names)
    disp.plot(ax=axes[i], xticks_rotation=45)
    disp.ax_.set_title(key)
    # Drop the per-panel colorbar and x label; a single shared colorbar and
    # a single x label are added to the figure after the loop.
    disp.im_.colorbar.remove()
    disp.ax_.set_xlabel('')
    if i != 0:
        # Only the left-most panel keeps its y label.
        disp.ax_.set_ylabel('')

f.text(0.4, 0.1, 'Predicted label', ha='left')
plt.subplots_adjust(wspace=0.40, hspace=0.1)

# Shared colorbar, taken from the last panel's image.
f.colorbar(disp.im_, ax=axes)
plt.show()
You need to store each confusion matrix somewhere. For example, if I use an example dataset:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Build standardized features and a one-column target frame from the
# breast-cancer data set, then hold out 20% as the test split.
data = load_breast_cancer()
scaler = StandardScaler()
X_df = pd.DataFrame(data.data, columns=data.feature_names)
X_df = scaler.fit_transform(X_df)
y_df = pd.DataFrame(data.target, columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, test_size=0.2, random_state=11)
And store it in a similar dictionary:
from sklearn.metrics import confusion_matrix
# One confusion matrix per classifier, stored under the classifier's label.
cf_matrix = dict.fromkeys(classifiers)
for key, classifier in classifiers.items():
    # y_train is a one-column DataFrame; flatten it to the 1-D array that
    # `fit` expects.
    y_pred = classifier.fit(X_train, y_train.values.ravel()).predict(X_test)
    cf_matrix[key] = confusion_matrix(y_test, y_pred)
Then you can plot it:
# Draw the stored matrices side by side; only the last (fifth) panel shows a
# colorbar.
fig, axn = plt.subplots(1, 5, sharex=True, sharey=True, figsize=(12, 2))
for i, (ax, matrix) in enumerate(zip(axn.flat, cf_matrix.values())):
    sns.heatmap(matrix, ax=ax, cbar=i == 4)
I have a problem with plotting decision regions for a Logistic Regression classification model. Can somebody help me and explain how to do that? I put the Colab link to this project here ->
Dataset from Kaggle ->
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions
# Console display settings for NumPy and pandas output.
np.set_printoptions(suppress=True, edgeitems=30, linewidth=100000,
                    formatter=dict(float=lambda x: f'{x:.8f}'))
desired_width = 320
pd.options.display.float_format = '{:,.8f}'.format
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 12)

# NOTE(review): the posted call had further arguments that were lost (most
# likely a sheet name); confirm against the workbook before relying on the
# default sheet.
raw_data = pd.read_excel(io='/content/Date_Fruit_Datasets.xlsx')
# Work on a copy so the frame as loaded stays untouched.
data = raw_data.copy()
# Creating data and target
X = data.drop(columns='Class')
y = data['Class']

# Encoding target
encoder = LabelEncoder()
y = encoder.fit_transform(y=y)

# Creating train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scaling data: fit on the training split only, then reuse for the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X=X_train)
X_test = scaler.transform(X=X_test)

# Creating classifier, fitting and predicting
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train, y=y_train)
y_pred = classifier.predict(X=X_test)
y_pred_proba = classifier.predict_proba(X=X_test)

# Checking final reports and scores
score = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred, target_names=encoder.classes_)
# NOTE: this rebinds the imported `confusion_matrix` function to its result,
# so the function cannot be called again after this line.
confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Compare y_true and y_pred in DataFrame
results = pd.DataFrame(data={
    'y_true': y_test,
    'y_pred': y_pred,
})

# Creating Data Frame with predict proba (one column per encoded class)
predict_proba = pd.DataFrame(data=classifier.predict_proba(X=X_test), columns=encoder.classes_)

# Saving results to csv
# NOTE(review): the statements that wrote the files were lost from the post.
# Plotting decision regions
# Every feature index from 1 to 33 is pinned to the same filler value and
# the same filler range.
value = 1.5
width = 0.75
filler_indices = range(1, 34)
filler_values = dict.fromkeys(filler_indices, value)
filler_ranges = dict.fromkeys(filler_indices, width)

plt.figure(figsize=(10, 8))
plot_decision_regions(
    X=X.values,
    y=y,
    clf=classifier,
    filler_feature_values=filler_values,
    filler_feature_ranges=filler_ranges,
    legend=2,
)
After using function plot_decision_regions PyCharm shows me error like:
UserWarning: No contour levels were found within the data range.
ax.contour(xx, yy, Z, cset.levels,
UserWarning: You passed a edgecolor/edgecolors ('black') for an unfilled marker ('x'). Matplotlib is ignoring the edgecolor in favor of the facecolor. This behavior may change in the future.
Let's take data
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# Feature matrix and target vector of the breast-cancer data set.
data = load_breast_cancer()
X = data.data
y = data.target
I want to create model using only first principal component and calculate AUC for it.
My work so far
# Standardize the features; the scaler has to be fitted before it can
# transform, hence fit_transform.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep only the first principal component of the standardized data.
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(X_scaled)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1'])

clf = LogisticRegression()
clf = clf.fit(principalDf, y)
# predict_proba returns one column per class, so `pred` is 2-D here.
pred = clf.predict_proba(principalDf)
But while I'm trying to use
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
Following error occurs :
y should be a 1d array, got an array of shape (569, 2) instead.
I tried to reshape my data
fpr, tpr, thresholds = metrics.roc_curve(y.reshape(1,-1), pred, pos_label=2)
But it didn't solve the issue (it outputs) :
multilabel-indicator format is not supported
Do you have any idea how can I perform AUC on this first principal component?
You may wish to try:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Chain scaling, PCA and the classifier so all three are fitted on the
# training split only.
scaler = StandardScaler()
pca = PCA(2)
clf = LogisticRegression()
ppl = Pipeline([("scaler", scaler), ("pca", pca), ("clf", clf)])
ppl.fit(X_train, y_train)

# roc_curve needs continuous scores, not hard labels: use the predicted
# probability of the positive class (label 1, the second column).
preds = ppl.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=1)

# NOTE: plot_roc_curve was removed in scikit-learn 1.2; on newer versions use
# metrics.RocCurveDisplay.from_estimator(ppl, X_test, y_test) instead.
metrics.plot_roc_curve(ppl, X_test, y_test)
The problem is that predict_proba returns a column for each class. Generally with binary classification, your classes are 0 and 1, so you want the probability of the second class, so it's quite common to slice as follows (replacing the last line in your code block):
pred = clf.predict_proba(principalDf)[:, 1]
I want to plot a confusion matrix to visualize the classifier's performance, but its accuracy and recall do not show.
Accuracy Screenshot
I don't see any data here, or any code either. Anyway, this works for me.
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
# Synthetic 10-class problem. make_classification requires
# n_classes * n_clusters_per_class <= 2 ** n_informative, so the default of
# 2 informative features is too small for 10 classes.
# NOTE(review): the value 12 is a reconstruction; any value >= 4 satisfies
# the constraint.
X, y = make_classification(n_samples=1000, n_features=30,
                           n_informative=12,
                           n_clusters_per_class=1, n_classes=10,
                           class_sep=2.0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y)
clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# classification_report takes (y_true, y_pred) in that order; as a dict it
# converts to a frame with one row per class plus the summary rows.
df = pd.DataFrame(classification_report(y_test,
                                        clf.predict(X_test), digits=2,
                                        output_dict=True)).T
df['support'] = df['support'].apply(int)

# Notebook-style display: colour the per-class metric cells. The Styler is
# only rendered when it is the last expression of a notebook cell.
df.style.background_gradient(cmap='viridis',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])

import seaborn as sns
sns.heatmap(df, annot=True)
I am using plot_confusion_matrix from sklearn.metrics. I want to represent those confusion matrices next to each other like subplots, how could I do this?
Let's use the good'ol iris dataset to reproduce this, and fit several classifiers to plot their respective confusion matrices with plot_confusion_matrix:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics import plot_confusion_matrix
# Feature matrix and target vector of the iris data set.
data = load_iris()
X = data.data
y = data.target
Set up -
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Four models, one per cell of the 2x2 subplot grid used for plotting.
# NOTE(review): only the first entry survived in the post; the other three
# are reconstructed from the snippet's imports.
classifiers = [LogisticRegression(solver='lbfgs'),
               AdaBoostClassifier(),
               GradientBoostingClassifier(),
               SVC()]
for cls in classifiers:
    cls.fit(X_train, y_train)
So the way you could compare all matrices at a glance is by creating a set of subplots with plt.subplots. Then iterate over both the axes objects and the trained classifiers (plot_confusion_matrix expects the trained classifier as input) and plot the individual confusion matrices:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# Pair each fitted classifier with one axes object and draw its matrix there.
# NOTE(review): the loop body was lost from the post; this is a minimal
# reconstruction. plot_confusion_matrix was removed in scikit-learn 1.2,
# where ConfusionMatrixDisplay.from_estimator replaces it.
for cls, ax in zip(classifiers, axes.flatten()):
    plot_confusion_matrix(cls, X_test, y_test, ax=ax)
    ax.title.set_text(type(cls).__name__)
plt.tight_layout()
plt.show()
If this is your desired output, this is my way to see multiple confusion matrices (confusion_matrix) side by side with ConfusionMatrixDisplay.
note: paste your own test and train data names in "metrics.confusion_matrix()" function.
# Test-set matrix on the left panel, training-set matrix on the right.
fig, ax = plt.subplots(1, 2)
metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_test, y_pred),
    display_labels=[False, True]).plot(ax=ax[0])
metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_train, y_train_pred),
    display_labels=[False, True]).plot(ax=ax[1])
This is a question regarding best practices for sklearn.
I am experimenting with SVMs using the iris dataset provided in the sklearn library. While using train_test_split, I was wondering which parameter to adjust to avoid overfitting. I was taught to adjust test_size (to roughly 0.3), but there is also a train_size parameter. Would it not make sense to adjust train_size to avoid overfitting, or am I misunderstanding something here?
I get similar results regardless of which parameter I adjust, but I don't know if that's always the case.
Appreciate any help. Thanks!
Here is the code I am currently working with:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Standardize the features; the scaler has to be fitted before it can
# transform, hence fit_transform.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)
df = pd.DataFrame(data=scaled_df, columns=iris.feature_names)

x = df
y = iris.target

#test_size is used here, but is swapped with train_size to experiment
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.33)

# Search grid over the SVC regularization strength and kernel coefficient.
c_param = np.arange(1, 100, 10)
gamma_param = np.arange(0.0001, 1, 0.001)
params = {'C': c_param, 'gamma': gamma_param}

grid = GridSearchCV(estimator=SVC(), param_grid=params, verbose=0)
grid_fit = grid.fit(x_train, y_train)
grid_pred = grid.predict(x_test)

print("Number of training records: ", len(x_train))
print("Number of test records: ", len(x_test))
print(classification_report(y_true=y_test, y_pred=grid_pred))
print(confusion_matrix(y_true=y_test, y_pred=grid_pred))