Plot confusion matrix sklearn with multiple labels - python

I am plotting a confusion matrix for multi-label data, where the labels look like this:
label1: 1, 0, 0, 0
label2: 0, 1, 0, 0
label3: 0, 0, 1, 0
label4: 0, 0, 0, 1
I am able to classify successfully using the below code. I only need some help to plot confusion matrix.
# Train the same estimator once per label column and store, for every test
# sample, column 1 of predict_proba (the second entry of lr.classes_).
n_labels = 4
for col in range(n_labels):
    y_train = y[:, col]
    print('Train subject %d, class %s' % (subject, cols[col]))
    # Only every `sample`-th row is used for fitting.
    X_sub = X_train[::sample, :]
    y_sub = y_train[::sample]
    lr.fit(X_sub, y_sub)
    proba = lr.predict_proba(X_test)
    pred[:, col] = proba[:, 1]
I used the following code to print the confusion matrix, but it always returns a 2x2 matrix:
# Predict on the training rows and print the matrix against y_train.
prediction = lr.predict(X_train)
train_cm = confusion_matrix(y_train, prediction)
print(train_cm)

I found a function that can plot a confusion matrix generated by sklearn.
import numpy as np
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Given a sklearn confusion matrix (cm), make a nice plot.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to keep matplotlib's default ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues ('Blues' when None)

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  rows without any sample are shown as zeros)

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    import itertools

    import matplotlib.pyplot as plt
    import numpy as np

    cm = np.asarray(cm)

    # Accuracy has to come from the raw counts, so compute it before normalizing.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalize BEFORE drawing so the cell colours and the text-colour
        # threshold below are based on the same values.  Rows whose sum is 0
        # are left as zeros instead of producing NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    # Grid lines inherited from a global style would be drawn over the cells.
    plt.grid(False)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # matplotlib takes (x, y) = (column, row), hence (j, i).
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()
It will look like this

This works the best for me :
from sklearn.metrics import multilabel_confusion_matrix
# Distinct values found in the ground truth; y_test must provide .unique()
# (presumably a pandas Series — TODO confirm).
y_unique = y_test.unique()
# One 2x2 matrix per entry of `labels`, in that order.
mcm = multilabel_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=y_unique,
)
mcm

I see this is still an open issue in sklearn's repository:
https://github.com/scikit-learn/scikit-learn/issues/3452
However there have been some attempts at implementing it. From the same #3452 thread issue:
https://github.com/Magellanea/scikit-learn/commit/514287c1d5dad2f0ab4918dc4da5cf7053fe6734#diff-b04acd877dd793f28ae7be13a999ed88R187
You can check the code proposed in the function and see if that fits your needs.

from sklearn.metrics import multilabel_confusion_matrix
# Explicit label order: the i-th 2x2 matrix in the result belongs to the
# i-th entry of this list.
class_labels = ["benign", "dos", "probe", "r2l", "u2r"]
mul_c = multilabel_confusion_matrix(test_Y, pred_k, labels=class_labels)
mul_c

I found an easy solution with sklearn and seaborn libraries.
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib import pyplot as plt
import seaborn as sns
def plot_confusion_matrix(y_test, y_scores, classNames, output_path="image2.png"):
    """Print the confusion matrix and report, then save a row-normalized heatmap.

    Arguments
    ---------
    y_test:      2-D array of ground truth, one column per class; the true
                 class of a row is the argmax over its columns
    y_scores:    2-D array of predicted scores with the same layout
    classNames:  class names, in column order
    output_path: file the heatmap figure is saved to
    """
    # numpy is not among the imports of this snippet, so import it locally.
    import numpy as np

    true_idx = np.argmax(y_test, axis=1)
    pred_idx = np.argmax(y_scores, axis=1)
    classes = len(classNames)
    class_ids = np.arange(classes)

    # Pin the label set: without it the matrix shrinks when a class is
    # absent from the data and indexing it per class fails.
    cm = confusion_matrix(true_idx, pred_idx, labels=class_ids)
    print("**** Confusion Matrix ****")
    print(cm)
    print("**** Classification Report ****")
    print(classification_report(true_idx, pred_idx, labels=class_ids,
                                target_names=classNames))

    # Row-wise proportions; rows without any true sample stay at zero
    # instead of becoming NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    con = np.divide(cm, row_totals,
                    out=np.zeros((classes, classes)),
                    where=row_totals != 0)

    plt.figure(figsize=(40, 40))
    sns.set(font_scale=3.0)  # for label size
    heatmap_ax = sns.heatmap(con, annot=True, fmt='.2', cmap='Blues',
                             xticklabels=classNames, yticklabels=classNames)
    heatmap_ax.figure.savefig(output_path)
# y_test is your ground truth
# y_scores is your predicted probabilities
classNames = ['A', 'B', 'C', 'D', 'E']
plot_confusion_matrix(y_test=y_test, y_scores=y_scores, classNames=classNames)

Just use pandas with gradient coloring:
# Use every label that occurs in either vector: confusion_matrix sizes its
# result by that union, so labelling the frame with np.unique(y_true) alone
# fails as soon as y_pred contains a class that y_true lacks.
class_labels = np.union1d(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred, labels=class_labels)
cm = pd.DataFrame(data=cm, columns=class_labels, index=class_labels)
cm = cm.div(cm.sum(axis=1), axis=0)  # to fractions of 1 (per true-label row)
cm.style.background_gradient().format(precision=2)
By now pandas has nice options for table formatting and decoration.

Related

Why we flip coordinates when we plot confusion matrix

I am trying to plot the confusion matrix, but I have a question, why the indices are flipped in the for loop below where we start with plt.text(j, i) instead of plt.text(i, j)? I assume that 2D confusion matrix has values as follows:
(0,0): true zeros.
(0,1): false ones.
(1,0): false zeros.
(1,1): true ones.
Code:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
from sklearn.metrics import confusion_matrix
import itertools
# Toy data: the ground truth contains both classes, the predictions are all
# zeros (stored as floats).
true = np.array([0, 1, 0])
pred = np.array([0., 0., 0.])
def plot_confusion_matrix(y_true, y_pred, title='', labels=(0, 1)):
    """Draw the confusion matrix of y_true vs. y_pred with per-cell counts.

    y_true, y_pred: passed straight to sklearn's confusion_matrix
    title:          figure title
    labels:         tick labels, one per class, in matrix order
    """
    labels = list(labels)
    cm = confusion_matrix(y_true, y_pred)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title(title)
    fig.colorbar(cax)

    # Put one tick on the centre of each cell and name it explicitly.  The
    # former `[''] + labels` trick depended on where the default locator
    # happened to place its ticks, which mislabels the cells.
    n_rows, n_cols = cm.shape
    x_names = labels[:n_cols]
    y_names = labels[:n_rows]
    ax.set_xticks(range(len(x_names)))
    ax.set_xticklabels(x_names)
    ax.set_yticks(range(len(y_names)))
    ax.set_yticklabels(y_names)

    plt.xlabel('Predicted')
    plt.ylabel('True')

    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        # Plot coordinates are (x, y) = (column, row), so the matrix indices
        # are swapped when placing the text.
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="black" if cm[i, j] > thresh else "white")
    plt.show()
plot_confusion_matrix(y_true=true, y_pred=pred, title='Confusion Matrix')
The first coordinate of the plot usually is drawn horizontally, while the first coordinate of the matrix usually is represented vertically.
For example, the upper right square of the plot has coordinates x=1, y=0. This is false-positive values, which are presented in the cell (0, 1) of the confusion matrix.
To bring them into line with each other, it is necessary to flip the matrix along the main diagonal, i.e. transpose it. This is why you see coordinate transposition when displaying the confusion matrix in the coordinate system of the plot layout.

Problem to find the optimal number of clusters with scaled and non-scaled data

I'm trying to do clustering in my data and I'm having some problems to identify the optimal number of clusters.
My data (https://www.dropbox.com/s/6i6wyy0eohtlrrt/wellA.xlsx?dl=0) is an oil exploration well with information of depth, rock_types (label) and rock properties (features). I have the labels information, but I'd like to see how KMeans would work on this.
The problem is that the elbow method and the silhouette score show a clear trend when the data isn't scaled, but the resulting clustering is bad. On the other hand, scaled data gives better clusters, but its graphs have "weird" shapes: the elbow plot doesn't have an "elbow", and the silhouette scores are much smaller than for the non-scaled data. Why do I see worse graphs for scaled data?
I would like to know if I'm doing something wrong. The features are highly variable and I think they should be scaled for KMeans purposes. Maybe I should scale the data only after finding the optimal number of clusters?
P.S.: I'm sorry for the long question and code (most of it are plots). I tried to edit all of this in a simpler example, but I wasn't able to represent this heterogeneity.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
# MinMaxScaler is used below but the snippet only imports StandardScaler.
from sklearn.preprocessing import MinMaxScaler

data = pd.read_excel(r'C:\...\wellA.xlsx')
# -999.25 entries are turned into NaN so the rows holding them get dropped.
data = data.replace(-999.25, np.nan)
data.dropna(axis=0, inplace=True)
# FEATURES SELECTION FOR TRAINING
well = data.drop(['DEPTH','ROCK_TYPE'], axis=1)
# NORMALIZATION
scaled_well = pd.DataFrame(MinMaxScaler().fit_transform(well))
# ELBOW METHOD AND SILHOUETTE SCORE
def optimal_k(data, title, k_values=range(2, 14)):
    """Plot the elbow (inertia) and silhouette-score curves for several k.

    data:     DataFrame whose .values are clustered
    title:    super-title of the figure
    k_values: cluster counts to try (defaults to 2..13)
    """
    k_values = list(k_values)
    features = data.values  # extracted once instead of on every iteration

    inertia = []
    sil = []
    for k in k_values:
        kmeans_rand = KMeans(n_clusters=k, init='k-means++', random_state=0)
        kmeans_rand.fit(features)
        y_pred = kmeans_rand.predict(features)
        inertia.append(kmeans_rand.inertia_)
        sil.append(silhouette_score(features, y_pred))

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    ax[0].plot(k_values, inertia)
    ax[0].set_title('Elbow Method')
    ax[0].set_xlabel('Number of clusters')
    ax[0].set_ylabel('Inertia')

    ax[1].plot(k_values, sil)
    ax[1].set_xlabel('Number of Clusters')
    ax[1].set_ylabel('Silhouetter Score')
    ax[1].set_title('Silhouetter Score Curve')

    fig.suptitle(title)
# Draw the elbow and silhouette curves for the raw features, then for the
# scaled ones.
optimal_k(well, 'Not scaled')
optimal_k(scaled_well, 'Scaled')
# MODEL
def kmeans(data, k):
    """Cluster `data` into k groups and store 1-based labels in data['KMEANS'].

    The frame is modified in place; nothing is returned.
    """
    # Leave out a 'KMEANS' column written by an earlier call, otherwise the
    # previous cluster labels would be fed back in as a feature.
    features = data.drop(columns=['KMEANS'], errors='ignore')
    model = KMeans(n_clusters=k, random_state=0, init='k-means++')
    model.fit(features.values)
    labels = model.labels_
    data['KMEANS'] = labels + 1
# Cluster raw and scaled features into 3 groups each.
kmeans(well, 3)
kmeans(scaled_well, 3)

# CONVERT NAME TO VALUE
facies = {'Claystone': 1, 'Coal': 2, 'Limestone': 3, 'Marl': 4, 'Sandstone': 5}
data['LABEL'] = data['ROCK_TYPE'].map(facies)

# PLOT
# Each series becomes a single-column 2-D array so imshow can draw it as a strip.
cluster_real = np.repeat(np.expand_dims(data['LABEL'], 1), 1, 1)
cluster_kmeans = np.repeat(np.expand_dims(well['KMEANS'], 1), 1, 1)
cluster_kmeans_scaled = np.repeat(np.expand_dims(scaled_well['KMEANS'], 1), 1, 1)

f, ax = plt.subplots(nrows=1, ncols=3, figsize=(2, 12))

# All three strips share the vertical extent (max depth first, then min).
depth_extent = [0, 1, data['DEPTH'].max(), data['DEPTH'].min()]
strips = (
    (cluster_real, 5, 'REAL ROCKS'),
    (cluster_kmeans, 3, 'KMEANS'),
    (cluster_kmeans_scaled, 3, 'KMEANS SCALED'),
)
for panel, (strip, top_value, caption) in zip(ax, strips):
    panel.imshow(strip,
                 interpolation='none',
                 aspect='auto',
                 vmin=1, vmax=top_value,
                 extent=depth_extent)
    panel.set_xticks([], [])
    panel.set_xlabel(caption)

ax[0].set_ylabel('Depth (m)')
Some clustering methodologies will automatically find the optimal number of clusters for you. Affinity Propagation and Mean Shift are two that come to mind. There are probably a couple others out there that will do the same.
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.datasets import make_blobs
# #############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(
    n_samples=300,
    centers=centers,
    cluster_std=0.5,
    random_state=0,
)

# #############################################################################
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50)
af.fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)

# Scores that compare the found labels with the generated ones.
label_reports = (
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
    ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
    ("Adjusted Mutual Information: %0.3f", metrics.adjusted_mutual_info_score),
)
for template, scorer in label_reports:
    print(template % scorer(labels_true, labels))

# The silhouette is computed from the samples themselves, not the true labels.
silhouette = metrics.silhouette_score(X, labels, metric='sqeuclidean')
print("Silhouette Coefficient: %0.3f" % silhouette)
# #############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
plt.close('all')
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
# One colour per cluster: members as dots, the exemplar as a large circle,
# and a line from the exemplar to every member.
for k, (center_idx, col) in enumerate(zip(cluster_centers_indices, colors)):
    members = X[labels == k]
    center_x, center_y = X[center_idx]
    plt.plot(members[:, 0], members[:, 1], col + '.')
    plt.plot(center_x, center_y, 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for point in members:
        plt.plot([center_x, point[0]], [center_y, point[1]], col)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

How to use DBSCAN method from sklearn for clustering

I have a dataset with three parameters that I want to cluster. For example, I can easily plot the KMeans result from sklearn like this (val is my data, with shape (3000, 3)):
# Cluster into 4 groups and colour a 3-D scatter of the three columns by
# cluster label.
kmeans_model = KMeans(n_clusters=4, random_state=0)
y_pred = kmeans_model.fit_predict(val)

fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
xs, ys, zs = val[:, 0], val[:, 1], val[:, 2]
ax1.scatter(xs, ys, zs, c=y_pred)
plt.show()
However, in DBSCAN, I just directly use this one:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# Standardize the features, then run DBSCAN on them.
scaler = StandardScaler()
val = scaler.fit_transform(val)
db = DBSCAN(eps=3, min_samples=4)
db.fit(val)
labels = db.labels_

# Boolean mask flagging the samples DBSCAN reports as core samples.
core_samples = np.zeros_like(labels, dtype=bool)
core_samples[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise if present (noise is labelled -1).
n_clusters_ = len(set(labels) - {-1})
n_noise_ = int(np.sum(labels == -1))
So how can I plot the DBSCAN result, just like I did for KMeans?
You can reuse the same code from your KMeans model. All you need to do is re-assign val and y_pred to ignore the noise labels.
# DBSCAN snippet from the question
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
val = scaler.fit_transform(val)
db = DBSCAN(eps=3, min_samples=4)
db.fit(val)
labels = db.labels_

# Keep only the samples that were assigned to a cluster (noise is labelled -1).
# Note that `core` therefore holds every non-noise sample.
is_clustered = labels != -1
y_pred = labels[is_clustered]
core = val[is_clustered]

# Same 3-D scatter as for KMeans, coloured by cluster label.
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
ax1.scatter(core[:, 0], core[:, 1], core[:, 2], c=y_pred)
plt.show()

Scikit-learn (sklearn) confusion matrix plot for more than 3 classes

I have a problem with the confusion matrix when I use the code from scikit-learn.
This is what I got:
As you can see, the first class is cut off.
!!!update!!!
I forced it to work by using these lines:
# Six cells per axis (indices 0-5), padded by half a cell on each side; the
# y limits are given high-to-low, so row 0 ends up at the top.
n_cells = 6
plt.xlim(-0.5, n_cells - 0.5)
plt.ylim(n_cells - 0.5, -0.5)
and got this:
But I still want to know if there is another way to do it that is not specific to 5 classes.
I already tried changing the ax size, but it didn't work.
if not title:
if normalize:
title = 'Normalized confusion matrix'
else:
title = 'Confusion matrix, without normalization'
# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Only use the labels that appear in the data
classes = list(unique_labels(y_true, y_pred))
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes, yticklabels=classes,
title=title,
ylabel='True label',
xlabel='Predicted label')
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(j, i, format(cm[i, j], fmt),
ha="center", va="center",
color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
return ax
plot_confusion_matrix(y, y_pred, classes=[0, 1, 2, 3, 4, 5], normalize=True,
title='Normalized confusion matrix')
I want the box not to cut off the first and last rows.
In this case you need to set xlim and ylim; here is an automatic way to do so, e.g. for 10 classes.
Briefly, you need:
# Pad half a cell around the cell centres; the y limits are given
# high-to-low, so row 0 ends up at the top.
n_classes = len(np.unique(y))
plt.xlim(-0.5, n_classes - 0.5)
plt.ylim(n_classes - 0.5, -0.5)
Full example:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
# import some data to play with
iris = datasets.load_iris()
X = iris.data
# Synthetic targets: the values 0..9, each repeated 15 times in a row.
y = np.arange(10).repeat(15)
class_names = np.array([str(number) for number in range(1, 11)])

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Run classifier, using a model that is too regularized (C too low) to see
# the impact on the results
classifier = svm.SVC(kernel='linear', C=0.01)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    y_true, y_pred: label vectors passed to sklearn's confusion_matrix
    classes:        class names, indexed by label value
    normalize:      divide every row by its sum (rows without samples
                    are shown as zeros)
    title:          plot title; a default is chosen when empty
    cmap:           colormap for the cells

    Returns the matplotlib Axes the matrix was drawn on.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data; asarray lets callers pass
    # a plain list as well as an ndarray.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    if normalize:
        # Guard against rows that sum to 0, which would otherwise yield NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    n_rows, n_cols = cm.shape
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(n_rows):
        for j in range(n_cols):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

    # Keep the first and last rows/columns fully visible.  The limits come
    # from the matrix itself rather than from a global `y`, so they stay
    # right when some classes are missing from y_true/y_pred.
    ax.set_xlim(-0.5, n_cols - 0.5)
    ax.set_ylim(n_rows - 0.5, -0.5)
    return ax
np.set_printoptions(precision=2)

# Plot the non-normalized matrix first, then the normalized one.
for use_normalization, plot_title in (
        (False, 'Confusion matrix, without normalization'),
        (True, 'Normalized confusion matrix'),
):
    plot_confusion_matrix(y_test, y_pred, classes=class_names,
                          normalize=use_normalization,
                          title=plot_title)

plt.show()

How to get rid of white lines in confusion matrix?

Does anyone know why these white lines are quartering my confusion matrix? I've changed many of the parameters but cannot figure it out. The only thing that makes them go away is if I don't label the blocks at all, ie '0', '1',... but that's obviously not what I want. Any help would be appreciated.
Code:
def plot_confusion_matrix(cm,
                          target_names = ('1', '2', '3', '4'),
                          title = 'Confusion matrix',
                          cmap = None,
                          normalize = False):
    """
    Given a sklearn confusion matrix (cm), make a nice plot.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to keep matplotlib's default ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues ('Blues' when None)

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  rows without any sample are shown as zeros)

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    import itertools

    import matplotlib.pyplot as plt
    import numpy as np

    cm = np.asarray(cm)

    # Accuracy has to come from the raw counts, so compute it before normalizing.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalize BEFORE drawing so the cell colours and the text-colour
        # threshold below are based on the same values.  Rows whose sum is 0
        # are left as zeros instead of producing NaN.
        row_totals = cm.sum(axis = 1, keepdims = True)
        cm = np.divide(cm, row_totals,
                       out = np.zeros(cm.shape, dtype = float),
                       where = row_totals != 0)

    plt.figure(figsize = (8, 6))
    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    # A globally active grid style would otherwise draw lines across the
    # cells at every tick position.
    plt.grid(False)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation = 0)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # matplotlib takes (x, y) = (column, row), hence (j, i).
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment = "center",
                 color = "white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()
# Plot the `confusion` matrix as row proportions with explicit class names.
plot_confusion_matrix(
    cm = confusion,
    target_names = ['1', '2', '3', '4'],
    title = "Confusion Matrix",
    normalize = True,
)
Output is:
# Open a figure and switch its grid off before drawing the matrix.
plt.figure(figsize=(10,5))
plt.grid(False)
# NOTE(review): the title says "Normalized" although normalize=False is
# passed — confirm which of the two is intended.
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=False, title='Normalized confusion matrix')
Try resetting it
import seaborn as sns
# Restore matplotlib's original rc parameters, undoing the styling seaborn
# applied (presumably the source of the grid lines — verify).
sns.reset_orig()

Categories

Resources