I'm working on a classification problem with 20 classes. I'm trying to visualize the results through a confusion matrix using matplotlib.
After computing my confusion matrix, I used the plot_confusion_matrix described here.
def plot_confusion_matrix(y_true, y_pred, classes,
normalize=False,
title=None,
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if not title:
if normalize:
title = 'Normalized confusion matrix'
else:
title = 'Confusion matrix, without normalization'
# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Only use the labels that appear in the data
classes = classes[unique_labels(y_true, y_pred)]
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes, yticklabels=classes,
title=title,
ylabel='True label',
xlabel='Predicted label')
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(j, i, format(cm[i, j], fmt),
ha="center", va="center",
color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
return ax
Here is what it looks like :
It looks like the problem comes from dealing with too many classes, so a natural solution would be scalling up the plot. But doing that distorts it. Also, how do I choose the correct scale/size ?
How do I proceed to make it look better ?
P.S. You can find the confution matrix as a csv file here.
Since you dont specified the estrict use of matplotlib I recomend you to use the seaborn library its so much easy and simple and if you want to change something weird was constructed with matplolib if I aint wrong. Using seaborn is:
import seaborn as sns
plt.figure(figsize = (10,10)) #This is the size of the image
heatM = sns.heatmap(cov_vals, vmin = -1, vmax = 1,center = 0, cmap = sns.diverging_palette(20, 220, n = 200), square = True, annot = True) #this are the caracteristics of the heatmap
heatM.set_ylim([10,0]) # This is the limit in y axis (number of features)
and this is the result. be careful with the limits heatM.set_ylim([10,0]) for x too, this need to be the number of variables that you have.
hope this was useful.
I ended up using seaborn but I faced a problem. The confusion matrix looked like this. It was actually a bug in the latest version (3.1.1) of seaborn (see this issue). The solution was to use a prior version (3.1.0 in my case).
Related
I have a problem with the confusion matrix when i using the code in scikit-learn
this what i got
as you see the first class is cut
!!!update!!!
i force it work by using this rows
plt.xlim(-0.5, 5.5)
plt.ylim(5.5, -0.5)
and get this
but i still wants to know if there is other way to make it not specific to 5 classes.
i already try to change the ax size but it wasnt work out
if not title:
if normalize:
title = 'Normalized confusion matrix'
else:
title = 'Confusion matrix, without normalization'
# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Only use the labels that appear in the data
classes = list(unique_labels(y_true, y_pred))
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes, yticklabels=classes,
title=title,
ylabel='True label',
xlabel='Predicted label')
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(j, i, format(cm[i, j], fmt),
ha="center", va="center",
color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
return ax
plot_confusion_matrix(y, y_pred, classes=[0, 1, 2, 3, 4, 5], normalize=True,
title='Normalized confusion matrix')
i want that the box will not cut the first and last row
You need in this case to set xlim and ylim and here is an automatic way to do so for e.g. 10 classes.
Briefly, you need:
plt.xlim(-0.5, len(np.unique(y))-0.5)
plt.ylim(len(np.unique(y))-0.5, -0.5)
Full example:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
# import some data to play with
iris = datasets.load_iris()
X = iris.data
y = np.repeat(np.arange(0,10),15)
class_names = np.array(['1', '2', '3', '4', '5','6','7','8','9','10'])
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Run classifier, using a model that is too regularized (C too low) to see
# the impact on the results
classifier = svm.SVC(kernel='linear', C=0.01)
y_pred = classifier.fit(X_train, y_train).predict(X_test)
def plot_confusion_matrix(y_true, y_pred, classes,
normalize=False,
title=None,
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if not title:
if normalize:
title = 'Normalized confusion matrix'
else:
title = 'Confusion matrix, without normalization'
# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Only use the labels that appear in the data
classes = classes[unique_labels(y_true, y_pred)]
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes, yticklabels=classes,
title=title,
ylabel='True label',
xlabel='Predicted label')
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(j, i, format(cm[i, j], fmt),
ha="center", va="center",
color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
plt.xlim(-0.5, len(np.unique(y))-0.5)
plt.ylim(len(np.unique(y))-0.5, -0.5)
return ax
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names,
title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True,
title='Normalized confusion matrix')
plt.show()
I have a matrix generated by parsing a file the numpy array is the size 101X101X41 and each entry has a value which represents the magnitude at each point.
Now what I want to do is to plot it in a 3d plot where the 4th dimension will be represented by color. so that I will be able to see the shape of the data points (represent molecular orbitals) and deduce its magnitude at that point.
If I plot each slice of data I get the desired outcome, but in a 2d with the 3rd dimension as the color.
Is there a way to plot this model in python using Matplotlib or equivalent library
Thanks
EDIT:
Im trying to get the question clearer to what I desire.
Ive tried the solution suggested but ive received the following plot:
as one can see, due to the fact the the mesh has lots of zeros in it it "hide" the 3d orbitals. in the following plot one can see a slice of the data, where I get the following plot:
So as you can see I have a certain structure I desire to show in the plot.
my question is, is there a way to plot only the structure and ignore the zeroes such that they won't "hide" the structure.
the code I used to generate the plots:
x = np.linspase(1,101,101)
y = np.linspase(1,101,101)
z = np.linspase(1,101,101)
xx,yy,zz = np.meshgrid(x,y,z)
fig=plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xx, yy, zz, c=cube.calc_data.flatten())
plt.show()
plt.imshow(cube.calc_data[:,:,11],cmap='jet')
plt.show()
Hope that now the question is much clearer, and that you'd appreciate the question enough now to upvote
Thanks.
you can perform the following:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
epsilon = 2.5e-2 # threshold
height, width, depth = data.shape
global_min = np.inf
global_max = -np.inf
for d in range(depth):
slice = data[:, :, d]
minima = slice.min()
if (minima < global_min): global_min = minima
maxima = slice.max()
if (maxima>global_max): global_max=maxima
norm = colors.Normalize(vmin=minima, vmax=maxima, clip=True)
mapper = cm.ScalarMappable(norm=norm, cmap=cm.jet)
points_gt_epsilon = np.where(slice >= epsilon)
ax.scatter(points_gt_epsilon[0], points_gt_epsilon[1], d,
c=mapper.to_rgba(data[points_gt_epsilon[0],points_gt_epsilon[1],d]), alpha=0.015, cmap=cm.jet)
points_lt_epsilon = np.where(slice <= -epsilon)
ax.scatter(points_lt_epsilon[0], points_lt_epsilon[1], d,
c=mapper.to_rgba(data[points_lt_epsilon[0], points_lt_epsilon[1], d]), alpha=0.015, cmap=cm.jet)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.title('Electron Density Prob.')
norm = colors.Normalize(vmin=global_min, vmax=global_max, clip=True)
cax, _ = colorbar.make_axes(ax)
colorbar.ColorbarBase(cax, cmap=cm.jet,norm=norm)
plt.savefig('test.png')
plt.clf()
What this piece of code does is going slice by slice from the data matrix and for each scatter plot only the points desired (depend on epsilon).
in this case you avoid plotting a lot of zeros that 'hide' your model, using your words.
Hope this helps
You can adjust the color and size of the markers for the scatter. So for example you can filter out all markers below a certain threshold by putting their size to 0. You can also make the size of the marker adaptive to the field strength.
As an example:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
f = lambda x,y,z: np.exp(-(x-3)**2-(y-3)**2-(z-1)**2) - \
np.exp(-(x+3)**2-(y+3)**2-(z+1)**2)
t1 = np.linspace(-6,6,101)
t2 = np.linspace(-3,3,41)
# Data of shape 101,101,41
data = f(*np.meshgrid(t1,t1,t2))
print(data.shape)
# Coordinates
x = np.linspace(1,101,101)
y = np.linspace(1,101,101)
z = np.linspace(1,101,41)
xx,yy,zz = np.meshgrid(x,y,z)
fig=plt.figure()
ax = fig.add_subplot(111, projection='3d')
s = np.abs(data/data.max())**2*25
s[np.abs(data) < 0.05] = 0
ax.scatter(xx, yy, zz, s=s, c=data.flatten(), linewidth=0, cmap="jet", alpha=.5)
plt.show()
I am trying to create a confusion matrix in TensorFlow but I am getting a
TypeError: Image data cannot be converted to float.
The images are predicted accurately but now I want to show the confusion matrix using matplotlib. I tried converting to to np.array() but the error is still the same.
I am following the official documentation for confusion matrix from scikit-learn.
https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
if result[0][0]>0.85:
predictions.append(result[0][0])
elif result[0][1]>0.85:
predictions.append(result[0][1])
elif result[0][2]>0.85:
predictions.append(result[0][2])
elif result[0][3]>0.85:
predictions.append(result[0][3])
elif result[0][4]>0.85:
predictions.append(result[0][4])
elif result[0][5]>0.85:
predictions.append(result[0][5])
class_names = ['Up', 'Down', 'Left', 'Right', 'Forward', 'Backward']
# label_list contains the filename e.g. hand1.jpg, hand2.jpg....
# Compute confusion matrix
cnf_matrix = tf.confusion_matrix(label_list,predictions,num_classes=6)
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
# ERROR HERE
plot_confusion_matrix(cnf_matrix, classes=class_names,title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,title='Normalized confusion matrix')
plt.show()
I have not tested it in my PC. Your description is a little bit ambiguous for me (the line of error, etc.), but the main difference of your code and the documentation you linked is confusion_matrix(). Just try to go with confusion_matrix() of sckit-learn instead of confusion_matrix() of tensorflow (at the link, the former is used). In my opinion, it is the easiest way you can go.
EDIT:
Make your predictions like this:
for i in range(6):
if result[0][i] > 0.85:
predictions.append(i)
continue
Then your predictions will not be continuous ones. Here your predictions should be integers since you are predicting class labels.
I'm working on plotting sklearn classification report and my plot generated is very narrow, and difficult to read the labels. I used the post here to get the plotting code.
Any suggestions on how to stretch this plot out horizontally? Thank you
def plot_classification_report(cr, title='Classification report ', with_avg_total=False, cmap=plt.cm.Blues):
lines = cr.split('\n')
classes = []
plotMat = []
for line in lines[2 : (len(lines) - 3)]:
#print(line)
t = line.split()
# print(t)
classes.append(t[0])
v = [float(x) for x in t[1: len(t) - 1]]
#print(v)
plotMat.append(v)
if with_avg_total:
aveTotal = lines[len(lines) - 1].split()
classes.append('avg/total')
vAveTotal = [float(x) for x in t[1:len(aveTotal) - 1]]
plotMat.append(vAveTotal)
plt.imshow(plotMat, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
x_tick_marks = np.arange(3)
y_tick_marks = np.arange(len(classes))
plt.xticks(x_tick_marks, ['precision', 'recall', 'f1-score'], rotation=45)
plt.yticks(y_tick_marks, classes)
#plt.tight_layout()
plt.ylabel('Classes')
plt.xlabel('Measures')
plot_classification_report(classification_report(y_test, y_pred))
By default, the axes will have the aspect ratio of the image. You can change that by using the aspect argument to imshow.
Either put it to "auto", to let the image extend to the given space to the axes.
Or, set it to any number, denoting the height over width ratio; number == height/width.
In this case try
plt.imshow(plotMat, interpolation='nearest', cmap=cmap, aspect="auto")
or
plt.imshow(plotMat, interpolation='nearest', cmap=cmap, aspect=len(classes)/12.)
and adapt it to your needs.
When drawing a dot plot using matplotlib, I would like to offset overlapping datapoints to keep them all visible. For example, if I have:
CategoryA: 0,0,3,0,5
CategoryB: 5,10,5,5,10
I want each of the CategoryA "0" datapoints to be set side by side, rather than right on top of each other, while still remaining distinct from CategoryB.
In R (ggplot2) there is a "jitter" option that does this. Is there a similar option in matplotlib, or is there another approach that would lead to a similar result?
Edit: to clarify, the "beeswarm" plot in R is essentially what I have in mind, and pybeeswarm is an early but useful start at a matplotlib/Python version.
Edit: to add that Seaborn's Swarmplot, introduced in version 0.7, is an excellent implementation of what I wanted.
Extending the answer by #user2467675, here’s how I did it:
def rand_jitter(arr):
stdev = .01 * (max(arr) - min(arr))
return arr + np.random.randn(len(arr)) * stdev
def jitter(x, y, s=20, c='b', marker='o', cmap=None, norm=None, vmin=None, vmax=None, alpha=None, linewidths=None, verts=None, hold=None, **kwargs):
return scatter(rand_jitter(x), rand_jitter(y), s=s, c=c, marker=marker, cmap=cmap, norm=norm, vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths, **kwargs)
The stdev variable makes sure that the jitter is enough to be seen on different scales, but it assumes that the limits of the axes are zero and the max value.
You can then call jitter instead of scatter.
Seaborn provides histogram-like categorical dot-plots through sns.swarmplot() and jittered categorical dot-plots via sns.stripplot():
import seaborn as sns
sns.set(style='ticks', context='talk')
iris = sns.load_dataset('iris')
sns.swarmplot('species', 'sepal_length', data=iris)
sns.despine()
sns.stripplot('species', 'sepal_length', data=iris, jitter=0.2)
sns.despine()
I used numpy.random to "scatter/beeswarm" the data along X-axis but around a fixed point for each category, and then basically do pyplot.scatter() for each category:
import matplotlib.pyplot as plt
import numpy as np
#random data for category A, B, with B "taller"
yA, yB = np.random.randn(100), 5.0+np.random.randn(1000)
xA, xB = np.random.normal(1, 0.1, len(yA)),
np.random.normal(3, 0.1, len(yB))
plt.scatter(xA, yA)
plt.scatter(xB, yB)
plt.show()
One way to approach the problem is to think of each 'row' in your scatter/dot/beeswarm plot as a bin in a histogram:
data = np.random.randn(100)
width = 0.8 # the maximum width of each 'row' in the scatter plot
xpos = 0 # the centre position of the scatter plot in x
counts, edges = np.histogram(data, bins=20)
centres = (edges[:-1] + edges[1:]) / 2.
yvals = centres.repeat(counts)
max_offset = width / counts.max()
offsets = np.hstack((np.arange(cc) - 0.5 * (cc - 1)) for cc in counts)
xvals = xpos + (offsets * max_offset)
fig, ax = plt.subplots(1, 1)
ax.scatter(xvals, yvals, s=30, c='b')
This obviously involves binning the data, so you may lose some precision. If you have discrete data, you could replace:
counts, edges = np.histogram(data, bins=20)
centres = (edges[:-1] + edges[1:]) / 2.
with:
centres, counts = np.unique(data, return_counts=True)
An alternative approach that preserves the exact y-coordinates, even for continuous data, is to use a kernel density estimate to scale the amplitude of random jitter in the x-axis:
from scipy.stats import gaussian_kde
kde = gaussian_kde(data)
density = kde(data) # estimate the local density at each datapoint
# generate some random jitter between 0 and 1
jitter = np.random.rand(*data.shape) - 0.5
# scale the jitter by the KDE estimate and add it to the centre x-coordinate
xvals = 1 + (density * jitter * width * 2)
ax.scatter(xvals, data, s=30, c='g')
for sp in ['top', 'bottom', 'right']:
ax.spines[sp].set_visible(False)
ax.tick_params(top=False, bottom=False, right=False)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Histogram', 'KDE'], fontsize='x-large')
fig.tight_layout()
This second method is loosely based on how violin plots work. It still cannot guarantee that none of the points are overlapping, but I find that in practice it tends to give quite nice-looking results as long as there are a decent number of points (>20), and the distribution can be reasonably well approximated by a sum-of-Gaussians.
Not knowing of a direct mpl alternative here you have a very rudimentary proposal:
from matplotlib import pyplot as plt
from itertools import groupby
CA = [0,4,0,3,0,5]
CB = [0,0,4,4,2,2,2,2,3,0,5]
x = []
y = []
for indx, klass in enumerate([CA, CB]):
klass = groupby(sorted(klass))
for item, objt in klass:
objt = list(objt)
points = len(objt)
pos = 1 + indx + (1 - points) / 50.
for item in objt:
x.append(pos)
y.append(item)
pos += 0.04
plt.plot(x, y, 'o')
plt.xlim((0,3))
plt.show()
Seaborn's swarmplot seems like the most apt fit for what you have in mind, but you can also jitter with Seaborn's regplot:
import seaborn as sns
iris = sns.load_dataset('iris')
sns.swarmplot('species', 'sepal_length', data=iris)
sns.regplot(x='sepal_length',
y='sepal_width',
data=iris,
fit_reg=False, # do not fit a regression line
x_jitter=0.1, # could also dynamically set this with range of data
y_jitter=0.1,
scatter_kws={'alpha': 0.5}) # set transparency to 50%
Extending the answer by #wordsforthewise (sorry, can't comment with my reputation), if you need both jitter and the use of hue to color the points by some categorical (like I did), Seaborn's lmplot is a great choice instead of reglpot:
import seaborn as sns
iris = sns.load_dataset('iris')
sns.lmplot(x='sepal_length', y='sepal_width', hue='species', data=iris, fit_reg=False, x_jitter=0.1, y_jitter=0.1)