I am using plot_confusion_matrix from sklearn.metrics. I want to represent those confusion matrices next to each other like subplots, how could I do this?
Let's use the good ol' iris dataset to reproduce this, and fit several classifiers so we can plot their respective confusion matrices with plot_confusion_matrix:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics import plot_confusion_matrix
data = load_iris()
X = data.data
y = data.target
Set up the train/test split and fit the classifiers:
X_train, X_test, y_train, y_test = train_test_split(X, y)
classifiers = [LogisticRegression(solver='lbfgs'),
               AdaBoostClassifier(),
               GradientBoostingClassifier(),
               SVC()]

for cls in classifiers:
    cls.fit(X_train, y_train)
So the way you can compare all matrices at a glance is by creating a set of subplots with plt.subplots. Then iterate over both the axes objects and the trained classifiers (plot_confusion_matrix expects a fitted classifier as input) and plot the individual confusion matrices:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

for cls, ax in zip(classifiers, axes.flatten()):
    plot_confusion_matrix(cls,
                          X_test,
                          y_test,
                          ax=ax,
                          cmap='Blues',
                          display_labels=data.target_names)
    ax.title.set_text(type(cls).__name__)

plt.tight_layout()
plt.show()
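Note that plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2. On current versions the same figure can be built with ConfusionMatrixDisplay.from_estimator; a minimal sketch reusing the setup above:

from sklearn.metrics import ConfusionMatrixDisplay

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

for cls, ax in zip(classifiers, axes.flatten()):
    # from_estimator takes the fitted classifier plus the evaluation data
    ConfusionMatrixDisplay.from_estimator(cls,
                                          X_test,
                                          y_test,
                                          ax=ax,
                                          cmap='Blues',
                                          display_labels=data.target_names)
    ax.title.set_text(type(cls).__name__)

plt.tight_layout()
plt.show()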
This is my way to see multiple confusion matrices (built with confusion_matrix) side by side with ConfusionMatrixDisplay.
Note: substitute your own test and train variable names in the metrics.confusion_matrix() calls.
from sklearn import metrics
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 2)
ax[0].set_title("test")
ax[1].set_title("train")

metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_test, y_pred),
    display_labels=[False, True]).plot(ax=ax[0])

metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_train, y_train_pred),
    display_labels=[False, True]).plot(ax=ax[1])
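On scikit-learn 1.0+ the two steps can be collapsed into one with ConfusionMatrixDisplay.from_predictions; a minimal sketch, assuming the same y_test/y_pred and y_train/y_train_pred arrays:

from sklearn.metrics import ConfusionMatrixDisplay

fig, ax = plt.subplots(1, 2)
# from_predictions builds the matrix and the plot in a single call
ConfusionMatrixDisplay.from_predictions(y_test, y_pred,
                                        display_labels=[False, True], ax=ax[0])
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred,
                                        display_labels=[False, True], ax=ax[1])
ax[0].set_title("test")
ax[1].set_title("train")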
Related
I have a problem with plotting decision regions for a Logistic Regression classification model. Can somebody help me and explain how to do that? I put the Colab link to this project here -> https://colab.research.google.com/drive/1JqFyoAk0zithy4esfjiyo6MdB12iBndi?usp=sharing
Dataset from Kaggle -> https://www.kaggle.com/datasets/muratkokludataset/date-fruit-datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions
np.set_printoptions(suppress=True, edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: f'{x:.8f}'))
np.random.seed(42)
sns.set()
desired_width = 320
pd.options.display.float_format = '{:,.8f}'.format
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 12)
raw_data = pd.read_excel(io='/content/Date_Fruit_Datasets.xlsx',
                         sheet_name='Date_Fruit_Datasets')
data = raw_data.copy()
data.head(n=10)
data.describe().transpose()
data.info()
data.shape
# Creating data and target
X = data.drop(columns='Class')
y = data['Class']
X.shape
y.shape
# Encoding target
encoder = LabelEncoder()
y = encoder.fit_transform(y=y)
# Creating train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Scaling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X=X_train)
X_test = scaler.transform(X=X_test)
# Creating classifier, fitting and predicting
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train, y=y_train)
y_pred = classifier.predict(X=X_test)
y_pred_proba = classifier.predict_proba(X=X_test)
# Checking final reports and scores
score = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred, target_names=encoder.classes_)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)  # renamed so it does not shadow the imported function
# Compare y_true and y_pred in DataFrame
results = pd.DataFrame(data={
    'y_true': y_test,
    'y_pred': y_pred
})
# Creating Data Frame with predict proba
predict_proba = pd.DataFrame(data=classifier.predict_proba(X=X_test), columns=encoder.classes_)
# Saving results to csv
results.to_csv(path_or_buf='/content/data_fruit_predictions.csv')
predict_proba.to_csv(path_or_buf='/content/data_fruit_predict_proba.csv')
# Plotting decision regions
value = 1.5
width = 0.75
plt.figure(figsize=(10, 8))
plot_decision_regions(X=X.values, y=y, clf=classifier,
                      filler_feature_values={i: value for i in range(1, 34)},
                      filler_feature_ranges={i: width for i in range(1, 34)},
                      legend=2)
plt.show()
After calling plot_decision_regions, PyCharm shows me warnings like:
UserWarning: No contour levels were found within the data range.
ax.contour(xx, yy, Z, cset.levels,
and
UserWarning: You passed a edgecolor/edgecolors ('black') for an unfilled marker ('x'). Matplotlib is ignoring the edgecolor in favor of the facecolor. This behavior may change in the future.
ax.scatter(x=x_data,
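Two things stand out in the snippet above: the classifier was fitted on the scaled X_train while plot_decision_regions receives the unscaled X.values, and the filler values pin 33 of the 34 features, which can easily leave no contour levels inside the plotted slice. A minimal sketch of an alternative that avoids fillers entirely, fitting a plotting-only classifier on two scaled features (the choice of the first two columns is arbitrary):

# Fit a second classifier on two scaled features purely for visualization
X_train_2d = X_train[:, :2]  # X_train is already a scaled NumPy array
clf_2d = LogisticRegression(max_iter=1000)
clf_2d.fit(X_train_2d, y_train)

plt.figure(figsize=(10, 8))
plot_decision_regions(X=X_train_2d, y=y_train, clf=clf_2d, legend=2)
plt.xlabel('feature 0 (scaled)')
plt.ylabel('feature 1 (scaled)')
plt.show()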
I want to plot figures with different values of k for a k-NN classifier.
My problem is that the figures all seem to use the same value of k.
What I have tried so far is to change the value of k on each run of the loop:
clf = KNeighborsClassifier(n_neighbors=counter+1)
But all the figures seem to be for k=1
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
import numpy as np
from sklearn.model_selection import train_test_split
c = np.array([1 if y > np.median(data['target']) else 0 for y in data['target']])
X_train, X_test, c_train, c_test = train_test_split(data['data'], c, random_state=0)
from sklearn.neighbors import KNeighborsClassifier
import mglearn
import matplotlib.pyplot as plt
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))

for counter in range(3):
    clf = KNeighborsClassifier(n_neighbors=counter+1)
    clf.fit(X_test, c_test)
    plt.tight_layout()  # this will help create proper spacing between the plots.
    mglearn.discrete_scatter(X_test[:,0], X_test[:,1], c_test, ax=ax[counter])
    plt.legend(["Class 0", "Class 1"], loc=4)
    plt.xlabel("First feature")
    plt.ylabel("Second feature")
    #plt.figure()
The reason why all the plots look the same is that you are simply plotting the test set every time instead of plotting the model predictions on the test set. You probably meant to do the following for each value of k:
Fit the model to the training set, in which case you should replace clf.fit(X_test, c_test) with clf.fit(X_train, c_train).
Generate the model predictions on the test set, in which case you should add c_pred = clf.predict(X_test).
Plot the model predictions on the test set, in which case you should replace c_test with c_pred in the scatter plot, i.e. use mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], c_pred, ax=ax[counter]) instead of mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], c_test, ax=ax[counter]).
Updated code:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import mglearn
import matplotlib.pyplot as plt
data = fetch_california_housing()
c = np.array([1 if y > np.median(data['target']) else 0 for y in data['target']])
X_train, X_test, c_train, c_test = train_test_split(data['data'], c, random_state=0)
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))

for counter in range(3):
    clf = KNeighborsClassifier(n_neighbors=counter+1)
    # fit the model to the training set
    clf.fit(X_train, c_train)
    # extract the model predictions on the test set
    c_pred = clf.predict(X_test)
    # plot the model predictions, labeling each subplot through its own axes object
    mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], c_pred, ax=ax[counter])
    ax[counter].legend(["Class 0", "Class 1"], loc=4)
    ax[counter].set_xlabel("First feature")
    ax[counter].set_ylabel("Second feature")
    ax[counter].set_title("k={}".format(counter+1))

plt.tight_layout()  # create proper spacing between the plots
I have made a simple linear regression model:
LR = LinearRegression()
kfold = model_selection.KFold(n_splits=10, random_state=12)
result_kfold = model_selection.cross_val_score(LR, X_train, Y_train, cv=kfold, scoring = 'r2')
print("Accuracy: %.2f%%" % (result_kfold.mean()*100.0))
LR.fit(X_train,Y_train)
Y_pred = LR.predict(X_test)
print("Y_pred:", Y_pred)
I want to plot the residual errors. I've used 'residplot' for this, but I'm not sure if I've passed the right arguments. According to the documentation, we have to pass the predictor variable and the response variable.
Here's the code:
sns.set(style="whitegrid")
sns.residplot(Y_test, Y_pred, lowess=True, color="g")
Can anyone please tell me if this is right? Also, what should the labels of the X and Y axes be?
Thank you in advance for the help.
You are plotting something very weird, so let's use an example dataset:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
iris = sns.load_dataset('iris')
X_train, X_test, Y_train, Y_test = train_test_split(iris.iloc[:,:3], iris.iloc[:,3],random_state=11)
LR = LinearRegression()
LR.fit(X_train,Y_train)
Y_pred = LR.predict(X_test)
If you just want to plot the residuals, you can do:
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(5, 5))
sns.regplot(x=Y_pred, y=Y_test - Y_pred, ax=ax, lowess=True)
ax.set(ylabel='residuals', xlabel='fitted values')
What you are getting with sns.residplot() is the y variable regressed onto the x variable, with the residuals of that regression plotted, which makes no sense in your case. I illustrate below how that plot is obtained: first, fit the prediction (y variable) to the actual values (x variable) and take the residuals:
plotfit = LinearRegression()
plotfit.fit(Y_test.to_numpy().reshape(-1,1),Y_pred)
residual = Y_pred - plotfit.predict(Y_test.to_numpy().reshape(-1,1))
Then plotting it gives you exactly the same thing as your sns.residplot:
sns.set(style="whitegrid")
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.residplot(Y_test, Y_pred, lowess=True, color="g", ax=ax[0])
ax[0].set_xlim(0, 2.5)
sns.regplot(x=Y_test, y=residual, lowess=True, ax=ax[1])
ax[1].set_xlim(0, 2.5)
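As a side note, recent seaborn releases (0.12+) no longer accept these positional arguments, so on a current install the residplot call above needs keywords; a sketch:

# seaborn >= 0.12 requires keyword arguments for x and y
sns.residplot(x=Y_test, y=Y_pred, lowess=True, color="g", ax=ax[0])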
This question already has answers here:
Plot multiple confusion matrices with plot_confusion_matrix
(2 answers)
Closed 1 year ago.
I am new here; this is my first question, and I hope to get an answer from the experts. I have 5 classifier models whose confusion matrices I am trying to plot.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
from sklearn.naive_bayes import GaussianNB  # needed for GaussianNB below

classifiers = {
    "Naive Bayes": GaussianNB(),
    "LogisticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
}
and then
from sklearn.metrics import confusion_matrix
for key, classifier in classifiers.items():
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    print(cf_matrix)
which prints the five confusion matrices.
Now I am trying to plot them with the code below, but no data is shown on the plots.
fig, axn = plt.subplots(1, 5, sharex=True, sharey=True)
cbar_ax = fig.add_axes([.91, .3, .03, .4])

for i, ax in enumerate(axn.flat):
    sns.heatmap(cf_matrix, ax=ax,
                cbar=i == 0,
                vmin=0, vmax=1,
                cbar_ax=None if i else cbar_ax)

fig.tight_layout(rect=[0, 0, .9, 1])
Can someone please help me get this done?
sklearn provides plotting capabilities for confusion matrices. There are two ways to do it:
plot_confusion_matrix
ConfusionMatrixDisplay
I used the second way here, because removing the colorbar was quite verbose with the first way (and having multiple colorbars looks very cluttered).
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
classifiers = {
    "Naive Bayes": GaussianNB(),
    "LogisticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
}
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
f, axes = plt.subplots(1, 5, figsize=(20, 5), sharey='row')

for i, (key, classifier) in enumerate(classifiers.items()):
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cf_matrix,
                                  display_labels=iris.target_names)
    disp.plot(ax=axes[i], xticks_rotation=45)
    disp.ax_.set_title(key)
    disp.im_.colorbar.remove()
    disp.ax_.set_xlabel('')
    if i != 0:
        disp.ax_.set_ylabel('')

f.text(0.4, 0.1, 'Predicted label', ha='left')
plt.subplots_adjust(wspace=0.40, hspace=0.1)
f.colorbar(disp.im_, ax=axes)
plt.show()
You need to store the confusion matrices somewhere, so if I use an example dataset:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
data = load_breast_cancer()
scaler = StandardScaler()
X_df = pd.DataFrame(data.data, columns=data.feature_names)
X_df = scaler.fit_transform(X_df)
y_df = pd.DataFrame(data.target, columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=11)
And store it in a similar dictionary:
from sklearn.metrics import confusion_matrix

cf_matrix = dict.fromkeys(classifiers.keys())
for key, classifier in classifiers.items():
    y_pred = classifier.fit(X_train, y_train.values.ravel()).predict(X_test)
    cf_matrix[key] = confusion_matrix(y_test, y_pred)
Then you can plot it:
fig, axn = plt.subplots(1, 5, sharex=True, sharey=True, figsize=(12, 2))

for i, ax in enumerate(axn.flat):
    k = list(cf_matrix)[i]
    sns.heatmap(cf_matrix[k], ax=ax, cbar=i == 4)
    ax.set_title(k, fontsize=8)
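If you also want the counts printed inside each cell, seaborn's heatmap accepts annot and fmt arguments; a small variation on the loop above:

for i, ax in enumerate(axn.flat):
    k = list(cf_matrix)[i]
    # annot=True writes each count into its cell; fmt='d' formats the counts as integers
    sns.heatmap(cf_matrix[k], ax=ax, annot=True, fmt='d', cbar=i == 4)
    ax.set_title(k, fontsize=8)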
I was using matplotlib.pyplot to plot a continuous curve in a Jupyter notebook, with the following code:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

X_train, X_test, y_train, y_test = train_test_split(X.reshape(-1, 1), y, random_state=0)

poly = PolynomialFeatures(degree=9)
X_train_p = poly.fit_transform(X_train)
X_test_p = poly.fit_transform(X_test)

plt.figure(figsize=(5, 5))
plt.title("deg={}".format(9))
plt.plot(X_train, y_train.reshape(-1, 1), 'r')
plt.show()
I expected the data points to be successively connected by straight lines; however, the outcome turned out like this:
I tried multiple variations of reshaping X_train and y_train using .reshape(), but didn't get the expected outcome.
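For what it's worth, the usual cause of this tangled appearance is that train_test_split shuffles the rows, so plt.plot connects the points in shuffled order; sorting by the x values before plotting gives the expected curve. A minimal sketch, assuming the X_train and y_train above:

import numpy as np

# plt.plot joins points in array order, so sort them by x first
order = np.argsort(X_train.ravel())
plt.figure(figsize=(5, 5))
plt.title("deg={}".format(9))
plt.plot(X_train.ravel()[order], y_train[order], 'r')
plt.show()

Separately, X_test should be transformed with poly.transform rather than poly.fit_transform, so the test set reuses the feature mapping fitted on the training set.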