Draw figures using k-nn with different values of k - python

I want to plot figures with different values of k for a k-NN classifier.
My problem is that the figures all seem to use the same value of k.
What I have tried so far is to change the value of k on each iteration of the loop:
clf = KNeighborsClassifier(n_neighbors=counter+1)
But all the figures appear to be for k=1.
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
import numpy as np
from sklearn.model_selection import train_test_split
c = np.array([1 if y > np.median(data['target']) else 0 for y in data['target']])
X_train, X_test, c_train, c_test = train_test_split(data['data'], c, random_state=0)
from sklearn.neighbors import KNeighborsClassifier
import mglearn
import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
for counter in range(3):
    clf = KNeighborsClassifier(n_neighbors=counter+1)
    clf.fit(X_test, c_test)
    plt.tight_layout()  # this will help create proper spacing between the plots.
    mglearn.discrete_scatter(X_test[:,0], X_test[:,1], c_test, ax=ax[counter])
    plt.legend(["Class 0", "Class 1"], loc=4)
    plt.xlabel("First feature")
    plt.ylabel("Second feature")
    #plt.figure()

The reason all the plots look the same is that you are simply plotting the test set every time instead of plotting the model's predictions on the test set. You probably meant to do the following for each value of k:
1. Fit the model to the training set: replace clf.fit(X_test, c_test) with clf.fit(X_train, c_train).
2. Generate the model's predictions on the test set: add c_pred = clf.predict(X_test).
3. Plot the predictions instead of the test labels: use mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], c_pred, ax=ax[counter]) instead of mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], c_test, ax=ax[counter]).
Updated code:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import mglearn
import matplotlib.pyplot as plt

data = fetch_california_housing()
c = np.array([1 if y > np.median(data['target']) else 0 for y in data['target']])
X_train, X_test, c_train, c_test = train_test_split(data['data'], c, random_state=0)

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
for counter in range(3):
    clf = KNeighborsClassifier(n_neighbors=counter+1)
    # fit the model to the training set
    clf.fit(X_train, c_train)
    # generate the model predictions on the test set
    c_pred = clf.predict(X_test)
    # plot the model predictions
    plt.tight_layout()
    mglearn.discrete_scatter(X_test[:,0], X_test[:,1], c_pred, ax=ax[counter])
    plt.legend(["Class 0", "Class 1"], loc=4)
    plt.xlabel("First feature")
    plt.ylabel("Second feature")

Related

Problem with plotting decision regions for classification model

I have a problem with plotting decision regions for a Logistic Regression classification model. Can somebody help me and explain how to do that? I put the colab link to this project here -> https://colab.research.google.com/drive/1JqFyoAk0zithy4esfjiyo6MdB12iBndi?usp=sharing
Dataset from Kaggle -> https://www.kaggle.com/datasets/muratkokludataset/date-fruit-datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions

np.set_printoptions(suppress=True, edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: f'{x:.8f}'))
np.random.seed(42)
sns.set()
desired_width = 320
pd.options.display.float_format = '{:,.8f}'.format
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 12)

raw_data = pd.read_excel(io='/content/Date_Fruit_Datasets.xlsx',
                         sheet_name='Date_Fruit_Datasets')
data = raw_data.copy()
data.head(n=10)
data.describe().transpose()
data.info()
data.shape

# Creating data and target
X = data.drop(columns='Class')
y = data['Class']
X.shape
y.shape

# Encoding target
encoder = LabelEncoder()
y = encoder.fit_transform(y=y)

# Creating train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scaling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X=X_train)
X_test = scaler.transform(X=X_test)

# Creating classifier, fitting and predicting
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train, y=y_train)
y_pred = classifier.predict(X=X_test)
y_pred_proba = classifier.predict_proba(X=X_test)

# Checking final reports and scores
score = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred, target_names=encoder.classes_)
confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Comparing y_true and y_pred in a DataFrame
results = pd.DataFrame(data={
    'y_true': y_test,
    'y_pred': y_pred
})

# Creating a DataFrame with predicted probabilities
predict_proba = pd.DataFrame(data=classifier.predict_proba(X=X_test), columns=encoder.classes_)

# Saving results to csv
results.to_csv(path_or_buf='/content/data_fruit_predictions.csv')
predict_proba.to_csv(path_or_buf='/content/data_fruit_predict_proba.csv')

# Plotting decision regions
value = 1.5
width = 0.75
plt.figure(figsize=(10, 8))
plot_decision_regions(X=X.values, y=y, clf=classifier,
                      filler_feature_values={i: value for i in range(1, 34)},
                      filler_feature_ranges={i: width for i in range(1, 34)},
                      legend=2)
plt.show()
After using the plot_decision_regions function, PyCharm shows me warnings like:
UserWarning: No contour levels were found within the data range.
ax.contour(xx, yy, Z, cset.levels,
and
UserWarning: You passed a edgecolor/edgecolors ('black') for an unfilled marker ('x'). Matplotlib is ignoring the edgecolor in favor of the facecolor. This behavior may change in the future.
ax.scatter(x=x_data,
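The first warning typically means the decision surface is constant over the plotted 2-D slice, which can easily happen when the other 33 features are all pinned to one filler value. A simpler sketch that sidesteps filler features entirely (my own suggestion, not from the original notebook) is to fit and plot a model on just two features:

import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from sklearn.linear_model import LogisticRegression

# sketch: assumes X_train, y_train are the scaled arrays from the code above;
# the choice of the first two features is arbitrary
X2_train = X_train[:, :2]
clf2 = LogisticRegression(max_iter=1000)
clf2.fit(X2_train, y_train)

plt.figure(figsize=(10, 8))
plot_decision_regions(X=X2_train, y=y_train, clf=clf2, legend=2)
plt.show()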

Arranging an array of `ConfusionMatrixDisplay` objects in a single plot using matplotlib plt.subplots()

I have an array of different models' confusion matrix displays, each of which is of type ConfusionMatrixDisplay. I want to display them nicely in a single figure using plt.subplots. How can I achieve that? Sample code that I tried is attached below.
%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = SVC(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
arr = [disp, disp, disp] * 3

rows = 0
l = len(arr)
if l % 4 == 0:
    rows = l // 4
else:
    rows = l // 4 + 1

fig, ax = plt.subplots(rows, 4, sharex='col', sharey='row', figsize=(6, 6))
print(len(arr))
for i in range(rows):
    for j in range(4):
        if 4*i + j < len(arr):
            ax[i, j] = arr[4*i + j].ax_
plt.show()
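The assignment ax[i, j] = arr[4*i + j].ax_ only replaces the axes reference in the array; it never draws anything into the subplot grid. A minimal sketch of one common fix (my own, not from the original post) is to let each stored display redraw itself into a target axes with ConfusionMatrixDisplay.plot(ax=...):

import matplotlib.pyplot as plt

# sketch: `arr` holds the ConfusionMatrixDisplay objects from the code above
rows = (len(arr) + 3) // 4                  # ceiling division, 4 plots per row
fig, axes = plt.subplots(rows, 4, figsize=(12, 3 * rows))
for disp_i, ax_i in zip(arr, axes.flatten()):
    disp_i.plot(ax=ax_i, colorbar=False)    # redraw this matrix into the axes
for ax_i in axes.flatten()[len(arr):]:
    ax_i.axis('off')                        # hide any unused panels
plt.tight_layout()
plt.show()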

Scatter Plot of predicted vs actual value with regression curve

I am trying to create scatter plots with regression curves using the following code, with different algorithms such as linear regression, SVM, and Gaussian process regression. I have tried the different plotting options mentioned below.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor

df = pd.read_excel('coded.xlsx')
dfnew = df[['FL', 'FW', 'TL', 'LL', 'KH']]
Y = df['KH']
X = df[['FL']]
X = X.values.reshape(len(X), 1)
Y = Y.values.reshape(len(Y), 1)

# Split the data into training/testing sets
X_train = X[:-270]
X_test = X[-270:]

# Split the targets into training/testing sets
Y_train = Y[:-270]
Y_test = Y[-270:]

#regressor = SVR(kernel = 'rbf')
#regressor.fit(X_train, np.ravel(Y_train))

# training the algorithm
regressor = GaussianProcessRegressor(random_state=42)
regressor.fit(X_train, Y_train)
y_pred = regressor.predict(X_test)

mse = np.sum((y_pred - Y_test)**2)
# root mean squared error; m is the number of training examples
rmse = np.sqrt(mse/270)
print(rmse)

#X_grid = np.arange(min(X), max(X), 0.01) #this step required because data is feature scaled.
#X_grid = np.arange(0, 15, 0.01) #this step required because data is feature scaled.
#X_grid = X_grid.reshape((len(X_grid), 1))
#plt.scatter(X, Y, color = 'red')

print('size of Y_train = {0}'.format(Y_train.size))
print('size of y_pred = {0}'.format(y_pred.size))

#plt.scatter(Y_train, y_pred, color = 'red')
#plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
#plt.title('GPR')
#plt.xlabel('Measured')
#plt.ylabel('Predicted')
#plt.show()

fig, ax = plt.subplots(1, figsize=(12, 6))
plt.plot(X[:, 0], Y_train, marker='o', color='black', linewidth=0)
plt.plot(X[:, 0], y_pred, marker='x', color='steelblue')
plt.suptitle("$GaussianProcessRegressor(kernel=RBF)$ [default]", fontsize=20)
plt.axis('off')
pass
But I am getting an error like:
ValueError: x and y must have same first dimension, but have shapes (540,) and (270, 1)
What is the possible solution?
This code splits X and Y into training/testing sets, but then tries to plot a column from all of X with Y_train and y_pred, which have only half as many values as X. Try creating plots with X_train and X_test instead.
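A minimal sketch of that fix, reusing the variable names from the question (the marker styling is carried over from the original attempt):

fig, ax = plt.subplots(1, figsize=(12, 6))
# training points: X_train pairs with Y_train (same first dimension)
ax.plot(X_train[:, 0], Y_train.ravel(), marker='o', color='black', linewidth=0)
# predictions: y_pred was computed from X_test, so plot it against X_test
ax.plot(X_test[:, 0], y_pred.ravel(), marker='x', color='steelblue', linewidth=0)
plt.suptitle("GaussianProcessRegressor(kernel=RBF) [default]", fontsize=20)
plt.show()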

what should be the parameters in residplot seaborn

I have made a simple linear regression model:
LR = LinearRegression()
kfold = model_selection.KFold(n_splits=10, random_state=12)
result_kfold = model_selection.cross_val_score(LR, X_train, Y_train, cv=kfold, scoring = 'r2')
print("Accuracy: %.2f%%" % (result_kfold.mean()*100.0))
LR.fit(X_train,Y_train)
Y_pred = LR.predict(X_test)
print("Y_pred:", Y_pred)
I want to plot the residual errors. I've used 'residplot' for this, but I'm not sure if I've passed the right arguments. According to the documentation, we have to use the predictor variable and the result/response variable.
Here's the code:
sns.set(style="whitegrid")
sns.residplot(Y_test, Y_pred, lowess=True, color="g")
Can anyone please tell me if this is right... and what should the labels of the X and Y axes be?
Thank you in advance for the help.
You are plotting something very weird, so let's use an example dataset:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

iris = sns.load_dataset('iris')
X_train, X_test, Y_train, Y_test = train_test_split(iris.iloc[:,:3], iris.iloc[:,3], random_state=11)
LR = LinearRegression()
LR.fit(X_train, Y_train)
Y_pred = LR.predict(X_test)
If you just want to plot the residuals, you can do:
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(5, 5))
sns.regplot(x=Y_pred, y=Y_test - Y_pred, ax=ax, lowess=True)
ax.set(ylabel='residuals', xlabel='fitted values')
What you are getting with sns.residplot() is the y variable regressed onto the x variable, with the residuals of that fit being plotted, which makes no sense in your case. I illustrate below how the plot is obtained: first you fit the prediction (y variable) to the actual values (x variable) and get the residuals:
plotfit = LinearRegression()
plotfit.fit(Y_test.to_numpy().reshape(-1,1),Y_pred)
residual = Y_pred - plotfit.predict(Y_test.to_numpy().reshape(-1,1))
Then plotting it gives you exactly the same thing as your sns.residplot:
sns.set(style="whitegrid")
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.residplot(Y_test, Y_pred, lowess=True, color="g", ax=ax[0])
ax[0].set_xlim(0, 2.5)
sns.regplot(x=Y_test, y=residual, lowess=True, ax=ax[1])
ax[1].set_xlim(0, 2.5)

Plot multiple confusion matrices with plot_confusion_matrix

I am using plot_confusion_matrix from sklearn.metrics. I want to represent those confusion matrices next to each other, like subplots. How could I do this?
Let's use the good ol' iris dataset to reproduce this, and fit several classifiers to plot their respective confusion matrices with plot_confusion_matrix:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics import plot_confusion_matrix
data = load_iris()
X = data.data
y = data.target
Set up -
X_train, X_test, y_train, y_test = train_test_split(X, y)

classifiers = [LogisticRegression(solver='lbfgs'),
               AdaBoostClassifier(),
               GradientBoostingClassifier(),
               SVC()]

for cls in classifiers:
    cls.fit(X_train, y_train)
So the way you can compare all matrices at a glance is by creating a set of subplots with plt.subplots, then iterating over both the axes objects and the trained classifiers (plot_confusion_matrix expects the trained classifier as input) and plotting the individual confusion matrices:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
for cls, ax in zip(classifiers, axes.flatten()):
    plot_confusion_matrix(cls,
                          X_test,
                          y_test,
                          ax=ax,
                          cmap='Blues',
                          display_labels=data.target_names)
    ax.title.set_text(type(cls).__name__)
plt.tight_layout()
plt.show()
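Note that plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2; on recent versions the same loop can use ConfusionMatrixDisplay.from_estimator instead (a drop-in sketch with the same parameters):

from sklearn.metrics import ConfusionMatrixDisplay

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
for cls, ax in zip(classifiers, axes.flatten()):
    ConfusionMatrixDisplay.from_estimator(cls,
                                          X_test,
                                          y_test,
                                          ax=ax,
                                          cmap='Blues',
                                          display_labels=data.target_names)
    ax.title.set_text(type(cls).__name__)
plt.tight_layout()
plt.show()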
If that is your desired output, this is my way to see multiple confusion matrices (confusion_matrix) side by side with ConfusionMatrixDisplay.
Note: substitute your own test and train data names in the metrics.confusion_matrix() function.
fig, ax = plt.subplots(1, 2)
ax[0].set_title("test")
ax[1].set_title("train")
metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_test, y_pred),
    display_labels=[False, True]).plot(ax=ax[0])
metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_train, y_train_pred),
    display_labels=[False, True]).plot(ax=ax[1]);
