Python - how to visualise a decision boundary for a LogisticRegression?

I'm trying to visualise the decision boundary for a LogisticRegression() classifier,
but I get ValueError: X has 2 features per sample; expecting 24 when calling plot_boundary(LogisticRegression(), X, y, "Log Reg").
I've checked X.shape and it is (14635, 24).
What could be wrong with my function?
import numpy as np
import matplotlib.pyplot as plt

def plot_boundary(clf, X, y, plot_title):
    xx, yy = np.meshgrid(np.linspace(-3, 3, 50), np.linspace(-3, 3, 50))
    clf.fit(X, y)
    # plot the decision function for each datapoint on the grid
    Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
    Z = Z.reshape(xx.shape)
    image = plt.imshow(Z, interpolation='nearest',
                       extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                       aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
    # predict_proba returns values in [0, 1], so the boundary sits at 0.5
    contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2, linestyles='--')
    plt.scatter(X[:, 0], X[:, 1], s=30, c=y, cmap=plt.cm.Paired)
    plt.xticks(())
    plt.yticks(())
    plt.xlabel(r'$x_1$')
    plt.ylabel(r'$x_2$')
    plt.axis([-3, 3, -3, 3])
    plt.colorbar(image)
    plt.title(plot_title, fontsize=12)
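
The mismatch is between the mesh and the model: np.vstack((xx.ravel(), yy.ravel())).T has exactly two columns, while clf.fit(X, y) inside the function sees all 24 columns of X, so predict_proba then rejects the 2-column grid. One minimal sketch of a fix (assuming the first two columns are the ones you want on the axes; any 2-D projection would do):

# Sketch: fit and plot in the same 2-D space as the mesh.
X2 = X[:, :2]                     # shape (14635, 2), matches the 2-column grid
plot_boundary(LogisticRegression(), X2, y, "Log Reg")

# Or reduce the 24 features to 2 first (illustrative alternative):
# from sklearn.decomposition import PCA
# X2 = PCA(n_components=2).fit_transform(X)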

Related

3d matplotlib zaxis font

Using this code:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib import rc

data = np.genfromtxt('jpdfomegal2_90.dat')
x_len = len(np.unique(data[:, 0]))
y_len = len(np.unique(data[:, 1]))
# reshape X, Y, and Z into 2D arrays
X = data[:, 0].reshape(x_len, y_len)
Y = data[:, 1].reshape(x_len, y_len)
Z = data[:, 2].reshape(x_len, y_len)
Zmin = np.where(Z > 0, Z, np.inf).min()   # smallest strictly positive value
Zmax = Z.max()
Z[Z == 0] = 0.9 * Zmin                    # avoid log10(0)
Zlog = np.log10(Z)
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(projection='3d')
rc('font', family='palatino')
rc('font', size=14)
ax.set_xlim3d(0, 15)
ax.set_zlim3d(np.floor(np.log10(Zmin)) - 2, np.ceil(np.log10(10)))
# log_tick_formatter is defined elsewhere; presumably it renders a tick value v as 10**v
ax.zaxis.set_major_formatter(mticker.FuncFormatter(log_tick_formatter))
ax.zaxis.set_major_locator(mticker.MaxNLocator(integer=True))
ax.contour(X, Y, np.log10(Z), 4, linewidths=0.1, colors="k",
           linestyles="--", offset=np.floor(np.log10(Zmin)) - 2)
ax.plot_surface(X, Y, np.log10(Z), cmap="binary", lw=0.1, alpha=0.5)
ax.plot_wireframe(X, Y, np.log10(Z), linewidth=0.5, color='k')
ax.contour(X, Y, np.log10(Z), 4, linewidths=0.1, colors="k", linestyles="solid")
ax.view_init(elev=17, azim=-60)
for spine in ax.spines.values():
    spine.set_visible(False)
plt.tight_layout()
plt.savefig('jpdf_lambda2_90.pdf', bbox_inches='tight')
plt.show()
I get a Palatino font on the x and y axis tick labels, but not on the z axis, as shown in the picture.
How can I obtain the same font everywhere?
And how can I define the z axis so it gets minor sub-ticks like a classical logarithmic scale? Thanks.
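
One possible approach, sketched under the assumption that the z tick labels produced by log_tick_formatter are mathtext (e.g. $10^{-2}$): mathtext is drawn with its own font set and ignores rc('font', family=...), so pointing the custom mathtext fonts at Palatino should make all three axes match. Log-style minor sub-ticks can be placed by hand with a FixedLocator, though note that 3D axes have only limited support for minor ticks in some matplotlib versions:

import numpy as np
import matplotlib as mpl
import matplotlib.ticker as mticker

# Assumption: the z labels are mathtext, so they need the mathtext font set.
mpl.rcParams['font.family'] = 'Palatino'
mpl.rcParams['mathtext.fontset'] = 'custom'
mpl.rcParams['mathtext.rm'] = 'Palatino'
mpl.rcParams['mathtext.it'] = 'Palatino:italic'

# Minor ticks at n + log10(2..9) within each decade, mimicking a log scale
# on a linear axis that actually holds log10 values.
def log_minor_ticks(vmin, vmax):
    ticks = []
    for n in range(int(np.floor(vmin)), int(np.ceil(vmax)) + 1):
        ticks.extend(n + np.log10(np.arange(2, 10)))
    return [t for t in ticks if vmin <= t <= vmax]

# After setting the z limits on the 3D axes:
# ax.zaxis.set_minor_locator(mticker.FixedLocator(log_minor_ticks(*ax.get_zlim())))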

X.shape[1] = 2 should be equal to 9, the number of features at training time

I'm trying to plot my binary SVM classifier results with matplotlib.pyplot, using this documentation as a guide:
https://scikit-learn.org/0.18/auto_examples/svm/plot_iris.html
Here's my code:
import numpy as np
import matplotlib.pyplot as plt

# create a mesh to plot in
h = .02
x_min, x_max = X.min() - 1, X.max() + 1
y_min, y_max = 0, 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
# PLOT
for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[i])
plt.show()
Here X.shape = (10769, 9), and the class values are either 0 or 1, which is why I set y_min and y_max to 0 and 1.
This is the error I'm getting:
X.shape[1] = 2 should be equal to 9, the number of features at training time
I'm not sure if I understand how to plot SVMs correctly - what am I doing wrong?
Here is the link to the notebook with the full code: https://colab.research.google.com/drive/1F3_CFIDv8qaDeWodujpeu4phQADFpodP?usp=sharing
Any help would be much appreciated!!
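
The error is literal: np.c_[xx.ravel(), yy.ravel()] hands the classifier 2 features, but it was trained on 9, and a mesh-based boundary plot only exists in a plane. Also note that y_min and y_max on the mesh bound the second plotted feature, not the class labels. A sketch of one workaround, under the assumption that refitting on a 2-D projection is acceptable for visualisation:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC

# Project the 9 features down to 2 so the mesh and the model agree.
pca = PCA(n_components=2)
X2 = pca.fit_transform(X)                    # shape (10769, 2)
svc2d = SVC(kernel='linear').fit(X2, y)      # refit on the projection

h = .02
x_min, x_max = X2[:, 0].min() - 1, X2[:, 0].max() + 1
y_min, y_max = X2[:, 1].min() - 1, X2[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = svc2d.predict(np.c_[xx.ravel(), yy.ravel()])  # 2 columns in, as trained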

Use same colors for scatter and contourf on same plot

I have to plot the decision boundaries of a classifier together with the samples (with labels) used to train it, so I am using a scatter plot and contourf. I can't find a way to make both functions use the same colors: when I specify a color sequence the scatter plot works well, but the colors of the boundary regions do not follow the order I expect.
Here is my code:
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as plt

clf = nb.GaussianNB()
X = np.zeros((4, 2), dtype=int)
X[0, :] = 10
X[1, :] = 20
X[2, :] = 30
X[3, :] = 40
Y = [0, 1, 2, 3]
clf.fit(X, Y)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(figsize=(8, 8))
colors_ = ['red', 'yellow', 'cyan', 'blue']
plt.contourf(xx, yy, Z, colors=colors_, alpha=0.5)
labels = ['dog', 'guitar', 'house', 'person']
for i in np.unique(Y):
    mask = Y == i
    plt.scatter(X[mask, 0], X[mask, 1], c=colors_[i], label=labels[i],
                edgecolors='k', alpha=0.8)
plt.legend()
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Decision Boundaries of Naive Bayes Classifier')
plt.show()
The image should show decision regions and scatter points in more or less the same colors (i.e. a yellow region where most of the yellow points are, a red region where most of the red points are, and so on). With the code above the colors are mixed (e.g. a red region where most of the yellow points are).
I suppose this is simply an ordering problem, but I can't figure it out.
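
This does look like a level-ordering problem: when contourf receives no explicit levels it chooses its own boundaries, so the mapping from class value to entry in colors_ can drift. One sketch of a fix is to pin a boundary halfway between each pair of class labels, so region k always takes colors_[k], matching the scatter:

# Class labels are 0..3, so place level boundaries halfway between them.
levels = np.arange(len(colors_) + 1) - 0.5   # [-0.5, 0.5, 1.5, 2.5, 3.5]
plt.contourf(xx, yy, Z, levels=levels, colors=colors_, alpha=0.5)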

Subplot drawn by auxiliary function experiences unexpected shift

I'm trying to merge two plots in one:
http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_iris.html
http://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_decision_regions.html#sphx-glr-auto-examples-ensemble-plot-voting-decision-regions-py
In the left plot I want to display the decision boundary with the hyperplanes corresponding to the OVA classifiers, and in the right plot I would like to show the decision probabilities.
This is the code so far:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import datasets
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

def plot_hyperplane(c, color, fitted_model):
    """
    Plot the one-against-all classifiers for the given model.

    Parameters
    --------------
    c : index of the hyperplane to be plotted
    color : color to be used when drawing the line
    fitted_model : the fitted model
    """
    xmin, xmax = plt.xlim()
    ymin, ymax = plt.ylim()
    try:
        coef = fitted_model.coef_
        intercept = fitted_model.intercept_
    except AttributeError:
        return

    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]

    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color, zorder=3)
def plot_decision_boundary(X, y, fitted_model, features, targets):
    """
    Plot a model's decision boundary and, where available, its decision
    probabilities. Requires a model fitted on two features only.

    Parameters
    --------------
    X : the data to learn
    y : the classification labels
    fitted_model : the fitted model
    """
    cmap = plt.get_cmap('Set3')
    prob = cmap
    colors = [cmap(i) for i in np.linspace(0, 1, len(fitted_model.classes_))]
    plt.figure(figsize=(9.5, 5))
    for i, plot_type in enumerate(['Decision Boundary', 'Decision Probabilities']):
        plt.subplot(1, 2, i + 1)
        mesh_step_size = 0.01  # step size in the mesh
        x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
        y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step_size),
                             np.arange(y_min, y_max, mesh_step_size))
        # First plot, predicted results using the given model
        if i == 0:
            Z = fitted_model.predict(np.c_[xx.ravel(), yy.ravel()])
            for h, color in zip(fitted_model.classes_, colors):
                plot_hyperplane(h, color, fitted_model)
        # Second plot, predicted probabilities using the given model
        else:
            prob = 'RdYlBu_r'
            try:
                Z = fitted_model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
            except AttributeError:
                plt.text(0.4, 0.5, 'Probabilities Unavailable',
                         horizontalalignment='center', verticalalignment='center',
                         transform=plt.gca().transAxes, fontsize=12)
                plt.axis('off')
                break
        Z = Z.reshape(xx.shape)
        # Display Z
        plt.imshow(Z, interpolation='nearest', cmap=prob, alpha=0.5,
                   extent=(x_min, x_max, y_min, y_max), origin='lower', zorder=1)
        # Plot the data points (loop variable renamed so it does not shadow i)
        for cls, color in zip(fitted_model.classes_, colors):
            idx = np.where(y == cls)
            plt.scatter(X[idx, 0], X[idx, 1], facecolor=color, edgecolor='k', lw=1,
                        label=iris.target_names[cls], alpha=0.8, zorder=2)
        plt.title(plot_type + '\n' +
                  str(fitted_model).split('(')[0] +
                  ' Test Accuracy: ' + str(np.round(fitted_model.score(X, y), 5)))
        plt.xlabel(features[0])
        plt.ylabel(features[1])
        plt.gca().set_aspect('equal')
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9, bottom=0.08, wspace=0.02)
    plt.show()
if __name__ == '__main__':
    iris = datasets.load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    scaler = preprocessing.StandardScaler().fit_transform(X)
    clf1 = DecisionTreeClassifier(max_depth=4)
    clf2 = KNeighborsClassifier(n_neighbors=7)
    clf3 = SVC(kernel='rbf', probability=True)
    clf4 = SGDClassifier(alpha=0.001, max_iter=100)  # max_iter replaces the removed n_iter
    clf1.fit(X, y)
    clf2.fit(X, y)
    clf3.fit(X, y)
    clf4.fit(X, y)
    plot_decision_boundary(X, y, clf1, iris.feature_names, iris.target_names[[0, 2]])
    plot_decision_boundary(X, y, clf2, iris.feature_names, iris.target_names[[0, 2]])
    plot_decision_boundary(X, y, clf3, iris.feature_names, iris.target_names[[0, 2]])
    plot_decision_boundary(X, y, clf4, iris.feature_names, iris.target_names[[0, 2]])
And the results:
As can be seen, for the last example (clf4 in the code above) the hyperplanes are drawn in the wrong position, and I have so far been unable to correct this. They should be translated to the correct range with respect to the features used to fit the model.
Thanks.
Apparently, the problem is that the ends of the dashed lines representing the hyperplanes are not consistent with the final (expected) xlim and ylim. The good thing in this case is that you already have x_min, x_max, y_min, y_max defined, so use them to fix xlim and ylim by adding the following three lines before plotting the hyperplanes (specifically, right before the comment line # First plot, predicted results using the given model):
ax = plt.gca()
ax.set_xlim((x_min, x_max), auto=False)
ax.set_ylim((y_min, y_max), auto=False)

plotting linear SVM

I tried following the example here, but I am having trouble applying it when I have 16 features. lin_svc is trained on those 16 features (I deleted the line from the example that re-trains it). The classifier itself works: I tried it and also extracted .coef_ beforehand.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# features is an array of 16
# lin_svc variable is available
# train is a pandas DF
X = train[features].to_numpy()
y = train.outcome
h = .02  # step size in the mesh
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
# title for the plots
titles = ['SVC with linear kernel']
for i, clf in enumerate([lin_svc]):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    Z = clf.predict(X)
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[i])
plt.show()
The error I am getting is:
ValueError Traceback (most recent call last)
<ipython-input-8-d52ca252fc3a> in <module>()
24
25 # Put the result into a color plot
---> 26 Z = Z.reshape(xx.shape)
27 plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
28
ValueError: total size of new array must be unchanged
I've encountered this same issue myself. Since you're really interested in plotting Z as a function of xx and yy, you should be passing those to clf.predict() rather than passing X. Try replacing
Z = clf.predict(X)
with
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
and the plot should show nicely (assuming no other bugs).
Also you may want to change the title of your question to something like "Plotting 2-D Decision Boundary," since this has nothing to do with SVMs specifically. You'll encounter this kind of issue with any of the sklearn classifiers.
