problem with array reshaping for SVM kernel visualization - python

I am trying to make a visualization of different kernels effect of a support vector machine over a data-set but I have a problem reshaping an array size:
The dataset I am using is the following:
https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
df_train_X = pd.read_csv("X_train.txt", header=None, delim_whitespace=True)
df_train_y = pd.read_csv("y_train.txt", header=None, delim_whitespace=True)
X_train = df_train_X.values
y_train = df_train_y[0]
X = X_train
y = y_train
X = X[y != 0, :2]
y = y[y != 0]
n_sample = len(X)
np.random.seed(0)
order = np.random.permutation(n_sample)
X = X[order]
y = y[order].astype(np.float)
X_train = X[:int(.9 * n_sample)]
y_train = y[:int(.9 * n_sample)]
X_test = X[int(.9 * n_sample):]
y_test = y[int(.9 * n_sample):]
# fit the model
for kernel in ('linear', 'rbf', 'poly'):
clf = SVC(kernel=kernel, gamma=10)
clf.fit(X_train, y_train)
plt.figure()
plt.clf()
plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired,
edgecolor='k', s=20)
# Circle out the test data
plt.scatter(X_test[:, 0], X_test[:, 1], s=80, facecolors='none',
zorder=10, edgecolor='k')
plt.axis('tight')
x_min = X[:, 0].min()
x_max = X[:, 0].max()
y_min = X[:, 1].min()
y_max = X[:, 1].max()
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])
# Put the result into a color plot
Z = Z.reshape(XX.shape)
plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
linestyles=['--', '-', '--'], levels=[-.5, 0, .5])
plt.title(kernel)
plt.show()
The problem is related to Z = Z.reshape(XX.shape) since I get the following error that blocks the code execution: cannot reshape array of size 240000 into shape (200,200)
But I don't know how to solve it to perform the visualization.

Related

How to print the classified points based on SVM classifier

I was using "svm" classifier to classify it was a bike or car.
So, my features were 0,1,2 columns and dependents was 3rd column.I can able to clearly see the classification,but i don't know how to print all the points based on classification in diagram.
import numpy as np
import operator
from matplotlib import pyplot as plt
from sklearn import svm
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
dataframe=pd.read_csv(DATASET_PATH)
dataframe = dataframe.dropna(how='any',axis=0)
SVM_Trained_Model = preprocessing.LabelEncoder()
train_data=dataframe[0:len(dataframe)]
le=preprocessing.LabelEncoder()
col=dataframe.columns[START_TRAIN_COLUMN:].astype('U')
col_name=["no_of_wheels","dimensions","windows","vehicle_type"]
for i in range(0,len(col_name)):
self.train_data[col_name[i]]=le.fit_transform(self.train_data[col_name[i]])
train_column=np.array(train_data[col]).astype('U')
data=train_data.iloc[:,[0,1,2]].values
target=train_data.iloc[:,3].values
data_train, data_test, target_train, target_test = train_test_split(data,target, test_size = 0.30,
random_state = 0) `split test and test train`
svc_model=SVC(kernel='rbf', probability=True))'classifier model'
svc_model.fit(data_train, target_train)
all_labels =svc_model.predict(data_test)
X_set, y_set = data_train, target_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step =
0.01),np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
Xpred = np.array([X1.ravel(), X2.ravel()] + [np.repeat(0, X1.ravel().size) for _ in range(1)]).T
pred = svc_model.predict(Xpred).reshape(X1.shape)
plt.contourf(X1, X2, pred,alpha = 0.75, cmap = ListedColormap(('white','orange','pink')))
plt.xlim(X1.min(),X1.max())
plt.ylim(X2.min(), X2.max())
colors=['red','yellow','cyan','blue']
for i, j in enumerate(np.unique(y_set)):
plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],c = ListedColormap((colors[i]))(i), label
= j)
plt.title('Multiclass Classifier ')
plt.xlabel('Features')
plt.ylabel('Dependents')
plt.legend()
plt.show()
Image
So here is my diagram I need to print the points using python print() based on pink and white region in the diagram.Please help me to get this points.
You need to select and use only 2 features in order to make a 2D surface plot.
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
def make_meshgrid(x, y, h=.02):
x_min, x_max = x.min() - 1, x.max() + 1
y_min, y_max = y.min() - 1, y.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
return xx, yy
def plot_contours(ax, clf, xx, yy, **params):
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
out = ax.contourf(xx, yy, Z, **params)
return out
model = svm.SVC(kernel='linear')
clf = model.fit(X, y)
fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of linear SVC ')
# Set-up grid for plotting.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('y label here')
ax.set_xlabel('x label here')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
ax.legend()
plt.show()

What is the reason of difference between SVC with rbf kernel and linear kernel which is fitted by rbf-kernel computed X

I'm confused about SVC with kernel method, e.g., rbf. What I understand is when SVC with rbf kernel is applied to fit(x,y), it computes the rbf kernel matrix K of (x,x.T) which shape is [n_samples, n_samples], then transforms this kernel matrix K to y with hinge loss.
Under this intuition, I use sklearn.svm.svc and sklearn.metrics.pairwise.rbf_kernel to compare the results between:
svc(kernel='rbf').fit(x,y)
# and
svc(kernel='precomputed').fit(rbf_kernel(x,x),y)
# and
svc(kernel='linear').fit(rbf_kernel(x,x),y)
I think it's supposed to be same in the classification result. There are some difference between these three results.
More specifically, if you run the code as follow, svc(kernel='precomputed').fit(rbf_kernel(x,x),y)) performs same as svc(kernel='rbf').fit(x,y), but svc(kernel='linear').fit(rbf_kernel(x,x),y) performs not as well as the other two methods.
Could anyone help me to figure out the reason? Thanks.
from sklearn.datasets import make_moons, make_circles, make_classification
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap
import numpy as np
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
h = .02 # step size in the mesh
names = [
"RBF SVM",
# "RP Ridge",
"RBF-Precomp SVM",
"RBF-Linear SVM",
]
classifiers = [
SVC(gamma=1, C=1),
SVC(kernel='precomputed',C=1,gamma=1),
SVC(kernel="linear", C=1),
]
datasets = [
make_moons(n_samples=200,noise=0, random_state=0),
make_moons(n_samples=200,noise=0.2, random_state=0),
make_circles(n_samples=200,noise=0, factor=0.5, random_state=0),
make_circles(n_samples=200,noise=0.2, factor=0.5, random_state=0),]
figure = plt.figure(figsize=(int((len(classifiers)+1)*3), int(len(datasets)*3)))
i=1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
# preprocess dataset, split into training and test part
X, y = ds
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=.2, random_state=42)
K_train = rbf_kernel(X_train,X_train,gamma=1)
K_test = rbf_kernel(X_test,X_train,gamma=1)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# just plot the dataset first
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
if ds_cnt == 0:
ax.set_title("Input data")
# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
edgecolors='k')
# Plot the testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
edgecolors='k', marker='*')
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
i += 1
# iterate over classifiers
for name, clf in zip(names, classifiers):
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
if "Pre" in name:
clf.fit(K_train,y_train)
score = clf.score(K_test, y_test)
elif "Linear" in name:
clf.fit(K_train,y_train)
score = clf.score(K_test, y_test)
else:
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
# create test data from mesh grid
mesh_data = np.c_[xx.ravel(), yy.ravel()]
K_mesh = rbf_kernel(mesh_data, X_train,gamma=1)
if "Pre" in name or "Linear" in name:
Z = clf.decision_function(K_mesh)
else:
Z = clf.decision_function(mesh_data)
# Put the result into a color plot
Z = Z.reshape(xx.shape)
# draw the every mesh grid, distinct them with colors in plt.cm.RdBu
ax.contourf(xx, yy, Z, 66, cmap=cm, alpha=0.6)
# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
edgecolors='k')
# Plot the testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
edgecolors='k', alpha=0.6, marker='*')
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
if ds_cnt == 0:
ax.set_title(name)
ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
size=15, horizontalalignment='right')
i += 1
plt.tight_layout()
plt.savefig('bench_test.png')

Use same colors for scatter and contourf on same plot

I have to plot the decision boundaries of a classifier and the samples (with labels) used to train the classifier so I am using scatter plot and contourf. I can't find a way to use the same colors for both functions, when I specify a color sequence the scatter plot works well but the colors for the boundaries do not follow the order that I expect.
Here is my code:
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as plt
clf = nb.GaussianNB()
X = np.zeros((4, 2), dtype=int)
X[0, :] = 10
X[1, :] = 20
X[2, :] = 30
X[3, :] = 40
Y = [0, 1, 2, 3]
clf.fit(X, Y)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max,0.1))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(figsize=(8,8))
colors_ = ['red', 'yellow', 'cyan', 'blue']
plt.contourf(xx, yy, Z, colors=colors_, alpha=0.5)
labels = ['dog', 'guitar', 'house', 'person']
for i in np.unique(Y):
mask = Y == i
plt.scatter(X[mask, 0], X[mask, 1], c=colors_[i], label=labels[i],
edgecolors='k', alpha=0.8)
plt.legend()
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Decision Boundaries of Naive Bayes Classifier')
plt.show()
The image should have decision regions and scatter points with more or less the same colors (i.e. yellow region where most of the yellow points are, red region where most of the red points are and so on)....with the code above the colors are mixed (i.e. red region where most of the yellow points are).
I suppose that this is simply a problem about ordering but I can't figure it out.

Plot hyperplane Linear SVM python

I am trying to plot the hyperplane for the model I trained with LinearSVC and sklearn. Note that I am working with natural languages; before fitting the model I extracted features with CountVectorizer and TfidfTransformer.
Here the classifier:
from sklearn.svm import LinearSVC
from sklearn import svm
clf = LinearSVC(C=0.2).fit(X_train_tf, y_train)
Then I tried to plot as suggested on the Scikit-learn website:
# get the separating hyperplane
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]
# plot the parallels to the separating hyperplane that pass through the
# support vectors
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])
# plot the line, the points, and the nearest vectors to the plane
plt.plot(xx, yy, 'k-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
s=80, facecolors='none')
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
plt.axis('tight')
plt.show()
This example uses svm.SVC(kernel='linear'), while my classifier is LinearSVC. Therefore, I get this error:
AttributeError Traceback (most recent call last)
<ipython-input-39-6e231c530d87> in <module>()
7 # plot the parallels to the separating hyperplane that pass through the
8 # support vectors
----> 9 b = clf.support_vectors_[0]
1 yy_down = a * xx + (b[1] - a * b[0])
11 b = clf.support_vectors_[-1]
AttributeError: 'LinearSVC' object has no attribute 'support_vectors_'
How can I successfully plot the hyperplan of my LinearSVC classifier?
What about leaving the support_ out, which is not defined for a LinearSVC?
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20
fig, ax = plt.subplots()
clf2 = svm.LinearSVC(C=1).fit(X, Y)
# get the separating hyperplane
w = clf2.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf2.intercept_[0]) / w[1]
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx2, yy2 = np.meshgrid(np.arange(x_min, x_max, .2),
np.arange(y_min, y_max, .2))
Z = clf2.predict(np.c_[xx2.ravel(), yy2.ravel()])
Z = Z.reshape(xx2.shape)
ax.contourf(xx2, yy2, Z, cmap=plt.cm.coolwarm, alpha=0.3)
ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.coolwarm, s=25)
ax.plot(xx,yy)
ax.axis([x_min, x_max,y_min, y_max])
plt.show()

Trouble Setting Up Multiple Algorithm Trials in Scikit-Learn

I am trying to use this example from the sklearn documentation. I am not really sure what the code is doing and although I assume I am inputting my dataset the wrong way, I recently obtained this error:
<ipython-input-26-3c3c0763766b> in <module>()
49 for ds in datasets:
50 # preprocess dataset, split into training and test part
---> 51 X, y = ds
52 X = StandardScaler().fit_transform(X)
53 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
ValueError: too many values to unpack
Any ideas as to how I can modify the code to work with my dataset (which is a multidimensional numpy array from a pandas dataframe) and fix the error?
dataURL = "peridotites_clean_complete.csv"
pd_data = pd.read_csv(dataURL)
rock_names = pd_data['ROCK NAME']
rock_compositions = pd_data.columns[1:]
rock_data = np.vstack([pd_data[x] for x in rock_compositions])
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
AdaBoostClassifier(),
GaussianNB(),
LDA(),
QDA()]
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [rock_data]
figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
# preprocess dataset, split into training and test part
X, y = ds
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# just plot the dataset first
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
# and testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
i += 1
# iterate over classifiers
for name, clf in zip(names, classifiers):
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
if hasattr(clf, "decision_function"):
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
# Plot also the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
# and testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
alpha=0.6)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(name)
ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
size=15, horizontalalignment='right')
i += 1
figure.subplots_adjust(left=.02, right=.98)
plt.show()
The thing is ds is a list with more than two values like the one shown below:
>>> ds=['rockatr1','rockatr2','rockatr','rocktype']
>>> X,y=ds
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: too many values to unpack
You have to specify which part is X and which is y as shown below. Usually in classification data the last column is used as label, that is what I assumed here.
>>> X,y=ds[:-1],ds[-1]
>>> X
['rockatr1', 'rockatr2', 'rockatr']
>>> y
'rocktype'

Categories

Resources