Finding data points close to the decision boundary of a classifier - python

Sorry if this is a very simple question, but I'm a newcomer to the field.
My specific question is this: I have trained an XGBoost classifier in Python. After training, how can I get the samples in my training data that are closer than a fixed value to the decision boundary of the model?
Thanks

I don't think xgboost has a built-in method for that, nor is there a closed-form formula for the boundary as there is for SVC. This visualization could help, though, for 2D feature spaces:
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # set up marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # plot class samples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=[cmap(idx)],
                    marker=markers[idx], label=cl)
    # highlight test samples
    if test_idx:
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0], X_test[:, 1],
                    facecolors='none', edgecolors='black',
                    alpha=1.0, linewidths=1, marker='o',
                    s=55, label='test set')

X, y = make_moons(noise=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

xgb_clf = xgb.XGBClassifier()
xgb_clf = xgb_clf.fit(X_train, y_train)

plot_decision_regions(X_test, y_test, xgb_clf)
plt.show()
The plot_decision_regions function is from the Python Machine Learning book, available in its public GitHub repository.
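If you need the actual samples near the boundary rather than a picture, a rough workaround is to filter on predict_proba. This is only a sketch under the assumption that samples whose predicted probability is close to 0.5 sit near the boundary; it is not an exact geometric distance to the XGBoost decision surface:
import numpy as np

# Treat |P(class 1) - 0.5| < eps as "close to the decision boundary".
# This is a proxy, not the true distance to the decision surface.
eps = 0.05  # your fixed threshold
proba = xgb_clf.predict_proba(X_train)[:, 1]
near_boundary = X_train[np.abs(proba - 0.5) < eps]
print(len(near_boundary), "training samples within", eps, "of p = 0.5")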

Related

Basic Decision Tree in Python

I don't understand why my algorithm is not separating the samples labelled "1" (blue); it is as if the algorithm ignored them. I am not aware of updates to DecisionTreeClassifier, in case I am missing any parameters. I have 3 labels, that is, 3 types of samples. The code is as follows:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # define a marker generator and a color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # plot class samples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl, edgecolor='black')
    # highlight test samples
    if test_idx:
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0], X_test[:, 1], facecolors='none', edgecolor='black',
                    alpha=0.9, linewidth=1, marker='o', s=100, label='test set')

X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))
tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree.fit(X_train, y_train)
plot_decision_regions(X=X_combined_std, y=y_combined, classifier=tree, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.show()
Revised answer:
You just forgot to train the model with the scaled features X_train_std. So, instead of
tree.fit(X_train, y_train)
it should be
tree.fit(X_train_std, y_train)
Now the model is able to distinguish between all three classes.
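In context, the tail of the script becomes (only the fit line changes):
tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree.fit(X_train_std, y_train)  # train on the standardized features
plot_decision_regions(X=X_combined_std, y=y_combined, classifier=tree,
                      test_idx=range(105, 150))
plt.show()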

How to print the classified points based on SVM classifier

I was using "svm" classifier to classify it was a bike or car.
So, my features were 0,1,2 columns and dependents was 3rd column.I can able to clearly see the classification,but i don't know how to print all the points based on classification in diagram.
import numpy as np
import pandas as pd
import operator
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC

dataframe = pd.read_csv(DATASET_PATH)
dataframe = dataframe.dropna(how='any', axis=0)
SVM_Trained_Model = preprocessing.LabelEncoder()
train_data = dataframe[0:len(dataframe)]
le = preprocessing.LabelEncoder()
col = dataframe.columns[START_TRAIN_COLUMN:].astype('U')
col_name = ["no_of_wheels", "dimensions", "windows", "vehicle_type"]
for i in range(0, len(col_name)):
    train_data[col_name[i]] = le.fit_transform(train_data[col_name[i]])
train_column = np.array(train_data[col]).astype('U')
data = train_data.iloc[:, [0, 1, 2]].values
target = train_data.iloc[:, 3].values
# split into train and test sets
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.30, random_state=0)
# classifier model
svc_model = SVC(kernel='rbf', probability=True)
svc_model.fit(data_train, target_train)
all_labels = svc_model.predict(data_test)
X_set, y_set = data_train, target_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
# pad the grid with a constant value for the third feature
Xpred = np.array([X1.ravel(), X2.ravel()] +
                 [np.repeat(0, X1.ravel().size) for _ in range(1)]).T
pred = svc_model.predict(Xpred).reshape(X1.shape)
plt.contourf(X1, X2, pred, alpha=0.75, cmap=ListedColormap(('white', 'orange', 'pink')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
colors = ['red', 'yellow', 'cyan', 'blue']
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap((colors[i]))(i), label=j)
plt.title('Multiclass Classifier')
plt.xlabel('Features')
plt.ylabel('Dependents')
plt.legend()
plt.show()
Here is my diagram. I need to print, using Python's print(), the points that fall in the pink and white regions of the diagram. Please help me get these points.
You need to select and use only 2 features in order to make a 2D surface plot.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features
y = iris.target

def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

model = svm.SVC(kernel='linear')
clf = model.fit(X, y)

fig, ax = plt.subplots()
# title for the plot
title = 'Decision surface of linear SVC'
# set up grid for plotting
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('y label here')
ax.set_xlabel('x label here')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
plt.show()
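To actually print the points that fall in each coloured region (the question's original ask), one option, assuming the clf, X and iris fitted above, is to group the samples by their predicted class:
# Print the samples grouped by the class the fitted SVC assigns to them.
preds = clf.predict(X)
for cls in np.unique(preds):
    print("Points predicted as", iris.target_names[cls])
    print(X[preds == cls])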

What is the reason for the difference between SVC with an RBF kernel and a linear-kernel SVC fitted on an RBF-kernel-computed X

I'm confused about SVC with a kernel method, e.g., rbf. What I understand is that when SVC with an rbf kernel is fitted via fit(x, y), it computes the rbf kernel matrix K of (x, x.T), whose shape is [n_samples, n_samples], and then fits this kernel matrix K to y with the hinge loss.
Under this intuition, I use sklearn.svm.SVC and sklearn.metrics.pairwise.rbf_kernel to compare the results of:
svc(kernel='rbf').fit(x,y)
# and
svc(kernel='precomputed').fit(rbf_kernel(x,x),y)
# and
svc(kernel='linear').fit(rbf_kernel(x,x),y)
I thought the classification results were supposed to be the same, but there are some differences between these three.
More specifically, if you run the code below, svc(kernel='precomputed').fit(rbf_kernel(x,x),y) performs the same as svc(kernel='rbf').fit(x,y), but svc(kernel='linear').fit(rbf_kernel(x,x),y) does not perform as well as the other two.
Could anyone help me to figure out the reason? Thanks.
import numpy as np
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons, make_circles
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

h = .02  # step size in the mesh

names = [
    "RBF SVM",
    # "RP Ridge",
    "RBF-Precomp SVM",
    "RBF-Linear SVM",
]
classifiers = [
    SVC(gamma=1, C=1),
    SVC(kernel='precomputed', C=1, gamma=1),
    SVC(kernel="linear", C=1),
]
datasets = [
    make_moons(n_samples=200, noise=0, random_state=0),
    make_moons(n_samples=200, noise=0.2, random_state=0),
    make_circles(n_samples=200, noise=0, factor=0.5, random_state=0),
    make_circles(n_samples=200, noise=0.2, factor=0.5, random_state=0),
]

figure = plt.figure(figsize=(int((len(classifiers) + 1) * 3), int(len(datasets) * 3)))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.2, random_state=42)
    K_train = rbf_kernel(X_train, X_train, gamma=1)
    K_test = rbf_kernel(X_test, X_train, gamma=1)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k', marker='*')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        if "Pre" in name:
            clf.fit(K_train, y_train)
            score = clf.score(K_test, y_test)
        elif "Linear" in name:
            clf.fit(K_train, y_train)
            score = clf.score(K_test, y_test)
        else:
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        # create test data from the mesh grid
        mesh_data = np.c_[xx.ravel(), yy.ravel()]
        K_mesh = rbf_kernel(mesh_data, X_train, gamma=1)
        if "Pre" in name or "Linear" in name:
            Z = clf.decision_function(K_mesh)
        else:
            Z = clf.decision_function(mesh_data)

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        # draw every mesh grid cell, distinguishing them with colors from plt.cm.RdBu
        ax.contourf(xx, yy, Z, 66, cmap=cm, alpha=0.6)
        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6, marker='*')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.savefig('bench_test.png')
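For reference, a stripped-down sketch of the same three-way comparison without the plotting (assuming gamma=1 as above, and comparing training accuracy only) looks like this:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.svm import SVC

X, y = make_moons(n_samples=200, noise=0.2, random_state=0)
K = rbf_kernel(X, X, gamma=1)

# Same comparison as in the plots: rbf vs precomputed-rbf vs linear-on-K.
acc_rbf = SVC(kernel='rbf', gamma=1, C=1).fit(X, y).score(X, y)
acc_pre = SVC(kernel='precomputed', C=1).fit(K, y).score(K, y)
acc_lin = SVC(kernel='linear', C=1).fit(K, y).score(K, y)
print(acc_rbf, acc_pre, acc_lin)  # the first two agree; linear-on-K can differ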

Subplot drawn by auxiliary function experiences unexpected shift

I'm trying to merge two plots into one:
http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_iris.html
http://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_decision_regions.html#sphx-glr-auto-examples-ensemble-plot-voting-decision-regions-py
In the left plot I want to display the decision boundary together with the hyperplanes corresponding to the OVA classifiers, and in the right plot I would like to show the decision probabilities.
This is the code so far:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import datasets
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC


def plot_hyperplane(c, color, fitted_model):
    """
    Plot the one-against-all classifiers for the given model.

    Parameters
    --------------
    c : index of the hyperplane to be plotted
    color : color to be used when drawing the line
    fitted_model : the fitted model
    """
    xmin, xmax = plt.xlim()
    ymin, ymax = plt.ylim()
    try:
        coef = fitted_model.coef_
        intercept = fitted_model.intercept_
    except:
        return

    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]

    plt.plot([xmin, xmax], [line(xmin), line(xmax)], ls="--", color=color, zorder=3)


def plot_decision_boundary(X, y, fitted_model, features, targets):
    """
    This function plots a model's decision boundary and also tries to plot
    the decision probabilities, if available.
    Requires a model fitted with two features only.

    Parameters
    --------------
    X : the data to learn
    y : the classification labels
    fitted_model : the fitted model
    """
    cmap = plt.get_cmap('Set3')
    prob = cmap
    colors = [cmap(i) for i in np.linspace(0, 1, len(fitted_model.classes_))]
    plt.figure(figsize=(9.5, 5))
    for i, plot_type in enumerate(['Decision Boundary', 'Decision Probabilities']):
        plt.subplot(1, 2, i + 1)
        mesh_step_size = 0.01  # step size in the mesh
        x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
        y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step_size),
                             np.arange(y_min, y_max, mesh_step_size))
        # First plot, predicted results using the given model
        if i == 0:
            Z = fitted_model.predict(np.c_[xx.ravel(), yy.ravel()])
            for h, color in zip(fitted_model.classes_, colors):
                plot_hyperplane(h, color, fitted_model)
        # Second plot, predicted probabilities using the given model
        else:
            prob = 'RdYlBu_r'
            try:
                Z = fitted_model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
            except:
                plt.text(0.4, 0.5, 'Probabilities Unavailable', horizontalalignment='center',
                         verticalalignment='center', transform=plt.gca().transAxes, fontsize=12)
                plt.axis('off')
                break
        Z = Z.reshape(xx.shape)
        # Display Z
        plt.imshow(Z, interpolation='nearest', cmap=prob, alpha=0.5,
                   extent=(x_min, x_max, y_min, y_max), origin='lower', zorder=1)
        # Plot the data points
        for i, color in zip(fitted_model.classes_, colors):
            idx = np.where(y == i)
            plt.scatter(X[idx, 0], X[idx, 1], facecolor=color, edgecolor='k', lw=1,
                        label=iris.target_names[i], cmap=cmap, alpha=0.8, zorder=2)
        plt.title(plot_type + '\n' +
                  str(fitted_model).split('(')[0] + ' Test Accuracy: ' +
                  str(np.round(fitted_model.score(X, y), 5)))
        plt.xlabel(features[0])
        plt.ylabel(features[1])
        plt.gca().set_aspect('equal')
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9, bottom=0.08, wspace=0.02)
    plt.show()


if __name__ == '__main__':
    iris = datasets.load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    scaler = preprocessing.StandardScaler().fit_transform(X)
    clf1 = DecisionTreeClassifier(max_depth=4)
    clf2 = KNeighborsClassifier(n_neighbors=7)
    clf3 = SVC(kernel='rbf', probability=True)
    clf4 = SGDClassifier(alpha=0.001, n_iter=100).fit(X, y)
    clf1.fit(X, y)
    clf2.fit(X, y)
    clf3.fit(X, y)
    clf4.fit(X, y)
    plot_decision_boundary(X, y, clf1, iris.feature_names, iris.target_names[[0, 2]])
    plot_decision_boundary(X, y, clf2, iris.feature_names, iris.target_names[[0, 2]])
    plot_decision_boundary(X, y, clf3, iris.feature_names, iris.target_names[[0, 2]])
    plot_decision_boundary(X, y, clf4, iris.feature_names, iris.target_names[[0, 2]])
And the results:
As can be seen, for the last example (clf4 in the given code) the hyperplanes are plotted in the wrong position, and I have so far been unable to correct this. They should be translated to the correct range with respect to the features used to fit the model.
Thanks.
Apparently, the problem is that the ends of the dashed lines representing the hyperplanes are not consistent with the final, expected xlim and ylim. The good thing in this case is that you already have x_min, x_max, y_min, y_max computed, so use them to fix xlim and ylim by adding the following three lines before plotting the hyperplanes (specifically, just before your comment line # First plot, predicted results using the given model):
ax = plt.gca()
ax.set_xlim((x_min, x_max), auto=False)
ax.set_ylim((y_min, y_max), auto=False)
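In context, the relevant part of plot_decision_boundary would then look like this (only the three ax lines are new):
        # freeze the axis limits before the hyperplanes are drawn, so the dashed
        # lines are computed against the final xlim/ylim
        ax = plt.gca()
        ax.set_xlim((x_min, x_max), auto=False)
        ax.set_ylim((y_min, y_max), auto=False)
        # First plot, predicted results using the given model
        if i == 0:
            Z = fitted_model.predict(np.c_[xx.ravel(), yy.ravel()])
            for h, color in zip(fitted_model.classes_, colors):
                plot_hyperplane(h, color, fitted_model)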

Trouble Setting Up Multiple Algorithm Trials in Scikit-Learn

I am trying to use this example from the sklearn documentation. I am not really sure what the code is doing, and although I assume I am passing my dataset in the wrong way, I recently obtained this error:
<ipython-input-26-3c3c0763766b> in <module>()
49 for ds in datasets:
50 # preprocess dataset, split into training and test part
---> 51 X, y = ds
52 X = StandardScaler().fit_transform(X)
53 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
ValueError: too many values to unpack
Any ideas as to how I can modify the code to work with my dataset (which is a multidimensional numpy array from a pandas dataframe) and fix the error?
dataURL = "peridotites_clean_complete.csv"
pd_data = pd.read_csv(dataURL)
rock_names = pd_data['ROCK NAME']
rock_compositions = pd_data.columns[1:]
rock_data = np.vstack([pd_data[x] for x in rock_compositions])

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [rock_data]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
The thing is that ds is a list with more than two values, like the one shown below:
>>> ds=['rockatr1','rockatr2','rockatr','rocktype']
>>> X,y=ds
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: too many values to unpack
You have to specify which part is X and which is y, as shown below. Usually in classification data the last column is used as the label; that is what I assumed here.
>>> X,y=ds[:-1],ds[-1]
>>> X
['rockatr1', 'rockatr2', 'rockatr']
>>> y
'rocktype'
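Applied to the code in the question, a sketch (assuming 'ROCK NAME' holds the labels and the remaining columns are numeric features) would be:
# Build (X, y) explicitly so that `X, y = ds` unpacks cleanly.
# Assumes 'ROCK NAME' is the label column and the others are features.
X = pd_data[rock_compositions].values   # shape (n_samples, n_features)
y = rock_names.values                   # class labels
datasets = [(X, y)]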
