I don't understand why my algorithm is not separating the samples labelled "1" (blue); it is as if the algorithm ignored them. I am not aware of any update to DecisionTreeClassifier that would require me to pass additional parameters. The algorithm is as follows. I have 3 labels, that is, 3 types of samples. (The imports are omitted from the code.)
# Load the Iris data and keep only feature columns 2 and 3.
iris = datasets.load_iris()
X, y = iris.data[:, [2, 3]], iris.target

# Hold out 30% of the samples, stratified on the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler()
X_train_std = sc.fit(X_train).transform(X_train)
X_test_std = sc.transform(X_test)
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features and overlay the samples.

    Parameters
    ----------
    X : ndarray
        Samples; columns 0 and 1 are used as the plot's x and y coordinates.
    y : ndarray
        Class label of each row of ``X``. At most five distinct labels are
        supported, because only five markers/colors are defined.
    classifier : object
        Fitted estimator; its ``predict`` method is called on the mesh points.
    test_idx : sequence of int, optional
        Row indices of ``X`` to highlight with hollow circles. ``None`` or an
        empty sequence disables the highlight.
    resolution : float
        Step between neighbouring mesh points along both axes.
    """
    # Define a marker generator and a color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # Plot the decision surface on a mesh that extends 1 unit past the data
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Plot the samples of each class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[idx], marker=markers[idx],
                    label=cl, edgecolor='black')

    # Highlight the test samples. An explicit None/length check is used because
    # the truth value of a multi-element index array is ambiguous.
    if test_idx is not None and len(test_idx) > 0:
        # list() makes range objects and other sequences index uniformly.
        X_test = X[list(test_idx), :]
        # facecolors='none' draws hollow markers; an empty string passed as
        # `c` is not a valid color in current matplotlib.
        plt.scatter(X_test[:, 0], X_test[:, 1],
                    facecolors='none', edgecolor='black', alpha=0.9,
                    linewidth=1, marker='o', s=100, label='test set')
# Stack the standardized train and test splits (training rows first) so that
# every sample can be drawn in a single plot.
X_combined_std= np.vstack((X_train_std, X_test_std))
y_combined=np.hstack((y_train, y_test))
# Decision tree limited to depth 4, using the Gini criterion.
tree=DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
# NOTE(review): the tree is fitted on the unscaled X_train, but the plot below
# queries it with standardized data (X_combined_std). The two inputs are on
# different scales, so the drawn regions do not match what the tree learned.
tree.fit(X_train, y_train)
# test_idx selects the rows of the stacked array to highlight; presumably
# rows 105-149 are the test split (confirm against the split sizes).
plot_decision_regions(X= X_combined_std, y= y_combined, classifier=tree, test_idx=range(105,150))
# NOTE(review): X holds feature columns 2 and 3 of the Iris data; presumably
# column 2 is petal length rather than sepal length — verify the axis label.
plt.xlabel('sepal length[cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.show()
Revised answer:
You just forgot to train the model with the scaled features X_train_std. So, instead of
tree.fit(X_train, y_train)
it should be
tree.fit(X_train_std, y_train)
Now, the model is able to distinguish between all three classes:
Related
I am trying to visualize the effect of different support vector machine kernels on a dataset, but I have a problem reshaping an array:
The dataset I am using is the following:
https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
# Read the whitespace-separated feature and label files (no header row).
# sep=r"\s+" is the documented equivalent of the deprecated
# delim_whitespace=True.
df_train_X = pd.read_csv("X_train.txt", header=None, sep=r"\s+")
df_train_y = pd.read_csv("y_train.txt", header=None, sep=r"\s+")
X_train = df_train_X.values
# Use a plain array for the labels so that the fancy indexing below is
# positional; indexing a pandas Series with integers is label-based and
# breaks once rows have been filtered out.
y_train = df_train_y[0].values
X = X_train
y = y_train
# Drop samples labelled 0 and keep only the first two feature columns so the
# data can be drawn in 2-D.
X = X[y != 0, :2]
y = y[y != 0]
n_sample = len(X)
# Shuffle with a fixed seed so the split is reproducible.
np.random.seed(0)
order = np.random.permutation(n_sample)
X = X[order]
# The builtin float replaces the np.float alias, which newer NumPy releases
# no longer provide.
y = y[order].astype(float)
# First 90% of the shuffled samples for training, the remainder for testing.
X_train = X[:int(.9 * n_sample)]
y_train = y[:int(.9 * n_sample)]
X_test = X[int(.9 * n_sample):]
y_test = y[int(.9 * n_sample):]
# Fit one SVC per kernel and draw each fitted model in its own figure.
for kernel in ('linear', 'rbf', 'poly'):
    clf = SVC(kernel=kernel, gamma=10)
    clf.fit(X_train, y_train)

    plt.figure()
    plt.clf()

    # All samples, colored by label.
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20, zorder=10,
                edgecolor='k', cmap=plt.cm.Paired)
    # Hollow rings around the held-out samples.
    plt.scatter(X_test[:, 0], X_test[:, 1], s=80, zorder=10,
                facecolors='none', edgecolor='k')
    plt.axis('tight')

    # Bounding box of the data; the 200x200 grid below spans exactly this box.
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()

    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])

    # Put the result into a color plot.
    # NOTE(review): this reshape only works when decision_function yields one
    # value per grid point. With more than two classes it presumably returns
    # one column per class (or class pair), which would explain a
    # "cannot reshape array" error here — confirm against the SVC docs.
    Z = Z.reshape(XX.shape)
    plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
    plt.contour(XX, YY, Z, levels=[-.5, 0, .5],
                colors=['k', 'k', 'k'], linestyles=['--', '-', '--'])
    plt.title(kernel)
plt.show()
The problem is related to Z = Z.reshape(XX.shape) since I get the following error that blocks the code execution: cannot reshape array of size 240000 into shape (200,200)
But I don't know how to solve it to perform the visualization.
I have 3 regression models: Linear regression, Random Forest, and ANN.
Trying to compare them, I am plotting two plots: residuals and actual vs predicted.
Overall, the residuals suggest that most models predict the data well as they have a symmetric shape and follow the horizontal line. However, when evaluating the actual vs predicted, none of the models nearly follow the 45 degree line (suggesting perfect prediction). What am I doing wrong? Please see the relevant code below:
# Separate the features (every column but the last) from the label (last
# column). These remain pandas objects; append .values to get arrays instead.
X = dataset2.iloc[:, 0:-1]
y = dataset2.iloc[:, -1]
# Split into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Scale the data: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train OLS ####################################
# NOTE(review): no constant column is added to X_train before this call;
# presumably statsmodels' OLS then fits without an intercept — confirm that
# this is intended (see sm.add_constant).
ols_regressor = sm.OLS(y_train, X_train).fit()
# Predictions
y_pred_ols_test = ols_regressor.predict(X_test)
y_pred_ols_train = ols_regressor.predict(X_train)
# Train RF ####################################
rf = RandomForestRegressor(bootstrap=False,
max_depth=100,
max_features=3,
min_samples_leaf=1,
min_samples_split=5,
n_estimators=800)
rf.fit(X_train, y_train)
# Predictions
y_pred_rf_test = rf.predict(X_test)
y_pred_rf_train = rf.predict(X_train)
# Train NN ####################################
# ... TRAIN NN WITH KERAS AND TENSORFLOW ...
# `ann` is the network built in the omitted training code.
y_pred_nn_test = ann.predict(X_test)
y_pred_nn_train = ann.predict(X_train)
Here I am plotting the residuals:
# One residual panel per model, stacked vertically: OLS, random forest, ANN.
# NOTE(review): per the seaborn docs, residplot regresses y on x and plots the
# residuals of *that* fit — presumably not the model residuals
# (y_test - prediction). Confirm this is the quantity you want to inspect.
# NOTE(review): x and y are passed as arrays, so `data=dataset` is presumably
# unused; the rest of the code works with `dataset2` — verify the name.
fig, axes = plt.subplots(3, 1, figsize=(8, 12))
fig.suptitle('Residuals', fontweight="bold", fontsize=15)
# OLS #######################################
axes[0] = sns.residplot(ax=axes[0], x=y_pred_ols_test, y=y_test, data=dataset,
                        lowess=True,
                        color='darkcyan',
                        scatter_kws={'alpha': 0.5},
                        line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
# Axis limits are hard-coded per model.
axes[0].set_xlim([-0.0050, 0.0005])
axes[0].set_ylim([-0.03, 0.03])
axes[0].set_title('Linear Regression')
axes[0].set_xlabel(' ')
axes[0].set_ylabel('Residuals')
# Random Forest ###############################
axes[1] = sns.residplot(ax=axes[1], x=y_pred_rf_test, y=y_test, data=dataset,
                        lowess=True,
                        color='darkcyan',
                        scatter_kws={'alpha': 0.5},
                        line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
axes[1].set_xlim([-0.016, 0.006])
axes[1].set_ylim([-0.03, 0.03])
axes[1].set_title('Random Forest')
axes[1].set_xlabel(' ')
axes[1].set_ylabel('Residuals')
# Neural Network ###############################
axes[2] = sns.residplot(ax=axes[2], x=y_pred_nn_test, y=y_test, data=dataset,
                        lowess=True,
                        color='darkcyan',
                        scatter_kws={'alpha': 0.5},
                        line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
axes[2].set_xlim([-0.018, -0.009])
axes[2].set_ylim([-0.03, 0.03])
# The title previously ended with a stray ')'.
axes[2].set_title('Artificial Neural Network')
# Only the bottom panel carries the shared x-axis label.
axes[2].set_xlabel('Predicted values')
axes[2].set_ylabel('Residuals')
plt.show()
Then I plot the predicted vs actual plot:
# One predicted-vs-actual panel per model, stacked vertically. Each panel
# draws the test predictions (x) against the true values (y) with a fitted
# regression line, plus a dashed grey reference line.
fig, axes =plt.subplots(3,1,figsize=(8,12))
fig.suptitle('Predicted vs Actual', fontweight="bold", fontsize=15)
# OLS #######################################
axes[0] = sns.regplot(ax=axes[0], x=y_pred_ols_test, y=y_test,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
# Hard-coded axis limits for this panel.
x0 = -0.00415
x1 = 0.00025
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
# NOTE(review): this segment joins (x0, y0) to (x1, y1), i.e. two opposite
# corners of the axis limits set below. Because the x range and the y range
# differ, it is the diagonal of the plot area rather than the identity line
# y = x, so points need not follow it even for a perfect prediction.
axes[0].plot([x0, x1], [y0, y1], ls="--", c="grey")
# # Axes Scale
axes[0].set_xlim([x0, x1])
axes[0].set_ylim([y0, y1])
# Labels
axes[0].set_title('Linear Regression')
axes[0].set_xlabel('')
axes[0].set_ylabel('True values')
# Random Forest ###############################
axes[1] = sns.regplot(ax=axes[1], x=y_pred_rf_test, y=y_test,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
# Hard-coded axis limits for this panel.
x0 = -0.0125
x1 = 0.0045
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
# NOTE(review): as in the first panel, this is the diagonal of the axis
# limits, not the line y = x.
axes[1].plot([x0, x1], [y0, y1], ls="--", c="grey")
# Axes Scale
axes[1].set_xlim([x0, x1])
axes[1].set_ylim([y0, y1])
axes[1].set_title('Random Forest')
axes[1].set_xlabel(' ')
axes[1].set_ylabel('True values')
# NN #########################################
axes[2] = sns.regplot(ax=axes[2], x=y_pred_nn_test, y=y_test,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
# Hard-coded axis limits for this panel.
x0 = -0.0175
x1 = -0.0092
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
# NOTE(review): as in the first panel, this is the diagonal of the axis
# limits, not the line y = x.
axes[2].plot([x0, x1], [y0, y1], ls="--", c="grey")
# Axes Scale
axes[2].set_xlim([x0, x1])
axes[2].set_ylim([y0, y1])
axes[2].set_title('Artificial Neural Network')
# Only the bottom panel carries the shared x-axis label.
axes[2].set_xlabel('Predicted values')
axes[2].set_ylabel('True values')
plt.show()
Theoretically, the actual vs predicted plot should be good right? Considering that the residuals plot is symmetric and horizontal. What am I missing here?
For the actual vs predicted I also tried using sns.scatterplot() but it was the same really...
Sorry if this is a very simple question. But I'm a newcomer to the field.
My specific question is this: I have trained an XGboost classifier in Python. After the training, how can I get the samples in my training data that are closer than a fixed value to the decision boundary of the model?
Thanks
I don't think xgboost has a built-in method for that, and I am not aware of a closed-form formula for it like the one available for SVC. This visualization could help, though, for 2D feature spaces:
import xgboost as xgb
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features and overlay the samples.

    Parameters
    ----------
    X : ndarray
        Samples; columns 0 and 1 are used as the plot's x and y coordinates.
    y : ndarray
        Class label of each row of ``X``. At most five distinct labels are
        supported, because only five markers/colors are defined.
    classifier : object
        Fitted estimator; its ``predict`` method is called on the mesh points.
    test_idx : sequence of int, optional
        Row indices of ``X`` to highlight with hollow circles. ``None`` or an
        empty sequence disables the highlight.
    resolution : float
        Step between neighbouring mesh points along both axes.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface on a mesh that extends 1 unit past the data
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        # A single RGBA tuple passed as `c` is indistinguishable from four
        # values to be colormapped, so it is wrapped in a one-row list.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=[cmap(idx)],
                    marker=markers[idx], label=cl)

    # highlight test samples; an explicit None/length check is used because
    # the truth value of a multi-element index array is ambiguous
    if test_idx is not None and len(test_idx) > 0:
        # list() makes range objects and other sequences index uniformly, so
        # no NumPy version check is needed.
        X_test = X[list(test_idx), :]
        # facecolors='none' draws hollow markers (an empty string passed as
        # `c` is not a valid color in current matplotlib); the edge color is
        # set explicitly so the rings remain visible.
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolors='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')
# Build a noisy two-moons toy dataset (fixed seed) and hold out 30% of it.
# NOTE(review): train_test_split, and the `np` name used inside
# plot_decision_regions, are not covered by this snippet's import list —
# presumably they are imported elsewhere; verify.
X, y = make_moons(noise=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Train an XGBoost classifier with its default settings.
xgb_clf = xgb.XGBClassifier()
xgb_clf = xgb_clf.fit(X_train, y_train)
# Draw the decision regions with the held-out samples on top.
plot_decision_regions(X_test, y_test, xgb_clf)
plt.show()
The plot_decision_regions function is from Python Machine Learning book, available on its public GitHub here.
I'm confused about SVC with a kernel method, e.g. rbf. My understanding is that when an SVC with an rbf kernel is fitted via fit(x, y), it computes the rbf kernel matrix K of (x, x.T), whose shape is [n_samples, n_samples], and then maps this kernel matrix K to y using the hinge loss.
Under this intuition, I use sklearn.svm.svc and sklearn.metrics.pairwise.rbf_kernel to compare the results between:
svc(kernel='rbf').fit(x,y)
# and
svc(kernel='precomputed').fit(rbf_kernel(x,x),y)
# and
svc(kernel='linear').fit(rbf_kernel(x,x),y)
I think the classification results are supposed to be the same. However, there are some differences between these three results.
More specifically, if you run the code below, svc(kernel='precomputed').fit(rbf_kernel(x,x),y) performs the same as svc(kernel='rbf').fit(x,y), but svc(kernel='linear').fit(rbf_kernel(x,x),y) does not perform as well as the other two methods.
Could anyone help me to figure out the reason? Thanks.
from sklearn.datasets import make_moons, make_circles, make_classification
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap
import numpy as np
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
# Step size of the mesh.
h = .02

# Panel titles, listed in the same order as `classifiers`.
names = ["RBF SVM", "RBF-Precomp SVM", "RBF-Linear SVM"]

# Three SVC variants: the library's default kernel, a precomputed kernel,
# and a linear kernel.
classifiers = [
    SVC(gamma=1, C=1),
    SVC(kernel='precomputed', C=1, gamma=1),
    SVC(kernel="linear", C=1),
]

# Toy datasets: moons and circles, each without noise and with noise 0.2.
datasets = [
    make_moons(n_samples=200, noise=0, random_state=0),
    make_moons(n_samples=200, noise=0.2, random_state=0),
    make_circles(n_samples=200, noise=0, factor=0.5, random_state=0),
    make_circles(n_samples=200, noise=0.2, factor=0.5, random_state=0),
]

# 3x3-inch cell per panel: one row per dataset, one column per classifier
# plus one extra column.
figure = plt.figure(figsize=(3 * (len(classifiers) + 1), 3 * len(datasets)))

# Running counter, starting at 1.
i = 1
# Iterate over the datasets: each one fills a row of subplots, with the raw
# data in the first column followed by one panel per classifier.
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.2, random_state=42)

    # RBF kernel values of the train and test samples against the train set.
    K_train = rbf_kernel(X_train, X_train, gamma=1)
    K_test = rbf_kernel(X_test, X_train, gamma=1)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # The mesh points and their kernel values depend only on the dataset, so
    # they are computed once here rather than once per classifier.
    mesh_data = np.c_[xx.ravel(), yy.ravel()]
    K_mesh = rbf_kernel(mesh_data, X_train, gamma=1)

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k', marker='*')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

        # The "Pre" and "Linear" variants are fed the kernel matrices; the
        # remaining classifier is fed the raw features. Note that the linear
        # SVC receives the kernel-matrix rows as ordinary feature vectors.
        if "Pre" in name or "Linear" in name:
            fit_input, score_input, mesh_input = K_train, K_test, K_mesh
        else:
            fit_input, score_input, mesh_input = X_train, X_test, mesh_data
        clf.fit(fit_input, y_train)
        score = clf.score(score_input, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        Z = clf.decision_function(mesh_input)

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        # draw the mesh as 66 filled contour levels, colored with plt.cm.RdBu
        ax.contourf(xx, yy, Z, 66, cmap=cm, alpha=0.6)

        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6, marker='*')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        # Column titles are drawn on the first row only.
        if ds_cnt == 0:
            ax.set_title(name)
        # Score in the lower-right corner, without the leading zero.
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.savefig('bench_test.png')
I am trying to use this example from the sklearn documentation. I am not really sure what the code is doing and although I assume I am inputting my dataset the wrong way, I recently obtained this error:
<ipython-input-26-3c3c0763766b> in <module>()
49 for ds in datasets:
50 # preprocess dataset, split into training and test part
---> 51 X, y = ds
52 X = StandardScaler().fit_transform(X)
53 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
ValueError: too many values to unpack
Any ideas as to how I can modify the code to work with my dataset (which is a multidimensional numpy array from a pandas dataframe) and fix the error?
# Load the CSV; the 'ROCK NAME' column holds the names and every column after
# the first is treated as a composition column.
dataURL = "peridotites_clean_complete.csv"
pd_data = pd.read_csv(dataURL)
rock_names = pd_data['ROCK NAME']
rock_compositions = pd_data.columns[1:]
# Stack the composition columns into one 2-D array: each stacked row is one
# column of the dataframe.
rock_data = np.vstack([pd_data[x] for x in rock_compositions])
# Classifiers to compare, one subplot each.
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
AdaBoostClassifier(),
GaussianNB(),
LDA(),
QDA()]
# Synthetic two-feature dataset with uniform noise added; it is stored in
# `linearly_separable` but not placed in `datasets` below.
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
# NOTE(review): every entry of `datasets` is unpacked as `X, y = ds` by the
# loop that follows, but rock_data is a single array rather than an (X, y)
# pair, so unpacking it iterates over its rows — presumably the source of the
# "too many values to unpack" error.
datasets = [rock_data]
figure = plt.figure(figsize=(27, 9))
# Running counter, starting at 1.
i = 1
# Walk through the datasets: each one gets a row of subplots, with the raw
# data first and then one panel per classifier.
for ds in datasets:
    # Each entry is unpacked into features and labels, then standardized and
    # split 60/40 into training and test parts.
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    # Mesh covering the first two features, padded by 0.5 on every side.
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # First panel of the row: the data on its own.
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Training points are opaque, test points semi-transparent.
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # Remaining panels: one per classifier.
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Color every point of the mesh [x_min, x_max]x[y_min, y_max] by the
        # classifier's decision value, or by the probability in column 1 when
        # the classifier has no decision_function.
        Z = (clf.decision_function(grid_points)
             if hasattr(clf, "decision_function")
             else clf.predict_proba(grid_points)[:, 1])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Overlay the training and testing points on the surface.
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        # Score in the lower-right corner, without the leading zero.
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
The problem is that ds is a sequence with more than two elements, like the list shown below:
>>> ds=['rockatr1','rockatr2','rockatr','rocktype']
>>> X,y=ds
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: too many values to unpack
You have to specify which part is X and which is y as shown below. Usually in classification data the last column is used as label, that is what I assumed here.
>>> X,y=ds[:-1],ds[-1]
>>> X
['rockatr1', 'rockatr2', 'rockatr']
>>> y
'rocktype'