Basic Decision Tree in Python - python

I don't understand why my algorithm is not separating the samples labelled "1" (blue); it is as if the algorithm ignored them. I am not aware of any updates to DecisionTreeClassifier, so I may be missing a parameter. The code is shown below. I have 3 labels, that is, 3 types of samples. (The imports are omitted from the code.)
# Load iris and keep only feature columns 2 and 3 so the data is two-dimensional.
iris = datasets.load_iris()
X, y = iris.data[:, [2, 3]], iris.target

# Stratified split with 30% held out for testing and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions over two features and overlay the samples.

    Parameters
    ----------
    X : 2-D array
        Samples; column 0 is drawn on the x-axis and column 1 on the y-axis.
    y : 1-D array
        Class label of each row of ``X``. At most five distinct labels are
        supported (one marker/colour pair per label).
    classifier : object with a ``predict`` method
        Called once on every point of the mesh.
    test_idx : sequence of int, optional
        Row indices of ``X`` to circle as test samples. ``None`` or an empty
        sequence draws no test markers.
    resolution : float
        Step of the mesh grid used for the decision surface.
    """
    # Marker and colour for each class.
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Decision surface: predict on a mesh extending one unit past the data.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Samples of each class.
    for idx, cl in enumerate(classes):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
                    c=colors[idx], marker=markers[idx], label=cl,
                    edgecolor='black')

    # Circle the test samples. An explicit None/empty check is used because
    # the truth value of a multi-element NumPy array is ambiguous.
    if test_idx is not None and len(test_idx) > 0:
        # list() lets ranges and other sequences be used as an index.
        X_test = X[list(test_idx), :]
        # facecolors='none' draws hollow markers; an empty string for `c`
        # is not a valid colour.
        plt.scatter(X_test[:, 0], X_test[:, 1], facecolors='none',
                    edgecolor='black', alpha=0.9, linewidth=1, marker='o',
                    s=100, label='test set')
# Stack the standardized train and test splits (train rows first) so they can
# be drawn on one plot.
X_combined_std= np.vstack((X_train_std, X_test_std))
y_combined=np.hstack((y_train, y_test))
tree=DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
# NOTE(review): the tree is fitted on the unscaled X_train, while the plot
# below evaluates it on standardized data (X_combined_std).
tree.fit(X_train, y_train)
# test_idx selects rows 105..149 of the combined array to be circled.
plot_decision_regions(X= X_combined_std, y= y_combined, classifier=tree, test_idx=range(105,150))
# NOTE(review): X is built from iris.data columns 2 and 3 - confirm that
# 'sepal length' is the intended x-axis label.
plt.xlabel('sepal length[cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.show()

Revised answer:
You just forgot to train the model with the scaled features X_train_std. So, instead of
tree.fit(X_train, y_train)
it should be
tree.fit(X_train_std, y_train)
Now, the model is able to distinguish between all three classes:

Related

problem with array reshaping for SVM kernel visualization

I am trying to visualize the effect of different kernels of a support vector machine on a dataset, but I have a problem reshaping an array:
The dataset I am using is the following:
https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
# Load the whitespace-separated feature and label files (no header row).
# sep=r"\s+" is the equivalent of delim_whitespace=True, which newer pandas
# releases deprecate.
df_train_X = pd.read_csv("X_train.txt", header=None, sep=r"\s+")
df_train_y = pd.read_csv("y_train.txt", header=None, sep=r"\s+")
X_train = df_train_X.values
y_train = df_train_y[0]
X = X_train
y = y_train

# Keep rows whose label is non-zero and only the first two feature columns.
X = X[y != 0, :2]
y = y[y != 0]
n_sample = len(X)

# Shuffle samples and labels with the same fixed-seed permutation.
np.random.seed(0)
order = np.random.permutation(n_sample)
X = X[order]
# np.float was only an alias of the builtin float and has been removed from
# NumPy, so the builtin is used directly.
y = y[order].astype(float)

# First 90% of the shuffled rows for training, the remainder for testing.
n_train = int(.9 * n_sample)
X_train = X[:n_train]
y_train = y[:n_train]
X_test = X[n_train:]
y_test = y[n_train:]
# Fit one SVC per kernel and draw each in its own figure.
for kernel in ('linear', 'rbf', 'poly'):
    clf = SVC(kernel=kernel, gamma=10)
    clf.fit(X_train, y_train)

    plt.figure()
    plt.clf()
    plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired,
                edgecolor='k', s=20)

    # Circle out the test data
    plt.scatter(X_test[:, 0], X_test[:, 1], s=80, facecolors='none',
                zorder=10, edgecolor='k')

    plt.axis('tight')
    x_min = X[:, 0].min()
    x_max = X[:, 0].max()
    y_min = X[:, 1].min()
    y_max = X[:, 1].max()

    # 200 x 200 evaluation grid spanning the data range.
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])

    # Put the result into a color plot
    # NOTE(review): this reshape needs exactly one decision value per grid
    # point. With more than two classes decision_function presumably returns
    # one column per class, which would not fit XX.shape - confirm the number
    # of distinct labels in y.
    Z = Z.reshape(XX.shape)
    plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
                linestyles=['--', '-', '--'], levels=[-.5, 0, .5])

    plt.title(kernel)
plt.show()
The problem is related to Z = Z.reshape(XX.shape) since I get the following error that blocks the code execution: cannot reshape array of size 240000 into shape (200,200)
But I don't know how to solve it to perform the visualization.

Python: Unexpected predicted vs actual plot for regression models

I have 3 regression models: Linear regression, Random Forest, and ANN.
Trying to compare them, I am plotting two plots: residuals and actual vs predicted.
Overall, the residuals suggest that most models predict the data well as they have a symmetric shape and follow the horizontal line. However, when evaluating the actual vs predicted, none of the models nearly follow the 45 degree line (suggesting perfect prediction). What am I doing wrong? Please see the relevant code below:
# Separate features from the label (add .values) to use arrays instead of dataframes
# All columns except the last are features; the last column is the label.
X = dataset2.iloc[:, 0:-1]
y = dataset2.iloc[:, -1]
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Scale the data
# The scaler is fitted on the training split only and reused for the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train OLS ####################################
# NOTE(review): X_train is passed to sm.OLS as-is; if an intercept is wanted,
# confirm that a constant column is present in the features.
ols_regressor = sm.OLS(y_train, X_train).fit()
# Predictions
y_pred_ols_test = ols_regressor.predict(X_test)
y_pred_ols_train = ols_regressor.predict(X_train)
# Train RF ####################################
rf = RandomForestRegressor(bootstrap=False,
max_depth=100,
max_features=3,
min_samples_leaf=1,
min_samples_split=5,
n_estimators=800)
rf.fit(X_train, y_train)
# Predictions
y_pred_rf_test = rf.predict(X_test)
y_pred_rf_train = rf.predict(X_train)
# Train NN ####################################
# ... TRAIN NN WITH KERAS AND TENSORFLOW ...
# `ann` is the network trained in the omitted section above.
y_pred_nn_test = ann.predict(X_test)
y_pred_nn_train = ann.predict(X_train)
Here I am plotting the residuals:
# One residual panel per model, stacked vertically: x is the model's test-set
# prediction, y is the true test value, and a red lowess curve is overlaid.
# NOTE(review): `data=dataset` is passed while x and y are already given as
# arrays (and the features above come from `dataset2`) - confirm it is needed.
fig, axes =plt.subplots(3,1,figsize=(8,12))
fig.suptitle('Residuals', fontweight="bold", fontsize=15)
# OLS #######################################
axes[0] = sns.residplot(ax=axes[0], x=y_pred_ols_test, y=y_test, data=dataset,
lowess=True,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
# Axis limits are hard-coded separately for each model.
axes[0].set_xlim([-0.0050, 0.0005])
axes[0].set_ylim([-0.03, 0.03])
axes[0].set_title('Linear Regression')
axes[0].set_xlabel(' ')
axes[0].set_ylabel('Residuals')
# Random Forest ###############################
axes[1] = sns.residplot(ax=axes[1], x=y_pred_rf_test, y=y_test, data=dataset,
lowess=True,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
axes[1].set_xlim([-0.016, 0.006])
axes[1].set_ylim([-0.03, 0.03])
axes[1].set_title('Random Forest')
axes[1].set_xlabel(' ')
axes[1].set_ylabel('Residuals')
# Neural Network ###############################
axes[2] = sns.residplot(ax=axes[2], x=y_pred_nn_test, y=y_test, data=dataset,
lowess=True,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
axes[2].set_xlim([-0.018, -0.009])
axes[2].set_ylim([-0.03, 0.03])
# NOTE(review): the title string below ends with a stray ')'.
axes[2].set_title('Artificial Neural Network)')
axes[2].set_xlabel('Predicted values')
axes[2].set_ylabel('Residuals')
plt.show()
Then I plot the predicted vs actual plot:
# One predicted-vs-actual panel per model: x is the test-set prediction, y is
# the true test value, with a fitted regression line in red and a dashed grey
# reference line.
fig, axes =plt.subplots(3,1,figsize=(8,12))
fig.suptitle('Predicted vs Actual', fontweight="bold", fontsize=15)
# OLS #######################################
axes[0] = sns.regplot(ax=axes[0], x=y_pred_ols_test, y=y_test,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
# Axis limits for this panel; note the x range and the y range differ.
x0 = -0.00415
x1 = 0.00025
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
# NOTE(review): this joins (x0, y0) to (x1, y1), i.e. the corner-to-corner
# diagonal of the axis limits set below. Because the x and y ranges differ,
# it is not the identity line y = x.
axes[0].plot([x0, x1], [y0, y1], ls="--", c="grey")
# # Axes Scale
axes[0].set_xlim([x0, x1])
axes[0].set_ylim([y0, y1])
# Labels
axes[0].set_title('Linear Regression')
axes[0].set_xlabel('')
axes[0].set_ylabel('True values')
# Random Forest ###############################
axes[1] = sns.regplot(ax=axes[1], x=y_pred_rf_test, y=y_test,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
x0 = -0.0125
x1 = 0.0045
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
# NOTE(review): same construction as in the first panel - the diagonal of the
# axis box rather than y = x.
axes[1].plot([x0, x1], [y0, y1], ls="--", c="grey")
# Axes Scale
axes[1].set_xlim([x0, x1])
axes[1].set_ylim([y0, y1])
axes[1].set_title('Random Forest')
axes[1].set_xlabel(' ')
axes[1].set_ylabel('True values')
# NN #########################################
axes[2] = sns.regplot(ax=axes[2], x=y_pred_nn_test, y=y_test,
color='darkcyan',
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
x0 = -0.0175
x1 = -0.0092
y0 = -0.03
y1 = 0.03
# Plot 45 degree line
# NOTE(review): same construction as in the first panel - the diagonal of the
# axis box rather than y = x.
axes[2].plot([x0, x1], [y0, y1], ls="--", c="grey")
# Axes Scale
axes[2].set_xlim([x0, x1])
axes[2].set_ylim([y0, y1])
axes[2].set_title('Artificial Neural Network')
axes[2].set_xlabel('Predicted values')
axes[2].set_ylabel('True values')
plt.show()
Theoretically, the actual vs predicted plot should be good right? Considering that the residuals plot is symmetric and horizontal. What am I missing here?
For the actual vs predicted I also tried using sns.scatterplot() but it was the same really...

Finding data points close to the decision boundary of a classifier

Sorry if this is a very simple question. But I'm a newcomer to the field.
My specific question is this: I have trained an XGboost classifier in Python. After the training, how can I get the samples in my training data that are closer than a fixed value to the decision boundary of the model?
Thanks
I don't think xgboost has a built-in method for that, or that there is a closed-form formula for it as there is for SVC. However, this visualization could help for 2D feature spaces:
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions over two features and overlay the samples.

    Parameters
    ----------
    X : 2-D array
        Samples; column 0 is drawn on the x-axis and column 1 on the y-axis.
    y : 1-D array
        Class label of each row of ``X``. At most five distinct labels are
        supported (one marker/colour pair per label).
    classifier : object with a ``predict`` method
        Called once on every point of the mesh.
    test_idx : sequence of int, optional
        Row indices of ``X`` to highlight as test samples. ``None`` or an
        empty sequence draws no test markers.
    resolution : float
        Step of the mesh grid used for the decision surface.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface on a mesh extending one unit past the data
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(classes):
        # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be
        # mistaken for per-point values when a class has four samples.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=cmap(idx),
                    marker=markers[idx], label=cl)

    # highlight test samples; an explicit None/empty check is used because
    # the truth value of a multi-element NumPy array is ambiguous
    if test_idx is not None and len(test_idx) > 0:
        # list() lets ranges and other sequences be used as an index, so no
        # NumPy version check is needed.
        X_test = X[list(test_idx), :]
        # Hollow circles: facecolors='none' with an explicit edge colour
        # (an empty string for `c` is not a valid colour).
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolors='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')
# Two-moons toy data generated with a fixed seed, then a 70/30 split.
X, y = make_moons(noise=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)

# Build the classifier and keep whatever fit() returns, as before.
xgb_clf = xgb.XGBClassifier().fit(X_train, y_train)

# Draw the decision regions together with the held-out samples.
plot_decision_regions(X_test, y_test, xgb_clf)
plt.show()
The plot_decision_regions function is from Python Machine Learning book, available on its public GitHub here.

What is the reason of difference between SVC with rbf kernel and linear kernel which is fitted by rbf-kernel computed X

I'm confused about SVC with a kernel method, e.g., rbf. My understanding is that when SVC with an rbf kernel is fitted with fit(x, y), it computes the rbf kernel matrix K of (x, x.T), whose shape is [n_samples, n_samples], and then fits this kernel matrix K to y with hinge loss.
Under this intuition, I use sklearn.svm.svc and sklearn.metrics.pairwise.rbf_kernel to compare the results between:
svc(kernel='rbf').fit(x,y)
# and
svc(kernel='precomputed').fit(rbf_kernel(x,x),y)
# and
svc(kernel='linear').fit(rbf_kernel(x,x),y)
I think the classification results should be the same, but there are some differences among these three results.
More specifically, if you run the code below, svc(kernel='precomputed').fit(rbf_kernel(x,x),y) performs the same as svc(kernel='rbf').fit(x,y), but svc(kernel='linear').fit(rbf_kernel(x,x),y) does not perform as well as the other two methods.
Could anyone help me to figure out the reason? Thanks.
from sklearn.datasets import make_moons, make_circles, make_classification
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap
import numpy as np
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
h = .02  # step size in the mesh

# Panel titles, listed in the same order as `classifiers`.
names = [
    "RBF SVM",
    # "RP Ridge",
    "RBF-Precomp SVM",
    "RBF-Linear SVM",
]

# One estimator per title above.
classifiers = [
    SVC(gamma=1, C=1),
    SVC(kernel='precomputed', C=1, gamma=1),
    SVC(kernel="linear", C=1),
]

# Moons and circles, each once without noise and once with noise 0.2.
datasets = [
    make_moons(n_samples=200, noise=noise, random_state=0)
    for noise in (0, 0.2)
] + [
    make_circles(n_samples=200, noise=noise, factor=0.5, random_state=0)
    for noise in (0, 0.2)
]

# Three inches per panel: one column per classifier plus one for the raw
# data, and one row per dataset.
figure = plt.figure(
    figsize=(3 * (len(classifiers) + 1), 3 * len(datasets))
)

# Running 1-based subplot index.
i = 1
# iterate over datasets: one subplot row per dataset, with the raw data in
# the first column followed by one column per classifier
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.2, random_state=42)

    # RBF kernel values of the train and test samples against the train samples
    K_train = rbf_kernel(X_train, X_train, gamma=1)
    K_test = rbf_kernel(X_test, X_train, gamma=1)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Mesh points and their kernel values against the training samples.
    # Both depend only on the dataset, so they are computed once per dataset
    # rather than once per classifier.
    mesh_data = np.c_[xx.ravel(), yy.ravel()]
    K_mesh = rbf_kernel(mesh_data, X_train, gamma=1)

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k', marker='*')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

        # The "Precomp" and "Linear" variants are both fed the kernel values
        # instead of the raw features; the remaining one gets the raw features.
        if "Pre" in name or "Linear" in name:
            train_in, test_in, mesh_in = K_train, K_test, K_mesh
        else:
            train_in, test_in, mesh_in = X_train, X_test, mesh_data
        clf.fit(train_in, y_train)
        score = clf.score(test_in, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        Z = clf.decision_function(mesh_in)

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        # filled contours over the mesh, coloured with plt.cm.RdBu
        ax.contourf(xx, yy, Z, 66, cmap=cm, alpha=0.6)

        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6, marker='*')

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        # Test score in the lower-right corner, without the leading zero.
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.savefig('bench_test.png')

Trouble Setting Up Multiple Algorithm Trials in Scikit-Learn

I am trying to use this example from the sklearn documentation. I am not really sure what the code is doing and although I assume I am inputting my dataset the wrong way, I recently obtained this error:
<ipython-input-26-3c3c0763766b> in <module>()
49 for ds in datasets:
50 # preprocess dataset, split into training and test part
---> 51 X, y = ds
52 X = StandardScaler().fit_transform(X)
53 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
ValueError: too many values to unpack
Any ideas as to how I can modify the code to work with my dataset (which is a multidimensional numpy array from a pandas dataframe) and fix the error?
# Load the rock table; the 'ROCK NAME' column is kept separately and every
# column after the first is treated as a composition column.
dataURL = "peridotites_clean_complete.csv"
pd_data = pd.read_csv(dataURL)
rock_names = pd_data['ROCK NAME']
rock_compositions = pd_data.columns[1:]
# Stacks one row per composition column into a single 2-D array.
rock_data = np.vstack([pd_data[x] for x in rock_compositions])
# Estimators to compare.
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
AdaBoostClassifier(),
GaussianNB(),
LDA(),
QDA()]
# Synthetic two-feature data with uniform noise added.
# NOTE(review): `linearly_separable` is built here but is not included in
# `datasets` below.
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
# NOTE(review): `datasets` holds the bare rock_data array, not a
# (features, labels) pair.
datasets = [rock_data]
figure = plt.figure(figsize=(27, 9))
# Running 1-based subplot index.
i = 1
# iterate over datasets: one subplot row per dataset, with the raw data in
# the first column followed by one column per classifier
for ds in datasets:
    # preprocess dataset, split into training and test part
    # NOTE(review): this unpacking requires every entry of `datasets` to be a
    # two-item (features, labels) pair; any other shape raises ValueError here.
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        # Estimators without decision_function fall back to the second
        # column of predict_proba.
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        # Test score in the lower-right corner, without the leading zero.
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
The thing is ds is a list with more than two values like the one shown below:
>>> ds=['rockatr1','rockatr2','rockatr','rocktype']
>>> X,y=ds
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: too many values to unpack
You have to specify which part is X and which is y as shown below. Usually in classification data the last column is used as label, that is what I assumed here.
>>> X,y=ds[:-1],ds[-1]
>>> X
['rockatr1', 'rockatr2', 'rockatr']
>>> y
'rocktype'

Categories

Resources