how to use instance weight to custom loss function in AdaBoost implementation - python

In Aurélien Géron's Hands-On Machine Learning with Scikit-Learn and TensorFlow, the AdaBoost equations are described in detail, except for how to use the instance weights during training.
Below is my code using scikit-learn's DecisionTreeClassifier. I guess sample_weight could carry the instance weights W when calling fit(), but the accuracy is unstable when I change n_estimators. What is wrong with the code?
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Hand-rolled AdaBoost classifier built from depth-1 decision trees (stumps).
eta = 0.5  # learning rate
n_estimators = 10  # simple start

# One independent stump per boosting round.  ``[clf] * n_estimators`` would
# repeat a single object n times, so every round would refit (and overwrite)
# the very same tree and the ensemble would collapse to its last fit.
clfs = [DecisionTreeClassifier(max_depth=1) for _ in range(n_estimators)]
W = np.ones(X_train.shape[0]) / X_train.shape[0]  # instance weights
R = np.zeros(n_estimators)  # weighted error rate of each predictor
Alpha = np.zeros(n_estimators)  # predictor weights

# Build the trees sequentially; each round re-weights the training instances.
for j in range(n_estimators):
    clf = clfs[j]
    plt.plot(W)
    clf.fit(X_train, y_train, sample_weight=W)
    y_pred_train = clf.predict(X_train)
    missed = y_pred_train != y_train
    # Equation 7-1: weighted error rate of the j-th predictor.
    R[j] = W[missed].sum() / W.sum()
    # Equation 7-2: predictor weight.  The rate is clipped away from 0 and 1
    # so a perfect (or perfectly wrong) stump cannot produce log(inf) / log(0).
    r = np.clip(R[j], np.finfo(float).eps, 1 - np.finfo(float).eps)
    Alpha[j] = eta * np.log((1 - r) / r)
    # Equation 7-3: boost the weights of the misclassified instances ...
    W[missed] *= np.exp(Alpha[j])
    # ... and renormalise so the weights sum to one again.
    W /= W.sum()

# Predict: collect every predictor's vote on the test set.
K = np.zeros((y_test.shape[0], n_estimators), dtype=np.int32)
for j in range(n_estimators):
    K[:, j] = clfs[j].predict(X_test)

# Each predictor adds its weight to the class *it predicted*.  The true labels
# must not take part in the vote; they are only used for scoring below.
V = np.zeros((y_test.shape[0], 2))
rows = np.arange(y_test.shape[0])
for j in range(n_estimators):
    V[rows, K[:, j]] += Alpha[j]
y_pred = np.argmax(V, axis=1)

print(accuracy_score(y_test, y_pred))
plt.legend(range(n_estimators))
plt.show()

Related

How to plot training and test convergence of a multilayer perceptron

I couldn't find anything helpful about plotting the convergence of training and test data for sklearn.neural_network.MLPRegressor. I found that there is a loss_curve_ attribute, but what about validation data?
I have built a simple model in which both inputs and outputs are randomly selected (say x = numpy.linspace(0, numpy.pi, 100), y = numpy.sin(x)). I wrote this one to obtain the variation of sklearn.metrics.mean_squared_error for a different number of hidden layers.
How can I overcome this problem?
from sklearn.preprocessing import RobustScaler

# ``inputs`` and ``ERE`` (the regression target) are defined outside this
# snippet; both are divided by 10 in place before the features are scaled.
inputs /= 10
ERE /= 10
scaler = RobustScaler()
# NOTE(review): the scaler is fitted on all rows before the split, so test rows
# influence the scaling statistics -- confirm this is intended.
inputs = scaler.fit_transform(inputs)
X_train, X_test, y_train, y_test = train_test_split(inputs, ERE,
                                                    train_size=0.8,
                                                    random_state=123)

from sklearn.neural_network import MLPRegressor

# Hyper-parameters shared by every model in the sweep below.
hidden_layer_size = (10, )  # rebound by the sweep loop
activation = "tanh"
solver = "adam"
alpha = 1e-4
batch_size = 6
learning_rate = "adaptive"
learning_rate_init = 1e-4
# NOTE(review): never passed to the estimator; a string here looks like a
# mistake for a numeric exponent -- confirm before wiring it in.
power_t = "sgd"
max_iter = 1000
shuffle = True
random_state = 123
verbose = True
early_stopping = True
validation_fraction = 0.15
n_iter_no_change = 35

from sklearn.metrics import mean_squared_error as mse
import numpy as np

# Sweep the width of a single hidden layer and record the test-set MSE of each
# fitted model.  The score array is sized from the sweep itself so changing the
# range cannot overflow it.
layer_widths = range(1, 110, 10)
error_scores = np.zeros(shape=(len(layer_widths),))
for _iterator, hidden_layer_size in enumerate(layer_widths):
    mlr = MLPRegressor(hidden_layer_sizes=hidden_layer_size,
                       activation=activation,
                       solver=solver,
                       batch_size=batch_size,
                       learning_rate=learning_rate,
                       learning_rate_init=learning_rate_init,
                       # Previously configured above but silently ignored.
                       max_iter=max_iter,
                       verbose=verbose,
                       shuffle=shuffle,
                       random_state=random_state,
                       early_stopping=early_stopping,
                       validation_fraction=validation_fraction,
                       n_iter_no_change=n_iter_no_change,
                       alpha=alpha)
    mlr.fit(X_train, y_train)
    error_scores[_iterator] = mse(y_test, mlr.predict(X_test))
Class MLPRegressor (well, BaseMultilayerPerceptron really) has an undocumented validation_scores_ attribute which keeps track of scores on validation data. However, it is only populated if you pass True as the early_stopping parameter when initialising the estimator object.

Poor accuracy score for Semi-Supervised Support Vector Machine

I am using a Semi-Supervised approach for Support Vector Machine in Python for the image classification from PASCAL VOC 2007 data.
I have tried the default parameters from the libraries and also tuned them, but I get an extremely bad accuracy of only about 2%.
Below is my code:
import pandas as pd
import numpy as np
from sklearn import decomposition
from sklearn.model_selection import train_test_split
from numpy import concatenate
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import decomposition
import warnings
# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")

# Load the three precomputed descriptor sets and the labels from local pickle
# files, in the same order as before.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
color_layout_features, bow_surf, color_hist_features, labels = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "color_layout_descriptor.pkl",
        "bow_surf.pkl",
        "hist.pkl",
        "labels.pkl",
    )
)
# Feat. Scaling
def scale(X, x_min, x_max):
    """Rescale each column of ``X`` linearly into ``[x_min, x_max]``.

    Column minima and maxima are taken along axis 0.  A column whose values
    are all equal has zero span; its divisor is replaced by 1 so the column
    maps to ``x_min`` instead of dividing by zero.
    """
    column_min = X.min(axis=0)
    span = X.max(axis=0) - column_min
    span[span == 0] = 1
    return x_min + (X - column_min) * (x_max - x_min) / span
# normalization
def normalize(x):
    """Min-max normalise ``x`` using its global minimum and maximum.

    The result lies in ``[0, 1]``.  When every value in ``x`` is identical the
    range is zero; the divisor is then replaced by 1 (mirroring ``scale``) so
    the result is all zeros rather than NaN from a 0/0 division.
    """
    lowest = np.min(x)
    span = np.max(x) - lowest
    # Only guard the scalar case; a non-scalar reduction result is left as is.
    if np.ndim(span) == 0 and span == 0:
        span = 1
    return (x - lowest) / span
# Bring every descriptor family into [0, 1] column by column, then stack the
# three families side by side into a single feature matrix.
color_layout_features_scaled = scale(color_layout_features, 0, 1)
color_hist_features_scaled = scale(color_hist_features, 0, 1)
bow_surf_scaled = scale(bow_surf, 0, 1)
features = np.hstack(
    [color_layout_features_scaled, color_hist_features_scaled, bow_surf_scaled]
)

# Dataset: globally normalised features, reduced to 100 principal components.
Y = labels
X = normalize(features)
pca = decomposition.PCA(n_components=100)
pca.fit(X)
X = pca.transform(X)

# Hold out 30% for testing, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)

# Split the training part again: 70% keeps its labels, 30% is treated as
# unlabeled (the ``*_unlab`` names; its true labels are not used for fitting).
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(
    X_train, y_train, test_size=0.30, random_state=1, stratify=y_train
)

# Unlabeled rows are marked with -1.
nolabel = [-1] * len(y_test_unlab)

# Labeled rows first, unlabeled rows after them.
X_train_mixed = concatenate((X_train_lab, X_test_unlab))
y_train_mixed = concatenate((y_train_lab, nolabel))

from semisupervised import S3VM

# NOTE(review): kernel spelling/casing and multi-class support depend on the
# S3VM implementation -- confirm against the library.
model = S3VM(kernel="Linear", C = 1e-2, gamma = 0.5, lamU = 1.0, probability=True)
# The mixed arrays are rebuilt inline here rather than reusing
# X_train_mixed / y_train_mixed.
model.fit(np.vstack((X_train_lab, X_test_unlab)), np.append(y_train_lab, nolabel))

# Score on the held-out test split.
predict = model.predict(X_test)
acc = metrics.accuracy_score(y_test, predict)
print("accuracy", acc*100)
accuracy 2.6692291266282298
I am using a transductive version of SVM (TSVM) from the semisupervised library, but I am not sure what I am doing wrong: even after tweaking the parameters I still get the same result. Any inputs would be helpful.
I refer https://github.com/rosefun/SemiSupervised/blob/master/semisupervised/TSVM.py to make the implementation. Any inputs would be helpful.
Please consider that according to link Documentation "The unlabeled samples should be labeled as -1".

How to tune hyperparameters over a hyperparameter space using Bayesian Optimization (in Python)?

I am trying to tune hyperparameters using bayesian optimization for random forest regression over a hyperparameter space using the code below, but I get an error that says
TypeError: init() got an unexpected keyword argument 'min_samples'
I got this error when I tried the following code:
# Import packages
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
# Create datasets
# Synthetic regression data: the first 50 rows train, the last 50 rows test.
reg_prob = datasets.make_friedman1(n_samples=100, n_features=10, noise=1.0, random_state=None)
x_train, x_test = reg_prob[0][:50], reg_prob[0][50:100]
y_train, y_test = reg_prob[1][:50], reg_prob[1][50:100]

# Hyperparameter space: one integer choice per forest parameter.
# NOTE: 'min_samples' is the key behind the TypeError quoted in the question;
# it is kept here because the question is about that error.
space = {
    name: hp.choice(name, range(2, upper, 1))
    for name, upper in (
        ('n_estimators', 150),
        ('min_samples', 100),
        ('max_features', 100),
        ('max_samples', 100),
    )
}
#Define Objective Function
def objective(space):
    """Return the test-set RMSE of a random forest built from ``space``.

    ``space`` is unpacked as keyword arguments of ``RandomForestRegressor``.
    The model is fitted on the module-level ``x_train``/``y_train`` and scored
    on ``x_test``/``y_test``.
    """
    forest = RandomForestRegressor(**space)
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, predictions))
#Surrogate Fn
# Minimise the objective with TPE for 100 evaluations, keeping the trial history.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)
print(best)
print(trials.results)
I have also tried listing the hyperparameters in the objective function using the code below, but I get the following error
TypeError: objective() missing 3 required positional arguments: 'min_samples', 'max_features', and 'max_samples'
#Define Objective Function
def objective(n_estimators,min_samples,max_features,max_samples):
    """Return the test-set RMSE of a forest built from four separate arguments.

    NOTE: this is the second attempt from the question.  When used as the fmin
    objective it fails with the "missing 3 required positional arguments"
    TypeError quoted above, and the four values are handed to the estimator
    positionally.  It is kept as written because the question is about it.
    """
    forest = RandomForestRegressor(n_estimators, min_samples, max_features, max_samples)
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, predictions))
Can you please advise on what I can do to fix my code?
I was able to tune a single hyperparameter using the code below:
# Import packages
import numpy as np
import time
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from collections import OrderedDict
# Synthetic regression data: the first 50 rows train, the last 50 rows test.
reg_prob = datasets.make_friedman1(n_samples=100, n_features=10, noise=1.0, random_state=None)
x_train, x_test = reg_prob[0][:50], reg_prob[0][50:100]
y_train, y_test = reg_prob[1][:50], reg_prob[1][50:100]

# A single search dimension: an integer choice from 2 to 99.
space = hp.choice('num_leaves', range(2, 100, 1))
def objective(num_leaves):
    """Return the test-set RMSE of a forest configured by one sampled value.

    NOTE(review): ``num_leaves`` is passed positionally, so it binds to the
    estimator's first constructor parameter, which may not be a leaf count --
    confirm against the RandomForestRegressor signature.
    """
    forest = RandomForestRegressor(num_leaves)
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, predictions))
# Minimise the objective with TPE for 100 evaluations, keeping the trial history.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)
print(best)
print(trials.results)
The problem is that there is no parameter called min_samples in RandomForestRegressor (see its documentation). You probably meant min_samples_leaf.
Just keep the upper bound of min_samples_leaf within the number of samples in your dataset.
Otherwise there is no other problem with your code.
import matplotlib.pyplot as plt
# Import packages
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
# Create datasets
# Synthetic regression data: the first 50 rows train, the last 50 rows test.
reg_prob = datasets.make_friedman1(n_samples=100, n_features=10, noise=1.0, random_state=None)
x_train = reg_prob[0][0:50]
y_train = reg_prob[1][0:50]
x_test = reg_prob[0][50:100]
y_test = reg_prob[1][50:100]

# Hyperparameter space.  Every dict key is a real RandomForestRegressor keyword
# and every upper bound stays within the data (50 training rows, 10 features).
# The hp label of each dimension now matches its key, so the dictionary that
# fmin returns uses the same names as the estimator parameters.
space = {
    'n_estimators': hp.choice('n_estimators', range(2, 150, 1)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(2, 50, 1)),
    'max_features': hp.choice('max_features', range(2, 10, 1)),
    'max_samples': hp.choice('max_samples', range(2, 50, 1)),
}
#Define Objective Function
def objective(space):
    """Return the test-set RMSE of a random forest built from ``space``.

    ``space`` is unpacked as keyword arguments of ``RandomForestRegressor``.
    The model is fitted on the module-level ``x_train``/``y_train`` and scored
    on ``x_test``/``y_test``.
    """
    forest = RandomForestRegressor(**space)
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, predictions))
#Surrogate Fn
# Minimise the objective with TPE (only 2 evaluations here) and keep the
# trial history.
from hyperopt import space_eval

trials = Trials()
best = fmin(objective,
            space=space,
            algo=tpe.suggest,
            max_evals=2,
            trials=trials)
print(best)
# For hp.choice dimensions, ``best`` holds the index of the chosen option, not
# the option itself; space_eval maps it back to the actual parameter values.
print(space_eval(space, best))
print(trials.results)

Multi Layer Perceptron has low accuracy when it has 1 output neuron, works fine when it has more

I am making a neural network with an RBF hidden layer. First, the input goes into the RBF layer (trained with KMeans) and after that it goes to a Multi Layer Perceptron (sklearn, Python). The problem arises when I feed the MLP with my data from the RBF layer. If I try with e.g. (10, 2) layers I get something like 80% accuracy, but when I try with (10, 1) I get around 50% accuracy.
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from mlxtend.data import loadlocal_mnist
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from random import sample
from math import sqrt
from time import time
import numpy as np
def convert(x):
    """Map an integer label to a binary target: 1 if even, -1 if odd."""
    return 1 if x % 2 == 0 else -1
def calc_centers(x, k, n, algo='kmeans'):
    """Pick ``k`` RBF centers from the samples ``x``.

    Parameters:
        x: the samples to choose centers from.
        k: number of centers.
        n: number of KMeans initialisations (used only by ``'kmeans'``).
        algo: ``'kmeans'`` fits KMeans and returns its cluster centers;
            ``'random'`` draws ``k`` distinct samples from ``x``.

    Raises:
        ValueError: if ``algo`` is not one of the supported names.  Previously
            an empty list was returned, which only failed later on.
    """
    if algo == 'kmeans':
        # ``precompute_distances`` is no longer passed: it was removed from
        # KMeans in newer scikit-learn releases.
        kmeans = KMeans(n_clusters=k, random_state=0, n_init=n).fit(x)
        return kmeans.cluster_centers_
    if algo == 'random':
        return sample(list(x), k)
    raise ValueError(
        "unknown algo {!r}; expected 'kmeans' or 'random'".format(algo)
    )
def phi(xi, xj, sigma):
    """Gaussian radial basis value ``e ** (-d**2 / (2 * sigma**2))``.

    ``d`` is ``distance(xi, xj)``.
    """
    squared_gap = distance(xi, xj) ** 2
    return np.e ** (-1 * squared_gap / (2 * sigma ** 2))
def distance(x, y):
    """Euclidean distance between ``x`` and ``y``.

    Only the first ``len(x)`` components are compared, and ``y`` is indexed by
    position, so it must be at least as long as ``x``.
    """
    squared_sum = sum((xi - y[i]) ** 2 for i, xi in enumerate(x))
    return sqrt(squared_sum)
def hidden_layer_output(xi, centers, sigma=None):
    """Return the RBF activations of sample ``xi``, one per center.

    Parameters:
        xi: a single input sample.
        centers: the RBF centers.
        sigma: optional kernel width.  When omitted it is derived from the
            centers as ``max_d / sqrt(2 * k)``, where ``max_d`` is the largest
            pairwise center distance (``-1`` if there are fewer than two
            centers) and ``k`` is the number of centers.  The derivation does
            not depend on ``xi``, so callers that process many samples can
            compute it once and pass it in to skip the O(k^2) scan per sample.

    Returns:
        A list of ``phi(xi, center, sigma)`` values in center order.
    """
    if sigma is None:
        from itertools import combinations

        k = len(centers)
        max_d = max(
            (distance(a, b) for a, b in combinations(centers, 2)),
            default=-1,
        )
        sigma = max_d / sqrt(2 * k)
    return [phi(xi, xj, sigma) for xj in centers]
def run(x_train, y_train, x_test, y_test, k, n, algo):
    """Train an MLP on RBF features and print its training accuracy.

    Centers are chosen with ``calc_centers`` and every training sample is
    mapped through ``hidden_layer_output``.  An ``MLPClassifier`` is then
    fitted on those features and scored on the same training data.  ``x_test``
    and ``y_test`` are accepted but currently unused (the test evaluation is
    disabled).
    """
    centers = calc_centers(x_train, k, n, algo)
    hidden_layer_outputs = [hidden_layer_output(x, centers) for x in x_train]

    neurons = (k, 2)  # <----------------- THIS IS THE LINE IN QUESTION ----------------------->
    learning_rate = 0.001
    earlyStop = False
    model = MLPClassifier(
        hidden_layer_sizes=neurons,
        activation='relu',
        max_iter=2000,
        solver='adam',
        random_state=1,
        learning_rate_init=learning_rate,
        early_stopping=earlyStop,
    )
    model.fit(hidden_layer_outputs, y_train)

    y1 = model.predict(hidden_layer_outputs)
    train_accuracy = 100 * metrics.accuracy_score(y_train, y1)
    print('{0:.2f}'.format(train_accuracy) + '% training accuracy')
And my main:
if __name__ == '__main__':
    # Load the MNIST idx files from the working directory.
    x_train, y_train = loadlocal_mnist(
        images_path='train-images.idx3-ubyte',
        labels_path='train-labels.idx1-ubyte',
    )
    x_test, y_test = loadlocal_mnist(
        images_path='t10k-images.idx3-ubyte',
        labels_path='t10k-labels.idx1-ubyte',
    )

    # Work on the first 2000 samples of each split.
    x_train, y_train = np.array(x_train)[:2000], np.array(y_train)[:2000]
    x_test, y_test = np.array(x_test)[:2000], np.array(y_test)[:2000]

    k = 20  # number of RBF centers
    n = 20  # KMeans initialisations

    # Standardise with statistics from the training split only.
    sc = StandardScaler().fit(x_train)
    x_train = sc.transform(x_train)
    x_test = sc.transform(x_test)

    # PCA fitted on the training split, configured with p = 0.95.
    p = 0.95
    pca = PCA(p)
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)

    # Turn the digit labels into a binary even (+1) / odd (-1) target.
    y_train = [convert(label) for label in y_train]
    y_test = [convert(label) for label in y_test]

    # Pass 'kmeans' instead of 'random' to use KMeans centers.
    run(x_train, y_train, x_test, y_test, k, n, 'random')
I would appreciate some help.

How to run through loop to use non-scaled and scaled data in python for loop

I have the following code running through and fitting a model on the iris data using different modeling techniques. How can I add a second step in this process so I can demonstrate the improvement between using scaled and non-scaled data?
I don't need to run the scale transform outside of the loop, i was just having a lot of issues with transforming the data type from pandas dataframe to np array and back again.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# KFold now comes from model_selection (as train_test_split already does);
# the old sklearn.cross_validation module no longer exists in current releases.
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Scaled copies of the splits; the scaler is fitted on the training split only.
sc = StandardScaler()
X_train_scale = sc.fit_transform(X_train)
X_test_scale = sc.transform(X_test)

numFolds = 10
# Materialise the (train_indices, test_indices) pairs once.  ``kf`` can then be
# iterated directly, and every model is scored on exactly the same folds.
kf = list(KFold(n_splits=numFolds, shuffle=True).split(X_train))

# These are "Class objects". For each Class, find the mean accuracy through
# 10 fold cross validation.
Models = [LogisticRegression, svm.SVC]
params = [{}, {}]
for param, Model in zip(params, Models):
    total = 0
    for train_indices, test_indices in kf:
        train_X = X_train[train_indices]; train_Y = y_train[train_indices]
        test_X = X_train[test_indices]; test_Y = y_train[test_indices]
        reg = Model(**param)
        reg.fit(train_X, train_Y)
        predictions = reg.predict(test_X)
        total += accuracy_score(test_Y, predictions)
    accuracy = total / numFolds
    print ("CV accuracy score of {0}: {1}".format(Model.__name__, round(accuracy, 6)))
So ideally my output would be:
CV standard accuracy score of LogisticRegression: 0.683333
CV scaled accuracy score of LogisticRegression: 0.766667
CV standard accuracy score of SVC: 0.766667
CV scaled accuracy score of SVC: 0.783333
It seems like this is unclear, I am trying to loop through scaled and unscaled data, similar to how I am looping through the different ML algorithms.
I wanted to follow up on this. I was able to do this by creating a Pipeline and using GridSearchCV:
# Imports this snippet needs in order to run on its own.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Two-step pipeline whose steps are both swapped out by the grid below.
pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', LogisticRegression())])
# Try every classifier both with scaling and without (None for the 'scale'
# step).
param_grid = [{
    'scale': [None, StandardScaler()],
    'clf': [SVC(), LogisticRegression()]}]
# The search is only configured here; it still has to be fitted to run.
grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, verbose=1)
In the end this got me the results I wanted and was able to test easily how to work between scaling and not scaling.
try this:
from __future__ import division
# Cross-validate each model and report how its accuracy compares with the
# model evaluated just before it.
previous_accuracy = None  # nothing to compare against on the first model
for param, Model in zip(params, Models):
    total = 0
    for train_indices, test_indices in kf:
        train_X = X_train[train_indices]; train_Y = y_train[train_indices]
        test_X = X_train[test_indices]; test_Y = y_train[test_indices]
        reg = Model(**param)
        reg.fit(train_X, train_Y)
        predictions = reg.predict(test_X)
        total += accuracy_score(test_Y, predictions)
    accuracy = total / numFolds
    print ("CV accuracy score of {0}: {1}".format(Model.__name__, round(accuracy, 6)))
    # added to your code
    # The truthiness test skips both "no previous model yet" and a previous
    # accuracy of 0, which would otherwise divide by zero.
    if previous_accuracy:
        # Relative change: positive when this model beats the previous one.
        improvement = accuracy / previous_accuracy - 1
        print("CV accuracy score improved by", improvement)
    previous_accuracy = accuracy

Categories

Resources