Function to generate Optuna grids given an sklearn pipeline - python

I am using sklearn along with optuna for HPO. I would like to create a custom function that takes an sklearn pipeline as input and returns Optuna-specific grids. Returning sklearn-specific param grids (i.e. dictionaries) is more straightforward; this is what I've got so far:
def grid_from_estimator(estimator, type='sklearn'):
    estimator_name = estimator.named_steps['estimator'].__class__.__name__
    if type == 'sklearn':
        if estimator_name == 'LogisticRegression':
            params = {
                'estimator__penalty': ['l1', 'elasticnet'],
                'estimator__C': np.logspace(-4, 4, 20)
            }
        elif estimator_name == 'LGBMClassifier':
            params = {
                'estimator__n_estimators': np.arange(100, 1000, 200),
                'estimator__boosting_type': ['gbdt', 'dart'],
                'estimator__max_depth': np.arange(6, 12),
                'estimator__num_leaves': np.arange(30, 150, 5),
                'estimator__learning_rate': [1e-2/2, 1e-2, 1e-1/2, 1e-1, 0.5, 1],
                'estimator__min_child_samples': np.arange(20, 100, 5),
                'estimator__subsample': np.arange(0.65, 1, 0.05),
                'estimator__colsample_bytree': np.arange(0.4, 0.75, 0.05),
                'estimator__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
                'estimator__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
                'estimator__iterations': np.arange(100, 800, 100),
                'estimator__objective': ['binary']
            }
    elif type == 'optuna':
        if estimator_name == 'LogisticRegression':
            params = {
                'estimator__penalty': trial.suggest_categorical('penalty', ['l1', 'elasticnet']),
                'estimator__C': trial.suggest_loguniform('C', 1e-4, 1e4)
            }
        elif estimator_name == 'LGBMClassifier':
            params = {
                'estimator__n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                'estimator__boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
                'estimator__max_depth': trial.suggest_int('max_depth', 6, 12),
                'estimator__num_leaves': trial.suggest_int('num_leaves', 30, 150, 5),
                'estimator__learning_rate': trial.suggest_float('learning_rate', 1e-4, 1),
                'estimator__min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
                'estimator__subsample': trial.suggest_float('subsample', 0.5, 1),
                'estimator__colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.75),
                'estimator__reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10),
                'estimator__reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10)
            }
    return params
The "trial.suggest_..." parts keeps 'complaining' and returning an error; although I understand the reason why, I can't see any way around it. Is this even possible? Any ideas?
Appreciate your support!

I think something along these lines should work:
def grid_from_estimator(estimator, trial, type='sklearn'):
    pass

def your_objective_function(trial):
    params = grid_from_estimator('LogisticRegression', trial, 'optuna')
    # Rest of the code here.

def tune_model():
    study = optuna.create_study()
    study.optimize(your_objective_function, n_trials=20)

tune_model()
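For illustration, here is a rough sketch of what that could look like for the LogisticRegression branch, assuming a pipeline object, X and y are defined elsewhere and that the chosen penalties are compatible with the solver (e.g. saga); it is only a sketch of the idea, not a drop-in implementation:
from sklearn.model_selection import cross_val_score

def grid_from_estimator(estimator, trial, type='optuna'):
    estimator_name = estimator.named_steps['estimator'].__class__.__name__
    if type == 'optuna' and estimator_name == 'LogisticRegression':
        # the trial comes in from the objective function, so suggest_* can be called here
        params = {
            'estimator__penalty': trial.suggest_categorical('penalty', ['l1', 'elasticnet']),
            'estimator__C': trial.suggest_float('C', 1e-4, 1e4, log=True)
        }
    return params

def your_objective_function(trial):
    params = grid_from_estimator(pipeline, trial, 'optuna')
    pipeline.set_params(**params)
    # create the study with direction='maximize' when returning a score like this
    return cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc').mean()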

An example using Optuna's ask-and-tell interface.
Code
import optuna
import numpy as np

def optuna_objective(estimator_name, params):
    if estimator_name == 'LogisticRegression':
        x = params['x']
        y = params['y']
        return (x - 2) ** 2 + y
    if estimator_name == 'LGBMClassifier':
        # estimator__n_estimators = params['estimator__n_estimators']
        # return accuracy
        pass
    return None

def grid_from_estimator(estimator_name, type_='sklearn', study=None):
    params, trial = None, None
    if type_ == 'sklearn':
        if estimator_name == 'LogisticRegression':
            params = {
                'estimator__penalty': ['l1', 'elasticnet'],
                'estimator__C': np.logspace(-4, 4, 20)
            }
        elif estimator_name == 'LGBMClassifier':
            params = {
                'estimator__n_estimators': np.arange(100, 1000, 200),
                'estimator__boosting_type': ['gbdt', 'dart'],
                'estimator__max_depth': np.arange(6, 12),
                'estimator__num_leaves': np.arange(30, 150, 5),
                'estimator__learning_rate': [1e-2/2, 1e-2, 1e-1/2, 1e-1, 0.5, 1],
                'estimator__min_child_samples': np.arange(20, 100, 5),
                'estimator__subsample': np.arange(0.65, 1, 0.05),
                'estimator__colsample_bytree': np.arange(0.4, 0.75, 0.05),
                'estimator__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
                'estimator__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
                'estimator__iterations': np.arange(100, 800, 100),
                'estimator__objective': 'binary'
            }
    elif type_ == 'optuna':
        trial = study.ask()
        if estimator_name == 'LogisticRegression':
            params = {
                'x': trial.suggest_float('x', -10, 10),
                'y': trial.suggest_float('y', -10, 10)
            }
            # params = {
            #     'estimator__penalty': trial.suggest_categorical('estimator__penalty', ['l1', 'elasticnet']),
            #     'estimator__C': trial.suggest_float('estimator__C', -4, 4)
            # }
        elif estimator_name == 'LGBMClassifier':
            params = {
                'estimator__n_estimators': trial.suggest_int('estimator__n_estimators', 100, 1000),
                'estimator__boosting_type': trial.suggest_categorical('estimator__boosting_type', ['gbdt', 'dart']),
                'estimator__max_depth': trial.suggest_int('estimator__max_depth', 6, 12),
                'estimator__num_leaves': trial.suggest_int('estimator__num_leaves', 30, 150, 5),
                'estimator__learning_rate': trial.suggest_float('estimator__learning_rate', 1e-4, 1),
                'estimator__min_child_samples': trial.suggest_int('estimator__min_child_samples', 20, 100),
                'estimator__subsample': trial.suggest_float('estimator__subsample', 0.5, 1),
                'estimator__colsample_bytree': trial.suggest_float('estimator__colsample_bytree', 0.4, 0.75),
                'estimator__reg_alpha': trial.suggest_float('estimator__reg_alpha', 1e-2, 10),
                'estimator__reg_lambda': trial.suggest_float('estimator__reg_lambda', 1e-2, 10)
            }
    return params, trial

# (1) sklearn example
print('SKLEARN')
estimator_name = 'LogisticRegression'
optimizer_type = 'sklearn'
params, _ = grid_from_estimator(estimator_name, type_=optimizer_type)
print(params)
print()

# (2) Optuna example with ask and tell interface.
print('OPTUNA')
study = optuna.create_study(direction='maximize')
n_trials = 10
estimator_name = 'LogisticRegression'
optimizer_type = 'optuna'
for _ in range(n_trials):
    params, trial = grid_from_estimator(estimator_name, type_=optimizer_type, study=study)
    objective_value = optuna_objective(estimator_name, params)
    study.tell(trial, objective_value)  # tell the pair of trial and objective value
    print(f'trialnum: {trial.number}, params: {params}, value: {objective_value}')

best_params = study.best_params
best_x = best_params["x"]
best_y = best_params["y"]
best_value = study.best_value
best_trial_num = study.best_trial.number
print(f"best x: {best_x}, best y: {best_y}, (x - 2)^2 + y: {(best_x - 2) ** 2 + best_y}, best_value: {best_value}, best_trial_num: {best_trial_num}")  # trial num starts at 0
Output
SKLEARN
{'estimator__penalty': ['l1', 'elasticnet'], 'estimator__C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])}
OPTUNA
[I 2021-11-25 19:03:09,673] A new study created in memory with name: no-name-f5046b21-f579-4c74-8046-79420c256d4a
trialnum: 0, params: {'x': 2.905894660287128, 'y': -4.537699327718261}, value: -3.7170541921815303
trialnum: 1, params: {'x': -9.275103438355583, 'y': -5.925000918692578}, value: 121.2029566269253
trialnum: 2, params: {'x': -2.9531168045205103, 'y': 5.253730464314739}, value: 29.78709654353821
trialnum: 3, params: {'x': 3.766902399344163, 'y': 3.778408673279479}, value: 6.900352762087639
trialnum: 4, params: {'x': -0.897563829823584, 'y': -0.887774211794973}, value: 7.508101936106943
trialnum: 5, params: {'x': -2.2256917634354645, 'y': 3.8017184220598903}, value: 21.658189301626216
trialnum: 6, params: {'x': -6.333366980619912, 'y': 9.87067058585388}, value: 79.3156758195401
trialnum: 7, params: {'x': 2.570258991787558, 'y': -0.1959178948625162}, value: 0.1292774228520457
trialnum: 8, params: {'x': 2.94430596072913, 'y': 4.318454050149043}, value: 5.210167797617609
trialnum: 9, params: {'x': 5.972023459737699, 'y': 4.165369460555215}, value: 19.942339825261854
best x: -9.275103438355583, best y: -5.925000918692578, (x - 2)^2 + y: 121.2029566269253, best_value: 121.2029566269253, best_trial_num: 1
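To tie this back to the original question, the LGBMClassifier branch of optuna_objective could plug the suggested params straight into the pipeline; a minimal sketch, assuming a pipeline with a step named 'estimator' plus X and y defined elsewhere:
from sklearn.model_selection import cross_val_score

def optuna_objective(estimator_name, params):
    if estimator_name == 'LGBMClassifier':
        # keys like 'estimator__n_estimators' address the pipeline step directly
        pipeline.set_params(**params)
        # the mean CV accuracy is the value reported back via study.tell()
        return cross_val_score(pipeline, X, y, cv=5, scoring='accuracy').mean()
    return None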

Related

Optuna score vs Cross_val_score?

The accuracy score from Optuna and the score from cross_val_score were different. Why does this occur, and which score should I choose?
I used the hyperparameters I got from Optuna in cross_val_score.
def objective_lgb(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 1000)
    max_depth = trial.suggest_int("max_depth", 2, 100)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 1)
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 1000)
    subsample = trial.suggest_float('subsample', 0.000001, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.00000001, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 400)
    reg_lambda = trial.suggest_float("reg_lambda", 0, 400)
    importance_type = trial.suggest_categorical('importance_type', ["split", "gain"])
    lgb_clf = lgb.LGBMClassifier(random_state=1,
                                 objective="multiclass",
                                 num_class=3,
                                 importance_type=importance_type,
                                 num_leaves=num_leaves,
                                 max_depth=max_depth,
                                 learning_rate=learning_rate,
                                 n_estimators=n_estimators,
                                 min_child_samples=min_child_samples,
                                 subsample=subsample,
                                 colsample_bytree=colsample_bytree,
                                 reg_alpha=reg_alpha,
                                 reg_lambda=reg_lambda
                                 )
    score = cross_val_score(lgb_clf, train_x, train_y, n_jobs=-1, cv=KFold(n_splits=10, shuffle=True, random_state=1), scoring='accuracy')
    mean_score = score.mean()
    return mean_score

lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(objective_lgb, n_trials=1500)
lgb_trial = lgb_study.best_trial
print("accuracy:", lgb_trial.value)
print()
print("Best params:", lgb_trial.params)
=========================================================
def light_check(x, params):
    model = lgb.LGBMClassifier()
    scores = cross_val_score(model, x, y, cv=KFold(n_splits=10, shuffle=True, random_state=1), n_jobs=-1)
    mean = scores.mean()
    return scores, mean

light_check(x, {'num_leaves': 230, 'max_depth': 53, 'learning_rate': 0.04037430031226232, 'n_estimators': 1143, 'min_child_samples': 381, 'subsample': 0.12985990464862135, 'colsample_bytree': 0.8914118949904919, 'reg_alpha': 31.869348047391053, 'reg_lambda': 17.45653692887209, 'importance_type': 'split'})
From what I can see, you are using train_x, train_y in the Optuna call, while in light_check you are passing x and y. Assuming you did a split in some code that isn't shown, the data set for Optuna is smaller and you get a different number.
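For an apples-to-apples check, a minimal sketch (assuming train_x, train_y and lgb_study from the code above) is to evaluate on the same data and actually pass the tuned hyperparameters into the model:
best_params = lgb_study.best_trial.params
model = lgb.LGBMClassifier(random_state=1, objective="multiclass", num_class=3, **best_params)
scores = cross_val_score(model, train_x, train_y, n_jobs=-1,
                         cv=KFold(n_splits=10, shuffle=True, random_state=1), scoring='accuracy')
print(scores.mean())  # should come out close to lgb_trial.value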

Labels stuck at 0 using nn.CrossEntropyLoss for binary classification in PyTorch

Is the way I am calculating the loss or pred_labels wrong? I am getting really low accuracy values on my val and test sets. The dataset is somewhat balanced actually and large enough. I am doing binary classification here.
30% of my dataset is class 0 and 70% is class 1, and the dataset includes ~2000 2D tensors of varying size, ranging from 100x512 to 8000x512 with a median size of 1200x512.
class Classifier(nn.Module):
    def __init__(self, n_class, batch_size):
        super(Classifier, self).__init__()
        self.batch_size = batch_size
        self.transformer = VisionTransformer()
        #self.criterion = nn.CrossEntropyLoss(reduce=False)
        #self.criterion = nn.BCELoss(reduce=False)
        #self.criterion = nn.BCEWithLogitsLoss(reduce=False) # weighted loss
        #self.criterion = nn.BCEWithLogitsLoss() # balanced loss
        #self.criterion = nn.BCELoss()
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, X, labels):
        stacked_X = torch.stack(X)
        out = self.transformer(stacked_X)
        #labels = torch.tensor(labels, dtype=torch.float32)
        labels = torch.tensor(labels)
        #m = nn.Sigmoid()
        with torch.cuda.amp.autocast():
            print(out[:,1] - out[:,0])
            #loss = self.criterion(m(out[:,1]-out[:,0]), labels.cuda())
            loss = self.criterion(out, labels.cuda())
        #pred_labels = out.data.max(1)[1]
        pred_labels = out.argmax(dim=1)
        labels = labels.int()
        return pred_labels, labels, loss
evaluator.get_scores returns 0.3194444444444444 (matching the ~30% of my data that is label 0).
For calculating accuracy, I am using this code snippet:
class ConfusionMatrix(object):
    def __init__(self, n_classes):
        self.n_classes = n_classes
        # axis = 0: prediction
        # axis = 1: target
        self.confusion_matrix = np.zeros((n_classes, n_classes))

    def _fast_hist(self, label_true, label_pred, n_class):
        hist = np.zeros((n_class, n_class))
        hist[label_pred, label_true] += 1
        return hist

    def update(self, label_trues, label_preds):
        for lt, lp in zip(label_trues, label_preds):
            tmp = self._fast_hist(lt.item(), lp.item(), self.n_classes)  # lt.item(), lp.item()
            self.confusion_matrix += tmp

    def get_scores(self):
        """Returns accuracy score evaluation result.
        - overall accuracy
        - mean accuracy
        - mean IU
        - fwavacc
        """
        hist = self.confusion_matrix
        # accuracy is recall/sensitivity for each class, predicted TP / all real positives
        # axis in sum: perform summation along
        if sum(hist.sum(axis=1)) != 0:
            acc = sum(np.diag(hist)) / sum(hist.sum(axis=1))
            print('acc is: ', acc)
        else:
            acc = 0.0
        return acc

    def plotcm(self):
        print(self.confusion_matrix)

    def reset(self):
        self.confusion_matrix = np.zeros((self.n_classes, self.n_classes))
and during the test phase, with 1 epoch, I am using this:
if epoch % 1 == 0:
    with torch.no_grad():
        model.eval()
        print("evaluating...")
        total = 0.
        batch_idx = 0
        val_preds = []
        val_labels = []
        predictions = []
        actuals = []
        for i_batch, sample_batched in enumerate(dataloader_val):
            val_pred, val_label, val_loss = evaluator.eval_test(sample_batched, model)
            val_epoch_loss += val_loss
            val_preds.extend(val_pred.tolist())
            val_labels.extend(val_label)
            total += len(val_label)
            evaluator.metrics.update(torch.tensor(val_label).cuda(), val_pred)
        print('evaluator.get_scores', evaluator.get_scores())
Here's what the output from the transformer looks like:
transformer out: tensor([[ 0.4381, -0.6186],
[ 0.4252, -0.4492],
[ 1.0657, -0.5201],
[ 0.8421, -0.6315],
[ 0.9444, -0.5340],
[ 0.9247, -0.6726],
[ 1.1587, -0.9463],
[ 1.0038, -1.0780],
[ 1.4244, -1.0721],
[ 0.4215, -0.7684],
[ 0.7522, -0.8166],
[ 1.2995, -0.9579],
[ 0.8080, -0.6492],
[ 1.0144, -0.5562],
[ 1.0666, -1.0291],
[ 0.3030, -0.7651],
[ 0.5221, -0.6741],
[ 1.1583, -0.4493],
[ 0.6098, -1.0080],
[ 0.3495, -1.0742],
[ 0.2278, -0.7298],
[ 0.5189, -0.6456],
[ 0.3409, -0.3661],
[ 0.9637, -0.9262],
[ 1.0781, -0.9345],
[ 1.0993, -1.0937],
[ 0.8297, -0.6071],
[ 0.5423, -1.1961],
[ 0.7860, -0.6777],
[-0.2522, -0.9376],
[ 0.6013, -0.9057],
[ 0.9975, -0.1858]], device='cuda:0', grad_fn=<AddmmBackward0>)
labels: tensor([1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
0, 0, 1, 1, 0, 1, 1, 0], dtype=torch.int32)
pred labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
I ran the training for 50 epochs and the result is the same: pred_labels are stuck at 0.

ValueError: Invalid parameter when fitting GridSearchCV

This is my code
param_grid = [{'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]},
              {'svc__C': [0.1, 1, 10, 100], 'svc__gamma': [0.001, 0.01, 0.1, 1, 10]},
              {'randomforestclassifier__n_estimators': [50, 100, 200, 300, 400]},
              {'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]},
              {'adaboostclassifier__n_estimators': [50, 100, 200, 300, 400]},
              {'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}
              ]
inner_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
outer_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
models = [knn, svc, forest, dtc, ada, bag]
model_names = ['knn', 'svc', 'forest', 'dtc', 'ada', 'bag']
for m, mname in zip(models, model_names):
    pipe = make_pipeline(VarianceThreshold(threshold=1),
                         MinMaxScaler(),
                         SelectKBest(f_classif, k=20),
                         m)
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=inner_cv)
    grid.fit(X_train_test, y_train_test)
    nested_score = cross_val_score(grid, X=X_train_test, y=y_train_test.values.ravel(), cv=outer_cv)
    print(mname)
    print(grid.best_params_)
    print(grid.best_score_)
    print('\n')
This is the error:
ValueError: Invalid parameter svc for estimator Pipeline(memory=None,
steps=[('variancethreshold', VarianceThreshold(threshold=1)),
('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
('selectkbest',
SelectKBest(k=20,
score_func=<function f_classif at 0x0000019E0A485AF8>)),
('kneighborsclassifier',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=5, p=2,
weights='uniform'))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I don't know what's wrong. I copied the parameter and model names from the pipeline's named_steps. If I run it without a parameter grid it works, so the problem is most likely there.
It seems to work like this, but I don't like it.
pg1 = {'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]}
pg2 = {'svc__C': [0.1, 1, 10, 100],
       'svc__gamma': [0.001, 0.01, 0.1, 1, 10]}
pg3 = {'randomforestclassifier__n_estimators': [50, 100, 200, 300, 400]}
pg4 = {'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]}
pg5 = {'adaboostclassifier__n_estimators': [50, 100, 200, 300, 400]}
pg6 = {'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}
param_grid_list = [pg1, pg2, pg3, pg4, pg5, pg6]
And then the loop becomes:
for m, p, mname in zip(models, param_grid_list, model_names):
    pipe = make_pipeline(VarianceThreshold(threshold=1),
                         MinMaxScaler(),
                         SelectKBest(f_classif, k=20),
                         m)
    grid = GridSearchCV(pipe, param_grid=p, cv=inner_cv)
    grid.fit(X_train_test, y_train_test)
    nested_score = cross_val_score(grid, X=X_train_test, y=y_train_test.values.ravel(), cv=outer_cv)
    print(mname)
    print(grid.best_params_)
    print(grid.best_score_)
    print('\n')
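The original error happens because the full param_grid list is passed to every pipeline, so GridSearchCV also tries, say, svc__C on the pipeline ending in kneighborsclassifier. A slightly tidier variant of the fix, assuming the same models and model_names as above, is to key the grids by model name so the pairing is explicit:
param_grids = {
    'knn': {'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]},
    'svc': {'svc__C': [0.1, 1, 10, 100], 'svc__gamma': [0.001, 0.01, 0.1, 1, 10]},
    'forest': {'randomforestclassifier__n_estimators': [50, 100, 200, 300, 400]},
    'dtc': {'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]},
    'ada': {'adaboostclassifier__n_estimators': [50, 100, 200, 300, 400]},
    'bag': {'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]},
}

for m, mname in zip(models, model_names):
    pipe = make_pipeline(VarianceThreshold(threshold=1),
                         MinMaxScaler(),
                         SelectKBest(f_classif, k=20),
                         m)
    grid = GridSearchCV(pipe, param_grid=param_grids[mname], cv=inner_cv)
    grid.fit(X_train_test, y_train_test)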

Spot checking ML models: how do I fit all models? Do I use a for loop?

I am spot checking a bunch of regression models. How do I fit multiple ML models? Would I use a for loop and call model.fit?
#Variables
alpha = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

#function
def get_models(model=dict()):
    model['lr'] = LinearRegression()
    for values in alpha:
        model["Lasso"] = Lasso(alpha=values)
        model["Ridge"] = Ridge(alpha=values)
    model["Huber"] = HuberRegressor()
    model["Lars"] = Lars()
    model["Lasso_l"] = LassoLars()
    model["PA"] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
    model["RANSAC"] = RANSACRegressor()
    model["SGD"] = SGDRegressor(max_iter=1000, tol=1e-3)
    model["theil"] = TheilSenRegressor()
    model["cart"] = DecisionTreeRegressor()
    model["extra"] = ExtraTreeRegressor()
    model["svml"] = SVR(kernel='linear')
    model["svmp"] = SVR(kernel='poly')
    return model

#Loaded data and have X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
#fitting models
Yes, once your dict is filled by get_models() (e.g. models = get_models()), you can fit the models with a for loop over the dict's values:
for model in models.values():
    model.fit(X_train, y_train)
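If you also want a quick score for each model while spot checking, a small sketch (assuming the X_train, y_train, X_test, y_test from the split above) could be:
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name, model.score(X_test, y_test))  # R^2 for regressors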
You can easily loop through several scikit-learn models and do all the fitting too. Try the sample code directly below and take a look at the links towards the bottom of my post.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import datasets
from sklearn.linear_model import SGDClassifier, LogisticRegression, \
Perceptron, PassiveAggressiveClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct, Matern, StationaryKernelMixin, WhiteKernel
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from utilities import *
from universal_params import *
def gen_classification_data(n=None):
    """
    uses the iris data
    :return: x, y
    """
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    if n:
        half = int(n/2)
        np.concatenate((x[:half], x[-half:]), 1), np.concatenate((y[:half], y[-half:]), 0)
    return x, y
linear_models_n_params = [
(SGDClassifier,
{'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'],
'alpha': [0.0001, 0.001, 0.1],
**penalty_12none
}),
(LogisticRegression,
{**penalty_12, **max_iter, **tol, ** warm_start, **C,
'solver': ['liblinear']
}),
(Perceptron,
{**penalty_all, **alpha, **n_iter, **eta0, **warm_start
}),
(PassiveAggressiveClassifier,
{**C, **n_iter, **warm_start,
'loss': ['hinge', 'squared_hinge'],
})
]
linear_models_n_params_small = linear_models_n_params
svm_models_n_params = [
(SVC,
{**C, **kernel, **degree, **gamma, **coef0, **shrinking, **tol, **max_iter_inf2}),
(NuSVC,
{**nu, **kernel, **degree, **gamma, **coef0, **shrinking, **tol
}),
(LinearSVC,
{ **C, **penalty_12, **tol, **max_iter,
'loss': ['hinge', 'squared_hinge'],
})
]
svm_models_n_params_small = [
(SVC,
{**kernel, **degree, **shrinking
}),
(NuSVC,
{**nu_small, **kernel, **degree, **shrinking
}),
(LinearSVC,
{ **C_small,
'penalty': ['l2'],
'loss': ['hinge', 'squared_hinge'],
})
]
neighbor_models_n_params = [
(KMeans,
{'algorithm': ['auto', 'full', 'elkan'],
'init': ['k-means++', 'random']}),
(KNeighborsClassifier,
{**n_neighbors, **neighbor_algo, **neighbor_leaf_size, **neighbor_metric,
'weights': ['uniform', 'distance'],
'p': [1, 2]
}),
(NearestCentroid,
{**neighbor_metric,
'shrink_threshold': [1e-3, 1e-2, 0.1, 0.5, 0.9, 2]
}),
(RadiusNeighborsClassifier,
{**neighbor_radius, **neighbor_algo, **neighbor_leaf_size, **neighbor_metric,
'weights': ['uniform', 'distance'],
'p': [1, 2],
'outlier_label': [-1]
})
]
gaussianprocess_models_n_params = [
(GaussianProcessClassifier,
{**warm_start,
'kernel': [RBF(), ConstantKernel(), DotProduct(), WhiteKernel()],
'max_iter_predict': [500],
'n_restarts_optimizer': [3],
})
]
bayes_models_n_params = [
(GaussianNB, {})
]
nn_models_n_params = [
(MLPClassifier,
{ 'hidden_layer_sizes': [(16,), (64,), (100,), (32, 32)],
'activation': ['identity', 'logistic', 'tanh', 'relu'],
**alpha, **learning_rate, **tol, **warm_start,
'batch_size': ['auto', 50],
'max_iter': [1000],
'early_stopping': [True, False],
'epsilon': [1e-8, 1e-5]
})
]
nn_models_n_params_small = [
(MLPClassifier,
{ 'hidden_layer_sizes': [(64,), (32, 64)],
'batch_size': ['auto', 50],
'activation': ['identity', 'tanh', 'relu'],
'max_iter': [500],
'early_stopping': [True],
**learning_rate_small
})
]
tree_models_n_params = [
(RandomForestClassifier,
{'criterion': ['gini', 'entropy'],
**max_features, **n_estimators, **max_depth,
**min_samples_split, **min_impurity_split, **warm_start, **min_samples_leaf,
}),
(DecisionTreeClassifier,
{'criterion': ['gini', 'entropy'],
**max_features, **max_depth, **min_samples_split, **min_impurity_split, **min_samples_leaf
}),
(ExtraTreesClassifier,
{**n_estimators, **max_features, **max_depth,
**min_samples_split, **min_samples_leaf, **min_impurity_split, **warm_start,
'criterion': ['gini', 'entropy']})
]
tree_models_n_params_small = [
(RandomForestClassifier,
{**max_features_small, **n_estimators_small, **min_samples_split, **max_depth_small, **min_samples_leaf
}),
(DecisionTreeClassifier,
{**max_features_small, **max_depth_small, **min_samples_split, **min_samples_leaf
}),
(ExtraTreesClassifier,
{**n_estimators_small, **max_features_small, **max_depth_small,
**min_samples_split, **min_samples_leaf})
]
def run_linear_models(x, y, small=True, normalize_x=True):
    return big_loop(linear_models_n_params_small if small else linear_models_n_params,
                    StandardScaler().fit_transform(x) if normalize_x else x, y, isClassification=True)

def run_svm_models(x, y, small=True, normalize_x=True):
    return big_loop(svm_models_n_params_small if small else svm_models_n_params,
                    StandardScaler().fit_transform(x) if normalize_x else x, y, isClassification=True)

def run_neighbor_models(x, y, normalize_x=True):
    return big_loop(neighbor_models_n_params,
                    StandardScaler().fit_transform(x) if normalize_x else x, y, isClassification=True)

def run_gaussian_models(x, y, normalize_x=True):
    return big_loop(gaussianprocess_models_n_params,
                    StandardScaler().fit_transform(x) if normalize_x else x, y, isClassification=True)

def run_nn_models(x, y, small=True, normalize_x=True):
    return big_loop(nn_models_n_params_small if small else nn_models_n_params,
                    StandardScaler().fit_transform(x) if normalize_x else x, y, isClassification=True)

def run_tree_models(x, y, small=True, normalize_x=True):
    return big_loop(tree_models_n_params_small if small else tree_models_n_params,
                    StandardScaler().fit_transform(x) if normalize_x else x, y, isClassification=True)

def run_all(x, y, small=True, normalize_x=True, n_jobs=cpu_count()-1):
    all_params = (linear_models_n_params_small if small else linear_models_n_params) + \
                 (nn_models_n_params_small if small else nn_models_n_params) + \
                 ([] if small else gaussianprocess_models_n_params) + \
                 neighbor_models_n_params + \
                 (svm_models_n_params_small if small else svm_models_n_params) + \
                 (tree_models_n_params_small if small else tree_models_n_params)
    return big_loop(all_params,
                    StandardScaler().fit_transform(x) if normalize_x else x, y,
                    isClassification=True, n_jobs=n_jobs)

if __name__ == '__main__':
    x, y = gen_classification_data()
    run_all(x, y, n_jobs=1)
Here are a couple of examples that you can follow:
https://github.com/PyDataBlog/Python-for-Data-Science/blob/master/Tutorials/Yellow%20brick.ipynb
https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf
https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/
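Note that the snippet above relies on helper modules (utilities, universal_params) that are not shown, so big_loop and the **param dicts are not defined here. As a rough, hypothetical sketch of what a big_loop-style helper might do (not the actual implementation), it would grid-search each (estimator, param grid) pair and collect the results:
from sklearn.model_selection import GridSearchCV

def big_loop(models_n_params, x, y, isClassification=True, n_jobs=1):
    # hypothetical sketch: isClassification could switch the scorer, ignored here
    results = []
    for model_cls, params in models_n_params:
        search = GridSearchCV(model_cls(), params, n_jobs=n_jobs)
        search.fit(x, y)
        results.append((model_cls.__name__, search.best_score_, search.best_params_))
    return results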

ValueError of hyperopt in searching parameters of RandomForest

I am trying to find parameters of RandomForestClassifier using hyperopt. Here is my code:
X, y = load_wine(return_X_y=True)

def rf_neg_score(params):
    X, y = params.pop('X'), params.pop('y')
    cv = params.pop('cv')
    scoring = params.pop('scoring')
    rf_clf = RandomForestClassifier(**params)
    score = cross_val_score(rf_clf, X=X, y=y, n_jobs=-1, scoring=scoring,
                            cv=cv).mean()
    return -score

rf_search_space = {
    'n_estimators': hp.choice('n_estimators', np.arange(10, 1000, dtype=int)),
    'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(2, 65, dtype=int)),
    'n_jobs': -1,
    'X': X,
    'y': y,
    'cv': StratifiedKFold(n_splits=5),
    'scoring': 'f1_micro'
}

rf_best_params = fmin(fn=rf_neg_score, space=rf_search_space, max_evals=100,
                      algo=tpe.suggest)
After I run it, a ValueError is raised at once:
/usr/local/lib/python3.6/dist-packages/hyperopt/utils.py in use_obj_for_literal_in_memo(expr, obj, lit, memo)
167 for node in pyll.dfs(expr):
168 try:
--> 169 if node.obj == lit:
170 memo[node] = obj
171 except AttributeError:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
What do you think I am doing wrong?
Found the solution. It seems hyperopt checks every item in the search space to see whether it is a hyperopt.hp.* expression, and the ValueError is raised during that check, so there is no way to pass the data through the search space. Here is the corrected code:
def rf_neg_score(params):
    scoring = params.pop('scoring')
    cv = params.pop('cv')
    rf_clf = RandomForestClassifier(**params)
    # X and y are provided outside the function
    score = cross_val_score(rf_clf, X=X, y=y, n_jobs=-1,
                            scoring=scoring, cv=cv).mean()
    return -score

rf_search_space = {
    'n_estimators': hp.choice('n_estimators', np.arange(10, 1000, dtype=int)),
    'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(2, 65, dtype=int)),
    'scoring': 'f1_micro',
    'cv': StratifiedKFold(n_splits=5)
}

rf_best_params = fmin(fn=rf_neg_score, space=rf_search_space, max_evals=100,
                      algo=tpe.suggest)
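An alternative that avoids relying on globals, assuming the same scoring logic, is to bind the data with functools.partial so that the search space contains only the hp.choice entries (n_estimators, max_depth, max_leaf_nodes); a minimal sketch:
from functools import partial

def rf_neg_score(params, X, y, cv, scoring):
    rf_clf = RandomForestClassifier(**params)
    score = cross_val_score(rf_clf, X=X, y=y, n_jobs=-1,
                            scoring=scoring, cv=cv).mean()
    return -score

objective = partial(rf_neg_score, X=X, y=y,
                    cv=StratifiedKFold(n_splits=5), scoring='f1_micro')
rf_best_params = fmin(fn=objective, space=rf_search_space, max_evals=100,
                      algo=tpe.suggest)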
