This is my code
# One grid per classifier, keyed by the step names that make_pipeline generates
# (the lower-cased class name, as the Pipeline repr in the quoted error shows).
# NOTE(review): the whole list is handed unchanged to every pipeline in the
# loop below, so each pipeline is also asked to set parameters of steps it does
# not contain (e.g. svc__C on the KNN pipeline). That matches the quoted
# "Invalid parameter svc for estimator Pipeline(... kneighborsclassifier ...)".
param_grid = [{'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]},
{'svc__C': [0.1, 1, 10, 100], 'svc__gamma': [0.001, 0.01, 0.1, 1, 10]},
{'randomforestclassifier__n_estimators': [50, 100, 200, 300 ,400]},
{'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]},
{'adaboostclassifier__n_estimators': [50, 100, 200, 300 ,400]},
{'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}
]
# Splitters for the inner (hyper-parameter search) and outer (scoring) loops;
# both are built with identical arguments, including random_state=0.
inner_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
outer_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
# models and model_names are parallel lists; they are zipped together below.
models = [knn, svc, forest, dtc, ada, bag]
model_names = ['knn', 'svc','forest', 'dtc', 'ada', 'bag']
# NOTE(review): indentation was lost in this paste; the statements that follow
# presumably all belong to the loop body.
for m, mname in zip(models, model_names):
# Same preprocessing for every classifier; only the final step `m` changes.
pipe = make_pipeline(VarianceThreshold(threshold=1),
MinMaxScaler(),
SelectKBest(f_classif, k=20),
m)
# The full six-entry param_grid list is used here, not the grid for `m` alone.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=inner_cv)
grid.fit(X_train_test, y_train_test)
# The search object itself is cross-validated with the outer splitter;
# nested_score is not used again within this snippet.
nested_score = cross_val_score(grid, X=X_train_test, y=y_train_test.values.ravel(), cv=outer_cv)
print(mname)
print(grid.best_params_)
print(grid.best_score_)
print('\n')
This is the error:
ValueError: Invalid parameter svc for estimator Pipeline(memory=None,
steps=[('variancethreshold', VarianceThreshold(threshold=1)),
('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
('selectkbest',
SelectKBest(k=20,
score_func=<function f_classif at 0x0000019E0A485AF8>)),
('kneighborsclassifier',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=5, p=2,
weights='uniform'))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I don't know what's wrong. I copied the parameter and model names from the named_steps of the pipeline. If I run it without a parameter grid it works, so the problem is most likely there.
It seems to work like this, but I don't like it.
# One search grid per classifier. Keys follow the "<pipeline step>__<parameter>"
# convention, so each grid is only valid for the pipeline that ends in the
# matching estimator.
pg1 = dict(kneighborsclassifier__n_neighbors=list(range(3, 8)))
pg2 = dict(
    svc__C=[0.1, 1, 10, 100],
    svc__gamma=[0.001, 0.01, 0.1, 1, 10],
)
pg3 = dict(randomforestclassifier__n_estimators=[50, 100, 200, 300, 400])
pg4 = dict(decisiontreeclassifier__max_depth=[12, 25, 50, 75, 100])
pg5 = dict(adaboostclassifier__n_estimators=[50, 100, 200, 300, 400])
pg6 = dict(baggingclassifier__n_estimators=[50, 100, 200, 300, 400])

# Order matters: this list is consumed in lockstep with the list of models.
param_grid_list = [pg1, pg2, pg3, pg4, pg5, pg6]
And then the loop becomes:
for m, p, mname in zip(models, param_grid_list, model_names):
    # Each classifier gets its own pipeline and only its own grid, so every
    # "<step>__<param>" key refers to a step that exists in that pipeline.
    pipe = make_pipeline(
        VarianceThreshold(threshold=1),
        MinMaxScaler(),
        SelectKBest(f_classif, k=20),
        m,
    )
    grid = GridSearchCV(pipe, param_grid=p, cv=inner_cv)

    # Use the same flattened 1-D target for the plain fit and the nested CV.
    y_flat = y_train_test.values.ravel()
    grid.fit(X_train_test, y_flat)

    # Nested CV: the whole search is re-run inside each outer split.
    nested_score = cross_val_score(grid, X=X_train_test, y=y_flat, cv=outer_cv)

    print(mname)
    print(grid.best_params_)
    print(grid.best_score_)
    # Report the nested estimate as well; it was computed but never shown.
    print(nested_score.mean())
    print('\n')
Related
An accuracy score from Optuna and a score from cross_val_score were different. Why does this occur, and which score should I choose?
I used the hyperparameters that I got in optuna in cross_val_score.
def objective_lgb(trial):
    """Optuna objective: mean cross-validated accuracy of an LGBMClassifier.

    Ten hyper-parameters are drawn from ``trial``, a multiclass (3-class)
    classifier is built with ``random_state=1``, and it is scored with
    shuffled 10-fold cross-validation on the module-level ``train_x`` /
    ``train_y``.

    Returns the mean accuracy over the ten folds.
    """
    # Suggestions are requested in a fixed order, one entry per parameter.
    sampled = {
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 100),
        "learning_rate": trial.suggest_float('learning_rate', 0.001, 1),
        "n_estimators": trial.suggest_int('n_estimators', 100, 2000),
        "min_child_samples": trial.suggest_int('min_child_samples', 3, 1000),
        "subsample": trial.suggest_float('subsample', 0.000001, 1),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.00000001, 1),
        "reg_alpha": trial.suggest_float('reg_alpha', 0, 400),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 400),
        "importance_type": trial.suggest_categorical('importance_type', ["split", "gain"]),
    }

    lgb_clf = lgb.LGBMClassifier(
        random_state=1,
        objective="multiclass",
        num_class=3,
        **sampled,
    )

    folds = KFold(n_splits=10, shuffle=True, random_state=1)
    fold_scores = cross_val_score(
        lgb_clf, train_x, train_y, n_jobs=-1, cv=folds, scoring='accuracy'
    )
    return fold_scores.mean()
# Maximise the objective's return value (mean CV accuracy) over 1500 trials.
lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(objective_lgb, n_trials=1500)

# Report the best trial: its objective value first, then its parameters.
lgb_trial = lgb_study.best_trial
print("accuracy:", lgb_trial.value, end="\n\n")
print("Best params:", lgb_trial.params)
=========================================================
def light_check(x, params):
    """Cross-validate an LGBMClassifier built from *params*.

    Parameters
    ----------
    x
        Feature matrix passed to ``cross_val_score``.
    params : dict
        Keyword arguments for ``lgb.LGBMClassifier`` (e.g. a study's best
        parameters). These were previously ignored, so the check always
        scored a default model.

    Returns
    -------
    tuple
        ``(scores, mean)``: the per-fold accuracies and their mean.

    Notes
    -----
    The target ``y`` is read from module scope. The scores are only
    comparable with the tuning run if ``x`` / ``y`` are the same data the
    objective was evaluated on.
    """
    # Same seed as the tuning objective; an explicit entry in params wins.
    settings = {"random_state": 1, **(params or {})}
    model = lgb.LGBMClassifier(**settings)
    folds = KFold(n_splits=10, shuffle=True, random_state=1)
    scores = cross_val_score(model, x, y, cv=folds, n_jobs=-1, scoring="accuracy")
    return scores, scores.mean()


light_check(x,{'num_leaves': 230, 'max_depth': 53, 'learning_rate': 0.04037430031226232, 'n_estimators': 1143, 'min_child_samples': 381, 'subsample': 0.12985990464862135, 'colsample_bytree': 0.8914118949904919, 'reg_alpha': 31.869348047391053, 'reg_lambda': 17.45653692887209, 'importance_type': 'split'})
From what I can see, you are using train_x, train_y in the Optuna objective, while in light_check you are passing x and y. Assuming you did a split in some code that is not shown, the data set used by Optuna is smaller and you get a different number.
I am using sklearn along with Optuna for HPO. I would like to create a custom function that takes an sklearn pipeline as input and returns Optuna-specific grids. Returning sklearn-specific param grids (i.e. dictionaries) seems to be more straightforward (duh); this is what I've got so far:
# Build a hyper-parameter specification for the final step of a pipeline.
# `estimator` must expose named_steps['estimator']; the class name of that step
# selects the grid. `type` selects the flavour: 'sklearn' returns a dict of
# candidate values, 'optuna' is meant to return values suggested by a trial.
# NOTE(review): the parameter name `type` shadows the builtin inside this function.
def grid_from_estimator(estimator, type = 'sklearn'):
estimator_name = estimator.named_steps['estimator'].__class__.__name__
if type == 'sklearn':
if estimator_name=='LogisticRegression':
params = {
'estimator__penalty': ['l1','elasticnet'],
'estimator__C': np.logspace(-4, 4, 20)
}
elif estimator_name=='LGBMClassifier':
params = {
'estimator__n_estimators': np.arange(100, 1000, 200),
'estimator__boosting_type':['gbdt','dart'],
'estimator__max_depth': np.arange(6, 12),
'estimator__num_leaves': np.arange(30, 150,5),
'estimator__learning_rate': [1e-2/2 , 1e-2, 1e-1/2, 1e-1, 0.5, 1],
'estimator__min_child_samples': np.arange(20, 100, 5),
'estimator__subsample': np.arange(0.65, 1, 0.05),
'estimator__colsample_bytree': np.arange(0.4, 0.75, 0.05),
'estimator__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
'estimator__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
'estimator__iterations': np.arange(100, 800, 100),
# NOTE(review): a bare string, while every other entry is a list or array
# of candidates; presumably should be ['binary'].
'estimator__objective': 'binary'
}
elif type == 'optuna':
# NOTE(review): `trial` is neither a parameter nor assigned anywhere in this
# function, so every trial.suggest_* call below depends on a name that does not
# exist here. It would have to be passed in by the caller.
if estimator_name == 'LogisticRegression':
params = {
'estimator__penalty': trial.suggest_categorical('penalty', ['l1', 'elasticnet']),
# NOTE(review): `trial.suggest.suggest_loguniform` has a doubled attribute;
# presumably trial.suggest_loguniform was meant. The bounds -4 and 4 look like
# the exponents used with np.logspace above rather than actual values; confirm.
'estimator__C': trial.suggest.suggest_loguniform('c', -4, 4)
}
elif estimator_name == 'LGBMClassifier':
params = {
'estimator__n_estimators': trial.suggest_int('n_estimators', 100, 1000),
'estimator__boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
'estimator__max_depth': trial.suggest_int('max_depth', 6, 12),
'estimator__num_leaves': trial.suggest_int('num_leaves', 30, 150, 5),
'estimator__learning_rate': trial.suggest_float('learning_rate', 1e-4, 1),
'estimator__min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
'estimator__subsample': trial.suggest_float('subsample', 0.5, 1),
'estimator__colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.75),
'estimator__reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10),
'estimator__reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10)
}
# `params` is only assigned inside the branches above; for any other `type` or
# estimator class it is unbound when this line runs.
return params
The "trial.suggest_..." parts keep 'complaining' and returning an error; although I understand the reason why, I can't see any way around it. Is this even possible? Any ideas?
Appreciate your support!
I think something along these lines should work:
def grid_from_estimator(estimator, trial, type='sklearn'):
    """Placeholder: build the parameter set for *estimator* using *trial*.

    The body is intentionally empty in this sketch; the point is only that
    the Optuna trial is passed in as an argument.
    """


def your_objective_function(trial):
    """Objective sketch: obtain the trial-specific parameters first."""
    params = grid_from_estimator('LogisticRegression', trial, 'optuna')
    # Rest of the code here.


def tune_model():
    """Create a study and run the objective for 20 trials."""
    study = optuna.create_study()
    study.optimize(your_objective_function, n_trials=20)


tune_model()
An example method using optuna ask and tell interface.
Code
import optuna
import numpy as np
def optuna_objective(estimator_name, params):
    """Return the objective value for one set of suggested parameters.

    Parameters
    ----------
    estimator_name : str
        Selects which objective to evaluate.
    params : dict
        Parameter values for the trial. For 'LogisticRegression' the keys
        'x' and 'y' are read.

    Returns
    -------
    The toy value ``(x - 2) ** 2 + y`` for 'LogisticRegression'; ``None`` for
    'LGBMClassifier' (placeholder, nothing is evaluated yet) and for any
    other name.
    """
    if estimator_name == 'LogisticRegression':
        # Stand-in objective used by the demo; no model is fitted here.
        return (params['x'] - 2) ** 2 + params['y']

    # 'LGBMClassifier' is a placeholder: a real implementation would fit the
    # model with the suggested estimator__* values and return its accuracy.
    return None
def grid_from_estimator(estimator_name, type_='sklearn', study=None):
    """Return a hyper-parameter specification for *estimator_name*.

    Parameters
    ----------
    estimator_name : str
        'LogisticRegression' and 'LGBMClassifier' are recognised.
    type_ : str
        'sklearn' returns a dict mapping parameter names to sequences of
        candidate values. 'optuna' asks *study* for a new trial and returns
        one concrete value per parameter, suggested by that trial.
    study : optional
        Required when ``type_ == 'optuna'``; only its ``ask()`` method is
        used here.

    Returns
    -------
    tuple
        ``(params, trial)``. ``trial`` is ``None`` for 'sklearn'. ``params``
        is ``None`` when the estimator name (or ``type_``) is not recognised.

    Raises
    ------
    ValueError
        If ``type_ == 'optuna'`` and no study is given.
    """
    params, trial = None, None

    if type_ == 'sklearn':
        if estimator_name == 'LogisticRegression':
            params = {
                'estimator__penalty': ['l1', 'elasticnet'],
                'estimator__C': np.logspace(-4, 4, 20),
            }
        elif estimator_name == 'LGBMClassifier':
            params = {
                'estimator__n_estimators': np.arange(100, 1000, 200),
                'estimator__boosting_type': ['gbdt', 'dart'],
                'estimator__max_depth': np.arange(6, 12),
                'estimator__num_leaves': np.arange(30, 150, 5),
                'estimator__learning_rate': [1e-2 / 2, 1e-2, 1e-1 / 2, 1e-1, 0.5, 1],
                'estimator__min_child_samples': np.arange(20, 100, 5),
                'estimator__subsample': np.arange(0.65, 1, 0.05),
                'estimator__colsample_bytree': np.arange(0.4, 0.75, 0.05),
                'estimator__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
                'estimator__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
                'estimator__iterations': np.arange(100, 800, 100),
                # Grid values must be sequences of candidates, so the single
                # fixed objective is wrapped in a list.
                'estimator__objective': ['binary'],
            }

    elif type_ == 'optuna':
        if study is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("study is required when type_ == 'optuna'")
        trial = study.ask()

        if estimator_name == 'LogisticRegression':
            # Toy parameters for the demo objective; a real search would
            # suggest estimator__penalty and estimator__C here instead.
            params = {
                'x': trial.suggest_float('x', -10, 10),
                'y': trial.suggest_float('y', -10, 10),
            }
        elif estimator_name == 'LGBMClassifier':
            params = {
                'estimator__n_estimators': trial.suggest_int('estimator__n_estimators', 100, 1000),
                'estimator__boosting_type': trial.suggest_categorical('estimator__boosting_type', ['gbdt', 'dart']),
                'estimator__max_depth': trial.suggest_int('estimator__max_depth', 6, 12),
                # step is passed by keyword; the positional form is deprecated.
                'estimator__num_leaves': trial.suggest_int('estimator__num_leaves', 30, 150, step=5),
                'estimator__learning_rate': trial.suggest_float('estimator__learning_rate', 1e-4, 1),
                'estimator__min_child_samples': trial.suggest_int('estimator__min_child_samples', 20, 100),
                'estimator__subsample': trial.suggest_float('estimator__subsample', 0.5, 1),
                'estimator__colsample_bytree': trial.suggest_float('estimator__colsample_bytree', 0.4, 0.75),
                'estimator__reg_alpha': trial.suggest_float('estimator__reg_alpha', 1e-2, 10),
                'estimator__reg_lambda': trial.suggest_float('estimator__reg_lambda', 1e-2, 10),
            }

    return params, trial
# (1) sklearn example: just fetch and show the static grid.
print('SKLEARN')
estimator_name = 'LogisticRegression'
optimizer_type = 'sklearn'
params, _ = grid_from_estimator(estimator_name, type_=optimizer_type)
print(params, end='\n\n')

# (2) Optuna example with the ask-and-tell interface: the study hands out a
# trial, the objective is evaluated outside of study.optimize(), and the
# result is reported back with tell().
print('OPTUNA')
study = optuna.create_study(direction='maximize')
n_trials = 10
estimator_name = 'LogisticRegression'
optimizer_type = 'optuna'

for _ in range(n_trials):
    params, trial = grid_from_estimator(estimator_name, type_=optimizer_type, study=study)
    objective_value = optuna_objective(estimator_name, params)
    study.tell(trial, objective_value)  # tell the pair of trial and objective value
    print(f'trialnum: {trial.number}, params: {params}, value: {objective_value}')

# Summarise the best trial seen by the study.
best_params = study.best_params
best_x, best_y = best_params["x"], best_params["y"]
best_value = study.best_value
best_trial_num = study.best_trial.number
print(f"best x: {best_x}, best y: {best_y}, (x - 2)^2 + y: {(best_x - 2) ** 2 + best_y}, best_value: {best_value}, best_trial_num: {best_trial_num}") # trial num starts at 0
Output
SKLEARN
{'estimator__penalty': ['l1', 'elasticnet'], 'estimator__C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])}
OPTUNA
[I 2021-11-25 19:03:09,673] A new study created in memory with name: no-name-f5046b21-f579-4c74-8046-79420c256d4a
trialnum: 0, params: {'x': 2.905894660287128, 'y': -4.537699327718261}, value: -3.7170541921815303
trialnum: 1, params: {'x': -9.275103438355583, 'y': -5.925000918692578}, value: 121.2029566269253
trialnum: 2, params: {'x': -2.9531168045205103, 'y': 5.253730464314739}, value: 29.78709654353821
trialnum: 3, params: {'x': 3.766902399344163, 'y': 3.778408673279479}, value: 6.900352762087639
trialnum: 4, params: {'x': -0.897563829823584, 'y': -0.887774211794973}, value: 7.508101936106943
trialnum: 5, params: {'x': -2.2256917634354645, 'y': 3.8017184220598903}, value: 21.658189301626216
trialnum: 6, params: {'x': -6.333366980619912, 'y': 9.87067058585388}, value: 79.3156758195401
trialnum: 7, params: {'x': 2.570258991787558, 'y': -0.1959178948625162}, value: 0.1292774228520457
trialnum: 8, params: {'x': 2.94430596072913, 'y': 4.318454050149043}, value: 5.210167797617609
trialnum: 9, params: {'x': 5.972023459737699, 'y': 4.165369460555215}, value: 19.942339825261854
best x: -9.275103438355583, best y: -5.925000918692578, (x - 2)^2 + y: 121.2029566269253, best_value: 121.2029566269253, best_trial_num: 1
I am writing a classification algorithm and I use 6 different models. I want to improve each model by fine-tuning its parameters. The problem I am encountering is related to my "for loop". In fact, I loop over three different dictionaries, but the correspondence between the model I use in the grid search and its parameters is not kept, since the dictionaries are not ordered;
I seem to failed to find another solution :
here my code and the result as you can see the model_name is different from the param_name so I get multiple erros like for example (ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
)
below the code
# Six untuned estimators, registered under the keys that are also used for the
# parameter grids further down.
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3, 'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
# NOTE(review): these are the strings 'True'/'False', not booleans.
'bootstrap': ['True', 'False'],
'max_depth': [3, 5, 15, 25]
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]
}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
# NOTE(review): strings again rather than booleans.
'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
# NOTE(review): there is no comma after the 'alpha' list on the next line, so
# this dict literal is not valid syntax as written.
# NOTE(review): 'loss' and 'penalty' presumably belong to a different
# estimator than the one this grid is registered for ('Model_G_NB'); confirm.
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
'loss': ['hinge', 'hinge_squarred'],
'penalty' : ['l1', 'l2']
}
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = { 'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
# NOTE(review): 'Model_RF' is mapped to parameter_LR; parameter_RF is defined
# above but never referenced.
parameters_dict = {'Model_SVC': parameter_LinearSvc, 'Model_G_NB': parameter_NB, 'Model_LR': parameter_LR, 'Model_RF': parameter_LR, 'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
# NOTE(review): indentation was lost in this paste; the loops are presumably nested.
for feature_name, feature in features.items():
for model_name, model in models.items():
# The innermost loop walks over every grid for every model, so each estimator
# is combined with all six grids, not only its own. The log that follows shows
# this pairing (Model: Model_SVC with Param: Model_G_NB) right before the
# "Invalid parameter alpha for estimator LinearSVC" error.
for param_name, parameter in parameters_dict.items():
clf = GridSearchCV(estimator=model, param_grid=parameter, cv=cv_splitter, verbose = 1, n_jobs = -1, return_train_score=True)
best_model = clf.fit(feature, ylabels)
Output: as you can see, sometimes it works, but other times the param and the model are not the same, which causes the error.
[5 rows x 7 columns]
Feature: vecteur_CV
Model: Model_SVC
Param: Model_SVC
Fitting 10 folds for each of 6 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 out of 60 | elapsed: 2.8s remaining: 0.1s
/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done 60 out of 60 | elapsed: 2.8s finished
Feature: vecteur_CV
Model: Model_SVC
Param: Model_G_NB
Fitting 10 folds for each of 24 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 503, in _fit_and_score
estimator.set_params(**parameters)
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 224, in set_params
(key, self))
ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
Features look like this
`X_data, X_data_0, X_data_1, X_data_2 = features_fusion(verbatim, first_arg)
features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0, 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
I used 2 slightly different versions of the Iris dataset and this code below runs (albeit with many warnings during training):
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
data = load_iris()

model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)

# Estimators and grids are stored under the same keys, so every model is
# paired with its own grid by lookup instead of by iterating both dicts.
models = {
    'Model_SVC': model1,
    'Model_G_NB': model2,
    'Model_LR': model3,
    'Model_RF': model4,
    'Model_KN': model5,
    'Model_MLP': model6,
}

# list of parameters
parameter_RF = {
    'max_depth': [2, 3, 5, 15, 25],
    'min_samples_split': [3, 5, 10],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 300],
    # 'auto' is left out: for a classifier it is the same setting as 'sqrt'
    # and recent scikit-learn releases no longer accept it.
    'max_features': ['sqrt', 'log2'],
    # Real booleans: the strings 'True' and 'False' are both truthy.
    'bootstrap': [True, False],
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
parameter_LR = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    # 'warn' was only a placeholder default in older releases, not a solver.
    'solver': ['liblinear'],
    # 'dual' was dropped from this grid because it gave errors.
    'max_iter': [100, 110, 120, 130, 140],
}
# 'loss' and 'penalty' were dropped from this grid because they gave errors.
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}

k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = {
    'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [100, 200, 300],
}
parameters_dict = {
    'Model_SVC': parameter_LinearSvc,
    'Model_G_NB': parameter_NB,
    'Model_LR': parameter_LR,
    'Model_RF': parameter_RF,
    'Model_KN': parameter_KNN,
    'Model_MLP': parameter_MLP,
}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)

# Two feature sets stand in for the original feature vectors: the full iris
# matrix and the same matrix without its first column.
features = {'iris': data['data'], 'iris_sub': data['data'][:, 1:]}
ylabels = data['target']

# Keep every fitted search, keyed by (feature set, model); previously each
# iteration overwrote the single best_model variable.
best_models = {}
for feature_name, feature in features.items():
    for model_name, estimator in models.items():
        print('Training model: ', model_name)
        clf = GridSearchCV(
            estimator=estimator,
            param_grid=parameters_dict[model_name],
            cv=cv_splitter,
            verbose=1,
            n_jobs=-1,
            return_train_score=True,
        )
        best_model = clf.fit(feature, ylabels)
        best_models[(feature_name, model_name)] = best_model
#for feature_name, feature in features.items():
# for model_name, model in models.items():
# for param_name, parameter in parameters_dict.items():
# print(model_name,model,param_name,parameter)
# clf = GridSearchCV(estimator=model, param_grid=parameter,
# cv=cv_splitter, verbose = 1, n_jobs = -1,
# return_train_score=True)
# best_model = clf.fit(feature, ylabels)
I had to comment some model parameters as they gave errors. There was also a typo in your snippet above 'Model_RF': parameter_LR should be 'Model_RF': parameter_RF. I'm not sure if that was the reason of your error. I also removed the inner parameters_dict loop as I could access all elements using the same keys as models.
I am trying to find parameters of RandomForestClassifier using hyperopt. Here is my code:
X, y = load_wine(return_X_y=True)
# Objective for fmin: removes the data, CV splitter and scoring name from
# `params`, builds a RandomForestClassifier from whatever is left, and returns
# the negated mean cross-validation score (so that minimising the return value
# maximises the score).
def rf_neg_score(params):
X, y = params.pop('X'), params.pop('y')
cv = params.pop('cv')
scoring = params.pop('scoring')
rf_clf = RandomForestClassifier(**params)
score = cross_val_score(rf_clf, X=X, y=y, n_jobs=-1, scoring=scoring,
cv=cv).mean()
return -score
# The search space mixes hp.choice expressions with plain constants.
rf_search_space = {
'n_estimators': hp.choice('n_estimators', np.arange(10, 1000, dtype=int)),
'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(2, 65, dtype=int)),
'n_jobs': -1,
# NOTE(review): the data arrays are embedded in the space as constants. The
# traceback quoted after this snippet fails on `node.obj == lit` with an
# "ambiguous truth value of an array" error, which is consistent with hyperopt
# comparing these arrays while it processes the space.
'X': X,
'y': y,
'cv': StratifiedKFold(n_splits=5),
'scoring': 'f1_micro'
}
rf_best_params = fmin(fn=rf_neg_score, space=rf_search_space, max_evals=100,
algo=tpe.suggest)
After I run ValueError is raised at once:
/usr/local/lib/python3.6/dist-packages/hyperopt/utils.py in use_obj_for_literal_in_memo(expr, obj, lit, memo)
167 for node in pyll.dfs(expr):
168 try:
--> 169 if node.obj == lit:
170 memo[node] = obj
171 except AttributeError:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
What do you think what I am doing wrong?
Found a solution. It seems hyperopt checks every item in the search space for hyperopt.hp.* expressions, and the ValueError is raised during that check. So there is no way to provide the data that way. Here is the corrected code:
def rf_neg_score(params):
    """Return the negated mean CV score of a RandomForestClassifier.

    Parameters
    ----------
    params : dict
        One sampled point of the search space. The entries 'scoring' and
        'cv' configure the cross-validation; every other entry is passed to
        ``RandomForestClassifier``.

    Returns
    -------
    The mean cross-validation score multiplied by -1, so that a minimiser
    maximises the score.

    Notes
    -----
    ``X`` and ``y`` are read from module scope rather than from *params*.
    """
    # Work on a copy so the caller's dict keeps its 'scoring' and 'cv' entries.
    settings = dict(params)
    scoring = settings.pop('scoring')
    cv = settings.pop('cv')
    rf_clf = RandomForestClassifier(**settings)
    # Use the values taken from the search space; they were previously popped
    # and then replaced by hard-coded 'f1_micro' and 5.
    score = cross_val_score(rf_clf, X=X, y=y, n_jobs=-1,
                            scoring=scoring, cv=cv).mean()
    return -score
# Search space: three sampled forest parameters plus two fixed entries that
# the objective removes before building the classifier. No data arrays are
# placed in the space.
rf_search_space = dict(
    n_estimators=hp.choice('n_estimators', np.arange(10, 1000, dtype=int)),
    max_depth=hp.choice('max_depth', np.arange(2, 8, dtype=int)),
    max_leaf_nodes=hp.choice('max_leaf_nodes', np.arange(2, 65, dtype=int)),
    scoring='f1_micro',
    cv=StratifiedKFold(n_splits=5),
)

# Minimise the objective with TPE for 100 evaluations.
rf_best_params = fmin(
    fn=rf_neg_score,
    space=rf_search_space,
    algo=tpe.suggest,
    max_evals=100,
)
Following up from my question yesterday (How to speed up nested cross validation in python?), I am trying to run models in parallel. With my code, is it possible to check whether these models, each requiring only 1 core, could run on separate cores in parallel, or will this code always use a single core and run the models one at a time?
I have access to higher powered computing where I could ask for 6 cores, a core per model, however, I am not sure with my code whether the models would actually be assigned their own core and run in parallel. If that makes sense (apologies if this is completely misinformed, as I am trying to learn as I go, any help would be appreciated). With n_jobs=1 for each of my models in their gridsearch can I further specifiy for each to have their own core?
Here is my code:
# Load the table, drop the identifier column, and keep the first 24 columns
# as features with missing values replaced by 0.
dataset = pd.read_csv('data.csv')
# The axis is passed by keyword: the positional form drop(labels, 1) is
# deprecated and rejected by recent pandas releases.
data = dataset.drop(["gene"], axis=1)
df = data.iloc[:, 0:24].fillna(0)

# Scale every feature to the [0, 1] range.
X = MinMaxScaler().fit_transform(df)

le = preprocessing.LabelEncoder()
encoded_value = le.fit_transform(["certain", "likely", "possible", "unlikely"])
# The encoder is refit here, so the class mapping comes from the column itself.
Y = le.fit_transform(data["category"])

# NOTE(review): resampling is applied to the full data set before any
# train/test or cross-validation split; presumably synthetic samples can then
# end up in validation folds. Confirm this is intended.
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_resample(X, Y)

seed = 7
# --- Logistic regression -----------------------------------------------------
logreg = LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto')
LR_par = dict(penalty=['l1'], C=[0.5, 1, 5, 10], max_iter=[500, 1000, 5000])

# --- Random forest -------------------------------------------------------------
rfc = RandomForestClassifier()
param_grid = dict(
    bootstrap=[True, False],
    max_depth=[*range(10, 101, 10), None],
    max_features=['auto', 'sqrt'],
    min_samples_leaf=[1, 2, 4, 25],
    min_samples_split=[2, 5, 10, 25],
    n_estimators=list(range(200, 2001, 200)),
)

# --- Multi-layer perceptron ----------------------------------------------------
mlp = MLPClassifier(random_state=seed)
parameter_space = dict(
    hidden_layer_sizes=[(10, 20), (10, 20, 10), (50,)],
    activation=['tanh', 'relu'],
    solver=['adam', 'sgd'],
    max_iter=[10000],
    alpha=[0.1, 0.01, 0.001],
    learning_rate=['constant', 'adaptive'],
)

# --- Gradient boosting ---------------------------------------------------------
gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25)
param = dict(
    loss=["deviance"],
    learning_rate=[0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
    min_samples_split=[2, 5, 10, 25],
    min_samples_leaf=[1, 2, 4, 25],
    max_depth=[*range(10, 101, 10), None],
    max_features=['auto', 'sqrt'],
    criterion=["friedman_mse"],
    n_estimators=list(range(200, 2001, 200)),
)

# --- Support vector machine ----------------------------------------------------
svm = SVC(gamma="scale", probability=True)
tuned_parameters = dict(kernel=('linear', 'rbf'), C=(1, 0.25, 0.5, 0.75))
def baseline_model(optimizer='adam', learn_rate=0.01):
    """Build and compile the feed-forward network used by the Keras wrapper.

    The network has a 100-unit ReLU layer sized to the columns of the
    module-level ``X_res``, 50% dropout, a 50-unit ReLU layer, and a 4-way
    softmax output. It is compiled with categorical cross-entropy and the
    given *optimizer*.

    NOTE(review): ``learn_rate`` is accepted but never used in the body, so
    searching over it presumably has no effect on the model. Confirm.
    """
    stack = [
        Dense(100, input_dim=X_res.shape[1], activation='relu'),
        Dropout(0.5),
        Dense(50, activation='relu'),  # hidden layer with 50 units
        Dense(4, activation='softmax'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model
# Keras model wrapped as a scikit-learn estimator, plus its search grid.
keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100, verbose=0)
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
kerasparams = {'optimizer': optimizer, 'learn_rate': learn_rate}

# Inner folds tune hyper-parameters; outer folds score the tuned search.
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)

# (label, estimator, grid) triples; each becomes one single-job grid search.
_searches = [
    ('GBM', gbm, param),
    ('RFC', rfc, param_grid),
    ('LR', logreg, LR_par),
    ('SVM', svm, tuned_parameters),
    ('MLP', mlp, parameter_space),
    ('Keras', keras, kerasparams),
]
models = [
    (label, GridSearchCV(estimator, grid, cv=inner_cv, iid=False, n_jobs=1))
    for label, estimator, grid in _searches
]

results = []
names = []
scoring = 'accuracy'
X_train, X_test, Y_train, Y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)

# The searches are processed one after another in this loop.
for name, model in models:
    # Nested CV estimate on the full resampled data.
    nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    results.append(nested_cv_results)
    names.append(name)
    mean_pct = nested_cv_results.mean()*100
    std_pct = nested_cv_results.std()*100
    msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, mean_pct, std_pct)
    print(msg)

    # Refit on the training split and report hold-out accuracy and the winner.
    model.fit(X_train, Y_train)
    test_pct = model.score(X_test, Y_test)*100
    print('Test set accuracy: {:.2f}'.format(test_pct), '%')
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best CV Score: \n{}\n".format(model.best_score_))
Edit: I have now tried my for loop as:
# First multiprocessing attempt: the former loop body was moved into a function
# that is mapped over `models` by a process pool.
# NOTE(review): the parameter is called `models`, but the body only reads
# `model` and `name`. Neither is defined in this function, so they resolve to
# module-level names (if any) rather than to the item pool.map passes in.
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
# NOTE(review): results/names are module-level lists; appends made inside pool
# workers presumably do not reach the parent process. Confirm.
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
print(msg)
model.fit(X_train, Y_train)
print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
#print("Best Estimator: \n{}\n".format(model.best_estimator_))
print("Best Parameters: \n{}\n".format(model.best_params_))
print("Best CV Score: \n{}\n".format(model.best_score_))
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
# run_models is called once per element of `models` (the (name, model) pairs).
pool.map(run_models, models)
However, this seems to run indefinitely, with no errors but also no output.
Edit: after changing things so that multiprocessing is not run from IDLE, I have tried:
# Second multiprocessing attempt: the function now returns its summary instead
# of printing it.
# NOTE(review): as in the previous attempt, the body reads `model` and `name`
# instead of the `models` parameter.
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
# NOTE(review): there is no model.fit(...) call in this version, so
# best_params_ is presumably not set on `model` at this point. Confirm.
return msg, model.best_params_
for name, model in models:
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
# `model` is a single GridSearchCV here (the second item of each pair), but
# pool.map needs an iterable of work items as its second argument. This is the
# "'GridSearchCV' object is not iterable" TypeError quoted after this snippet.
pool.map(run_models, model)
However this gives the error:
TypeError: 'GridSearchCV' object is not iterable