ValueError: Invalid parameter when fiting gridsearchcv

ValueError: Invalid parameter when fiting gridsearchcv - python

This is my code
param_grid = [{'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]},
{'svc__C': [0.1, 1, 10, 100], 'svc__gamma': [0.001, 0.01, 0.1, 1, 10]},
{'randomforestclassifier__n_estimators': [50, 100, 200, 300 ,400]},
{'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]},
{'adaboostclassifier__n_estimators': [50, 100, 200, 300 ,400]},
{'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}
]
inner_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
outer_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
models = [knn, svc, forest, dtc, ada, bag]
model_names = ['knn', 'svc','forest', 'dtc', 'ada', 'bag']
for m, mname in zip(models, model_names):
pipe = make_pipeline(VarianceThreshold(threshold=1),
MinMaxScaler(),
SelectKBest(f_classif, k=20),
m)
grid = GridSearchCV(pipe, param_grid=param_grid, cv=inner_cv)
grid.fit(X_train_test, y_train_test)
nested_score = cross_val_score(grid, X=X_train_test, y=y_train_test.values.ravel(), cv=outer_cv)
print(mname)
print(grid.best_params_)
print(grid.best_score_)
print('\n')
This is the error:
ValueError: Invalid parameter svc for estimator Pipeline(memory=None,
steps=[('variancethreshold', VarianceThreshold(threshold=1)),
('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
('selectkbest',
SelectKBest(k=20,
score_func=<function f_classif at 0x0000019E0A485AF8>)),
('kneighborsclassifier',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=5, p=2,
weights='uniform'))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I dont know whats wrong. I copied the parameter and model names from the named_steps of the pipeline. If I run it without a parameter grid it works so the problem is most likely there.

Seems to work like this, but I dont like it.
pg1 = {'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]}
pg2 = {'svc__C': [0.1, 1, 10, 100],
'svc__gamma': [0.001, 0.01, 0.1, 1, 10]}
pg3 = {'randomforestclassifier__n_estimators': [50, 100, 200, 300 ,400]}
pg4 = {'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]}
pg5 = {'adaboostclassifier__n_estimators': [50, 100, 200, 300 ,400]}
pg6 = {'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}
param_grid_list = [pg1, pg2, pg3, pg4, pg5, pg6]
And then the loop becomes:
for m, p, mname in zip(models, param_grid_list, model_names):
pipe = make_pipeline(VarianceThreshold(threshold=1),
MinMaxScaler(),
SelectKBest(f_classif, k=20),
m)
grid = GridSearchCV(pipe, param_grid=p, cv=inner_cv)
grid.fit(X_train_test, y_train_test)
nested_score = cross_val_score(grid, X=X_train_test, y=y_train_test.values.ravel(), cv=outer_cv)
print(mname)
print(grid.best_params_)
print(grid.best_score_)
print('\n')

Related

Optuna score vs Cross_val_score?

A accuracy score from optuna and a score in cross_val_score were different. Why does it occuer and which score should I choose?
I used the hyperparameters that I got in optuna in cross_val_score.
def objective_lgb(trial):
num_leaves = trial.suggest_int("num_leaves", 2, 1000)
max_depth = trial.suggest_int("max_depth", 2, 100)
learning_rate = trial.suggest_float('learning_rate', 0.001, 1)
n_estimators = trial.suggest_int('n_estimators', 100, 2000)
min_child_samples = trial.suggest_int('min_child_samples', 3, 1000)
subsample = trial.suggest_float('subsample', 0.000001, 1)
colsample_bytree = trial.suggest_float('colsample_bytree', 0.00000001, 1)
reg_alpha = trial.suggest_float('reg_alpha', 0, 400)
reg_lambda = trial.suggest_float("reg_lambda", 0, 400)
importance_type = trial.suggest_categorical('importance_type', ["split", "gain"])
lgb_clf = lgb.LGBMClassifier(random_state=1,
objective="multiclass",
num_class = 3,
importance_type=importance_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
min_child_samples=min_child_samples,
subsample=subsample,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda
)
score = cross_val_score(lgb_clf, train_x, train_y, n_jobs=-1, cv=KFold(n_splits=10, shuffle=True, random_state=1), scoring='accuracy')
mean_score = score.mean()
return mean_score
lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(objective_lgb, n_trials=1500)
lgb_trial = lgb_study.best_trial
print("accuracy:", lgb_trial.value)
print()
print("Best params:", lgb_trial.params)
=========================================================
def light_check(x,params):
model = lgb.LGBMClassifier()
scores = cross_val_score(model,x,y,cv=KFold(n_splits=10, shuffle=True, random_state=1),n_jobs=-1)
mean = scores.mean()
return scores, mean
light_check(x,{'num_leaves': 230, 'max_depth': 53, 'learning_rate': 0.04037430031226232, 'n_estimators': 1143, 'min_child_samples': 381, 'subsample': 0.12985990464862135, 'colsample_bytree': 0.8914118949904919, 'reg_alpha': 31.869348047391053, 'reg_lambda': 17.45653692887209, 'importance_type': 'split'})

From what I can see, you are using X_train, y_train in the optuna call, while in light_check you are passing x and y. Assuming you did a spilt in some unknown code, the data set for optuna is smaller and you get a different number.

Function to generate optuna grids provided an sklearn pipeline

I am using sklearn along with optuna for HPO. I would like to create a custom function that would take an sklearn pipeline as input and return optuna-specifc grids. Returning sklearn specific param grids (i.e. dictionaries) seems to be more straight-forward (duh) ; this is what I 've got so far :
def grid_from_estimator(estimator, type = 'sklearn'):
estimator_name = estimator.named_steps['estimator'].__class__.__name__
if type == 'sklearn':
if estimator_name=='LogisticRegression':
params = {
'estimator__penalty': ['l1','elasticnet'],
'estimator__C': np.logspace(-4, 4, 20)
}
elif estimator_name=='LGBMClassifier':
params = {
'estimator__n_estimators': np.arange(100, 1000, 200),
'estimator__boosting_type':['gbdt','dart'],
'estimator__max_depth': np.arange(6, 12),
'estimator__num_leaves': np.arange(30, 150,5),
'estimator__learning_rate': [1e-2/2 , 1e-2, 1e-1/2, 1e-1, 0.5, 1],
'estimator__min_child_samples': np.arange(20, 100, 5),
'estimator__subsample': np.arange(0.65, 1, 0.05),
'estimator__colsample_bytree': np.arange(0.4, 0.75, 0.05),
'estimator__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
'estimator__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
'estimator__iterations': np.arange(100, 800, 100),
'estimator__objective': 'binary'
}
elif type == 'optuna':
if estimator_name == 'LogisticRegression':
params = {
'estimator__penalty': trial.suggest_categorical('penalty', ['l1', 'elasticnet']),
'estimator__C': trial.suggest.suggest_loguniform('c', -4, 4)
}
elif estimator_name == 'LGBMClassifier':
params = {
'estimator__n_estimators': trial.suggest_int('n_estimators', 100, 1000),
'estimator__boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
'estimator__max_depth': trial.suggest_int('max_depth', 6, 12),
'estimator__num_leaves': trial.suggest_int('num_leaves', 30, 150, 5),
'estimator__learning_rate': trial.suggest_float('learning_rate', 1e-4, 1),
'estimator__min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
'estimator__subsample': trial.suggest_float('subsample', 0.5, 1),
'estimator__colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.75),
'estimator__reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10),
'estimator__reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10)
}
return params
The "trial.suggest_..." parts keeps 'complaining' and returning an error; although I understand the reason why, I can't see any way around it. Is this even possible? Any ideas?
Appreciate your support!

I think, something along this should work,
def grid_from_estimator(estimator, trial, type = 'sklearn'):
pass
def your_objective_function(trial):
params = grid_from_estimator('LogisticRegression', trial, 'optuna')
#Rest of the code here.
def tune_model():
study = optuna.create_study()
study.optimize(your_objective_function, n_trials=20)
tune_model()

An example method using optuna ask and tell interface.
Code
import optuna
import numpy as np
def optuna_objective(estimator_name, params):
if estimator_name == 'LogisticRegression':
x = params['x']
y = params['y']
return (x - 2) ** 2 + y
if estimator_name == 'LGBMClassifier':
# estimator__n_estimators = params['estimator__n_estimators']
# return accuracy
pass
return None
def grid_from_estimator(estimator_name, type_='sklearn', study=None):
params, trial = None, None
if type_ == 'sklearn':
if estimator_name == 'LogisticRegression':
params = {
'estimator__penalty': ['l1','elasticnet'],
'estimator__C': np.logspace(-4, 4, 20)
}
elif estimator_name == 'LGBMClassifier':
params = {
'estimator__n_estimators': np.arange(100, 1000, 200),
'estimator__boosting_type':['gbdt','dart'],
'estimator__max_depth': np.arange(6, 12),
'estimator__num_leaves': np.arange(30, 150,5),
'estimator__learning_rate': [1e-2/2 , 1e-2, 1e-1/2, 1e-1, 0.5, 1],
'estimator__min_child_samples': np.arange(20, 100, 5),
'estimator__subsample': np.arange(0.65, 1, 0.05),
'estimator__colsample_bytree': np.arange(0.4, 0.75, 0.05),
'estimator__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
'estimator__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
'estimator__iterations': np.arange(100, 800, 100),
'estimator__objective': 'binary'
}
elif type_ == 'optuna':
trial = study.ask()
if estimator_name == 'LogisticRegression':
params = {
'x': trial.suggest_float('x', -10, 10),
'y': trial.suggest_float('y', -10, 10)
}
# params = {
# 'estimator__penalty': trial.suggest_categorical('estimator__penalty', ['l1', 'elasticnet']),
# 'estimator__C': trial.suggest_float('estimator__C', -4, 4)
# }
elif estimator_name == 'LGBMClassifier':
params = {
'estimator__n_estimators': trial.suggest_int('estimator__n_estimators', 100, 1000),
'estimator__boosting_type': trial.suggest_categorical('estimator__boosting_type', ['gbdt', 'dart']),
'estimator__max_depth': trial.suggest_int('estimator__max_depth', 6, 12),
'estimator__num_leaves': trial.suggest_int('estimator__num_leaves', 30, 150, 5),
'estimator__learning_rate': trial.suggest_float('estimator__learning_rate', 1e-4, 1),
'estimator__min_child_samples': trial.suggest_int('estimator__min_child_samples', 20, 100),
'estimator__subsample': trial.suggest_float('estimator__subsample', 0.5, 1),
'estimator__colsample_bytree': trial.suggest_float('estimator__colsample_bytree', 0.4, 0.75),
'estimator__reg_alpha': trial.suggest_float('estimator__reg_alpha', 1e-2, 10),
'estimator__reg_lambda': trial.suggest_float('estimator__reg_lambda', 1e-2, 10)
}
return params, trial
# (1) sklearn example
print('SKLEARN')
estimator_name = 'LogisticRegression'
optimizer_type = 'sklearn'
params, _ = grid_from_estimator(estimator_name, type_=optimizer_type)
print(params)
print()
# (2) Optuna example with ask and tell interface.
print('OPTUNA')
study = optuna.create_study(direction='maximize')
n_trials = 10
estimator_name = 'LogisticRegression'
optimizer_type = 'optuna'
for _ in range(n_trials):
params, trial = grid_from_estimator(estimator_name, type_=optimizer_type, study=study)
objective_value = optuna_objective(estimator_name, params)
study.tell(trial, objective_value) # tell the pair of trial and objective value
print(f'trialnum: {trial.number}, params: {params}, value: {objective_value}')
best_params = study.best_params
best_x = best_params["x"]
best_y = best_params["y"]
best_value = study.best_value
best_trial_num = study.best_trial.number
print(f"best x: {best_x}, best y: {best_y}, (x - 2)^2 + y: {(best_x - 2) ** 2 + best_y}, best_value: {best_value}, best_trial_num: {best_trial_num}") # trial num starts at 0
Output
SKLEARN
{'estimator__penalty': ['l1', 'elasticnet'], 'estimator__C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])}
OPTUNA
[I 2021-11-25 19:03:09,673] A new study created in memory with name: no-name-f5046b21-f579-4c74-8046-79420c256d4a
trialnum: 0, params: {'x': 2.905894660287128, 'y': -4.537699327718261}, value: -3.7170541921815303
trialnum: 1, params: {'x': -9.275103438355583, 'y': -5.925000918692578}, value: 121.2029566269253
trialnum: 2, params: {'x': -2.9531168045205103, 'y': 5.253730464314739}, value: 29.78709654353821
trialnum: 3, params: {'x': 3.766902399344163, 'y': 3.778408673279479}, value: 6.900352762087639
trialnum: 4, params: {'x': -0.897563829823584, 'y': -0.887774211794973}, value: 7.508101936106943
trialnum: 5, params: {'x': -2.2256917634354645, 'y': 3.8017184220598903}, value: 21.658189301626216
trialnum: 6, params: {'x': -6.333366980619912, 'y': 9.87067058585388}, value: 79.3156758195401
trialnum: 7, params: {'x': 2.570258991787558, 'y': -0.1959178948625162}, value: 0.1292774228520457
trialnum: 8, params: {'x': 2.94430596072913, 'y': 4.318454050149043}, value: 5.210167797617609
trialnum: 9, params: {'x': 5.972023459737699, 'y': 4.165369460555215}, value: 19.942339825261854
best x: -9.275103438355583, best y: -5.925000918692578, (x - 2)^2 + y: 121.2029566269253, best_value: 121.2029566269253, best_trial_num: 1

how to use gridsearch and cross validation with differents parameters models?

I write a classification algorythm and I use 6 differents models. I want to improve the model by using finetuning parameters for each model. The problem I am encounering is related to my "for loop". In fact, I loop in three different dictionnaries but the correspondance between the model I use in the gridsearch and the parameters are not keep since the dictionnaries are not ordered;
I seem to failed to find another solution :
here my code and the result as you can see the model_name is different from the param_name so I get multiple erros like for example (ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
)
below the code
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3, 'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
'max_depth': [3, 5, 15, 25]
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]
}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
'loss': ['hinge', 'hinge_squarred'],
'penalty' : ['l1', 'l2']
}
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = { 'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc, 'Model_G_NB': parameter_NB, 'Model_LR': parameter_LR, 'Model_RF': parameter_LR, 'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
for feature_name, feature in features.items():
for model_name, model in models.items():
for param_name, parameter in parameters_dict.items():
clf = GridSearchCV(estimator=model, param_grid=parameter, cv=cv_splitter, verbose = 1, n_jobs = -1, return_train_score=True)
best_model = clf.fit(feature, ylabels)
output : as you can see sometimes it works but other times param and model are not the same which causes the error
[5 rows x 7 columns]
Feature: vecteur_CV
Model: Model_SVC
Param: Model_SVC
Fitting 10 folds for each of 6 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 out of 60 | elapsed: 2.8s remaining: 0.1s
/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done 60 out of 60 | elapsed: 2.8s finished
Feature: vecteur_CV
Model: Model_SVC
Param: Model_G_NB
Fitting 10 folds for each of 24 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 503, in _fit_and_score
estimator.set_params(**parameters)
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 224, in set_params
(key, self))
ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
Features look like this
`X_data, X_data_0, X_data_1, X_data_2 = features_fusion(verbatim, first_arg)
features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0, 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}

I used 2 slightly different versions of the Iris dataset and this code below runs (albeit with many warnings during training):
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
data = load_iris()
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3,
'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'max_depth': [2,3, 5, 15, 25],
'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
#'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
#'loss': ['hinge', 'hinge_squarred'],
#'penalty' : ['l1', 'l2']
}
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = {'hidden_layer_sizes':[(50,50,50),(50,100,50),(100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc,
'Model_G_NB': parameter_NB,
'Model_LR': parameter_LR, 'Model_RF': parameter_RF,
'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
#features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0,
# 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
features = {'iris': data['data'],'iris_sub':data['data'][:,1:]}
ylabels = data['target']
for feature_name, feature in features.items():
#print(feature_name, feature)
for model_name in models:
print('Training model: ', model_name)
clf = GridSearchCV(estimator=models[model_name],
param_grid=parameters_dict[model_name],
cv=cv_splitter, verbose = 1, n_jobs = -1,
return_train_score=True)
best_model = clf.fit(feature, ylabels)
#for feature_name, feature in features.items():
# for model_name, model in models.items():
# for param_name, parameter in parameters_dict.items():
# print(model_name,model,param_name,parameter)
# clf = GridSearchCV(estimator=model, param_grid=parameter,
# cv=cv_splitter, verbose = 1, n_jobs = -1,
# return_train_score=True)
# best_model = clf.fit(feature, ylabels)
I had to comment some model parameters as they gave errors. There was also a typo in your snippet above 'Model_RF': parameter_LR should be 'Model_RF': parameter_RF. I'm not sure if that was the reason of your error. I also removed the inner parameters_dict loop as I could access all elements using the same keys as models.

ValueError of hyperopt in searching parameters of RandomForest

I am trying to find parameters of RandomForestClassifier using hyperopt. Here is my code:
X, y = load_wine(return_X_y=True)
def rf_neg_score(params):
X, y = params.pop('X'), params.pop('y')
cv = params.pop('cv')
scoring = params.pop('scoring')
rf_clf = RandomForestClassifier(**params)
score = cross_val_score(rf_clf, X=X, y=y, n_jobs=-1, scoring=scoring,
cv=cv).mean()
return -score
rf_search_space = {
'n_estimators': hp.choice('n_estimators', np.arange(10, 1000, dtype=int)),
'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(2, 65, dtype=int)),
'n_jobs': -1,
'X': X,
'y': y,
'cv': StratifiedKFold(n_splits=5),
'scoring': 'f1_micro'
}
rf_best_params = fmin(fn=rf_neg_score, space=rf_search_space, max_evals=100,
algo=tpe.suggest)
After I run ValueError is raised at once:
/usr/local/lib/python3.6/dist-packages/hyperopt/utils.py in use_obj_for_literal_in_memo(expr, obj, lit, memo)
167 for node in pyll.dfs(expr):
168 try:
--> 169 if node.obj == lit:
170 memo[node] = obj
171 except AttributeError:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
What do you think what I am doing wrong?

Found decision. It seems hyperopt checks every item in search space if it has hyperopt.hp.* function and while checking ValueError is raised. So there`s no opportunity to provide data with that way. Here is right code:
def rf_neg_score(params):
scoring = params.pop('scoring')
cv = params.pop('cv')
rf_clf = RandomForestClassifier(**params)
# X and y are provided out of function
score = cross_val_score(rf_clf, X=X, y=y, n_jobs=-1,
scoring='f1_micro', cv=5).mean()
return -score
rf_search_space = {
'n_estimators': hp.choice('n_estimators', np.arange(10, 1000, dtype=int)),
'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(2, 65, dtype=int)),
'scoring': 'f1_micro',
'cv': StratifiedKFold(n_splits=5)
}
rf_best_params = fmin(fn=rf_neg_score, space=rf_search_space, max_evals=100,
algo=tpe.suggest)

How to get nested cross validation to run on multiple cores?

Following up from my question yesterday (How to speed up nested cross validation in python?) I am trying to run models in parallel processing, however with my code is it possible to check if these models, each requiring 1 core only to run on, could run on cores in parallel or whether with this code it will always be 1 core only taking on the models 1 at a time?
I have access to higher powered computing where I could ask for 6 cores, a core per model, however, I am not sure with my code whether the models would actually be assigned their own core and run in parallel. If that makes sense (apologies if this is completely misinformed, as I am trying to learn as I go, any help would be appreciated). With n_jobs=1 for each of my models in their gridsearch can I further specifiy for each to have their own core?
Here is my code:
dataset= pd.read_csv('data.csv')
data = dataset.drop(["gene"],1)
df = data.iloc[:,0:24]
df = df.fillna(0)
X = MinMaxScaler().fit_transform(df)
le = preprocessing.LabelEncoder()
encoded_value = le.fit_transform(["certain", "likely", "possible", "unlikely"])
Y = le.fit_transform(data["category"])
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_resample(X, Y)
seed = 7
logreg = LogisticRegression(penalty='l1', solver='liblinear',multi_class='auto')
LR_par= {'penalty':['l1'], 'C': [0.5, 1, 5, 10], 'max_iter':[500, 1000, 5000]}
rfc =RandomForestClassifier()
param_grid = {'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4,25],
'min_samples_split': [2, 5, 10, 25],
'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
mlp = MLPClassifier(random_state=seed)
parameter_space = {'hidden_layer_sizes': [(10,20), (10,20,10), (50,)],
'activation': ['tanh', 'relu'],
'solver': ['adam', 'sgd'],
'max_iter': [10000],
'alpha': [0.1, 0.01, 0.001],
'learning_rate': ['constant','adaptive']}
gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25)
param = {"loss":["deviance"],
"learning_rate": [0.15,0.1,0.05,0.01,0.005,0.001],
"min_samples_split": [2, 5, 10, 25],
"min_samples_leaf": [1, 2, 4,25],
"max_depth":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
"max_features":['auto', 'sqrt'],
"criterion": ["friedman_mse"],
"n_estimators":[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}
svm = SVC(gamma="scale", probability=True)
tuned_parameters = {'kernel':('linear', 'rbf'), 'C':(1,0.25,0.5,0.75)}
def baseline_model(optimizer='adam', learn_rate=0.01):
model = Sequential()
model.add(Dense(100, input_dim=X_res.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu')) #8 is the dim/ the number of hidden units (units are the kernel)
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
return model
keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100, verbose=0)
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
kerasparams = dict(optimizer=optimizer, learn_rate=learn_rate)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
models = []
models.append(('GBM', GridSearchCV(gbm, param, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('RFC', GridSearchCV(rfc, param_grid, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('LR', GridSearchCV(logreg, LR_par, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('SVM', GridSearchCV(svm, tuned_parameters, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('MLP', GridSearchCV(mlp, parameter_space, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('Keras', GridSearchCV(estimator=keras, param_grid=kerasparams, cv=inner_cv,iid=False, n_jobs=1)))
results = []
names = []
scoring = 'accuracy'
X_train, X_test, Y_train, Y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)
for name, model in models:
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
print(msg)
model.fit(X_train, Y_train)
print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
print("Best Parameters: \n{}\n".format(model.best_params_))
print("Best CV Score: \n{}\n".format(model.best_score_))
Edit: I have now tried my for loop as:
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
print(msg)
model.fit(X_train, Y_train)
print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
#print("Best Estimator: \n{}\n".format(model.best_estimator_))
print("Best Parameters: \n{}\n".format(model.best_params_))
print("Best CV Score: \n{}\n".format(model.best_score_))
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
pool.map(run_models, models)
However this seems to run indefinitely with no errors but no output
Edit: on trying to changing multiprocessing to not be IDLE I have tried:
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
return msg, model.best_params_
for name, model in models:
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
pool.map(run_models, model)
However this gives the error:
TypeError: 'GridSearchCV' object is not iterable

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

ValueError: Invalid parameter when fiting gridsearchcv - python

Related

Optuna score vs Cross_val_score?

Function to generate optuna grids provided an sklearn pipeline

how to use gridsearch and cross validation with differents parameters models?

ValueError of hyperopt in searching parameters of RandomForest

How to get nested cross validation to run on multiple cores?

Categories

Resources