How to use grid search and cross validation with different parameter grids per model? - python

I wrote a classification algorithm that uses 6 different models, and I want to improve each model by fine-tuning its parameters. The problem I am encountering is in my for loop: I loop over three different dictionaries, but the correspondence between the model passed to the grid search and its parameter grid is not kept, because the inner loop iterates over the parameter dictionary independently of the model. I have failed to find another solution.
Here are my code and the output. As you can see, the model_name differs from the param_name, so I get multiple errors, for example:
ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, ...)
Below is the code:
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)

models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3,
          'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}

# list of parameters
parameter_RF = {'min_samples_split': [3, 5, 10],
                'criterion': ['gini', 'entropy'],
                'n_estimators': [100, 300],
                'max_features': ['auto', 'sqrt', 'log2'],
                'bootstrap': ['True', 'False'],
                'max_depth': [3, 5, 15, 25]
                }
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'warn'],
                'dual': ['True', 'False'],
                'max_iter': [100, 110, 120, 130, 140]
                }
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
                'loss': ['hinge', 'hinge_squarred'],
                'penalty': ['l1', 'l2']
                }
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = {'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
                 'activation': ['tanh', 'relu'],
                 'solver': ['sgd', 'adam'],
                 'alpha': [0.0001, 0.05],
                 'learning_rate': ['constant', 'adaptive'],
                 'max_iter': [100, 200, 300]
                 }
parameters_dict = {'Model_SVC': parameter_LinearSvc, 'Model_G_NB': parameter_NB,
                   'Model_LR': parameter_LR, 'Model_RF': parameter_LR,
                   'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}

cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)

for feature_name, feature in features.items():
    for model_name, model in models.items():
        for param_name, parameter in parameters_dict.items():
            print('Feature:', feature_name)
            print('Model:', model_name)
            print('Param:', param_name)
            clf = GridSearchCV(estimator=model, param_grid=parameter,
                               cv=cv_splitter, verbose=1, n_jobs=-1,
                               return_train_score=True)
            best_model = clf.fit(feature, ylabels)
Output: as you can see, sometimes it works, but other times the parameter grid and the model do not match, which causes the error:
[5 rows x 7 columns]
Feature: vecteur_CV
Model: Model_SVC
Param: Model_SVC
Fitting 10 folds for each of 6 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 out of 60 | elapsed: 2.8s remaining: 0.1s
/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done 60 out of 60 | elapsed: 2.8s finished
Feature: vecteur_CV
Model: Model_SVC
Param: Model_G_NB
Fitting 10 folds for each of 24 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 503, in _fit_and_score
estimator.set_params(**parameters)
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 224, in set_params
(key, self))
ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
The features look like this:
X_data, X_data_0, X_data_1, X_data_2 = features_fusion(verbatim, first_arg)
features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0, 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}

I used 2 slightly different versions of the Iris dataset, and the code below runs (albeit with many warnings during training):
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

data = load_iris()

model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)

models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3,
          'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}

# list of parameters
parameter_RF = {'max_depth': [2, 3, 5, 15, 25],
                'min_samples_split': [3, 5, 10],
                'criterion': ['gini', 'entropy'],
                'n_estimators': [100, 300],
                'max_features': ['auto', 'sqrt', 'log2'],
                'bootstrap': ['True', 'False'],
                }
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'warn'],
                #'dual': ['True', 'False'],
                'max_iter': [100, 110, 120, 130, 140]
                }
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
                #'loss': ['hinge', 'hinge_squarred'],
                #'penalty': ['l1', 'l2']
                }
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = {'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
                 'activation': ['tanh', 'relu'],
                 'solver': ['sgd', 'adam'],
                 'alpha': [0.0001, 0.05],
                 'learning_rate': ['constant', 'adaptive'],
                 'max_iter': [100, 200, 300]
                 }
parameters_dict = {'Model_SVC': parameter_LinearSvc,
                   'Model_G_NB': parameter_NB,
                   'Model_LR': parameter_LR, 'Model_RF': parameter_RF,
                   'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}

cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)

#features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0,
#            'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
features = {'iris': data['data'], 'iris_sub': data['data'][:, 1:]}
ylabels = data['target']

for feature_name, feature in features.items():
    #print(feature_name, feature)
    for model_name in models:
        print('Training model: ', model_name)
        clf = GridSearchCV(estimator=models[model_name],
                           param_grid=parameters_dict[model_name],
                           cv=cv_splitter, verbose=1, n_jobs=-1,
                           return_train_score=True)
        best_model = clf.fit(feature, ylabels)

# original triple loop, kept for reference:
#for feature_name, feature in features.items():
#    for model_name, model in models.items():
#        for param_name, parameter in parameters_dict.items():
#            print(model_name, model, param_name, parameter)
#            clf = GridSearchCV(estimator=model, param_grid=parameter,
#                               cv=cv_splitter, verbose=1, n_jobs=-1,
#                               return_train_score=True)
#            best_model = clf.fit(feature, ylabels)
I had to comment out some model parameters because they raised errors. There was also a typo in your snippet above: 'Model_RF': parameter_LR should be 'Model_RF': parameter_RF. I'm not sure whether that was the cause of your error. I also removed the inner parameters_dict loop, since I could access all its elements with the same keys as models.
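If you prefer a single lookup structure, a variant of the same fix (just a sketch of the idea, using the grids defined above) is to store each estimator together with its grid in one dictionary, so the pairing can never drift apart:
# store estimator and grid together so they can never get out of sync
model_specs = {'Model_SVC': (LinearSVC(), parameter_LinearSvc),
               'Model_G_NB': (MultinomialNB(), parameter_NB),
               'Model_LR': (LogisticRegression(), parameter_LR),
               'Model_RF': (RandomForestClassifier(), parameter_RF),
               'Model_KN': (KNeighborsClassifier(), parameter_KNN),
               'Model_MLP': (MLPClassifier(max_iter=300, random_state=1), parameter_MLP)}

for feature_name, feature in features.items():
    for model_name, (model, grid) in model_specs.items():
        print('Training model: ', model_name)
        clf = GridSearchCV(estimator=model, param_grid=grid,
                           cv=cv_splitter, verbose=1, n_jobs=-1,
                           return_train_score=True)
        best_model = clf.fit(feature, ylabels)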

Related

Currently only multi-regression, multilabel and survival objectives work with multidimensional target

I used bayes_opt to tune the hyperparameters of CatBoostRegressor (from catboost) for regression and got the following error:
CatBoostError: catboost/private/libs/target/data_providers.cpp:603: Currently only multi-regression, multilabel and survival objectives work with multidimensional target
Here is the code:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, r2_score
from catboost import Pool, CatBoostRegressor
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours

def get_data():
    """Prepare the data."""
    # trainx, testx, trainy, testy = train_test_split(XN, YN, test_size=0.2, random_state=31)
    return trainx, testx, trainy, testy

def CBR_cv(iterations, learning_rate, depth, l2_leaf_reg, min_child_samples,
           trainx, testx, trainy, testy):
    train_pool = Pool(trainx, trainy)
    test_pool = Pool(testx)
    model = CatBoostRegressor(iterations=iterations, learning_rate=learning_rate,
                              depth=depth, l2_leaf_reg=l2_leaf_reg,
                              min_child_samples=min_child_samples,
                              loss_function='RMSE')
    # param['learning_rate'] = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.02, 0.001)
    # param['depth'] = trial.suggest_int('depth', 9, 15)
    # param['l2_leaf_reg'] = trial.suggest_discrete_uniform('l2_leaf_reg', 1.0, 5.5, 0.5)
    # param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    # cval = cross_val_score(model, trainx, trainy, scoring='accuracy', cv=4)
    # return cval.mean()
    ## fit the model
    model.fit(train_pool)
    ## evaluate performance on the held-out set
    yhat = model.predict(test_pool)
    score = r2_score(testy, yhat)
    return score

def optimize_XGB(trainx2, testx2, trainy2, testy2):
    """Apply Bayesian optimization to CatBoostRegressor parameters."""
    def CBR_crossval(iterations, learning_rate, depth, l2_leaf_reg, min_child_samples):
        """Wrapper around CBR_cv.
        Notice how iterations, depth and min_child_samples are cast to
        integer before being passed along, and how learning_rate and
        l2_leaf_reg are clipped to their valid ranges.
        """
        return CBR_cv(iterations=int(iterations),
                      learning_rate=max(min(learning_rate, 0.5), 1e-3),
                      depth=int(depth),
                      l2_leaf_reg=max(min(l2_leaf_reg, 5.5), 1.0),
                      min_child_samples=int(min_child_samples),
                      trainx=trainx2, testx=testx2, trainy=trainy2, testy=testy2)

    optimizer = BayesianOptimization(
        f=CBR_crossval,
        pbounds={
            "iterations": (50, 500),
            "depth": (2, 25),
            "learning_rate": (0.01, 0.5),
            "l2_leaf_reg": (1.0, 5.5),
            "min_child_samples": (1, 50),
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=1000)
    print("Final result:", optimizer.max)

if __name__ == "__main__":
    trainx2, testx2, trainy2, testy2 = get_data()
    print(Colours.green("--- Optimizing XGB ---"))
    optimize_XGB(trainx2, testx2, trainy2, testy2)
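A likely cause, judging from the error text alone: CatBoost raises this message when the target handed to Pool is two-dimensional (for example shape (n, 1)) while the loss is a single-target objective such as RMSE. Assuming that is what happens here, flattening the targets before building the pools should avoid it (a sketch):
import numpy as np

# hypothetical fix: RMSE expects a 1-D target, so flatten a (n, 1)
# column vector before constructing the Pool objects
trainy = np.ravel(trainy)
testy = np.ravel(testy)
train_pool = Pool(trainx, trainy)
test_pool = Pool(testx)
Alternatively, a genuinely multi-column target needs a multi-target objective such as loss_function='MultiRMSE'.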

Optuna score vs Cross_val_score?

The accuracy score from Optuna and the score from cross_val_score were different. Why does this occur, and which score should I choose?
I used the hyperparameters I got from Optuna in cross_val_score.
def objective_lgb(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 1000)
    max_depth = trial.suggest_int("max_depth", 2, 100)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 1)
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 1000)
    subsample = trial.suggest_float('subsample', 0.000001, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.00000001, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 400)
    reg_lambda = trial.suggest_float("reg_lambda", 0, 400)
    importance_type = trial.suggest_categorical('importance_type', ["split", "gain"])
    lgb_clf = lgb.LGBMClassifier(random_state=1,
                                 objective="multiclass",
                                 num_class=3,
                                 importance_type=importance_type,
                                 num_leaves=num_leaves,
                                 max_depth=max_depth,
                                 learning_rate=learning_rate,
                                 n_estimators=n_estimators,
                                 min_child_samples=min_child_samples,
                                 subsample=subsample,
                                 colsample_bytree=colsample_bytree,
                                 reg_alpha=reg_alpha,
                                 reg_lambda=reg_lambda
                                 )
    score = cross_val_score(lgb_clf, train_x, train_y, n_jobs=-1,
                            cv=KFold(n_splits=10, shuffle=True, random_state=1),
                            scoring='accuracy')
    mean_score = score.mean()
    return mean_score

lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(objective_lgb, n_trials=1500)
lgb_trial = lgb_study.best_trial
print("accuracy:", lgb_trial.value)
print()
print("Best params:", lgb_trial.params)
=========================================================
def light_check(x, params):
    model = lgb.LGBMClassifier()
    scores = cross_val_score(model, x, y,
                             cv=KFold(n_splits=10, shuffle=True, random_state=1),
                             n_jobs=-1)
    mean = scores.mean()
    return scores, mean

light_check(x, {'num_leaves': 230, 'max_depth': 53, 'learning_rate': 0.04037430031226232,
                'n_estimators': 1143, 'min_child_samples': 381,
                'subsample': 0.12985990464862135, 'colsample_bytree': 0.8914118949904919,
                'reg_alpha': 31.869348047391053, 'reg_lambda': 17.45653692887209,
                'importance_type': 'split'})
From what I can see, you are using train_x and train_y in the Optuna call, while in light_check you are passing x and y. Assuming you did a split somewhere in code that is not shown, the dataset used for Optuna is smaller, so you get a different number.
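There is a second issue visible in the snippet itself: light_check accepts params but never uses it, so lgb.LGBMClassifier() is cross-validated with default hyperparameters, and the two scores would differ even on identical data. A corrected sketch (taking y explicitly instead of relying on a global):
def light_check(x, y, params):
    # unpack the tuned hyperparameters instead of fitting the defaults
    model = lgb.LGBMClassifier(random_state=1, objective="multiclass",
                               num_class=3, **params)
    scores = cross_val_score(model, x, y, n_jobs=-1, scoring='accuracy',
                             cv=KFold(n_splits=10, shuffle=True, random_state=1))
    return scores, scores.mean()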

ValueError: Invalid parameter when fitting GridSearchCV

This is my code:
param_grid = [{'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]},
              {'svc__C': [0.1, 1, 10, 100], 'svc__gamma': [0.001, 0.01, 0.1, 1, 10]},
              {'randomforestclassifier__n_estimators': [50, 100, 200, 300, 400]},
              {'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]},
              {'adaboostclassifier__n_estimators': [50, 100, 200, 300, 400]},
              {'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}]

inner_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
outer_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)

models = [knn, svc, forest, dtc, ada, bag]
model_names = ['knn', 'svc', 'forest', 'dtc', 'ada', 'bag']

for m, mname in zip(models, model_names):
    pipe = make_pipeline(VarianceThreshold(threshold=1),
                         MinMaxScaler(),
                         SelectKBest(f_classif, k=20),
                         m)
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=inner_cv)
    grid.fit(X_train_test, y_train_test)
    nested_score = cross_val_score(grid, X=X_train_test, y=y_train_test.values.ravel(), cv=outer_cv)
    print(mname)
    print(grid.best_params_)
    print(grid.best_score_)
    print('\n')
This is the error:
ValueError: Invalid parameter svc for estimator Pipeline(memory=None,
steps=[('variancethreshold', VarianceThreshold(threshold=1)),
('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
('selectkbest',
SelectKBest(k=20,
score_func=<function f_classif at 0x0000019E0A485AF8>)),
('kneighborsclassifier',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=5, p=2,
weights='uniform'))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I don't know what's wrong. I copied the parameter and model names from the named_steps of the pipeline. If I run it without a parameter grid it works, so the problem is most likely there.
It seems to work like this, but I don't like it:
pg1 = {'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]}
pg2 = {'svc__C': [0.1, 1, 10, 100],
       'svc__gamma': [0.001, 0.01, 0.1, 1, 10]}
pg3 = {'randomforestclassifier__n_estimators': [50, 100, 200, 300, 400]}
pg4 = {'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]}
pg5 = {'adaboostclassifier__n_estimators': [50, 100, 200, 300, 400]}
pg6 = {'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}
param_grid_list = [pg1, pg2, pg3, pg4, pg5, pg6]
And then the loop becomes:
for m, p, mname in zip(models, param_grid_list, model_names):
    pipe = make_pipeline(VarianceThreshold(threshold=1),
                         MinMaxScaler(),
                         SelectKBest(f_classif, k=20),
                         m)
    grid = GridSearchCV(pipe, param_grid=p, cv=inner_cv)
    grid.fit(X_train_test, y_train_test)
    nested_score = cross_val_score(grid, X=X_train_test, y=y_train_test.values.ravel(), cv=outer_cv)
    print(mname)
    print(grid.best_params_)
    print(grid.best_score_)
    print('\n')
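The original error comes from passing the full param_grid list to every pipeline: when param_grid is a list, GridSearchCV tries each dict against the same estimator, and a pipeline built around KNeighborsClassifier has no svc step, hence "Invalid parameter svc". Zipping one grid per model, as above, avoids that. Another option (a sketch, not verified against your data) is to give the classifier step a fixed name so that all grids share one prefix:
from sklearn.pipeline import Pipeline

# one grid per model, all keyed on the fixed step name 'clf'
param_grid_list = [
    {'clf__n_neighbors': [3, 4, 5, 6, 7]},                                   # knn
    {'clf__C': [0.1, 1, 10, 100], 'clf__gamma': [0.001, 0.01, 0.1, 1, 10]},  # svc
    {'clf__n_estimators': [50, 100, 200, 300, 400]},                         # forest
    {'clf__max_depth': [12, 25, 50, 75, 100]},                               # dtc
    {'clf__n_estimators': [50, 100, 200, 300, 400]},                         # ada
    {'clf__n_estimators': [50, 100, 200, 300, 400]},                         # bag
]

for m, p, mname in zip(models, param_grid_list, model_names):
    pipe = Pipeline([('variancethreshold', VarianceThreshold(threshold=1)),
                     ('minmaxscaler', MinMaxScaler()),
                     ('selectkbest', SelectKBest(f_classif, k=20)),
                     ('clf', m)])  # fixed name regardless of the estimator
    grid = GridSearchCV(pipe, param_grid=p, cv=inner_cv)
    grid.fit(X_train_test, y_train_test)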

How to get nested cross validation to run on multiple cores?

Following up on my question from yesterday (How to speed up nested cross validation in python?), I am trying to run my models in parallel. With my code, is it possible to tell whether these models, each of which needs only 1 core, could run on separate cores in parallel, or whether this code will always use 1 core, working through the models one at a time?
I have access to higher-powered computing where I could ask for 6 cores, one per model, but I am not sure my code would actually assign each model its own core and run them in parallel. (Apologies if this is completely misinformed; I am trying to learn as I go, and any help would be appreciated.) With n_jobs=1 for each of my models in their grid search, can I further specify that each one gets its own core?
Here is my code:
dataset = pd.read_csv('data.csv')
data = dataset.drop(["gene"], axis=1)
df = data.iloc[:, 0:24]
df = df.fillna(0)
X = MinMaxScaler().fit_transform(df)

le = preprocessing.LabelEncoder()
encoded_value = le.fit_transform(["certain", "likely", "possible", "unlikely"])
Y = le.fit_transform(data["category"])

sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_resample(X, Y)

seed = 7

logreg = LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto')
LR_par = {'penalty': ['l1'], 'C': [0.5, 1, 5, 10], 'max_iter': [500, 1000, 5000]}

rfc = RandomForestClassifier()
param_grid = {'bootstrap': [True, False],
              'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2, 4, 25],
              'min_samples_split': [2, 5, 10, 25],
              'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

mlp = MLPClassifier(random_state=seed)
parameter_space = {'hidden_layer_sizes': [(10, 20), (10, 20, 10), (50,)],
                   'activation': ['tanh', 'relu'],
                   'solver': ['adam', 'sgd'],
                   'max_iter': [10000],
                   'alpha': [0.1, 0.01, 0.001],
                   'learning_rate': ['constant', 'adaptive']}

gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25)
param = {"loss": ["deviance"],
         "learning_rate": [0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
         "min_samples_split": [2, 5, 10, 25],
         "min_samples_leaf": [1, 2, 4, 25],
         "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
         "max_features": ['auto', 'sqrt'],
         "criterion": ["friedman_mse"],
         "n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
         }

svm = SVC(gamma="scale", probability=True)
tuned_parameters = {'kernel': ('linear', 'rbf'), 'C': (1, 0.25, 0.5, 0.75)}

def baseline_model(optimizer='adam', learn_rate=0.01):
    model = Sequential()
    model.add(Dense(100, input_dim=X_res.shape[1], activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(50, activation='relu'))  # 50 hidden units in this layer
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100, verbose=0)
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
kerasparams = dict(optimizer=optimizer, learn_rate=learn_rate)

inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)

models = []
models.append(('GBM', GridSearchCV(gbm, param, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('RFC', GridSearchCV(rfc, param_grid, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('LR', GridSearchCV(logreg, LR_par, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('SVM', GridSearchCV(svm, tuned_parameters, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('MLP', GridSearchCV(mlp, parameter_space, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('Keras', GridSearchCV(estimator=keras, param_grid=kerasparams, cv=inner_cv, iid=False, n_jobs=1)))

results = []
names = []
scoring = 'accuracy'

X_train, X_test, Y_train, Y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)

for name, model in models:
    nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    results.append(nested_cv_results)
    names.append(name)
    msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
    print(msg)
    model.fit(X_train, Y_train)
    print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best CV Score: \n{}\n".format(model.best_score_))
Edit: I have now tried my for loop as:
def run_models(models):
    nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    results.append(nested_cv_results)
    names.append(name)
    msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
    print(msg)
    model.fit(X_train, Y_train)
    print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
    #print("Best Estimator: \n{}\n".format(model.best_estimator_))
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best CV Score: \n{}\n".format(model.best_score_))

if __name__ == '__main__':
    pool = Pool()  # create a multiprocessing Pool
    pool.map(run_models, models)
However, this seems to run indefinitely, with no errors but also no output.
Edit: while trying to change the multiprocessing so that it does not run from IDLE, I have tried:
def run_models(models):
    nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    results.append(nested_cv_results)
    names.append(name)
    msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
    return msg, model.best_params_

for name, model in models:
    if __name__ == '__main__':
        pool = Pool()  # create a multiprocessing Pool
        pool.map(run_models, model)
However, this gives the error:
TypeError: 'GridSearchCV' object is not iterable
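For what it's worth, a sketch of one way to give each model its own worker, using joblib (which scikit-learn itself builds on) instead of multiprocessing.Pool. This assumes the models list, data, scoring, and outer_cv from above, keeps each inner GridSearchCV at n_jobs=1 so the workers do not oversubscribe the CPUs, and may still fail to pickle the Keras estimator:
from joblib import Parallel, delayed

def run_model(name, model):
    # each worker runs one full nested CV on a single core
    scores = model_selection.cross_val_score(model, X_res, y_res,
                                             cv=outer_cv, scoring=scoring)
    return name, scores

# one worker process per (name, model) pair, assuming 6 available cores
out = Parallel(n_jobs=6)(delayed(run_model)(name, model)
                         for name, model in models)
for name, scores in out:
    print("Nested CV Accuracy %s: %f (+/- %f )"
          % (name, scores.mean() * 100, scores.std() * 100))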

How to set up xgboost learning with an evaluation set?

While using the sklearn wrapper, this is pretty easy for me to do this way:
import xgboost as xgb

clf = xgb.XGBClassifier(n_estimators=1500, learning_rate=0.015, gamma=0.3,
                        min_child_weight=3, nthread=15, max_depth=150,
                        subsample=0.9, colsample_bytree=0.8, seed=2100,
                        eval_metric="rmse")
VALID = True
if VALID == True:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.19, random_state=23)
    model = xgb.train(X_train, y_train, params,
                      evallist=[(X_valid, y_valid)],
                      verbose_eval=50,
                      early_stopping_rounds=50)
However, I can't set it using the standard xgboost class:
params = {
    'objective': 'gpu:reg:linear',
    'learning_rate': 0.02,
    'gamma': 0.3,
    'min_child_weight': 3,
    'nthread': 15,
    'max_depth': 30,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'seed': 2100,
    'eval_metric': "rmse",
    'num_boost_round': 300
}
VALID = True
if VALID == True:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.19, random_state=23)
    model = xgb.train(X_train, y_train, params,
                      evallist=[(X_valid, y_valid)],
                      verbose_eval=50,
                      early_stopping_rounds=50)
# error: TypeError: train() got an unexpected keyword argument 'evallist'
You just need to specify the parameters correctly:
params = {
    #'objective': 'gpu:reg:linear',
    'tree_method': 'gpu_hist',
    'learning_rate': 0.02,
    'gamma': 0.3,
    'min_child_weight': 3,
    'nthread': 15,
    'max_depth': 30,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'seed': 2100,
    'eval_metric': "rmse",
    'num_boost_round': 300,
    'n_estimators': 999,
    'max_leaves': 300
}
VALID = True
if VALID == True:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.19, random_state=23)
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    #del X_train, X_valid, y_train, y_valid; gc.collect()
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    model = xgb.train(params, tr_data, 300, watchlist, maximize=False,
                      early_stopping_rounds=30, verbose_eval=50)
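For reference, with the sklearn wrapper mentioned at the top, the evaluation set goes into fit() rather than train(). A sketch (using XGBRegressor to match the regression objective; note that newer xgboost versions move early_stopping_rounds and eval_metric into the constructor):
import xgboost as xgb
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.19, random_state=23)

reg = xgb.XGBRegressor(n_estimators=300, learning_rate=0.02, max_depth=30,
                       subsample=0.9, colsample_bytree=0.8, random_state=2100)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],  # the watchlist
        eval_metric="rmse",
        early_stopping_rounds=30,
        verbose=True)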
