Optuna score vs Cross_val_score? - python

A accuracy score from optuna and a score in cross_val_score were different. Why does it occuer and which score should I choose?
I used the hyperparameters that I got in optuna in cross_val_score.
def objective_lgb(trial):
num_leaves = trial.suggest_int("num_leaves", 2, 1000)
max_depth = trial.suggest_int("max_depth", 2, 100)
learning_rate = trial.suggest_float('learning_rate', 0.001, 1)
n_estimators = trial.suggest_int('n_estimators', 100, 2000)
min_child_samples = trial.suggest_int('min_child_samples', 3, 1000)
subsample = trial.suggest_float('subsample', 0.000001, 1)
colsample_bytree = trial.suggest_float('colsample_bytree', 0.00000001, 1)
reg_alpha = trial.suggest_float('reg_alpha', 0, 400)
reg_lambda = trial.suggest_float("reg_lambda", 0, 400)
importance_type = trial.suggest_categorical('importance_type', ["split", "gain"])
lgb_clf = lgb.LGBMClassifier(random_state=1,
objective="multiclass",
num_class = 3,
importance_type=importance_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
min_child_samples=min_child_samples,
subsample=subsample,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda
)
score = cross_val_score(lgb_clf, train_x, train_y, n_jobs=-1, cv=KFold(n_splits=10, shuffle=True, random_state=1), scoring='accuracy')
mean_score = score.mean()
return mean_score
lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(objective_lgb, n_trials=1500)
lgb_trial = lgb_study.best_trial
print("accuracy:", lgb_trial.value)
print()
print("Best params:", lgb_trial.params)
=========================================================
def light_check(x,params):
model = lgb.LGBMClassifier()
scores = cross_val_score(model,x,y,cv=KFold(n_splits=10, shuffle=True, random_state=1),n_jobs=-1)
mean = scores.mean()
return scores, mean
light_check(x,{'num_leaves': 230, 'max_depth': 53, 'learning_rate': 0.04037430031226232, 'n_estimators': 1143, 'min_child_samples': 381, 'subsample': 0.12985990464862135, 'colsample_bytree': 0.8914118949904919, 'reg_alpha': 31.869348047391053, 'reg_lambda': 17.45653692887209, 'importance_type': 'split'})

From what I can see, you are using X_train, y_train in the optuna call, while in light_check you are passing x and y. Assuming you did a spilt in some unknown code, the data set for optuna is smaller and you get a different number.

Related

ValueError: Found input variables with inconsistent numbers of samples: [28332, 24]

I am currently working with multilabel text classification in the Arabic language using binary relevance and label power set, after I make all preprocessing that I need when I need to combine chi and mutual feature selection based on their weights, I am facing this problem
Found input variables with inconsistent numbers of samples: [28332, 24]
where my dataset has one column have the text and 24 columns as a target as shown in the image :
enter image description here
I am writing this code
`class Classifier:
def __init__(self):
self.merged_df = pd.read_csv(r"D:\project\Ymal.csv", encoding='utf-8')
self.train_df, self.test_df = train_test_split(self.merged_df,test_size=0.2,random_state=42)
self.vectorizer = CountVectorizer()
self.ModelsPerformance = {}
def train(self):
self.train_text = self.train_df['text']
self.test_text = self.test_df['text']
self.train_labels = self.train_df.drop(columns=['text'])
self.test_labels = self.test_df.drop(columns=['text'])
self.mlb = MultiLabelBinarizer()
self.train_labels = self.mlb.fit_transform(self.train_labels)
self.test_labels = self.mlb.transform(self.test_labels)
self.train_text_bow = self.vectorizer.fit_transform(self.train_text)
self.test_text_bow = self.vectorizer.transform(self.test_text)
self.chi2_selector = SelectKBest(chi2, k='all',)
self.mi_selector = SelectKBest(mutual_info_classif, k='all',)
self.chi2_features = self.chi2_selector.fit_transform(self.train_text_bow,self.train_labels)
self.mi_features = self.mi_selector.fit_transform(self.train_text_bow,self.train_labels)
self.weights_chi2 = self.chi2_selector.scores_
self.weights_mi = self.mi_selector.scores_
self.weights = (self.weights_chi2 + self.weights_mi ) / 2
self.top_features = np.argsort(self.weights)[-4000:] #[::-1]
self.train_combined_features = self.train_text_bow[:,self.top_features]
self.test_text_bow = self.vectorizer.transform(self.test_text)
self.test_combined_features = self.test_text_bow[:, self.top_features]
def metricsReport(self,modelName, test_labels, predictions):
hamLoss = hamming_loss(test_labels, predictions)
print("------" + modelName + " Model Metrics-----")
accuracy = accuracy_score(test_labels, predictions)
macroPrecision = precision_score(test_labels, predictions, average='macro')
macroRecall = recall_score(test_labels, predictions, average='macro')
macroF1 = f1_score(test_labels, predictions, average='macro')
microPrecision = precision_score(test_labels, predictions, average='micro')
microRecall = recall_score(test_labels, predictions, average='micro')
microF1 = f1_score(test_labels, predictions, average='micro')
weightedF1 = f1_score(test_labels, predictions, average='weighted')
# print metrics
print("Hamming Loss: {:.4f}".format(hamLoss))
print('Accuracy: {0:.4f}'.format(accuracy))
print('Macro Precision: {0:.4f}'.format(macroPrecision))
print('Macro Recall: {0:.4f}'.format(macroRecall))
print('Macro F1-measure: {0:.4f}'.format(macroF1))
print('Micro Precision: {0:.4f}'.format(microPrecision))
print('Micro Recall: {0:.4f}'.format(microRecall))
print('Micro F1-measure: {0:.4f}\n'.format(microF1))
print('Weighted F1-measure: {0:.4f}\n'.format(weightedF1))
def fitAlgorithms(self):
algorithms = [{'name': 'LinearSVC', 'model': LinearSVC(max_iter=12000, dual=False),
'params': {'C': [0.1, 1, 10]}},
{'name': 'KNN', 'model': KNeighborsClassifier(),
'params': {'n_neighbors': [5, 10, 15]}},
{'name': 'RandomForest', 'model': RandomForestClassifier(),
'params': {'n_estimators': [100, 300, 500]}},
{'name': 'LogisticRegression', 'model': LogisticRegression(),
'params': {'C': [0.1, 1, 10]}},
{'name': 'DecisionTree', 'model': DecisionTreeClassifier(),
'params': {'max_depth': [5, 10, 15]}},
{'name': 'MultinomialNB', 'model': MultinomialNB(),
'params': {'alpha': [0.1, 1, 10]}}
]
for algorithm in algorithms:
model = algorithm['model']
name = algorithm['name']
params = algorithm['params']
# Fit the binary relevance and label powerset classifiers before the grid search
binary_relevance_classifier = BinaryRelevance(model)
binary_relevance_classifier.fit(self.train_combined_features, self.train_labels)
labelPowerSet_classifier = LabelPowerset(model)
labelPowerSet_classifier.fit(self.train_combined_features, self.train_labels)
print(f"Performing GridSearchCV for {name}...")
clf = GridSearchCV(model, params, scoring='accuracy', cv=5)
clf.fit(self.train_combined_features, self.train_labels)
best_params = clf.best_params_
print(f"Best parameters for {name}: {best_params}")
model.set_params(**best_params)
binary_relevance_preds = binary_relevance_classifier.predict(self.test_combined_features)
self.metricsReport(f"Binary Relevance with {name}", self.test_labels, binary_relevance_preds)
labelPowerSet_preds = labelPowerSet_classifier.predict(self.test_combined_features)
self.metricsReport(f"Label Powerset with {name}", self.test_labels, labelPowerSet_preds)
self.ModelsPerformance[name] = clf.best_score_
return self.ModelsPerformance
# Create an instance of the Classifier
classifier = Classifier()
# Invoke the training method
classifier.train()
# Invoke the fitAlgorithms() method
classifier.fitAlgorithms()
but this basic problem is this error I referee it above
please any one can help me and if any one can optimize this ?
I believe that error is clear but I cant avoid this , also i tried the do this to sure the shape but it fine
print("train_text_bow shape:", train_text_bow.shape) print("train_labels shape:", train_labels.shape) train_text_bow shape: (28332, 121714) train_labels shape: (28332, 24)t
I need just to avoid this error

Currently only multi-regression, multilabel and survival objectives work with multidimensional target

I used bayes_optto tunse hper-parameter of CatBoostRegressor (from catboost) for regression and got the following error:
CatBoostError: catboost/private/libs/target/data_providers.cpp:603: Currently only multi-regression, multilabel and survival objectives work with multidimensional target
Here is the code:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from catboost import Pool, CatBoostRegressor
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours
from sklearn.metrics import accuracy_score
def get_data():
""" Preparing data ."""
# trainx, testx, trainy, testy= train_test_split(XN, YN, test_size=0.2, random_state= 31)
return trainx, testx, trainy, testy
def CBR_cv(iterations, learning_rate, depth, l2_leaf_reg, min_child_samples, trainx, testx, trainy, testy):
train_pool = Pool(trainx, trainy)
test_pool = Pool(testx)
model = CatBoostRegressor(iterations = iterations, learning_rate = learning_rate, depth = depth,
l2_leaf_reg = l2_leaf_reg, min_child_samples = min_child_samples, loss_function='RMSE' )
# param['learning_rate'] = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.02, 0.001)
# param['depth'] = trial.suggest_int('depth', 9, 15)
# param['l2_leaf_reg'] = trial.suggest_discrete_uniform('l2_leaf_reg', 1.0, 5.5, 0.5)
# param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
# cval = cross_val_score(model, trainx, trainy, scoring='accuracy', cv=4)
# return cval.mean()
## fit the model
model.fit(train_pool)
## evaluate performance
yhat = model.predict(test_pool)
score = r2_score(testy, yhat)
return score
def optimize_XGB(trainx2, testx2, trainy2, testy2):
"""Apply Bayesian Optimization to Random Forest parameters."""
def CBR_crossval(iterations, learning_rate, depth, l2_leaf_reg, min_child_samples):
"""Wrapper of RandomForest cross validation.
Notice how we ensure n_estimators and min_samples_split are casted
to integer before we pass them along. Moreover, to avoid max_features
taking values outside the (0, 1) range, we also ensure it is capped
accordingly.
"""
return CBR_cv(iterations = int(iterations),
learning_rate = max(min(learning_rate, 0.5), 1e-3),
depth = int(depth),
l2_leaf_reg = max(min(l2_leaf_reg, 5.5), 1.0),
min_child_samples = int(min_child_samples),
trainx = trainx2, testx= testx2, trainy = trainy2, testy= testy2)
optimizer = BayesianOptimization(
f=CBR_crossval,
pbounds={
"iterations": (50, 500),
"depth": (2, 25),
"learning_rate": (0.01, 0.5),
"l2_leaf_reg": (1.0, 5.5),
"min_child_samples": (1, 50),
},
random_state=1234,
verbose=2
)
optimizer.maximize(n_iter=1000)
print("Final result:", optimizer.max)
if __name__ == "__main__":
trainx2, testx2, trainy2, testy2 = get_data()
print(Colours.green("--- Optimizing XGB ---"))
optimize_XGB(trainx2, testx2, trainy2, testy2)

how to use gridsearch and cross validation with differents parameters models?

I write a classification algorythm and I use 6 differents models. I want to improve the model by using finetuning parameters for each model. The problem I am encounering is related to my "for loop". In fact, I loop in three different dictionnaries but the correspondance between the model I use in the gridsearch and the parameters are not keep since the dictionnaries are not ordered;
I seem to failed to find another solution :
here my code and the result as you can see the model_name is different from the param_name so I get multiple erros like for example (ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
)
below the code
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3, 'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
'max_depth': [3, 5, 15, 25]
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]
}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
'loss': ['hinge', 'hinge_squarred'],
'penalty' : ['l1', 'l2']
}
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = { 'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc, 'Model_G_NB': parameter_NB, 'Model_LR': parameter_LR, 'Model_RF': parameter_LR, 'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
for feature_name, feature in features.items():
for model_name, model in models.items():
for param_name, parameter in parameters_dict.items():
clf = GridSearchCV(estimator=model, param_grid=parameter, cv=cv_splitter, verbose = 1, n_jobs = -1, return_train_score=True)
best_model = clf.fit(feature, ylabels)
output : as you can see sometimes it works but other times param and model are not the same which causes the error
[5 rows x 7 columns]
Feature: vecteur_CV
Model: Model_SVC
Param: Model_SVC
Fitting 10 folds for each of 6 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 out of 60 | elapsed: 2.8s remaining: 0.1s
/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done 60 out of 60 | elapsed: 2.8s finished
Feature: vecteur_CV
Model: Model_SVC
Param: Model_G_NB
Fitting 10 folds for each of 24 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 503, in _fit_and_score
estimator.set_params(**parameters)
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 224, in set_params
(key, self))
ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
Features look like this
`X_data, X_data_0, X_data_1, X_data_2 = features_fusion(verbatim, first_arg)
features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0, 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
I used 2 slightly different versions of the Iris dataset and this code below runs (albeit with many warnings during training):
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
data = load_iris()
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3,
'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'max_depth': [2,3, 5, 15, 25],
'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
#'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
#'loss': ['hinge', 'hinge_squarred'],
#'penalty' : ['l1', 'l2']
}
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = {'hidden_layer_sizes':[(50,50,50),(50,100,50),(100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc,
'Model_G_NB': parameter_NB,
'Model_LR': parameter_LR, 'Model_RF': parameter_RF,
'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
#features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0,
# 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
features = {'iris': data['data'],'iris_sub':data['data'][:,1:]}
ylabels = data['target']
for feature_name, feature in features.items():
#print(feature_name, feature)
for model_name in models:
print('Training model: ', model_name)
clf = GridSearchCV(estimator=models[model_name],
param_grid=parameters_dict[model_name],
cv=cv_splitter, verbose = 1, n_jobs = -1,
return_train_score=True)
best_model = clf.fit(feature, ylabels)
#for feature_name, feature in features.items():
# for model_name, model in models.items():
# for param_name, parameter in parameters_dict.items():
# print(model_name,model,param_name,parameter)
# clf = GridSearchCV(estimator=model, param_grid=parameter,
# cv=cv_splitter, verbose = 1, n_jobs = -1,
# return_train_score=True)
# best_model = clf.fit(feature, ylabels)
I had to comment some model parameters as they gave errors. There was also a typo in your snippet above 'Model_RF': parameter_LR should be 'Model_RF': parameter_RF. I'm not sure if that was the reason of your error. I also removed the inner parameters_dict loop as I could access all elements using the same keys as models.

How to get nested cross validation to run on multiple cores?

Following up from my question yesterday (How to speed up nested cross validation in python?) I am trying to run models in parallel processing, however with my code is it possible to check if these models, each requiring 1 core only to run on, could run on cores in parallel or whether with this code it will always be 1 core only taking on the models 1 at a time?
I have access to higher powered computing where I could ask for 6 cores, a core per model, however, I am not sure with my code whether the models would actually be assigned their own core and run in parallel. If that makes sense (apologies if this is completely misinformed, as I am trying to learn as I go, any help would be appreciated). With n_jobs=1 for each of my models in their gridsearch can I further specifiy for each to have their own core?
Here is my code:
dataset= pd.read_csv('data.csv')
data = dataset.drop(["gene"],1)
df = data.iloc[:,0:24]
df = df.fillna(0)
X = MinMaxScaler().fit_transform(df)
le = preprocessing.LabelEncoder()
encoded_value = le.fit_transform(["certain", "likely", "possible", "unlikely"])
Y = le.fit_transform(data["category"])
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_resample(X, Y)
seed = 7
logreg = LogisticRegression(penalty='l1', solver='liblinear',multi_class='auto')
LR_par= {'penalty':['l1'], 'C': [0.5, 1, 5, 10], 'max_iter':[500, 1000, 5000]}
rfc =RandomForestClassifier()
param_grid = {'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4,25],
'min_samples_split': [2, 5, 10, 25],
'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
mlp = MLPClassifier(random_state=seed)
parameter_space = {'hidden_layer_sizes': [(10,20), (10,20,10), (50,)],
'activation': ['tanh', 'relu'],
'solver': ['adam', 'sgd'],
'max_iter': [10000],
'alpha': [0.1, 0.01, 0.001],
'learning_rate': ['constant','adaptive']}
gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25)
param = {"loss":["deviance"],
"learning_rate": [0.15,0.1,0.05,0.01,0.005,0.001],
"min_samples_split": [2, 5, 10, 25],
"min_samples_leaf": [1, 2, 4,25],
"max_depth":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
"max_features":['auto', 'sqrt'],
"criterion": ["friedman_mse"],
"n_estimators":[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}
svm = SVC(gamma="scale", probability=True)
tuned_parameters = {'kernel':('linear', 'rbf'), 'C':(1,0.25,0.5,0.75)}
def baseline_model(optimizer='adam', learn_rate=0.01):
model = Sequential()
model.add(Dense(100, input_dim=X_res.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu')) #8 is the dim/ the number of hidden units (units are the kernel)
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
return model
keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100, verbose=0)
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
kerasparams = dict(optimizer=optimizer, learn_rate=learn_rate)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
models = []
models.append(('GBM', GridSearchCV(gbm, param, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('RFC', GridSearchCV(rfc, param_grid, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('LR', GridSearchCV(logreg, LR_par, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('SVM', GridSearchCV(svm, tuned_parameters, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('MLP', GridSearchCV(mlp, parameter_space, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('Keras', GridSearchCV(estimator=keras, param_grid=kerasparams, cv=inner_cv,iid=False, n_jobs=1)))
results = []
names = []
scoring = 'accuracy'
X_train, X_test, Y_train, Y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)
for name, model in models:
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
print(msg)
model.fit(X_train, Y_train)
print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
print("Best Parameters: \n{}\n".format(model.best_params_))
print("Best CV Score: \n{}\n".format(model.best_score_))
Edit: I have now tried my for loop as:
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
print(msg)
model.fit(X_train, Y_train)
print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
#print("Best Estimator: \n{}\n".format(model.best_estimator_))
print("Best Parameters: \n{}\n".format(model.best_params_))
print("Best CV Score: \n{}\n".format(model.best_score_))
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
pool.map(run_models, models)
However this seems to run indefinitely with no errors but no output
Edit: on trying to changing multiprocessing to not be IDLE I have tried:
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
return msg, model.best_params_
for name, model in models:
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
pool.map(run_models, model)
However this gives the error:
TypeError: 'GridSearchCV' object is not iterable

How set learning xgboost with evaluation set?

While using sklearn wrapper this is pretty easy to do for me this way:
import xgboost as xgb
clf = xgb.XGBClassifier( n_estimators=1500, learning_rate=0.015, gamma =0.3, min_child_weight = 3,nthread = 15,max_depth=150,
subsample=0.9, colsample_bytree=0.8, seed=2100, eval_metric = "rmse")
VALID = True
if VALID == True:
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size = 0.19, random_state=23)
model = xgb.train(X_train, y_train, params,
evallist = [(X_valid, y_valid)],
verbose_eval = 50,
early_stopping_rounds=50)
However I cant set it using standart class of xgboost:
params = {
'objective' : 'gpu:reg:linear',
'learning_rate': 0.02,
'gamma' : 0.3,
'min_child_weight' : 3,
'nthread' : 15,
'max_depth' : 30,
'subsample' : 0.9,
'colsample_bytree' : 0.8,
'seed':2100,
'eval_metric' : "rmse",
'num_boost_round' : 300
}
VALID = True
if VALID == True:
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size = 0.19, random_state=23)
model = xgb.train(X_train, y_train, params,
evallist = [(X_valid, y_valid)],
verbose_eval = 50,
early_stopping_rounds=50)
#error TypeError: train() got an unexpected keyword argument 'evallist'
Just need to specify parametrs correctly:
params = {
#'objective' : 'gpu:reg:linear',
'tree_method':'gpu_hist',
'learning_rate': 0.02,
'gamma' : 0.3,
'min_child_weight' : 3,
'nthread' : 15,
'max_depth' : 30,
'subsample' : 0.9,
'colsample_bytree' : 0.8,
'seed':2100,
'eval_metric' : "rmse",
'num_boost_round' : 300,
'n_estimators':999,
'max_leaves': 300
}
VALID = True
if VALID == True:
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size = 0.19, random_state=23)
tr_data = xgb.DMatrix(X_train, y_train)
va_data = xgb.DMatrix(X_valid, y_valid)
#del X_train, X_valid, y_train, y_valid ; gc.collect()
watchlist = [(tr_data, 'train'), (va_data, 'valid')]
model = xgb.train(params, tr_data, 300, watchlist, maximize=False, early_stopping_rounds = 30, verbose_eval=50)

Categories

Resources