Related
I am currently working on multilabel text classification for Arabic text using binary relevance and label powerset. After all the preprocessing I need, when I try to combine chi-squared and mutual information feature selection based on their weights, I run into this problem:
Found input variables with inconsistent numbers of samples: [28332, 24]
My dataset has one column containing the text and 24 columns as the target, as shown in the image:
Here is my code:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import (hamming_loss, accuracy_score, precision_score,
                             recall_score, f1_score)
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset


class Classifier:
    def __init__(self):
        self.merged_df = pd.read_csv(r"D:\project\Ymal.csv", encoding='utf-8')
        self.train_df, self.test_df = train_test_split(self.merged_df, test_size=0.2, random_state=42)
        self.vectorizer = CountVectorizer()
        self.ModelsPerformance = {}

    def train(self):
        self.train_text = self.train_df['text']
        self.test_text = self.test_df['text']
        self.train_labels = self.train_df.drop(columns=['text'])
        self.test_labels = self.test_df.drop(columns=['text'])
        self.mlb = MultiLabelBinarizer()
        self.train_labels = self.mlb.fit_transform(self.train_labels)
        self.test_labels = self.mlb.transform(self.test_labels)
        self.train_text_bow = self.vectorizer.fit_transform(self.train_text)
        self.test_text_bow = self.vectorizer.transform(self.test_text)
        self.chi2_selector = SelectKBest(chi2, k='all')
        self.mi_selector = SelectKBest(mutual_info_classif, k='all')
        self.chi2_features = self.chi2_selector.fit_transform(self.train_text_bow, self.train_labels)
        self.mi_features = self.mi_selector.fit_transform(self.train_text_bow, self.train_labels)
        self.weights_chi2 = self.chi2_selector.scores_
        self.weights_mi = self.mi_selector.scores_
        self.weights = (self.weights_chi2 + self.weights_mi) / 2
        self.top_features = np.argsort(self.weights)[-4000:]  # [::-1]
        self.train_combined_features = self.train_text_bow[:, self.top_features]
        self.test_text_bow = self.vectorizer.transform(self.test_text)
        self.test_combined_features = self.test_text_bow[:, self.top_features]
    def metricsReport(self, modelName, test_labels, predictions):
        hamLoss = hamming_loss(test_labels, predictions)
        print("------" + modelName + " Model Metrics-----")
        accuracy = accuracy_score(test_labels, predictions)
        macroPrecision = precision_score(test_labels, predictions, average='macro')
        macroRecall = recall_score(test_labels, predictions, average='macro')
        macroF1 = f1_score(test_labels, predictions, average='macro')
        microPrecision = precision_score(test_labels, predictions, average='micro')
        microRecall = recall_score(test_labels, predictions, average='micro')
        microF1 = f1_score(test_labels, predictions, average='micro')
        weightedF1 = f1_score(test_labels, predictions, average='weighted')
        # print metrics
        print("Hamming Loss: {:.4f}".format(hamLoss))
        print('Accuracy: {0:.4f}'.format(accuracy))
        print('Macro Precision: {0:.4f}'.format(macroPrecision))
        print('Macro Recall: {0:.4f}'.format(macroRecall))
        print('Macro F1-measure: {0:.4f}'.format(macroF1))
        print('Micro Precision: {0:.4f}'.format(microPrecision))
        print('Micro Recall: {0:.4f}'.format(microRecall))
        print('Micro F1-measure: {0:.4f}\n'.format(microF1))
        print('Weighted F1-measure: {0:.4f}\n'.format(weightedF1))
    def fitAlgorithms(self):
        algorithms = [{'name': 'LinearSVC', 'model': LinearSVC(max_iter=12000, dual=False),
                       'params': {'C': [0.1, 1, 10]}},
                      {'name': 'KNN', 'model': KNeighborsClassifier(),
                       'params': {'n_neighbors': [5, 10, 15]}},
                      {'name': 'RandomForest', 'model': RandomForestClassifier(),
                       'params': {'n_estimators': [100, 300, 500]}},
                      {'name': 'LogisticRegression', 'model': LogisticRegression(),
                       'params': {'C': [0.1, 1, 10]}},
                      {'name': 'DecisionTree', 'model': DecisionTreeClassifier(),
                       'params': {'max_depth': [5, 10, 15]}},
                      {'name': 'MultinomialNB', 'model': MultinomialNB(),
                       'params': {'alpha': [0.1, 1, 10]}}
                      ]
        for algorithm in algorithms:
            model = algorithm['model']
            name = algorithm['name']
            params = algorithm['params']
            # Fit the binary relevance and label powerset classifiers before the grid search
            binary_relevance_classifier = BinaryRelevance(model)
            binary_relevance_classifier.fit(self.train_combined_features, self.train_labels)
            labelPowerSet_classifier = LabelPowerset(model)
            labelPowerSet_classifier.fit(self.train_combined_features, self.train_labels)
            print(f"Performing GridSearchCV for {name}...")
            clf = GridSearchCV(model, params, scoring='accuracy', cv=5)
            clf.fit(self.train_combined_features, self.train_labels)
            best_params = clf.best_params_
            print(f"Best parameters for {name}: {best_params}")
            model.set_params(**best_params)
            binary_relevance_preds = binary_relevance_classifier.predict(self.test_combined_features)
            self.metricsReport(f"Binary Relevance with {name}", self.test_labels, binary_relevance_preds)
            labelPowerSet_preds = labelPowerSet_classifier.predict(self.test_combined_features)
            self.metricsReport(f"Label Powerset with {name}", self.test_labels, labelPowerSet_preds)
            self.ModelsPerformance[name] = clf.best_score_
        return self.ModelsPerformance


# Create an instance of the Classifier
classifier = Classifier()
# Invoke the training method
classifier.train()
# Invoke the fitAlgorithms() method
classifier.fitAlgorithms()
But the basic problem is the error I referred to above.
Can anyone help me, and can anyone optimize this?
I believe the error is clear, but I can't get around it. I also printed the shapes to check them, and they look fine:
print("train_text_bow shape:", train_text_bow.shape)
print("train_labels shape:", train_labels.shape)
train_text_bow shape: (28332, 121714)
train_labels shape: (28332, 24)
I just need to avoid this error.
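If it helps, here is a minimal sketch of the feature-scoring step that avoids the mismatch, assuming the 24 label columns are already 0/1 encoded as in the screenshot (train_df follows the code above; everything else is illustrative, not the original implementation). Passing the label DataFrame through MultiLabelBinarizer makes it iterate over the 24 column names, which is where the "24 samples" in the error most likely come from:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, mutual_info_classif

# The 24 label columns are already an indicator matrix, so use them directly
# instead of passing the DataFrame through MultiLabelBinarizer.
train_labels = train_df.drop(columns=['text']).values          # shape (n_samples, 24)
vectorizer = CountVectorizer()
train_text_bow = vectorizer.fit_transform(train_df['text'])    # shape (n_samples, n_terms)

# chi2 and mutual_info_classif expect a 1-D target, so score each label
# column separately and average the scores over the 24 labels.
chi2_scores = np.mean([chi2(train_text_bow, train_labels[:, j])[0]
                       for j in range(train_labels.shape[1])], axis=0)
mi_scores = np.mean([mutual_info_classif(train_text_bow, train_labels[:, j], discrete_features=True)
                     for j in range(train_labels.shape[1])], axis=0)

weights = (chi2_scores + mi_scores) / 2
top_features = np.argsort(weights)[-4000:]
train_combined_features = train_text_bow[:, top_features]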
I used bayes_opt to tune the hyper-parameters of CatBoostRegressor (from catboost) for regression and got the following error:
CatBoostError: catboost/private/libs/target/data_providers.cpp:603: Currently only multi-regression, multilabel and survival objectives work with multidimensional target
Here is the code:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from catboost import Pool, CatBoostRegressor
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours
from sklearn.metrics import accuracy_score, r2_score


def get_data():
    """Preparing data."""
    # trainx, testx, trainy, testy = train_test_split(XN, YN, test_size=0.2, random_state=31)
    return trainx, testx, trainy, testy


def CBR_cv(iterations, learning_rate, depth, l2_leaf_reg, min_child_samples, trainx, testx, trainy, testy):
    train_pool = Pool(trainx, trainy)
    test_pool = Pool(testx)
    model = CatBoostRegressor(iterations=iterations, learning_rate=learning_rate, depth=depth,
                              l2_leaf_reg=l2_leaf_reg, min_child_samples=min_child_samples,
                              loss_function='RMSE')
    # param['learning_rate'] = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.02, 0.001)
    # param['depth'] = trial.suggest_int('depth', 9, 15)
    # param['l2_leaf_reg'] = trial.suggest_discrete_uniform('l2_leaf_reg', 1.0, 5.5, 0.5)
    # param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    # cval = cross_val_score(model, trainx, trainy, scoring='accuracy', cv=4)
    # return cval.mean()
    ## fit the model
    model.fit(train_pool)
    ## evaluate performance
    yhat = model.predict(test_pool)
    score = r2_score(testy, yhat)
    return score


def optimize_XGB(trainx2, testx2, trainy2, testy2):
    """Apply Bayesian Optimization to the CatBoost parameters."""
    def CBR_crossval(iterations, learning_rate, depth, l2_leaf_reg, min_child_samples):
        """Wrapper of the CatBoost evaluation.
        Notice how we ensure iterations, depth and min_child_samples are cast
        to integer before we pass them along. Moreover, to avoid learning_rate
        and l2_leaf_reg taking values outside their ranges, we also cap them
        accordingly.
        """
        return CBR_cv(iterations=int(iterations),
                      learning_rate=max(min(learning_rate, 0.5), 1e-3),
                      depth=int(depth),
                      l2_leaf_reg=max(min(l2_leaf_reg, 5.5), 1.0),
                      min_child_samples=int(min_child_samples),
                      trainx=trainx2, testx=testx2, trainy=trainy2, testy=testy2)

    optimizer = BayesianOptimization(
        f=CBR_crossval,
        pbounds={
            "iterations": (50, 500),
            "depth": (2, 25),
            "learning_rate": (0.01, 0.5),
            "l2_leaf_reg": (1.0, 5.5),
            "min_child_samples": (1, 50),
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=1000)
    print("Final result:", optimizer.max)


if __name__ == "__main__":
    trainx2, testx2, trainy2, testy2 = get_data()
    print(Colours.green("--- Optimizing XGB ---"))
    optimize_XGB(trainx2, testx2, trainy2, testy2)
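The error itself points at the shape of the target rather than at bayes_opt: with loss_function='RMSE' CatBoost expects a 1-D target, and a multidimensional trainy (for example shape (n, 1)) triggers exactly this message. A minimal sketch of a check one could drop into CBR_cv, assuming trainx and trainy are numpy arrays (get_data() is not shown in the question, so this is illustrative):
import numpy as np
from catboost import Pool, CatBoostRegressor

trainy = np.asarray(trainy)
if trainy.ndim > 1 and trainy.shape[1] == 1:
    trainy = trainy.ravel()        # a single target column: flatten to 1-D for 'RMSE'

# a genuinely multi-dimensional target needs a multi-regression objective instead
loss = 'RMSE' if trainy.ndim == 1 else 'MultiRMSE'
model = CatBoostRegressor(loss_function=loss)
model.fit(Pool(trainx, trainy))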
I am writing a classification algorithm and I use 6 different models. I want to improve each model by fine-tuning its parameters. The problem I am encountering is related to my for loop: I loop over three different dictionaries, but the correspondence between the model used in the grid search and its parameters is not kept, since I iterate over the dictionaries independently.
I have failed to find another solution.
Here are my code and the result. As you can see, the model_name is different from the param_name, so I get multiple errors, for example: ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, ...)
Below is the code:
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3, 'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
'max_depth': [3, 5, 15, 25]
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]
}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
                'loss': ['hinge', 'hinge_squarred'],
                'penalty' : ['l1', 'l2']
                }
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = { 'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc, 'Model_G_NB': parameter_NB, 'Model_LR': parameter_LR, 'Model_RF': parameter_LR, 'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
for feature_name, feature in features.items():
    for model_name, model in models.items():
        for param_name, parameter in parameters_dict.items():
            clf = GridSearchCV(estimator=model, param_grid=parameter, cv=cv_splitter,
                               verbose=1, n_jobs=-1, return_train_score=True)
            best_model = clf.fit(feature, ylabels)
Output: as you can see, sometimes it works, but other times the param and the model do not match, which causes the error.
[5 rows x 7 columns]
Feature: vecteur_CV
Model: Model_SVC
Param: Model_SVC
Fitting 10 folds for each of 6 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 out of 60 | elapsed: 2.8s remaining: 0.1s
/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done 60 out of 60 | elapsed: 2.8s finished
Feature: vecteur_CV
Model: Model_SVC
Param: Model_G_NB
Fitting 10 folds for each of 24 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 503, in _fit_and_score
estimator.set_params(**parameters)
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 224, in set_params
(key, self))
ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
The features look like this:
X_data, X_data_0, X_data_1, X_data_2 = features_fusion(verbatim, first_arg)
features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0, 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
I used 2 slightly different versions of the Iris dataset and this code below runs (albeit with many warnings during training):
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
data = load_iris()
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3,
'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'max_depth': [2,3, 5, 15, 25],
'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
#'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
#'loss': ['hinge', 'hinge_squarred'],
#'penalty' : ['l1', 'l2']
}
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = {'hidden_layer_sizes':[(50,50,50),(50,100,50),(100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc,
'Model_G_NB': parameter_NB,
'Model_LR': parameter_LR, 'Model_RF': parameter_RF,
'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
#features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0,
# 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
features = {'iris': data['data'],'iris_sub':data['data'][:,1:]}
ylabels = data['target']
for feature_name, feature in features.items():
    #print(feature_name, feature)
    for model_name in models:
        print('Training model: ', model_name)
        clf = GridSearchCV(estimator=models[model_name],
                           param_grid=parameters_dict[model_name],
                           cv=cv_splitter, verbose=1, n_jobs=-1,
                           return_train_score=True)
        best_model = clf.fit(feature, ylabels)

#for feature_name, feature in features.items():
#    for model_name, model in models.items():
#        for param_name, parameter in parameters_dict.items():
#            print(model_name, model, param_name, parameter)
#            clf = GridSearchCV(estimator=model, param_grid=parameter,
#                               cv=cv_splitter, verbose=1, n_jobs=-1,
#                               return_train_score=True)
#            best_model = clf.fit(feature, ylabels)
I had to comment out some model parameters as they gave errors. There was also a typo in your snippet above: 'Model_RF': parameter_LR should be 'Model_RF': parameter_RF. I'm not sure if that was the reason for your error. I also removed the inner parameters_dict loop, as I could access all its elements using the same keys as models.
Following up on my question from yesterday (How to speed up nested cross validation in python?), I am trying to run my models with parallel processing. With my code, is it possible to check whether these models, each requiring only 1 core to run, could run on cores in parallel, or whether with this code it will always be 1 core taking on the models one at a time?
I have access to more powerful computing where I could ask for 6 cores, a core per model; however, I am not sure with my code whether the models would actually be assigned their own core and run in parallel, if that makes sense (apologies if this is completely misinformed, as I am trying to learn as I go; any help would be appreciated). With n_jobs=1 for each of my models in their grid search, can I further specify for each to have its own core?
Here is my code:
dataset= pd.read_csv('data.csv')
data = dataset.drop(["gene"],1)
df = data.iloc[:,0:24]
df = df.fillna(0)
X = MinMaxScaler().fit_transform(df)
le = preprocessing.LabelEncoder()
encoded_value = le.fit_transform(["certain", "likely", "possible", "unlikely"])
Y = le.fit_transform(data["category"])
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_resample(X, Y)
seed = 7
logreg = LogisticRegression(penalty='l1', solver='liblinear',multi_class='auto')
LR_par= {'penalty':['l1'], 'C': [0.5, 1, 5, 10], 'max_iter':[500, 1000, 5000]}
rfc =RandomForestClassifier()
param_grid = {'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4,25],
'min_samples_split': [2, 5, 10, 25],
'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
mlp = MLPClassifier(random_state=seed)
parameter_space = {'hidden_layer_sizes': [(10,20), (10,20,10), (50,)],
'activation': ['tanh', 'relu'],
'solver': ['adam', 'sgd'],
'max_iter': [10000],
'alpha': [0.1, 0.01, 0.001],
'learning_rate': ['constant','adaptive']}
gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25)
param = {"loss":["deviance"],
"learning_rate": [0.15,0.1,0.05,0.01,0.005,0.001],
"min_samples_split": [2, 5, 10, 25],
"min_samples_leaf": [1, 2, 4,25],
"max_depth":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
"max_features":['auto', 'sqrt'],
"criterion": ["friedman_mse"],
"n_estimators":[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}
svm = SVC(gamma="scale", probability=True)
tuned_parameters = {'kernel':('linear', 'rbf'), 'C':(1,0.25,0.5,0.75)}
def baseline_model(optimizer='adam', learn_rate=0.01):
    model = Sequential()
    model.add(Dense(100, input_dim=X_res.shape[1], activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(50, activation='relu'))  # 50 hidden units in this layer
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model
keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100, verbose=0)
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
kerasparams = dict(optimizer=optimizer, learn_rate=learn_rate)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
models = []
models.append(('GBM', GridSearchCV(gbm, param, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('RFC', GridSearchCV(rfc, param_grid, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('LR', GridSearchCV(logreg, LR_par, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('SVM', GridSearchCV(svm, tuned_parameters, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('MLP', GridSearchCV(mlp, parameter_space, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('Keras', GridSearchCV(estimator=keras, param_grid=kerasparams, cv=inner_cv,iid=False, n_jobs=1)))
results = []
names = []
scoring = 'accuracy'
X_train, X_test, Y_train, Y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)
for name, model in models:
    nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    results.append(nested_cv_results)
    names.append(name)
    msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
    print(msg)
    model.fit(X_train, Y_train)
    print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best CV Score: \n{}\n".format(model.best_score_))
Edit: I have now tried my for loop as:
def run_models(models):
    nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    results.append(nested_cv_results)
    names.append(name)
    msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
    print(msg)
    model.fit(X_train, Y_train)
    print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
    #print("Best Estimator: \n{}\n".format(model.best_estimator_))
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best CV Score: \n{}\n".format(model.best_score_))

if __name__ == '__main__':
    pool = Pool()  # Create a multiprocessing Pool
    pool.map(run_models, models)
However, this seems to run indefinitely with no errors but no output.
Edit: trying to change the multiprocessing so that it is not idle, I have tried:
def run_models(models):
    nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    results.append(nested_cv_results)
    names.append(name)
    msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
    return msg, model.best_params_

for name, model in models:
    if __name__ == '__main__':
        pool = Pool()  # Create a multiprocessing Pool
        pool.map(run_models, model)
However this gives the error:
TypeError: 'GridSearchCV' object is not iterable
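For reference, the TypeError above comes from mapping over a single GridSearchCV object rather than a list. A minimal sketch of one way to give each model its own worker process, assuming the (name, model) tuples, data splits and scoring defined earlier in the question (the Keras-wrapped model may not pickle cleanly, so treat this as a sketch rather than a drop-in solution):
from multiprocessing import Pool

def run_model(name_model):
    # unpack one (name, GridSearchCV) tuple inside the worker
    name, model = name_model
    scores = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    model.fit(X_train, Y_train)
    return name, scores.mean(), model.best_params_

if __name__ == '__main__':
    with Pool(processes=6) as pool:  # one worker per model
        for name, mean_acc, best_params in pool.map(run_model, models):
            print("Nested CV Accuracy %s: %f" % (name, mean_acc * 100))
            print("Best Parameters:", best_params)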
While using the sklearn wrapper, this is pretty easy for me to do this way:
import xgboost as xgb
clf = xgb.XGBClassifier( n_estimators=1500, learning_rate=0.015, gamma =0.3, min_child_weight = 3,nthread = 15,max_depth=150,
subsample=0.9, colsample_bytree=0.8, seed=2100, eval_metric = "rmse")
VALID = True
if VALID == True:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.19, random_state=23)
    model = xgb.train(X_train, y_train, params,
                      evallist=[(X_valid, y_valid)],
                      verbose_eval=50,
                      early_stopping_rounds=50)
However, I can't set it using the standard (native) xgboost interface:
params = {
'objective' : 'gpu:reg:linear',
'learning_rate': 0.02,
'gamma' : 0.3,
'min_child_weight' : 3,
'nthread' : 15,
'max_depth' : 30,
'subsample' : 0.9,
'colsample_bytree' : 0.8,
'seed':2100,
'eval_metric' : "rmse",
'num_boost_round' : 300
}
VALID = True
if VALID == True:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.19, random_state=23)
    model = xgb.train(X_train, y_train, params,
                      evallist=[(X_valid, y_valid)],
                      verbose_eval=50,
                      early_stopping_rounds=50)
    # error: TypeError: train() got an unexpected keyword argument 'evallist'
You just need to specify the parameters correctly:
params = {
#'objective' : 'gpu:reg:linear',
'tree_method':'gpu_hist',
'learning_rate': 0.02,
'gamma' : 0.3,
'min_child_weight' : 3,
'nthread' : 15,
'max_depth' : 30,
'subsample' : 0.9,
'colsample_bytree' : 0.8,
'seed':2100,
'eval_metric' : "rmse",
'num_boost_round' : 300,
'n_estimators':999,
'max_leaves': 300
}
VALID = True
if VALID == True:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.19, random_state=23)
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    #del X_train, X_valid, y_train, y_valid; gc.collect()
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    model = xgb.train(params, tr_data, 300, watchlist, maximize=False,
                      early_stopping_rounds=30, verbose_eval=50)
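For comparison, a sketch of roughly the same setup through the scikit-learn wrapper mentioned at the top of the question. Parameter values are copied from the question; the eval_set / early_stopping_rounds arguments to fit() reflect the wrapper's interface in the xgboost versions of that time, so exact argument names may differ in newer releases:
import xgboost as xgb
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.19, random_state=23)

# parameters go into the constructor, evaluation data into fit()
reg = xgb.XGBRegressor(n_estimators=300, learning_rate=0.02, gamma=0.3,
                       min_child_weight=3, nthread=15, max_depth=30,
                       subsample=0.9, colsample_bytree=0.8, seed=2100,
                       tree_method='gpu_hist')
reg.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        early_stopping_rounds=30,
        verbose=True)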