Related
An accuracy score from Optuna and a score from cross_val_score were different. Why does this occur, and which score should I choose?
I used the hyperparameters that I got from Optuna in cross_val_score.
def objective_lgb(trial):
    """Optuna objective: mean 10-fold CV accuracy of a LightGBM classifier.

    Draws one hyperparameter set from ``trial``, builds a 3-class
    ``LGBMClassifier`` from it and scores it on the module-level
    ``train_x`` / ``train_y`` with a shuffled, seeded 10-fold split.

    :param trial: the Optuna trial used to suggest hyperparameters
    :return: mean accuracy over the 10 folds
    """
    # Parameters are suggested in the same order in which they are listed here.
    sampled = {
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 100),
        "learning_rate": trial.suggest_float('learning_rate', 0.001, 1),
        "n_estimators": trial.suggest_int('n_estimators', 100, 2000),
        "min_child_samples": trial.suggest_int('min_child_samples', 3, 1000),
        "subsample": trial.suggest_float('subsample', 0.000001, 1),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.00000001, 1),
        "reg_alpha": trial.suggest_float('reg_alpha', 0, 400),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 400),
        "importance_type": trial.suggest_categorical('importance_type', ["split", "gain"]),
    }
    classifier = lgb.LGBMClassifier(random_state=1,
                                    objective="multiclass",
                                    num_class=3,
                                    **sampled)
    folds = KFold(n_splits=10, shuffle=True, random_state=1)
    fold_scores = cross_val_score(classifier, train_x, train_y,
                                  n_jobs=-1, cv=folds, scoring='accuracy')
    return fold_scores.mean()
# Run the search: maximise the value returned by objective_lgb over 1500 trials.
lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(objective_lgb, n_trials=1500)
# Report the best trial's objective value and the hyperparameters that produced it.
lgb_trial = lgb_study.best_trial
print("accuracy:", lgb_trial.value)
print()
print("Best params:", lgb_trial.params)
=========================================================
def light_check(x,params):
    """Re-score a LightGBM classifier built from ``params`` with 10-fold CV.

    The estimator gets the same fixed settings (``random_state=1``,
    ``objective="multiclass"``, ``num_class=3``) and the same seeded fold
    split as ``objective_lgb``, so both report comparable accuracies.
    Entries in ``params`` override the fixed settings.

    :param x: feature matrix to cross-validate on
    :param params: dict of ``LGBMClassifier`` keyword arguments, e.g. the
        ``best_trial.params`` of an Optuna study
    :return: ``(scores, mean)`` - per-fold accuracies and their mean

    The target is read from the module-level ``y``.
    """
    settings = {"random_state": 1, "objective": "multiclass", "num_class": 3}
    # Apply the supplied hyperparameters; without this the default model is scored.
    settings.update(params or {})
    model = lgb.LGBMClassifier(**settings)
    scores = cross_val_score(model, x, y,
                             cv=KFold(n_splits=10, shuffle=True, random_state=1),
                             n_jobs=-1, scoring='accuracy')
    mean = scores.mean()
    return scores, mean
light_check(x,{'num_leaves': 230, 'max_depth': 53, 'learning_rate': 0.04037430031226232, 'n_estimators': 1143, 'min_child_samples': 381, 'subsample': 0.12985990464862135, 'colsample_bytree': 0.8914118949904919, 'reg_alpha': 31.869348047391053, 'reg_lambda': 17.45653692887209, 'importance_type': 'split'})
From what I can see, you are using train_x, train_y in the Optuna objective, while in light_check you are passing x and y. Assuming you did a split in some code that is not shown, the data set used for Optuna is smaller and you get a different number.
This is my code
# One grid per classifier; the whole list is handed to every GridSearchCV below.
# NOTE: each pipeline built in the loop holds only ONE classifier step, so the
# grids keyed on the other step names (e.g. 'svc__C' for the KNN pipeline) are
# rejected with "Invalid parameter ..." - the error quoted after this snippet.
param_grid = [{'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]},
{'svc__C': [0.1, 1, 10, 100], 'svc__gamma': [0.001, 0.01, 0.1, 1, 10]},
{'randomforestclassifier__n_estimators': [50, 100, 200, 300 ,400]},
{'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]},
{'adaboostclassifier__n_estimators': [50, 100, 200, 300 ,400]},
{'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}
]
# Inner and outer splitters are configured identically (20% train / 80% test, 5 splits).
inner_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
outer_cv = StratifiedShuffleSplit(test_size=.8, train_size=.2, n_splits=5, random_state=0)
models = [knn, svc, forest, dtc, ada, bag]
model_names = ['knn', 'svc','forest', 'dtc', 'ada', 'bag']
# For each classifier: build a preprocessing pipeline, grid-search it, then
# cross-validate the search itself (nested_score is computed but not printed).
for m, mname in zip(models, model_names):
pipe = make_pipeline(VarianceThreshold(threshold=1),
MinMaxScaler(),
SelectKBest(f_classif, k=20),
m)
grid = GridSearchCV(pipe, param_grid=param_grid, cv=inner_cv)
grid.fit(X_train_test, y_train_test)
nested_score = cross_val_score(grid, X=X_train_test, y=y_train_test.values.ravel(), cv=outer_cv)
print(mname)
print(grid.best_params_)
print(grid.best_score_)
print('\n')
This is the error:
ValueError: Invalid parameter svc for estimator Pipeline(memory=None,
steps=[('variancethreshold', VarianceThreshold(threshold=1)),
('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
('selectkbest',
SelectKBest(k=20,
score_func=<function f_classif at 0x0000019E0A485AF8>)),
('kneighborsclassifier',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=5, p=2,
weights='uniform'))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I don't know what's wrong. I copied the parameter and model names from the named_steps of the pipeline. If I run it without a parameter grid it works, so the problem is most likely there.
Seems to work like this, but I don't like it.
# One parameter grid per classifier. Each key is "<pipeline step name>__<parameter>".
# The list order must line up with `models` / `model_names`, because the three
# lists are zipped together in the loop that follows.
pg1 = {'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7]}
pg2 = {'svc__C': [0.1, 1, 10, 100],
'svc__gamma': [0.001, 0.01, 0.1, 1, 10]}
pg3 = {'randomforestclassifier__n_estimators': [50, 100, 200, 300 ,400]}
pg4 = {'decisiontreeclassifier__max_depth': [12, 25, 50, 75, 100]}
pg5 = {'adaboostclassifier__n_estimators': [50, 100, 200, 300 ,400]}
pg6 = {'baggingclassifier__n_estimators': [50, 100, 200, 300, 400]}
param_grid_list = [pg1, pg2, pg3, pg4, pg5, pg6]
And then the loop becomes:
# Grid-search each classifier inside its own preprocessing pipeline, using only
# the grid that belongs to it, then estimate the tuned model with nested CV.
for m, p, mname in zip(models, param_grid_list, model_names):
    pipe = make_pipeline(VarianceThreshold(threshold=1),
                         MinMaxScaler(),
                         SelectKBest(f_classif, k=20),
                         m)
    grid = GridSearchCV(pipe, param_grid=p, cv=inner_cv)
    # Flatten the target once so fit() and cross_val_score() receive the same 1-D y.
    y_flat = y_train_test.values.ravel()
    grid.fit(X_train_test, y_flat)
    nested_score = cross_val_score(grid, X=X_train_test, y=y_flat, cv=outer_cv)
    print(mname)
    print(grid.best_params_)
    print(grid.best_score_)
    # Report the nested-CV estimate as well; it is the costly part of the loop.
    print(nested_score.mean())
    print('\n')
I am spot-checking a bunch of regression models. How do I fit multiple ML models? Would I use a for loop and call model.fit?
#Variables
alpha= [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#function
def get_models(model=None):
    """Return a dict mapping a short name to an unfitted regressor.

    :param model: optional dict to add the regressors to; a new dict is
        created when omitted, so repeated calls never share state
    :return: the filled dict

    Lasso and Ridge are registered once per value in the module-level
    ``alpha`` list under the keys ``"Lasso_<alpha>"`` / ``"Ridge_<alpha>"``,
    so every regularisation strength is kept rather than only the last one.
    """
    if model is None:
        model = {}
    model['lr'] = LinearRegression()
    for values in alpha:
        model["Lasso_%s" % values] = Lasso(alpha=values)
        model["Ridge_%s" % values] = Ridge(alpha=values)
    model["Huber"] = HuberRegressor()
    model["Lars"] = Lars()
    model["Lasso_l"] = LassoLars()
    model["PA"] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
    model["RANSAC"] = RANSACRegressor()
    model["SGD"] = SGDRegressor(max_iter=1000, tol=1e-3)
    model["theil"] = TheilSenRegressor()
    model["cart"] = DecisionTreeRegressor()
    model["extra"] = ExtraTreeRegressor()
    model["svml"] = SVR(kernel='linear')
    model["svmp"] = SVR(kernel='poly')
    return model
#Loaded data and have X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
#fitting models
Yes, once your dict is filled with get_models(), you can fit the models with a for loop:
# Iterating a dict directly yields its string keys, so loop over the estimators.
for model in models.values():
    model.fit(X_train, y_train)
You can easily loop through several scikit-learn models and do all the fitting too. Try the sample code directly below and take a look at the links towards the bottom of my post.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import datasets
from sklearn.linear_model import SGDClassifier, LogisticRegression, \
Perceptron, PassiveAggressiveClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct, Matern, StationaryKernelMixin, WhiteKernel
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from utilities import *
from universal_params import *
def gen_classification_data(n=None):
    """
    uses the iris data, optionally reduced to about ``n`` samples

    :param n: when given, keep the first ``n // 2`` and the last ``n // 2``
        rows; ``None``, 0 or 1 leaves the data set unchanged
    :return: x, y
    """
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    if n:
        half = int(n/2)
        if half > 0:
            # Stack along axis 0 (samples) and keep the result, so that x and y
            # stay aligned row-for-row.
            x = np.concatenate((x[:half], x[-half:]), 0)
            y = np.concatenate((y[:half], y[-half:]), 0)
    return x, y
# (estimator class, parameter grid) pairs for the linear classifiers.
# `**name` entries unpack grid fragments that are not defined in this file -
# presumably they come from the star-imported `universal_params`; confirm.
linear_models_n_params = [
(SGDClassifier,
{'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'],
'alpha': [0.0001, 0.001, 0.1],
**penalty_12none
}),
(LogisticRegression,
{**penalty_12, **max_iter, **tol, ** warm_start, **C,
'solver': ['liblinear']
}),
(Perceptron,
{**penalty_all, **alpha, **n_iter, **eta0, **warm_start
}),
(PassiveAggressiveClassifier,
{**C, **n_iter, **warm_start,
'loss': ['hinge', 'squared_hinge'],
})
]
# No reduced grid exists for the linear models: the "small" name aliases the full list.
linear_models_n_params_small = linear_models_n_params
# (estimator class, parameter grid) pairs for the SVM classifiers - full grids.
# `**name` entries unpack grid fragments defined outside this file.
svm_models_n_params = [
(SVC,
{**C, **kernel, **degree, **gamma, **coef0, **shrinking, **tol, **max_iter_inf2}),
(NuSVC,
{**nu, **kernel, **degree, **gamma, **coef0, **shrinking, **tol
}),
(LinearSVC,
{ **C, **penalty_12, **tol, **max_iter,
'loss': ['hinge', 'squared_hinge'],
})
]
# Reduced grids for the same three SVM classes, with fewer parameters varied.
svm_models_n_params_small = [
(SVC,
{**kernel, **degree, **shrinking
}),
(NuSVC,
{**nu_small, **kernel, **degree, **shrinking
}),
(LinearSVC,
{ **C_small,
'penalty': ['l2'],
'loss': ['hinge', 'squared_hinge'],
})
]
# (estimator class, parameter grid) pairs for the neighbour-based models.
# NOTE(review): KMeans is imported from sklearn.cluster, so it is a clusterer
# sitting in a list that the run_* helpers pass on with isClassification=True -
# confirm that this is intended.
neighbor_models_n_params = [
(KMeans,
{'algorithm': ['auto', 'full', 'elkan'],
'init': ['k-means++', 'random']}),
(KNeighborsClassifier,
{**n_neighbors, **neighbor_algo, **neighbor_leaf_size, **neighbor_metric,
'weights': ['uniform', 'distance'],
'p': [1, 2]
}),
(NearestCentroid,
{**neighbor_metric,
'shrink_threshold': [1e-3, 1e-2, 0.1, 0.5, 0.9, 2]
}),
(RadiusNeighborsClassifier,
{**neighbor_radius, **neighbor_algo, **neighbor_leaf_size, **neighbor_metric,
'weights': ['uniform', 'distance'],
'p': [1, 2],
'outlier_label': [-1]
})
]
# Gaussian-process classifier grid: four kernel instances are tried, while the
# prediction iteration cap and the optimizer restarts are each fixed to one value.
gaussianprocess_models_n_params = [
(GaussianProcessClassifier,
{**warm_start,
'kernel': [RBF(), ConstantKernel(), DotProduct(), WhiteKernel()],
'max_iter_predict': [500],
'n_restarts_optimizer': [3],
})
]
# GaussianNB is listed with an empty grid, i.e. no parameters are varied.
bayes_models_n_params = [
(GaussianNB, {})
]
# Full MLPClassifier grid. `**name` entries unpack grid fragments defined
# outside this file.
nn_models_n_params = [
(MLPClassifier,
{ 'hidden_layer_sizes': [(16,), (64,), (100,), (32, 32)],
'activation': ['identity', 'logistic', 'tanh', 'relu'],
**alpha, **learning_rate, **tol, **warm_start,
'batch_size': ['auto', 50],
'max_iter': [1000],
'early_stopping': [True, False],
'epsilon': [1e-8, 1e-5]
})
]
# Reduced MLPClassifier grid: fewer layer layouts and activations, early stopping always on.
nn_models_n_params_small = [
(MLPClassifier,
{ 'hidden_layer_sizes': [(64,), (32, 64)],
'batch_size': ['auto', 50],
'activation': ['identity', 'tanh', 'relu'],
'max_iter': [500],
'early_stopping': [True],
**learning_rate_small
})
]
# (estimator class, parameter grid) pairs for the tree-based classifiers - full grids.
# `**name` entries unpack grid fragments defined outside this file.
tree_models_n_params = [
(RandomForestClassifier,
{'criterion': ['gini', 'entropy'],
**max_features, **n_estimators, **max_depth,
**min_samples_split, **min_impurity_split, **warm_start, **min_samples_leaf,
}),
(DecisionTreeClassifier,
{'criterion': ['gini', 'entropy'],
**max_features, **max_depth, **min_samples_split, **min_impurity_split, **min_samples_leaf
}),
(ExtraTreesClassifier,
{**n_estimators, **max_features, **max_depth,
**min_samples_split, **min_samples_leaf, **min_impurity_split, **warm_start,
'criterion': ['gini', 'entropy']})
]
# Reduced grids for the same three classes; 'criterion' is not varied here.
tree_models_n_params_small = [
(RandomForestClassifier,
{**max_features_small, **n_estimators_small, **min_samples_split, **max_depth_small, **min_samples_leaf
}),
(DecisionTreeClassifier,
{**max_features_small, **max_depth_small, **min_samples_split, **min_samples_leaf
}),
(ExtraTreesClassifier,
{**n_estimators_small, **max_features_small, **max_depth_small,
**min_samples_split, **min_samples_leaf})
]
def run_linear_models(x, y, small = True, normalize_x = True):
    """Pass the linear-model grids to ``big_loop`` and return its result.

    :param small: pick the reduced grid list instead of the full one
    :param normalize_x: standard-scale ``x`` before handing it over
    """
    grids = linear_models_n_params_small if small else linear_models_n_params
    features = StandardScaler().fit_transform(x) if normalize_x else x
    return big_loop(grids, features, y, isClassification=True)
def run_svm_models(x, y, small = True, normalize_x = True):
    """Pass the SVM grids to ``big_loop`` and return its result.

    :param small: pick the reduced grid list instead of the full one
    :param normalize_x: standard-scale ``x`` before handing it over
    """
    grids = svm_models_n_params_small if small else svm_models_n_params
    features = StandardScaler().fit_transform(x) if normalize_x else x
    return big_loop(grids, features, y, isClassification=True)
def run_neighbor_models(x, y, normalize_x = True):
    """Pass the neighbour-model grids to ``big_loop`` and return its result.

    :param normalize_x: standard-scale ``x`` before handing it over
    """
    features = StandardScaler().fit_transform(x) if normalize_x else x
    return big_loop(neighbor_models_n_params, features, y, isClassification=True)
def run_gaussian_models(x, y, normalize_x = True):
    """Pass the Gaussian-process grids to ``big_loop`` and return its result.

    :param normalize_x: standard-scale ``x`` before handing it over
    """
    features = StandardScaler().fit_transform(x) if normalize_x else x
    return big_loop(gaussianprocess_models_n_params, features, y, isClassification=True)
def run_nn_models(x, y, small = True, normalize_x = True):
    """Pass the neural-network grids to ``big_loop`` and return its result.

    :param small: pick the reduced grid list instead of the full one
    :param normalize_x: standard-scale ``x`` before handing it over
    """
    grids = nn_models_n_params_small if small else nn_models_n_params
    features = StandardScaler().fit_transform(x) if normalize_x else x
    return big_loop(grids, features, y, isClassification=True)
def run_tree_models(x, y, small = True, normalize_x = True):
    """Pass the tree-model grids to ``big_loop`` and return its result.

    :param small: pick the reduced grid list instead of the full one
    :param normalize_x: standard-scale ``x`` before handing it over
    """
    grids = tree_models_n_params_small if small else tree_models_n_params
    features = StandardScaler().fit_transform(x) if normalize_x else x
    return big_loop(grids, features, y, isClassification=True)
def run_all(x, y, small = True, normalize_x = True, n_jobs=cpu_count()-1):
    """Pass every model family's grids to ``big_loop`` in a single call.

    Families are concatenated in the order linear, neural network,
    Gaussian process (full mode only), neighbour, SVM, tree.

    :param small: use the reduced grids and leave out the Gaussian-process models
    :param normalize_x: standard-scale ``x`` before handing it over
    :param n_jobs: forwarded to ``big_loop``
    :return: whatever ``big_loop`` returns
    """
    if small:
        all_params = (linear_models_n_params_small +
                      nn_models_n_params_small +
                      neighbor_models_n_params +
                      svm_models_n_params_small +
                      tree_models_n_params_small)
    else:
        all_params = (linear_models_n_params +
                      nn_models_n_params +
                      gaussianprocess_models_n_params +
                      neighbor_models_n_params +
                      svm_models_n_params +
                      tree_models_n_params)
    features = StandardScaler().fit_transform(x) if normalize_x else x
    return big_loop(all_params, features, y,
                    isClassification=True, n_jobs=n_jobs)
if __name__ == '__main__':
x, y = gen_classification_data()
run_all(x, y, n_jobs=1)
Here are a couple examples that you can follow.
https://github.com/PyDataBlog/Python-for-Data-Science/blob/master/Tutorials/Yellow%20brick.ipynb
https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf
https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/
I am writing a classification algorithm and I use 6 different models. I want to improve each model by fine-tuning its parameters. The problem I am encountering is related to my "for loop". In fact, I loop over three different dictionaries, but the correspondence between the model I use in the grid search and its parameters is not kept, since the dictionaries are not ordered.
I have failed to find another solution:
here my code and the result as you can see the model_name is different from the param_name so I get multiple erros like for example (ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
)
below the code
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3, 'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
'max_depth': [3, 5, 15, 25]
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]
}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
# Grid used for the MultinomialNB model ('Model_G_NB').
# NOTE(review): 'loss' and 'penalty' are not MultinomialNB constructor
# parameters; they are kept because the output quoted further down
# (24 candidates = 6 * 2 * 2) was produced with them present.
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],  # a comma is required after this entry
                'loss': ['hinge', 'hinge_squarred'],
                'penalty' : ['l1', 'l2']
                }
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = { 'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc, 'Model_G_NB': parameter_NB, 'Model_LR': parameter_LR, 'Model_RF': parameter_LR, 'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
# NOTE: the innermost loop pairs EVERY model with EVERY parameter grid (a full
# cross product of the two dicts), so e.g. LinearSVC is also searched with the
# grid meant for MultinomialNB - the source of the "Invalid parameter alpha for
# estimator LinearSVC" error shown in the output.
for feature_name, feature in features.items():
for model_name, model in models.items():
for param_name, parameter in parameters_dict.items():
clf = GridSearchCV(estimator=model, param_grid=parameter, cv=cv_splitter, verbose = 1, n_jobs = -1, return_train_score=True)
best_model = clf.fit(feature, ylabels)
Output: as you can see, sometimes it works, but other times the parameter grid and the model do not match, which causes the error.
[5 rows x 7 columns]
Feature: vecteur_CV
Model: Model_SVC
Param: Model_SVC
Fitting 10 folds for each of 6 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 out of 60 | elapsed: 2.8s remaining: 0.1s
/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done 60 out of 60 | elapsed: 2.8s finished
Feature: vecteur_CV
Model: Model_SVC
Param: Model_G_NB
Fitting 10 folds for each of 24 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 503, in _fit_and_score
estimator.set_params(**parameters)
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 224, in set_params
(key, self))
ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
Features look like this
X_data, X_data_0, X_data_1, X_data_2 = features_fusion(verbatim, first_arg)
features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0, 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
I used 2 slightly different versions of the Iris dataset and this code below runs (albeit with many warnings during training):
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
data = load_iris()
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3,
'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'max_depth': [2,3, 5, 15, 25],
                'min_samples_split': [3, 5, 10],
                'criterion': ['gini', 'entropy'],
                'n_estimators' : [100, 300],
                # 'auto' was an alias of 'sqrt' for classifiers, so listing it
                # only repeated the same candidates (and newer releases reject it).
                'max_features': ['sqrt','log2'],
                # Real booleans: the strings 'True' and 'False' are both truthy.
                'bootstrap': [True, False],
                }
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty' : ['l1', 'l2'],
                # 'warn' is not a solver name; liblinear supports both penalties above.
                'solver' : ['liblinear'],
                #'dual' : ['True','False'],
                'max_iter' :[100, 110, 120, 130, 140]
                }
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
                #'loss': ['hinge', 'hinge_squarred'],
                #'penalty' : ['l1', 'l2']
                }
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = {'hidden_layer_sizes':[(50,50,50),(50,100,50),(100,)],
                 'activation': ['tanh', 'relu'],
                 'solver': ['sgd', 'adam'],
                 'alpha': [0.0001, 0.05],
                 'learning_rate': ['constant','adaptive'],
                 'max_iter' : [100, 200, 300]
                 }
parameters_dict = {'Model_SVC': parameter_LinearSvc,
'Model_G_NB': parameter_NB,
'Model_LR': parameter_LR, 'Model_RF': parameter_RF,
'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
#features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0,
# 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
features = {'iris': data['data'],'iris_sub':data['data'][:,1:]}
ylabels = data['target']
# Fitted searches keyed by (feature set, model), so that no result is
# overwritten by the next iteration.
fitted_searches = {}
for feature_name, feature in features.items():
    #print(feature_name, feature)
    for model_name in models:
        print('Training model: ', model_name)
        # models and parameters_dict share their keys, so the grid always
        # belongs to the estimator being tuned.
        clf = GridSearchCV(estimator=models[model_name],
                           param_grid=parameters_dict[model_name],
                           cv=cv_splitter, verbose = 1, n_jobs = -1,
                           return_train_score=True)
        best_model = clf.fit(feature, ylabels)
        fitted_searches[(feature_name, model_name)] = best_model
#for feature_name, feature in features.items():
# for model_name, model in models.items():
# for param_name, parameter in parameters_dict.items():
# print(model_name,model,param_name,parameter)
# clf = GridSearchCV(estimator=model, param_grid=parameter,
# cv=cv_splitter, verbose = 1, n_jobs = -1,
# return_train_score=True)
# best_model = clf.fit(feature, ylabels)
I had to comment some model parameters as they gave errors. There was also a typo in your snippet above 'Model_RF': parameter_LR should be 'Model_RF': parameter_RF. I'm not sure if that was the reason of your error. I also removed the inner parameters_dict loop as I could access all elements using the same keys as models.
Following up from my question yesterday (How to speed up nested cross validation in python?), I am trying to run my models in parallel. With my code, is it possible to check whether these models, each requiring only 1 core, run on separate cores in parallel, or will this code always use a single core and run the models one at a time?
I have access to higher-powered computing where I could ask for 6 cores, one core per model; however, I am not sure whether with my code the models would actually be assigned their own core and run in parallel. If that makes sense (apologies if this is completely misinformed, as I am trying to learn as I go, any help would be appreciated). With n_jobs=1 for each of my models in their grid search, can I further specify for each to have its own core?
Here is my code:
dataset= pd.read_csv('data.csv')
# axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
data = dataset.drop(["gene"], axis=1)
# Features: the first 24 columns, missing values replaced by 0, scaled to [0, 1].
df = data.iloc[:,0:24]
df = df.fillna(0)
X = MinMaxScaler().fit_transform(df)
le = preprocessing.LabelEncoder()
# NOTE: this fit is replaced by the fit_transform on data["category"] just below.
encoded_value = le.fit_transform(["certain", "likely", "possible", "unlikely"])
Y = le.fit_transform(data["category"])
# NOTE(review): SMOTE runs on the whole data set before any split, so the
# resampled rows are shared by every later train/test fold - confirm intent.
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_resample(X, Y)
seed = 7
logreg = LogisticRegression(penalty='l1', solver='liblinear',multi_class='auto')
LR_par= {'penalty':['l1'], 'C': [0.5, 1, 5, 10], 'max_iter':[500, 1000, 5000]}
rfc =RandomForestClassifier()
param_grid = {'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4,25],
'min_samples_split': [2, 5, 10, 25],
'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
mlp = MLPClassifier(random_state=seed)
parameter_space = {'hidden_layer_sizes': [(10,20), (10,20,10), (50,)],
'activation': ['tanh', 'relu'],
'solver': ['adam', 'sgd'],
'max_iter': [10000],
'alpha': [0.1, 0.01, 0.001],
'learning_rate': ['constant','adaptive']}
gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25)
param = {"loss":["deviance"],
"learning_rate": [0.15,0.1,0.05,0.01,0.005,0.001],
"min_samples_split": [2, 5, 10, 25],
"min_samples_leaf": [1, 2, 4,25],
"max_depth":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
"max_features":['auto', 'sqrt'],
"criterion": ["friedman_mse"],
"n_estimators":[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}
svm = SVC(gamma="scale", probability=True)
tuned_parameters = {'kernel':('linear', 'rbf'), 'C':(1,0.25,0.5,0.75)}
def baseline_model(optimizer='adam', learn_rate=0.01):
    """Build and compile a dense 100-50-4 softmax classifier.

    The input width is taken from the module-level ``X_res``.

    :param optimizer: forwarded unchanged to ``model.compile``
    :param learn_rate: NOTE(review) accepted but never used in this body, so
        varying it does not change the compiled model - confirm intent
    :return: the compiled ``Sequential`` model
    """
    model = Sequential()
    stack = (
        Dense(100, input_dim=X_res.shape[1], activation='relu'),
        Dropout(0.5),
        Dense(50, activation='relu'),      # second hidden layer, 50 units
        Dense(4, activation='softmax'),    # one output unit per class
    )
    for layer in stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model
keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100, verbose=0)
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
kerasparams = dict(optimizer=optimizer, learn_rate=learn_rate)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
# Each entry pairs a display name with a GridSearchCV that tunes one estimator
# on inner_cv; every search is created with n_jobs=1.
models = []
models.append(('GBM', GridSearchCV(gbm, param, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('RFC', GridSearchCV(rfc, param_grid, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('LR', GridSearchCV(logreg, LR_par, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('SVM', GridSearchCV(svm, tuned_parameters, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('MLP', GridSearchCV(mlp, parameter_space, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('Keras', GridSearchCV(estimator=keras, param_grid=kerasparams, cv=inner_cv,iid=False, n_jobs=1)))
results = []
names = []
scoring = 'accuracy'
X_train, X_test, Y_train, Y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)
for name, model in models:
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
print(msg)
model.fit(X_train, Y_train)
print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
print("Best Parameters: \n{}\n".format(model.best_params_))
print("Best CV Score: \n{}\n".format(model.best_score_))
Edit: I have now tried my for loop as:
def run_models(models):
    """Evaluate one ``(name, GridSearchCV)`` pair; usable as a ``Pool.map`` worker.

    :param models: a single ``(name, model)`` tuple, i.e. one element of the
        module-level ``models`` list (the parameter keeps its original name)
    :return: ``(name, nested_cv_results)`` so the calling process can collect
        the scores
    """
    # Unpack the work item instead of reading `name` / `model` globals left
    # over from an earlier loop.
    name, model = models
    nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
    # These appends only change the lists of the process running this call;
    # under Pool.map rely on the return value instead.
    results.append(nested_cv_results)
    names.append(name)
    msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
    print(msg)
    model.fit(X_train, Y_train)
    print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
    #print("Best Estimator: \n{}\n".format(model.best_estimator_))
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best CV Score: \n{}\n".format(model.best_score_))
    return name, nested_cv_results
if __name__ == '__main__':
    # The context manager shuts the worker pool down once map() has returned.
    with Pool() as pool:  # Create a multiprocessing Pool
        pool.map(run_models, models)
However this seems to run indefinitely with no errors but no output
Edit: on trying to changing multiprocessing to not be IDLE I have tried:
# NOTE: the body reads `model` and `name`, not the `models` argument that
# pool.map supplies to each call.
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
return msg, model.best_params_
# NOTE: `model` is one GridSearchCV taken from the (name, model) pairs, but
# pool.map needs an iterable of work items - hence
# "TypeError: 'GridSearchCV' object is not iterable".
for name, model in models:
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
pool.map(run_models, model)
However this gives the error:
TypeError: 'GridSearchCV' object is not iterable