How set learning xgboost with evaluation set? - python

While using sklearn wrapper this is pretty easy to do for me this way:
import xgboost as xgb
clf = xgb.XGBClassifier( n_estimators=1500, learning_rate=0.015, gamma =0.3, min_child_weight = 3,nthread = 15,max_depth=150,
subsample=0.9, colsample_bytree=0.8, seed=2100, eval_metric = "rmse")
VALID = True
if VALID == True:
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size = 0.19, random_state=23)
model = xgb.train(X_train, y_train, params,
evallist = [(X_valid, y_valid)],
verbose_eval = 50,
early_stopping_rounds=50)
However I cant set it using standart class of xgboost:
params = {
'objective' : 'gpu:reg:linear',
'learning_rate': 0.02,
'gamma' : 0.3,
'min_child_weight' : 3,
'nthread' : 15,
'max_depth' : 30,
'subsample' : 0.9,
'colsample_bytree' : 0.8,
'seed':2100,
'eval_metric' : "rmse",
'num_boost_round' : 300
}
VALID = True
if VALID == True:
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size = 0.19, random_state=23)
model = xgb.train(X_train, y_train, params,
evallist = [(X_valid, y_valid)],
verbose_eval = 50,
early_stopping_rounds=50)
#error TypeError: train() got an unexpected keyword argument 'evallist'

Just need to specify parametrs correctly:
params = {
#'objective' : 'gpu:reg:linear',
'tree_method':'gpu_hist',
'learning_rate': 0.02,
'gamma' : 0.3,
'min_child_weight' : 3,
'nthread' : 15,
'max_depth' : 30,
'subsample' : 0.9,
'colsample_bytree' : 0.8,
'seed':2100,
'eval_metric' : "rmse",
'num_boost_round' : 300,
'n_estimators':999,
'max_leaves': 300
}
VALID = True
if VALID == True:
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size = 0.19, random_state=23)
tr_data = xgb.DMatrix(X_train, y_train)
va_data = xgb.DMatrix(X_valid, y_valid)
#del X_train, X_valid, y_train, y_valid ; gc.collect()
watchlist = [(tr_data, 'train'), (va_data, 'valid')]
model = xgb.train(params, tr_data, 300, watchlist, maximize=False, early_stopping_rounds = 30, verbose_eval=50)

Related

Optuna score vs Cross_val_score?

A accuracy score from optuna and a score in cross_val_score were different. Why does it occuer and which score should I choose?
I used the hyperparameters that I got in optuna in cross_val_score.
def objective_lgb(trial):
num_leaves = trial.suggest_int("num_leaves", 2, 1000)
max_depth = trial.suggest_int("max_depth", 2, 100)
learning_rate = trial.suggest_float('learning_rate', 0.001, 1)
n_estimators = trial.suggest_int('n_estimators', 100, 2000)
min_child_samples = trial.suggest_int('min_child_samples', 3, 1000)
subsample = trial.suggest_float('subsample', 0.000001, 1)
colsample_bytree = trial.suggest_float('colsample_bytree', 0.00000001, 1)
reg_alpha = trial.suggest_float('reg_alpha', 0, 400)
reg_lambda = trial.suggest_float("reg_lambda", 0, 400)
importance_type = trial.suggest_categorical('importance_type', ["split", "gain"])
lgb_clf = lgb.LGBMClassifier(random_state=1,
objective="multiclass",
num_class = 3,
importance_type=importance_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
min_child_samples=min_child_samples,
subsample=subsample,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda
)
score = cross_val_score(lgb_clf, train_x, train_y, n_jobs=-1, cv=KFold(n_splits=10, shuffle=True, random_state=1), scoring='accuracy')
mean_score = score.mean()
return mean_score
lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(objective_lgb, n_trials=1500)
lgb_trial = lgb_study.best_trial
print("accuracy:", lgb_trial.value)
print()
print("Best params:", lgb_trial.params)
=========================================================
def light_check(x,params):
model = lgb.LGBMClassifier()
scores = cross_val_score(model,x,y,cv=KFold(n_splits=10, shuffle=True, random_state=1),n_jobs=-1)
mean = scores.mean()
return scores, mean
light_check(x,{'num_leaves': 230, 'max_depth': 53, 'learning_rate': 0.04037430031226232, 'n_estimators': 1143, 'min_child_samples': 381, 'subsample': 0.12985990464862135, 'colsample_bytree': 0.8914118949904919, 'reg_alpha': 31.869348047391053, 'reg_lambda': 17.45653692887209, 'importance_type': 'split'})
From what I can see, you are using X_train, y_train in the optuna call, while in light_check you are passing x and y. Assuming you did a spilt in some unknown code, the data set for optuna is smaller and you get a different number.

How to optimize hyperparameters with RandomSearchCV or GridSearchCV in neural network for beginners?

I'm using Python. How can the GridSearchCV or RandomSearchCV be easily integrated in the network training?
I receive the following error code:
AttributeError: 'RandomizedSearchCV' object has no attribute 'fit_generator'
Does anyone have a simple idea to get around this error?
def train_test_splitting(df):
x_train,x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 123, shuffle = False)
return [x_train,x_test, y_train, y_test]
def preprocessing_nn(df,time_period_window, num_oberservations_per_batch ,number_training_features):
TimeseriesGenerator(features, target, length = 6 , sampling_rate = 1, batch_size =1)[0]
win_length = time_period_window #Wie viele Datenpunkte
batch_size = num_oberservations_per_batch # wie viele Beobachtungswertee pro Iteration
num_features = number_training_features
nn_train_generator= TimeseriesGenerator(x_train, y_train, length = win_length , sampling_rate = 1, batch_size = batch_size)
nn_test_generator= TimeseriesGenerator(x_test, y_test, length = win_length , sampling_rate = 1, batch_size = batch_size)
return [nn_train_generator, nn_test_generator]
#Applying Functions:
x_train,x_test, y_train, y_test = train_test_splitting(df)
nn_train_generator,nn_test_generator = preprocessing_nn(df,time_period_window, num_oberservations_per_batch ,number_training_features)
def create_LSTM_network(optimizer='adam'):
model = Sequential()
model.add(LSTM(units=120, return_sequences=True, input_shape=(time_period_window,number_training_features)))
model.add(Dropout(0.5))
model.add(LSTM(units=120, return_sequences=False))
model.add(Dense(1, activation = "relu"))
model.compile(loss='mse', optimizer=optimizer,
metrics=['mae'])
return model
#Wrapper für Scikit Learn zur Benutztung von Randomised CV
LSTM_neural_network = KerasClassifier(build_fn=create_LSTM_network, verbose=1)
#Create hyperparameter space
epochs = sp_randInt(10,500)
batches = sp_randInt(10,100)
optimizers = ['rmsprop','adam','sgd']
hyperparameters = dict(optimizer = optimizers, epochs=epochs, batch_size = batches)
#RandomSearchCV - Creation
grid = RandomizedSearchCV(estimator = LSTM_neural_network, param_distributions=hyperparameters,
cv= 2, n_iter = 5, n_jobs =-1)
grid_result = grid.fit(nn_train_generator, epochs = 50,
validation_data = nn_test_generator,
shuffle=False,
callbacks=[early_stopping])
print(); print(grid_result.best_params_)
AttributeError: 'RandomizedSearchCV' object has no attribute 'fit_generator'

how to use gridsearch and cross validation with differents parameters models?

I write a classification algorythm and I use 6 differents models. I want to improve the model by using finetuning parameters for each model. The problem I am encounering is related to my "for loop". In fact, I loop in three different dictionnaries but the correspondance between the model I use in the gridsearch and the parameters are not keep since the dictionnaries are not ordered;
I seem to failed to find another solution :
here my code and the result as you can see the model_name is different from the param_name so I get multiple erros like for example (ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
)
below the code
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3, 'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
'max_depth': [3, 5, 15, 25]
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]
}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
'loss': ['hinge', 'hinge_squarred'],
'penalty' : ['l1', 'l2']
}
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = { 'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc, 'Model_G_NB': parameter_NB, 'Model_LR': parameter_LR, 'Model_RF': parameter_LR, 'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
for feature_name, feature in features.items():
for model_name, model in models.items():
for param_name, parameter in parameters_dict.items():
clf = GridSearchCV(estimator=model, param_grid=parameter, cv=cv_splitter, verbose = 1, n_jobs = -1, return_train_score=True)
best_model = clf.fit(feature, ylabels)
output : as you can see sometimes it works but other times param and model are not the same which causes the error
[5 rows x 7 columns]
Feature: vecteur_CV
Model: Model_SVC
Param: Model_SVC
Fitting 10 folds for each of 6 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 out of 60 | elapsed: 2.8s remaining: 0.1s
/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done 60 out of 60 | elapsed: 2.8s finished
Feature: vecteur_CV
Model: Model_SVC
Param: Model_G_NB
Fitting 10 folds for each of 24 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/ho/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 503, in _fit_and_score
estimator.set_params(**parameters)
File "/home/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 224, in set_params
(key, self))
ValueError: Invalid parameter alpha for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
Features look like this
`X_data, X_data_0, X_data_1, X_data_2 = features_fusion(verbatim, first_arg)
features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0, 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
I used 2 slightly different versions of the Iris dataset and this code below runs (albeit with many warnings during training):
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
data = load_iris()
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()
model4 = RandomForestClassifier()
model5 = KNeighborsClassifier()
model6 = MLPClassifier(max_iter=300, random_state=1)
models = {'Model_SVC': model1, 'Model_G_NB': model2, 'Model_LR': model3,
'Model_RF': model4, 'Model_KN': model5, 'Model_MLP': model6}
# list of parameters
parameter_RF = {'max_depth': [2,3, 5, 15, 25],
'min_samples_split': [3, 5, 10],
'criterion': ['gini', 'entropy'],
'n_estimators' : [100, 300],
'max_features': ['auto', 'sqrt','log2'],
'bootstrap': ['True', 'False'],
}
parameter_LinearSvc = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
parameter_LR = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty' : ['l1', 'l2'],
'solver' : ['liblinear', 'warn'],
#'dual' : ['True','False'],
'max_iter' :[100, 110, 120, 130, 140]
}
parameter_NB = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
#'loss': ['hinge', 'hinge_squarred'],
#'penalty' : ['l1', 'l2']
}
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
parameter_KNN = dict(n_neighbors=k_range, weights=weight_options)
parameter_MLP = {'hidden_layer_sizes':[(50,50,50),(50,100,50),(100,)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'max_iter' : [100, 200, 300]
}
parameters_dict = {'Model_SVC': parameter_LinearSvc,
'Model_G_NB': parameter_NB,
'Model_LR': parameter_LR, 'Model_RF': parameter_RF,
'Model_KN': parameter_KNN, 'Model_MLP': parameter_MLP}
cv_splitter = KFold(n_splits=10, shuffle=False, random_state=None)
#features = {'vecteur_CV': X_data, 'vecteur_NEG': X_data_0,
# 'Vecteur_NEG_lexique': X_data_1, 'Vecteur_NEG_CV': X_data_2}
features = {'iris': data['data'],'iris_sub':data['data'][:,1:]}
ylabels = data['target']
for feature_name, feature in features.items():
#print(feature_name, feature)
for model_name in models:
print('Training model: ', model_name)
clf = GridSearchCV(estimator=models[model_name],
param_grid=parameters_dict[model_name],
cv=cv_splitter, verbose = 1, n_jobs = -1,
return_train_score=True)
best_model = clf.fit(feature, ylabels)
#for feature_name, feature in features.items():
# for model_name, model in models.items():
# for param_name, parameter in parameters_dict.items():
# print(model_name,model,param_name,parameter)
# clf = GridSearchCV(estimator=model, param_grid=parameter,
# cv=cv_splitter, verbose = 1, n_jobs = -1,
# return_train_score=True)
# best_model = clf.fit(feature, ylabels)
I had to comment some model parameters as they gave errors. There was also a typo in your snippet above 'Model_RF': parameter_LR should be 'Model_RF': parameter_RF. I'm not sure if that was the reason of your error. I also removed the inner parameters_dict loop as I could access all elements using the same keys as models.

How to get nested cross validation to run on multiple cores?

Following up from my question yesterday (How to speed up nested cross validation in python?) I am trying to run models in parallel processing, however with my code is it possible to check if these models, each requiring 1 core only to run on, could run on cores in parallel or whether with this code it will always be 1 core only taking on the models 1 at a time?
I have access to higher powered computing where I could ask for 6 cores, a core per model, however, I am not sure with my code whether the models would actually be assigned their own core and run in parallel. If that makes sense (apologies if this is completely misinformed, as I am trying to learn as I go, any help would be appreciated). With n_jobs=1 for each of my models in their gridsearch can I further specifiy for each to have their own core?
Here is my code:
dataset= pd.read_csv('data.csv')
data = dataset.drop(["gene"],1)
df = data.iloc[:,0:24]
df = df.fillna(0)
X = MinMaxScaler().fit_transform(df)
le = preprocessing.LabelEncoder()
encoded_value = le.fit_transform(["certain", "likely", "possible", "unlikely"])
Y = le.fit_transform(data["category"])
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_resample(X, Y)
seed = 7
logreg = LogisticRegression(penalty='l1', solver='liblinear',multi_class='auto')
LR_par= {'penalty':['l1'], 'C': [0.5, 1, 5, 10], 'max_iter':[500, 1000, 5000]}
rfc =RandomForestClassifier()
param_grid = {'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4,25],
'min_samples_split': [2, 5, 10, 25],
'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
mlp = MLPClassifier(random_state=seed)
parameter_space = {'hidden_layer_sizes': [(10,20), (10,20,10), (50,)],
'activation': ['tanh', 'relu'],
'solver': ['adam', 'sgd'],
'max_iter': [10000],
'alpha': [0.1, 0.01, 0.001],
'learning_rate': ['constant','adaptive']}
gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25)
param = {"loss":["deviance"],
"learning_rate": [0.15,0.1,0.05,0.01,0.005,0.001],
"min_samples_split": [2, 5, 10, 25],
"min_samples_leaf": [1, 2, 4,25],
"max_depth":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
"max_features":['auto', 'sqrt'],
"criterion": ["friedman_mse"],
"n_estimators":[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}
svm = SVC(gamma="scale", probability=True)
tuned_parameters = {'kernel':('linear', 'rbf'), 'C':(1,0.25,0.5,0.75)}
def baseline_model(optimizer='adam', learn_rate=0.01):
model = Sequential()
model.add(Dense(100, input_dim=X_res.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu')) #8 is the dim/ the number of hidden units (units are the kernel)
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
return model
keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100, verbose=0)
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
kerasparams = dict(optimizer=optimizer, learn_rate=learn_rate)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
models = []
models.append(('GBM', GridSearchCV(gbm, param, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('RFC', GridSearchCV(rfc, param_grid, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('LR', GridSearchCV(logreg, LR_par, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('SVM', GridSearchCV(svm, tuned_parameters, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('MLP', GridSearchCV(mlp, parameter_space, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('Keras', GridSearchCV(estimator=keras, param_grid=kerasparams, cv=inner_cv,iid=False, n_jobs=1)))
results = []
names = []
scoring = 'accuracy'
X_train, X_test, Y_train, Y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)
for name, model in models:
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
print(msg)
model.fit(X_train, Y_train)
print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
print("Best Parameters: \n{}\n".format(model.best_params_))
print("Best CV Score: \n{}\n".format(model.best_score_))
Edit: I have now tried my for loop as:
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
print(msg)
model.fit(X_train, Y_train)
print('Test set accuracy: {:.2f}'.format(model.score(X_test, Y_test)*100), '%')
#print("Best Estimator: \n{}\n".format(model.best_estimator_))
print("Best Parameters: \n{}\n".format(model.best_params_))
print("Best CV Score: \n{}\n".format(model.best_score_))
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
pool.map(run_models, models)
However this seems to run indefinitely with no errors but no output
Edit: on trying to changing multiprocessing to not be IDLE I have tried:
def run_models(models):
nested_cv_results = model_selection.cross_val_score(model, X_res, y_res, cv=outer_cv, scoring=scoring)
results.append(nested_cv_results)
names.append(name)
msg = "Nested CV Accuracy %s: %f (+/- %f )" % (name, nested_cv_results.mean()*100, nested_cv_results.std()*100)
return msg, model.best_params_
for name, model in models:
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
pool.map(run_models, model)
However this gives the error:
TypeError: 'GridSearchCV' object is not iterable

Invalid agrument error using hyperas in keras

I have been attempting to use the hyperas wrapper to search for optimal hyperparameters. Right now I am keeping it relatively basic to get it working first. Unfortunately, I am getting an "invalid argument" error. I am working in jupyter notebook.
I have read that I might have to not use jupyter notebook but I would like to continue using it if possible.
import keras
# prevents memory allocation errors when using GPU
from keras import backend as K
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))
from keras.models import Sequential
from keras.layers import Dropout, Dense, Activation
from keras.regularizers import l2
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam, sgd
from keras.activations import relu
from keras.losses import categorical_crossentropy
from keras import metrics
import pandas as pd
import numpy as np
# load data
x_main = pd.read_csv("glioma DB X.csv")
y_main = pd.read_csv("glioma DB Y.csv")
# fill with median (will have to improve later, not done yet)
fill_median =['Surgery_SBRT','df','Dose','Ki67','KPS','BMI','tumor_size']
x_main[fill_median] = x_main[fill_median].fillna(x_main[fill_median].median())
x_main['Neurofc'] = x_main['Neurofc'].fillna(2)
x_main['comorbid'] = x_main['comorbid'].fillna(int(x_main['comorbid'].median()))
# drop surgery
x_main = x_main.drop(['Surgery'], axis=1)
# normalize all x
x_main_normalized = x_main.apply(lambda x: (x-np.mean(x))/(np.std(x)+1e-10))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_main, y_main, test_size=0.3)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5)
params = {'lr': 0.0001,
'batch_size': 30,
'epochs': 100,
'dropout': 0.2,
'weight_regulizer':['l2'],
'optimizer': 'sgd',
'losses': 'categorical_crossentropy',
'activation':'relu',
'last_activation': 'softmax'}
from keras.utils.np_utils import to_categorical
#categorical_labels = to_categorical(int_labels, num_classes=None)
last_layer = 1
if params['losses']=='categorical_crossentropy':
y_train = to_categorical(y_train,num_classes=4)
y_val = to_categorical(y_val,num_classes=4)
y_test = to_categorical(y_test,num_classes=4)
last_layer=4
from hyperopt import Trials, STATUS_OK, tpe
from keras.utils import np_utils
from hyperas import optim
from keras.models import model_from_json
from hyperas.distributions import choice, uniform, conditional
# Data()
def data():
x_train = x_train
x_val = x_val
y_train = y_train
y_val = y_val
return x_train, y_train, x_val, y_val
def model(x_train, y_train, x_val, y_val, layers=[20, 20, 4],
kernel_init ='he_uniform', bias_init ='he_uniform',
batch_norm=True, dropout=True):
model = Sequential()
# layer 1
model.add(Dense({{choice([10, 20, 30, 50])}},
input_dim=x_train.shape[1],
W_regularizer=l2(l2_reg),
kernel_initializer=kernel_init,
bias_initializer=bias_init))
if batch_norm == True:
model.add(BatchNormalization(axis=-1, momentum=momentum, center=True))
model.add(Activation(params['activation']))
if dropout == True:
model.add(Dropout({{uniform(0, 0.5)}}))
# layer 2+
for layer in range(0, len(layers)-1):
model.add(Dense({{choice([10, 20, 30, 50])}},
W_regularizer=l2(l2_reg),
kernel_initializer=kernel_init,
bias_initializer=bias_init))
if batch_norm == True:
model.add(BatchNormalization(axis=-1, momentum=momentum, center=True))
model.add(Activation(params['activation']))
if dropout == True:
model.add(Dropout(params['dropout']))
# Last layer
model.add(Dense(layers[-1], activation=params['last_activation'],
kernel_initializer=kernel_init,
bias_initializer=bias_init))
model.compile(loss=params['losses'],
optimizer=keras.optimizers.adam(lr=params['lr']),
metrics=['accuracy'])
history = model.fit(x_train, y_train,
validation_data=[x_val, y_val],
batch_size={{choice([5, 10, 30, 60])}},
epochs=params['epochs'],
verbose=1)
score, acc = model.evaluate(x_test, y_test, verbose=0)
print('Test accuracy:', acc)
return {'loss': -acc, 'status': STATUS_OK, 'model': model}
history_dict = history.history
return {'model':model, 'status': STATUS_OK, 'history_dict':history_dict, 'loss':-acc}
best_run, best_model = optim.minimize(model=model,
data=data,
algo=tpe.suggest,
max_evals=2,
trials=Trials())
x_train, y_train, x_val, y_val = data()
print("Evalutation of best performing model:")
print(best_model.evaluate(x_val, y_val))
print("Best performing model chosen hyper-parameters:")
print(best_run)
Error:
OSError Traceback (most recent call last)
<ipython-input-7-7cf7a7c755ab> in <module>()
64 algo=tpe.suggest,
65 max_evals=2,
---> 66 trials=Trials())
67
68 x_train, y_train, x_val, y_val = data()
~\Anaconda3\lib\site-packages\hyperas\optim.py in minimize(model, data, algo, max_evals, trials, functions, rseed, notebook_name, verbose, eval_space, return_space)
65 full_model_string=None,
66 notebook_name=notebook_name,
---> 67 verbose=verbose)
68
69 best_model = None
~\Anaconda3\lib\site-packages\hyperas\optim.py in base_minimizer(model, data, functions, algo, max_evals, trials, rseed, full_model_string, notebook_name, verbose, stack)
94 model_str = full_model_string
95 else:
---> 96 model_str = get_hyperopt_model_string(model, data, functions, notebook_name, verbose, stack)
97 temp_file = './temp_model.py'
98 write_temp_files(model_str, temp_file)
~\Anaconda3\lib\site-packages\hyperas\optim.py in get_hyperopt_model_string(model, data, functions, notebook_name, verbose, stack)
176 else:
177 calling_script_file = os.path.abspath(inspect.stack()[stack][1])
--> 178 with open(calling_script_file, 'r') as f:
179 source = f.read()
180
OSError: [Errno 22] Invalid argument: 'C:\\Users\\Michael\\Desktop\\CSV data-20180807T164633Z-001\\CSV data\\<ipython-input-7-7cf7a7c755ab>'
Jeez I need to read more. I think my question is answered on the github site
notebook adjustment
If you find error like "No such file or directory" or OSError, Err22, you may need add notebook_name='simple_notebook'(assume your current notebook name is simple_notebook) in optim.minimize function like this:
best_run, best_model = optim.minimize(model=model,
data=data,
algo=tpe.suggest,
max_evals=5,
trials=Trials(),
notebook_name='simple_notebook')
except I needed to put
notebook='keras_glioma_hyperopt')
the name of my notebook is keras_glioma_hyperopt
Still running into errors but it could be in my set up with my code. I will update as a go along but the above code helped me progressed.
EDIT
finally got it to run.
Problems I ran into:
you really need to put all the data loading into data():
I changed mine to
def data():
# load data
x_main = pd.read_csv("glioma DB X.csv")
y_main = pd.read_csv("glioma DB Y.csv")
# fill with median (will have to improve later, not done yet)
fill_median =['Surgery_SBRT','df','Dose','Ki67','KPS','BMI','tumor_size']
x_main[fill_median] = x_main[fill_median].fillna(x_main[fill_median].median())
x_main['Neurofc'] = x_main['Neurofc'].fillna(2)
x_main['comorbid'] = x_main['comorbid'].fillna(int(x_main['comorbid'].median()))
# drop surgery
x_main = x_main.drop(['Surgery'], axis=1)
# normalize all x
x_main_normalized = x_main.apply(lambda x: (x-np.mean(x))/(np.std(x)+1e-10))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_main, y_main, test_size=0.3)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5)
last_layer = 1
params = {'lr': 0.0001,
'batch_size': 30,
'epochs': 100,
'dropout': 0.2,
'weight_regulizer':['l2'],
'optimizer': 'sgd',
'losses': 'categorical_crossentropy',
'activation':'relu',
'last_activation': 'softmax'}
from keras.utils.np_utils import to_categorical
if params['losses']=='categorical_crossentropy':
y_train = to_categorical(y_train,num_classes=4)
y_val = to_categorical(y_val,num_classes=4)
y_test = to_categorical(y_test,num_classes=4)
last_layer=4
x_train = x_train
x_val = x_val
y_train = y_train
y_val = y_val
return x_train, y_train, x_val, y_val
I also ran into the error
TypeError: 'generator' object is not subscriptable
if you follow the hyperas github the fix is
pip install networkx==1.11

Categories

Resources