How to use cross validation to select model? - python

I'm trying to use cross validation to select the best model. This is my code:
models = []
scaler = StandardScaler()
scaler.fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)
models.append(('LogisticRegression', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RandomForest', RandomForestClassifier()))
models.append(('GaussianNB', GaussianNB()))
models.append(('SVM', svm.SVC()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
for name, model in models:
kfold = KFold(n_splits=n_folds)
cv_results = cross_val_score(model, scaled_x_train, y_train, cv=kfold, scoring='accuracy')
print("%6s %.3f %.3f " % (name, cv_results.mean(), cv_results.std()))
Now I want to add a CNN model that I defined. How can I add it in the cross validation?

Related

Merge these scripts together to iterate over more algorithms

I have 4 types of data.
each one has been pre-processed using:
x1,y1=Standardisation
x2,y2=Normalisation
x3,y3=Rescale
and one is completely unprocessed (x,y).
I have applied logistic regression to each like this:
#Building Logistic Regression model on the UNPROCESSED DATA
from sklearn.metrics import accuracy_score
lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)
lr_predict = lr_model.predict(x_test)
print('Logistic Regression - ',accuracy_score(lr_predict,y_test))
#Building Logistic Regression model on the NORMALISED DATA
from sklearn.linear_model import LogisticRegression
lr_norm = LogisticRegression()
lr_norm.fit(x1_train, y1_train)
y_pred = lr_norm.predict(x1_test)
print("Accuracy of logistic regression on test set with Rescaled features: {:.2f}".format(lr_norm.score(x1_test, y1_test)))
and so on...
I want to make one graph, not sure which, that best represents the performance through its accuracy score, or whatever else there may be... but of the other models I wish to test down below:
svm_model = SVC(kernel='linear')
svm_model.fit(x_train,y_train)
svc_predict = svm_model.predict(x_test)
print('SVM - ',accuracy_score(svc_predict,y_test))
print('\t\t\t\tTRAIN DATA\n')
print(classification_report(y_train, svm_model.predict(x_train), target_names=encoder.inverse_transform([0,1,2])))
print('\n')
print('\t\t\t\tTEST DATA\n')
print(classification_report(y_test, svm_model.predict(x_test), target_names=encoder.inverse_transform([0,1,2])))
nb_model = GaussianNB()
nb_model.fit(x_train,y_train)
nb_predict = nb_model.predict(x_test)
print('Naive bayes - ',accuracy_score(nb_predict,y_test))
dt_model = DecisionTreeClassifier(max_leaf_nodes=3)
dt_model.fit(x_train,y_train)
dt_predict = dt_model.predict(x_test)
print('Decision Tree - ',accuracy_score(dt_predict,y_test))
rfc_model = RandomForestClassifier(max_depth=3)
rfc_model.fit(x_train,y_train)
rfc_predict = rfc_model.predict(x_test)
print('Random Forest - ',accuracy_score(rfc_predict,y_test))
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(x_train,y_train)
knn_predict = knn_model.predict(x_test)
print('knn - ',accuracy_score(knn_predict,y_test))
Hope this makes sense..
#preprare data
pre_processing=[('NOT PROCESSED', None)]
pre_processing.append(('RESCALED', MinMaxScaler(feature_range=(0, 1))))
pre_processing.append(('STANDARDIZED', StandardScaler()))
pre_processing.append(('NORMALIZED', Normalizer()))
# prepare models
models = []
models.append(( 'LR' , LogisticRegression(max_iter=10000)))
models.append(( 'LDA' , LinearDiscriminantAnalysis()))
models.append(( 'KNN' , KNeighborsClassifier()))
models.append(( 'CART' , DecisionTreeClassifier()))
models.append(( 'NB' , GaussianNB()))
models.append(( 'SVM' , SVC(probability=True)))
results = []
names = []
higher_acc=0
standard=0
best_model=''
for process in pre_processing:
globals()['df_'+process[0]] = pd.DataFrame(index=None, columns=None)
for algo in models:
estimators = [process,algo]
model = Pipeline(estimators)
ss = ShuffleSplit(n_splits=10, test_size=test_size, random_state=seed)
names.append(algo[0])
for scoring in performance_metrix:
cv_results = cross_val_score(model, X_train, Y_train, cv=ss, scoring=scoring)
globals()['df_'+process[0]].loc[algo[0],scoring]= '%s\u00B1%s'%(round(cv_results.mean()*100.0,2),round(cv_results.std()*100.0,2))
if performance_metrix.index(scoring)==0:
results.append(cv_results)
if cv_results.mean()*100.0 > higher_acc:
higher_acc=cv_results.mean()*100.0
standard=cv_results.std()*100.0
best_model=process[0], algo[0]
elif cv_results.mean()*100.0 == higher_acc:
if cv_results.std()*100.0 < standard:
higher_acc=cv_results.mean()*100.0
best_model=process[0], algo[0]
print('For %s data we produced:\n\n'%(process[0]),globals()['df_'+process[0]],'\n\n')
# boxplot algorithm comparison
fig = pyplot.figure()
fig.suptitle('Algorithms accuracy comparison for %s data'%(process[0]))
ax = fig.add_subplot(111)
pyplot.boxplot(results[:len(models)])
ax.set_xticklabels(names)
pyplot.show()
# Create a pipeline that standardizes the data then creates a model
print("The overall best performance was the one obtained with %s data, using %s algorithm. \nIt's Accuracy resulted to be %s with a standard deviation of %s" %(best_model[0],best_model[1],round(higher_acc,2),round(standard,2)))
datasets = {
"Unprocessed": (x_train, x_test, y_train, y_test),
"Standardisation": (x1_train, x1_test, y1_train, y1_test),
"Normalisation": (x2_train, x2_test, y2_train, y2_test),
"Rescale": (x3_train, x3_test, y3_train, y3_test),
}
models = {
"Logistic Regression": LogisticRegression(),
"Decision Tree": DecisionTreeClassifier(max_leaf_nodes=3),
"Random Forest": RandomForestClassifier(max_depth=3)
}
def evaluate_model(model, dataset):
x_train, x_test, y_train, y_test = data
model.fit(x_train, y_train)
pred = model.predict(x_test)
return accuracy_score(pred, y_test)
model_scores_for_datasets = {}
for dataset_name, dataset in datasets.items():
dataset_scores = {}
for model_name, model in models.items():
model_score = evaluate_model(model, dataset)
dataset_scores[model_name] = model_score
model_scores_for_datasets[dataset_name] = dataset_scores
Here, model_scores_for_datasets will contain the accuracy results for every dataset for each model and will look something like:
{
"Unprocessed" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Standardisation" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Normalisation" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Rescale" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
}
You now have the results for each dataset and can create your required plots. Something along these line:
for dataset_name, scores in model_scores_for_datasets.items():
# For example:
# dataset_name will be "Unprocessed"
# scores will be a dict like so:
# {
# "Logistic Regression" : 10,
# "Decision Tree": 5,
# "Random Forest": 20
# }
generate_plot(dataset_name scores)
Of course, you need to figure out the generate_plot function. Hope this helps and gives you some idea.

DMatrix in MultiOutput XGBoost Regressor

I am trying to convert a hyperparameter tuning algorithm to a MultiOutput regression setup, can someone please help me create DMatrix for the same. Here is the code for reference:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
if useTrainCV:
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
alg.set_params(n_estimators=cvresult.shape[0])
#Fit the algorithm on the data
alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
#Predict training set:
dtrain_predictions = alg.predict(dtrain[predictors])
dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
#Print model report:
print("\nModel Report")
print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))
feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
Any further clarification required, please comment.TIA!

TypeError: __init__() got an unexpected keyword argument 'n_folds'

results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=num_instances, n_folds=num_folds, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
I guess with new update, n_folds is no longer in use in Kfold. Can anyone help me to overcome the issue?
In the updated library, the n_folds parameter was renamed to n_splits:
https://scikit-learn.org/0.18/whats_new.html#model-selection-enhancements-and-api-changes

TypeError: 'set' object is not subscriptable. 3 CSV files

When trying to build my data set an error of "TypeError: 'set' object is not subscriptable" is received.
dataDir = '/content/drive/My Drive/Colab Notebooks/HW 3/' # Directory with input files
trainFile = 'q2train.csv' # Training examples
labelFile = 'q2label.csv' # Test label
validFile = 'q2valid.csv' # Valid Files
train = pd.read_csv(dataDir+trainFile)
valid = pd.read_csv(dataDir+validFile)
label = pd.read_csv(dataDir+labelFile)
data_sets = {
'train',
'label',
'valid'}
def get_data(data_set_name, test_prop=0.2, seed=2019):
"""returns data for training, testing, and data characteristics"""
data = data_sets[data_set_name]
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=test_prop,
random_state=seed)
nF = X.shape[1] # number of features
nC = len(np.unique(y)) # number of classes
nTrain, nTest = len(y_train), len(y_test)
print("\nData set: %s" %data_set_name)
print("\tNumber of features %d" %nF)
print("\tNumber of output classes = %d" %(nC))
print("\tNumber of training examples = %d" %(nTrain))
print("\tNumber of testing examples = %d" %(nTest))
return X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest
for name in data_set:
X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest = get_data(name)
Any help would be appreciated, thanks in advance.
Use a dictionary:
train = pd.read_csv(dataDir+trainFile)
valid = pd.read_csv(dataDir+validFile)
label = pd.read_csv(dataDir+labelFile)
data_sets = {
'train': train,
'label': label,
'valid': valid
}
Then data_sets[data_set_name] will retrieve the dataset you want.

My gridsearchCV don't work and i don't know why

Hello i have problem with GridSearchCV it works perfectly on mnist_dataset but not on my own data, and i don't know why.
# df = pd.read_csv('bank-full.csv',sep=';')
# print(df.head())
#
# print(df.shape)
#
# print(df.columns)
# print(df.info)
# df.columns = [col.replace('"', '') for col in df.columns]
#
#
# df.drop(columns=['day', 'poutcome'], axis =1 , inplace=True)
#
#
# print(df.head())
# print(df.shape)
#
# le = preprocessing.LabelEncoder()
# df.job = le.fit_transform(df.job)
# df.education = le.fit_transform(df.education)
# df.housing = le.fit_transform(df.housing)
# df.loan = le.fit_transform(df.loan)
# #df.poutcome = le.fit_transform(df.poutcome)
# df.month = le.fit_transform(df.month)
# df.contact = le.fit_transform(df.contact)
# df.marital = le.fit_transform(df.marital)
# df.default = le.fit_transform(df.default)
# df.y = le.fit_transform(df.y)
#
#
#
# print(df.head())
#
# X = df.iloc[:, 0:14]
# y = df.iloc[:, 14]
# X = np.array(X, dtype="float64")
# y = np.array(y,dtype="float64")
#
# scaler = Normalizer()
# X = scaler.fit_transform(X)
#
#
#
#
# x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=0)
# model = LogisticRegression(penalty='l2', max_iter=1000)
# model.fit(x_train, y_train)
# prediction = model.predict(x_test)
# from sklearn.metrics import accuracy_score
# print("ACC: {} ".format(accuracy_score(y_test, prediction)))
#
#
# print(x_train.shape)
#
# nn = Sequential()
# nn.add(Dense(120,input_dim = 14, activation='relu'))
# nn.add(Dense(240,activation='relu'))
#
#
# nn.add(Dense(1))
# nn.add(Activation('sigmoid'))
#
# nn.compile(loss=keras.losses.binary_crossentropy,
# optimizer='sgd',
# metrics=['accuracy'])
#
# nn.fit(x_train, y_train,
# batch_size=10,
# epochs=10,
# verbose=1,
#
# validation_data=(x_test, y_test))
#
# loss_acc = nn.evaluate(x_test, y_test, verbose=0)
# print('Test loss:', loss_acc[0])
# print('Test accuracy:', loss_acc[1])
data = bm.load_data('bank-full.csv')
data = bm.preprocess_data(data)
X,y = bm.split_data(data)
scaler = Normalizer()
X = scaler.fit_transform(X)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=0)
start = time()
model = KerasClassifier(build_fn=nnmodel.create_model())
optimizers = ['rmsprop', 'adam']
init = ['glorot_uniform', 'normal', 'uniform']
epochs = np.array([50, 100, 150])
batches = np.array([5, 10, 20])
param_grid = dict(optimizer=optimizers, nb_epoch=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(x_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))
print("total time:", time() - start)
this commented section, is just a simple keras model, that works perfectrly but below if i try gridSearchCV on this model, it gives me this errors:
https://pastebin.com/mhJLSXAS , for example if i run this program https://www.kaggle.com/shujunge/gridsearchcv-with-keras it works perferctly byt on my data it's not, does somebody know why ?
Scikitlearn builds each time new model. Script has to build a classifier with specific parameters inside the grid search method. So you have to send the method name as an argument, not the result of it.
Probably nnmodel.create_model is your function which creates a new model based on parameters. So try to change:
build_fn=nnmodel.create_model()
To:
build_fn=nnmodel.create_model

Categories

Resources