As a newbie to PyTorch, I am building a neural network to classify faulty water pumps in Tanzania for this competition, and I am using ax-platform for hyperparameter tuning.
Yes, methods such as gradient boosting classifiers and random forests probably work as well as or better than a neural network classifier on this tabular data problem, but I want to practice using PyTorch.
The problem is that when I run optimize from ax-platform, I never get an accuracy score above 54%, and I wish to improve that. The score only jumps around a certain number, as shown here:
I tried some debugging in my evaluate function by printing out predicted and labels:
def evaluate(net, testloader):
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            inputs, labels = map(Variable, data)
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            print("row in testloader")
            print("predicted: ")
            print(predicted)
            print("labels: ")
            print(labels)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # print(str(correct)+"/"+str(total))
    print('Accuracy of the network: %d %%' % (
        100 * correct / total))
    return 100 * correct / total
and net(inputs) is giving a tensor of all ones. It should have been a tensor with varied class predictions from [0, 1, 2].
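Since an all-ones prediction usually means the accuracy is stuck at the frequency of the most common class, a quick sanity check is to count how the predictions and the true labels are distributed (a minimal sketch, separate from the training code below; the helper name is my own):

# Sketch: count predicted vs. true class frequencies on the test loader.
# If the net predicts one class for everything, accuracy will sit at the
# majority-class share, which could explain the plateau I am seeing.
def prediction_histogram(net, loader, num_classes=3):
    pred_counts = torch.zeros(num_classes, dtype=torch.long)
    label_counts = torch.zeros(num_classes, dtype=torch.long)
    with torch.no_grad():
        for inputs, labels in loader:
            _, predicted = torch.max(net(inputs), 1)
            pred_counts += torch.bincount(predicted, minlength=num_classes)
            label_counts += torch.bincount(labels, minlength=num_classes)
    print("predicted class counts:", pred_counts.tolist())
    print("true label counts:", label_counts.tolist())
    print("majority-class baseline: %.1f%%"
          % (100.0 * label_counts.max().item() / label_counts.sum().item()))

prediction_histogram(net, test_loader)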
I have tried:
using SGD and Adam
1, 2 and 3 hidden layers
different numbers of epochs
'Dropout' with different dropout probabilities
disabling biases
different learning rates
different numbers of neurons
different numbers of total trials
different betas and eps for the Adam algorithm
'log_scale = True' in the parameter optimisation
scikit-learn's StandardScaler on train_values and test_values
How can I break the 54% accuracy barrier here? What are the gaps in my knowledge? What bugs in the code are causing this?
Here is the code:
import numpy as np
import pandas as pd
import torch
import torchvision
import torchvision.transforms as transforms
from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.utils.notebook.plotting import render
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from torch import functional, nn, optim
from torch.autograd import Variable
import math
rng =0
ordinalEncorder = OrdinalEncoder()
labelEncoder = LabelEncoder()
standardScaler = StandardScaler()
d = {
"funder": "string","installer": "string",
"wpt_name": "string", "basin": "string",
"subvillage": "string", "region": "string", "lga": "string", "ward": "string",
"public_meeting": "string", "recorded_by": "string", "scheme_management": "string",
"scheme_name": "string", "permit": "string",
"extraction_type": "string", "extraction_type_group": "string", "extraction_type_class": "string",
"management": "string",
"management_group": "string", "payment": "string",
"payment_type": "string",
"water_quality": "string", "quality_group": "string",
"quantity": "string", "quantity_group": "string", "source": "string",
"source_type": "string",
"source_class": "string", "waterpoint_type": "string",
"waterpoint_type_group": "string",
"date_recorded": "string",
}
str_cat= [
"funder","installer",
"wpt_name" , "basin",
"subvillage", "region",
"lga", "ward",
"public_meeting", "recorded_by",
"scheme_management", "scheme_name",
"permit", "extraction_type",
"extraction_type_group",
"extraction_type_class", "management",
"management_group",
"payment", "payment_type", "water_quality", "quality_group",
"quantity",
"quantity_group", "source",
"source_type", "source_class",
"waterpoint_type",
"waterpoint_type_group",
"date_recorded",
]
test_values =pd.read_csv("./test-set-values.csv")
train_values = pd.read_csv("./training-set-values.csv")
train_labels = pd.read_csv("./training-set-labels.csv")
length = train_values.shape[0]
train_batch_size = 100
test_batch_size= 100
train_labels['status_group'] = labelEncoder.fit_transform(train_labels.status_group.astype('string'))
train_values[str_cat] = ordinalEncorder.fit_transform(train_values[str_cat].astype(d).fillna(""))
test_values[str_cat] = ordinalEncorder.fit_transform(test_values[str_cat].astype(d).fillna(""))
train_values[str_cat] = standardScaler.fit_transform(train_values[str_cat])
test_values[str_cat]=standardScaler.fit_transform(test_values[str_cat])
train_values = train_values.astype('float32').values
test_values = test_values.astype('float32').values
train_labels = train_labels.status_group.astype('long').values
x_train,x_test,y_train,y_test = map(torch.from_numpy,
train_test_split(train_values,train_labels,test_size=0.2,
random_state=rng,shuffle=True))
train = torch.utils.data.TensorDataset(x_train,y_train)
test = torch.utils.data.TensorDataset(x_test,y_test)
train_loader = torch.utils.data.DataLoader(train,batch_size=train_batch_size,shuffle = False)
test_loader = torch.utils.data.DataLoader(test,batch_size=test_batch_size,shuffle = False)
class Net(nn.Module):
    def __init__(self, hidden1, hidden2, dropoutProbabilities1):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(40, hidden1)
        self.layer2 = nn.Linear(hidden1, hidden2)
        self.layer3 = nn.Linear(hidden2, 3)
        # self.layer2 = nn.Linear(hidden1, 3)  # layer 2 configuration if layer 3 is not used.
        self.layer4 = nn.Linear(3, 3)
        self.activation = nn.ReLU()
        self.dropout1 = nn.Dropout(p=dropoutProbabilities1)
        # self.dropout2 = nn.Dropout(p=dropoutProbabilities2)
        # self.batchNormalisation1 = nn.BatchNorm1d(hidden1)
        # self.batchNormalisation2 = nn.BatchNorm1d(hidden2)

    def forward(self, x):
        x = self.layer1(x)
        # x = self.batchNormalisation1(x)
        x = self.layer2(x)
        x = self.activation(x)
        x = self.dropout1(x)
        # x = self.batchNormalisation2(x)
        x = self.layer3(x)
        x = self.activation(x)
        x = self.dropout1(x)
        x = self.layer4(x)
        return x

def train(net, parameterization, trainloader):
    optimizer = optim.Adam(net.parameters(),
                           lr=parameterization['lr'],
                           weight_decay=parameterization['weight_decay'],
                           maximize=True)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(5):
        for data in trainloader:
            inputs, labels = map(Variable, data)
            optimizer.zero_grad()
            loss = criterion(net(inputs), labels)
            loss.backward()
            optimizer.step()
    return net

def evaluate(net, testloader):
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            inputs, labels = map(Variable, data)
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            # print("row in testloader")
            # print("predicted: ")
            # print(predicted)
            # print("labels: ")
            # print(labels)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # print(str(correct)+"/"+str(total))
    print('Accuracy of the network: %d %%' % (
        100 * correct / total))
    return 100 * correct / total

def train_evaluate(parameters):
    net = Net(parameters["hidden1"], parameters['hidden2'], parameters['dropoutProbabilities1'])
    # net = Net(parameters["hidden1"], parameters['hidden2'])
    net = train(net, parameters, train_loader)
    return evaluate(net, test_loader)
parameters = [
    {"name": "lr", "type": "range",
     "value_type": 'float', "bounds": [0.001, 0.5], 'log_scale': True},
    {"name": "weight_decay", "type": "range",
     "value_type": 'float', "bounds": [0.1, 0.9999], 'log_scale': True},
    {"name": "hidden1", "type": "range",
     "value_type": 'int', "bounds": [5, 1100], 'log_scale': True},
    {"name": "hidden2", "type": "range",
     "value_type": 'int', "bounds": [5, 1100], 'log_scale': True},
    {"name": "dropoutProbabilities1", "type": "range",
     "value_type": "float", "bounds": [0.1, 0.4]},
]
best_parameters, values, experiment, model = optimize(
    parameters=parameters,
    evaluation_function=train_evaluate,
    objective_name='accuracy',
    minimize=False,
    total_trials=10
)
At the moment I am trying to build a SARIMAX model in Python with exogenous variables.
Unfortunately I am getting this error: "cannot perform reduce with flexible type"
# Assumed imports for this snippet
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd

# Function for Rolling Forecast with Sarima
def rolling_forecast(traindata, test_data, Modell_order=None, Sarima_order=None, eliminate_0=True, exogen=None, exogen_test=None):
    history = [x for x in traindata]
    history_exogen = [x for x in exogen]
    predictions = list()
    for t in range(len(test_data)):
        Sarima_Modell_same = SARIMAX(history, order=Modell_order, seasonal_order=Sarima_order, exog=history_exogen)
        model_fit = Sarima_Modell_same.fit()
        output = model_fit.forecast(steps=1, exog=history_exogen)
        yhat = output[0]
        obs = test_data[t]
        obs_ex = exogen_test[t]
        predictions.append(yhat)
        history.append(obs)
        history_exogen.append(obs_ex)
        # print('predicted=%f, expected=%f' % (yhat, obs))
    series_predicted = pd.Series(predictions, dtype='float64')
    series_predicted.index = test_data.index
    if eliminate_0 is True:
        # Eliminate 0 values --> (not applicable for a differenced time series because of negative values)
        series_predicted = series_predicted.apply(lambda x: x if x > 0 else 0)
        test_data.plot()
        series_predicted.plot(color='red')
    else:
        test_data.plot()
        series_predicted.plot(color='red')
    # print(sqrt(mean_squared_error(test_data, series_predicted)))
Is there any way to do this?
It is mostly about the multivariate part: without the exogenous variable it works, but as soon as I try to include it the error appears.
I would appreciate any help.
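For what it's worth, here is the kind of dtype check I think is relevant (a sketch only, and the suspected cause is just my assumption; exogen stands for whatever exogenous DataFrame/array is passed in). "cannot perform reduce with flexible type" is what numpy raises when it tries to sum or average over a string (non-numeric) array, so it is worth confirming the exogenous data reaches SARIMAX as numbers:

import numpy as np

# Sketch: make sure the exogenous regressors are numeric before fitting.
# A string or otherwise non-numeric dtype here is a common trigger for
# "cannot perform reduce with flexible type" inside numpy reductions.
exog_arr = np.asarray(exogen)
print("exog dtype:", exog_arr.dtype, "shape:", exog_arr.shape)
if exog_arr.dtype.kind not in "fiu":   # not float / int / unsigned int
    exog_arr = exog_arr.astype(float)  # fails loudly if a value is truly non-numeric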
I am trying to use h2o.glm to find the optimal penalty lambda by cross-validation. This is a multinomial model.
However, I see that it is optimizing according to the multinomial deviance. Can I do cross-validation with respect to some other metric, such as misclassification error?
The parameter custom_metric_func is mentioned in the docs, but its description is not clear to me. Is this metric used as the cross-validation score? If so, the docs also state that it is only available in the Python API. Is that really true?
If you are committed to h2o, the most suitable option that keeps you inside the R interface is to set the options keep_cross_validation_models = TRUE and keep_cross_validation_predictions = TRUE. From these you can compute the misclassification error, for each category, on each model fitted with a specific sequence of lambda values. Alternatively, you could loop or lapply through a sequence of lambdas, for example for (i in lambda_vector){ models[[i]] = h2o.glm(..., lambda = i) }. Each of the resulting objects has a confusion matrix, so you can compute the classification error for each category and apply your own selection criterion. The custom metric only works in Python.
If you can use plain R:
To fit a multinomial model with an elastic-net penalty, and if there is no particular reason to be tied to h2o, I would recommend the package glmnet, which provides the command cv.glmnet() with the options family="multinomial" and type.measure="class". This yields a multinomial model chosen by cross-validation on the classification error.
I wrote h2o.glm_custom as a "replacement" for h2o.glm that keeps the cross-validation models, so that a custom selection criterion can be used afterwards, as suggested by @Diegolog. My approach uses h2o.grid. I attempted to include all of the parameters of h2o.glm but simplified some defaults to avoid extra work.
h2o.glm_custom <- function(x,
y,
training_frame,
model_id = NULL,
validation_frame = NULL,
nfolds = 0,
seed = -1,
keep_cross_validation_models = TRUE,
keep_cross_validation_predictions = FALSE,
keep_cross_validation_fold_assignment = FALSE,
fold_assignment = "AUTO",
fold_column = NULL,
random_columns = NULL,
ignore_const_cols = TRUE,
score_each_iteration = FALSE,
offset_column = NULL,
weights_column = NULL,
family = "binomial",
rand_family = c("[gaussian]"),
tweedie_variance_power = 0,
tweedie_link_power = 1,
theta = 1e-10,
solver = "AUTO",
alpha = 0,
early_stopping = TRUE,
nlambdas = 100,
standardize = TRUE,
missing_values_handling = "MeanImputation",
plug_values = NULL,
compute_p_values = FALSE,
remove_collinear_columns = FALSE,
intercept = TRUE,
non_negative = FALSE,
max_iterations = -1,
objective_epsilon = -1,
beta_epsilon = 1e-04,
gradient_epsilon = -1,
link = "family_default",
rand_link = "[identity]",
startval = NULL,
calc_like = FALSE,
HGLM = FALSE,
prior = -1,
lambda_min_ratio = 0.01,
beta_constraints = NULL,
max_active_predictors = -1,
obj_reg = -1,
export_checkpoints_dir = NULL,
balance_classes = FALSE,
class_sampling_factors = NULL,
max_after_balance_size = 5,
max_hit_ratio_k = 0,
max_runtime_secs = 0,
custom_metric_func = NULL) {
# Find lambda_max
model <- h2o.glm(x,
y,
training_frame,
model_id,
validation_frame,
nfolds,
seed,
keep_cross_validation_models,
keep_cross_validation_predictions,
keep_cross_validation_fold_assignment,
fold_assignment,
fold_column,
random_columns,
ignore_const_cols,
score_each_iteration,
offset_column,
weights_column,
family,
rand_family,
tweedie_variance_power,
tweedie_link_power,
theta,
solver,
alpha,
NULL, # lambda
TRUE, # lambda_search
early_stopping,
1, # nlambdas
standardize,
missing_values_handling,
plug_values,
compute_p_values,
remove_collinear_columns,
intercept,
non_negative,
max_iterations,
objective_epsilon,
beta_epsilon,
gradient_epsilon,
link,
rand_link,
startval,
calc_like,
HGLM,
prior,
lambda_min_ratio,
beta_constraints,
max_active_predictors,
obj_reg = obj_reg,
export_checkpoints_dir = export_checkpoints_dir,
balance_classes = balance_classes,
class_sampling_factors = class_sampling_factors,
max_after_balance_size = max_after_balance_size,
max_hit_ratio_k = max_hit_ratio_k,
max_runtime_secs = max_runtime_secs,
custom_metric_func = custom_metric_func)
lambda_max <- model@model$lambda_best
# Perform grid search on lambda, with logarithmic scale
lambda_min <- lambda_max * lambda_min_ratio
grid <- exp(seq(log(lambda_max), log(lambda_min), length.out = nlambdas))
grid_list <- lapply(sapply(grid, list), list)
hyper_parameters <- list(lambda = grid_list)
result <- h2o.grid('glm',
x = x,
y = y,
training_frame = training_frame,
nfolds = nfolds,
family = family,
alpha = alpha,
ignore_const_cols = ignore_const_cols,
hyper_params = hyper_parameters,
seed = seed)
}
Then the following function could be used to select lambda based on misclassification error:
get_cv_means <- function(grid_results) {
  mean_errors <- lapply(grid_results@model_ids, function(id) {
    model <- h2o.getModel(id)
    lambda <- model@parameters$lambda
    err <- as.numeric(model@model$cross_validation_metrics_summary['err', 'mean'])
    data.frame(lambda = lambda, error = err)
  })
  dt <- data.table::rbindlist(mean_errors)
  data.table::setkey(dt, lambda)
  dt
}
Here is a complete example using these functions to select lambda by cross-validation based on misclassification error:
h2o.init()
path <- system.file("extdata", "prostate.csv", package= "h2o")
h2o_df <- h2o.importFile(path)
h2o_df$CAPSULE <- as.factor(h2o_df$CAPSULE)
lambda_min_ratio <- 0.000001
nlambdas <- 100
nfolds <- 20
result <- h2o.glm_custom(x = c("AGE", "RACE", "PSA", "GLEASON"),
y = "CAPSULE",
training_frame = h2o_df,
family = "binomial",
alpha = 1,
nfolds = nfolds,
lambda_min_ratio = lambda_min_ratio,
nlambdas = nlambdas,
early_stopping = TRUE)
tbl <- get_cv_means(result)
Gives:
> head(tbl)
lambda error
1: 2.222376e-07 0.2264758
2: 2.555193e-07 0.2394541
3: 2.937851e-07 0.2380508
4: 3.377814e-07 0.2595451
5: 3.883666e-07 0.2478443
6: 4.465272e-07 0.2595603
Which can be plotted, etc...
ggplot() + geom_line(data = tbl[lambda < 0.00001], aes(x = lambda, y = error))
While passing the exact same parameters to LightGBM's native API and to sklearn's implementation of LightGBM, I am getting different results. Initially, the two gave identical results, but after I made some changes to my code I can't work out why they no longer match: the performance metrics and the feature importances come out differently. Please help me find the mistake; it could be in the way I am using the original library or in the way I am using sklearn's wrapper. Link for an explanation of why we should get identical results: light gbm - python API vs Scikit-learn API
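To make clear what I mean by "the exact same parameters", here is a small, self-contained sketch (toy data and illustrative parameter values, not my real settings) of the parity I expect: as far as I understand, the native booster and LGBMClassifier should produce matching predictions when the aliased names (min_gain_to_split vs min_split_gain, num_boost_round vs n_estimators) and the seed line up. My actual code follows.

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_classification

# Parity sketch with toy data: train the same model through the native API
# and through the sklearn wrapper, then compare the predicted probabilities.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)

params = {
    'objective': 'binary',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 5,
    'min_gain_to_split': 0.01,
    'seed': 42,
    'verbose': -1,
}
n_rounds = 50

booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=n_rounds)

clf = lgb.LGBMClassifier(objective='binary', learning_rate=0.05, num_leaves=31,
                         max_depth=5, min_split_gain=0.01, n_estimators=n_rounds,
                         random_state=42)
clf.fit(X, y)

# Maximum absolute difference between the two sets of probabilities;
# expected to be ~0 when the configurations really are identical.
print(np.abs(booster.predict(X) - clf.predict_proba(X)[:, 1]).max())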
x_train, x_test, y_train, y_test = train_test_split(df_dummy[df_merge.columns], labels, test_size=0.25,random_state=42)
n_folds = 5
lgb_train = lgb.Dataset(x_train, y_train)
def objective(params, n_folds=n_folds):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    print(params)
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['min_child_samples'] = int(params['min_child_samples'])
    params['subsample_freq'] = int(params['subsample_freq'])
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evaluate based on ROC AUC
    cv_results = lgb.cv(params, lgb_train, nfold=n_folds, num_boost_round=10000,
                        early_stopping_rounds=100, metrics='auc')
    # Extract the best score
    best_score = max(cv_results['auc-mean'])
    # Loss must be minimized
    loss = 1 - best_score
    num_iteration = int(np.argmax(cv_results['auc-mean']) + 1)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, num_iteration])
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK, 'estimators': num_iteration}
space = {
'min_child_samples': hp.quniform('min_child_samples', 5, 100, 5),
'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
'max_depth' : hp.quniform('max_depth', 3, 10, 1),
'subsample' : hp.quniform('subsample', 0.6, 1, 0.05),
'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
'subsample_freq': hp.quniform('subsample_freq',0,10,1),
'min_gain_to_split': hp.quniform('min_gain_to_split', 0.01, 0.1, 0.01),
'learning_rate' : 0.05,
'objective' : 'binary',
}
out_file = 'results/gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
writer.writerow(['loss', 'params', 'estimators'])
of_connection.close()
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=10)
bayes_trials_results = sorted(trials.results, key = lambda x: x['loss'])
results = pd.read_csv('results/gbm_trials.csv')
# Sort with best scores on top and reset index for slicing
results.sort_values('loss', ascending = True, inplace = True)
results.reset_index(inplace = True, drop = True)
results.head()
best_bayes_estimators = int(results.loc[0, 'estimators'])
best['max_depth'] = int(best['max_depth'])
best['num_leaves'] = int(best['num_leaves'])
best['min_child_samples'] = int(best['min_child_samples'])
num_boost_round=int(best_bayes_estimators * 1.1)
best['objective'] = 'binary'
best['boosting_type'] = 'gbdt'
best['subsample_freq'] = int(best['subsample_freq'])
#Actual LightGBM
best_gbm = lgb.train(params=best, train_set=lgb_train, num_boost_round=num_boost_round)
print('Plotting feature importances...')
ax = lgb.plot_importance(best_gbm, max_num_features=15)
plt.show()
feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["importance_gain"] = best_gbm.feature_importance(importance_type='gain')
feature_imp["importance_split"] = best_gbm.feature_importance(importance_type='split')
feature_imp.to_clipboard()
y_pred_score = best_gbm.predict(x_test)
roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
    print("threshold is {}".format(threshold))
    y_pred = np.where(y_pred_score>=threshold, 1, 0)
    print(roc_auc_score(y_test,y_pred_score))
    print(f1_score(y_test,y_pred))
    print(accuracy_score(y_test,y_pred))
    print(precision_score(y_test,y_pred))
    print(recall_score(y_test,y_pred))
    roc_auc_score_list.append(roc_auc_score(y_test,y_pred_score))
    f1_score_list.append(f1_score(y_test,y_pred))
    accuracy_score_list.append(accuracy_score(y_test,y_pred))
    precision_score_list.append(precision_score(y_test,y_pred))
    recall_score_list.append(recall_score(y_test,y_pred))
performance_metrics = pd.DataFrame(
{'thresholds':thresholds,
'roc_auc_score':roc_auc_score_list,
'f1_score':f1_score_list,
'accuracy_score':accuracy_score_list,
'precision_score':precision_score_list,
'recall_score':recall_score_list })
performance_metrics.transpose().to_clipboard()
#Sklearn's Implementation of LightGBM
best_sk = dict(best)
del best_sk['min_gain_to_split']
sk_best_gbm = lgb.LGBMClassifier(**best_sk, n_estimators=num_boost_round, learning_rate=0.05, min_split_gain=best['min_gain_to_split'])
sk_best_gbm.fit(x_train, y_train)
sk_best_gbm.get_params()
print('Plotting feature importances...')
ax = lgb.plot_importance(sk_best_gbm, max_num_features=15)
plt.show()
feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["Importance"] = sk_best_gbm.feature_importances_
feature_imp.to_clipboard()
y_pred_score = sk_best_gbm.predict_proba(x_test)[:,1]
roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
    print("threshold is {}".format(threshold))
    y_pred = np.where(y_pred_score>=threshold, 1, 0)
    print(roc_auc_score(y_test,y_pred_score))
    print(f1_score(y_test,y_pred))
    print(accuracy_score(y_test,y_pred))
    print(precision_score(y_test,y_pred))
    print(recall_score(y_test,y_pred))
    roc_auc_score_list.append(roc_auc_score(y_test,y_pred_score))
    f1_score_list.append(f1_score(y_test,y_pred))
    accuracy_score_list.append(accuracy_score(y_test,y_pred))
    precision_score_list.append(precision_score(y_test,y_pred))
    recall_score_list.append(recall_score(y_test,y_pred))
performance_metrics = pd.DataFrame(
{'thresholds':thresholds,
'roc_auc_score':roc_auc_score_list,
'f1_score':f1_score_list,
'accuracy_score':accuracy_score_list,
'precision_score':precision_score_list,
'recall_score':recall_score_list })
performance_metrics.transpose().to_clipboard()
I've already seen the Replace a row by a new DataFrame solution, but it's pretty unclear to me.
I've got a DataFrame of model results with a MultiIndex on the model name and the mode (train/test), which I want to update with the last execution while keeping the other models' results (create the DataFrame if it doesn't exist, or update the row with the same name and mode using the dic variable). Here is my code:
def save_results(dic, path="../ModelsResults"):
    try:
        df_results = pd.read_pickle(path)
        print("Updating ModelResults...")
        df_now = pd.DataFrame.from_dict([dic])
        if df_results.index.isin([(dic["Model"], dic["Mode"])]).any():
            print("\tUpdating Model/Mode...")
            df_now.drop(["Model", "Mode"], axis=1)
            df_results.at[dic["Model"], dic["Mode"]] = df_now
        else:
            print("\tCreating Model/Mode...")
            df_results = df_results.append(df_now)
    except FileNotFoundError:
        print("Creating ModelResults...")
        df_results = pd.DataFrame.from_dict([dic])
        df_results = df_results.set_index(["Model", "Mode"])
    df_results.to_pickle(path)
    print("Done")
    return df_results
Every metric that I want to save is in the dic variable. For example:
dic = {
"Model": "Dummy-PredictingAlwaysZero",
"Mode": "Train",
"MSE": mse ,
"nRMSE": nrmse,
"nDCG#10": ndcg(train["rel"].values, y_pred, k = 10),
"nDCG#50": ndcg(train["rel"].values, y_pred, k = 50)
}
df_results = save_results(dic, path = "./ModelsResults")
And the expected DataFrame is like:
MSE nDCG#10 nDCG#50 nRMSE
Model Mode
Dummy-PredictingAlwaysZero Train 0.08639 0.162948 0.106816 0.293922
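For reference, the "upsert" behaviour I am after can be expressed standalone roughly like this (a sketch with toy values, outside my save_results function):

import pandas as pd

# Sketch of the intended upsert: drop any existing row with the same
# (Model, Mode) key, then append the fresh one (toy values only).
existing = pd.DataFrame(
    [{"Model": "Dummy-PredictingAlwaysZero", "Mode": "Train",
      "MSE": 0.09, "nDCG#10": 0.15, "nDCG#50": 0.10, "nRMSE": 0.30}]
).set_index(["Model", "Mode"])

dic = {"Model": "Dummy-PredictingAlwaysZero", "Mode": "Train",
       "MSE": 0.08639, "nDCG#10": 0.162948, "nDCG#50": 0.106816, "nRMSE": 0.293922}
new_row = pd.DataFrame([dic]).set_index(["Model", "Mode"])

updated = pd.concat([existing.drop(new_row.index, errors="ignore"), new_row])
print(updated)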