I have set up custom training and testing functions in my project so I can customise the training process in detail. I use k-fold cross-validation to evaluate my model. For whatever reason, the model trains correctly for the first fold, and then on the second it throws this error:
tensorflow.python.framework.errors_impl.FailedPreconditionError: Could not find variable _AnonymousVar13. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status=Not found: Resource localhost/_AnonymousVar13/N10tensorflow3VarE does not exist. [[node test_model/dense_2/Tensordot/ReadVariableOp (defined at training_example.py:33) ]] [Op:__inference__train_step_1082]
I have no idea what's happening. I assumed the error arose because of poor initialisation, so I call model.build() with the input shape. I have also tried initialising the graph's weights with a blank tensor, but that didn't work. I also reset the backend at the end of each fold in case there was a conflict with names, but that doesn't do the trick either.
import numpy as np
import sklearn.model_selection
import tensorflow as tf
from tensorflow.python.keras.metrics import Mean, Precision, Recall
from tensorflow.python.keras.optimizer_v2.adam import Adam
n_splits = 5
batch_size = 16
n_epochs = 2
loss_function = tf.keras.losses.BinaryCrossentropy()
optimiser_fn = Adam
metrics = [
Mean(name='loss'),
Precision(name='prec'),
Recall(name='recall'),
]
learning_rate = 1e-2
dense_outputs = [10,10]
activation = 'relu'
class TestModel(tf.keras.Model):
def __init__(self):
super().__init__()
self._dense_ops = [tf.keras.layers.Dense(o) for o in dense_outputs]
self._output = tf.keras.layers.Dense(1)
def call(self, inputs):
hidden = inputs
for l in self._dense_ops:
hidden = l(hidden)
return self._output(hidden)
def _load_fold_sets_for_training(fold, fold_idcs, features, labels, batch_size):
# Get the indices for the sets.
train_idcs, validation_idcs, _ = fold_idcs[fold]
# Get the training data and labels.
training_data = features[train_idcs]
training_labels = labels[train_idcs]
# Load the training, validation and testing sets.
training_set = tf.data.Dataset.from_tensor_slices(
(training_data, training_labels)
)
training_set = training_set.batch(batch_size, drop_remainder=False)
validation_set = tf.data.Dataset.from_tensor_slices(
(features[validation_idcs], labels[validation_idcs])
)
validation_set = validation_set.batch(batch_size, drop_remainder=False)
return training_set, validation_set
@tf.function
def _train_step(batch_samples, batch_labels):
batch_predictions = model(batch_samples, training=True)
loss = loss_function(batch_labels, batch_predictions)
gradients = tf.gradients(loss, model.trainable_variables)
optimiser.apply_gradients(
zip(gradients, model.trainable_variables)
)
batch_predictions = tf.sigmoid(batch_predictions)
metrics[0].update_state(loss)
[m.update_state(batch_labels, batch_predictions) for m in metrics[1:]]
@tf.function
def _inference_step(batch_samples, batch_labels):
batch_predictions = model(batch_samples, training=False)
loss = loss_function(batch_labels, batch_predictions)
batch_predictions = tf.sigmoid(batch_predictions)
metrics[0].update_state(loss)
[m.update_state(batch_labels, batch_predictions) for m in metrics[1:]]
# Generate dataset.
features = np.random.rand(15,1440,1)
labels = np.random.rand(15,1440)
# Set up splits.
kfold = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True)
splits = []
for train_idcs, test_idcs in kfold.split(features):
train_idcs, val_idcs = sklearn.model_selection.train_test_split(train_idcs)
splits += [[train_idcs, val_idcs, test_idcs]]
fold = 0
while fold < n_splits:
# Load datasets for fold.
training_set, validation_set = _load_fold_sets_for_training(fold, splits, features, labels, batch_size)
# Load model.
model = TestModel()
# Build model.
model.build((1440, 1))
# Initialise Adam optimiser.
optimiser = optimiser_fn(learning_rate)
epoch = 0
while epoch < n_epochs:
epoch += 1
# Training.
for batch_features, batch_labels in training_set: _train_step(batch_features, batch_labels)
print(f'fold {fold}: epoch {epoch}:', ' '.join(f'train_{m.name}: {m.result():0.05f}' for m in metrics))
# Validation.
for batch_features, batch_labels in validation_set: _inference_step(batch_features, batch_labels)
print(f'fold {fold}: epoch {epoch}:', ' '.join(f'val_{m.name}: {m.result():0.05f}' for m in metrics))
tf.keras.backend.clear_session()
fold += 1
Any ideas?
The issue was the placement of _train_step and _inference_step. If the two functions are redefined on every fold iteration, the error disappears and the model trains. I don't know why they must be redefined for every fold.
import numpy as np
import sklearn.model_selection
import tensorflow as tf
from tensorflow.python.keras.metrics import Mean, Precision, Recall
from tensorflow.python.keras.optimizer_v2.adam import Adam
n_splits = 5
batch_size = 2
n_epochs = 2
loss_function = tf.keras.losses.BinaryCrossentropy()
optimiser_fn = Adam
metrics = [
Mean(name='loss'),
Precision(name='prec'),
Recall(name='recall'),
]
learning_rate = 1e-2
dense_outputs = [10, 10]
activation = 'relu'
class TestModel(tf.keras.Model):
def __init__(self):
super().__init__()
self._dense_ops = [tf.keras.layers.Dense(o) for o in dense_outputs]
self._output = tf.keras.layers.Dense(1)
def call(self, inputs):
hidden = inputs
for l in self._dense_ops:
hidden = l(hidden)
return self._output(hidden)
def _load_fold_sets_for_training(fold, fold_idcs, features, labels, batch_size):
# Get the indices for the sets.
train_idcs, validation_idcs, _ = fold_idcs[fold]
# Get the training data and labels.
training_data = features[train_idcs]
training_labels = labels[train_idcs]
# Load the training, validation and testing sets.
training_set = tf.data.Dataset.from_tensor_slices(
(training_data, training_labels)
)
training_set = training_set.batch(batch_size, drop_remainder=False)
validation_set = tf.data.Dataset.from_tensor_slices(
(features[validation_idcs], labels[validation_idcs])
)
validation_set = validation_set.batch(batch_size, drop_remainder=False)
return training_set, validation_set
# Generate dataset.
features = np.random.rand(15, 1440, 1)
labels = np.random.rand(15, 1440)
# Set up splits.
kfold = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True)
splits = []
for train_idcs, test_idcs in kfold.split(features):
train_idcs, val_idcs = sklearn.model_selection.train_test_split(train_idcs)
splits += [[train_idcs, val_idcs, test_idcs]]
fold = 0
while fold < n_splits:
@tf.function
def _train_step(batch_samples, batch_labels):
batch_predictions = model(batch_samples, training=True)
loss = loss_function(batch_labels, batch_predictions)
gradients = tf.gradients(loss, model.trainable_variables)
optimiser.apply_gradients(
zip(gradients, model.trainable_variables)
)
batch_predictions = tf.sigmoid(batch_predictions)
metrics[0].update_state(loss)
[m.update_state(batch_labels, batch_predictions) for m in metrics[1:]]
@tf.function
def _inference_step(batch_samples, batch_labels):
batch_predictions = model(batch_samples, training=False)
loss = loss_function(batch_labels, batch_predictions)
batch_predictions = tf.sigmoid(batch_predictions)
metrics[0].update_state(loss)
[m.update_state(batch_labels, batch_predictions) for m in metrics[1:]]
# Load datasets for fold.
training_set, validation_set = _load_fold_sets_for_training(fold, splits, features, labels, batch_size)
# Load model.
model = TestModel()
# Build model.
model.build((1440, 1))
# Initialise Adam optimiser.
optimiser = optimiser_fn(learning_rate)
epoch = 0
while epoch < n_epochs:
epoch += 1
# Training.
for batch_features, batch_labels in training_set: _train_step(batch_features, batch_labels)
print(f'fold {fold}: epoch {epoch}:', ' '.join(f'train_{m.name}: {m.result():0.05f}' for m in metrics))
# Validation.
for batch_features, batch_labels in validation_set: _inference_step(batch_features, batch_labels)
print(f'fold {fold}: epoch {epoch}:', ' '.join(f'val_{m.name}: {m.result():0.05f}' for m in metrics))
tf.keras.backend.clear_session()
fold += 1
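For what it's worth, the reason the redefinition helps appears to be that @tf.function traces a concrete graph on the first call, and that graph captures references to the first fold's variables. After tf.keras.backend.clear_session() destroys those variables and a new TestModel is created, the cached graph still points at the old, now deleted resources, hence the FailedPreconditionError. Creating fresh tf.function objects per fold avoids the stale references. A slightly tidier variant of the same fix is to bundle the per-fold state into a small trainer object; the sketch below is illustrative only (it uses tf.GradientTape rather than tf.gradients, and the FoldTrainer name is made up):
class FoldTrainer:
    def __init__(self, model, optimiser, loss_function, metrics):
        self.model = model
        self.optimiser = optimiser
        self.loss_function = loss_function
        self.metrics = metrics
        # Wrapping here creates new tf.function objects, and therefore new traces, per fold.
        self.train_step = tf.function(self._train_step)
        self.inference_step = tf.function(self._inference_step)
    def _train_step(self, batch_samples, batch_labels):
        with tf.GradientTape() as tape:
            batch_predictions = self.model(batch_samples, training=True)
            loss = self.loss_function(batch_labels, batch_predictions)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimiser.apply_gradients(zip(gradients, self.model.trainable_variables))
        self.metrics[0].update_state(loss)
        for m in self.metrics[1:]:
            m.update_state(batch_labels, tf.sigmoid(batch_predictions))
    def _inference_step(self, batch_samples, batch_labels):
        batch_predictions = self.model(batch_samples, training=False)
        loss = self.loss_function(batch_labels, batch_predictions)
        self.metrics[0].update_state(loss)
        for m in self.metrics[1:]:
            m.update_state(batch_labels, tf.sigmoid(batch_predictions))
# Per fold: trainer = FoldTrainer(TestModel(), optimiser_fn(learning_rate), loss_function, metrics),
# then call trainer.train_step(...) / trainer.inference_step(...) in the epoch loops.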
I am trying to implement Optuna hyperparameter optimization for a PyTorch LSTM, but I do not know how to define my model correctly.
When I just use nn.Linear everything works fine, but when I use nn.LSTMCell I get the following error:
AttributeError: 'tuple' object has no attribute 'dim'
The error is raised because the LSTM cell returns a tuple, not a tensor. But I do not know how to fix it, and I cannot find an example of a PyTorch LSTM with Optuna optimization online.
Here the Model definition:
def build_model_custom(trail):
# Suggest the number of layers of neural network model
n_layers = trail.suggest_int("n_layers", 1, 3)
layers = []
in_features = 20
for i in range(n_layers):
# Suggest the number of units in each layer
out_features = trail.suggest_int("n_units_l{}".format(i), 4, 18)
layers.append(nn.LSTMCell(in_features, out_features))
in_features = out_features
layers.append(nn.Linear(in_features, 2))
return nn.Sequential(*layers)
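As the question notes, nn.LSTMCell returns an (h, c) tuple, so the next layer in the nn.Sequential receives a tuple instead of a tensor. One minimal workaround is sketched below with a hypothetical wrapper class (not part of the original code) that forwards only the hidden-state tensor:
import torch.nn as nn

class LSTMCellLayer(nn.Module):
    # Hypothetical wrapper: runs nn.LSTMCell with its default zero initial state
    # and returns only the hidden state, so it can be chained inside nn.Sequential.
    def __init__(self, in_features, out_features):
        super().__init__()
        self.cell = nn.LSTMCell(in_features, out_features)

    def forward(self, x):
        h, c = self.cell(x)   # LSTMCell returns an (h, c) tuple
        return h              # pass only the tensor to the next layer

# In build_model_custom: layers.append(LSTMCellLayer(in_features, out_features))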
I have implemented an example of Optuna optimizing an LSTM before; I hope it will help you:
def get_best_parameters(args, Dtr, Val):
def objective(trial):
model = TransformerModel(args).to(args.device)
loss_function = nn.MSELoss().to(args.device)
optimizer = trial.suggest_categorical('optimizer',
[torch.optim.SGD,
torch.optim.RMSprop,
torch.optim.Adam])(
model.parameters(), lr=trial.suggest_loguniform('lr', 5e-4, 1e-2))
print('training...')
epochs = 10
val_loss = 0
for epoch in range(epochs):
train_loss = []
for batch_idx, (seq, target) in enumerate(Dtr, 0):
seq, target = seq.to(args.device), target.to(args.device)
optimizer.zero_grad()
y_pred = model(seq)
loss = loss_function(y_pred, target)
train_loss.append(loss.item())
loss.backward()
optimizer.step()
# validation
val_loss = get_val_loss(args, model, Val)
print('epoch {:03d} train_loss {:.8f} val_loss {:.8f}'.format(epoch, np.mean(train_loss), val_loss))
model.train()
return val_loss
sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='minimize')
study.optimize(func=objective, n_trials=5)
pruned_trials = study.get_trials(deepcopy=False,
states=tuple([TrialState.PRUNED]))
complete_trials = study.get_trials(deepcopy=False,
states=tuple([TrialState.COMPLETE]))
best_trial = study.best_trial
print('val_loss = ', best_trial.value)
for key, value in best_trial.params.items():
print("{}: {}".format(key, value))
I implemented a solution myself. I am not sure if it's the most Pythonic way, but it works.
Suggestions for improvement are welcome.
def train_and_evaluate(param, model, trail):
# Load Data
train_dataloader = torch.utils.data.DataLoader(Train_Dataset, batch_size=batch_size)
Test_dataloader = torch.utils.data.DataLoader(Test_Dataset, batch_size=batch_size)
criterion = nn.MSELoss()
optimizer = getattr(optim, param['optimizer'])(model.parameters(), lr= param['learning_rate'])
acc = nn.L1Loss()
# Training Loop
for epoch_num in range(EPOCHS):
# Training
total_loss_train = 0
for train_input, train_target in train_dataloader:
output = model.forward(train_input.float())
batch_loss = criterion(output, train_target.float())
total_loss_train += batch_loss.item()
model.zero_grad()
batch_loss.backward()
optimizer.step()
# Evaluation
total_loss_val = 0
total_mae = 0
with torch.no_grad():
for test_input, test_target in Test_dataloader:
output = model(test_input.float())
batch_loss = criterion(output, test_target)
total_loss_val += batch_loss.item()
batch_mae = acc(output, test_target)
total_mae += batch_mae.item()
accuracy = total_mae/len(Test_Dataset)
# Add prune mechanism
trail.report(accuracy, epoch_num)
if trail.should_prune():
raise optuna.exceptions.TrialPruned()
return accuracy
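Presumably this is wired into an Optuna study roughly as follows. This is only a sketch: it assumes param carries the keys used above ('optimizer', 'learning_rate') and that build_model_custom and train_and_evaluate are the functions from this thread; note the returned value is a mean absolute error, so the study minimises it.
import optuna

def objective(trial):
    # Sample the hyperparameters consumed by train_and_evaluate above.
    param = {
        'optimizer': trial.suggest_categorical('optimizer', ['Adam', 'SGD', 'RMSprop']),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
    }
    model = build_model_custom(trial)               # model definition from the question
    return train_and_evaluate(param, model, trial)  # mean L1 error on the test set

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)
print(study.best_trial.params)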
Being new to deep learning, I am opening this post with a reproducible code example using MNIST, to fully understand how to improve training speed.
I'm using Ubuntu 20.04 LTS and have an RTX 3080. When I don't use batch training and just train on the whole 60,000 samples as below, training finishes in about 6-7 seconds with GPU usage at 99-100%.
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import random
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import timeit
# Set Device function (to GPU)
def set_device():
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
print("GPU is not enabled")
else:
print("GPU is enabled")
return device
DEVICE = set_device()
# set seed function
def set_seed(seed=None, seed_torch=True):
if seed is None:
seed = np.random.choice(2 ** 32)
random.seed(seed)
np.random.seed(seed)
if seed_torch:
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
print(f'Random seed {seed} has been set.')
SEED = 2021
# for DataLoader
def seed_worker(worker_id):
worker_seed = torch.initial_seed() % 2**32
np.random.seed(worker_seed)
random.seed(worker_seed)
# Download Mnist datasets
train_data = datasets.MNIST(
root='data',
train=True,
transform=ToTensor(),
download=True,
)
test_data = datasets.MNIST(
root='data',
train=False,
transform=ToTensor()
)
X = train_data.data.reshape(60000, -1).float()
y = train_data.targets
X_test = test_data.data.reshape(10000, -1).float()
y_test = test_data.targets
# Simple Neural Net
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# define layers
self.layers = nn.Sequential(
nn.Linear(784, 600),
nn.ReLU(),
nn.Linear(600, 300),
nn.ReLU(),
nn.Linear(300, 100),
nn.ReLU(),
nn.Linear(100, 10)
)
def forward(self, x):
return self.layers(x)
def predict(self, x):
return torch.argmax(self.forward(x), 1)
# simple train
X = X.to(DEVICE)
y = y.to(DEVICE)
X_test = X_test.to(DEVICE)
y_test = y_test.to(DEVICE)
SEED = 2021
set_seed(SEED)
model = Net().to(DEVICE)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9)
loss_list = []
logits = model.forward(X)
loss = loss_function(logits, y)
start1 = timeit.default_timer()
for epoch in range(500):
logits = model.forward(X)
loss = loss_function(logits, y)
loss_list.append(loss.item())
loss.backward()
optimizer.step()
optimizer.zero_grad()
if epoch % 20 == 0:
print(f"epoch {epoch + 1}: loss: {loss:.5f},"
f"train_accuracy: {torch.sum(model.predict(X) == y) / 60000:.3f},"
f"test_accuracy:{torch.sum(model.predict(X_test) == y_test) / 10000:.3f}")
end1 = timeit.default_timer()
print(f"Time: {end1 - start1:.2f} seconds")
But when I use batch training like below, the speed drops significantly: with num_workers=0 it takes 176 seconds to finish training, and with num_workers=4 it takes 216 seconds. In both scenarios, GPU usage hovers around 20-30% and sometimes even lower. So my question is: is it normal to expect this time increase when using batch training, and if so, why should we use batch training? Is it to improve the test accuracy?
Secondly, why does increasing num_workers make training take longer? Is there anything fundamentally wrong with the code? And is it normal to have low GPU usage when doing batch training?
X = train_data.data.reshape(60000, -1).float()
y = train_data.targets
X_test = test_data.data.reshape(10000, -1).float()
y_test = test_data.targets
# Dataloader
g_seed = torch.Generator()
g_seed.manual_seed(SEED)
batch_size = 300
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size,
shuffle=False, num_workers=8,
worker_init_fn=seed_worker,
generator=g_seed)
train_data = TensorDataset(X, y)
train_loader = DataLoader(train_data, batch_size=batch_size, drop_last=True,
shuffle=True, num_workers=8,
worker_init_fn=seed_worker,
generator=g_seed)
def train_test_classification(net, criterion, optimizer, train_loader,
test_loader, num_epochs=1, verbose=True,
training_plot=True, device='cuda'):
net.train()
training_losses = []
for epoch in tqdm(range(num_epochs)): # loop over the dataset multiple times
running_loss = 0.0
for (i, data) in enumerate(train_loader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
inputs = inputs.to(device).float()
labels = labels.to(device).long()
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
if verbose:
training_losses += [loss.item()]
net.eval()
def test(data_loader):
correct = 0
total = 0
for data in data_loader:
inputs, labels = data
inputs = inputs.to(device).float()
labels = labels.to(device).long()
outputs = net(inputs)
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
acc = 100 * correct / total
return total, acc
train_total, train_acc = test(train_loader)
test_total, test_acc = test(test_loader)
if verbose:
print(f"Accuracy on the {train_total} training samples: {train_acc:0.2f}")
print(f"Accuracy on the {test_total} testing samples: {test_acc:0.2f}")
if training_plot:
plt.plot(training_losses)
plt.xlabel('Batch')
plt.ylabel('Training loss')
plt.show()
return train_acc, test_acc
set_seed(SEED)
net = Net().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.005, momentum=0.9)
num_epochs = 500
start = timeit.default_timer()
_, _ = train_test_classification(net, criterion, optimizer, train_loader,
test_loader, num_epochs=num_epochs,
training_plot=True, device=DEVICE)
end = timeit.default_timer()
print(f"Time: {end-start:.2f}")
Low GPU usage can sometimes be due to slow data transfer. Having a large number of workers does not always help, though.
Consider using pin_memory=True in the DataLoader definition. This should speed up the data transfer between CPU and GPU. There is a thread on the PyTorch forum with more details if you want them.
Another option is to pass the argument non_blocking=True to the to() method.
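In code, that advice would look roughly like the sketch below; it assumes the tensors inside train_data are still on the CPU (as in the batched version of the question) and reuses the names net, criterion, optimizer and DEVICE from above:
from torch.utils.data import DataLoader

# pin_memory=True keeps batches in page-locked host memory, which makes
# host-to-device copies faster and allows them to be asynchronous.
train_loader = DataLoader(train_data, batch_size=300, shuffle=True,
                          num_workers=2, pin_memory=True)

for inputs, labels in train_loader:
    # non_blocking=True lets the copy overlap with GPU computation
    # when the source tensor lives in pinned memory.
    inputs = inputs.to(DEVICE, non_blocking=True)
    labels = labels.to(DEVICE, non_blocking=True).long()
    optimizer.zero_grad()
    loss = criterion(net(inputs), labels)
    loss.backward()
    optimizer.step()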
I have the below code for binary classification and it works fine, but I would like to modify the nn.Sequential part and add a BiLSTM layer:
class BertClassifier(nn.Module):
def __init__(self, freeze_bert=False):
super(BertClassifier, self).__init__()
# Specify hidden size of BERT, hidden size of our classifier, and number of labels
D_in, H, D_out = 768, 50, 2
# Instantiate BERT model
self.bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
# Instantiate a one-layer feed-forward classifier
self.classifier = nn.Sequential(nn.Linear(D_in, H),nn.ReLU(),nn.Linear(H, D_out))
# Freeze the BERT model
if freeze_bert:
for param in self.bert.parameters():
param.requires_grad = False
def forward(self, input_ids, attention_mask):
# Feed input to BERT
outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
# Extract the last hidden state of the token `[CLS]` for classification task
last_hidden_state_cls = outputs[0][:, 0, :]
# Feed input to classifier to compute logits
logits = self.classifier(last_hidden_state_cls)
return logits
I have tried to modify the Sequential like this: self.classifier = nn.Sequential(nn.LSTM(D_in, H, batch_first=True, bidirectional=True), nn.ReLU(), nn.Linear(H, D_out)), but then it throws the error RuntimeError: input must have 3 dimensions, got 2 on the line logits = self.classifier(last_hidden_state_cls). I found that I can use nn.ModuleDict instead of nn.Sequential, and I made the below:
self.classifier = nn.ModuleDict({
'lstm': nn.LSTM(input_size=D_in, hidden_size=H,batch_first=True, bidirectional=True ),
'linear': nn.Linear(in_features=H,out_features=D_out)})
But now I'm having issues computing the forward function with this. Can someone advise how I can properly modify the forward function?
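For reference, nn.ModuleDict is indexed like a dictionary, so one possible shape of the forward pass is sketched below. This is only an illustration of the indexing, not necessarily the architecture you want; note that with bidirectional=True the LSTM output has size 2*H on the last dimension, so the Linear layer would need in_features=2*H to match.
def forward(self, input_ids, attention_mask):
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    sequence_output = outputs[0]                               # (batch, seq_len, 768)
    lstm_output, _ = self.classifier['lstm'](sequence_output)  # (batch, seq_len, 2*H)
    last_step = lstm_output[:, -1, :]                          # final time step
    logits = self.classifier['linear'](last_step)              # requires in_features == 2*H
    return logits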
Update: I also installed CUDA, and now when I run the code it returns the error CUDA out of memory. Tried to allocate 16.00 MiB. I tried to lower the batch size, but that doesn't fix the problem. I also tried the below, but that didn't resolve it either. Any advice, please?
import torch, gc
gc.collect()
torch.cuda.empty_cache()
Update with the code:
MAX_LEN = 64
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32
VALID_BATCH_SIZE = 4
file1 = open('MH.txt', 'r')
list_com = []
list_label = []
for line in file1:
possible_labels = 'positive|negative'
label = re.findall(possible_labels, line)
line = re.sub(possible_labels, ' ', line)
line = re.sub('\n', ' ', line)
list_com.append(line)
list_label.append(label[0])
list_tuples = list(zip(list_com, list_label))
file1.close()
labels = ['positive', 'negative']
df = pd.DataFrame(list_tuples, columns=['text', 'label'])
df['label'] = df['label'].map({'positive': 1, 'negative': 0})
for i in range(0,len(df['label'])):
list_label[i] = df['label'][i]
#print(df)
#print(df['label'].value_counts())
X = df.text.values
y = df.label.values
X_train, X_val, y_train, y_val =\
train_test_split(X, y, test_size=0.1, random_state=2020)
def text_preprocessing(text):
# Remove '@name'
text = re.sub(r'(@.*?)[\s]', ' ', text)
# Replace '&amp;' with '&'
text = re.sub(r'&amp;', '&', text)
# Remove trailing whitespace
text = re.sub(r'\s+', ' ', text).strip()
return text
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
input_ids = []
attention_masks = []
for sent in data:
encoded_sent = tokenizer.encode_plus(
text=text_preprocessing(sent), # Preprocess sentence
add_special_tokens=True, # Add `[CLS]` and `[SEP]`
max_length=MAX_LEN, # Max length to truncate/pad
pad_to_max_length=True, # Pad sentence to max length
# return_tensors='pt', # Return PyTorch tensor
return_attention_mask=True # Return attention mask
)
# Add the outputs to the lists
input_ids.append(encoded_sent.get('input_ids'))
attention_masks.append(encoded_sent.get('attention_mask'))
# Convert lists to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
return input_ids, attention_masks
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)
# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
# Create the BertClassfier class
class BertClassifier(nn.Module):
"""Bert Model for Classification Tasks."""
def __init__(self, freeze_bert=False):
"""
@param bert: a BertModel object
@param classifier: a torch.nn.Module classifier
@param freeze_bert (bool): Set `False` to fine-tune the BERT model
"""
super(BertClassifier, self).__init__()
# Specify hidden size of BERT, hidden size of our classifier, and number of labels
D_in, H, D_out = 768, 50, 2
# Instantiate BERT model
self.bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
# Instantiate a one-layer feed-forward classifier
self.classifier = nn.ModuleDict({
'lstm': nn.LSTM(input_size=D_in, hidden_size=H, batch_first=True, bidirectional=True),
'linear': nn.Linear(in_features=H, out_features=D_out)})
# Freeze the BERT model
if freeze_bert:
for param in self.bert.parameters():
param.requires_grad = False
def forward(self, input_ids, attention_mask):
outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
sequence_output = outputs[0]
sequence_output, _ = self.lstm(sequence_output)
linear_output = self.linear(sequence_output[:, -1])
return linear_output
def initialize_model(epochs=4):
# Instantiate Bert Classifier
bert_classifier = BertClassifier(freeze_bert=False)
print(bert_classifier)
# Tell PyTorch to run the model on GPU
bert_classifier.to(device)
# Create the optimizer
optimizer = AdamW(bert_classifier.parameters(), lr=5e-5)
# Total number of training steps
total_steps = len(train_dataloader) * epochs
# Set up the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
return bert_classifier, optimizer, scheduler
# Specify loss function
loss_fn = nn.CrossEntropyLoss()
def set_seed(seed_value=42):
"""Set seed for reproducibility."""
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
"""Train the BertClassifier model."""
# Start training loop
print("Start training...\n")
for epoch_i in range(epochs):
# Print the header of the result table
print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
print("-" * 70)
# Measure the elapsed time of each epoch
t0_epoch, t0_batch = time.time(), time.time()
# Reset tracking variables at the beginning of each epoch
total_loss, batch_loss, batch_counts = 0, 0, 0
# Put the model into the training mode
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
batch_counts += 1
# Load batch to GPU
b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
# Zero out any previously calculated gradients
model.zero_grad()
# Perform a forward pass. This will return logits.
logits = model(b_input_ids, b_attn_mask)
# Compute loss and accumulate the loss values
loss = loss_fn(logits, b_labels)
batch_loss += loss.item()
total_loss += loss.item()
# Perform a backward pass to calculate gradients
loss.backward()
# Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and the learning rate
optimizer.step()
scheduler.step()
# Print the loss values and time elapsed for every 20 batches
if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
# Calculate time elapsed for 20 batches
time_elapsed = time.time() - t0_batch
# Print training results
print(
f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
# Reset batch tracking variables
batch_loss, batch_counts = 0, 0
t0_batch = time.time()
# Calculate the average loss over the entire training data
avg_train_loss = total_loss / len(train_dataloader)
print("-" * 70)
#Evaluation
if evaluation == True:
# After the completion of each training epoch, measure the model's performance
# on our validation set.
val_loss, val_accuracy = evaluate(model, val_dataloader)
# Print performance over the entire training data
time_elapsed = time.time() - t0_epoch
print(
f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
print("-" * 70)
print("\n")
print("Training complete!")
def evaluate(model, val_dataloader):
"""After the completion of each training epoch, measure the model's performance
on our validation set.
"""
# Put the model into the evaluation mode. The dropout layers are disabled during
# the test time.
model.eval()
# Tracking variables
val_accuracy = []
val_loss = []
# For each batch in our validation set...
for batch in val_dataloader:
# Load batch to GPU
b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
# Compute logits
with torch.no_grad():
logits = model(b_input_ids, b_attn_mask)
# Compute loss
loss = loss_fn(logits, b_labels)
val_loss.append(loss.item())
# Get the predictions
preds = torch.argmax(logits, dim=1).flatten()
# Calculate the accuracy rate
accuracy = (preds == b_labels).cpu().numpy().mean() * 100
val_accuracy.append(accuracy)
# Compute the average accuracy and loss over the validation set.
val_loss = np.mean(val_loss)
val_accuracy = np.mean(val_accuracy)
return val_loss, val_accuracy
def accuracy(probs, y_true):
"""
- Print AUC and accuracy on the test set
@params probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
@params y_true (np.array): an array of the true values with shape (len(y_true),)
fpr, tpr, threshold = roc_curve(y_true, preds)
roc_auc = auc(fpr, tpr)
print(f'AUC: {roc_auc:.4f}')
"""
preds = probs[:, 1]
# Get accuracy over the test set
y_pred = np.where(preds >= 0.5, 1, 0)
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
def bert_predict(model, test_dataloader):
"""Perform a forward pass on the trained BERT model to predict probabilities on the test set."""
# Put the model into the evaluation mode. The dropout layers are disabled during the test time.
model.eval()
all_logits = []
# For each batch in our test set...
for batch in test_dataloader:
# Load batch to GPU
b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]
# Compute logits
with torch.no_grad():
logits = model(b_input_ids, b_attn_mask)
all_logits.append(logits)
# Concatenate logits from each batch
all_logits = torch.cat(all_logits, dim=0)
# Apply softmax to calculate probabilities
probs = F.softmax(all_logits, dim=1).cpu().numpy()
return probs
set_seed(42) # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=3)
# start training
train(bert_classifier, train_dataloader, val_dataloader, epochs=3, evaluation=True)
# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, val_dataloader)
# Evaluate the Bert classifier
accuracy(probs, y_val)
Here is my code. I don't know why my train and validation accuracy increase so slowly. Is that normal? I'm new to deep learning; this is my homework. The train and validation values barely change until around loop 500. Is that normal? I changed the learning rate and added weight_decay etc., but I didn't see a difference.
# -*- coding: utf-8 -*-
#Libraries
import torch
import torch.nn.functional as F
from torch import autograd, nn
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, datasets
from torch.utils import data
"""
Olivetti face dataset
"""
from sklearn.datasets import fetch_olivetti_faces
# Olivetti dataset download
olivetti = fetch_olivetti_faces()
train = olivetti.images
label = olivetti.target
X = train
Y = label
print("\nDownload Ok")
"""
Set for train
"""
train_rate = 0.8
X_train = np.zeros([int(train_rate * X.shape[0]),64,64], dtype=float)
Y_train = np.zeros([int(train_rate * X.shape[0])], dtype=int)
X_val = np.zeros([int((1-train_rate) * X.shape[0]+1),64,64], dtype=float)
Y_val = np.zeros([int((1-train_rate) * X.shape[0]+1)], dtype=int)
#Split data for train and validation
ie=0
iv=0
for i in range(X.shape[0]):
if (i%10)/9 <= train_rate:
X_train[ie] = X[i]
Y_train[ie] = Y[i]
ie += 1
else:
X_val[iv] = X[i]
Y_val[iv] = Y[i]
iv += 1
X_train = X_train.reshape(320,-1,64,64)
X_val = X_val.reshape(80,-1,64,64)
print(Y_train.shape)
X_train = torch.Tensor(X_train)
Y_train = torch.Tensor(Y_train)
X_val = torch.Tensor(X_val)
Y_val = torch.Tensor(Y_val)
batch_size = 16
train_loader = torch.utils.data.DataLoader(X_train,
batch_size=batch_size,
)
val_loader = torch.utils.data.DataLoader(X_val,
batch_size=batch_size,
)
class CNNModule(nn.Module):
def __init__(self):
super(CNNModule, self).__init__()
self.conv1 = nn.Conv2d(1, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 13 * 13, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 40)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 16 * 13 * 13)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def make_train(model,dataset,n_iters,gpu):
# Organize data
X_train,Y_train,X_val,Y_val = dataset
kriter = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.03)
#Arrays to save loss and accuracy
tl=np.zeros(n_iters) #For train loss
ta=np.zeros(n_iters) #For train accuracy
vl=np.zeros(n_iters) #For validation loss
va=np.zeros(n_iters) #For validation accuracy
# Convert labels to long
Y_train = Y_train.long()
Y_val = Y_val.long()
# GPU control
if gpu:
X_train,Y_train = X_train.cuda(),Y_train.cuda()
X_val,Y_val = X_val.cuda(),Y_val.cuda()
model = model.cuda() # Parameters to GPU!
print("Using GPU")
else:
print("Using CPU")
# print(X_train.shape)
# print(Y_train.shape)
for i in range(n_iters):
# train forward
train_out = model.forward(X_train)
train_loss = kriter(train_out,Y_train)
# Backward and optimization
train_loss.backward()
optimizer.step()
optimizer.zero_grad()
# Compute train accuracy
train_predict = train_out.cpu().detach().argmax(dim=1)
train_accuracy = (train_predict.cpu().numpy()==Y_train.cpu().numpy()).mean()
# For validation
val_out = model.forward(X_val)
val_loss = kriter(val_out,Y_val)
# Compute validation accuracy
val_predict = val_out.cpu().detach().argmax(dim=1)
val_accuracy = (val_predict.cpu().numpy()==Y_val.cpu().numpy()).mean()
tl[i] = train_loss.cpu().detach().numpy()
ta[i] = train_accuracy
vl[i] = val_loss.cpu().detach().numpy()
va[i] = val_accuracy
# Show result each 5 loop
if i%5==0:
print("Loop --> ",i)
print("Train Loss :",train_loss.cpu().detach().numpy())
print("Train Accuracy :",train_accuracy)
print("Validation Loss :",val_loss.cpu().detach().numpy())
print("Validation Accuracy :",val_accuracy)
model = model.cpu()
#Print result
plt.subplot(2,2,1)
plt.plot(np.arange(n_iters), tl, 'r-')
plt.subplot(2,2,2)
plt.plot(np.arange(n_iters), ta, 'b--')
plt.subplot(2,2,3)
plt.plot(np.arange(n_iters), vl, 'r-')
plt.subplot(2,2,4)
plt.plot(np.arange(n_iters), va, 'b--')
dataset = X_train,Y_train,X_val,Y_val
gpu = True
gpu = gpu and torch.cuda.is_available()
model = CNNModule()
make_train(model,dataset,1000,gpu)
OUTPUT:
Loop --> 0
Train Loss : 3.6910985
Train Accuracy : 0.025
Validation Loss : 3.6908844
Validation Accuracy : 0.025
Loop --> 5
Loop --> 215
Train Loss : 3.6849258
Train Accuracy : 0.025
Validation Loss : 3.6850574
Validation Accuracy : 0.025
Loop --> 500
Train Loss : 3.4057992
Train Accuracy : 0.103125
Validation Loss : 3.5042462
Validation Accuracy : 0.0875
Loop --> 995
Train Loss : 0.007807272
Train Accuracy : 1.0
Validation Loss : 0.64222467
Validation Accuracy : 0.8375
[Output graph image: training/validation loss and accuracy curves]
I don't know if this is the only problem, but please note that you zero the gradients and then do a forward pass over the validation data, which means that gradient information for the validation data can be tracked by the model before the next iteration. The common practice is to create an evaluation method and use it to make predictions over the validation set without tracking gradients, something like:
def eval_model(data, X_val, Y_val):
model.eval()  # this sets the model to be in inference mode (for example if you have BatchNorm or Dropout layers)
with torch.no_grad(): # tells the model to not compute gradients.
val_out = model.forward(X_val)
val_loss = criterion(val_out,Y_val)
# here put some prints or whatever you want to do
model.train() # this returns the model to be in training mode
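Used inline in the training loop above, that would look roughly like this (a sketch reusing kriter, X_val and Y_val from make_train):
# After the optimiser step, evaluate without tracking gradients:
model.eval()
with torch.no_grad():
    val_out = model(X_val)
    val_loss = kriter(val_out, Y_val)
    val_accuracy = (val_out.argmax(dim=1) == Y_val).float().mean().item()
model.train()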
This is an example from the TFLearn documentation. It shows how to combine TFLearn and TensorFlow, using a TFLearn trainer with a regular TensorFlow graph. However, the training, test, and validation accuracy calculations are not accessible.
import tensorflow as tf
import tflearn
...
# User defined placeholders
with tf.Graph().as_default():
# Placeholders for data and labels
X = tf.placeholder(shape=(None, 784), dtype=tf.float32)
Y = tf.placeholder(shape=(None, 10), dtype=tf.float32)
net = tf.reshape(X, [-1, 28, 28, 1])
# Using TFLearn wrappers for network building
net = tflearn.conv_2d(net, 32, 3, activation='relu')
.
.
.
net = tflearn.fully_connected(net, 10, activation='linear')
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits_v2(
logits=net,
labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)
# Initializing the variables
...
# Launch the graph
with tf.Session() as sess:
sess.run(init)
...
for epoch in range(2): # 2 epochs
...
for i in range(total_batch):
batch_xs, batch_ys = mnist_data.train.next_batch(batch_size)
sess.run(optimizer, feed_dict={X: batch_xs, Y: batch_ys})
How do I access the calculated training and validation accuracy at each step in the nested FOR loop?
UPDATE FOR CLARITY:
A solution might be as follows: Using the fit_batch method of the Trainer class, I believe I am calculating the training and validation accuracy during the nested loop.
Does this code calculate the running accuracies as the model trains?
Is there a better way of doing this with TFLearn?
I understand that TensorBoard uses these values. Could I retrieve the values from the event logs?
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
...
network = input_data(shape=[None, image_size, image_size, num_channels],
data_preprocessing=feature_normalization,
data_augmentation=None,
name='input_d')
.
.
.
network = regression(network, optimizer='SGD',
loss='categorical_crossentropy',
learning_rate=0.05, name='targets')
model_dnn_tr = tflearn.DNN(network, tensorboard_verbose=0)
...
with tf.Session(graph=graph) as session:
...
for step in range(num_steps):
...
batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
batch_labels = train_labels[offset:(offset + batch_size), :]
loss = model_dnn_tr.fit_batch({'input_d' : batch_data}, {'targets':
batch_labels})
if (step % 50 == 0):
trainAccr = accuracy(model_dnn_tr.predict({'input_d' :
batch_data}), batch_labels)
validAccr = accuracy(model_dnn_tr.predict({'input_d' :
valid_dataset}), valid_labels)
testAccr = accuracy(model_dnn_tr.predict({'input_d' : test_dataset}),
test_labels)
UPDATE with the correct answer
Could I retrieve the values from the event logs?
TensorBoard does have a means to download the accuracy data, but making use of it during training is problematic.
Does this code calculate the running accuracies as the model trains?
In a word: yes.
The fit_batch method works as one might expect; as does the initial solution I posted below.
However, neither is the prescribed method.
Is there a better way of doing this within TFLearn?
Yes!
In order to track and interact with the training metrics, a training Callback should be implemented.
from tflearn import callbacks as cb
class BiasVarianceStrategyCallback(cb.Callback):
def __init__(self, train_acc_thresh,run_id,rel_err=.1):
""" Note: We are free to define our init function however we please. """
def errThrshld(Tran_accuracy=train_acc_thresh,relative_err=rel_err):
Tran_err = round(1-Tran_accuracy,2)
Test_err = ...
Vald_err = ...
Diff_err = ...
return {'Tr':Tran_err,'Vl':Vald_err,'Ts':Test_err,'Df':Diff_err}
return
def update_acc_df(self,training_state,state):
...
return
def on_epoch_begin(self, training_state):
""" """
...
variance_found = ...
if trn_acc_stall or vld_acc_stall:
print("accuracy increase stalled. training epoch:"...
if trn_lss_mvNup or vld_lss_mvNup:
print("loss began increase training:"...
raise StopIteration
return
if variance_found or bias_found:
print("bias:",bias_found,"variance:",variance_found)
raise StopIteration
return
return
def on_batch_end(self, training_state, snapshot=False):
self.update_acc_df(training_state,"batch")
return
def on_epoch_end(self, training_state):
self.update_acc_df(training_state,"epoch")
return
def on_train_end(self, training_state):
self.update_acc_df(training_state,"train")
self.df = self.df.iloc[0:0]
return
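The callback is then passed to the trainer through the callbacks argument of fit. The sketch below assumes the DNN model and the data names (train_dataset, train_labels, valid_dataset, valid_labels, batch_size) from the snippets above, and the constructor arguments are placeholders:
# Sketch: wiring the callback into training.
monitor_cb = BiasVarianceStrategyCallback(train_acc_thresh=0.95, run_id='mnist_run')

model_dnn_tr = tflearn.DNN(network, tensorboard_verbose=0)
model_dnn_tr.fit({'input_d': train_dataset}, {'targets': train_labels},
                 n_epoch=10, batch_size=batch_size,
                 validation_set=({'input_d': valid_dataset}, {'targets': valid_labels}),
                 show_metric=True, run_id='mnist_run',
                 callbacks=[monitor_cb])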
Initial solution
The most satisfying solution I found thus far:
Uses the dataset object and iterators to feed data.
Not much different from the fit_batch method in the OP.
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
...
graph = tf.Graph()
with graph.as_default():
...
# create a placeholder to dynamically switch between
# validation and training batch sizes
batch_size_x = tf.placeholder(tf.int64)
data_placeholder = tf.placeholder(tf.float32,
shape=(None, image_size, image_size, num_channels))
labels_placeholder = tf.placeholder(tf.float32, shape=(None, num_labels))
# create dataset: one for training and one for test etc
dataset = tf.data.Dataset.from_tensor_slices((data_placeholder,labels_placeholder)).batch(batch_size_x).repeat()
# create a iterator
iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
# get the tensor that will contain data
feature, label = iterator.get_next()
# create the initialisation operations
init_op = iterator.make_initializer(dataset)
valid_data_x = tf.constant(valid_data)
test_data_x = tf.constant(test_data)
# Model.
network = input_data(shape=[None, image_size, image_size, num_channels],
placeholder=data_placeholder,
data_preprocessing=feature_normalization,
data_augmentation=None,
name='input_d')
.
.
.
logits = fully_connected(network,...
# Training computation.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels_placeholder,logits=logits))
# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
prediction = tf.nn.softmax(logits)
...
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
# initialise iterator with train data
feed_dict = {data_placeholder: train_data,
labels_placeholder: train_data_labels,
batch_size_x: batch_size}
session.run(init_op, feed_dict = feed_dict)
for step in range(num_steps):
batch_data,batch_labels = session.run( [feature, label], feed_dict =
feed_dict )
feed_dict2 = {data_placeholder: batch_data, labels_placeholder: batch_labels}
_, l, predictions = session.run([optimizer, loss, prediction],
feed_dict=feed_dict2)
if (step % 50 == 0):
trainAccrMb = accuracy(predictions, batch_labels)
feed_dict = {data_placeholder: valid_data_x.eval(), labels_placeholder: valid_data_labels }
valid_prediction = session.run(prediction,
feed_dict=feed_dict)
validAccr= accuracy(valid_prediction, valid_data_labels)
feed_dict = {data_placeholder: test_data_x.eval(), labels_placeholder:
test_data_labels }#, batch_size_x: len(valid_data)}
test_prediction = session.run(prediction,
feed_dict=feed_dict)
testAccr = accuracy(test_prediction, test_data_labels)