How to reference loss score in LearningRateScheduler

How to reference loss score in LearningRateScheduler - python

I have a working Step_Decay:
def step_decay(epoch):
initial_lr = 0.01
decay_factor=0.1
step_size=1
new_lr = initial_lr * (decay_factor ** np.floor(epoch / step_size))
print("Learning rate: " + str(new_lr))
return new_lr
lr_sched = keras.callbacks.LearningRateScheduler(step_decay)
But I want to stop reducing the learning rate once loss < 0.1.
How can I access epoch loss number to accomplish this?

By keras.callbacks.Callback to record loss history and learning rate during the training procedure.
learning-rate-schedules - Access to loss by step_decay with history loss
def step_decay(epoch):
initial_lrate = 0.1
drop = 0.5
epochs_drop = 10.0
lrate = initial_lrate * math.pow(drop,
math.floor((1+epoch)/epochs_drop))
return lrate
lrate = LearningRateScheduler(step_decay)
class LossHistory(keras.callbacks.Callback):
def on_train_begin(self, logs={}):
self.losses = []
self.lr = []
def on_epoch_end(self, batch, logs={}):
self.losses.append(logs.get(‘loss’))
self.lr.append(step_decay(len(self.losses)))
loss_history = LossHistory()
lrate = LearningRateScheduler(step_decay)
callbacks_list = [loss_history, lrate]
history = model.fit(X_train, y_train,
validation_data=(X_test, y_test),
epochs=epochs,
batch_size=batch_size,
callbacks=callbacks_list,
verbose=2)
Access epoch loss number:
def get_learningrate_metric(optimizer):
def learningrate(y_true, y_pred):
return optimizer.learningrate
return learningrate
x = Input((50,))
out = Dense(1, activation='sigmoid')(x)
model = Model(x, out)
optimizer = Adam(lr=0.001)
learningrate_metric = get_learningrate_metric(optimizer)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc', learningrate_metric])
# reducing the learning rate by half every 2 epochs
cbks = [LearningRateScheduler(lambda epoch: 0.001 * 0.5 ** (epoch // 2)),
TensorBoard(write_graph=False)]
X = np.random.rand(1000, 50)
Y = np.random.randint(2, size=1000)
model.fit(X, Y, epochs=10, callbacks=cbks)
or
Adadelta = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-08, decay=0.1)
Adadelta is an extension of Adagrad that seeks to reduce its aggressive, monotonically decreasing learning rate.
get the learning rate value after every epoch
Or can it help you?
EarlyStopping

Related

Using ray tune `tune.run` with pytorch returns different optimal hyperparameters combination

I've initialized two identical ANN with PyTorch (both as structure and initial parameters), and I've noticed that the hyperparameters setting with Ray Tune, returns different results for the two ANN, even if I didn't have any random initialization.
Someone could explain what I'm doing wrong? I'll attach the code:
ANN Initialization:
class Featrues_model(nn.Module):
def __init__(self, n_inputs, dim_hidden, n_outputs):
super().__init__()
self.fc1 = nn.Linear(n_inputs, dim_hidden)
self.fc2 = nn.Linear(dim_hidden, n_outputs)
def forward(self, X):
X = self.fc1(X)
X = self.fc2(X)
return X
features_model_v1 = Featrues_model(len(list_input_variables),5,6)
features_model_v2 = Featrues_model(len(list_input_variables),5,6)
features_model_v2.load_state_dict(features_model_v1.state_dict())
Hyperpamameters setting
config = {
"lr": tune.choice([1e-2, 1e-5]),
"weight_decay": tune.choice([1e-2, 1e-5]),
"batch_size": tune.choice([16,64]),
"epochs": tune.choice([10,50])
}
Train & Validation Dataframe
trainset = df_final.copy()
test_abs = int(len(trainset) * 0.8)
train_subset, val_subset = random_split(
trainset, [test_abs, len(trainset) - test_abs]
)
df_train = df_final.iloc[train_subset.indices]
df_val = df_final.iloc[val_subset.indices]
Train function design
def setting_model(config, df_train, df_val, model):
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
BATCH_SIZE = config["batch_size"]
for epoch in range(config["epochs"]):
train_epoch_loss = 0
train_epoch_acc = 0
step = 0
for i in tqdm(range(0, df_train.shape[0], BATCH_SIZE)):
batch_X = np.array(
df_train[list_input_variables].iloc[i:i+BATCH_SIZE]
)
batch_X = torch.Tensor([x for x in batch_X])
batch_Y = np.array(
df_train[list_output_variables].iloc[i:i+BATCH_SIZE]
)
batch_Y = torch.Tensor([int(y) for y in batch_Y])
batch_Y = batch_Y.type(torch.int64)
optimizer.zero_grad()
outputs = model.forward(batch_X)
train_loss = criterion(outputs, batch_Y)
train_acc = multi_acc(outputs, batch_Y)
train_loss.backward()
optimizer.step()
train_epoch_loss += train_loss.item()
train_epoch_acc += train_acc.item()
step += 1
# print statistics
print(f"Epochs: {epoch}")
print(f"Train Loss: {train_epoch_loss/len(df_train)}")
print(f"Train Acc: {train_epoch_acc/step}")
print("\n")
# Validation loss
with torch.no_grad():
X_val = np.array(
df_val[list_input_variables]
)
X_val = torch.Tensor([x for x in X_val])
Y_val = np.array(
df_val[list_output_variables]
)
Y_val = torch.Tensor([int(y) for y in Y_val])
Y_val = Y_val.type(torch.int64)
outputs = model.forward(X_val)
_, predicted = torch.max(outputs.data, 1)
total = Y_val.size(0)
correct = (predicted == Y_val).sum().item()
loss = criterion(outputs, Y_val)
tune.report(loss=(loss.numpy()), accuracy=correct / total)
print(f"Validation Loss: {loss.numpy()/len(df_val)}")
print(f"Validation Acc: {correct / total:.3f}")
print("Finished Training")
Hyperparameters Tune
result_v1 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v1),
config=config,
fail_fast="raise",
)
result_v2 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v2),
config=config,
fail_fast="raise"
)
Output
result_v1.get_best_config()
{'lr': 1e-05, 'weight_decay': 1e-05, 'epochs': 1}
result_v2.get_best_config()
{'lr': 0.01, 'weight_decay': 1e-05, 'epochs': 1}

The issue is the use of torch.random under the hood. Since you are not directly providing a weight matrix for your layers, pytorch initializes it for you. Luckily, you can have a reproducible experiment by setting
torch.manual_seed(x) # where x is an integer
One should use only a few random seeds, otherwise you might overfit on the random seed. See lottery ticket hypothesis at https://arxiv.org/abs/1803.03635)

Input contains NaN, infinity or a value too large for dtype('float32'). Pythorch

I try to train model but in vain. I see the error
Input contains NaN, infinity or a value too large for dtype('float32').
I think it can be connected with Mse function, because with MAE it works somehow also with RMSE it works somehow (on the second epoch i have RMSE = 10). I can't figure out what i do wrong.
# Count Nan
df = pd.read_csv('data.txt.zip', header=None)
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
train_size = 463715
X_train = X[:train_size, :]
y_train = y[:train_size]
X_test = X[train_size:, :]
y_test = y[train_size:]
#ToTensor
X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test)
# Create TensorDataset
train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)
val_num = 92743
train_num = 370972
# Divide train data into train and validation data
train_ds, val_ds = random_split(train_ds, [train_num, val_num])
# Evaluate accuracy
def accuracy(y_true, y_pred):
return r2_score(y_true, y_pred)
# create Class
class BaselineModel(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(BaselineModel, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.linear1 = nn.Linear(90, 45)
self.linear2 = nn.Linear(45, 1)
self.linear3 = nn.Linear(45, 15)
self.linear4 = nn.Linear(15, 1)
self.batch = nn.BatchNorm2d(hidden_size)
self.relu = nn.ReLU()
self.lreku = nn.LeakyReLU()
self.elu = nn.ELU()
self.dropout = nn.Dropout(0.5)
def forward(self, x):
x = self.elu(self.linear1(x))
return self.linear2(x)
def training_step(self, criterion, batch):
x_train, y_train = batch
y_pred = self(x_train)
loss = (criterion(y_pred, y_train.unsqueeze(1)))
return loss
def validation_step(self, criterion, batch):
x_val, y_val = batch
y_pred = self(x_val)
loss = (criterion(y_pred, y_val.unsqueeze(1)))
acc = accuracy(y_val, y_pred)
return {'val_loss': loss, 'val_acc': acc}
def validation_epoch_end(self, y_pred):
batch_losses = [x['val_loss'] for x in y_pred]
epoch_loss = torch.stack(batch_losses).mean()
batch_accs = [x['val_acc'] for x in y_pred]
epoch_acc = np.mean(batch_accs)
#epoch_acc = torch.stack(batch_accs).mean()
return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
def epoch_end(self, epoch, result):
print(f"Epoch {epoch}, val_loss: {result['val_loss']}, val_acc: {result['val_acc']} ")
model = BaselineModel(input_size = 90, hidden_size = 45, output_size = 1)
# Evaluate
def evaluate(model, criterion, val_loader):
with torch.no_grad():
y_pred = [model.validation_step(criterion, batch) for batch in val_loader]
return model.validation_epoch_end(y_pred)
# Train
def train(model, criterion, optimizer, train_loader, val_loader, lr, epochs):
history = []
for epoch in range(epochs):
for batch in train_loader:
optimizer.zero_grad()
loss = model.training_step(criterion, batch)
loss.backward()
optimizer.step()
result = evaluate(model, criterion, val_loader)
model.epoch_end(epoch, result)
history.append(result)
#return history
# Create train_loader & val_loader
batch_size = 128
train_loader = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_ds, batch_size = batch_size, shuffle = True)
# Create parameters and Train
lr = 0.05
optimizer = torch.optim.SGD(model.parameters(), lr, momentum = 0.9)
criterion = F.mse_loss
epochs = 10
train(model, criterion, optimizer, train_loader, val_loader, lr, epochs)

Yes, it is because of your loss of function. if the value of the loss function after some epoch becomes very small or very large then when you want to use it in backpropagation to train the model, you face this error. To handle that, you should use Early Stopping to Halt the Training. so you should implement Callback, Callbacks provide a way to execute code and interact with the training model process automatically.

PyTorch adapt binary classification model to output probabilities of both classes

My dataset has 14 features and a target containing {0,1}.
I have trained this binary classifier:
class SimpleBinaryClassifier(nn.Module):
def __init__(self,input_shape):
super().__init__()
self.fc1 = nn.Linear(input_shape,64)
self.fc2 = nn.Linear(64,32)
self.dropout = nn.Dropout(p=0.1)
self.fc3 = nn.Linear(32,1)
def forward(self,x):
x = torch.relu(self.fc1(x))
x = torch.relu(self.fc2(x))
x = self.dropout(x)
x = self.fc3(x)
return x
with the following criterion and training loop:
criterion = nn.BCEWithLogitsLoss()
def binary_acc(y_pred, y_test):
y_pred_tag = torch.round(torch.sigmoid(y_pred))
correct_results_sum = (y_pred_tag == y_test).sum().float()
acc = correct_results_sum/y_test.shape[0]
acc = torch.round(acc * 100)
return acc
model.train()
for e in range(1, EPOCHS+1):
epoch_loss = 0
epoch_acc = 0
for X_batch, y_batch in train_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
y_pred = model(X_batch)
loss = criterion(y_pred, y_batch.unsqueeze(1))
acc = binary_acc(y_pred, y_batch.unsqueeze(1))
loss.backward()
optimizer.step()
epoch_loss += loss.item()
epoch_acc += acc.item()
print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
This model, when called like sigmoid(model(input_tensor)) outputs a single number in [0,1]. The pipeline I'm working with, expects a model to output probabilities [p_class1, p_class2].
How can I adapt the model and the training loop?
If I set the output of the last layer to 2, I have problems with the criterion inside the training loop.
class SimpleBinaryClassifier2(nn.Module):
def __init__(self,input_shape):
super().__init__()
self.fc1 = nn.Linear(input_shape,64)
self.fc2 = nn.Linear(64,32)
self.dropout = nn.Dropout(p=0.1)
self.fc3 = nn.Linear(32,2) # now it's 2
def forward(self,x):
x = torch.relu(self.fc1(x))
x = torch.relu(self.fc2(x))
x = self.dropout(x)
x = self.fc3(x)
return x
I use the CrossEntropy
model = SimpleBinaryClassifier2(input_shape=14)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
and replace y_pred_tag = torch.round(torch.sigmoid(y_pred)) with argmax(softmax)
def binary_acc2(y_pred, y_test):
y_pred_tag = torch.argmax(torch.softmax(y_pred), dim=1)
correct_results_sum = (y_pred_tag == y_test).sum().float()
acc = correct_results_sum/y_test.shape[0]
acc = torch.round(acc * 100)
return acc
Then the train loop rises an error:
model.train()
for e in range(1, EPOCHS+1):
epoch_loss = 0
epoch_acc = 0
for X_batch, y_batch in train_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
y_pred = model(X_batch)
loss = criterion(y_pred, y_batch)
acc = binary_acc(y_pred, y_batch)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
epoch_acc += acc.item()

print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
The error is the following:
RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'
I already looked up on this other post where the cause of that error was that the element was a Float and not a tensor, but in my case the datasets are tensors:
train_dataset = GenericDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
test_dataset = GenericDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))

According to nn.CrossEntropyLoss description it expects target as long and not float, while in your train_dataset you clearly convert it to float

Overfitting when fine-tuning BERT sentiment analysis

I am newbie to Machine Learning in general. I am currently trying to follow a tutorial on sentiment analysis using BERT and Transformers https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
However when I train the model it has appeared that the model is overfitting
I do not know how to fix this. I have tried lowering amount of epochs, increasing batch size , shuffling my data (which is ordered) and increasing the validation split. So far nothing has worked. I have even tried changing different learning rate but the one I am using now is the smallest.
Below is my code:
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
MAX_LEN = 40
#Make a PyTorch dataset
class FIDataset(Dataset):
def __init__(self, texts, targets, tokenizer, max_len):
self.texts = texts
self.targets = targets
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, item):
text = str(self.texts[item])
target = self.targets[item]
encoding = self.tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=self.max_len,
return_token_type_ids=False,
pad_to_max_length=True,
return_attention_mask=True,
return_tensors='pt',
)
return {
'text': text,
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'targets': torch.tensor(target, dtype=torch.long)
}
#split test and train
df_train, df_test = train_test_split(
df,
test_size=0.1,
random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
df_test,
test_size=0.5,
random_state=RANDOM_SEED
)
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
ds = FIDataset(
texts=df.content.to_numpy(),
targets=df.sentiment.to_numpy(),
tokenizer=tokenizer,
max_len=max_len
)
return DataLoader(
ds,
batch_size=batch_size,
num_workers=4
)
BATCH_SIZE = 32
#Load data into train, test, val
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
#Bert model loading
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.1)
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
def forward(self, input_ids, attention_mask):
returned = self.bert(
input_ids=input_ids,
attention_mask=attention_mask
)
pooled_output = returned["pooler_output"]
output = self.drop(pooled_output)
return self.out(output)
#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)
model = model.to(device)
#Optimize with AdamW
EPOCHS = 6
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
#Train each Epoch function
def train_epoch(
model,
data_loader,
loss_fn,
optimizer,
device,
scheduler,
n_examples
):
model = model.train()
losses = []
correct_predictions = 0
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
return correct_predictions.double() / n_examples, np.mean(losses)
import torch
history = defaultdict(list)
best_accuracy = 0
if __name__ == '__main__':
for epoch in range(EPOCHS):
print(f'Epoch {epoch + 1}/{EPOCHS}')
print('-' * 10)
train_acc, train_loss = train_epoch(
model,
train_data_loader,
loss_fn,
optimizer,
device,
scheduler,
len(df_train)
)
print(f'Train loss {train_loss} accuracy {train_acc}')
val_acc, val_loss = eval_model(
model,
val_data_loader,
loss_fn,
device,
len(df_val)
)
print(f'Val loss {val_loss} accuracy {val_acc}')
print()
history['train_acc'].append(train_acc)
history['train_loss'].append(train_loss)
history['val_acc'].append(val_acc)
history['val_loss'].append(val_loss)
if val_acc > best_accuracy:
torch.save(model.state_dict(), 'best_model_state.bin')
best_accuracy = val_acc

Broadly speaking, to reduce overfitting, you can:
increase regularization
reduce model complexity
perform early stopping
increase training data
From what you've written, you've already tried 3 and 4. In the case of neural networks, you can increase regularization by increasing dropout. You already have the code for it.
# NOTE: You don't need bert_model here since you're creating one inside
# of SentimentClassifier.
#bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.1) # <-- INCREASE THIS VALUE
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
I'd recommend trying higher values of the Dropout probability, as I noted in your code above ("INCREASE THIS VALUE"). Keep track of the Dropout probability and the resulting observed overfitting. Try probability values of 0.1, 0.2, 0.3, 0.4, 0.5.
Usually, I've found that dropout over 0.5 doesn't do much good.

Increase of GPU memory usage during training

I was training the network on usual MNIST dataset, and encountered the next problem:
when i start to add valid_metrics to a loss_list and accuracy_list, amount of GPU memory that is being used starts increasing with every 1 or 2 epochs.
This is the code of train_loop:
def train_model(model: torch.nn.Module,
train_dataset: torch.utils.data.Dataset,
valid_dataset: torch.utils.data.Dataset,
loss_function: torch.nn.Module = torch.nn.CrossEntropyLoss(),
optimizer_class: Type[torch.optim.Optimizer] = torch.optim,
optimizer_params: Dict = {},
initial_lr = 0.01,
lr_scheduler_class: Any = torch.optim.lr_scheduler.ReduceLROnPlateau,
lr_scheduler_params: Dict = {},
batch_size = 64,
max_epochs = 1000,
early_stopping_patience = 20):
optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr, **optimizer_params)
lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_params)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)
best_valid_loss = None
best_epoch = None
loss_list = list()
accuracy_list = list()
for epoch in range(max_epochs):
print(f'Epoch {epoch}')
start = timer()
train_single_epoch(model, optimizer, loss_function, train_loader)
valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
loss_list.append(valid_metrics['loss'])
accuracy_list.append(valid_metrics['accuracy'])
print('time:', timer() - start)
print(f'Validation metrics: \n{valid_metrics}')
lr_scheduler.step(valid_metrics['loss'])
if best_valid_loss is None or best_valid_loss > valid_metrics['loss']:
print(f'Best model yet, saving')
best_valid_loss = valid_metrics['loss']
best_epoch = epoch
torch.save(model, './best_model.pth')
if epoch - best_epoch > early_stopping_patience:
print('Early stopping triggered')
return loss_list, accuracy_list
and the code of validate_single_epoch:
def validate_single_epoch(model: torch.nn.Module,
loss_function: torch.nn.Module,
data_loader: torch.utils.data.DataLoader):
loss_total = 0
accuracy_total = 0
for data in data_loader:
X, y = data
X, y = X.view(-1, 784), y.to(device)
X = X.to(device)
output = model(X)
loss = loss_function(output, y)
loss_total += loss
y_pred = output.argmax(dim = 1, keepdim=True).to(device)
accuracy_total += y_pred.eq(y.view_as(y_pred)).sum().item()
loss_avg = loss_total / len(data_loader.dataset)
accuracy_avg = 100.0 * accuracy_total / len(data_loader.dataset)
return {'loss' : loss_avg, 'accuracy' : accuracy_avg}
I use GeForce MX250 as GPU

The problem is likely because the gradients are being computed and stored in the validation loop. To solve that, perhaps the easiest way is to wrap the validation call in a no_grad context:
with torch.no_grad():
valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
If you prefer, you can also decorate the validate_single_epoch(...) with #torch.no_grad():
#torch.no_grad()
def validate_single_epoch(...):
# ...
Not related to your problem, but pay attention that you're using a model in training mode during validation, which may not be what you want. Perhaps there is a missing call to model.eval() in the validation function.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to reference loss score in LearningRateScheduler - python

Related

Using ray tune `tune.run` with pytorch returns different optimal hyperparameters combination

Input contains NaN, infinity or a value too large for dtype('float32'). Pythorch

PyTorch adapt binary classification model to output probabilities of both classes

Overfitting when fine-tuning BERT sentiment analysis

Increase of GPU memory usage during training

Categories

Resources