I'm training a LSTM model for time series prediction and at each epoch my accuracy restarts from 0 as if I'm training for the first time.
I attach below the training method snippet:
def train(model, loader, epoch, mini_batch_size, sequence_size):
model.train()
correct = 0
padded_size = 0
size_input = mini_batch_size * sequence_size
for batch_idx, (inputs, labels, agreement_score) in enumerate(loader):
if(inputs.size(0) == size_input):
inputs = inputs.clone().reshape(mini_batch_size, sequence_size, inputs.size(1))
labels = labels.clone().squeeze().reshape(mini_batch_size*sequence_size)
agreement_score = agreement_score.clone().squeeze().reshape(mini_batch_size*sequence_size)
else:
padded_size = size_input - inputs.size(0)
(inputs, labels, agreement_score) = padd_incomplete_sequences(inputs, labels, agreement_score, mini_batch_size, sequence_size)
inputs, labels, agreement_score = Variable(inputs.cuda()), Variable(labels.cuda()), Variable(agreement_score.cuda())
output = model(inputs)
loss = criterion(output, labels)
loss = loss * agreement_score
loss = loss.mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()
pred = output.data.max(1, keepdim = True)[1]
correct += pred.eq(labels.data.view_as(pred)).cuda().sum()
accuracy = 100. * correct / (len(loader.dataset) + padded_size)
print("Train: Epoch: {}, [{}/{} ({:.0f}%)]\t loss: {:.6f}, Accuracy: {}/{} ({:.0f}%)".format(
epoch,
batch_idx * len(output),
(len(loader.dataset) + padded_size),
100. * batch_idx / (len(loader.dataset)+padded_size),
loss.item(),
correct,
(len(loader.dataset) + padded_size),
accuracy))
accuracy = 100. * correct / (len(loader.dataset) + padded_size)
train_accuracy.append(accuracy)
train_epochs.append(epoch)
train_loss.append(loss.item())
According to that my loop looks like:
for epoch in range(1, 10):
train(audio_lstm_model, train_rnn_audio_loader, epoch, MINI_BATCH_SIZE, SEQUENCE_SIZE_AUDIO)
evaluation(audio_lstm_model,validation_rnn_audio_loader, epoch, MINI_BATCH_SIZE, SEQUENCE_SIZE_AUDIO)
Consequently, my accuracy and loss restarts at every epoch:
Train: Epoch: 1, [0/1039079 (0%)] loss: 0.921637, Accuracy: 0/1039079 (0%)
...
Train: Epoch: 1, [10368/1039079 (0%)] loss: 0.523242, Accuracy: 206010/1039079 (19%)
Test set: loss: 151.4845, Accuracy: 88222/523315 (16%)
Train: Epoch: 2, [0/1039079 (0%)] loss: 0.921497, Accuracy: 0/1039079 (0%)
If anyone has any clue about it, your help is welcomed!
Have a nice day!
The problem turn out to be the fact that the sequence size was too small for the network in order to be able to make some predictions from it.
So after increasing the sequence length by some orders of magnitude, I was able to improve my model after each epoch.
Related
I'm trying to make a model predict the race of a 75x75 image's ethnicity, but when ever I train the model, the accuracy always stays completely still at 53.2%. I didn't realize why until I actually ran it on some of photos. It turned out, that no matter what the photo was, it would always predict 'other'. I'm not entirely sure why though.
I copied the code over from the official PyTorch Quickstart tutorial, and in that dataset or the standard MNIST data, it worked fine. I changed the dataset to the UTKFace, and then it started only predicting one label, all the time.
Here's my code:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
import torch.nn.functional as F
training_data = ImageFolder(
root = "data_training/",
transform = ToTensor(),
)
testing_data = ImageFolder(
root = "data_testing/",
transform = ToTensor()
)
training_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(testing_data, batch_size=64, shuffle=True)
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(1296, 1024)
self.fc2 = nn.Linear(1024, 1024)
self.fc3 = nn.Linear(1024, 512)
self.fc4 = nn.Linear(512, 84)
self.fc5 = nn.Linear(84, 5)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = self.fc5(x)
return x
model = NeuralNetwork().to("cpu")
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
X, y = X.to("cpu"), y.to("cpu")
# Compute prediction error
pred = model(X)
loss = loss_fn(pred, y)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def tests(dataloader, model):
size = len(dataloader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to("cpu"), y.to("cpu")
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= size
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
epochs = 10
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(training_dataloader, model, loss_fn, optimizer)
tests(test_dataloader, model)
torch.save(model.state_dict(), "model.pth")
The training logs:
Epoch 1
-------------------------------
loss: 1.628994 [ 0/23705]
loss: 1.620698 [ 6400/23705]
loss: 1.615423 [12800/23705]
loss: 1.596390 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.024725
Epoch 2
-------------------------------
loss: 1.593613 [ 0/23705]
loss: 1.581375 [ 6400/23705]
loss: 1.583656 [12800/23705]
loss: 1.591942 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.024165
Epoch 3
-------------------------------
loss: 1.541260 [ 0/23705]
loss: 1.592345 [ 6400/23705]
loss: 1.540908 [12800/23705]
loss: 1.540741 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.023705
Epoch 4
-------------------------------
loss: 1.566888 [ 0/23705]
loss: 1.524875 [ 6400/23705]
loss: 1.540764 [12800/23705]
loss: 1.510044 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.023323
Epoch 5
-------------------------------
loss: 1.530084 [ 0/23705]
loss: 1.498773 [ 6400/23705]
loss: 1.537755 [12800/23705]
loss: 1.508989 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.022993
....
No matter how many epochs I set it to, or how many layers I add in to try to get it to overfit, it always just seems to guess the same thing over and over again, with no signs of improvement.
I separated the UTKFace dataset into folders based on the ethnicity category of the name. There are 23705 images in the training data and 10134 in the testing.
I'm not sure why this is happening. Is my dataset not large enough? Are there not enough layers?
The number of layers and the dataset size don't explain this behavior for this example. Your CNN is behaving as a constant function, so far I don't know why, but these might be some clues:
Since you have separated your data by label into folders, if you are training your model using only one of those folders you will obtain a constant function.
The last layer of your neural network has no activation function! This is, in method forward you are doing x = self.fc5(x) instead of x = F.<function>(self.fc5(x)).
Where do you indicate, when loading the training data, which is the label for each image? Are you sure that training_dataloader is loading the images with their correct label?
few comments:
Did you check the ground truth in the test data (the shape may be different)
can you check the output probabilities to see if the predictions are unanimous ? (Btw you don't necessarily need a activation function at the end in this case as the pytorch crossentropy already contains a logsoftmax)
Did you try conv2d with more filters (like 16,32 or 64)
The % of error seems fine as in the link you put, the accuracy is around 35%
It does seems a bit weird not to be able to over-fit.
def train_and_test(e):
epochs = e
train_losses, test_losses, val_acc, train_acc= [], [], [], []
valid_loss_min = np.Inf
model.train()
print("Model Training started.....")
for epoch in range(epochs):
running_loss = 0
batch = 0
for images, labels in trainloader:
images, labels = images.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(images)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
batch += 1
if batch % 10 == 0:
print(f" epoch {epoch + 1} batch {batch} completed")
test_loss = 0
accuracy = 0
with torch.no_grad():
print(f"validation started for {epoch + 1}")
model.eval()
for images, labels in validloader:
images, labels = images.to(device), labels.to(device)
logps = model(images)
test_loss += criterion(logps, labels)
ps = torch.exp(logps)
top_p, top_class = ps.topk(1, dim=1)
equals = top_class == labels.view(*top_class.shape)
accuracy += torch.mean(equals.type(torch.FloatTensor))
train_losses.append(running_loss / len(trainloader))
test_losses.append(test_loss / len(validloader))
val_acc.append(accuracy / len(validloader))
training_acc.append(running_loss / len(trainloader))
scheduler.step()
print("Epoch: {}/{}.. ".format(epoch + 1, epochs),"Training Loss: {:.3f}.. ".format(train_losses[-1]), "Valid Loss: {:.3f}.. ".format(test_losses[-1]),
"Valid Accuracy: {:.3f}".format(accuracy / len(validloader)), "train Accuracy: {:.3f}".format(running_loss / len(trainloader)))
model.train()
if test_loss / len(validloader) <= valid_loss_min:
print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, test_loss / len(validloader)))
torch.save({
'epoch': epoch,
'model': model,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'loss': valid_loss_min
}, path)
valid_loss_min = test_loss / len(validloader)
print('Training Completed Succesfully !')
return train_losses, test_losses, val_acc ,train_acc
my output is
Model Training started.....
epoch 1 batch 10 completed
epoch 1 batch 20 completed
epoch 1 batch 30 completed
epoch 1 batch 40 completed
validation started for 1
Epoch: 1/2.. Training Loss: 0.088.. Valid Loss: 0.072.. Valid Accuracy: 0.979 train Accuracy: 0.088
Validation loss decreased (inf --> 0.072044). Saving model ...
I am using dataset that is multi-set classification and getting training accuracy and training loss equal so I think there is error in training accuracy code.
training_acc.append(running_loss / len(trainloader))
"train Accuracy: {:.3f}".format(running_loss / len(trainloader))
training_acc.append(accuracy / len(trainloader))
"train Accuracy: {:.3f}".format(accuracy / len(trainloader))
is also not working fine
this method should be followed to plot training loses as well as accuracy
for images , labels in trainloader:
#start = time.time()
images, labels = images.to(device), labels.to(device)
optimizer.zero_grad()# Clear the gradients, do this because gradients are accumulated as 0 in each epoch
# Forward pass - compute outputs on input data using the model
outputs = model(images) # modeling for each image batch
loss = criterion(outputs,labels) # calculating the loss
# the backward pass
loss.backward() # This is where the model learns by backpropagating
optimizer.step() # And optimizes its weights here - Update the parameters
running_loss += loss.item()
# as Output of the network are log-probabilities, need to take exponential for probabilities
ps = torch.exp(outputs)
top_p , top_class = ps.topk(1,dim=1)
equals = top_class == labels.view(*top_class.shape)
# Convert correct_counts to float and then compute the mean
acc += torch.mean(equals.type(torch.FloatTensor))
I am playing with tensorflow 2. I did my own model similar to how it is done here.
Then I created my own fit function. Now I get the weirdest thing ever. Here is the EXACT copy/paste output from my notebook where I did the tests:
def fit(x_train, y_train, learning_rate=0.01, epochs=10, batch_size=100, normal=True, verbose=True, display_freq=100):
if normal:
x_train = normalize(x_train) # TODO: This normalize could be a bit different for each and be bad.
num_tr_iter = int(len(y_train) / batch_size) # Number of training iterations in each epoch
if verbose:
print("Starting training...")
for epoch in range(epochs):
# Randomly shuffle the training data at the beginning of each epoch
x_train, y_train = randomize(x_train, y_train)
for iteration in range(num_tr_iter):
# Get the batch
start = iteration * batch_size
end = (iteration + 1) * batch_size
x_batch, y_batch = get_next_batch(x_train, y_train, start, end)
# Run optimization op (backpropagation)
# import pdb; pdb.set_trace()
if verbose and (epoch * batch_size + iteration) % display_freq == 0:
current_loss = _apply_loss(y_train, model(x_train, training=True))
current_acc = evaluate_accuracy(x_train, y_train)
print("Epoch: {0}/{1}; batch {2}/{3}; loss: {4:.4f}; accuracy: {5:.2f} %"
.format(epoch, epochs, iteration, num_tr_iter, current_loss, current_acc*100))
train_step(x_batch, y_batch, learning_rate)
current_loss = _apply_loss(y_train, model(x_train, training=True))
current_acc = evaluate_accuracy(x_train, y_train)
print("End: loss: {0:.4f}; accuracy: {1:.2f} %".format(current_loss, current_acc*100))
import logging
logging.getLogger('tensorflow').disabled = True
fit(x_train, y_train)
current_loss = _apply_loss(y_train, model(x_train, training=True))
current_acc = evaluate_accuracy(x_train, y_train)
print("End: loss: {0:.4f}; accuracy: {1:.2f} %".format(current_loss, current_acc*100))
This segment outputs:
Starting training...
Epoch: 0/10; batch 0/80; loss: 0.9533; accuracy: 59.67 %
Epoch: 1/10; batch 0/80; loss: 0.9386; accuracy: 60.15 %
Epoch: 2/10; batch 0/80; loss: 0.9259; accuracy: 60.50 %
Epoch: 3/10; batch 0/80; loss: 0.9148; accuracy: 61.05 %
Epoch: 4/10; batch 0/80; loss: 0.9051; accuracy: 61.15 %
Epoch: 5/10; batch 0/80; loss: 0.8968; accuracy: 61.35 %
Epoch: 6/10; batch 0/80; loss: 0.8896; accuracy: 61.27 %
Epoch: 7/10; batch 0/80; loss: 0.8833; accuracy: 61.51 %
Epoch: 8/10; batch 0/80; loss: 0.8780; accuracy: 61.52 %
Epoch: 9/10; batch 0/80; loss: 0.8733; accuracy: 61.54 %
End: loss: 0.8733; accuracy: 61.54 %
End: loss: 0.4671; accuracy: 77.08 %
Now my question is, how is it that I get a different value on the last 2 lines!? I am doing the same thing right? I am totally puzzled here. I don't even know how to google this.
So the problem was just stupid. It was due to the normalize thing I did at the start of the train example! Removed it and started working Ok.
I am trying to evaluate my model on the whole training set after each epoch.
This is what I did:
torch.manual_seed(1)
model = ConvNet(num_classes=num_classes)
cost_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
def compute_accuracy(model, data_loader):
correct_pred, num_examples = 0, 0
for features, targets in data_loader:
logits = model(features)
predicted_labels = torch.argmax(logits, 1)
num_examples += targets.size(0)
correct_pred += (predicted_labels == targets).sum()
return correct_pred.float()/num_examples * 100
for epoch in range(num_epochs):
model = model.train()
for features, targets in train_loader:
logits = model(features)
cost = cost_fn(logits, targets)
optimizer.zero_grad()
cost.backward()
optimizer.step()
model = model.eval()
print('Epoch: %03d/%03d training accuracy: %.2f%%' % (
epoch+1, num_epochs,
compute_accuracy(model, train_loader)))
the output was convincing:
Epoch: 001/005 training accuracy: 89.08%
Epoch: 002/005 training accuracy: 90.41%
Epoch: 003/005 training accuracy: 91.70%
Epoch: 004/005 training accuracy: 92.31%
Epoch: 005/005 training accuracy: 92.95%
But then I added another line at the end of the training loop, to also evaluate the model on the whole test set after each epoch:
for epoch in range(num_epochs):
model = model.train()
for features, targets in train_loader:
logits = model(features)
cost = cost_fn(logits, targets)
optimizer.zero_grad()
cost.backward()
optimizer.step()
model = model.eval()
print('Epoch: %03d/%03d training accuracy: %.2f%%' % (
epoch+1, num_epochs,
compute_accuracy(model, train_loader)))
print('\t\t testing accuracy: %.2f%%' % (compute_accuracy(model, test_loader)))
But the training accuracies started to change:
Epoch: 001/005 training accuracy: 89.08%
testing accuracy: 87.66%
Epoch: 002/005 training accuracy: 90.42%
testing accuracy: 89.04%
Epoch: 003/005 training accuracy: 91.84%
testing accuracy: 90.01%
Epoch: 004/005 training accuracy: 91.86%
testing accuracy: 89.83%
Epoch: 005/005 training accuracy: 92.45%
testing accuracy: 90.32%
Am I doing something wrong? I expected the training accuracies to remain the same because the manual seed is 1 in both cases.
Is this an expected output ?
The random seed had been set wasn't stop the model for learning to get higher accuracy becuase the random seed is a number for Pseudo random. In this case, you had told the model to shuffle the training data with a random number("1").
For the purposes of this MWE I'm trying to fit a linear regression using a custom loss function with multiple terms. However, I'm running into strange behavior when trying to weight the different terms in my loss function by dotting a weight vector with my losses. Just summing the losses works as expected; however, when dotting the weights and losses the backpropagation gets broken somehow and the loss function doesn't decrease.
I've tried enabling and disabling requires_grad on both tensors, but have been unable to replicate the expected behavior.
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
# Hyper-parameters
input_size = 1
output_size = 1
num_epochs = 60
learning_rate = 0.001
# Toy dataset
x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
[9.779], [6.182], [7.59], [2.167], [7.042],
[10.791], [5.313], [7.997], [3.1]], dtype=np.float32)
y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
[3.366], [2.596], [2.53], [1.221], [2.827],
[3.465], [1.65], [2.904], [1.3]], dtype=np.float32)
# Linear regression model
model = nn.Linear(input_size, output_size)
# Loss and optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
def loss_fn(outputs, targets):
l1loss = torch.norm(outputs - targets, 1)
l2loss = torch.norm(outputs - targets, 2)
# This works as expected
# loss = 1 * l1loss + 1 * l2loss
# Loss never changes, no matter what combination of
# requires_grad I set
loss = torch.dot(torch.tensor([1.0, 1.0], requires_grad=False),
torch.tensor([l1loss, l2loss], requires_grad=True))
return loss
# Train the model
for epoch in range(num_epochs):
# Convert numpy arrays to torch tensors
inputs = torch.from_numpy(x_train)
targets = torch.from_numpy(y_train)
# Forward pass
outputs = model(inputs)
loss = loss_fn(outputs, targets)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (epoch+1) % 5 == 0:
print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
# Plot the graph
predicted = model(torch.from_numpy(x_train)).detach().numpy()
plt.plot(x_train, y_train, 'ro', label='Original data')
plt.plot(x_train, predicted, label='Fitted line')
plt.legend()
plt.show()
Expected result: loss function decreases and the linear regression is fitted (see output below)
Epoch [5/60], Loss: 7.9943
Epoch [10/60], Loss: 7.7597
Epoch [15/60], Loss: 7.6619
Epoch [20/60], Loss: 7.6102
Epoch [25/60], Loss: 7.4971
Epoch [30/60], Loss: 7.4106
Epoch [35/60], Loss: 7.3942
Epoch [40/60], Loss: 7.2438
Epoch [45/60], Loss: 7.2322
Epoch [50/60], Loss: 7.1012
Epoch [55/60], Loss: 7.0701
Epoch [60/60], Loss: 6.9612
Actual result: no change in loss function
Epoch [5/60], Loss: 73.7473
Epoch [10/60], Loss: 73.7473
Epoch [15/60], Loss: 73.7473
Epoch [20/60], Loss: 73.7473
Epoch [25/60], Loss: 73.7473
Epoch [30/60], Loss: 73.7473
Epoch [35/60], Loss: 73.7473
Epoch [40/60], Loss: 73.7473
Epoch [45/60], Loss: 73.7473
Epoch [50/60], Loss: 73.7473
Epoch [55/60], Loss: 73.7473
Epoch [60/60], Loss: 73.7473
I'm pretty confused as to why such a simple operation is breaking the backpropagation gradients and would really appreciate it if anyone had some insights on why this isn't working.
Use torch.cat((loss1, loss2)), you are creating new Tensor from existing tensors destroying graph.
Anyway you shouldn't do that unless you are trying to generalize your loss function, it's pretty unreadable. Simple addition is way better.