I'm building a small CNN model to predict plant crop disease with the Plant Village Dataset. It consists of 39 classes of different species with and without diseases.
CNN model
class CropDetectCNN(nn.Module):
    # initialize the class and the parameters
    def __init__(self):
        super(CropDetectCNN, self).__init__()

        # convolutional layer 1 & max pool layer 1
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3),
            nn.MaxPool2d(kernel_size=2))

        # convolutional layer 2 & max pool layer 2
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, padding=1, stride=2),
            nn.MaxPool2d(kernel_size=2))

        # fully connected layer
        self.fc = nn.Linear(32*28*28, 39)

    # feed forward through the network
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

model = CropDetectCNN()
Training
criterion = nn.CrossEntropyLoss()  # this includes softmax + cross-entropy loss
optimizer = torch.optim.Adam(model.parameters())
def batch_gd(model, criterion, train_loader, validation_loader, epochs):
    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    validation_losses = np.zeros(epochs)

    for e in range(epochs):
        t0 = datetime.now()
        train_loss = []
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            output = model(inputs)
            loss = criterion(output, targets)
            train_loss.append(loss.item())  # torch to numpy world

            loss.backward()
            optimizer.step()

        train_loss = np.mean(train_loss)

        validation_loss = []
        for inputs, targets in validation_loader:
            model.eval()
            inputs, targets = inputs.to(device), targets.to(device)

            output = model(inputs)
            loss = criterion(output, targets)
            validation_loss.append(loss.item())  # torch to numpy world

        validation_loss = np.mean(validation_loss)

        train_losses[e] = train_loss
        validation_losses[e] = validation_loss

        dt = datetime.now() - t0

        print(
            f"Epoch : {e+1}/{epochs} Train_loss: {train_loss:.3f} Validation_loss: {validation_loss:.3f} Duration: {dt}"
        )

    return train_losses, validation_losses

# Running the function
train_losses, validation_losses = batch_gd(
    model, criterion, train_loader, validation_loader, 5
)
# And these are the results:
Epoch : 1/5 Train_loss: 1.164 Validation_loss: 0.861 Duration: 0:10:59.968168
Epoch : 2/5 Train_loss: 0.515 Validation_loss: 0.816 Duration: 0:10:49.199842
Epoch : 3/5 Train_loss: 0.241 Validation_loss: 1.007 Duration: 0:09:56.334155
Epoch : 4/5 Train_loss: 0.156 Validation_loss: 1.147 Duration: 0:10:12.625819
Epoch : 5/5 Train_loss: 0.135 Validation_loss: 1.603 Duration: 0:09:56.746308
Isn't the validation loss supposed to decrease with the epochs? So why does it first decrease and then increase?
How should I set the number of epochs, and why?
Any help is really appreciated!
You are facing the phenomenon of "overfitting" when your validation loss goes up after decreasing. You should stop training at that point and try some tricks to avoid overfitting (for example data augmentation, dropout or weight decay).
Getting different predictions can also happen when gradients keep being tracked during inference, so explicitly stop tracking them with torch.no_grad().
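As a minimal sketch (reusing the question's model, criterion, validation_loader and device names, which are assumptions here), the evaluation pass could be wrapped like this:

import torch

def evaluate(model, criterion, validation_loader, device):
    model.eval()                        # switch off dropout / batch-norm updates
    losses = []
    with torch.no_grad():               # no gradient tracking during inference
        for inputs, targets in validation_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            output = model(inputs)
            losses.append(criterion(output, targets).item())
    model.train()                       # back to training mode for the next epoch
    return sum(losses) / len(losses)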
I am currently working on the MNIST dataset. My model has overfit the training data and I want to reduce the overfitting by using weight_decay. I am currently using 0.1 as the value for weight_decay, which is giving me bad results: neither my validation loss nor my training loss is decreasing. However, I want to experiment with different values for weight_decay, so that I can plot the different amounts of weight_decay on the x-axis and the performance on the validation set on the y-axis. How do I do that? Store the values in a list and use a for loop to iterate through them? Below is the code that I have tried so far.
class NN(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(784, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 10))

    def forward(self, x):
        return self.layers(x)
def accuracy_and_loss(model, loss_function, dataloader):
    total_correct = 0
    total_loss = 0
    total_examples = 0
    n_batches = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            outputs = model(images)
            batch_loss = loss_function(outputs, labels)
            n_batches += 1
            total_loss += batch_loss.item()
            _, predicted = torch.max(outputs, dim=1)
            total_examples += labels.size(0)
            total_correct += (predicted == labels).sum().item()
    accuracy = total_correct / total_examples
    mean_loss = total_loss / n_batches
    return (accuracy, mean_loss)
def define_and_train(model, dataset_training, dataset_test):
    trainloader = torch.utils.data.DataLoader(small_trainset, batch_size=500, shuffle=True)
    testloader = torch.utils.data.DataLoader(dataset_test, batch_size=500, shuffle=True)

    values = [1e-8, 1e-7, 1e-6, 1e-5]
    model = NN()

    for params in values:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=params)

        train_acc = []
        val_acc = []
        train_loss = []
        val_loss = []

        for epoch in range(100):
            total_loss = 0
            total_correct = 0
            total_examples = 0
            n_mini_batches = 0

            for i, mini_batch in enumerate(trainloader, 0):
                images, labels = mini_batch
                optimizer.zero_grad()
                outputs = model(images)
                loss = loss_function(outputs, labels)
                loss.backward()
                optimizer.step()

                n_mini_batches += 1
                total_loss += loss.item()
                _, predicted = torch.max(outputs, dim=1)
                total_examples += labels.size(0)
                total_correct += (predicted == labels).sum().item()

            epoch_training_accuracy = total_correct / total_examples
            epoch_training_loss = total_loss / n_mini_batches

            epoch_val_accuracy, epoch_val_loss = accuracy_and_loss(model, loss_function, testloader)

            print('Params %f Epoch %d loss: %.3f acc: %.3f val_loss: %.3f val_acc: %.3f'
                  % (params, epoch+1, epoch_training_loss, epoch_training_accuracy, epoch_val_loss, epoch_val_accuracy))

            train_loss.append(epoch_training_loss)
            train_acc.append(epoch_training_accuracy)
            val_loss.append(epoch_val_loss)
            val_acc.append(epoch_val_accuracy)

        history = {'train_loss': train_loss,
                   'train_acc': train_acc,
                   'val_loss': val_loss,
                   'val_acc': val_acc}

    return (history, model)
This is the plot that I am getting. Where am I going wrong?
It is hard to say without more information (such as the loss function, the dataset size and content for training and validation, the results over 100 or 200 epochs, and the exact scope of your question).
However, even an overfitted model may still classify the validation dataset well, because MNIST is not that hard for deep learning compared to other image classification tasks.
How about adding white noise to the validation dataset? You may then get a large loss on validation.
Or, if you want to keep using your validation dataset as-is, train the model for at least 1000 more epochs. But, as I said above, an overfitted model may still classify the validation dataset.
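Regarding the original question of sweeping weight_decay values and plotting them, a minimal sketch could look like the following. It reuses the question's NN, accuracy_and_loss and loss_function names and assumes trainloader and testloader already exist; a fresh model and optimizer are created for every value so the runs do not contaminate each other:

import torch
import matplotlib.pyplot as plt

weight_decays = [1e-8, 1e-7, 1e-6, 1e-5]
final_val_losses = []

for wd in weight_decays:
    model = NN()                                   # fresh model for every weight_decay value
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=wd)
    for epoch in range(100):
        for images, labels in trainloader:
            optimizer.zero_grad()
            loss = loss_function(model(images), labels)
            loss.backward()
            optimizer.step()
    _, val_loss = accuracy_and_loss(model, loss_function, testloader)
    final_val_losses.append(val_loss)

# weight_decay on the x-axis (a log scale is convenient), validation loss on the y-axis
plt.semilogx(weight_decays, final_val_losses, marker='o')
plt.xlabel('weight_decay')
plt.ylabel('validation loss')
plt.show()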
I am training a model to classify 2 types of images. I have decided to take a transfer-learning approach: freeze every part of resnet50, add a new head, and start the fine-tuning process. My dataset is not perfectly balanced, but I used class weights for that purpose. Please take a look at the validation loss vs training loss graph. It seems to be extremely inconsistent. Could you please take a look at my code? I am new to PyTorch, maybe there is something wrong with my method and code. The final accuracy tested on the test set is 86%. Thank you!
learning_rate = 1e-1
num_epochs = 100
patience = 10
batch_size = 100
weights = [4, 1]
model = models.resnet50(pretrained=True)
# Replace last layer
num_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_features, 512),
    nn.ReLU(inplace=True),
    nn.Linear(512, 64),
    nn.Dropout(0.5, inplace=True),
    nn.Linear(64, 2))
class_weights = torch.FloatTensor(weights).cuda()
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
running_loss = 0
losses = []
# To freeze the residual layers
for param in model.parameters():
    param.requires_grad = False

for param in model.fc.parameters():
    param.requires_grad = True
# Find total parameters and trainable parameters
total_params = sum(p.numel() for p in model.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')
24,590,082 total parameters.
1,082,050 training parameters.
# initialize the early_stopping object
early_stopping = pytorchtools.EarlyStopping(patience=patience, verbose=True)
for epoch in range(num_epochs):
    ##########################
    ####### TRAIN MODEL ######
    ##########################
    epochs_loss = 0

    # switch to train mode
    model.train()
    for i, (images, labels) in enumerate(train_dl):
        # move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)

        # forward pass, backpropagation and optimization
        optimizer.zero_grad()
        outputs = model(images).to(device)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # track the train loss
        train_losses.append(loss.item())

    ##########################
    ##### VALIDATE MODEL #####
    ##########################
    model.eval()
    for images, labels in val_dl:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images).to(device)
        loss = criterion(outputs, labels)
        valid_losses.append(loss.item())

    # print training/validation statistics
    # calculate the average loss over an epoch
    train_loss = np.average(train_losses)
    valid_loss = np.average(valid_losses)
    # print(train_loss)
    avg_train_losses.append(train_loss)
    avg_valid_losses.append(valid_loss)

    print_msg = (f'train_loss: {train_loss:.5f} ' + f'valid_loss: {valid_loss:.5f}')
    print(print_msg)

    # clear lists to track the next epoch
    train_losses = []
    valid_losses = []

    early_stopping(valid_loss, model)
    print(epoch)
    if early_stopping.early_stop:
        print("Early stopping")
        break
I am learning PyTorch and I have created a binary classification algorithm. After training the model I have a very low loss and quite good accuracy. However, on validation the accuracy is exactly 50%. I am wondering whether I loaded the samples incorrectly or the algorithm simply does not perform well.
Here you can find the plot of Training loss and accuracy.
Here is my training method:
epochs = 15
itr = 1
p_itr = 100
model.train()
total_loss = 0
loss_list = []
acc_list = []

for epoch in range(epochs):
    for samples, labels in train_loader:
        samples, labels = samples.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(samples)
        labels = labels.unsqueeze(-1)
        labels = labels.float()
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        scheduler.step()

        #if itr%p_itr == 0:
        pred = torch.round(output)
        correct = pred.eq(labels)
        acc = torch.mean(correct.float())
        print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
        loss_list.append(total_loss/p_itr)
        acc_list.append(acc)
        total_loss = 0
        itr += 1
Here, I am loading data from the path:
train_list_cats = glob.glob(os.path.join(train_cats_dir,'*.jpg'))
train_list_dogs = glob.glob(os.path.join(train_dogs_dir,'*.jpg'))
train_list = train_list_cats + train_list_dogs
val_list_cats = glob.glob(os.path.join(validation_cats_dir,'*.jpg'))
val_list_dogs = glob.glob(os.path.join(validation_dogs_dir,'*.jpg'))
val_list = val_list_cats + val_list_dogs
I am not attaching the model architecture, however I can add it if required.
I think that my training method is correct, although, I am not sure about training/validation data processing.
Edit:
The network params are as follow:
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
criterion = nn.BCELoss()
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[500,1000,1500], gamma=0.5)
Activation function is sigmoid.
The network architecture:
self.layer1 = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Dropout(p=0.2)
)
self.layer2 = nn.Sequential(
    nn.Conv2d(16, 32, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Dropout(p=0.2)
)
self.layer3 = nn.Sequential(
    nn.Conv2d(32, 64, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Dropout(p=0.2)
)
self.fc1 = nn.Linear(17*17*64, 512)
self.fc2 = nn.Linear(512, 1)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()

def forward(self, x):
    out = self.layer1(x)
    out = self.layer2(out)
    out = self.layer3(out)
    out = out.view(out.size(0), -1)
    out = self.relu(self.fc1(out))
    out = self.fc2(out)
    return torch.sigmoid(out)
Going by your "Training loss and accuracy" plot, your model is overfitting. Your train loss is near zero after 25 epochs, yet you continue training for 200+ epochs. This is the wrong way to train a model. You should instead do early stopping based on the validation set: run one epoch of training and one epoch of evaluation, repeat, and stop once the train metrics keep improving while the corresponding eval metrics stop improving.
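For illustration only (not the asker's code), a minimal early-stopping skeleton could look like the sketch below; train_one_epoch and evaluate are hypothetical helpers that run one pass over train_loader / val_loader and return the mean loss, and model, criterion and optimizer are assumed to exist already:

import torch

best_val_loss = float('inf')
patience, bad_epochs = 5, 0
max_epochs = 200

for epoch in range(max_epochs):
    train_loss = train_one_epoch(model, criterion, optimizer, train_loader)  # hypothetical helper
    val_loss = evaluate(model, criterion, val_loader)                        # hypothetical helper

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        bad_epochs = 0
        torch.save(model.state_dict(), 'best_model.pt')  # keep the best checkpoint so far
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            print(f'Early stopping at epoch {epoch + 1}')
            break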
The following code is to train an MLP with images of size 64*64, while using the loss ||output - input||^2.
For some reason, my weights per epoch are not being updated as shown at the end.
class MLP(nn.Module):
    def __init__(self, size_list):
        super(MLP, self).__init__()
        layers = []
        self.size_list = size_list
        for i in range(len(size_list) - 2):
            layers.append(nn.Linear(size_list[i], size_list[i+1]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(size_list[-2], size_list[-1]))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

model_1 = MLP([4096, 64, 4096])
And for training each epoch:
def train_epoch(model, train_loader, criterion, optimizer):
    model.train()
    model.to(device)

    running_loss = 0.0
    start_time = time.time()

    # train batch
    for batch_idx, (data) in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.to(device)

        outputs = model(data)
        loss = criterion(outputs, data)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

    end_time = time.time()
    weight_ll = model.net[0].weight
    running_loss /= len(train_loader)
    print('Training Loss: ', running_loss, 'Time: ', end_time - start_time, 's')
    return running_loss, outputs, weight_ll
And to run the training:
n_epochs = 20
Train_loss = []
weights = []
criterion = nn.MSELoss()
optimizer = optim.SGD(model_1.parameters(), lr=0.1)

for i in range(n_epochs):
    train_loss, output, weights_ll = train_epoch(model_1, trainloader, criterion, optimizer)
    Train_loss.append(train_loss)
    weights.append(weights_ll)
    print('='*20)
Now, when I print the weights of the first fully connected layer per epoch they aren't being updated.
print(weights[0][0])
print(weights[19][0])
The output for the above is (showing the weight in epoch 0 and in epoch 19):
tensor([ 0.0086, 0.0069, -0.0048, ..., -0.0082, -0.0115, -0.0133],
grad_fn=<SelectBackward>)
tensor([ 0.0086, 0.0069, -0.0048, ..., -0.0082, -0.0115, -0.0133],
grad_fn=<SelectBackward>)
What may be going wrong? Looking at my loss, it's decreasing at a steady rate but there is no change in the weights.
Try changing it to weight_ll = model.net[0].weight.clone().detach() (or just weight_ll = model.net[0].weight.clone()) in your train_epoch() function, and you will see the weights differ.
Explanation: if you do not clone it, every weights_ll you store is the same tensor object in the graph, so every entry in your list reflects the values after the last epoch. That's why your weights[0][0] equals weights[19][0]: they are actually the same tensor.
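A tiny illustration of the difference (a standalone toy example, not the asker's model):

import torch
import torch.nn as nn

layer = nn.Linear(4, 4)
snapshots_ref, snapshots_copy = [], []

for step in range(3):
    snapshots_ref.append(layer.weight)                    # same tensor object every time
    snapshots_copy.append(layer.weight.clone().detach())  # independent copy of the current values
    with torch.no_grad():
        layer.weight += 1.0                               # simulate an optimizer update

print(torch.equal(snapshots_ref[0], snapshots_ref[2]))    # True  - both point at the updated tensor
print(torch.equal(snapshots_copy[0], snapshots_copy[2]))  # False - the copies kept their old values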
I am building a conv net that classifies dogs and cats. The architecture is pretty simple: 2 conv blocks (with batch norm, LeakyReLU, max pooling) followed by 1 fc layer. The input images are resized to 64, and the sizes work out. The problem is that the loss is 0.0 from the start, and I have no clue what the cause is. I couldn't find any answer. I have written every detail that might be important; if you need anything else, please tell me and I will edit.
main.py
import torch
import torch.nn as nn
from torchvision import transforms, datasets
import PIL
import matplotlib.pyplot as plt
from Dataset import Dataset
from Network import Network
# Added to avoid torch._C._cuda_init() \n RuntimeError: CUDA error: unknown error
torch.cuda.current_device()
# Hyper Parameters
batch_size = 1
img_size = 64
learning_rate = 0.001
num_epoch = 1
# Directories
trainDir = "D:/Programming/python/Deep learning/datasets/dogs-vs-cats/train"
testDir = "D:/Programming/python/Deep learning/datasets/dogs-vs-cats/test1"
print("Initializing...")
# Device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Augmentation
transforms = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ColorJitter(hue=.05, saturation=.05),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20, resample=PIL.Image.BILINEAR),
    transforms.ToTensor()
])

trainset = datasets.ImageFolder(root=trainDir, transform=transforms)
testset = datasets.ImageFolder(root=testDir, transform=transforms)

train_loader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False)  # the test set will not be shuffled
model = Network(img_size,2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
total_step = len(train_loader)
print("Tranining started")
for epoch in range(num_epoch):
for i, (images, labels) in enumerate(train_loader):
images = images.to(device)
labels = labels.to(device)
# forward propagate
outputs = model(images)
loss = criterion(outputs, labels)
# backpropagte and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i+1) % 100 == 0:
print(
"Epoch [{}/{}], Step[{}/{}], Loss: {}".format(
epoch+1, num_epoch, i+1, total_step, loss.item()
)
)
print("Tranining complete, validation started")
with torch.no_grad():
correct = 0
total = 0
for images, labels in test_loader:
images = images.to(device)
labels = labels.to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('Test Accuracy: {} %'.format(100 * correct / total))
#
torch.save(model.state_dict(), "model.ckpy")
Network.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
"""
Input size for conv
l = number of input feature maps
k = number of output feature maps
n, m = width and height of kernel
total parameter = (n*m*l+1)*k
"""
class Network(nn.Module):
    def __init__(self, input_size, num_class):
        super(Network, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )  # output size = (input_size/2, input_size/2, 16)

        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )  # output size = (input_size/4, input_size/4, 32)

        self.fc1 = nn.Linear(
            int((input_size/4)**2*32), num_class
        )

    def forward(self, x):
        out = self.conv1(x)
        out = self.conv2(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        return out
Output
Epoch [1/1], Step[5800/25000], Loss: 0.0
Epoch [1/1], Step[5900/25000], Loss: 0.0
Epoch [1/1], Step[6000/25000], Loss: 0.0
Epoch [1/1], Step[6100/25000], Loss: 0.0
Epoch [1/1], Step[6200/25000], Loss: 0.0
Epoch [1/1], Step[6300/25000], Loss: 0.0
Epoch [1/1], Step[6400/25000], Loss: 0.0
Epoch [1/1], Step[6500/25000], Loss: 0.0
Result after each layer
outputs of conv1,2
[[ 3.0135e-01, 3.5849e-01, 4.7758e-01, ..., 3.9759e-01,
3.7988e-01, 9.7870e-01],
[ 4.3010e-01, 6.0753e-03, 4.5642e-01, ..., -8.5486e-04,
4.4537e-02, 2.9074e-01],
[ 3.8567e-01, 7.8431e-02, 2.3859e-01, ..., -3.0013e-03,
-5.5821e-03, 1.2284e-01],
...,
[ 3.9181e-01, 3.9093e-01, 1.2053e-01, ..., -4.7156e-03,
5.6266e-01, 7.7017e-01],
outputs of fc1
[[-0.0772, 0.2166]]
loss = criterion(output, target.view(-1))  # flatten the target
Try this.
Could you remove these two lines?
images = images.to(device)
labels = labels.to(device)
self.conv1 and self.conv2 must be sent to CUDA: self.conv1.cuda() and self.conv2.cuda().