I am building a conv net that classifies dogs and cats. The architecture is fairly simple: two conv blocks (each with batch norm, LeakyReLU, and max pooling) followed by one fully connected layer. Input images are resized to 64x64; the size is fine. The problem is that the loss is 0.0 from the very first step, and I have no clue what the cause is. I couldn't find any answer. I have written down every detail that might be important; if you need anything else, please tell me and I will edit.
main.py
import torch
import torch.nn as nn
from torchvision import transforms, datasets
import PIL
import matplotlib.pyplot as plt
from Dataset import Dataset
from Network import Network
# Added to avoid: torch._C._cuda_init() RuntimeError: CUDA error: unknown error
torch.cuda.current_device()
# Hyper Parameters
batch_size = 1
img_size = 64
learning_rate = 0.001
num_epoch = 1
# Directories
trainDir = "D:/Programming/python/Deep learning/datasets/dogs-vs-cats/train"
testDir = "D:/Programming/python/Deep learning/datasets/dogs-vs-cats/test1"
print("Initializing...")
# Device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Augmentation
transforms = transforms.Compose([
transforms.Resize((img_size, img_size)),
transforms.ColorJitter(hue=.05, saturation=.05),
transforms.RandomHorizontalFlip(),
transforms.RandomRotation(20, resample=PIL.Image.BILINEAR),
transforms.ToTensor()
])
trainset = datasets.ImageFolder(root=trainDir, transform=transforms)
testset = datasets.ImageFolder(root=testDir, transform=transforms)
train_loader = torch.utils.data.DataLoader(
trainset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
testset, batch_size=batch_size, shuffle=False) # test set will not be shuffled
model = Network(img_size,2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
total_step = len(train_loader)
print("Tranining started")
for epoch in range(num_epoch):
for i, (images, labels) in enumerate(train_loader):
images = images.to(device)
labels = labels.to(device)
# forward propagate
outputs = model(images)
loss = criterion(outputs, labels)
# backpropagte and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i+1) % 100 == 0:
print(
"Epoch [{}/{}], Step[{}/{}], Loss: {}".format(
epoch+1, num_epoch, i+1, total_step, loss.item()
)
)
print("Tranining complete, validation started")
with torch.no_grad():
correct = 0
total = 0
for images, labels in test_loader:
images = images.to(device)
labels = labels.to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('Test Accuracy: {} %'.format(100 * correct / total))
#
torch.save(model.state_dict(), "model.ckpy")
Network.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
"""
Parameter count for a conv layer:
l = number of input feature maps
k = number of output feature maps
n, m = width and height of the kernel
total parameters = (n*m*l + 1)*k
"""
class Network(nn.Module):
def __init__(self, input_size, num_class):
super(Network, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(3, 16, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(16),
nn.LeakyReLU(),
nn.MaxPool2d(kernel_size=2, stride=2)
) # output: 16 feature maps of 32x32 (for a 64x64 input)
self.conv2 = nn.Sequential(
nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(32),
nn.LeakyReLU(),
nn.MaxPool2d(kernel_size=2, stride=2)
) # output: 32 feature maps of 16x16
self.fc1 = nn.Linear(
int((input_size/4)**2*32), num_class
)
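        # with input_size = 64, the two 2x2 max-pools halve each spatial dimension twice,
        # so the flattened feature size is (64/4)**2 * 32 = 8192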
def forward(self, x):
out = self.conv1(x)
out = self.conv2(out)
out = out.view(out.size(0), -1)
out = self.fc1(out)
return out
Output
Epoch [1/1], Step[5800/25000], Loss: 0.0
Epoch [1/1], Step[5900/25000], Loss: 0.0
Epoch [1/1], Step[6000/25000], Loss: 0.0
Epoch [1/1], Step[6100/25000], Loss: 0.0
Epoch [1/1], Step[6200/25000], Loss: 0.0
Epoch [1/1], Step[6300/25000], Loss: 0.0
Epoch [1/1], Step[6400/25000], Loss: 0.0
Epoch [1/1], Step[6500/25000], Loss: 0.0
Result after each layer
outputs of conv1 and conv2
[[ 3.0135e-01, 3.5849e-01, 4.7758e-01, ..., 3.9759e-01,
3.7988e-01, 9.7870e-01],
[ 4.3010e-01, 6.0753e-03, 4.5642e-01, ..., -8.5486e-04,
4.4537e-02, 2.9074e-01],
[ 3.8567e-01, 7.8431e-02, 2.3859e-01, ..., -3.0013e-03,
-5.5821e-03, 1.2284e-01],
...,
[ 3.9181e-01, 3.9093e-01, 1.2053e-01, ..., -4.7156e-03,
5.6266e-01, 7.7017e-01],
outputs of fc1
[[-0.0772, 0.2166]]
Try this:
loss = criterion(output, target.view(-1)) # flatten the target
Could you remove these two lines?
images = images.to(device)
labels = labels.to(device)
self.conv1 and self.conv2 must be sent to CUDA, e.g. self.conv1.cuda()
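One way to sanity-check the data and the loss (a minimal debugging sketch, not from the original post, reusing the names defined in main.py above) is to print the class mapping, one batch of labels, the raw outputs, and the loss before training:
print(trainset.class_to_idx)   # both classes should appear with distinct indices
images, labels = next(iter(train_loader))
print(labels)                  # across batches, labels should contain both 0 and 1
outputs = model(images.to(device))
print(outputs)                 # raw logits, shape (batch_size, 2)
print(criterion(outputs, labels.to(device)).item())  # an untrained 2-class model should give roughly ln(2) ≈ 0.69, not 0.0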
Related
I'm building a small CNN model to predict plant crop diseases with the Plant Village dataset. It consists of 39 classes of different species, with and without diseases.
CNN model
class CropDetectCNN(nn.Module):
# initialize the class and the parameters
def __init__(self):
super(CropDetectCNN, self).__init__()
# convolutional layer 1 & max pool layer 1
self.layer1 = nn.Sequential(
nn.Conv2d(3, 16, kernel_size=3),
nn.MaxPool2d(kernel_size=2))
# convolutional layer 2 & max pool layer 2
self.layer2 = nn.Sequential(
nn.Conv2d(16, 32, kernel_size=3, padding=1, stride=2),
nn.MaxPool2d(kernel_size=2))
#Fully connected layer
self.fc = nn.Linear(32*28*28, 39)
# Feed forward the network
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = out.reshape(out.size(0), -1)
out = self.fc(out)
return out
model = CropDetectCNN()
Training
criterion = nn.CrossEntropyLoss() # this includes softmax + cross-entropy loss
optimizer = torch.optim.Adam(model.parameters())
def batch_gd(model, criterion, train_loader, validation_loader, epochs):
train_losses = np.zeros(epochs)
test_losses = np.zeros(epochs)
validation_losses = np.zeros(epochs)
for e in range(epochs):
t0 = datetime.now()
train_loss = []
model.train()
for inputs, targets in train_loader:
inputs, targets = inputs.to(device), targets.to(device)
optimizer.zero_grad()
output = model(inputs)
loss = criterion(output, targets)
train_loss.append(loss.item()) # torch to numpy world
loss.backward()
optimizer.step()
train_loss = np.mean(train_loss)
validation_loss = []
for inputs, targets in validation_loader:
model.eval()
inputs, targets = inputs.to(device), targets.to(device)
output = model(inputs)
loss = criterion(output, targets)
validation_loss.append(loss.item()) # torch to numpy world
validation_loss = np.mean(validation_loss)
train_losses[e] = train_loss
validation_losses[e] = validation_loss
dt = datetime.now() - t0
print(
f"Epoch : {e+1}/{epochs} Train_loss: {train_loss:.3f} Validation_loss: {validation_loss:.3f} Duration: {dt}"
)
return train_losses, validation_losses
# Running the function
train_losses, validation_losses = batch_gd(
model, criterion, train_loader, validation_loader, 5
)
# And these are the results:
Epoch : 1/5 Train_loss: 1.164 Validation_loss: 0.861 Duration: 0:10:59.968168
Epoch : 2/5 Train_loss: 0.515 Validation_loss: 0.816 Duration: 0:10:49.199842
Epoch : 3/5 Train_loss: 0.241 Validation_loss: 1.007 Duration: 0:09:56.334155
Epoch : 4/5 Train_loss: 0.156 Validation_loss: 1.147 Duration: 0:10:12.625819
Epoch : 5/5 Train_loss: 0.135 Validation_loss: 1.603 Duration: 0:09:56.746308
Isn't the validation loss supposed to decrease with epochs? So why does it first decrease and then increase?
How should I set the number of epochs, and why?
Any help is really appreciated!
You are facing the phenomenon of "overfitting" when your validation loss goes up after decreasing. You should stop training at that point and try some tricks to avoid overfitting.
Getting different predictions can happen when gradients keep being tracked during inference, so explicitly stop tracking them with torch.no_grad().
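A minimal sketch of what that looks like (assuming the model, device, and validation_loader from the code above):
model.eval()               # use running BatchNorm stats and disable dropout
with torch.no_grad():      # stop tracking gradients during inference
    for inputs, targets in validation_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds = outputs.argmax(dim=1)   # predicted class per sample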
I am learning PyTorch and I have created a binary classification algorithm. After training the model I get a very low loss and quite good accuracy. However, on validation the accuracy is exactly 50%. I am wondering whether I loaded the samples incorrectly or the algorithm simply does not perform well.
Here you can find the plot of Training loss and accuracy.
Here is my training method:
epochs = 15
itr = 1
p_itr = 100
model.train()
total_loss = 0
loss_list = []
acc_list = []
for epoch in range(epochs):
for samples, labels in train_loader:
samples, labels = samples.to(device), labels.to(device)
optimizer.zero_grad()
output = model(samples)
labels = labels.unsqueeze(-1)
labels = labels.float()
loss = criterion(output, labels)
loss.backward()
optimizer.step()
total_loss += loss.item()
scheduler.step()
#if itr%p_itr == 0:
pred = torch.round(output)
correct = pred.eq(labels)
acc = torch.mean(correct.float())
print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
loss_list.append(total_loss/p_itr)
acc_list.append(acc)
total_loss = 0
itr += 1
Here, I am loading data from the path:
train_list_cats = glob.glob(os.path.join(train_cats_dir,'*.jpg'))
train_list_dogs = glob.glob(os.path.join(train_dogs_dir,'*.jpg'))
train_list = train_list_cats + train_list_dogs
val_list_cats = glob.glob(os.path.join(validation_cats_dir,'*.jpg'))
val_list_dogs = glob.glob(os.path.join(validation_dogs_dir,'*.jpg'))
val_list = val_list_cats + val_list_dogs
I am not attaching the model architecture, but I can add it if required.
I think that my training method is correct, although I am not sure about the training/validation data processing.
Edit:
The network params are as follow:
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
criterion = nn.BCELoss()
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[500,1000,1500], gamma=0.5)
Activation function is sigmoid.
The network architecture:
self.layer1 = nn.Sequential(
nn.Conv2d(3,16,kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2),
nn.Dropout(p=0.2)
)
self.layer2 = nn.Sequential(
nn.Conv2d(16,32, kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2),
nn.Dropout(p=0.2)
)
self.layer3 = nn.Sequential(
nn.Conv2d(32,64, kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2),
nn.Dropout(p=0.2)
)
self.fc1 = nn.Linear(17*17*64,512)
self.fc2 = nn.Linear(512,1)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
def forward(self,x):
out = self.layer1(x)
out = self.layer2(out)
out = self.layer3(out)
out = out.view(out.size(0),-1)
out = self.relu(self.fc1(out))
out = self.fc2(out)
return torch.sigmoid(out)
Going by your "Training loss and accuracy" plot, your model is overfitting. Your train loss is near zero after 25 epochs, yet you continue training for 200+ epochs. This is the wrong way to train a model. You should instead do early stopping based on the validation set, i.e. run one epoch of training and one epoch of evaluation, and repeat. Stop at the point where the training epochs keep improving but the corresponding evaluation epochs do not.
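A rough sketch of that early-stopping pattern (the helper names train_epoch/eval_epoch, the loader names, max_epochs, and the patience value are placeholders for illustration; each helper is assumed to run one full pass and return the mean loss):
best_val_loss = float('inf')
patience, bad_epochs = 5, 0
for epoch in range(max_epochs):
    train_loss = train_epoch(model, criterion, optimizer, train_loader)  # one training pass
    val_loss = eval_epoch(model, criterion, val_loader)                  # one evaluation pass
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        bad_epochs = 0
        torch.save(model.state_dict(), 'best_model.pt')                  # keep the best checkpoint
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            break                                                        # validation stopped improving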
I am new to PyTorch and I've been working on training an MLP model using the MNIST dataset. Basically, I am feeding the model images and labels as input and training on them. I am using CrossEntropyLoss() as the loss function, but I get a dimension error whenever I run my model.
IndexError Traceback (most recent call last)
<ipython-input-37-04f8cfc1d3b6> in <module>()
47
48 # Forward
---> 49 outputs = model(images)
50
5 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/flatten.py in forward(self, input)
38
39 def forward(self, input: Tensor) -> Tensor:
---> 40 return input.flatten(self.start_dim, self.end_dim)
41
42 def extra_repr(self) -> str:
IndexError: Dimension out of range (expected to be in range of [-4, 3], but got 64)
Here is the MLP class that I've created
class MLP(nn.Module):
def __init__(self, device, input_size = 1*28*28, output_size = 10):
super().__init__()
self.seq = nn.Sequential(nn.Flatten(BATCH=64, input_size),
nn.Linear(input_size, 32),
nn.ReLU(),
nn.Linear(32, output_size))
self.to(device)
def forward(self, x):
return self.seq(x)
And the rest of the training code is:
from tqdm.notebook import tqdm
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
exp_name = "MLP version 1"
# log_name = "logs/" + exp_name + f" {datetime.now()}"
# print("Tensorboard logs will be written to:", log_name)
# writer = SummaryWriter(log_name)
criterion = nn.CrossEntropyLoss()
model = MLP(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
num_epochs = 10
for epoch in tqdm(range(num_epochs)):
epoch_train_loss = 0.0
epoch_accuracy = 0.0
for data in train_loader:
images, labels = data
images, labels = images.to(device), labels.to(device)
images = images.permute(0, 3, 1, 2)
optimizer.zero_grad()
print("hello")
outputs = model(images)
loss = criterion(outputs, labels)
epoch_train_loss += loss.item()
loss.backward()
optimizer.step()
accuracy = compute_accuracy(outputs, labels)
epoch_accuracy += accuracy
writer.add_scalar("Loss/training", epoch_train_loss, epoch)
writer.add_scalar("Accuracy/training", epoch_accuracy / len(train_loader), epoch)
print('epoch: %d loss: %.3f' % (epoch + 1, epoch_train_loss / len(train_loader)))
print('epoch: %d accuracy: %.3f' % (epoch + 1, epoch_accuracy / len(train_loader)))
epoch_accuracy = 0.0
# The code below computes the validation results
for data in val_loader:
images, labels = data
images, labels = images.to(device), labels.to(device)
images = images.permute(0, 3, 1, 2)
model.eval()
with torch.no_grad():
outputs = model(images)
accuracy = compute_accuracy(outputs, labels)
epoch_accuracy += accuracy
writer.add_scalar("Accuracy/validation", epoch_accuracy / len(val_loader), epoch)
print("finished training")
Any help would be appreciated. Thank you.
Use nn.Flatten() instead of nn.Flatten(BATCH=64, input_size).
https://pytorch.org/docs/stable/generated/torch.nn.Flatten.html
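For reference, a minimal corrected version of the MLP class (a sketch assuming the same imports and device as above; nn.Flatten() defaults to start_dim=1, which flattens everything except the batch dimension):
class MLP(nn.Module):
    def __init__(self, device, input_size=1*28*28, output_size=10):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Flatten(),                 # (N, 1, 28, 28) -> (N, 784)
            nn.Linear(input_size, 32),
            nn.ReLU(),
            nn.Linear(32, output_size))
        self.to(device)

    def forward(self, x):
        return self.seq(x)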
The dataset is CIFAR10. I've created a VGG-like network:
class FirstModel(nn.Module):
def __init__(self):
super(FirstModel, self).__init__()
self.vgg1 = nn.Sequential(
nn.Conv2d(3, 16, 3, padding=1),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.Conv2d(16, 16, 3, padding=1),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.vgg2 = nn.Sequential(
nn.Conv2d(16, 32, 3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.Conv2d(32, 32, 3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.vgg3 = nn.Sequential(
nn.Conv2d(32, 64, 3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(64, 64, 3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.fc1 = nn.Linear(4 * 4 * 64, 4096)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(4096, 4096)
self.fc3 = nn.Linear(4096, 10)
self.softmax = nn.Softmax()
self.dropout = nn.Dropout(0.5)
def forward(self, x):
x = self.vgg3(self.vgg2(self.vgg1(x)))
x = nn.Flatten()(x)
x = self.relu(self.fc1(x))
x = self.dropout(x)
x = self.relu(self.fc2(x))
x = self.dropout(x)
x = self.softmax(self.fc3(x))
return x
Then I train it and visualize loss and accuracy:
import matplotlib.pyplot as plt
from IPython.display import clear_output
def plot_history(train_history, val_history, title='loss'):
plt.figure()
plt.title('{}'.format(title))
plt.plot(train_history, label='train', zorder=1)
points = np.array(val_history)
steps = list(range(0, len(train_history) + 1, int(len(train_history) / len(val_history))))[1:]
plt.scatter(steps, val_history, marker='*', s=180, c='red', label='val', zorder=2)
plt.xlabel('train steps')
plt.legend(loc='best')
plt.grid()
plt.show()
def train_model(model, optimizer, train_dataloader, test_dataloader):
criterion = nn.CrossEntropyLoss()
train_loss_log = []
train_acc_log = []
val_loss_log = []
val_acc_log = []
for epoch in range(NUM_EPOCH):
model.train()
train_loss = 0.
train_size = 0
train_acc = 0.
for inputs, labels in train_dataloader:
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
y_pred = model(inputs)
loss = criterion(y_pred, labels)
loss.backward()
optimizer.step()
train_loss += loss.item()
train_size += y_pred.size(0)
train_loss_log.append(loss.data / y_pred.size(0))
_, pred_classes = torch.max(y_pred, 1)
train_acc += (pred_classes == labels).sum().item()
train_acc_log.append(np.mean((pred_classes == labels).cpu().numpy()))
# validation block
val_loss = 0.
val_size = 0
val_acc = 0.
model.eval()
with torch.no_grad():
for inputs, labels in test_dataloader:
inputs, labels = inputs.to(device), labels.to(device)
y_pred = model(inputs)
loss = criterion(y_pred, labels)
val_loss += loss.item()
val_size += y_pred.size(0)
_, pred_classes = torch.max(y_pred, 1)
val_acc += (pred_classes == labels).sum().item()
val_loss_log.append(val_loss/val_size)
val_acc_log.append(val_acc/val_size)
clear_output()
plot_history(train_loss_log, val_loss_log, 'loss')
plot_history(train_acc_log, val_acc_log, 'accuracy')
print('Train loss:', train_loss / train_size)
print('Train acc:', train_acc / train_size)
print('Val loss:', val_loss / val_size)
print('Val acc:', val_acc / val_size)
Then I train the model:
first_model = FirstModel()
first_model.to(device)
optimizer = optim.RMSprop(first_model.parameters(), lr=0.001, momentum=0.9)
train_model(first_model, optimizer, train_dataloader, test_dataloader)
The loss and accuracy do not change (the accuracy stays around 0.1). However, if the optimizer is SGD with momentum, everything works fine (loss and accuracy change). I've already tried changing the momentum and the learning rate, but it does not help.
What should be fixed? I would be grateful for any advice!
First of all, you don't need the softmax in the model: nn.CrossEntropyLoss expects raw, unnormalized logits and applies the (log-)softmax internally, so applying softmax in forward() squashes the scores and their gradients. (As an aside, torch.optim.RMSprop does accept a momentum argument.)
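In other words, return raw logits from forward() and apply softmax only if you need probabilities at inference time. A sketch of the changed forward, reusing the layer names from the model above:
def forward(self, x):
    x = self.vgg3(self.vgg2(self.vgg1(x)))
    x = nn.Flatten()(x)
    x = self.relu(self.fc1(x))
    x = self.dropout(x)
    x = self.relu(self.fc2(x))
    return self.fc3(x)            # raw logits, as expected by nn.CrossEntropyLoss

# only at inference time, if probabilities are needed:
# probs = torch.softmax(model(inputs), dim=1)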
Try decreasing the learning rate further. If that still has no effect on the accuracy and loss, change the optimizer to Adam or something else and play with different learning rates.
In my case, I was facing the same issue. On my laptop without a GPU, training was fine. When I tried it on a GPU, the model's accuracy and loss stopped changing after the first epochs. I was using nn.CrossEntropyLoss() with Adam.
Replacing Adam with SGD worked for me.
I am reimplementing the PyTorch CIFAR-10 tutorial, but I want to use a different model: I don't want to use fully connected (in PyTorch, linear) layers, and I want to add batch normalization.
My model looks like this:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.pool = nn.MaxPool2d(2,2)
self.conv1 = nn.Conv2d(in_channels=3,out_channels=16,kernel_size=3, padding=1, padding_mode='zeros')
self.conv1_bn = nn.BatchNorm2d(16)
self.conv2 = nn.Conv2d(in_channels=16,out_channels=32,kernel_size=3, padding=1, padding_mode='zeros')
self.conv2_bn = nn.BatchNorm2d(32)
self.conv3 = nn.Conv2d(in_channels=32,out_channels=64,kernel_size=3, padding=1, padding_mode='zeros')
self.conv3_bn = nn.BatchNorm2d(64)
self.conv4 = nn.Conv2d(64,64,3, padding=1, padding_mode='zeros')
self.conv4_bn = nn.BatchNorm2d(64)
self.conv5 = nn.Conv2d(64,10,2,padding=0)
def forward(self, x): # x has shape (4, 3, 32, 32)
x = self.pool(F.relu(self.conv1_bn(self.conv1(x)))) # feature map resolution is now 16*16
x = self.pool(F.relu(self.conv2_bn(self.conv2(x)))) # resolution now 8*8
x = self.pool(F.relu(self.conv3_bn(self.conv3(x)))) #resolution now 4*4
x = self.pool(F.relu(self.conv4_bn(self.conv4(x)))) # now 2*2
x = F.relu(self.conv5(x)) # the output shape is (batch_size, 10, 1, 1)
return x
Batch size is 4 and image resolution is 32*32, so the input size is (4, 3, 32, 32).
The convolution layers don't reduce the resolution of the feature maps because of the padding; the resolution is halved by the max-pool layers. conv5 gets an input of shape (4, 64, 2, 2).
Now I use a filter size of 2 and no padding to get a resolution of 1*1.
I have 10 classes, so I use 10 filters; each of the last filters should predict its corresponding class.
The shape of the output is now (4, 10, 1, 1).
But when I try to train this model the loss doesn't decrease. The number of parameters in the tutorial model and in my net is about the same, ~62k.
Here is the rest of the code. It is identical to the code in the tutorial, except that I have to reshape the output so it fits (the output in the tutorial was (4, 10) and mine is (4, 10, 1, 1)).
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
shuffle=False, num_workers=2)
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
for epoch in range(2): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs) # I get the values as (4, 10, 1, 1)
outputs_reshaped = outputs.reshape(4,10)
loss = criterion(outputs_reshaped, labels)
loss.backward()
optimizer.step()
running_loss +=loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 2000))
running_loss = 0.0
My loss looks like this.
[1, 2000] loss: 2.348
[1, 2000] loss: 2.477
[1, 4000] loss: 2.482
[1, 6000] loss: 2.468
[1, 8000] loss: 2.471
[1, 10000] loss: 2.482
[1, 12000] loss: 2.485
[2, 2000] loss: 2.486
[2, 4000] loss: 2.470
[2, 6000] loss: 2.479
[2, 8000] loss: 2.481
[2, 10000] loss: 2.474
[2, 12000] loss: 2.470
My model doesn't seem to learn anything. Does anyone have an idea why this might happen?
Your learning rate and momentum combination is too large for such a small batch size; try something like these:
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.0)
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
Update: I just realized another problem: you are using a ReLU activation at the end of the network. If you look at the documentation of CrossEntropyLoss, there is this advice:
The input is expected to contain raw, unnormalized scores for each class.
Try training your network after removing the last ReLU from conv5, keeping lr=0.01 and momentum=0.9. A ReLU before the cross-entropy loss throws away information about class scores.
So if you have a similar problem:
I changed the optimizer to
optimizer = optim.Adam(net.parameters(), 0.001)
My last line in forward() was
x = F.relu(self.conv5(x))
I removed the ReLU, so it is now
x = self.conv5(x)
and the loss is now decreasing as expected (way faster than the tutorial, with the same number of parameters).