Model weights are not being updated, but loss is decreasing - python

The following code trains an MLP on images of size 64*64, using the loss ||output - input||^2.
For some reason, my weights per epoch are not being updated as shown at the end.
class MLP(nn.Module):
    def __init__(self, size_list):
        super(MLP, self).__init__()
        layers = []
        self.size_list = size_list
        for i in range(len(size_list) - 2):
            layers.append(nn.Linear(size_list[i], size_list[i+1]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(size_list[-2], size_list[-1]))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

model_1 = MLP([4096, 64, 4096])
And for training each epoch:
def train_epoch(model, train_loader, criterion, optimizer):
    model.train()
    model.to(device)
    running_loss = 0.0
    start_time = time.time()

    # train batch
    for batch_idx, data in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.to(device)
        outputs = model(data)
        loss = criterion(outputs, data)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()

    end_time = time.time()
    weight_ll = model.net[0].weight
    running_loss /= len(train_loader)
    print('Training Loss: ', running_loss, 'Time: ', end_time - start_time, 's')
    return running_loss, outputs, weight_ll
And for training the data:
n_epochs = 20
Train_loss = []
weights = []

criterion = nn.MSELoss()
optimizer = optim.SGD(model_1.parameters(), lr=0.1)

for i in range(n_epochs):
    train_loss, output, weights_ll = train_epoch(model_1, trainloader, criterion, optimizer)
    Train_loss.append(train_loss)
    weights.append(weights_ll)
    print('=' * 20)
Now, when I print the weights of the first fully connected layer per epoch, they aren't being updated.
print(weights[0][0])
print(weights[19][0])
The output for the above is (showing the weight in epoch 0 and in epoch 19):
tensor([ 0.0086, 0.0069, -0.0048, ..., -0.0082, -0.0115, -0.0133],
grad_fn=<SelectBackward>)
tensor([ 0.0086, 0.0069, -0.0048, ..., -0.0082, -0.0115, -0.0133],
grad_fn=<SelectBackward>)
What may be going wrong? Looking at my loss, it's decreasing at a steady rate but there is no change in the weights.

Try changing it to weight_ll = model.net[0].weight.clone().detach() (or just weight_ll = model.net[0].weight.clone()) in your train_epoch() function, and you will see the weights differ.
Explanation: without cloning, weight_ll is only a reference to the layer's weight tensor, so it always holds the latest values; every entry of weights points to the same tensor in the graph. That's why weights[0][0] equals weights[19][0]: they are literally the same tensor.
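For reference, a minimal sketch of the fixed snapshot at the end of train_epoch(); everything else stays exactly as in the question:

    # ... batch loop as above ...
    end_time = time.time()

    # .clone() copies the current values and .detach() cuts the copy out of the
    # autograd graph, so the stored tensor no longer changes when optimizer.step()
    # updates the layer in later epochs.
    weight_ll = model.net[0].weight.clone().detach()

    running_loss /= len(train_loader)
    return running_loss, outputs, weight_ll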

Related

MNIST overfitting

I am currently working on the MNIST dataset. My model has overfit the training data, and I want to reduce the overfitting by using weight_decay. I am currently using 0.1 as the value for weight_decay, which is giving me bad results, as my validation loss and training loss are not decreasing. However, I want to experiment with different values for weight_decay, so that I can plot the different amounts of weight_decay on the x-axis and the performance of the validation set on the y-axis. How do I do that? Store the values in a list and use a for loop to iterate through them? Below is the code that I have tried until now.
class NN(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(784, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 10))

    def forward(self, x):
        return self.layers(x)
def accuracy_and_loss(model, loss_function, dataloader):
    total_correct = 0
    total_loss = 0
    total_examples = 0
    n_batches = 0
    with torch.no_grad():
        for data in dataloader:
            images, labels = data
            outputs = model(images)
            batch_loss = loss_function(outputs, labels)
            n_batches += 1
            total_loss += batch_loss.item()
            _, predicted = torch.max(outputs, dim=1)
            total_examples += labels.size(0)
            total_correct += (predicted == labels).sum().item()
    accuracy = total_correct / total_examples
    mean_loss = total_loss / n_batches
    return (accuracy, mean_loss)
def define_and_train(model, dataset_training, dataset_test):
    trainloader = torch.utils.data.DataLoader(small_trainset, batch_size=500, shuffle=True)
    testloader = torch.utils.data.DataLoader(dataset_test, batch_size=500, shuffle=True)

    values = [1e-8, 1e-7, 1e-6, 1e-5]
    model = NN()
    for params in values:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=params)
        train_acc = []
        val_acc = []
        train_loss = []
        val_loss = []
        for epoch in range(100):
            total_loss = 0
            total_correct = 0
            total_examples = 0
            n_mini_batches = 0
            for i, mini_batch in enumerate(trainloader, 0):
                images, labels = mini_batch
                optimizer.zero_grad()
                outputs = model(images)
                loss = loss_function(outputs, labels)
                loss.backward()
                optimizer.step()
                n_mini_batches += 1
                total_loss += loss.item()
                _, predicted = torch.max(outputs, dim=1)
                total_examples += labels.size(0)
                total_correct += (predicted == labels).sum().item()

            epoch_training_accuracy = total_correct / total_examples
            epoch_training_loss = total_loss / n_mini_batches
            epoch_val_accuracy, epoch_val_loss = accuracy_and_loss(model, loss_function, testloader)

            print('Params %f Epoch %d loss: %.3f acc: %.3f val_loss: %.3f val_acc: %.3f'
                  % (params, epoch + 1, epoch_training_loss, epoch_training_accuracy, epoch_val_loss, epoch_val_accuracy))

            train_loss.append(epoch_training_loss)
            train_acc.append(epoch_training_accuracy)
            val_loss.append(epoch_val_loss)
            val_acc.append(epoch_val_accuracy)

    history = {'train_loss': train_loss,
               'train_acc': train_acc,
               'val_loss': val_loss,
               'val_acc': val_acc}
    return (history, model)
This is the plot that I am getting. Where am I going wrong?
You haven't given enough information to be specific (such as the loss function, the dataset size, the dataset content for training and validation, the results after 100 or 200 epochs, or the scope of your question).
However, even an overfitted model may still classify the validation dataset well, because MNIST is not that hard for deep learning compared to other image classification tasks.
How about adding white noise to the validation dataset? You may then see a large validation loss.
Or, if you want to keep using your validation dataset as it is, train the model for at least 1000 more epochs. But, as said above, an overfitted model may still classify the validation dataset.
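A minimal sketch of the white-noise idea, assuming the validation images are float tensors scaled to roughly [0, 1]; the noise level noise_std is an arbitrary illustrative choice, not something from the question:

import torch

def add_white_noise(images, noise_std=0.1):
    # Add zero-mean Gaussian noise to a batch of images and clamp back to [0, 1].
    return (images + noise_std * torch.randn_like(images)).clamp(0.0, 1.0)

# e.g. inside accuracy_and_loss(), evaluate on perturbed images instead:
# outputs = model(add_white_noise(images))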

Validation losses increasing after a few epochs

I'm building a small CNN model to predict plant crop disease with the Plant Village Dataset. It consists of 39 classes of different species with and without diseases.
CNN model
class CropDetectCNN(nn.Module):
    # initialize the class and the parameters
    def __init__(self):
        super(CropDetectCNN, self).__init__()

        # convolutional layer 1 & max pool layer 1
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3),
            nn.MaxPool2d(kernel_size=2))

        # convolutional layer 2 & max pool layer 2
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, padding=1, stride=2),
            nn.MaxPool2d(kernel_size=2))

        # Fully connected layer
        self.fc = nn.Linear(32*28*28, 39)

    # Feed forward the network
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

model = CropDetectCNN()
Training
criterion = nn.CrossEntropyLoss()  # this includes softmax + cross-entropy loss
optimizer = torch.optim.Adam(model.parameters())

def batch_gd(model, criterion, train_loader, validation_loader, epochs):
    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    validation_losses = np.zeros(epochs)

    for e in range(epochs):
        t0 = datetime.now()

        train_loss = []
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, targets)
            train_loss.append(loss.item())  # torch to numpy world
            loss.backward()
            optimizer.step()
        train_loss = np.mean(train_loss)

        validation_loss = []
        for inputs, targets in validation_loader:
            model.eval()
            inputs, targets = inputs.to(device), targets.to(device)
            output = model(inputs)
            loss = criterion(output, targets)
            validation_loss.append(loss.item())  # torch to numpy world
        validation_loss = np.mean(validation_loss)

        train_losses[e] = train_loss
        validation_losses[e] = validation_loss

        dt = datetime.now() - t0
        print(
            f"Epoch : {e+1}/{epochs} Train_loss: {train_loss:.3f} Validation_loss: {validation_loss:.3f} Duration: {dt}"
        )

    return train_losses, validation_losses

# Running the function
train_losses, validation_losses = batch_gd(
    model, criterion, train_loader, validation_loader, 5
)
And these are the results:
Epoch : 1/5 Train_loss: 1.164 Validation_loss: 0.861 Duration: 0:10:59.968168
Epoch : 2/5 Train_loss: 0.515 Validation_loss: 0.816 Duration: 0:10:49.199842
Epoch : 3/5 Train_loss: 0.241 Validation_loss: 1.007 Duration: 0:09:56.334155
Epoch : 4/5 Train_loss: 0.156 Validation_loss: 1.147 Duration: 0:10:12.625819
Epoch : 5/5 Train_loss: 0.135 Validation_loss: 1.603 Duration: 0:09:56.746308
Isn't the validation loss supposed to decrease with epochs? So why does it first decrease and then increase?
How should I set the number of epochs, and why?
Any help is really appreciated !
You are facing the phenomenon of "overfitting": your validation loss goes up after decreasing. You should stop training at that point and try to use some tricks to avoid overfitting.
You might also get different predictions at inference time if gradients keep being tracked there, so explicitly stop tracking them during evaluation with torch.no_grad().
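As a rough sketch of both suggestions applied to the batch_gd() loop above (the patience threshold for early stopping is an illustrative assumption, not part of the original code):

best_val_loss = float('inf')
patience, bad_epochs = 3, 0

for e in range(epochs):
    # ... training loop over train_loader as above ...

    model.eval()
    validation_loss = []
    with torch.no_grad():  # no gradient tracking during evaluation
        for inputs, targets in validation_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            loss = criterion(model(inputs), targets)
            validation_loss.append(loss.item())
    validation_loss = np.mean(validation_loss)

    # simple early stopping: stop once the validation loss stops improving
    if validation_loss < best_val_loss:
        best_val_loss, bad_epochs = validation_loss, 0
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            break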

How to run one batch in pytorch?

I'm new to AI and Python, and I'm trying to train on only one batch, with the aim of overfitting it. I found the code:
iter(train_loader).next()
but I'm not sure where to implement it in my code. Even if I did, how can I check after each iteration to make sure that I'm training on the same batch?
train_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=48,
    shuffle=True,
    num_workers=2
)

net = nn.Sequential(
    nn.Flatten(),
    nn.Linear(128*128*3, 10)
)

nepochs = 3
statsrec = np.zeros((3, nepochs))

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(nepochs):  # loop over the dataset multiple times
    running_loss = 0.0
    n = 0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward, backward, and update parameters
        outputs = net(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate loss
        running_loss += loss.item()
        n += 1

    ltrn = running_loss / n
    ltst, atst = stats(train_loader, net)
    statsrec[:, epoch] = (ltrn, ltst, atst)
    print(f"epoch: {epoch} training loss: {ltrn: .3f} test loss: {ltst: .3f} test accuracy: {atst: .1%}")
Please give me a hint.
If you are looking to train on a single batch, then remove your loop over your dataloader:
for i, data in enumerate(train_loader, 0):
    inputs, labels = data
And simply get the first element of the train_loader iterator once, before looping over the epochs; otherwise next will be called at every epoch and you will run on a different batch each time:
inputs, labels = next(iter(train_loader))

i = 0
for epoch in range(nepochs):
    optimizer.zero_grad()
    outputs = net(inputs)
    loss = loss_fn(outputs, labels)
    loss.backward()
    optimizer.step()
    # ...
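If you also want to verify that the same batch is reused every epoch (this check is an illustrative addition, not part of the answer above), keep a copy of the batch and compare against it:

reference = inputs.clone()  # snapshot of the single batch

for epoch in range(nepochs):
    assert torch.equal(inputs, reference), "the batch changed between epochs"
    # ... training step as above ...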

Pytorch: Weights not changing during training

Basically the same question as this one here, which was never answered: Why the first convolutional layer weights don't change during training?
I just want to watch the weights of my convolutional layers as they change during training. How can I do this? No matter what I do, the weights seem to stay the same even though loss is decreasing.
I'm trying to follow this tutorial here although the model is slightly different: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
Model
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.pool2 = nn.MaxPool2d(2)
        self.out = nn.Linear(400, 10)

    def forward(self, inputs):
        x = self.pool1(F.relu(self.conv1(inputs)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = torch.flatten(x, start_dim=1)
        x = self.out(x)
        return x
Training
def train(epochs=100):
    criterion = nn.CrossEntropyLoss()
    net = CNN()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    losses = []
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            w = net.conv1._parameters['weight']
            print(w)
            losses.append(running_loss / z)

            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    return net
If you don't use any normalization modules, then the closer the weights are to the input of the network, the smaller the gradients, and therefore the smaller the changes, will be; the changes are probably in decimal places that are no longer displayed by your print() statement. To see the changes, I'd suggest saving the weights from one iteration to the next and subtracting them to display the difference:
...
w = net.conv1._parameters['weight'].detach().clone()
print(w - w_previous)
w_previous = w
...
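Put together, a minimal sketch of that idea inside the train() loop above (w_previous has to be initialized once before training; .clone() keeps each snapshot from being overwritten by the in-place parameter updates):

w_previous = net.conv1.weight.detach().clone()  # snapshot before training

for epoch in range(epochs):
    for i, data in enumerate(trainloader, 0):
        # ... forward / backward / optimizer.step() as above ...

        w = net.conv1.weight.detach().clone()   # snapshot after the update
        print((w - w_previous).abs().max())     # largest change in this layer
        w_previous = w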

Pytorch MSE loss function nan during training

I am trying linear regression on the Boston dataset. The MSE loss is nan from the first iteration. I tried altering the learning rate and batch_size, but to no avail.
from torch.utils.data import TensorDataset, DataLoader

inputs = torch.from_numpy(Features).to(torch.float32)
targets = torch.from_numpy(target).to(torch.float32)

train_ds = TensorDataset(inputs, targets)
train_dl = DataLoader(train_ds, batch_size=5, shuffle=True)

model = nn.Linear(13, 1)
opt = optim.SGD(model.parameters(), lr=1e-5)
loss_fn = F.mse_loss

def fit(num_epochs, model, loss_fn, opt, train_dl):
    # Repeat for given number of epochs
    for epoch in range(num_epochs):
        # Train with batches of data
        for xb, yb in train_dl:
            # 1. Generate predictions
            pred = model(xb)
            # 2. Calculate loss
            loss = loss_fn(pred, yb)
            # 3. Compute gradients
            loss.backward()
            # 4. Update parameters using gradients
            opt.step()
            # 5. Reset the gradients to zero
            opt.zero_grad()
        # Print the progress
        if (epoch+1) % 10 == 0:
            print('Epoch [{}/{}], Loss: {}'.format(epoch+1, num_epochs, loss.item()))

fit(100, model, loss_fn, opt, train_dl)
The printed output shows nan for the loss.
Pay attention to:
Use normalization: x = (x - x.mean()) / x.std() (a sketch applying this and the next point follows the example below).
y_train / y_test have to have shape (-1, 1). Use y_train.view(-1, 1) (if y_train is a torch.Tensor or similar).
(Not your case, but for someone else:) if you use torch.nn.MSELoss(reduction='sum'), then you have to reduce the sum to a mean. This can be done with torch.nn.MSELoss() or in the training loop: l = loss(y_pred, y) / y.shape[0].
Example:
...
loss = torch.nn.MSELoss()
...

for epoch in range(num_epochs):
    for x, y in train_iter:
        y_pred = model(x)
        l = loss(y_pred, y)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
    print("epoch {} loss: {:.4f}".format(epoch + 1, l.item()))
