Pytorch MSE loss function nan during training - python

I am trying linear regression from boston dataset. MSE loss function is nan since the first iteration. I tried altering learning rate and batch_size but of no use.
from import TensorDataset , DataLoader
inputs = torch.from_numpy(Features).to(torch.float32)
targets = torch.from_numpy(target).to(torch.float32)
train_ds = TensorDataset(inputs , targets)
train_dl = DataLoader(train_ds , batch_size = 5 , shuffle = True)
model = nn.Linear(13,1)
opt = optim.SGD(model.parameters(), lr=1e-5)
loss_fn = F.mse_loss
def fit(num_epochs, model, loss_fn, opt, train_dl):
# Repeat for given number of epochs
for epoch in range(num_epochs):
# Train with batches of data
for xb,yb in train_dl:
# 1. Generate predictions
pred = model(xb)
# 2. Calculate loss
loss = loss_fn(pred, yb)
# 3. Compute gradients
# 4. Update parameters using gradients
# 5. Reset the gradients to zero
# Print the progress
if (epoch+1) % 10 == 0:
print('Epoch [{}/{}], Loss: {}'.format(epoch+1, num_epochs, loss.item()))
fit(100, model, loss_fn , opt , train_dl)

Pay attention to:
Use normalization: x = (x - x.mean()) / x.std()
y_train / y_test have to be (-1, 1) shapes. Use y_train.view(-1, 1) (if y_train is torch.Tensor or something)
(not your case, but for someone else) If you use torch.nn.MSELoss(reduction='sum') than you have to reduse the sum to mean. It can be done with torch.nn.MSELoss() or in train-loop: l = loss(y_pred, y) / y.shape[0].
loss = torch.nn.MSELoss()
for epoch in range(num_epochs):
for x, y in train_iter:
y_pred = model(x)
l = loss(y_pred, y)
print("epoch {} loss: {:.4f}".format(epoch + 1, l.item()))


Why the Validation Loss is too high?

I tried to write a code which is about the brand detection. But while training the model, there are high losses in every epoch. I tried to normalize the dataset, however nothings changed. Am I doing something wrong?
My code is as below:
train_link = "C:/Users\proin\OneDrive\Masaüstü\Data_2/train"
test_link = "C:/Users\proin\OneDrive\Masaüstü\Data_2/test"
val_link = "C:/Users\proin\OneDrive\Masaüstü\Data_2/validation"
transforming_train = transforms.Compose([transforms.Resize((300, 300)), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
transforming_test = transforms.Compose([transforms.Resize((300, 300)), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
I did also try to plug mean and std values in normalize function but nothings changed.
Let me continue:
trainset = torchvision.datasets.ImageFolder(train_link, transform = transforming_train)
testset = torchvision.datasets.ImageFolder(test_link, transform = transforming_test)
valset = torchvision.datasets.ImageFolder(val_link, transform = transforming_test)
batch_size = 1
trainloader =, batch_size=batch_size,
valloader =, batch_size=batch_size,
testloader =, batch_size=1,
Because ImageFolder has no argument of normalize, I couldn't plug it in here.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
And I'm using the CUDA as well.
def train_log_loss_network(model, train_loader, val_loader=None, epochs=50, device="cpu"):
loss_fn = nn.CrossEntropyLoss() #CrossEntropy is another name for the Logistic Regression loss function. Like before, we phrase learning as minimize a loss function. This is the loss we are going to minimize!
#We need an optimizer! Adam is a good default one that works "well enough" for most problems
#To tell Adam what to optimize, we give it the model's parameters - because thats what the learning will adjust
# optimizer = torch.optim.Adam(model.parameters())
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
#Devices can be spcified by a string, or a special torch object
#If it is a string, lets get the correct device
if device.__class__ == str:
device = torch.device(device) the model on the correct compute resource
for epoch in range(epochs):
model = model.train()#Put our model in training mode
running_loss = 0.0
for inputs, labels in train_loader: #tqdm(train_loader):
#Move the batch to the device we are using.
inputs, labels = inputs.cuda(), labels.cuda()
inputs =
labels =
# zero the parameter gradients
y_pred = model(inputs)
# Compute loss.
loss = loss_fn(y_pred, labels.long())
# Backward pass: compute gradient of the loss with respect to model parameters
# Calling the step function on an Optimizer makes an update to its parameters
running_loss += loss.item() * inputs.size(0)
if val_loader is None:
print("Loss after epoch {} is {}".format(epoch + 1, running_loss))
else:#Lets find out validation performance as we go!
model = model.eval() #Set the model to "evaluation" mode, b/c we don't want to make any updates!
predictions = []
targets = []
for inputs, labels in val_loader:
#Move the batch to the device we are using.
inputs =
labels =
y_pred = model(inputs)
# Get predicted classes
# y_pred will have a shape (Batch_size, C)
#We are asking for which class had the largest response along dimension #1, the C dimension
for pred in torch.argmax(y_pred, dim=1).cpu().numpy():
for l in labels.cpu().numpy():
#print("Network Accuracy: ", )
print("Loss after epoch {} is {}. Accuracy: {}".format(epoch + 1, running_loss, accuracy_score(predictions, targets)))
And lastly,
train_log_loss_network(model, trainloader, val_loader=valloader, epochs=10, device=device)
Even I tried different epoch number and different conv layer, the results are similar.
Loss after epoch 1 is 72568.83042097092. Accuracy: 0.0036231884057971015
Loss after epoch 2 is 72568.78793954849. Accuracy: 0.0036231884057971015
Loss after epoch 3 is 72568.74511051178. Accuracy: 0.0036231884057971015
Loss after epoch 4 is 72568.7018828392. Accuracy: 0.014492753623188406
Loss after epoch 5 is 72568.65722990036. Accuracy: 0.014492753623188406

RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[100, 1, 28, 28] to have 3 channels, but got 1 channels instead

I am trying to use a pre-trained (resnet) model on the MNIST dataset, but this error always appears to me
RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[100, 1, 28, 28] to have 3 channels, but got 1 channels instead.
This is my code:
MNIST dataset
from torchvision import datasets
import torchvision.transforms as transforms
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 100
# convert data to torch.FloatTensor
transform = transforms.Compose
# choose the training and test datasets
train_data = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='data', train=False, download=True, transform=transform)
# prepare data loaders
train_loader =, batch_size=batch_size, num_workers=num_workers)
test_loader =, batch_size=batch_size, num_workers=num_workers)
MLP network definition
from torch.nn.modules.activation import ReLU
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# building the model and the type to model → Sequential
self.model = nn.Sequential
# building the layers and the type to layers → Linear
nn.Linear(28 * 28 , 200), # input layer = 100
# to avoid problem to overfitting → using the Dropout (As we can see, dropouts are used to randomly remove neurons while training of the neural network.)
nn.Dropout(0.2), # to use Dropout to avoid problem → overfitting
nn.ReLU(True), # activation function
nn.BatchNorm1d(num_features = 200), # Batch normalization (also known as batch norm) is a method used
# to make training of artificial neural networks faster and more stable through normalization of the layers' inputs by re-centering and re-scaling.
nn.Linear(200 , 10), # output layer = 10
def forward(self, x):
x = x.view(-1, 1, 28*28)
return self.model(x)
# initialize the N
model = Net()
model = resnet50(weights = ResNet50_Weights.IMAGENET1K_V2)
model.fc = nn.Linear(512,10)
define an optimizer to update the model parameters
## Specify loss and optimization functions
# specify loss function
criterion = nn.CrossEntropyLoss()
# specify optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
Training Data
#number of epochs to train the model
n_epochs = 1 # suggest training between 20-50 epochs
model.train() # prep model for training
for epoch in range(n_epochs):
# monitor training loss
train_loss = 0.0
# train the model #
for data, target in train_loader:
# clear the gradients of all optimized variables
data = data.repeat(1,3,1,1)
# forward pass: compute predicted outputs by passing inputs to the model
output = model(data)
# calculate the loss
loss = criterion(output, target)
# backward pass: compute gradient of the loss with respect to model parameters
# perform a single optimization step (parameter update)
# update running training loss
train_loss += loss.item()*data.size(0)
# print training statistics
# calculate average loss over an epoch
train_loss = train_loss/len(train_loader.dataset)
print('Epoch: {} \tTraining Loss: {:.6f}'.format(
Initialize lists to monitor test loss and accuracy
# initialize lists to monitor test loss and accuracy
test_loss = 0.0
class_correct = list(0. for i in range(10))
class_total = list(0. for i in range(10))
model.eval() # prep model for *evaluation*
for data, target in test_loader:
# forward pass: compute predicted outputs by passing inputs to the model
output = model(data)
# calculate the loss
loss = criterion(output, target)
# update test loss
test_loss += loss.item()*data.size(0)
# convert output probabilities to predicted class
_, pred = torch.max(output, 1)
# compare predictions to true label
correct = np.squeeze(pred.eq(
# calculate test accuracy for each object class
for i in range(16):
label =[i]
class_correct[label] += correct[i].item()
class_total[label] += 1
# calculate and print avg test loss
test_loss = test_loss/len(test_loader.dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))
for i in range(10):
if class_total[i] > 0:
print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
str(i), 100 * class_correct[i] / class_total[i],
np.sum(class_correct[i]), np.sum(class_total[i])))
print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))
print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
100. * np.sum(class_correct) / np.sum(class_total),
np.sum(class_correct), np.sum(class_total)))

Tensorflow gradient tape returns exploding gradient model.trainable_variables

I'm trying to train my deep learning with tensorflow gradient tape, however the accuracy does not change with the epochs. I also checked for reseting my loss and accuracy.
For the MNIST dataset my code looks the following:
(mnist_train, mnist_test), mnist_info = tfds.load('mnist', split=['train', 'test'], as_supervised=True, with_info=True)
def prepare(ds, batch_size=128):
ds = ds.cache()
ds = ds.batch(batch_size)
ds = ds.prefetch(
return ds
def split_tasks(ds, predicate):
return ds.filter(predicate), ds.filter(lambda img, label: not predicate(img, label))
task_A_train, task_B_train = split_tasks(mnist_train, lambda img, label: label % 2 == 0)
task_A_train, task_B_train = prepare(task_A_train), prepare(task_B_train)
task_A_test, task_B_test = split_tasks(mnist_test, lambda img, label: label % 2 == 0)
task_A_test, task_B_test = prepare(task_A_test), prepare(task_B_test)
def evaluate(model, test_set):
acc = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
for i, (imgs, labels) in enumerate(test_set):
preds = model.predict_on_batch(imgs)
acc.update_state(labels, preds)
return acc.result().numpy()
multi_task_model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
tf.keras.layers.Dense(128, activation='relu'),
multi_task_model.compile(optimizer='adam', loss=sparse_categorical_crossentropy, metrics='accuracy')
def l2_penalty(model, theta_A):
penalty = 0
for i, theta_i in enumerate(model.trainable_variables):
_penalty = tf.norm(theta_i - theta_A[i])
penalty += _penalty
return 0.5*penalty
def train_with_l2(model, task_A_train, task_B_train, task_A_test, task_B_test, epochs=6):
# First we're going to fit to task A and retain a copy of parameters trained on Task A, epochs=epochs)
theta_A = {n: p.value() for n, p in enumerate(model.trainable_variables.copy())}
print("Task A accuracy after training on Task A: {}".format(evaluate(model, task_A_test)))
# Metrics for the custom training loop
accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
loss = tf.keras.metrics.SparseCategoricalCrossentropy('loss')
for epoch in range(epochs):
for batch, (imgs, labels) in enumerate(task_B_train):
with tf.GradientTape() as tape:
preds = model(imgs)
# Loss is crossentropy loss with regularization term for each parameter
total_loss = model.loss(labels, preds) + l2_penalty(model, theta_A)
grads = tape.gradient(total_loss, model.trainable_variables)
model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
accuracy.update_state(labels, preds)
loss.update_state(labels, preds)
print("\rEpoch: {}, Batch: {}, Loss: {:.3f}, Accuracy: {:.3f}".format(
epoch+1, batch+1, loss.result().numpy(), accuracy.result().numpy()), flush=True, end=''
print("Task B accuracy after training trained model on Task B: {}".format(evaluate(model, task_B_test)))
print("Task A accuracy after training trained model on Task B: {}".format(evaluate(model, task_A_test)))
Does anybody see what I'm doing wrong concerning the training within gradientTape?
EDIT: I rechecked my gradients and it seems that these are exploding and thus return nan. However I cannot see why this is happening.

How to run one batch in pytorch?

I'm new to AI and python and I'm trying to run only one batch to aim to overfit.I found the code:
but I'm not sure where to implement it in my code. even if I did, how can I check after each iteration to make sure that I'm training the same batch?
train_loader =
net = nn.Sequential(
nepochs = 3
statsrec = np.zeros((3,nepochs))
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)
for epoch in range(nepochs): # loop over the dataset multiple times
running_loss = 0.0
n = 0
for i, data in enumerate(train_loader, 0):
inputs, labels = data
# Zero the parameter gradients
# Forward, backward, and update parameters
outputs = net(inputs)
loss = loss_fn(outputs, labels)
# accumulate loss
running_loss += loss.item()
n += 1
ltrn = running_loss/n
ltst, atst = stats(train_loader, net)
statsrec[:,epoch] = (ltrn, ltst, atst)
print(f"epoch: {epoch} training loss: {ltrn: .3f} test loss: {ltst: .3f} test accuracy: {atst: .1%}")
please give me a hint
If you are looking to train on a single batch, then remove your loop over your dataloader:
for i, data in enumerate(train_loader, 0):
inputs, labels = data
And simply get the first element of the train_loader iterator before looping over the epochs, otherwise next will be called at every iteration and you will run on a different batch every epoch:
inputs, labels = next(iter(train_loader))
i = 0
for epoch in range(nepochs):
outputs = net(inputs)
loss = loss_fn(outputs, labels)
# ...

Model weights are not being updatesd, but loss is decreasing

The following code is to train an MLP with images of size 64*64, while using the loss ||output - input||^2.
For some reason, my weights per epoch are not being updated as shown at the end.
class MLP(nn.Module):
def __init__(self, size_list):
super(MLP, self).__init__()
layers = []
self.size_list = size_list
for i in range(len(size_list) - 2):
layers.append(nn.Linear(size_list[-2], size_list[-1])) = nn.Sequential(*layers)
def forward(self, x):
model_1 = MLP([4096, 64, 4096])
And for training each epoch:
def train_epoch(model, train_loader, criterion, optimizer):
running_loss = 0.0
start_time = time.time()
# train batch
for batch_idx, (data) in enumerate(train_loader):
data =
outputs = model(data)
loss = criterion(outputs, data)
running_loss += loss.item()
end_time = time.time()
weight_ll =[0].weight
running_loss /= len(train_loader)
print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')
return running_loss, outputs, weight_ll
for training the data:
n_epochs = 20
Train_loss = []
criterion = nn.MSELoss()
optimizer = optim.SGD(model_1.parameters(), lr = 0.1)
for i in range(n_epochs):
train_loss, output, weights_ll = train_epoch(model_1, trainloader, criterion, optimizer)
Now, when I print the weights of the first fully connected layer per epoch they aren't being updated.
The output for the above is (showing the weight in epoch 0 and in epoch 19):
tensor([ 0.0086, 0.0069, -0.0048, ..., -0.0082, -0.0115, -0.0133],
tensor([ 0.0086, 0.0069, -0.0048, ..., -0.0082, -0.0115, -0.0133],
What may be going wrong? Looking at my loss, it's decreasing at a steady rate but there is no change in the weights.
Try to change it weight_ll =[0].weight.clone().detach() or just weight_ll =[0].weight.clone() in your train_epoch() function. And you will see the weights differ.
Explanation: weights_ll are always the last epoch values if you do not clone it. It will be regarded as the same tensor in the graph. That's why your weights[0][0] equals to weights[19][0], they are actually the same tensor.

