How to normalize images in PyTorch - python

transform = transforms.Compose([
    transforms.ToTensor()
])
trainset = torchvision.datasets.ImageFolder(root='C:/Users/beomseokpark/Desktop/CNN/train_data', transform=transform)
data_loader = DataLoader(dataset=trainset, batch_size=8, shuffle=True, num_workers=2)

with torch.no_grad():
    for num, data in enumerate(trainset):
        imgs, label = data
I loaded images with ImageFolder from the torchvision library. How can I get the mean and std of each channel of my images?
Can anyone please help me out?

There's the "lazy man" approach: You can simply plug a nn.BatchNorm2d as the very first layer of your network. With the appropriate momentum, and track_running_stats=True this layer will estimate your data's mean and variance for you.
Alternatively, you can compute the mean and variance using
mu = torch.zeros((3,), dtype=torch.float)
sig = torch.zeros((3,), dtype=torch.float)
n = 0
with torch.no_grad():
    for num, data in enumerate(data_loader):
        imgs, _ = data                             # imgs has shape (N, 3, H, W)
        mu += torch.sum(imgs, dim=(0, 2, 3))       # per-channel sum
        sig += torch.sum(imgs ** 2, dim=(0, 2, 3))
        n += imgs.numel() // imgs.shape[1]         # number of values per channel in this batch
n = float(n)
mu = mu / n                 # per-channel mean
sig = sig / n - (mu ** 2)   # per-channel variance, E[x^2] - E[x]^2
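Once mu (per-channel mean) and sig (per-channel variance) are computed, one way to use them, sketched against the trainset definition from the question, is:

normalize = transforms.Compose([
    transforms.ToTensor(),
    # Normalize expects a standard deviation, not a variance, hence the square root
    transforms.Normalize(mean=mu.tolist(), std=torch.sqrt(sig).tolist())
])
trainset = torchvision.datasets.ImageFolder(root='C:/Users/beomseokpark/Desktop/CNN/train_data',
                                            transform=normalize)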

import torch as t

batch_size = 8
imgs = t.empty(batch_size, 3, 128, 128).normal_()

# move channels to dim 0, flatten everything else, then reduce over dim 1
t.nn.Flatten(start_dim=1)(imgs.permute(1, 0, 2, 3)).mean(dim=1)   # per-channel mean
t.nn.Flatten(start_dim=1)(imgs.permute(1, 0, 2, 3)).std(dim=1).shape
# torch.Size([3])
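On recent PyTorch versions the same per-channel statistics can also be obtained without Flatten by reducing over the batch and spatial dimensions directly, which should give identical results:

imgs.mean(dim=(0, 2, 3))   # per-channel mean, shape (3,)
imgs.std(dim=(0, 2, 3))    # per-channel std, shape (3,)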

Related

RuntimeError: Given groups=1, weight of size [6, 3, 3, 3], expected input[4, 224, 3, 224] to have 3 channels, but got 224 channels instead

I'm trying to code a CNN that recognizes whether a person is wearing a mask, and what type of mask they are wearing, from a portrait of themselves. My training dataset contains about 1500 photos across all the needed classes, with a balanced number of pictures per class, and my test dataset contains about 450. I coded the training data loader and the tester, but I'm getting this error:
Traceback (most recent call last):
  File "C:\Users\Chris\PycharmProjects\Project1\main.py", line 128, in <module>
    outputs = model(images)
RuntimeError: Given groups=1, weight of size [6, 3, 3, 3], expected input[4, 224, 3, 224] to have 3 channels, but got 224 channels instead
This is my code:
import os
import random
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from matplotlib import pyplot as plt
from tqdm import tqdm

# Hyper-parameters
num_epochs = 3
batch_size = 4
learning_rate = 0.001

# Device will determine whether to run the training on GPU or CPU.
use_cuda = torch.cuda.is_available()
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if __name__ == '__main__':
    # Use transforms.Compose to reformat images for modeling.
    # The dataset has PILImage images of range [0, 1]; we transform them to Tensors of normalized range [-1, 1].
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # give paths to train and test datasets
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    test_dataset = torchvision.datasets.ImageFolder(root=ROOT_DIR + "/Face_Mask_Dataset/Test/", transform=transform)

    classes = ('WithoutMask', 'Clothmask', 'Surgicalmask', 'N95Mask')
    imgSize = 255
    train_data = []
    test_data = []

    def imshow(img):
        img = img / 2 + 0.5  # unnormalize
        npimg = img.numpy()
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    def create_training_data():
        counter = 0
        rnd = random.randrange(0, 1001)
        for category in classes:  # cycle through categories
            path = os.path.join(ROOT_DIR + "/Face_Mask_Dataset/Train/", category)  # create path to categories
            class_num = classes.index(category)  # get the classification by index per category
            for img in tqdm(os.listdir(path)):  # iterate over each image per category
                try:
                    img_array = cv2.imread(os.path.join(path, img))  # convert to array
                    new_array = cv2.resize(img_array, (imgSize, imgSize))  # resize to normalize data size
                    train_data.append([new_array, class_num])  # add this to our training_data
                    counter += 1
                    if counter == rnd:
                        plt.imshow(new_array, cmap='gray')  # graph it
                        plt.show()
                except Exception as e:
                    pass

    create_training_data()
    train_dataset = train_data
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    class CNN(nn.Module):
        def __init__(self):
            super(CNN, self).__init__()
            self.conv1 = nn.Conv2d(3, 6, 3)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.fc1 = nn.Linear(59536, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            x = x.permute(0, 3, 1, 2)
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = x.view(x.size(0), -1)
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    model = CNN()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    optimizer.param_groups
    criterion = nn.CrossEntropyLoss()

    n_total_steps = len(train_loader)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            # origin shape: [6, 3, 5, 5] = 6, 3, 25
            # input_layer: 3 input channels, 6 output channels, 5 kernel size
            images = images.to(device)
            labels = labels.to(device)
            # Forward pass
            images = images.float()
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()
            torch.autograd.set_detect_anomaly(True)
            loss.backward(retain_graph=True)
            optimizer.step()
            if (i + 1) % 2000 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{n_total_steps}], Loss: {loss.item():.4f}')

    print('Finished Training')
    PATH = './cnn.pth'
    torch.save(model.state_dict(), PATH)

    with torch.no_grad():
        n_correct = 0
        n_samples = 0
        n_class_correct = [0 for i in range(10)]
        n_class_samples = [0 for i in range(10)]
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # max returns (value, index)
            _, predicted = torch.max(outputs, 1)
            n_samples += labels.size(0)
            n_correct += (predicted == labels).sum().item()
            for i in range(batch_size):
                label = labels[i]
                pred = predicted[i]
                if label == pred:
                    n_class_correct[label] += 1
                n_class_samples[label] += 1

        acc = 100.0 * n_correct / n_samples
        print(f'Accuracy of the network: {acc} %')
        for i in range(10):
            acc = 100.0 * n_class_correct[i] / n_class_samples[i]
            print(f'Accuracy of {classes[i]}: {acc} %')
Forgive me for any unclarities in the code; I'm still a beginner and trying my best.
It looks like the problem is that you use torchvision.datasets.ImageFolder for test_loader, while in your forward function you permute the channel order with
x = x.permute(0, 3, 1, 2)
ImageFolder already returns the images in the right order (N, C, H, W), and the permute then changes it to (N, W, C, H).
You need to change the order in create_training_data() to (C, H, W) instead, and then you can remove the x.permute(0, 3, 1, 2) from forward.
example:
def create_training_data():
    counter = 0
    rnd = random.randrange(0, 1001)
    for category in classes:  # cycle through categories
        path = os.path.join(ROOT_DIR + "/Face_Mask_Dataset/Train/", category)  # create path to categories
        class_num = classes.index(category)  # get the classification by index per category
        for img in tqdm(os.listdir(path)):  # iterate over each image per category
            try:
                img_array = cv2.imread(os.path.join(path, img))  # convert to array
                new_array = cv2.resize(img_array, (imgSize, imgSize))  # resize to normalize data size
                new_array = np.transpose(new_array, (2, 0, 1))
                train_data.append([new_array, class_num])  # add this to our training_data
                counter += 1
                if counter == rnd:
                    plt.imshow(new_array, cmap='gray')  # graph it
                    plt.show()
            except Exception as e:
                pass

def forward(self, x):
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    x = x.view(x.size(0), -1)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x
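An alternative worth mentioning (not part of the original answer) is to build the training set with torchvision.datasets.ImageFolder as well, exactly like test_dataset; then both loaders already yield (N, C, H, W) float tensors and no manual transposing or permute is needed. A sketch, assuming the Train folder contains one sub-directory per class:

train_dataset = torchvision.datasets.ImageFolder(root=ROOT_DIR + "/Face_Mask_Dataset/Train/",
                                                 transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# forward() can then drop the x.permute(0, 3, 1, 2) line entirely

Keep in mind that ImageFolder assigns class indices by sorted folder name, so the classes tuple would have to be listed in that same order.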

Why am I getting precision and recall of zero in an ANFIS model using TensorFlow in Python

I have built an ANFIS model with TensorFlow for a classification problem. For every epoch I am getting precision and recall of zero. I am using a Gaussian membership function, but when I print sigma it gives 0. I used the code below for training:
## settings
n = X_train.shape[1]  # no of input features
m = 2 * n             # number of fuzzy rules
learning_rate = 0.01
epochs = 1000

################################ train
X_train_t = tf.placeholder(tf.float32, shape=[None, n])  # Train input
y_train_t = tf.placeholder(tf.float32, shape=None)       # Train output

mu = tf.get_variable(name="mu", shape=[m * n], initializer=tf.random_normal_initializer(0, 1))       # mean of Gaussian MFs
sigma = tf.get_variable(name="sigma", shape=[m * n], initializer=tf.random_normal_initializer(0, 1))  # std_dev of Gaussian MFs
w = tf.get_variable(name="w", shape=[1, m], initializer=tf.random_normal_initializer(0, 1))

rula = tf.reduce_prod(
    tf.reshape(tf.exp(-0.5 * ((tf.tile(X_train_t, (1, m)) - mu) ** 2) / (sigma ** 2)),
               (-1, m, n)), axis=2)  # rule activations
Y_train_t = tf.reduce_sum(rula * w, axis=1) / tf.clip_by_value(tf.reduce_sum(rula, axis=1), 1e-8, 1e8)

#loss = tf.losses.log_loss(y_train, Y_train)  # loss function
loss = tf.losses.sigmoid_cross_entropy(y_train_t, Y_train_t)  # loss function
#loss = tf.sqrt(tf.losses.mean_squared_error(y_train, Y_train))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)  # optimizer

################################ test
X_test_t = tf.placeholder(tf.float32, shape=[None, n])  # Test input
y_test_t = tf.placeholder(tf.float32, shape=None)       # Test output

rula_test = tf.reduce_prod(
    tf.reshape(tf.exp(-0.5 * ((tf.tile(X_test_t, (1, m)) - mu) ** 2) / (sigma ** 2)),
               (-1, m, n)), axis=2)  # rule activations
Y_test_t = tf.reduce_sum(rula_test * w, axis=1) / tf.clip_by_value(tf.reduce_sum(rula_test, axis=1), 1e-8, 1e8)
loss_test = tf.losses.sigmoid_cross_entropy(y_test_t, Y_test_t)  # loss function

################################ start session
x_axis = []
tr_loss, te_loss = [], []
tr_prec, te_prec = [], []
tr_rec, te_rec = [], []

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for e in range(epochs):
        Y_train, loss_tr, _ = sess.run([Y_train_t, loss, optimizer],
                                       feed_dict={X_train_t: X_train, y_train_t: y_train})
        Y_test, loss_te = sess.run([Y_test_t, loss_test],
                                   feed_dict={X_test_t: X_test, y_test_t: y_test})
        if (e + 1) % 10 == 0:
            x_axis.append(e + 1)
            tr_loss.append(loss_tr)
            te_loss.append(loss_te)

            Y_train = np.where(Y_train > 0, 1, 0)
            Y_test = np.where(Y_test > 0, 1, 0)

            prec_tr = precision_score(y_train, Y_train)
            prec_te = precision_score(y_test, Y_test)
            rec_tr = recall_score(y_train, Y_train)
            rec_te = recall_score(y_test, Y_test)

            tr_prec.append(prec_tr)
            te_prec.append(prec_te)
            tr_rec.append(rec_tr)
            te_rec.append(rec_te)
The code is referenced from https://github.com/subhalingamd/ANFIS-diabetes-prediction/blob/main/main.py
I am new to this algorithm. Please help me figure out where I went wrong.
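One detail that may be worth checking (an observation based on the posted code, not a confirmed fix): sigma is initialised from a standard normal, so some entries can start at or near zero, and the Gaussian membership exp(-0.5 * (x - mu)^2 / sigma^2) then degenerates and kills the rule activations. A small sketch of one way to keep sigma bounded away from zero:

# hypothetical guard: use a strictly positive, clipped sigma in the membership function
sigma_safe = tf.clip_by_value(tf.abs(sigma), 1e-2, 1e2)
rula = tf.reduce_prod(
    tf.reshape(tf.exp(-0.5 * ((tf.tile(X_train_t, (1, m)) - mu) ** 2) / (sigma_safe ** 2)),
               (-1, m, n)), axis=2)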

I get a tensor of 600 values instead of 3 values for mean and std of train_loader in PyTorch

I am trying to normalize my image data, and for that I need to find the mean and std for train_loader.
mean = 0.0
std = 0.0
nb_samples = 0.0
for data in train_loader:
    images, landmarks = data["image"], data["landmarks"]
    batch_samples = images.size(0)
    images_data = images.view(batch_samples, images.size(1), -1)
    mean += torch.Tensor.float(images_data).mean(2).sum(0)
    std += torch.Tensor.float(images_data).std(2).sum(0)
    ###mean += images_data.mean(2).sum(0)
    ###std += images_data.std(2).sum(0)
    nb_samples += batch_samples
mean /= nb_samples
std /= nb_samples
The mean and std here each come out as torch.Size([600]).
When I tried (almost) the same code on dataloader, it worked as expected:
# code from https://discuss.pytorch.org/t/about-normalization-using-pre-trained-vgg16-networks/23560/6?u=mona_jalal
mean = 0.0
std = 0.0
nb_samples = 0.0
for data in dataloader:
    images, landmarks = data["image"], data["landmarks"]
    batch_samples = images.size(0)
    images_data = images.view(batch_samples, images.size(1), -1)
    mean += images_data.mean(2).sum(0)
    std += images_data.std(2).sum(0)
    nb_samples += batch_samples
mean /= nb_samples
std /= nb_samples
and I got:
mean is: tensor([0.4192, 0.4195, 0.4195], dtype=torch.float64), std is: tensor([0.1182, 0.1184, 0.1186], dtype=torch.float64)
So my dataloader is:
class MothLandmarksDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        img_name = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0])
        image = io.imread(img_name)
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks])
        landmarks = landmarks.astype('float').reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}
        if self.transform:
            sample = self.transform(sample)
        return sample

transformed_dataset = MothLandmarksDataset(csv_file='moth_gt.csv',
                                           root_dir='.',
                                           transform=transforms.Compose([
                                               Rescale(256),
                                               RandomCrop(224),
                                               ToTensor()
                                           ]))

dataloader = DataLoader(transformed_dataset, batch_size=3,
                        shuffle=True, num_workers=4)
and train_loader is:
# Device configuration
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
# split the dataset into validation and test sets
len_valid_set = int(0.1*len(dataset))
len_train_set = len(dataset) - len_valid_set
print("The length of Train set is {}".format(len_train_set))
print("The length of Test set is {}".format(len_valid_set))
train_dataset , valid_dataset, = torch.utils.data.random_split(dataset , [len_train_set, len_valid_set])
# shuffle and batch the datasets
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4)
test_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=True, num_workers=4)
Please let me know if more information is needed.
I basically need to get 3 values for mean of train_loader and 3 values for std of train_loader to use as args for Normalize.
images_data in dataloader is torch.Size([3, 3, 50176]) inside the loop and images_data in train_loader is torch.Size([8, 600, 2400])
First, the weird shape you get for your mean and std ([600]) is unsurprising: it comes from your data having the shape [8, 600, 800, 3]. Basically, the channel dimension is the last one here, so when you try to flatten your images with
# (N, 600, 800, 3) -> [view] -> (N, 600, 2400 = 800*3)
images_data = images.view(batch_samples, images.size(1), -1)
you actually perform an odd operation that fuses together the width and channel dimensions of your image, which is now [8, 600, 2400]. Thus, applying
# (8, 600, 2400) -> [mean(2)] -> (8, 600) -> [sum(0)] -> (600)
data.mean(2).sum(0)
creates a tensor of size [600], which is indeed what you get.
There are two quite simple solutions:
Either you start by permuting the dimensions to make the 2nd dimension the channel one:
batch_samples = images.size(0)
# (N, H, W, C) -> (N, C, H, W)
reordered = images.permute(0, 3, 1, 2)
# flatten image into (N, C, H*W)
images_data = reordered.view(batch_samples, reordered.size(1), -1)
# mean is now (C) = (3)
mean += images_data.mean(2).sum(0)
Or you change the axis along which to apply mean and sum:
batch_samples = images.size(0)
# flatten image into (N, H*W, C), careful this is not what you did
images_data = images.view(batch_samples, -1, images.size(3))
# mean is now (C) = (3)
mean += images_data.mean(1).sum(0)
Finally, why did dataloader and train_loader behave differently? I think it's because one is using dataset while the other is using transformed_dataset. In transformed_dataset you apply the ToTensor transform, which casts the image into a torch tensor and permutes its dimensions in the process (putting the channels in the second dimension). In other words, your two datasets simply do not yield images in an identical format; they differ by a permutation of the axes.
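You can verify that channel reordering with torchvision's standard ToTensor (the custom ToTensor from the landmarks tutorial typically performs the same transpose); a tiny standalone check with an illustrative shape:

import numpy as np
from torchvision import transforms

hwc = np.zeros((600, 800, 3), dtype=np.uint8)  # (H, W, C), as io.imread/cv2 return it
chw = transforms.ToTensor()(hwc)
print(chw.shape)  # torch.Size([3, 600, 800]) -- channels moved to the front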

Denoising linear autoencoder learns to output a constant instead of denoising

I am trying to create a denoising autoencoder for 1D cyclic signals like cos(x), etc.
The process of creating the dataset is that I pass a list of cyclic functions, and for each generated example I roll random coefficients for each function in the list, so every generated signal is different yet cyclic, e.g. 0.856cos(x) - 1.3cos(0.1x).
Then I add noise and normalize the signal to be between [0, 1).
Next, I train my autoencoder on it, but it learns to output a constant (usually 0.5). My guess is that this happens because 0.5 is the usual mean value of the normalized functions, but this is not the result I am aspiring to get at all.
I am providing the code I wrote for the autoencoder, the data generator and the training loop, as well as two pictures depicting the problem I am having.
First example: [image omitted]
Second example: [image omitted]
Linear autoencoder:
class LinAutoencoder(nn.Module):
    def __init__(self, in_channels, K, B, z_dim, out_channels):
        super(LinAutoencoder, self).__init__()
        self.in_channels = in_channels
        self.K = K  # number of samples per 2pi interval
        self.B = B  # how many intervals
        self.out_channels = out_channels

        encoder_layers = []
        decoder_layers = []

        encoder_layers += [
            nn.Linear(in_channels * K * B, 2 * z_dim, bias=True),
            nn.ReLU(),
            nn.Linear(2 * z_dim, z_dim, bias=True),
            nn.ReLU(),
            nn.Linear(z_dim, z_dim, bias=True),
            nn.ReLU()
        ]

        decoder_layers += [
            nn.Linear(z_dim, z_dim, bias=True),
            nn.ReLU(),
            nn.Linear(z_dim, 2 * z_dim, bias=True),
            nn.ReLU(),
            nn.Linear(2 * z_dim, out_channels * K * B, bias=True),
            nn.Tanh()
        ]

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        batch_size = x.shape[0]
        x_flat = torch.flatten(x, start_dim=1)
        enc = self.encoder(x_flat)
        dec = self.decoder(enc)
        res = dec.view((batch_size, self.out_channels, self.K * self.B))
        return res
The data generator:
def lincomb_generate_data(batch_size, intervals, sample_length, functions, noise_type="gaussian", **kwargs) -> torch.tensor:
    channels = 1
    mul_term = 2 * np.pi / sample_length
    positions = np.arange(0, sample_length * intervals)
    x_axis = positions * mul_term
    X = np.tile(x_axis, (channels, 1))
    y = X
    Y = np.repeat(y[np.newaxis, :], batch_size, axis=0)

    if noise_type == "gaussian":
        # defaults to 0, 0.4
        noise_mean = kwargs.get("noise_mean", 0)
        noise_std = kwargs.get("noise_std", 0.4)
        noise = np.random.normal(noise_mean, noise_std, Y.shape)
    if noise_type == "uniform":
        # defaults to 0, 1
        noise_low = kwargs.get("noise_low", 0)
        noise_high = kwargs.get("noise_high", 1)
        noise = np.random.uniform(noise_low, noise_high, Y.shape)

    coef_lo = -2
    coef_hi = 2
    coef_mat = np.random.uniform(coef_lo, coef_hi, (batch_size, len(functions)))  # creating a matrix of coefficients
    coef_mat = np.where(np.abs(coef_mat) < 10**-1, 0, coef_mat)

    for i in range(batch_size):
        curr_res = np.zeros((channels, sample_length * intervals))
        for func_id, function in enumerate(functions):
            curr_func = functions[func_id]
            curr_coef = coef_mat[i][func_id]
            curr_res += curr_coef * curr_func(Y[i, :, :])
        Y[i, :, :] = curr_res

    clean = Y
    noisy = clean + noise

    # Normalizing
    clean -= clean.min(axis=2, keepdims=2)
    clean /= clean.max(axis=2, keepdims=2) + 1e-5  # avoiding zero division
    noisy -= noisy.min(axis=2, keepdims=2)
    noisy /= noisy.max(axis=2, keepdims=2) + 1e-5  # avoiding zero division

    clean = torch.from_numpy(clean)
    noisy = torch.from_numpy(noisy)

    return x_axis, clean, noisy
Training loop:
functions = [lambda x: np.cos(0.1 * x),
             lambda x: np.cos(x),
             lambda x: np.cos(3 * x)]

num_epochs = 200
lin_loss_list = []

criterion = torch.nn.MSELoss()
lin_optimizer = torch.optim.SGD(lin_model.parameters(), lr=0.01, momentum=0.9)

_, val_clean, val_noisy = util.lincomb_generate_data(batch_size, B, K, functions, noise_type="gaussian")

print("STARTED TRAINING")
for epoch in range(num_epochs):
    # generate data returns the x-axis used for plotting as well as the clean and noisy data
    _, t_clean, t_noisy = util.lincomb_generate_data(batch_size, B, K, functions, noise_type="gaussian")

    # ===================forward=====================
    lin_output = lin_model(t_noisy.float())
    lin_loss = criterion(lin_output.float(), t_clean.float())
    lin_loss_list.append(lin_loss.data)

    # ===================backward====================
    lin_optimizer.zero_grad()
    lin_loss.backward()
    lin_optimizer.step()

    val_lin_loss = F.mse_loss(lin_model(val_noisy.float()), val_clean.float())

print("DONE TRAINING")
Edit: these are the parameters that were requested:
L = 1
K = 512
B = 2
batch_size = 64
z_dim = 64
noise_mean = 0
noise_std = 0.4
The problem was that I didn't use nn.BatchNorm1d in my model, so I guess something went wrong during training (probably vanishing gradients).
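For reference, a sketch of what that fix might look like in the encoder (the exact placement of the nn.BatchNorm1d layers is a guess, not the author's final code; the decoder would be extended the same way):

encoder_layers += [
    nn.Linear(in_channels * K * B, 2 * z_dim, bias=True),
    nn.BatchNorm1d(2 * z_dim),
    nn.ReLU(),
    nn.Linear(2 * z_dim, z_dim, bias=True),
    nn.BatchNorm1d(z_dim),
    nn.ReLU(),
    nn.Linear(z_dim, z_dim, bias=True),
    nn.ReLU()
]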

'None' gradients in pytorch

I am trying to implement a simple MDN that predicts the parameters of a distribution over a target variable instead of a point value, and then assigns probabilities to discrete bins of the point value. Narrowing down the issue, the code from which the 'None' springs is:
import numpy as np
import torch

# params
tte_bins = np.linspace(
    start=0,
    stop=399,
    num=400,
    dtype='float32'
).reshape(1, 1, -1)
bins = torch.tensor(tte_bins, dtype=torch.float32)
x_train = np.random.randn(1, 1024, 3)
y_labels = np.random.randint(low=0, high=399, size=(1, 1024))
y_train = np.eye(400)[y_labels]

# data
in_train = torch.tensor(x_train[0:1, :, :], dtype=torch.float)
in_train = (in_train - torch.mean(in_train)) / torch.std(in_train)
out_train = torch.tensor(y_train[0:1, :, :], dtype=torch.float)

# model
linear = torch.nn.Linear(in_features=3, out_features=2)
lin = linear(in_train)
preds = torch.exp(lin)

# intermediate values
alpha = torch.clamp(preds[0:1, :, 0:1], 0, 500)
beta = torch.clamp(preds[0:1, :, 1:2], 0, 100)

# probs
p1 = torch.exp(-torch.pow(bins / alpha, beta))
p2 = torch.exp(-torch.pow((bins + 1.0) / alpha, beta))
probs = p1 - p2

# loss
loss = torch.mean(torch.pow(out_train - probs, 2))

# gradients
loss.backward()
for p in linear.parameters():
    print(p.grad, 'gradient')
in_train has shape [1, 1024, 3], out_train has shape [1, 1024, 400], and bins has shape [1, 1, 400]. All the broadcasting etc. appears fine, and the resulting tensors (like alpha/beta/loss) have the right shapes and values - there are simply no gradients.
Edit: added loss.backward() and x_train/y_train; now I get NaNs.
You simply forgot to compute the gradients: while you calculate the loss, you never tell PyTorch with respect to which function it should calculate them.
Simply adding
loss.backward()
to your code should fix the problem.
Additionally, in your code some intermediate results like alpha are sometimes zero but appear in a denominator when computing the gradient. This will lead to the NaN results you observed.
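A minimal guard against that, applied to the clamping step from the question (the lower bound 1e-6 is an arbitrary small constant):

# keep alpha (and beta) strictly positive so bins / alpha and its gradient stay finite
alpha = torch.clamp(preds[0:1, :, 0:1], 1e-6, 500)
beta = torch.clamp(preds[0:1, :, 1:2], 1e-6, 100)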
