I am making a simple PyTorch neural net to approximate the sine function on x ∈ [0, 2π]. This is a simple architecture I use with different deep learning libraries to test whether I understand how to use them. The net, when untrained, always produces a straight horizontal line, and when trained, produces a straight line at y = 0. In general, it always produces a straight line at y = (the mean of the function). This leads me to believe something is wrong with the forward-prop portion of it, as the output should not just be a straight line when untrained. Here is the code for the net:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(1, 20),
            nn.Sigmoid(),
            nn.Linear(20, 50),
            nn.Sigmoid(),
            nn.Linear(50, 50),
            nn.Sigmoid(),
            nn.Linear(50, 1)
        )

    def forward(self, x):
        x = self.model(x)
        return x
Here is the training loop
def train(net, trainloader, valloader, learningrate, n_epochs):
    net = net.train()
    loss = nn.MSELoss()
    optimizer = torch.optim.SGD(net.parameters(), lr = learningrate)

    for epoch in range(n_epochs):
        for X, y in trainloader:
            X = X.reshape(-1, 1)
            y = y.view(-1, 1)
            optimizer.zero_grad()
            outputs = net(X)
            error = loss(outputs, y)
            error.backward()
            #net.parameters() net.parameters() * learningrate
            optimizer.step()

        total_loss = 0
        for X, y in valloader:
            X = X.reshape(-1, 1).float()
            y = y.view(-1, 1)
            outputs = net(X)
            error = loss(outputs, y)
            total_loss += error.data
        print('Val loss for epoch', epoch, 'is', total_loss / len(valloader))
It is called as:
net = Net()
losslist = train(net, trainloader, valloader, .0001, n_epochs = 4)
Where trainloader and valloader are the training and validation loaders. Can anyone help me see what's wrong with this? I know it's not the learning rate, since it's the one I use in other frameworks, and I know it's not the fact that I'm using SGD or sigmoid activation functions, although I have a suspicion the error is in the activation functions somewhere.
Does anyone know how to fix this? Thanks.
After a while playing with some hyperparameters, modifying the net, and changing the optimizer (following this excellent recipe), I ended up changing the line optimizer = torch.optim.SGD(net.parameters(), lr = learningrate) to optimizer = torch.optim.Adam(net.parameters()) (the default optimizer parameters were used), running for 100 epochs with a batch size of 1.
The following code was used (tested on CPU only):
import torch
import torch.nn as nn
from torch.utils import data
import numpy as np
import matplotlib.pyplot as plt
# for reproducibility
torch.manual_seed(0)
np.random.seed(0)
class Dataset(data.Dataset):
    def __init__(self, init, end, n):
        self.n = n
        self.x = np.random.rand(self.n, 1) * (end - init) + init
        self.y = np.sin(self.x)

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        x = self.x[idx, np.newaxis]
        y = self.y[idx, np.newaxis]
        return torch.Tensor(x), torch.Tensor(y)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(1, 20),
            nn.Sigmoid(),
            nn.Linear(20, 50),
            nn.Sigmoid(),
            nn.Linear(50, 50),
            nn.Sigmoid(),
            nn.Linear(50, 1)
        )

    def forward(self, x):
        x = self.model(x)
        return x
def train(net, trainloader, valloader, n_epochs):
    loss = nn.MSELoss()
    # Switch the two following lines and run the code
    # optimizer = torch.optim.SGD(net.parameters(), lr = 0.0001)
    optimizer = torch.optim.Adam(net.parameters())

    for epoch in range(n_epochs):
        net.train()
        for x, y in trainloader:
            optimizer.zero_grad()
            outputs = net(x).view(-1)
            error = loss(outputs, y)
            error.backward()
            optimizer.step()

        net.eval()
        total_loss = 0
        for x, y in valloader:
            outputs = net(x)
            error = loss(outputs, y)
            total_loss += error.data
        print('Val loss for epoch', epoch, 'is', total_loss / len(valloader))

    net.eval()
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    def plot_result(ax, dataloader):
        out, xx, yy = [], [], []
        for x, y in dataloader:
            out.append(net(x))
            xx.append(x)
            yy.append(y)
        out = torch.cat(out, dim=0).detach().numpy().reshape(-1)
        xx = torch.cat(xx, dim=0).numpy().reshape(-1)
        yy = torch.cat(yy, dim=0).numpy().reshape(-1)
        ax.scatter(xx, yy, facecolor='green')
        ax.scatter(xx, out, facecolor='red')
        xx = np.linspace(0.0, 3.14159*2, 1000)
        ax.plot(xx, np.sin(xx), color='green')

    plot_result(ax1, trainloader)
    plot_result(ax2, valloader)
    plt.show()
train_dataset = Dataset(0.0, 3.14159*2, 100)
val_dataset = Dataset(0.0, 3.14159*2, 30)

params = {'batch_size': 1,
          'shuffle': True,
          'num_workers': 4}

trainloader = data.DataLoader(train_dataset, **params)
valloader = data.DataLoader(val_dataset, **params)

net = Net()
losslist = train(net, trainloader, valloader, n_epochs = 100)
Result with Adam optimizer:
Result with SGD optimizer:
In general, it always produces a straight line at y = (The mean of the function).
Usually, this means that the NN has only successfully trained the final layer so far. You need to train it for longer or with a better optimizer, as ViniciusArruda shows here.
Edit: To explain further: when only the final layer has been trained, the NN is effectively trying to guess the output y with no knowledge of the input X. In that case, the best guess it can make is the mean value of the targets, because that minimizes its MSE loss.
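As a quick numerical illustration (my own sketch, not part of the original answer), you can check that a constant prediction minimizes MSE exactly at the mean of the targets, which for a full sine period is roughly 0:

import torch

# Hypothetical targets: sin(x) sampled on [0, 2*pi]
y = torch.sin(torch.linspace(0, 2 * 3.14159, 100))

# MSE of a constant prediction c, for a few candidate constants
for c in [-0.5, 0.0, float(y.mean()), 0.5]:
    mse = torch.mean((y - c) ** 2)
    print(f'constant {c:+.3f} -> MSE {mse.item():.4f}')
# The smallest MSE occurs at c = y.mean(), which is why an undertrained net
# that effectively ignores x settles on a flat line at the target mean.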
Related
I'm trying to fit a small dataset (just 7×1 in size) with a 3-layer perceptron model, but the loss won't converge. I'm new to machine learning; can someone please give me a hint on how to adjust my code?
import torch
import torch.nn as nn
import torch.nn.functional as F
vec_shape = [7, 1]
x_0 = [500, 1000, 2000, 4000, 5000, 8000, 10000]
y_0 = [1.171467, 1.486507, 11.7738, 34.448421, 75.402871, 225.319848, 492.262426]
# x = torch.tensor(x_0).reshape(vec_shape).float()
x = torch.log(torch.tensor(x_0).reshape(vec_shape).float())
y = torch.tensor(y_0).reshape(vec_shape).float()
class Net(nn.Module):
    def __init__(self, n_input, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden1 = nn.Linear(n_input, n_hidden)
        self.hidden2 = nn.Linear(n_hidden, n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)

    def forward(self, input):
        out = self.hidden1(input)
        out = F.relu(out)
        out = self.hidden2(out)
        out = torch.sigmoid(out)
        out = self.predict(out)
        return out

    def weight_init(self):
        for op in self.modules():
            if isinstance(op, nn.Linear):
                nn.init.normal_(op.weight.data)
                nn.init.normal_(op.bias.data)
net = Net(1,10,1)
net.weight_init()
# print(net)
optimizer = torch.optim.SGD(net.parameters(),lr = 0.1)
loss_func = torch.nn.MSELoss()
for t in range(500):
    prediction = net(x)
    loss = loss_func(prediction, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if(t%50 == 0):
        print('Loss = %.4f' % loss.data)
I tried expanding the model and shrinking it, but neither change works.
Rescaling and normalization are key in machine learning. Your setup is pretty good and you already apply some rescaling, but it is simply not enough: with the limited number of data points you have, the target range is far too large. So, just like you do with x_0, apply torch.log to y_0. You can always scale the predictions back after training. Below you can find the adapted code; I changed the following:
torch.log on y_0
Learning rate to 0.01
Number of iterations to 50000
Added a print statement to show rescaling of predictions
import torch
import torch.nn as nn
import torch.nn.functional as F
vec_shape = [7, 1]
x_0 = [500, 1000, 2000, 4000, 5000, 8000, 10000]
y_0 = [1.171467, 1.486507, 11.7738, 34.448421, 75.402871, 225.319848, 492.262426]
# x = torch.tensor(x_0).reshape(vec_shape).float()
x = torch.log(torch.tensor(x_0).reshape(vec_shape).float())
y = torch.log(torch.tensor(y_0).reshape(vec_shape).float()) # modified
class Net(nn.Module):
    def __init__(self, n_input, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden1 = nn.Linear(n_input, n_hidden)
        self.hidden2 = nn.Linear(n_hidden, n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)

    def forward(self, input):
        out = self.hidden1(input)
        out = F.relu(out)
        out = self.hidden2(out)
        out = torch.sigmoid(out)
        out = self.predict(out)
        return out

    def weight_init(self):
        for op in self.modules():
            if isinstance(op, nn.Linear):
                nn.init.normal_(op.weight.data)
                nn.init.normal_(op.bias.data)
net = Net(1,10,1)
net.weight_init()
# print(net)
optimizer = torch.optim.SGD(net.parameters(),lr = 0.01) # modified
loss_func = torch.nn.MSELoss()
for t in range(50000): # modified
    prediction = net(x)
    loss = loss_func(prediction, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if(t%50 == 0):
        print('Loss = %.4f' % loss.data)

print(torch.exp(net(x))) # added
I also recommend normalizing your dataset after the logarithmic rescaling, for instance by subtracting the mean and dividing by the standard deviation.
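A minimal sketch of that standardization (my addition, not part of the answer above; the variable names are placeholders and the statistics must be kept in order to undo the transform later):

y_mean, y_std = y.mean(), y.std()       # statistics of the log-scaled targets
y_norm = (y - y_mean) / y_std           # standardized targets used for training

# ... train on (x, y_norm) ...

# To get predictions back on the original scale: de-standardize, then invert the log
pred = net(x)
pred_original_scale = torch.exp(pred * y_std + y_mean)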
I ran into this weird behavior when trying to "manually" optimize a network's parameters via SGD. When attempting to update the model's parameters using the following way, it works just fine:
for _ in trange(epochs):
    for x, y in train_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        loss = F.cross_entropy(m(x), y)
        grad = torch.autograd.grad(loss, m.parameters())
        with torch.no_grad():
            for p, g in zip(m.parameters(), grad):
                p -= 0.1 * g
However, doing the following throws off the model completely:
for _ in trange(epochs):
    for x, y in train_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        loss = F.cross_entropy(m(x), y)
        loss.backward()
        with torch.no_grad():
            for p in m.parameters():
                p -= 0.1 * p.grad
But to me, both methods should be equivalent. Upon further inspection, when comparing the values of g from grad with the values of p.grad from m.parameters(), it turned out that the gradient values are not the same! I also tried removing with torch.no_grad(): and doing the following, but it didn't work either:
for p in m.parameters():
    p.data -= 0.1 * p.grad
Can somebody please explain why this is happening? Shouldn't the gradients in both methods have the same values (keeping in mind that both models m are identical)?
REPRODUCIBLE EXAMPLE:
Ensure reproducibility:
device = torch.device('cuda')
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.cuda.empty_cache()
Load the data:
T = transforms.ToTensor()
train_data = datasets.MNIST(root='data', transform=T, download=True)
test_data = datasets.MNIST(root='data', transform=T, train=False, download=True)
BS = 300
epochs = 5
LR = 0.1
train_loader = DataLoader(train_data, batch_size=BS, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=1000, pin_memory=True)
Define the model to be optimized:
class Model(nn.Module):
    def __init__(self, out_dims):
        super().__init__()
        self.conv1 = nn.Conv2d(1, out_dims, 3, stride=3, padding=1)
        self.conv2 = nn.Sequential(nn.Conv2d(out_dims, out_dims * 2, 3),
                                   nn.BatchNorm2d(out_dims * 2), nn.ReLU())
        self.conv3 = nn.Sequential(nn.Conv2d(out_dims * 2, out_dims * 4, 4, stride=2, padding=1),
                                   nn.BatchNorm2d(out_dims * 4), nn.ReLU(), nn.Flatten())
        self.fc = nn.Linear(out_dims * 4 * 16, 10)

    def forward(self, x):
        return nn.Sequential(*tuple(self.children()))(x)
m1 = Model(5).to(device)
m2 = deepcopy(m1) # "m2.load_state_dict(m1.state_dict())" doesn't work either
Training and evaluation:
# M1's training:
for _ in trange(epochs):
    for x, y in train_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        loss = F.cross_entropy(m1(x), y)
        grad = torch.autograd.grad(loss, m1.parameters())
        with torch.no_grad():
            for p, g in zip(m1.parameters(), grad):
                p -= LR * g

# M1's evaluation:
m1.eval()
acc1 = []
with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        _, pred = m1(x).max(1)
        acc1.append(metric(pred, y).item())
print(f'Accuracy: {np.mean(acc1) * 100:.4}%')
# M2's training:
for _ in trange(epochs):
    for x, y in train_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        loss = F.cross_entropy(m2(x), y)
        loss.backward()
        with torch.no_grad():
            for p in m2.parameters():
                p -= LR * p.grad

# M2's evaluation:
m2.eval()
acc2 = []
with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        _, pred = m2(x).max(1)
        acc2.append(metric(pred, y).item())
print(f'Accuracy: {np.mean(acc2) * 100:.4}%')
It took me a while to figure out, but the problem was in loss.backward(). Unlike autograd.grad(), which computes and returns the gradients, backward() computes and accumulates the gradients into the .grad attributes of the participating nodes in the computation graph. In other words, the two have the same effect when used to back-prop once, but every repetition of backward() adds the currently computed gradients to all previous ones (hence the divergence). Resetting the gradients with model.zero_grad() fixes the issue.
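In other words, M2's loop works once the gradients are cleared on every step; a sketch of the fix based on the code above:

# M2's training with the accumulation bug fixed:
for _ in trange(epochs):
    for x, y in train_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        m2.zero_grad()                      # clear gradients left over from the previous step
        loss = F.cross_entropy(m2(x), y)
        loss.backward()                     # p.grad now holds only this step's gradient
        with torch.no_grad():
            for p in m2.parameters():
                p -= LR * p.grad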
I have the following code:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import keras
from random import choice
import sys
devicet = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(devicet)
if devicet == 'cpu':
    print ('Using CPU')
else:
    print ('Using GPU')
    cuda0 = torch.device('cuda:0')
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.step1 = nn.Linear(5, 25)
        self.step2 = nn.Linear(25, 50)
        self.step3 = nn.Linear(50, 100)
        self.step4 = nn.Linear(100, 100)
        self.step5 = nn.Linear(100, 10)
        self.step6 = nn.Linear(10, 1)

    def forward(self, x):
        x = F.relu(x)
        x = self.step1(x)
        x = F.relu(x)
        x = self.step2(x)
        x = F.relu(x)
        x = self.step3(x)
        x = F.relu(x)
        x = self.step4(x)
        x = F.relu(x)
        x = self.step5(x)
        x = F.relu(x)
        x = self.step6(x)
        x = F.relu(x)
        return (x)
net = Net()
x = torch.rand(10,5)
num = choice(range(10))
zero_tensor = torch.zeros(num, 1)
one_tensor = torch.ones(10-num, 1)
y = torch.cat((zero_tensor,one_tensor),0)
x.to(devicet)
y.to(devicet)
learning_rate = 1e-3
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
loss_fn = torch.nn.BCELoss()
acc_list = []
for i in tqdm(range(1000), desc='Training'):
    y_pred = net(x)
    loss = loss_fn(y_pred, y)
    loss.backward()
    optimizer.step()
    acc_list.append(abs(net(x).detach().numpy()[0] - y.detach().numpy()[0]))
    with torch.no_grad():
        for param in net.parameters():
            param -= learning_rate * param.grad
    optimizer.zero_grad()

print('\nFinished training in {} epochs.'.format(len(acc_list)))
plt.plot(range(len(acc_list)), acc_list)
plt.show()

for i in range(10):
    print(str(net(x).detach().numpy()[i][0]) + ', ' + str(y.detach().numpy()[i][0]))
When I run this, it consistently just prints out the following:
[screenshot of the printed outputs omitted]
Why won't it do any training? It works if I use MSE loss (actually, it only works sometimes with MSE loss; sometimes it does the same thing as in the image). It's only when I use BCE that it stops working entirely.
Final layer activation
You are outputting only positive values; for BCELoss they should be between 0 and 1. These lines specifically:
x = F.relu(x)
return (x)
Use torch.sigmoid with BCELoss, or even better, just output x and use torch.nn.BCEWithLogitsLoss, which works on logits directly.
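For example, a sketch of the logits variant (my own illustration of the suggestion, not the asker's code):

# Final layer returns raw logits: no activation after self.step6
def forward(self, x):
    x = F.relu(self.step1(x))
    x = F.relu(self.step2(x))
    x = F.relu(self.step3(x))
    x = F.relu(self.step4(x))
    x = F.relu(self.step5(x))
    return self.step6(x)                  # logits, no ReLU/sigmoid here

loss_fn = torch.nn.BCEWithLogitsLoss()    # applies the sigmoid internally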
Training
You are using the Adam optimizer and also doing SGD manually here:
with torch.no_grad():
    for param in net.parameters():
        param -= learning_rate * param.grad
Essentially you are applying the optimization step twice, which is probably too much and might destroy the weights.
optimizer.step() already does this, no need for both!
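A sketch of the cleaned-up loop (my own illustration), keeping the optimizer step and dropping the manual update:

for i in tqdm(range(1000), desc='Training'):
    optimizer.zero_grad()
    y_pred = net(x)
    loss = loss_fn(y_pred, y)
    loss.backward()
    optimizer.step()    # the Adam update; no manual "param -= learning_rate * param.grad" afterwards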
Accuracy
This part:
abs(net(x).detach().numpy()[0]-y.detach().numpy()[0])
I assume you want to calculate accuracy; it would go like this (also, do not push the data through the network twice via net(x), you already have y_pred!):
# Assuming sigmoid activation
def accuracy(y_pred, y_true):
    # For logits use
    # predicted_labels = y_pred > 0.0
    predicted_labels = y_pred > 0.5
    return torch.mean((y_true == predicted_labels).float())
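A usage sketch (my addition, assuming the sigmoid-activated setup) inside the training loop, reusing y_pred instead of calling net(x) again:

acc = accuracy(y_pred.detach(), y)    # fraction of correctly predicted labels this step
acc_list.append(acc.item())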
I am new to TensorFlow 2 and was starting my learning curve with the following simple linear-regression model:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Make data
num_samples, w, b = 20, 0.5, 2
xs = np.asarray(range(num_samples))
ys = np.asarray([x*w + b + np.random.normal() for x in range(num_samples)])
xts = tf.convert_to_tensor(xs, dtype=tf.float32)
yts = tf.convert_to_tensor(xs, dtype=tf.float32)
plt.plot(xs, ys, 'ro')
class Linear(tf.keras.Model):
    def __init__(self, name='linear', **kwargs):
        super().__init__(name='linear', **kwargs)
        self.w = tf.Variable(0, True, name="w", dtype=tf.float32)
        self.b = tf.Variable(1, True, name="b", dtype=tf.float32)

    def call(self, inputs):
        return self.w * inputs + self.b
class Custom(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 20 == 0:
            preds = self.model.predict(xts)
            plt.plot(xs, preds, label='{} {:7.2f}'.format(epoch, logs['loss']))
            print('The average loss for epoch {} is {:7.2f}.'.format(epoch, logs['loss']))
x = tf.keras.Input(dtype=tf.float32, shape=[])
#model = tf.keras.Sequential([tf.keras.layers.Dense(units=1, input_shape=[1])])
model = Linear()
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='MSE')
model.fit(x=xts, y=yts, verbose=1, batch_size=4, epochs=250, callbacks=[Custom()])
plt.legend()
plt.show()
For a reason I don't understand, my model does not seem to fit the curve.
I also tried with keras.layers.Dense(1) and got exactly the same result.
Also, the results don't seem to correspond to a proper loss function: around epoch 120 the model should have a lower loss than at epoch 250.
Can you maybe help me understand what I am doing wrong?
Thanks a lot!
There is a small bug in your code: xts and yts are identical to each other, i.e. you wrote
xts = tf.convert_to_tensor(xs, dtype=tf.float32)
yts = tf.convert_to_tensor(xs, dtype=tf.float32)
instead of
xts = tf.convert_to_tensor(xs, dtype=tf.float32)
yts = tf.convert_to_tensor(ys, dtype=tf.float32)
which is why the loss doesn't make sense. Once this has been fixed, the results are as expected; see the plot below.
Hi, I am trying to make an NN model that satisfies the simple formula
y = X1^2 + X2^2
But when I use CrossEntropyLoss as the loss function, I get two different error messages.
First, when I set up the code like this:
x = torch.randn(batch_size, 2)
y_hat = model(x)
y = answer(x).long()
optimizer.zero_grad()
loss = loss_func(y_hat, y)
loss.backward()
optimizer.step()
I get this message:
RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed. at
c:\programdata\miniconda3\conda-bld\pytorch_1533090623466\work\aten\src\thnn\generic/ClassNLLCriterion.c:93
Second, when I change the code like this:
x = torch.randn(batch_size, 2)
y_hat = model(x)
y = answer(x).long().view(batch_size,1,1)
optimizer.zero_grad()
loss = loss_func(y_hat, y)
loss.backward()
optimizer.step()
then I get a message like:
RuntimeError: multi-target not supported at c:\programdata\miniconda3\conda-bld\pytorch_1533090623466\work\aten\src\thnn\generic/ClassNLLCriterion.c:21
How can I solve this problem? Thanks. (Sorry for my English.)
This is my code:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
def answer(x):
    y = x[:, 0].pow(2) + x[:, 1].pow(2)
    return y
class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(input_size, 10)
        self.linear2 = nn.Linear(10, 1)

    def forward(self, x):
        y = F.relu(self.linear1(x))
        y = F.relu(self.linear2(y))
        return y
model = Model(2,1)
print(model, '\n')
loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.001)
batch_size = 3
epoch_n = 100
iter_n = 100
for epoch in range(epoch_n):
    loss_avg = 0
    for i in range(iter_n):
        x = torch.randn(batch_size, 2)
        y_hat = model(x)
        y = answer(x).long().view(batch_size, 1, 1)

        optimizer.zero_grad()
        loss = loss_func(y_hat, y)
        loss.backward()
        optimizer.step()
        loss_avg += loss

    loss_avg = loss_avg / iter_n
    if epoch % 10 == 0:
        print(loss_avg)
    if loss_avg < 0.001:
        break
Can I create this dataset using a DataLoader in PyTorch? Thanks for your help.
You are using the wrong loss function. CrossEntropyLoss is generally used for classification problems, whereas your problem is one of regression. So you should use losses that are meant for regression-like tasks, such as Mean Squared Error Loss (MSELoss), L1 Loss, etc. Take a look at this, this, this and this.
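A minimal sketch of that change based on the code above (using MSELoss with float targets is my illustration, not the only option):

loss_func = nn.MSELoss()                       # regression loss instead of CrossEntropyLoss

x = torch.randn(batch_size, 2)
y_hat = model(x)                               # shape: (batch_size, 1)
y = answer(x).float().view(batch_size, 1)      # float targets with the same shape as y_hat

optimizer.zero_grad()
loss = loss_func(y_hat, y)
loss.backward()
optimizer.step()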