PyTorch parameters won't update with custom loss function - python

I am trying to use the optimizer to tune a set of parameters for a cost function that includes, among other things, a forward pass through a neural network. The parameters specify the means and variances of the weights of this neural network. However, when the parameters are updated at each iteration of the optimization, every term of the cost function except the one coming from the forward pass contributes to the parameter update. That is, if all the other terms are commented out, no parameters update at all. Is there any way to fix this issue?
EDIT: I added a contrived example below.
import torch

class TestNN(torch.nn.Module):
    def __init__(self):
        super(TestNN, self).__init__()
        self.fc1 = torch.nn.Linear(10, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x

    def getParameters(self):
        return [self.fc1.weight.transpose(0, 1), self.fc1.bias]

    def setParameters(self, parameters):
        # Can anything be done here to keep parameters in the graph?
        weight, bias = parameters
        self.fc1.weight = torch.nn.Parameter(weight.transpose(0, 1))
        self.fc1.bias = torch.nn.Parameter(bias)

def computeCost(parameters, input):
    testNN = TestNN()
    testNN.setParameters(parameters)
    cost = testNN(input) ** 2
    print(cost)  # Cost stays the same :(
    return cost

def minimizeLoss(maxIter, optimizer, lossFunc, lossFuncArgs):
    for i in range(maxIter):
        optimizer.zero_grad()
        loss = lossFunc(*lossFuncArgs)
        loss.backward(retain_graph=True)
        optimizer.step()
        if i % 100 == 0:
            print(loss)

input = torch.randn(1, 10)
weight = torch.ones(10, 1)
bias = torch.ones(1, 1)
parameters = (weight, bias)
lossArgs = (parameters, input)

optimizer = torch.optim.Adam(parameters, lr=0.01)
minimizeLoss(10, optimizer, computeCost, lossArgs)
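For what it's worth, here is a minimal sketch of one possible fix (my assumption, not part of the original post): keep the optimized tensors in the autograd graph by computing the forward pass directly from them, instead of re-wrapping them in torch.nn.Parameter inside setParameters, which creates fresh leaf tensors detached from the graph. The tensors also need requires_grad=True for the optimizer to accept them.
import torch

# Hypothetical fix: compute the cost directly from the optimized tensors
# so they stay in the autograd graph.
input = torch.randn(1, 10)
weight = torch.ones(10, 1, requires_grad=True)  # leaf tensors owned by the optimizer
bias = torch.ones(1, 1, requires_grad=True)

optimizer = torch.optim.Adam([weight, bias], lr=0.01)
for i in range(10):
    optimizer.zero_grad()
    cost = ((input @ weight + bias) ** 2).sum()  # same cost as computeCost above
    cost.backward()
    optimizer.step()
    print(cost.item())  # now decreases across iterations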

Related

Why is my neural network predicting -0 (PYTHON - backpropagation XOR)?

I'm working on developing a neural network from scratch. The issue seems to be with my ReLU backpropagation. When I train the model, it sometimes outputs -0 and sometimes outputs good (relatively speaking) predictions. Can someone tell me whether I'm doing my backpropagation incorrectly, or whether there's a reason why my ReLU would be predicting -0?
--
[edit]
Fixed the issue of predicting -0, but now it just predicts 0 for all inputs for the XOR. Can someone look over my backpropagation?
import numpy as np

# Each layer in our neural network
class NeuralLayer:
    def __init__(self, input_neurons, output_neurons):
        self.weights = np.random.randn(input_neurons, output_neurons) * np.sqrt(2. / input_neurons)
        self.bias = np.ones((1, output_neurons)) * 0.5

    # Two different activations, sigmoid by default
    def sigmoid(self, neurons):
        self.act = 1.0 / (1.0 + np.exp(-neurons))
        return self.act

    def sigmoidBackward(self, grad):
        return grad * self.act * (1 - self.act)

    def relu(self, neurons):
        self.act = (neurons > 0)
        return neurons * self.act

    def reluBackward(self, grad):
        return grad * self.act

    # Forward pass for this layer
    def forward(self, input, activation):
        self.input = np.atleast_2d(input)
        if activation == 'sigmoid':
            return self.sigmoid(input @ self.weights + self.bias)
        else:
            return self.relu(input @ self.weights + self.bias)

    # Backward pass for this layer
    def backward(self, grad, activation):
        if activation == 'sigmoid':
            grad = self.sigmoidBackward(np.atleast_2d(grad))
        else:
            grad = self.reluBackward(np.atleast_2d(grad))
        self.grad_weights = np.matmul(self.input.T, grad)
        self.grad_bias = grad.sum()
        return grad @ self.weights.T

    def step(self, step_size):
        self.weights -= step_size * self.grad_weights
        self.bias -= step_size * self.grad_bias

# Our neural net
class NeuralNetwork:
    # Dynamically create all layers
    def __init__(self, input_neurons, hidden_neurons, layer_count, activation, output_neurons=1):
        self.activation = activation
        # Used to ensure input neurons match inputted data
        self.neuron_safety = input_neurons
        assert layer_count >= 2 and output_neurons >= 1
        # Input layer
        self.layers = [NeuralLayer(input_neurons, hidden_neurons)]
        # Hidden layers
        for i in range(layer_count - 2):
            self.layers.append(NeuralLayer(hidden_neurons, hidden_neurons))
        # Output layer
        self.layers.append(NeuralLayer(hidden_neurons, output_neurons))

    # Forward pass through each layer
    def forward(self, inp):
        assert inp.shape[0] == self.neuron_safety
        for layer in self.layers:
            inp = layer.forward(inp, self.activation)
        return inp

    def backward(self, grad):
        for layer in reversed(self.layers):
            grad = layer.backward(grad, self.activation)

    def step(self, step_size=0.01):
        for layer in self.layers:
            layer.step(step_size)

    # Loss function - only 1 output neuron
    def meanSquaredError(self, preds, labels):
        self.labels = labels
        self.preds = preds
        return (self.preds - self.labels) ** 2

    def meanSquaredErrorGrad(self):
        return 2 * (self.preds - self.labels)

# Create a neural network with 2 inputs, 16 hidden neurons in each layer, and 4 layers
net = NeuralNetwork(2, 16, 4, 'relu')
epochs = 5000

# Input data (A, B) for XOR
X = np.array([[0, 0], [1, 1], [1, 0], [0, 1]])
# Expected output data
Y = np.array([[0], [0], [1], [1]])

for i in range(epochs):
    preds = []
    for idx, x in enumerate(X):
        predictions = net.forward(x)
        preds.append(predictions)
        loss = net.meanSquaredError(predictions, Y[idx])
        loss_grad = net.meanSquaredErrorGrad()
        net.backward(loss_grad)
        net.step()

print("Model predicted: {}\nactual values: {} ".format(preds, Y.T))
Output:
Model predicted: [array([[-0.]]), array([[-0.]]), array([[1.]]), array([[-0.]])]
actual values: [[0 0 1 1]]
Sometimes the predictions are perfect, but most of the time at least one prediction is -0.
The bias gradient is incorrect. You are using self.grad_bias = grad.sum(). This will compute the sum of the entire matrix. It needs to be self.grad_bias = grad.sum(axis=0, keepdims=True) to compute a 1 x output_neurons array that will properly update the bias vector. Otherwise, grad.sum() provides a single number that you are using to update all of your biases, which is not correct.
Also, make sure you update your forward pass for your ReLU to np.maximum(neurons, 0) as described in the comments.
def relu(self, neurons):
    self.act = (neurons > 0)
    return np.maximum(neurons, 0)
The gradient of the activations will be 0 or 1 depending on which parts of the inputs were positive.
Finally, for the XOR problem you typically do not use ReLU as the activation for the output layer, because it is not bounded to [0, 1] as the XOR problem requires. The reason you got good results with the sigmoid activation function is that its dynamic range suits the XOR problem well. As an experiment, you can make the output layer sigmoid and the hidden layers ReLU, as sketched below; if you do this, you should get performance just as good as using sigmoid all the way through.
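A sketch of that experiment on the NeuralNetwork class above (these two methods would replace the originals): the activation is chosen per layer, ReLU for the hidden layers and sigmoid for the output layer.
# Sketch only: per-layer activations instead of a single self.activation.
def forward(self, inp):
    assert inp.shape[0] == self.neuron_safety
    for layer in self.layers[:-1]:
        inp = layer.forward(inp, 'relu')  # hidden layers: ReLU
    return self.layers[-1].forward(inp, 'sigmoid')  # output layer: sigmoid

def backward(self, grad):
    grad = self.layers[-1].backward(grad, 'sigmoid')
    for layer in reversed(self.layers[:-1]):
        grad = layer.backward(grad, 'relu')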

Does adding a forward hook to a layer of a model ensure that the gradient of the loss calculated using the layer's output will be calculated automatically?

I have a model
class NewModel(nn.Module):
    def __init__(self, output_layer, *args):
        super().__init__(*args)
        self.output_layer = output_layer
        self.selected_out = None
        # PRETRAINED MODEL
        self.pretrained = models.resnet18(pretrained=True)
        # TAKING OUTPUT FROM AN INTERMEDIATE LAYER
        #self._layers = []
        for l in list(self.pretrained._modules.keys()):
            #self._layers.append(l)
            if l == self.output_layer:
                handle = getattr(self.pretrained, l).register_forward_hook(self.hook)

    def hook(self, module, input, output):
        self.selected_out = output

    def forward(self, x):
        return self.pretrained(x)
I have two target outputs: one is the same as an image label, and the second has the same dimensions as the output obtained from self.output_layer, called target_feature.
out = model(img)
layerout = model.selected_out
Now, if I want to calculate the loss of layerout with the target feature map, can it be done like the line written below?
loss = criterion(y_true, out) + feature_criterion(layerout, target_feature)
Or do I need to add backward_hooks?
In this Kaggle notebook
https://www.kaggle.com/sironghuang/understanding-pytorch-hooks
it is written that loss.backward() cannot be used when using backward_hooks.
Quoting the author
# backprop once to get the backward hook results
out.backward(torch.tensor([1,1],dtype=torch.float),retain_graph=True)
#! loss.backward(retain_graph=True) # doesn't work with backward hooks,
#! since it's not a network layer but an aggregated result from the outputs of last layer vs target
Then how can the gradient be calculated based on the loss function?
If I understand you correctly, you want to get two outputs from your model, calculate two losses, then combine them and backpropagate. I imagine you come from Tensorflow & Keras from the way you tried implementing it. In PyTorch it's actually fairly straightforward; you can do this very easily because of its purely functional aspect.
This is just an example:
class NewModel(nn.Module):
    def __init__(self, output_layer, *args):
        super(NewModel, self).__init__()
        self.pretrained = models.resnet18(pretrained=True)
        self.output_layer = output_layer

    def forward(self, x):
        out = self.pretrained(x)
        features = self.output_layer(out)
        return out, features
On inference, you will get two results per call:
>>> m = NewModel(nn.Linear(1000, 10))
>>> x = torch.rand(16, 3, 224, 224)
>>> y_pred, y_feature = m(x)
Call your loss functions:
>>> loss = criterion(y_pred, y_true) + feature_criterion(y_feature, target_feature)
Then, backpropagate with loss.backward().
So there's no need for hooks, nor for a complicated gradient argument on your .backward call!
Edit - If you wish to extract an intermediate layer's output, keep the hook; that's fine. Just modify the forward definition:
def forward(self, x):
    out = self.pretrained(x)
    return out, self.selected_out
For example:
>>> m = NewModel(output_layer='layer1')
>>> x = torch.rand(16, 3, 224, 224)
>>> y_pred, y_feature = m(x)
>>> y_pred.shape, y_feature.shape
(torch.Size([16, 1000]), torch.Size([16, 64, 56, 56]))
Also, what I said above about the loss still stands. Compute your loss, then call loss.backward().
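Putting the edit together, a self-contained sketch of the hook-based variant might look like this (the torchvision import and the 'layer1' name are illustrative assumptions):
import torch
import torch.nn as nn
from torchvision import models

class NewModel(nn.Module):
    def __init__(self, output_layer):
        super().__init__()
        self.pretrained = models.resnet18(pretrained=True)
        self.selected_out = None
        # register a forward hook on the requested intermediate layer
        getattr(self.pretrained, output_layer).register_forward_hook(self.hook)

    def hook(self, module, input, output):
        self.selected_out = output

    def forward(self, x):
        out = self.pretrained(x)
        return out, self.selected_out

m = NewModel(output_layer='layer1')
x = torch.rand(16, 3, 224, 224)
y_pred, y_feature = m(x)
print(y_pred.shape, y_feature.shape)  # torch.Size([16, 1000]) torch.Size([16, 64, 56, 56])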

Alternately train a multi-task learning model in PyTorch - weight updating

I want to build a multi-task learning model on two related datasets with different inputs and targets. The two tasks share the lower-level layers but have different header layers; a minimal example:
class MultiMLP(nn.Module):
    """
    A simple dense network for MTL on hard parameter sharing.
    """
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(100, 200)
        self.out_task0 = nn.Linear(200, 1)
        self.out_task1 = nn.Linear(200, 1)

    def forward(self, x):
        x = self.hidden(x)
        x = F.relu(x)
        y_task0 = self.out_task0(x)
        y_task1 = self.out_task1(x)
        return [y_task0, y_task1]
The dataloader is constructed so that batches are generated alternately from the two datasets, i.e. batches 0, 2, 4, ... come from task 0 and batches 1, 3, 5, ... from task 1. I wanted to train the network this way: update the weights of only the hidden layer and out_task0 for batches from task 0, and update only hidden and out_task1 for task 1.
I then alternately switch requires_grad for the corresponding tasks during training, as follows. But I observed that all weights are updated on every iteration.
...
criterion = MSELoss()

for i, data in enumerate(combined_loader):
    x, y = data[0], data[1]
    optimizer.zero_grad()

    # controller is 0 for task0, 1 for task1
    # alternate the header layer
    controller = i % 2
    task0_mode = True if controller == 0 else False
    for name, param in model.named_parameters():
        if name in ['out_task0.weight', 'out_task0.bias']:
            param.requires_grad = task0_mode
        elif name in ['out_task1.weight', 'out_task1.bias']:
            param.requires_grad = not task0_mode

    outputs = model(x)[controller]
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()

    # Monitor the parameter updates
    for name, p in model.named_parameters():
        if name in ['out_task0.weight', 'out_task1.weight']:
            print(f"Controller: {controller}")
            print(name, p)
Did I miss anything in the training procedure? Or will the overall setup simply not work?
Disclaimer: the question has been answered on the PyTorch forum; I put things together here in case someone runs into the same problem. The credit goes to ptrblk.
The problem can arise with any variant of stochastic gradient descent (SGD) that utilizes gradients from previous steps, for instance SGD with momentum (SGD-M), Nesterov accelerated gradient (NAG), Adagrad, RMSprop, Adam, and so on. Zeroing the gradient at step t does not affect the terms relying on historical gradients, so the weights are still updated with the setting in the posted question.
One can see that from the following code example.
model = nn.Linear(1, 1, bias=False)
#optimizer = torch.optim.SGD(model.parameters(), lr=1., momentum=0.) # same results for w1 and w2
optimizer = torch.optim.SGD(model.parameters(), lr=1., momentum=0.5) # w2 gets updated
#optimizer = torch.optim.Adam(model.parameters(), lr=1.) # w2 gets updated
w0 = model.weight.clone()
out = model(torch.randn(1, 1))
out.mean().backward()
optimizer.step()
w1 = model.weight.clone()
optimizer.zero_grad()
print(model.weight.grad)
optimizer.step()
w2 = model.weight.clone()
print(w1 - w0)
print(w2 - w1)
With the plain SGD optimizer, w2 and w1 are the same, but that is not the case for SGD-M and Adam.
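One possible workaround, sketched below under the assumption that the stateful-optimizer behavior quoted above is the culprit (this is not part of the forum answer): give each task its own optimizer over the shared trunk plus its head, so momentum or Adam state for one head is never applied on the other task's batches. It assumes the model, criterion, and combined_loader from the question.
import torch

# Sketch: per-task optimizers; each owns the shared trunk plus one head.
optimizers = [
    torch.optim.Adam(list(model.hidden.parameters()) + list(model.out_task0.parameters()), lr=1e-3),
    torch.optim.Adam(list(model.hidden.parameters()) + list(model.out_task1.parameters()), lr=1e-3),
]

for i, data in enumerate(combined_loader):
    x, y = data[0], data[1]
    controller = i % 2
    optimizer = optimizers[controller]  # only this task's state is updated
    optimizer.zero_grad()
    loss = criterion(model(x)[controller], y)
    loss.backward()  # gradients flow only through the selected head
    optimizer.step()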

How to increase accuracy of predictions by neural network made from scratch?

I'm relatively new to machine learning, and as a starter project, I decided to implement my own neural network from scratch in Python using NumPy. As such, I have manually implemented methods for forward propagation, backpropagation, and calculating function derivatives.
For my testing data, I wrote a function that generates values of sin(x). When I finally create and train my network, my outputs fluctuate quite a lot with each trial and are significantly off from the true values (although they are a decent improvement over the initial predictions).
I have tried adjusting quite a few settings, including the learning rate, number of neurons, number of layers, training iterations, and activation function, but I still end up with a squared cost of around 0.1 over my input data.
I think my derivative functions and chain rule expressions are correct since when I use just one input sample I get a near-perfect answer.
Adding more input data, however, significantly reduces the accuracy of the network.
Do you guys have any suggestions for how to improve this network, or is there anything I'm doing wrong currently?
My code:
import numpy as np

# Generate input data for the network
def inputgen():
    inputs = []
    outputs = []
    i = 0.01
    for x in range(10000):
        inputs.append([round(i, 7)])
        outputs.append([np.sin(i)])  # output is sin(x)
        i += 0.0001
    return [inputs, outputs]

# Set training input and output
inputs = np.array(inputgen()[0])
outputs = np.array(inputgen()[1])

# Sigmoid activation function and derivative
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

# Tanh activation function and derivative
def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - ((tanh(x)) ** 2)

# Layer class
class Layer:
    def __init__(self, num_neurons, num_inputs, inputs):
        self.num_neurons = num_neurons  # number of neurons in hidden layers
        self.num_inputs = num_inputs  # number of input neurons (1 in the case of testing data)
        self.inputs = inputs
        self.weights = np.random.rand(num_inputs, num_neurons) * np.sqrt(1 / num_inputs)  # weights initialized by Xavier function
        self.biases = np.zeros((1, num_neurons))  # biases initialized as 0
        self.z = np.dot(self.inputs, self.weights) + self.biases  # calculate z
        self.a = tanh(self.z)  # calculate activation
        self.dcost_a = []  # derivative of cost with respect to activation
        self.da_z = []  # derivative of activation with respect to z
        self.dz_w = []  # derivative of z with respect to weight
        self.dcost_w = []  # derivative of cost with respect to weight
        self.dcost_b = []  # derivative of cost with respect to bias

    # Functions used in forward propagation
    def compute_z(self):
        self.z = np.dot(self.inputs, self.weights) + self.biases
        return self.z

    def activation(self):
        self.a = tanh(self.compute_z())

    def forward(self):
        self.activation()

# Network class
class Network:
    def __init__(self, num_layers, num_neurons, num_inputs, inputs, num_outputs, outputs):
        self.learningrate = 0.01  # learning rate
        self.num_layers = num_layers  # number of hidden layers
        self.num_neurons = num_neurons  # number of neurons in hidden layers
        self.num_inputs = num_inputs  # number of input neurons
        self.inputs = inputs
        self.expected_outputs = outputs
        self.layers = []
        for x in range(num_layers):
            if x == 0:
                self.layers.append(Layer(num_neurons, num_inputs, inputs))  # initial layer with given inputs
            else:
                # Other layers take the activation of the previous layer as input
                self.layers.append(Layer(num_neurons, len(self.layers[x - 1].a[0]), self.layers[x - 1].a))
        self.prediction = Layer(num_outputs, num_neurons, self.layers[-1].a)  # prediction
        self.layers.append(self.prediction)
        self.cost = (self.prediction.a - self.expected_outputs) ** 2  # cost

    # Forward propagation
    def forwardprop(self):
        for x in range(self.num_layers + 1):
            if x != 0:
                self.layers[x].inputs = self.layers[x - 1].a
            self.layers[x].forward()
        self.prediction = self.layers[-1]  # update prediction value

    def backprop(self):
        self.cost = (self.prediction.a - self.expected_outputs) ** 2
        for x in range(len(self.layers) - 1, -1, -1):
            if x == len(self.layers) - 1:
                dcost_a = 2 * (self.prediction.a - self.expected_outputs)  # derivative of cost with respect to activation for output layer
            else:
                # Derivative of cost with respect to activation for hidden layers (chain rule)
                dcost_a = np.zeros((len(self.layers[x].inputs), self.num_neurons)).T
                dcost_a1 = self.layers[x + 1].dcost_a.T
                da_z1 = self.layers[x + 1].da_z.T
                dz_a = (self.layers[x + 1].weights).T
                for z in range(len(dcost_a1)):
                    dcost_a += ((dcost_a1[z]) * da_z1)
                    for j in range(len(dcost_a)):
                        dcost_a[j] *= dz_a[z][j]
                dcost_a = dcost_a.T
            self.layers[x].dcost_a = dcost_a
            # Derivative of activation with respect to z
            da_z = tanh_derivative(self.layers[x].z)
            self.layers[x].da_z = da_z
            # Derivative of z with respect to weights
            dz_w = []
            if x != 0:
                dz_w = self.layers[x - 1].a
            else:
                dz_w = self.inputs
            self.layers[x].dz_w = dz_w
        # Change weights and biases
        for x in range(len(self.layers) - 1, -1, -1):
            # Average each of the derivatives over all training samples
            self.layers[x].dcost_a = np.average(self.layers[x].dcost_a, axis=0)
            self.layers[x].da_z = np.average(self.layers[x].da_z, axis=0)
            self.layers[x].dz_w = (np.average(self.layers[x].dz_w, axis=0)).T
            self.layers[x].dcost_w = np.zeros((self.layers[x].weights.shape))
            self.layers[x].dcost_b = self.layers[x].dcost_a * self.layers[x].da_z
            for v in range(len(self.layers[x].dz_w)):
                self.layers[x].dcost_w[v] = (self.layers[x].dcost_a * self.layers[x].da_z) * self.layers[x].dz_w[v]
            # Update weights and biases
            self.layers[x].weights -= (self.layers[x].dcost_w) * self.learningrate
            self.layers[x].biases -= (self.layers[x].dcost_b) * self.learningrate

    # Train the network
    def train(self):
        for x in range(1000):
            self.backprop()
            self.forwardprop()

Network1 = Network(3, 3, 1, inputs, 1, outputs)
Network1.train()
print(Network1.prediction.a)
Sample input:
[[0.01 ]
[0.0101]
[0.0102]
...
[1.0097]
[1.0098]
[1.0099]]
Sample output:
[[0.37656753]
[0.37658777]
[0.37660802]
...
[0.53088048]
[0.53089046]
[0.53090043]]
Expected output:
[[0.00999983]
[0.01009983]
[0.01019982]
...
[0.84667225]
[0.84672546]
[0.84677865]]
A few things I would recommend trying:
- ReLU activation for the hidden layers. Tanh may not work so well for a multi-layered network.
- If you are doing regression, try a linear activation for the output layer.
- Experiment with different target functions. sin(x) may be crazy difficult for a small neural network to learn. Try something simpler, like polynomials, and increase the complexity gradually.
I would also keep track of the cost_history and update your learning rate accordingly. If you have been
- getting closer to the actual value, increase the learning rate by 5%
- getting further away, decrease the learning rate by 50%
def update_learning_rate(self):
    if len(self.cost_history) < 2:
        return
    if self.cost_history[0] > self.cost_history[1]:
        self.learning_rate /= 2
    else:
        self.learning_rate *= 1.05
This should actually yield surprisingly better results. What usually happens otherwise is that you get stuck in one of the local minima of the cost surface instead of the absolute minimum.
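A small sketch of how that snippet could plug into the asker's training loop (this train method would replace the original; cost_history and the learning_rate attribute name follow the snippet above and are assumptions on top of the Network class):
# Sketch: track the two most recent average costs and adapt the rate.
def train(self):
    self.cost_history = []
    self.learning_rate = self.learningrate  # name used by update_learning_rate
    for x in range(1000):
        self.backprop()
        self.forwardprop()
        # newest cost first; keep only the last two entries
        self.cost_history.insert(0, np.average(self.cost))
        self.cost_history = self.cost_history[:2]
        self.update_learning_rate()
        self.learningrate = self.learning_rate  # keep backprop's rate in sync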

Regression loss functions incorrect

I'm trying a basic averaging example, but the validation and loss don't match, and the network fails to converge if I increase the training time. I'm training a network with 2 hidden layers, each 500 units wide, on three integers from the range [0, 9], with a learning rate of 1e-1, Adam, a batch size of 1, and dropout, for 3000 iterations, validating every 100 iterations. If the absolute difference between the label and the hypothesis is less than a threshold (here set to 1), I consider the prediction correct. Could someone let me know whether this is an issue with the choice of loss function, something wrong with PyTorch, or something I'm doing wrong?
val_diff = 1
acc_diff = torch.FloatTensor([val_diff]).expand(self.batch_size)
Loop 100 times during validation:
num_correct += torch.sum(torch.abs(val_h - val_y) < acc_diff)
Append after each validation phase:
validate.append(num_correct / total_val)
Here are some examples of (hypothesis, label) pairs:
[...(-0.7043088674545288, 6.0), (-0.15691305696964264, 2.6666667461395264),
(0.2827358841896057, 3.3333332538604736)]
I tried six of the loss functions in the API that are typically used for regression:
torch.nn.L1Loss(size_average=False)
torch.nn.L1Loss()
torch.nn.MSELoss(size_average=False)
torch.nn.MSELoss()
torch.nn.SmoothL1Loss(size_average=False)
torch.nn.SmoothL1Loss()
Thanks.
Network code:
class Feedforward(nn.Module):
    def __init__(self, topology):
        super(Feedforward, self).__init__()
        self.input_dim = topology['features']
        self.num_hidden = topology['hidden_layers']
        self.hidden_dim = topology['hidden_dim']
        self.output_dim = topology['output_dim']
        self.input_layer = nn.Linear(self.input_dim, self.hidden_dim)
        self.hidden_layer = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)
        self.dropout_layer = nn.Dropout(p=0.2)

    def forward(self, x):
        batch_size = x.size()[0]
        feat_size = x.size()[1]
        input_size = batch_size * feat_size
        self.input_layer = nn.Linear(input_size, self.hidden_dim).cuda()
        hidden = self.input_layer(x.view(1, input_size)).clamp(min=0)
        for _ in range(self.num_hidden):
            hidden = self.dropout_layer(F.relu(self.hidden_layer(hidden)))
        output_size = batch_size * self.output_dim
        self.output_layer = nn.Linear(self.hidden_dim, output_size).cuda()
        return self.output_layer(hidden).view(output_size)
Training code:
def train(self):
    if self.cuda:
        self.network.cuda()
    dh = DataHandler(self.data)
    # loss_fn = nn.L1Loss(size_average=False)
    # loss_fn = nn.L1Loss()
    # loss_fn = nn.SmoothL1Loss(size_average=False)
    # loss_fn = nn.SmoothL1Loss()
    # loss_fn = nn.MSELoss(size_average=False)
    loss_fn = torch.nn.MSELoss()
    losses = []
    validate = []
    hypos = []
    labels = []
    val_size = 100
    val_diff = 1
    total_val = float(val_size * self.batch_size)

    for i in range(self.iterations):
        x, y = dh.get_batch(self.batch_size)
        x = self.tensor_to_Variable(x)
        y = self.tensor_to_Variable(y)

        self.optimizer.zero_grad()
        loss = loss_fn(self.network(x), y)
        loss.backward()
        self.optimizer.step()
It looks like you've misunderstood how layers in PyTorch work; here are a few tips:
In your forward, when you do nn.Linear(...) you are defining new layers instead of using the ones you pre-defined in your network's __init__. Therefore, the network cannot learn anything, as the weights are constantly reinitialized.
You shouldn't need to call .cuda() inside net.forward(...), since you've already copied the network to the GPU in your train by calling self.network.cuda().
Ideally, the input to net.forward(...) should directly have the shape expected by the first layer, so you won't have to modify it: here x.size() should be (batch_size, features) to match the first Linear layer.
Your forward should look close to this:
def forward(self, x):
    x = F.relu(self.input_layer(x))
    x = F.dropout(F.relu(self.hidden_layer(x)), training=self.training)
    x = self.output_layer(x)
    return x
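For instance, a quick shape check under assumed topology values (illustrative only; the question's real topology dict is not shown), using the Feedforward class above with this corrected forward:
import torch

# Hypothetical topology values for illustration.
topology = {'features': 3, 'hidden_layers': 2, 'hidden_dim': 500, 'output_dim': 1}
net = Feedforward(topology)

x = torch.rand(8, 3)  # (batch_size, features), as suggested above
print(net(x).shape)   # torch.Size([8, 1])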
