I'm working on developing a neural network from scratch. The issue seems to be with my ReLU back-propagation. When I train the model it sometimes outputs -0 and sometimes outputs relatively good predictions. Can someone tell me if I'm doing my back-propagation incorrectly, or if there's a reason why my ReLU would be predicting -0?
--
[edit]
Fixed the issue of predicting -0, but now it just predicts 0 for all inputs for the XOR. Can someone look over my backpropagation?
import numpy as np
# Each layer in our neural network
class NeuralLayer:
    def __init__(self, input_neurons, output_neurons):
        self.weights = np.random.randn(input_neurons, output_neurons) * np.sqrt(2. / input_neurons)
        self.bias = np.ones((1, output_neurons)) * 0.5

    # Two different activations, sigmoid by default
    def sigmoid(self, neurons):
        self.act = 1.0 / (1.0 + np.exp(-neurons))
        return self.act

    def sigmoidBackward(self, grad):
        return grad * self.act * (1 - self.act)

    def relu(self, neurons):
        self.act = (neurons > 0)
        return neurons * self.act

    def reluBackward(self, grad):
        return grad * self.act

    # Forward pass for this layer
    def forward(self, input, activation):
        self.input = np.atleast_2d(input)
        if activation == 'sigmoid':
            return self.sigmoid(input @ self.weights + self.bias)
        else:
            return self.relu(input @ self.weights + self.bias)

    # backward pass for this layer
    def backward(self, grad, activation):
        if activation == 'sigmoid':
            grad = self.sigmoidBackward(np.atleast_2d(grad))
        else:
            grad = self.reluBackward(np.atleast_2d(grad))
        self.grad_weights = np.matmul(self.input.T, grad)
        self.grad_bias = grad.sum()
        return grad @ self.weights.T

    def step(self, step_size):
        self.weights -= step_size * self.grad_weights
        self.bias -= step_size * self.grad_bias
# Our neural net
class NeuralNetwork:
    # Dynamically create all layers
    def __init__(self, input_neurons, hidden_neurons, layer_count, activation, output_neurons=1):
        self.activation = activation
        # Used to ensure input neurons match inputted data
        self.neuron_safety = input_neurons
        assert layer_count >= 2 and output_neurons >= 1
        # Input layer
        self.layers = [NeuralLayer(input_neurons, hidden_neurons)]
        # Hidden layers
        for i in range(layer_count - 2):
            self.layers.append(NeuralLayer(hidden_neurons, hidden_neurons))
        # Output layer
        self.layers.append(NeuralLayer(hidden_neurons, output_neurons))

    # Forward pass for each layer
    def forward(self, inp):
        assert inp.shape[0] == self.neuron_safety
        for layer in self.layers:
            inp = layer.forward(inp, self.activation)
        return inp

    def backward(self, grad):
        for layer in reversed(self.layers):
            grad = layer.backward(grad, self.activation)

    def step(self, step_size=0.01):
        for layer in self.layers:
            layer.step(step_size)

    # loss function - only 1 output neuron
    def meanSquaredError(self, preds, labels):
        self.labels = labels
        self.preds = preds
        return (self.preds - self.labels) ** 2

    def meanSquaredErrorGrad(self):
        return 2 * (self.preds - self.labels)
# Create a neural network with 2 inputs, 16 hidden neurons in each layer, and 4 layers
net = NeuralNetwork(2, 16, 4, 'relu')
epochs = 5000

# Input data (A, B) for XOR
X = np.array([[0, 0], [1, 1], [1, 0], [0, 1]])
# Expected output data
Y = np.array([[0], [0], [1], [1]])

for i in range(epochs):
    preds = []
    for idx, x in enumerate(X):
        predictions = net.forward(x)
        preds.append(predictions)
        loss = net.meanSquaredError(predictions, Y[idx])
        loss_grad = net.meanSquaredErrorGrad()
        net.backward(loss_grad)
        net.step()

print("Model predicted: {}\nactual values: {} ".format(preds, Y.T))
Output:
Model predicted: [array([[-0.]]), array([[-0.]]), array([[1.]]), array([[-0.]])]
actual values: [[0 0 1 1]]
Sometimes the predictions are perfect, but most of the time at least one prediction will be -0
The bias gradient is incorrect. You are using self.grad_bias = grad.sum(). This will compute the sum of the entire matrix. It needs to be self.grad_bias = grad.sum(axis=0, keepdims=True) to compute a 1 x output_neurons array that will properly update the bias vector. Otherwise, grad.sum() provides a single number that you are using to update all of your biases, which is not correct.
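For reference, a minimal sketch of the corrected backward method (only the bias-gradient line changes from the code in the question):

def backward(self, grad, activation):
    if activation == 'sigmoid':
        grad = self.sigmoidBackward(np.atleast_2d(grad))
    else:
        grad = self.reluBackward(np.atleast_2d(grad))
    self.grad_weights = np.matmul(self.input.T, grad)
    # sum over the batch axis only, keeping a (1, output_neurons) bias gradient
    self.grad_bias = grad.sum(axis=0, keepdims=True)
    return grad @ self.weights.T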
Also, make sure you update your forward pass for your ReLU to np.maximum(neurons, 0) as described in the comments.
def relu(self, neurons):
    self.act = (neurons > 0)
    return np.maximum(neurons, 0)
The gradient of the activations will be 0 or 1 depending on which parts of the inputs were positive.
Finally, for the XOR problem you typically do not use ReLU as the activation for the output layer, because it is not bounded to [0, 1], the range of the XOR targets. The reason you got good results with the sigmoid activation function is that its dynamic range suits the XOR problem well. As an experiment, you can make the output layer sigmoid and the hidden layers ReLU. If you do this, you should get performance just as good as using sigmoid all the way through.
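As a rough sketch of that experiment (hypothetical, not part of the original code), the NeuralNetwork forward/backward passes could hard-code ReLU for the hidden layers and sigmoid for the output layer:

def forward(self, inp):
    assert inp.shape[0] == self.neuron_safety
    for layer in self.layers[:-1]:
        inp = layer.forward(inp, 'relu')              # hidden layers: ReLU
    return self.layers[-1].forward(inp, 'sigmoid')    # output layer: sigmoid

def backward(self, grad):
    grad = self.layers[-1].backward(grad, 'sigmoid')
    for layer in reversed(self.layers[:-1]):
        grad = layer.backward(grad, 'relu')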
Related
I'm trying to understand exactly how the calculations are performed in the PyTorch GRU class. I'm having some trouble reconciling the GRU PyTorch documentation with the LSTM TorchScript documentation and its code implementation.
The GRU documentation states:
In a multilayer GRU, the input x_t^(l) of the l-th layer (l >= 2) is the hidden state h_t^(l-1) of the previous layer multiplied by dropout δ_t^(l-1), where each δ_t^(l-1) is a Bernoulli random variable which is 0 with probability dropout.
So essentially, given a sequence, each time step should be passed through all the layers within each loop iteration, like this implementation.
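For concreteness, that reading of the definition corresponds roughly to the following sketch (using nn.GRUCell; the sizes are made up and this is not the linked implementation):

import torch
import torch.nn as nn

cells = nn.ModuleList([nn.GRUCell(10, 20), nn.GRUCell(20, 20)])  # 2 stacked layers
x = torch.randn(5, 3, 10)                    # (seq_len, batch, input_size)
h = [torch.zeros(3, 20) for _ in cells]      # one hidden state per layer

outputs = []
for t in range(x.size(0)):                   # outer loop over time steps
    inp = x[t]
    for l, cell in enumerate(cells):         # inner loop over layers
        h[l] = cell(inp, h[l])
        inp = h[l]                           # hidden state of layer l feeds layer l+1
    outputs.append(inp)
output = torch.stack(outputs)                # (seq_len, batch, hidden_size)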
Meanwhile the LSTM code implementation is:
def script_lstm(input_size, hidden_size, num_layers, bias=True,
                batch_first=False, dropout=False, bidirectional=False):
    '''Returns a ScriptModule that mimics a PyTorch native LSTM.'''
    # The following are not implemented.
    assert bias
    assert not batch_first

    if bidirectional:
        stack_type = StackedLSTM2
        layer_type = BidirLSTMLayer
        dirs = 2
    elif dropout:
        stack_type = StackedLSTMWithDropout
        layer_type = LSTMLayer
        dirs = 1
    else:
        stack_type = StackedLSTM
        layer_type = LSTMLayer
        dirs = 1
    return stack_type(num_layers, layer_type,
                      first_layer_args=[LSTMCell, input_size, hidden_size],
                      other_layer_args=[LSTMCell, hidden_size * dirs,
                                        hidden_size])
class LSTMCell(jit.ScriptModule):
    def __init__(self, input_size, hidden_size):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size))
        self.weight_hh = Parameter(torch.randn(4 * hidden_size, hidden_size))
        self.bias_ih = Parameter(torch.randn(4 * hidden_size))
        self.bias_hh = Parameter(torch.randn(4 * hidden_size))

    @jit.script_method
    def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
        hx, cx = state
        gates = (torch.mm(input, self.weight_ih.t()) + self.bias_ih +
                 torch.mm(hx, self.weight_hh.t()) + self.bias_hh)
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = torch.tanh(cellgate)
        outgate = torch.sigmoid(outgate)

        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * torch.tanh(cy)

        return hy, (hy, cy)
class LSTMLayer(jit.ScriptModule):
    def __init__(self, cell, *cell_args):
        super(LSTMLayer, self).__init__()
        self.cell = cell(*cell_args)

    @jit.script_method
    def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
        inputs = input.unbind(0)
        outputs = torch.jit.annotate(List[Tensor], [])
        for i in range(len(inputs)):
            out, state = self.cell(inputs[i], state)
            outputs += [out]
        return torch.stack(outputs), state
def init_stacked_lstm(num_layers, layer, first_layer_args, other_layer_args):
    layers = [layer(*first_layer_args)] + [layer(*other_layer_args)
                                           for _ in range(num_layers - 1)]
    return nn.ModuleList(layers)
class StackedLSTM(jit.ScriptModule):
    __constants__ = ['layers']  # Necessary for iterating through self.layers

    def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
        super(StackedLSTM, self).__init__()
        self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
                                        other_layer_args)

    @jit.script_method
    def forward(self, input: Tensor, states: List[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
        # List[LSTMState]: One state per layer
        output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
        output = input
        # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
        i = 0
        for rnn_layer in self.layers:
            state = states[i]
            output, out_state = rnn_layer(output, state)
            output_states += [out_state]
            i += 1
        return output, output_states
So in this case each layer runs its own for loop over the sequence and passes a new sequence tensor to the next layer.
So my question is: which is the correct way to implement a multi-layer GRU?
I think you are misunderstanding the definition. The approach that you see in the LSTM code, where each layer passes an entire sequence on to the next, is the standard approach for stacked RNNs, at least for sequence-to-sequence models. It's equivalent to RNN(RNN(input)).
It's also what the PyTorch GRU definition is saying, albeit in a somewhat roundabout way. The definition says that for the N-th layer GRU, the input is the hidden state h (read: output) of the (N-1)-th layer GRU. Now, in theory, we could run all the inputs one at a time through all the layers and collect the outputs. Or we can run the entire sequence through each layer and only keep the last output sequence. This second approach should be faster, because it allows the calculations to be vectorized more efficiently.
Further, if you look at the link you sent with the two different GRU models, you'll see that the results are equivalent whether you run the inputs through each layer one at a time using GRUCells or use full GRU layers.
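For intuition, here is a rough sketch (not from the PyTorch source; sizes are illustrative) of the layer-major ordering that the LSTM code above and nn.GRU(num_layers=...) use, written with nn.GRUCell so the two orderings can be compared directly:

import torch
import torch.nn as nn

cells = nn.ModuleList([nn.GRUCell(10, 20), nn.GRUCell(20, 20)])  # 2 stacked layers
x = torch.randn(5, 3, 10)                # (seq_len, batch, input_size)

seq = x
for cell in cells:                       # outer loop over layers
    h = torch.zeros(3, 20)
    outs = []
    for t in range(seq.size(0)):         # inner loop over time
        h = cell(seq[t], h)
        outs.append(h)
    seq = torch.stack(outs)              # whole output sequence feeds the next layer
output = seq

Given the same weights and initial states, this produces exactly the same output as looping over time in the outer loop and over layers in the inner loop; only the order of the loops changes.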
In the PyTorch GRU documentation, you will find that the module has a parameter named num_layers which lets you specify the number of stacked GRU layers.
Hopefully this answers your question as to how GRU layers are applied in practice:
>>> rnn = nn.GRU(input_size = 10, hidden_size = 20, num_layers = 2)
>>> input = torch.randn(5, 3, 10)
>>> h0 = torch.randn(2, 3, 20)
>>> output, hn = rnn(input, h0)
I am trying to implement a weighted average between two tensors in TensorFlow, where the weight can be learned automatically. Following the advice on how to design a custom layer for a keras model here, my attempt is the following:
class WeightedAverage(tf.keras.layers.Layer):

    def __init__(self):
        super(WeightedAverage, self).__init__()
        init_value = tf.keras.initializers.Constant(value=0.5)
        self.w = self.add_weight(name="weight",
                                 initializer=init_value,
                                 trainable=True)

    def call(self, inputs):
        return tf.keras.layers.average([inputs[0] * self.w,
                                        inputs[1] * (1 - self.w)])
Now the problem is that after training the model, saving, and loading it again, the value for w remains 0.5. Is it possible that the parameter does not receive any gradient updates? When printing the trainable variables of my model, the parameter is listed and should therefore be included when calling model.fit.
Here is a possibility to implement a weighted average between two tensors, where the weight can be learned automatically. I also introduce the constraint that the weights must sum to 1. To enforce this, we simply apply a softmax to our weights. In the dummy example below I combine the outputs of two fully-connected branches with this method, but you can use it in any other scenario.
Here is the custom layer:
class WeightedAverage(Layer):

    def __init__(self):
        super(WeightedAverage, self).__init__()

    def build(self, input_shape):
        self.W = self.add_weight(
            shape=(1, 1, len(input_shape)),
            initializer='uniform',
            dtype=tf.float32,
            trainable=True)

    def call(self, inputs):
        # inputs is a list of tensors of shape [(n_batch, n_feat), ..., (n_batch, n_feat)]
        # expand last dim of each input passed [(n_batch, n_feat, 1), ..., (n_batch, n_feat, 1)]
        inputs = [tf.expand_dims(i, -1) for i in inputs]
        inputs = Concatenate(axis=-1)(inputs)      # (n_batch, n_feat, n_inputs)
        weights = tf.nn.softmax(self.W, axis=-1)   # (1, 1, n_inputs)
        # weights sum up to one on last dim
        return tf.reduce_sum(weights * inputs, axis=-1)  # (n_batch, n_feat)
Here is the full example in a regression problem:
inp1 = Input((100,))
inp2 = Input((100,))
x1 = Dense(32, activation='relu')(inp1)
x2 = Dense(32, activation='relu')(inp2)
W_Avg = WeightedAverage()([x1,x2])
out = Dense(1)(W_Avg)
m = Model([inp1,inp2], out)
m.compile('adam','mse')
n_sample = 1000
X1 = np.random.uniform(0,1, (n_sample,100))
X2 = np.random.uniform(0,1, (n_sample,100))
y = np.random.uniform(0,1, (n_sample,1))
m.fit([X1,X2], y, epochs=10)
In the end, you can also visualize the value of the weights in this way:
tf.nn.softmax(m.get_weights()[-3]).numpy()
I'm trying to implement an unsupervised ANN using Hebbian updating in Keras. I found a custom Hebbian layer made by Dan Saunders here - https://github.com/djsaunde/rinns_python/blob/master/hebbian/hebbian.py
(I hope it is not poor form to ask questions about another person's code here)
In the examples I found using this layer in the repo, this layer is used as an intermediate layer between Dense/Conv layers, but I would like to construct a network using only Hebbian layers.
Two critical things are confusing me in this implementation:
It seems as though input dims and output dims must be the same for this layer to work. Why would this be the case and what can I do to make it so they can be different?
Why is the diagonal of the weight matrix set to zero? It says this is to "ensure that no neuron is laterally connected to itself", but I thought the connection weights were between the previous layer and the current layer, not the current layer and itself.
Here is the code for the Hebbian Layer Implementation:
from keras import backend as K
from keras.engine.topology import Layer
import numpy as np
import tensorflow as tf
np.set_printoptions(threshold=np.nan)
sess = tf.Session()
class Hebbian(Layer):
    def __init__(self, output_dim, lmbda=1.0, eta=0.0005, connectivity='random', connectivity_prob=0.25, **kwargs):
        '''
        Constructor for the Hebbian learning layer.
        args:
            output_dim - The shape of the output / activations computed by the layer.
            lambda - A floating-point valued parameter governing the strength of the Hebbian learning activation.
            eta - A floating-point valued parameter governing the Hebbian learning rate.
            connectivity - A string which determines the way in which the neurons in this layer are connected to
                the neurons in the previous layer.
        '''
        self.output_dim = output_dim
        self.lmbda = lmbda
        self.eta = eta
        self.connectivity = connectivity
        self.connectivity_prob = connectivity_prob

        if self.connectivity == 'random':
            self.B = np.random.random(self.output_dim) < self.connectivity_prob
        elif self.connectivity == 'zero':
            self.B = np.zeros(self.output_dim)

        super(Hebbian, self).__init__(**kwargs)

    def random_conn_init(self, shape, dtype=None):
        A = np.random.normal(0, 1, shape)
        A[self.B] = 0
        return tf.constant(A, dtype=tf.float32)

    def zero_init(self, shape, dtype=None):
        return np.zeros(shape)

    def build(self, input_shape):
        # create weight variable for this layer according to user-specified initialization
        if self.connectivity == 'all':
            self.kernel = self.add_weight(name='kernel',
                                          shape=(np.prod(input_shape[1:]), np.prod(self.output_dim)),
                                          initializer='uniform', trainable=False)
        elif self.connectivity == 'random':
            self.kernel = self.add_weight(name='kernel',
                                          shape=(np.prod(input_shape[1:]), np.prod(self.output_dim)),
                                          initializer=self.random_conn_init, trainable=False)
        elif self.connectivity == 'zero':
            self.kernel = self.add_weight(name='kernel',
                                          shape=(np.prod(input_shape[1:]), np.prod(self.output_dim)),
                                          initializer=self.zero_init, trainable=False)
        else:
            raise NotImplementedError

        # ensure that no neuron is laterally connected to itself
        self.kernel = self.kernel * tf.diag(tf.zeros(self.output_dim))

        # call superclass "build" function
        super(Hebbian, self).build(input_shape)

    def call(self, x):
        x_shape = tf.shape(x)
        batch_size = tf.shape(x)[0]

        # reshape to (batch_size, product of other dimensions) shape
        x = tf.reshape(x, (tf.reduce_prod(x_shape[1:]), batch_size))

        # compute activations using Hebbian-like update rule
        activations = x + self.lmbda * tf.matmul(self.kernel, x)

        # compute outer product of activations matrix with itself
        outer_product = tf.matmul(tf.expand_dims(x, 1), tf.expand_dims(x, 0))

        # update the weight matrix of this layer
        self.kernel = self.kernel + tf.multiply(self.eta, tf.reduce_mean(outer_product, axis=2))
        self.kernel = tf.multiply(self.kernel, self.B)
        self.kernel = self.kernel * tf.diag(tf.zeros(self.output_dim))

        return K.reshape(activations, x_shape)
At first inspection I expected this layer to be able to take inputs from a previous layer, perform a simple activation calculation (input * weight), update the weights according to Hebbian updating (something like: if activation is high between nodes, increase the weight), then pass the activations to the next layer.
I also expected that it would be able to deal with decreasing/increasing the number of nodes from one layer to the next.
Instead, I cannot seem to figure out why the input and output dims must be the same and why the diagonals of the weight matrix are set to zero.
Where in the code (implicitly or explicitly) is the specification that the layers need to be the same dims?
Where in the code (implicitly or explicitly) is the specification that this layer's weight matrix is connecting the current layer to itself?
Apologies if this question should have been separated into two, but it seems like they may be related to each other, so I kept them as one.
Happy to provide more details if needed.
Edit: Realized I forgot to add the error message I get when I try to create a layer with different output dims than the input dims:
model = Sequential()
model.add(Hebbian(input_shape = (256,1), output_dim = 256))
This compiles w/o error ^
model = Sequential()
model.add(Hebbian(input_shape = (256,1), output_dim = 24))
This ^ throws the error:
IndexError: boolean index did not match indexed array along dimension 0; dimension is 256 but corresponding boolean dimension is 24
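(For illustration, the mismatch can be reproduced with plain NumPy: the mask B is built from output_dim in __init__, but random_conn_init applies it to the first axis of the kernel, whose length is the flattened input dim. The numbers below mirror the error above and are only an example.)

import numpy as np

kernel = np.random.normal(0, 1, (256, 24))   # (flattened input dim, output_dim), as in build()
B = np.random.random(24) < 0.25              # boolean mask built from output_dim only

kernel[B] = 0   # IndexError: boolean dimension is 24, but axis 0 has length 256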
Okay, I think I maybe figured it out, sort of. There were many small problems, but the biggest thing was that I needed to add the compute_output_shape function, which makes the layer able to modify the shape of its input, as explained here:
https://keras.io/layers/writing-your-own-keras-layers/
So here is the code with all the changes I made. It will compile and modify the input shape just fine. Note that this layer computes weight changes inside the layer itself and there may be some issues with that if you try to actually use the layer (I'm still ironing these out), but this is a separate issue.
class Hebbian(Layer):
    def __init__(self, output_dim, lmbda=1.0, eta=0.0005, connectivity='random', connectivity_prob=0.25, **kwargs):
        '''
        Constructor for the Hebbian learning layer.
        args:
            output_dim - The shape of the output / activations computed by the layer.
            lambda - A floating-point valued parameter governing the strength of the Hebbian learning activation.
            eta - A floating-point valued parameter governing the Hebbian learning rate.
            connectivity - A string which determines the way in which the neurons in this layer are connected to
                the neurons in the previous layer.
        '''
        self.output_dim = output_dim
        self.lmbda = lmbda
        self.eta = eta
        self.connectivity = connectivity
        self.connectivity_prob = connectivity_prob

        super(Hebbian, self).__init__(**kwargs)

    def random_conn_init(self, shape, dtype=None):
        A = np.random.normal(0, 1, shape)
        A[self.B] = 0
        return tf.constant(A, dtype=tf.float32)

    def zero_init(self, shape, dtype=None):
        return np.zeros(shape)

    def build(self, input_shape):
        # create weight variable for this layer according to user-specified initialization
        if self.connectivity == 'random':
            self.B = np.random.random(input_shape[0]) < self.connectivity_prob
        elif self.connectivity == 'zero':
            self.B = np.zeros(self.output_dim)

        if self.connectivity == 'all':
            self.kernel = self.add_weight(name='kernel',
                                          shape=(np.prod(input_shape[1:]), np.prod(self.output_dim)),
                                          initializer='uniform', trainable=False)
        elif self.connectivity == 'random':
            self.kernel = self.add_weight(name='kernel',
                                          shape=(np.prod(input_shape[1:]), np.prod(self.output_dim)),
                                          initializer=self.random_conn_init, trainable=False)
        elif self.connectivity == 'zero':
            self.kernel = self.add_weight(name='kernel',
                                          shape=(np.prod(input_shape[1:]), np.prod(self.output_dim)),
                                          initializer=self.zero_init, trainable=False)
        else:
            raise NotImplementedError

        # call superclass "build" function
        super(Hebbian, self).build(input_shape)

    def call(self, x):  # x is the input to the network
        x_shape = tf.shape(x)
        batch_size = tf.shape(x)[0]

        # reshape to (batch_size, product of other dimensions) shape
        x = tf.reshape(x, (tf.reduce_prod(x_shape[1:]), batch_size))

        # compute activations using Hebbian-like update rule
        activations = x + self.lmbda * tf.matmul(self.kernel, x)

        # compute outer product of activations matrix with itself
        outer_product = tf.matmul(tf.expand_dims(x, 1), tf.expand_dims(x, 0))

        # update the weight matrix of this layer
        self.kernel = self.kernel + tf.multiply(self.eta, tf.reduce_mean(outer_product, axis=2))
        self.kernel = tf.multiply(self.kernel, self.B)
        return K.reshape(activations, x_shape)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)
If anyone comes here from Google (like me; repeatedly) trying to make a layer that learns online when called on new input, I just found this other question and I think it's relevant:
Persistent Variable in keras Custom Layer
self.call is only called when you are defining the graph; for learning to happen on every new input, you need to add self.add_update to the call function.
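As a rough sketch of what that can look like (this assumes the old keras.backend API with K.update_add and a flat (batch, features) input whose size matches output_dim; it is not the code from the repo above):

def call(self, x):
    # activations as before (requires input and output dims to match)
    activations = x + self.lmbda * K.dot(x, self.kernel)
    # Hebbian-style weight change: averaged outer product of inputs and activations
    delta = self.eta * K.dot(K.transpose(x), activations) / K.cast(K.shape(x)[0], 'float32')
    # register the change so it runs on every forward pass, not only at graph-definition time
    self.add_update([K.update_add(self.kernel, delta)])
    return activations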
I'm trying a basic averaging example, but the validation and loss don't match, and the network fails to converge if I increase the training time. I'm training a network with 2 hidden layers, each 500 units wide, on three integers from the range [0, 9], with a learning rate of 1e-1, Adam, a batch size of 1, and dropout, for 3000 iterations, validating every 100 iterations. If the absolute difference between the label and the hypothesis is less than a threshold (here set to 1), I consider the prediction correct. Could someone let me know whether this is an issue with the choice of loss function, something wrong with PyTorch, or something I'm doing? Below are some plots:
val_diff = 1
acc_diff = torch.FloatTensor([val_diff]).expand(self.batch_size)
Loop 100 times during validation:
num_correct += torch.sum(torch.abs(val_h - val_y) < acc_diff)
Append after each validation phase:
validate.append(num_correct / total_val)
Here are some examples of the (hypothesis, label) pairs:
[...(-0.7043088674545288, 6.0), (-0.15691305696964264, 2.6666667461395264),
(0.2827358841896057, 3.3333332538604736)]
I tried six of the loss functions in the API that are typically used for regression:
torch.nn.L1Loss(size_average=False)
torch.nn.L1Loss()
torch.nn.MSELoss(size_average=False)
torch.nn.MSELoss()
torch.nn.SmoothL1Loss(size_average=False)
torch.nn.SmoothL1Loss()
Thanks.
Network code:
class Feedforward(nn.Module):
    def __init__(self, topology):
        super(Feedforward, self).__init__()
        self.input_dim = topology['features']
        self.num_hidden = topology['hidden_layers']
        self.hidden_dim = topology['hidden_dim']
        self.output_dim = topology['output_dim']
        self.input_layer = nn.Linear(self.input_dim, self.hidden_dim)
        self.hidden_layer = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)
        self.dropout_layer = nn.Dropout(p=0.2)

    def forward(self, x):
        batch_size = x.size()[0]
        feat_size = x.size()[1]
        input_size = batch_size * feat_size

        self.input_layer = nn.Linear(input_size, self.hidden_dim).cuda()
        hidden = self.input_layer(x.view(1, input_size)).clamp(min=0)

        for _ in range(self.num_hidden):
            hidden = self.dropout_layer(F.relu(self.hidden_layer(hidden)))

        output_size = batch_size * self.output_dim
        self.output_layer = nn.Linear(self.hidden_dim, output_size).cuda()
        return self.output_layer(hidden).view(output_size)
Training code:
def train(self):
    if self.cuda:
        self.network.cuda()
    dh = DataHandler(self.data)
    # loss_fn = nn.L1Loss(size_average=False)
    # loss_fn = nn.L1Loss()
    # loss_fn = nn.SmoothL1Loss(size_average=False)
    # loss_fn = nn.SmoothL1Loss()
    # loss_fn = nn.MSELoss(size_average=False)
    loss_fn = torch.nn.MSELoss()

    losses = []
    validate = []
    hypos = []
    labels = []
    val_size = 100
    val_diff = 1
    total_val = float(val_size * self.batch_size)

    for i in range(self.iterations):
        x, y = dh.get_batch(self.batch_size)
        x = self.tensor_to_Variable(x)
        y = self.tensor_to_Variable(y)

        self.optimizer.zero_grad()
        loss = loss_fn(self.network(x), y)
        loss.backward()
        self.optimizer.step()
It looks like you've misunderstood how layers in PyTorch work; here are a few tips:
In your forward, when you do nn.Linear(...) you are defining new layers instead of using the ones you pre-defined in your network's __init__. Therefore, it cannot learn anything, as the weights are constantly reinitialized.
You shouldn't need to call .cuda() inside net.forward(...), since you've already copied the network to the GPU in your train by calling self.network.cuda().
Ideally, the net.forward(...) input should directly have the shape of the first layer so you won't have to modify it. Here you should have x.size() <=> Linear --> (Batch_size, Features).
Your forward should look close to this:
def forward(self, x):
    x = F.relu(self.input_layer(x))
    x = F.dropout(F.relu(self.hidden_layer(x)), training=self.training)
    x = self.output_layer(x)
    return x
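For completeness, the matching __init__ would then define each layer exactly once, so the weights persist between iterations (a sketch based on the topology dict used above):

class Feedforward(nn.Module):
    def __init__(self, topology):
        super(Feedforward, self).__init__()
        # layers are created once here; forward() only applies them
        self.input_layer = nn.Linear(topology['features'], topology['hidden_dim'])
        self.hidden_layer = nn.Linear(topology['hidden_dim'], topology['hidden_dim'])
        self.output_layer = nn.Linear(topology['hidden_dim'], topology['output_dim'])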