Numpy RNN gradient check failure - python

So I am building an RNN from scratch using numpy, just to get the hang of how they work internally. My backpropagation-through-time implementation is below:
def backprop_through_time(self, X, Y):
    assert(len(X.shape) == 3)
    seq_length = Y.shape[1] if self.return_sequences else 1
    _, (Z_states, States, Z_outs, Outs) = self.feed_forward(X, cache=True)
    if not self.return_sequences:
        Outs = Outs[:,-1,:]

    # set up gradients
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    dLdB_state = np.zeros(self.B_state.shape)
    dLdB_out = np.zeros(self.B_out.shape)

    dLdOuts = self.loss_function_prime(Outs, Y)
    if not self.return_sequences:
        # we need dLdOuts to have a seq_length dim at axis 1
        dLdOuts = np.expand_dims(dLdOuts, axis=1)

    for t in range(seq_length):
        adjusted_t = seq_length-1 if not self.return_sequences else t

        dOuts_tdZ_out = self.output_activation_function_prime(Z_outs[:,adjusted_t,:])
        dLdZ_out = np.multiply(dLdOuts[:, adjusted_t, :], dOuts_tdZ_out)

        # Z_state = dot(X_t, self.U) + dot(State_{t-1}, self.W) + self.B_state
        # State_t = f(Z_state)
        # Z_out   = dot(State_t, self.V) + self.B_out
        # Out_t   = g(Z_out)
        dLdV += np.dot(States[:,adjusted_t,:].T, dLdZ_out)
        dLdB_out += np.sum(dLdZ_out, axis=0, keepdims=True)

        dLdZ_state = np.multiply(np.dot(dLdZ_out, self.V.T),
                                 self.hidden_activation_function_prime(Z_states[:,adjusted_t,:]))

        for t_prev in range(max(0, adjusted_t-self.backprop_through_time_limit), adjusted_t+1)[::-1]:
            dLdB_state += np.sum(dLdZ_state, axis=0, keepdims=True)
            dLdW += np.dot(States[:,t_prev-1,:].T, dLdZ_state)
            dLdU += np.dot(X[:,t_prev,:].T, dLdZ_state)
            dLdZ_state = np.multiply(np.dot(dLdZ_state, self.W.T),
                                     self.hidden_activation_function_prime(States[:,t_prev-1,:]))

    return (dLdU, dLdV, dLdW), (dLdB_state, dLdB_out)
However, I am still failing the gradient check for the parameters `dLdU, dLdW, dLdB_state`. I have gone through the math about a dozen times now, and I cannot find what is wrong with my implementation.
I assume X and Y are both 3D arrays, with X.shape := (batch_size, seq_length, input_dim) and Y.shape := (batch_size, seq_length, output_dim).
From the cached feed_forward operation I return Z_states with shape (batch_size, seq_length, hidden_dim), Z_outs and Outs with shape (batch_size, seq_length, output_dim), and States with shape (batch_size, seq_length+1, hidden_dim). States[:,-1,:] is the all-zeros array of shape (batch_size, hidden_dim) that the RNN state is initialized with. Could anyone help me?
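For reference, the check follows the usual central-difference recipe, roughly like the sketch below (loss_fn stands in for a closure that runs a full forward pass and returns the scalar loss; this is a simplified placeholder, not my exact harness):

import numpy as np

def numerical_gradient(loss_fn, param, eps=1e-5):
    # central-difference estimate of dLoss/dparam, one entry at a time
    grad = np.zeros_like(param)
    it = np.nditer(param, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = param[idx]
        param[idx] = orig + eps
        loss_plus = loss_fn()
        param[idx] = orig - eps
        loss_minus = loss_fn()
        param[idx] = orig  # restore the original value
        grad[idx] = (loss_plus - loss_minus) / (2 * eps)
        it.iternext()
    return grad

# compare against the analytic gradient via relative error:
# |g_num - g_ana| / max(|g_num|, |g_ana|) -- roughly 1e-6 or below passes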
EDIT
I found my answer. My math is right, but I was referencing the wrong variable. When I update dLdZ_state in the second inner loop (the backprop-through-time part), I multiply by self.hidden_activation_function_prime(States[:,t_prev-1,:]). This should instead be self.hidden_activation_function_prime(Z_states[:,t_prev-1,:]).
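That is, the last line of the inner loop becomes:

dLdZ_state = np.multiply(np.dot(dLdZ_state, self.W.T),
                         self.hidden_activation_function_prime(Z_states[:,t_prev-1,:]))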

Related

Neural Network From Scratch - Forward propagation error

I want to implement the backward propagation concept in Python with the following code
class MLP(object):
    def __init__(self, num_inputs=3, hidden_layers=[3, 3], num_outputs=2):
        self.num_inputs = num_inputs
        self.hidden_layers = hidden_layers
        self.num_outputs = num_outputs

        layers = [num_inputs] + hidden_layers + [num_outputs]

        weights = []
        bias = []
        for i in range(len(layers) - 1):
            w = np.random.rand(layers[i], layers[i + 1])
            b = np.random.randn(layers[i + 1]).reshape(1, layers[i + 1])
            weights.append(w)
            bias.append(b)
        self.weights = weights
        self.bias = bias

        activations = []
        for i in range(len(layers)):
            a = np.zeros(layers[i])
            activations.append(a)
        self.activations = activations

    def forward_propagate(self, inputs):
        activations = inputs
        self.activations[0] = activations
        for i, w in enumerate(self.weights):
            for j, b in enumerate(self.bias):
                net_inputs = self._sigmoid(np.dot(activations, w) + b)
                self.activations[i + 1] = net_inputs
        return activations

    def train(self, inputs, targets, epochs, learning_rate):
        for i in range(epochs):
            sum_errors = 0
            for j, input in enumerate(inputs):
                target = targets[j]
                output = self.forward_propagate(input)

    def _sigmoid(self, x):
        y = 1.0 / (1 + np.exp(-x))
        return y
So I created the following dummy data in order to verify everything works
items = np.array([[random()/2 for _ in range(2)] for _ in range(1000)])
targets = np.array([[i[0] + i[1]] for i in items])
mlp = MLP(2, [5], 1)
mlp.train(items, targets, 2, 0.1)
but when I run the code I get the following error
ValueError: shapes (2,) and (5,1) not aligned: 2 (dim 0) != 5 (dim 0)
I understand the error, but how do I solve it?
A couple of major problems with forward_propagate:
- change net_inputs to activations - otherwise you always compute and return the activations from the first layer
- remove for j, b in enumerate(self.bias): - biases from the other layers have no business here
- use matmul instead of dot
So, something like:
for i, w in enumerate(self.weights):
    activations = self._sigmoid(np.matmul(activations, w) + self.bias[i])
    self.activations[i + 1] = activations
return activations
Also, be careful to note that this method receives a 1D array, which converts to a matrix after the first matmul (the 1×n bias broadcasts the result to 2D). Matrices are stored in self.activations and a matrix is returned from the method.
This might or might not be what you want.
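As a quick sanity check (assuming the corrected loop above is patched into forward_propagate), the original dummy data should now pass through without the shape error:

import numpy as np
from random import random

items = np.array([[random() / 2 for _ in range(2)] for _ in range(1000)])
targets = np.array([[i[0] + i[1]] for i in items])

mlp = MLP(2, [5], 1)
out = mlp.forward_propagate(items[0])  # 1D input of shape (2,)
print(out.shape)                       # (1, 1) -- a matrix, per the note above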

RunTime Error: Function AddmmBackward returned an invalid gradient at index 1 (Mismatch in shape)

I am trying to implement a VAE, and I am having trouble calculating the gradient for the model. I believe this is happening in the decoder. The exact error message is Function AddmmBackward returned an invalid gradient at index 1 - got [10, 32] but expected shape compatible with [10, 1024]. Here is the decoder model.
class decoderNW(nn.Module):
    def __init__(self):
        super(decoderNW, self).__init__()
        channels = 32
        kernelSize = 4
        padding = (2,0)
        stride = (2,2)
        outputpadding = (1,0)

        self.FC1 = nn.Linear(channels, 1024)
        self.FC2 = nn.Linear(channels, 10656)

        self.deConv3x301 = nn.ConvTranspose2d(channels, 64, kernel_size=kernelSize, stride=stride, output_padding=outputpadding)
        nn.init.xavier_uniform_(self.deConv3x301.weight)
        self.deConv3x302 = nn.ConvTranspose2d(64, 128, kernel_size=kernelSize, stride=stride, output_padding=outputpadding)
        nn.init.xavier_uniform_(self.deConv3x302.weight)
        self.deConv3x303 = nn.ConvTranspose2d(128, 64, kernel_size=kernelSize, stride=stride, output_padding=outputpadding)
        nn.init.xavier_uniform_(self.deConv3x303.weight)
        self.deConv3x304 = nn.ConvTranspose2d(64, 3, kernel_size=kernelSize, stride=stride)
        nn.init.xavier_uniform_(self.deConv3x304.weight)

        self.bn1 = nn.BatchNorm1d(1024)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.bn4 = nn.BatchNorm2d(64)

        self.ReLU = nn.ReLU(inplace=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.FC1(x)
        x = self.bn1(x)
        x = self.ReLU(x)
        # Shape of x => 10x1024
        x = self.FC2(x)
        # Shape of x => 10x10656
        # Reshape x as 10x32x9x37
        x = x.view(x.size(0), 32, 9, 37)
        x = self.deConv3x301(x)
        x = self.bn2(x)
        x = self.ReLU(x)
        x = self.deConv3x302(x)
        x = self.bn3(x)
        x = self.ReLU(x)
        x = self.deConv3x303(x)
        x = self.bn4(x)
        x = self.ReLU(x)
        x = self.deConv3x304(x)
        x = self.sigmoid(x)
        return x
I believe it's happening when I reshape the tensor into a 2D tensor (like an image) going from the FC layer to the deconv layer.
I have tried using the reshape function, but the same problem persists. I'm not sure where I am going wrong. Any help is greatly appreciated.
Thanks.
PS: I get this error when I run backward(). Here is the code snippet for that!
optimizerVAE.zero_grad()
variationalAE.train()
vaeT = vaeT.to('cuda')
mu, sigma, xHat, z = variationalAE(srcClrT)
loss = vaeLoss(srcClrT, mu, sigma, xHat, z)
loss.backward()
Edit 1: Added the code for my VAE loss.
class getVAELoss(torch.nn.Module):
    def __init__(self):
        super(getVAELoss, self).__init__()

    def forward(self, x, mu, sigma, xHat, z):
        # Calculate ELBO
        # ELBO = KL divergence - reconstruction loss

        # Reconstruction loss: compute the probability of x under an n-d distribution
        logScale = nn.parameter.Parameter(torch.Tensor([0.0]).to('cuda'))
        scale = torch.exp(logScale)
        dist = torch.distributions.Normal(xHat, scale)
        logProbXZ = dist.log_prob(x)
        logProbXZ = logProbXZ.sum(dim=(1, 2, 3))
        reconstructionLoss = logProbXZ

        # KL divergence: create two distributions p and q,
        # where p is the reference distribution with zero mean and unit sigma
        p = torch.distributions.Normal(torch.zeros_like(mu), torch.ones_like(sigma))
        q = torch.distributions.Normal(mu, sigma)

        # calculate the log-probabilities of z
        logQZX = q.log_prob(z)
        logPz = p.log_prob(z)
        KL = logQZX - logPz
        KL = KL.sum(-1)

        elbo = KL - reconstructionLoss
        elbo = elbo.mean()
        return elbo
The VAE loss is very similar to the one shown here.
EDIT 2
Looking at several VAE network architectures, I realized that only one FC layer is used in the decoder network, so removing the second FC layer and changing the size of the first FC removed the error. But I don't understand why this is happening.
self.FC1 = nn.Linear(channels, 1024*4*13)
# self.FC2 = nn.Linear(channels, 10656)
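A likely explanation, judging only from the declared shapes (an assumption, not verified against the full model): FC2 is declared as nn.Linear(channels, 10656) with channels = 32, but its input is the output of FC1, which is 1024-dimensional. That mismatch is exactly what the error message reports (got [10, 32] but expected shape compatible with [10, 1024]). Keeping both FC layers but declaring FC2 with the correct in_features should also remove the error:

# hypothetical fix: FC2 consumes FC1's 1024-dim output, not the 32-dim latent
self.FC1 = nn.Linear(channels, 1024)
self.FC2 = nn.Linear(1024, 10656)  # in_features must match FC1's out_features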

Problem with implementation of Multilayer perceptron

I am trying to create a multilayer perceptron for classifying a dataset of hand-drawn digits obtained from the MNIST database. It implements two hidden layers with a sigmoid activation function, while the output layer uses SoftMax. However, for whatever reason I am not able to get it to work. I have attached the training loop from my code below; this, I am confident, is where the problem stems from. Can anyone identify possible issues with my implementation of the perceptron?
def train(self, inputs, targets, eta, niterations):
    """
    inputs is a numpy array of shape (num_train, D) containing the training images
    consisting of num_train samples each of dimension D.
    targets is a numpy array of shape (num_train, D) containing the training labels
    consisting of num_train samples each of dimension D.
    eta is the learning rate for optimization
    niterations is the number of iterations for updating the weights
    """
    ndata = np.shape(inputs)[0]  # number of data samples
    # adding the bias
    inputs = np.concatenate((inputs, -np.ones((ndata, 1))), axis=1)

    # numpy arrays to store the weight updates
    updatew1 = np.zeros((np.shape(self.weights1)))
    updatew2 = np.zeros((np.shape(self.weights2)))
    updatew3 = np.zeros((np.shape(self.weights3)))

    for n in range(niterations):
        # forward phase
        self.outputs = self.forwardPass(inputs)

        # error using the sum-of-squares error function
        error = 0.5*np.sum((self.outputs-targets)**2)
        if (np.mod(n, 100) == 0):
            print("Iteration: ", n, " Error: ", error)

        # backward phase
        deltao = self.outputs - targets
        placeholder = np.zeros(np.shape(self.outputs))
        for j in range(np.shape(self.outputs)[1]):
            y = self.outputs[:, j]
            placeholder[:, j] = y * (1 - y)
            for y in range(np.shape(self.outputs)[1]):
                if not y == j:
                    placeholder[:, j] += -y * self.outputs[:, y]
        deltao *= placeholder

        # compute the derivative of the second hidden layer
        deltah2 = np.dot(deltao, np.transpose(self.weights3))
        deltah2 = self.hidden2*self.beta*(1.0-self.hidden2)*deltah2

        # compute the derivative of the first hidden layer
        deltah1 = np.dot(deltah2[:, :-1], np.transpose(self.weights2))
        deltah1 = self.hidden1*self.beta*(1.0-self.hidden1)*deltah1

        # update the weights of the three layers: self.weights1, self.weights2 and self.weights3
        updatew1 = eta*(np.dot(np.transpose(inputs), deltah1[:, :-1])) + (self.momentum * updatew1)
        updatew2 = eta*(np.dot(np.transpose(self.hidden1), deltah2[:, :-1])) + (self.momentum * updatew2)
        updatew3 = eta*(np.dot(np.transpose(self.hidden2), deltao)) + (self.momentum * updatew3)

        self.weights1 -= updatew1
        self.weights2 -= updatew2
        self.weights3 -= updatew3

def forwardPass(self, inputs):
    """
    inputs is a numpy array of shape (num_train, D) containing the training images
    consisting of num_train samples each of dimension D.
    """
    # layer 1: forward pass on the first hidden layer with the sigmoid function
    self.hidden1 = np.dot(inputs, self.weights1)
    self.hidden1 = 1.0/(1.0+np.exp(-self.beta*self.hidden1))
    self.hidden1 = np.concatenate((self.hidden1, -np.ones((np.shape(self.hidden1)[0], 1))), axis=1)

    # layer 2: forward pass on the second hidden layer with the sigmoid function
    self.hidden2 = np.dot(self.hidden1, self.weights2)
    self.hidden2 = 1.0/(1.0+np.exp(-self.beta*self.hidden2))
    self.hidden2 = np.concatenate((self.hidden2, -np.ones((np.shape(self.hidden2)[0], 1))), axis=1)

    # output layer: forward pass with the softmax function
    outputs = np.dot(self.hidden2, self.weights3)
    outputs = np.exp(outputs)
    outputs /= np.repeat(np.sum(outputs, axis=1), outputs.shape[1], axis=0).reshape(outputs.shape)
    return outputs
Update: I have since figured out something that I messed up during the backpropagation of the SoftMax algorithm. The actual deltao should be:
deltao = self.outputs - targets
placeholder = np.zeros(np.shape(self.outputs))
for j in range(np.shape(self.outputs)[1]):
    y = self.outputs[:, j]
    placeholder[:, j] = y * (1 - y)
    # the counter for the loop below used to also be named y, causing confusion
    for i in range(np.shape(self.outputs)[1]):
        if not i == j:
            placeholder[:, j] += -y * self.outputs[:, i]
deltao *= placeholder
After this correction the overflow errors seem to have sorted themselves out. However, there is now a new problem: no matter what variables I change, the accuracy of the perceptron does not exceed 15%.
Second Update: After a long time I have finally found a way to get my code to work. I had to change the backpropagation of SoftMax (in code this is called deltao) to the following:
deltao = np.exp(self.outputs)
deltao /= np.repeat(np.sum(deltao, axis=1), deltao.shape[1]).reshape(deltao.shape)
deltao = deltao * (1 - deltao)
deltao *= (self.outputs - targets) / np.shape(inputs)[0]
The only problem is I have no idea why this works as a derivative of SoftMax. Could anyone explain this?
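A likely explanation (the standard softmax result, not something specific to this code): when a softmax output layer is paired with the cross-entropy loss L = -sum(targets * log(outputs)), the softmax Jacobian and the loss gradient cancel, and the output delta collapses to (outputs - targets). The factor (self.outputs - targets)/np.shape(inputs)[0] in the working code is exactly this gradient averaged over the batch, while the surrounding deltao * (1 - deltao) term acts as an extra element-wise scaling rather than a true softmax derivative. A minimal sketch of the usual form:

# softmax + cross-entropy simplification (a sketch, not the original code):
# L = -sum(targets * log(outputs))  =>  dL/dz_out = outputs - targets
deltao = (self.outputs - targets) / np.shape(inputs)[0]  # averaged over the batch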

Dimension out of range when applying l2 normalization in Pytorch

I'm getting a runtime error:
RuntimeError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
and can't figure out how to fix it.
The error appears to refer to the line:
i_enc = F.normalize(input=i_batch, p=2, dim=1, eps=1e-12)  # (batch, K, feat_dim)
I'm trying to encode image features (batch x 36 x 2048) by applying an L2 norm. Below is the full code for the section.
def forward(self, q_batch, i_batch):
    # batch size = 512
    # q -> 512(batch) x 14(length)
    # i -> 512(batch) x 36(K) x 2048(f_dim)

    # one-hot -> glove
    emb = self.embed(q_batch)
    output, hn = self.gru(emb.permute(1, 0, 2))
    q_enc = hn.view(-1, self.h_dim)

    # image encoding with l2 norm
    i_enc = F.normalize(input=i_batch, p=2, dim=1, eps=1e-12)  # (batch, K, feat_dim)

    q_enc_copy = q_enc.repeat(1, self.K).view(-1, self.K, self.h_dim)
    q_i_concat = torch.cat((i_enc, q_enc_copy), -1)
    q_i_concat = self.non_linear(q_i_concat, self.td_W, self.td_W2)  # 512 x 36 x 512
    i_attention = self.att_w(q_i_concat)  # 512 x 36 x 1
    i_attention = F.softmax(i_attention.squeeze(), 1)

    # weighted sum
    i_enc = torch.bmm(i_attention.unsqueeze(1), i_enc).squeeze()  # (batch, feat_dim)

    # element-wise multiplication
    q = self.non_linear(q_enc, self.q_W, self.q_W2)
    i = self.non_linear(i_enc, self.i_W, self.i_W2)
    h = torch.mul(q, i)  # (batch, hid_dim)

    # output classifier: BCE with logits loss
    score = self.c_Wo(self.non_linear(h, self.c_W, self.c_W2))
    return score
I would appreciate any help.
Thanks
I would suggest checking the shape of i_batch (e.g. print(i_batch.shape)), as I suspect i_batch has only 1 dimension (e.g. shape [N]).
This would explain why PyTorch is complaining that you can normalize only over dimension #0, while you are asking for the operation to be done over dimension #1 (cf. dim=1).
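To see this concretely, here is a minimal reproduction of the suspected cause (the shapes are illustrative):

import torch
import torch.nn.functional as F

ok = torch.randn(512, 36, 2048)
F.normalize(ok, p=2, dim=1)   # fine: dim=1 exists

bad = torch.randn(512)        # 1-D tensor
F.normalize(bad, p=2, dim=1)  # RuntimeError: Dimension out of range
                              # (expected to be in range of [-1, 0], but got 1)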

Splitting ndarray gives unexpected results (TensorFlow RNN tutorial)

I am following a tutorial on RNNs in TensorFlow, but I have a question concerning the input formats.
They take raw_x (a one-hot vector) and first cut it up into pieces of length 200 (batch_size) to form data_x. That is good.
Then they further cut up data_x into pieces of length 5 (num_steps, or graph width) with:
for i in range(epoch_size):
    x = data_x[:, i * num_steps:(i + 1) * num_steps]
    y = data_y[:, i * num_steps:(i + 1) * num_steps]
    yield (x, y)
However, if I look at the data, the slices of x do not match data_x. The first one does, but then they diverge.
Am I misunderstanding the above code? I would like to understand how x is being created and what it is supposed to look like. I had expected the second item to be 0 1 0 1 0.
Also, I thought an epoch is when you go through the data completely; from this it seems that they split up the data in 1000 parts (epoch_size)?
If it helps, this is my full code. I am trying to figure out what is going on with x, at line 48:
import numpy as np
import tensorflow as tf
# %matplotlib inline
import matplotlib.pyplot as plt

# Global config variables
num_steps = 5  # number of truncated backprop steps ('n' in the discussion above)
batch_size = 200
num_classes = 2
state_size = 4
learning_rate = 0.1

def gen_data(size=1000000):
    print('generating data')
    X = np.array(np.random.choice(2, size=(size,)))
    Y = []
    for i in range(size):
        threshold = 0.5
        if X[i-3] == 1:
            threshold += 0.5
        if X[i-8] == 1:
            threshold -= 0.25
        if np.random.rand() > threshold:
            Y.append(0)
        else:
            Y.append(1)
    return X, np.array(Y)

# adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/models/rnn/ptb/reader.py
def gen_batch(raw_data, batch_size, num_steps):
    print('generating batches')
    raw_x, raw_y = raw_data
    data_length = len(raw_x)

    # partition raw data into batches and stack them vertically in a data matrix
    batch_partition_length = data_length // batch_size
    data_x = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
    data_y = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
    for i in range(batch_size):
        data_x[i] = raw_x[batch_partition_length * i:batch_partition_length * (i + 1)]
        data_y[i] = raw_y[batch_partition_length * i:batch_partition_length * (i + 1)]

    # further divide batch partitions into num_steps for truncated backprop
    epoch_size = batch_partition_length // num_steps
    for i in range(epoch_size):
        x = data_x[:, i * num_steps:(i + 1) * num_steps]
        y = data_y[:, i * num_steps:(i + 1) * num_steps]
        yield (x, y)

def gen_epochs(n, num_steps):
    for i in range(n):
        yield gen_batch(gen_data(), batch_size, num_steps)

"""
Placeholders
"""
x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')
init_state = tf.zeros([batch_size, state_size])

"""
RNN Inputs
"""
# Turn our x placeholder into a list of one-hot tensors:
# rnn_inputs is a list of num_steps tensors with shape [batch_size, num_classes]
x_one_hot = tf.one_hot(x, num_classes)
rnn_inputs = tf.unstack(x_one_hot, axis=1)

"""
Definition of rnn_cell
This is very similar to the __call__ method on Tensorflow's BasicRNNCell. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn_cell.py
"""
with tf.variable_scope('rnn_cell'):
    W = tf.get_variable('W', [num_classes + state_size, state_size])
    b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))

def rnn_cell(rnn_input, state):
    with tf.variable_scope('rnn_cell', reuse=True):
        W = tf.get_variable('W', [num_classes + state_size, state_size])
        b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
    return tf.tanh(tf.matmul(tf.concat(axis=1, values=[rnn_input, state]), W) + b)

"""
Adding rnn_cells to graph
This is a simplified version of the "rnn" function from Tensorflow's api. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn.py
"""
state = init_state
rnn_outputs = []
for rnn_input in rnn_inputs:
    state = rnn_cell(rnn_input, state)
    rnn_outputs.append(state)
final_state = rnn_outputs[-1]

"""
Predictions, loss, training step
Losses and total_loss are similar to the "sequence_loss_by_example" and "sequence_loss"
functions, respectively, from Tensorflow's api. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/seq2seq.py
"""
# logits and predictions
with tf.variable_scope('softmax'):
    W = tf.get_variable('W', [state_size, num_classes])
    b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]
predictions = [tf.nn.softmax(logit) for logit in logits]

# Turn our y placeholder into a list of labels
y_as_list = [tf.squeeze(i, axis=[1]) for i in tf.split(axis=1, num_or_size_splits=num_steps, value=y)]

# losses and train_step
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logit, labels=label)
          for logit, label in zip(logits, y_as_list)]
total_loss = tf.reduce_mean(losses)
train_step = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)

"""
Function to train the network
"""
def train_network(num_epochs, num_steps, state_size=4, verbose=True):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        training_losses = []
        for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps)):
            training_loss = 0
            training_state = np.zeros((batch_size, state_size))
            if verbose:
                print("\nEPOCH", idx)
            for step, (X, Y) in enumerate(epoch):
                tr_losses, training_loss_, training_state, _ = \
                    sess.run([losses,
                              total_loss,
                              final_state,
                              train_step],
                             feed_dict={x: X, y: Y, init_state: training_state})
                training_loss += training_loss_
                if step % 100 == 0 and step > 0:
                    if verbose:
                        print("Average loss at step", step,
                              "for last 250 steps:", training_loss/100)
                    training_losses.append(training_loss/100)
                    training_loss = 0
        return training_losses

training_losses = train_network(1, num_steps)
plt.plot(training_losses)
It seems the batches are actually transposed.
So the first row of the x matrix (200 x 5) will match the first 5 elements of raw_x.
Then only in the next iteration will the next elements (5-10) of raw_x appear, again in the first row of x.
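A small numpy demo of this layout, with toy sizes standing in for batch_size=200 and num_steps=5:

import numpy as np

raw_x = np.arange(20)            # stand-in for the 1,000,000-element sequence
batch_size, num_steps = 4, 2
bpl = len(raw_x) // batch_size   # batch_partition_length = 5

data_x = raw_x.reshape(batch_size, bpl)
# row i holds the contiguous chunk raw_x[i*bpl:(i+1)*bpl]

x0 = data_x[:, 0:num_steps]             # first yielded x -- a column slice
x1 = data_x[:, num_steps:2*num_steps]   # second yielded x
print(x0)  # [[ 0  1] [ 5  6] [10 11] [15 16]]
print(x1)  # [[ 2  3] [ 7  8] [12 13] [17 18]]
# only row 0 continues raw_x contiguously; the other rows jump by bpl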
