Related
I have written a handwritten CNN neural network from scratch which takes a matrix [784x1] as an input from the mnist dataset and outputs a result of a [10x1] matrix with the index of the matrix representing the digit the neural network believes that the digit was.
I have a variable number of hidden layers, however my code is O(n^2) so having even just two hidden layers drastically slows down the program. The weights of each layer are stored as a matrix [num_outputs_of_layer x num_inputs_of_layer].
I am using the softmax function which I believe the derivative is: f'(x) = f(x)(1-f(x)) ?
MATHS:
I am amending the weights for the final hidden layer by multiplying the delta matrix (output - expected), by o x (1 - o) where o is the output of the final layer. I then matrix multiply this by the transposed inputs of this layer to get a change in weights. The change of bias is just the delta x [o x (1-o)]
For any other layer (including the weights of input layer), I take the delta and multiply it by the derivative of the final hidden layer which gives a [10 x 1] matrix
I then matrix multiply this by the weight matrix of final hidden layer [a x 10] which leaves a [a x 1] matrix, I then multiply this by the derivative of the output of the next layer which is also an [a x 1] matrix and then by the weights [b x 1] to give a [b x 1]. I do this recurrently until I get to the layer I am trying to change, I do not multiply this by the weight matrix but by the transposed input matrix.
This makes logical sense to me however it may well be wrong? Could someone please check my logic and whether I have coded this correctly as the neural network doesn't seem to be learning.
The issue is that currently it learns with one hidden layer but only to roughly 50%. I think this is due to the fact that one hidden layer is not complex enough?
As soon as I add more hidden layers it doesn't learn and averages about 10% correct (so random) eventually it also after a while tends to guess the same number for each photo ie constantly guesses 5. This improved a little with changing the starting weights, however it still does it sometimes and still doesn't learn.
If anyone could provide any suggestions of why it isn't working that would be much appreciated. I assume it is to do with my maths however I can't for the life of me see where Ive gone wrong!!!
Here is my code:
`
import numpy as np
from tensorflow.keras.datasets import mnist
class Run:
def __init__(self, num_hidden_layers, num_input, num_output):
self.layers = []
self.inputNumbers = []
self.count = 0
self.countTrue = 0
self.num_layers = num_hidden_layers + 1
self.learningRate = 0.1
for i in range(num_hidden_layers + 1):
if i != num_hidden_layers:
self.inputNumbers.append(num_input)
num_out = int(num_input * 8/9)
if num_out < num_output:
num_out = num_output
self.layers.append(Layer(num_input, num_out))
if num_input * 8/9 > num_output:
num_input = int(num_input * 8/9)
else:
num_input = 10
else:
self.inputNumbers.append(num_input)
self.layers.append(Layer(num_input, num_output))
def runNN(self, input):
for i in range(self.num_layers):
self.layers[i].calc_output(input)
input = self.layers[i].fin_outputs
self.NN_Output = input
#print(max(self.NN_Output))
def check_if_correct(self, expected):
list = []
list1 = []
for each in self.NN_Output:
list.append(float(each[0]))
for each in expected:
list1.append(float(each[0]))
print(list.index(max(list)), list1.index(max(list1)))
if list.index(max(list)) == list1.index(max(list1)):
self.countTrue += 1
self.count += 1
print(self.countTrue/self.count)
print('')
if self.count > 1000:
self.count = 0
self.countTrue = 0
def change_weights(self, expected):
change_weights = []
change_bias = []
delta = self.NN_Output - expected
for i in range(self.num_layers):
first = True
total = 0
for each in range(i):
if first == True:
deriv_soft = self.layers[self.num_layers - each - 1].deriv_softmax()
weight_mat = np.transpose(self.layers[self.num_layers - each - 1].getter()[0])
total = np.matmul(weight_mat, np.multiply(delta, deriv_soft))
first = False
else:
deriv_soft = self.layers[self.num_layers - each - 1].deriv_softmax()
weight_mat = np.transpose(self.layers[self.num_layers - each - 1].getter()[0])
total = np.multiply(total, deriv_soft)
total = np.matmul(weight_mat, total)
if i == 0:
deriv_soft = self.layers[self.num_layers - 1].deriv_softmax()
total = np.multiply(delta, deriv_soft)
change = np.matmul(total, np.transpose(self.layers[self.num_layers - i - 1].inputs)) * self.learningRate
change_weights.append(change)
change_bias.append(total * self.learningRate)
for i in range(self.num_layers):
self.layers[self.num_layers - i - 1].amend(change_weights[i], change_bias[i])
class Layer:
def __init__(self, num_inputs, num_outputs):
self.__weights = np.random.uniform(-10, 10, (num_outputs, num_inputs))
self.__bias = np.matrix([[float(0)] for x in range(num_outputs)])
def calc_output(self, inputs):
self.inputs = inputs
self.__output_1 = np.matmul(self.__weights, inputs) + self.__bias
self.softmax()
def softmax(self):
sum = 0
for each in self.__output_1:
sum += np.exp(float(each[0]))
list1 = []
for each in self.__output_1:
list1.append([float(np.exp(each[0])/sum)])
self.fin_outputs = np.matrix(list1)
return np.matrix(list1)
def deriv_softmax(self):
list = []
for each in self.fin_outputs:
list.append([float(each[0]*(1-each[0]))])
self.deriv = np.matrix(list)
return self.deriv
def amend(self, change_weights, change_bias):
self.__weights -= change_weights
self.__bias -= change_bias
def getter(self):
return self.__weights, self.__bias
class GetInput:
def __init__(self):
(self.X_train, self.Y_train), (X_test, Y_test) = mnist.load_data()
self.X_train = self.X_train.reshape(self.X_train.shape[0], 28, 28, 1)
x_test = X_test.reshape(X_test.shape[0], 28, 28, 1)
def get(self, i):
list = []
newPhoto = self.X_train[i].astype('float32')/255
for each in newPhoto:
for n in each:
list.append([float(n)])
input = np.matrix(list)
list = []
expect = self.Y_train[i]
for each in range(10):
if each == expect:
list.append([1])
else:
list.append([0])
expected = np.matrix(list)
return input, expected
Neural = Run(2, 784, 10)
getinputs = GetInput()
count = 0
while True:
input, expected = getinputs.get(count)
Neural.runNN(input)
Neural.check_if_correct(expected)
Neural.change_weights(expected)
count += 1
`
Many thanks
Daniel
I have tried changing:
number of hidden layers
starting weights
learning rate
I am trying to create a multi-layered perceptron for the purpose of classifying a dataset of hand drawn digits obtained from the MNIST database. It implements 2 hidden layers that have a sigmoid activation function while the output layer utilizes SoftMax. However, for whatever reason I am not able to get it to work. I have attached the training loop from my code below, this I am confident is where the problems stems from. Can anyone identify possible issues with my implementation of the perceptron?
def train(self, inputs, targets, eta, niterations):
"""
inputs is a numpy array of shape (num_train, D) containing the training images
consisting of num_train samples each of dimension D.
targets is a numpy array of shape (num_train, D) containing the training labels
consisting of num_train samples each of dimension D.
eta is the learning rate for optimization
niterations is the number of iterations for updating the weights
"""
ndata = np.shape(inputs)[0] # number of data samples
# adding the bias
inputs = np.concatenate((inputs, -np.ones((ndata, 1))), axis=1)
# numpy array to store the update weights
updatew1 = np.zeros((np.shape(self.weights1)))
updatew2 = np.zeros((np.shape(self.weights2)))
updatew3 = np.zeros((np.shape(self.weights3)))
for n in range(niterations):
# forward phase
self.outputs = self.forwardPass(inputs)
# Error using the sum-of-squares error function
error = 0.5*np.sum((self.outputs-targets)**2)
if (np.mod(n, 100) == 0):
print("Iteration: ", n, " Error: ", error)
# backward phase
deltao = self.outputs - targets
placeholder = np.zeros(np.shape(self.outputs))
for j in range(np.shape(self.outputs)[1]):
y = self.outputs[:, j]
placeholder[:, j] = y * (1 - y)
for y in range(np.shape(self.outputs)[1]):
if not y == j:
placeholder[:, j] += -y * self.outputs[:, y]
deltao *= placeholder
# compute the derivative of the second hidden layer
deltah2 = np.dot(deltao, np.transpose(self.weights3))
deltah2 = self.hidden2*self.beta*(1.0-self.hidden2)*deltah2
# compute the derivative of the first hidden layer
deltah1 = np.dot(deltah2[:, :-1], np.transpose(self.weights2))
deltah1 = self.hidden1*self.beta*(1.0-self.hidden1)*deltah1
# update the weights of the three layers: self.weights1, self.weights2 and self.weights3
updatew1 = eta*(np.dot(np.transpose(inputs),deltah1[:, :-1])) + (self.momentum * updatew1)
updatew2 = eta*(np.dot(np.transpose(self.hidden1),deltah2[:, :-1])) + (self.momentum * updatew2)
updatew3 = eta*(np.dot(np.transpose(self.hidden2),deltao)) + (self.momentum * updatew3)
self.weights1 -= updatew1
self.weights2 -= updatew2
self.weights3 -= updatew3
def forwardPass(self, inputs):
"""
inputs is a numpy array of shape (num_train, D) containing the training images
consisting of num_train samples each of dimension D.
"""
# layer 1
# the forward pass on the first hidden layer with the sigmoid function
self.hidden1 = np.dot(inputs, self.weights1)
self.hidden1 = 1.0/(1.0+np.exp(-self.beta*self.hidden1))
self.hidden1 = np.concatenate((self.hidden1, -np.ones((np.shape(self.hidden1)[0], 1))), axis=1)
# layer 2
# the forward pass on the second hidden layer with the sigmoid function
self.hidden2 = np.dot(self.hidden1, self.weights2)
self.hidden2 = 1.0/(1.0+np.exp(-self.beta*self.hidden2))
self.hidden2 = np.concatenate((self.hidden2, -np.ones((np.shape(self.hidden2)[0], 1))), axis=1)
# output layer
# the forward pass on the output layer with softmax function
outputs = np.dot(self.hidden2, self.weights3)
outputs = np.exp(outputs)
outputs /= np.repeat(np.sum(outputs, axis=1),outputs.shape[1], axis=0).reshape(outputs.shape)
return outputs
Update: I have since figured something out that I messed up during the backpropagation of the SoftMax algorithm. The actual deltao should be:
deltao = self.outputs - targets
placeholder = np.zeros(np.shape(self.outputs))
for j in range(np.shape(self.outputs)[1]):
y = self.outputs[:, j]
placeholder[:, j] = y * (1 - y)
# the counter for the for loop below used to also be named y causing confusion
for i in range(np.shape(self.outputs)[1]):
if not i == j:
placeholder[:, j] += -y * self.outputs[:, i]
deltao *= placeholder
After this correction the overflow errors have seemed to have sorted themselves however, there is now a new problem, no matter my efforts the accuracy of the perceptron does not exceed 15% no matter what variables I change
Second Update: After a long time I have finally found a way to get my code to work. I had to change the backpropogation of SoftMax (in code this is called deltao) to the following:
deltao = np.exp(self.outputs)
deltao/=np.repeat(np.sum(deltao,axis=1),deltao.shape[1]).reshape(deltao.shape)
deltao = deltao * (1 - deltao)
deltao *= (self.outputs - targets)/np.shape(inputs)[0]
Only problem is I have no idea why this works as a derivative of SoftMax could anyone explain this?
I'm working on a school project and am stuck on how to implement backpropagation in Numpy with the current forward prop structure I have. The aim of this script is to make a simple dynamic (meaning any number of layers and nodes) fully connected network using only numpy.
I think that I have to find the derivatives of the activation functions and multipliy it by the original error as well as the derivative of each activation function I encounter moving backward.
However, I'm having trouble figuring out how to implement this correctly in my script.
It'd be a great help if someone could explain in English what exactly I have to do given the complexities of the setup here, or even give a recommendation for a video/post that deals w dynamic size backprop.
Right now all the weights and biases are being stored in lists for future backprop, and I'm able to get the error for each output with the small amount of code currently in the backprop function.
This code block
#initialize a test model w/ 128 bacth and lr of 0.01
model = Model(128, 0.01)
#simple x data input
X = np.array([[1,1],[0,0],[12,5]])
Y = np.array([[1],[0],[-1]])
#adding 4 layers
z = model.add(X, 3, "sigmoid")
z = model.add(z, 1, "sigmoid", output=True)
#this is a full forward pass through the layers
z = model.predict(X)
print(z)
#this is the error of the predictions
print(model.backprop(z, Y))
Outputs the following vectors:
[[0.50006457]
[0.50006459]
[0.50006431]]
[[0.24993544]
[0.2500646 ]
[2.25019293]]
Like I said, not sure how to move forward ( or backward ;) ) from here.
Below is the full script needed to run the example:
import math
import numpy as np
#everything below is defining activation functions
#--------------------------------------------------------------------------------------------
def b_relu(input):
return max((0, max(input)))
def bd_relu(input):
if(input < 0 or input == 0):
return 0
else:
return 1
def b_sigmoid(x):
return 1 / (1 + math.exp(-x))
def bd_sigmoid(input):
return sigmoid(input) * (1 - sigmoid(input))
def b_tanh(input):
top = (math.exp(input) - math.exp(-input))
bottom = (math.exp(input) + math.exp(-input))
return (top/bottom)
#helper functions for tanh
def cosh(input):
return ((math.exp(input) + math.exp(-input)) / 2)
def sinh(input):
return ((math.exp(input) - math.exp(-input)) / 2)
def bd_tanh(input):
top = (math.pow(cosh(input), 2) - math.pow(sinh(input), 2))
bottom = math.pow(input, 2)
return (top / bottom)
def b_softmax(z):
# subracting the max adds numerical stability
shiftx = z - np.max(z,axis=1)[:,np.newaxis]
exps = np.exp(shiftx)
return exps / np.sum(exps,axis=1)[:,np.newaxis]
def bd_softmax(Y_hat, Y):
return Y_hat - Y
def b_linear(input):
return input
def bd_linear(input):
return 1
#vectorizing the activation and deriv. activation functions
relu = np.vectorize(b_relu)
d_relu = np.vectorize(bd_relu)
sigmoid = np.vectorize(b_sigmoid)
d_sigmoid = np.vectorize(bd_sigmoid)
tanh = np.vectorize(b_tanh)
d_tanh = np.vectorize(bd_tanh)
softmax = np.vectorize(b_softmax)
d_softmax = np.vectorize(bd_softmax)
linear = np.vectorize(b_linear)
d_linear = np.vectorize(bd_linear)
class Model:
def __init__(self, batch, lr):
#initializing self lists to keep track of stuff for bacthes, forward prop & backporp
self.batch = batch
self.lr = lr
self.W = []
self.B = []
self.A = []
self.Z = []
self.X = []
self.layers = []
self.tempW = []
self.tempB = []
#store error for backprop
self.output_error = []
#initialize the weights during 'model.add' so we can test our network shapes dynamically w/out model.compile
#added an output bool here so we can make sure the shape of the output network is (1,n)
def initial_weights(self, input_data, output_shape, output=False):
B = np.zeros((1, output_shape))
#assigning the shape
W = np.random.uniform(-1e-3, 1e-3, size = (input_data.shape[len(input_data.shape) - 1], output_shape))
self.B.append(B)
self.W.append(W)
def add(self, input_data, output_shape, activation, output=False):
#append to layers so we have a correct index value
self.layers.append(69)
#making sure our data in a numpy array
if (type(input_data) == np.ndarray):
X = input_data
else:
X = np.asarray(input_data)
#adding data and activations to self lists
self.X.append(X)
self.A.append(activation)
#keep track of our index & initializing random weights for dynamic comatibility testing
index = len(self.layers)-1
self.initial_weights(input_data, output_shape, output=False)
X2 = self.forward(input_data, index)
#printing layer info
print("Layer:", index)
print("Input Shape: ", X.shape)
print("Weight Shape: ", self.W[index].shape)
print("Output Shape: ", X2.shape)
print(" ")
return(X2)
def forward(self, input_data, index):
#pulling weights and biases from main lists for operations
B = self.B[index]
W = self.W[index]
#matmul of data # weights + bias
Z = np.matmul(input_data, W) + B
#summing each row of inputs to activation node
for x in Z:
x = sum(x)
#pulling activation from index
act = str(self.A[index])
#activating
Z = activate(Z, act)
#keeping track of Z i guess
self.Zappend = Z
return(Z)
def predict(self, input_data):
for x in range(len(self.layers)):
z = model.forward(input_data, x)
input_data = z
return z
def backprop(self, model_output, ground_truth):
#------------------------------
#now begins the backprop portion
#let's start with finding the error between predictions and actual values
#gonna do MSE to keep it simple
self.output_error = (ground_truth - model_output) ** 2
#so now we have the error of the output layer, this tells us two things, how wrong we were, and in which direction we should update
#the outputs of these nodes
'''
What to do if this was linear regression (for m & b)
1. Take the error and multiply it by the transpose of the last layer weights
(I think the error in this case is where the prime activation function should be if we had activations)
2. The last layer bias is just the error
3. The second to last layer inputs is the bias times the transpose of second layers weights
3. Then I have no idea
'''
return self.output_error
I am trying to build a simple neural network class from scratch using numpy, and test it using the XOR problem. But the backpropagation function (backprop) does not seem to be working correctly.
In the class, I construct instances by passing in the size of each layer, and the activation functions to use at each layer. I assume that the final activation function is softmax, so that I can calculate the derivative of cross-entropy loss wrt to Z of the last layer. I also do not have a separate set of bias matrices in my class. I just include them in the weight matrices as an extra column at the end.
I know that my backprop function is not working correctly, because the neural network does not ever converge on a somewhat correct output. I also created a numerical gradient function, and when comparing the results of both. I get drastically different numbers.
My understanding from what I have read is that the delta values of each layer (with L being the last layer, and i representing any other layer) should be:
And the respective gradients/weight-update of those layers should be:
Where * is the hardamard product, a represents the activation of some layer, and z represents the nonactivated output of some layer.
The sample data that I am using to test this is at the bottom of the file.
This is my first time trying to implement the backpropagation algorithm from scratch. So I am a bit lost on where to go from here.
import numpy as np
def sigmoid(n, deriv=False):
if deriv:
return np.multiply(n, np.subtract(1, n))
return 1 / (1 + np.exp(-n))
def softmax(X, deriv=False):
if not deriv:
exps = np.exp(X - np.max(X))
return exps / np.sum(exps)
else:
raise Error('Unimplemented')
def cross_entropy(y, p, deriv=False):
"""
when deriv = True, returns deriv of cost wrt z
"""
if deriv:
ret = p - y
return ret
else:
p = np.clip(p, 1e-12, 1. - 1e-12)
N = p.shape[0]
return -np.sum(y*np.log(p))/(N)
class NN:
def __init__(self, layers, activations):
"""random initialization of weights/biases
NOTE - biases are built into the standard weight matrices by adding an extra column
and multiplying it by one in every layer"""
self.activate_fns = activations
self.weights = [np.random.rand(layers[1], layers[0]+1)]
for i in range(1, len(layers)):
if i != len(layers)-1:
self.weights.append(np.random.rand(layers[i+1], layers[i]+1))
for j in range(layers[i+1]):
for k in range(layers[i]+1):
if np.random.rand(1,1)[0,0] > .5:
self.weights[-1][j,k] = -self.weights[-1][j,k]
def ff(self, X, get_activations=False):
"""Feedforward"""
activations, zs = [], []
for activate, w in zip(self.activate_fns, self.weights):
X = np.vstack([X, np.ones((1, 1))]) # adding bias
z = w.dot(X)
X = activate(z)
if get_activations:
zs.append(z)
activations.append(X)
return (activations, zs) if get_activations else X
def grad_descent(self, data, epochs, learning_rate):
"""gradient descent
data - list of 2 item tuples, the first item being an input, and the second being its label"""
grad_w = [np.zeros_like(w) for w in self.weights]
for _ in range(epochs):
for x, y in data:
grad_w = [n+o for n, o in zip(self.backprop(x, y), grad_w)]
self.weights = [w-(learning_rate/len(data))*gw for w, gw in zip(self.weights, grad_w)]
def backprop(self, X, y):
"""perfoms backprop for one layer of a NN with softmax/cross_entropy output layer"""
(activations, zs) = self.ff(X, True)
activations.insert(0, X)
deltas = [0 for _ in range(len(self.weights))]
grad_w = [0 for _ in range(len(self.weights))]
deltas[-1] = cross_entropy(y, activations[-1], True) # assumes output activation is softmax
grad_w[-1] = np.dot(deltas[-1], np.vstack([activations[-2], np.ones((1, 1))]).transpose())
for i in range(len(self.weights)-2, -1, -1):
deltas[i] = np.dot(self.weights[i+1][:, :-1].transpose(), deltas[i+1]) * self.activate_fns[i](zs[i], True)
grad_w[i] = np.hstack((np.dot(deltas[i], activations[max(0, i-1)].transpose()), deltas[i]))
# check gradient
num_gw = self.gradient_check(X, y, i)
print('numerical:', num_gw, '\nanalytic:', grad_w)
return grad_w
def gradient_check(self, x, y, i, epsilon=1e-4):
"""Numerically calculate the gradient in order to check analytical correctness"""
grad_w = [np.zeros_like(w) for w in self.weights]
for w, gw in zip(self.weights, grad_w):
for j in range(w.shape[0]):
for k in range(w.shape[1]):
w[j,k] += epsilon
out1 = cross_entropy(self.ff(x), y)
w[j,k] -= 2*epsilon
out2 = cross_entropy(self.ff(x), y)
gw[j,k] = np.float64(out1 - out2) / (2*epsilon)
w[j,k] += epsilon # return weight to original value
return grad_w
##### TESTING #####
X = [np.array([[0],[0]]), np.array([[0],[1]]), np.array([[1],[0]]), np.array([[1],[1]])]
y = [np.array([[1], [0]]), np.array([[0], [1]]), np.array([[0], [1]]), np.array([[1], [0]])]
data = []
for x, t in zip(X, y):
data.append((x, t))
def nn_test():
c = NN([2, 2, 2], [sigmoid, sigmoid, softmax])
c.grad_descent(data, 100, .01)
for x in X:
print(c.ff(x))
nn_test()
UPDATE: I found one small bug in the code, but it still does not converge correctly. I calculated/derived the gradients for both matrices by hand and found no errors in my implementation, so I still do not know what is wrong with it.
UPDATE #2: I created a procedural version of what I was using above with the following code. Upon testing I discovered that the NN was able to learn the correct weights for classifying each of the 4 cases in XOR separately, but when I try to train using all the training examples at once (as shown), the resultant weights almost always output something around .5 for both output nodes. Could someone please tell me why this is occurring?
X = [np.array([[0],[0]]), np.array([[0],[1]]), np.array([[1],[0]]), np.array([[1],[1]])]
y = [np.array([[1], [0]]), np.array([[0], [1]]), np.array([[0], [1]]), np.array([[1], [0]])]
weights = [np.random.rand(2, 3) for _ in range(2)]
for _ in range(1000):
for i in range(4):
#Feedforward
a0 = X[i]
z0 = weights[0].dot(np.vstack([a0, np.ones((1, 1))]))
a1 = sigmoid(z0)
z1 = weights[1].dot(np.vstack([a1, np.ones((1, 1))]))
a2 = softmax(z1)
# print('output:', a2, '\ncost:', cross_entropy(y[i], a2))
#backprop
del1 = cross_entropy(y[i], a2, True)
dcdw1 = del1.dot(np.vstack([a1, np.ones((1, 1))]).T)
del0 = weights[1][:, :-1].T.dot(del1)*sigmoid(z0, True)
dcdw0 = del0.dot(np.vstack([a0, np.ones((1, 1))]).T)
weights[0] -= .03*weights[0]*dcdw0
weights[1] -= .03*weights[1]*dcdw1
i = 0
a0 = X[i]
z0 = weights[0].dot(np.vstack([a0, np.ones((1, 1))]))
a1 = sigmoid(z0)
z1 = weights[1].dot(np.vstack([a1, np.ones((1, 1))]))
a2 = softmax(z1)
print(a2)
Softmax doesn't look right
Using cross entropy loss, the derivative for softmax is really nice (assuming you are using a 1 hot vector, where "1 hot" essentially means an array of all 0's except for a single 1, ie: [0,0,0,0,0,0,1,0,0])
For node y_n it ends up being y_n-t_n. So for a softmax with output:
[0.2,0.2,0.3,0.3]
And desired output:
[0,1,0,0]
The gradient at each of the softmax nodes is:
[0.2,-0.8,0.3,0.3]
It looks as if you are subtracting 1 from the entire array. The variable names aren't very clear, so if you could possibly rename them from L to what L represents, such as output_layer I'd be able to help more.
Also, for the other layers just to clear things up. When you say a^(L-1) as an example, do you mean "a to the power of (l-1)" or do you mean "a xor (l-1)"? Because in python ^ means xor.
EDIT:
I used this code and found the strange matrix dimensions (modified at line 69 in the function backprop)
deltas = [0 for _ in range(len(self.weights))]
grad_w = [0 for _ in range(len(self.weights))]
deltas[-1] = cross_entropy(y, activations[-1], True) # assumes output activation is softmax
print(deltas[-1].shape)
grad_w[-1] = np.dot(deltas[-1], np.vstack([activations[-2], np.ones((1, 1))]).transpose())
print(self.weights[-1].shape)
print(activations[-2].shape)
exit()
So I am building an RNN from scratch using numpy just to get the hang of how they work internally. My backpropagation through time is here:
def backprop_through_time(self, X, Y):
assert(len(X.shape) == 3)
seq_length = Y.shape[1] if self.return_sequences else 1
_, (Z_states, States, Z_outs, Outs) = self.feed_forward(X, cache=True)
if not self.return_sequences:
Outs = Outs[:,-1,:]
# setup gradients
dLdU = np.zeros(self.U.shape)
dLdV = np.zeros(self.V.shape)
dLdW = np.zeros(self.W.shape)
dLdB_state = np.zeros(self.B_state.shape)
dLdB_out = np.zeros(self.B_out.shape)
dLdOuts = self.loss_function_prime(Outs, Y)
if not self.return_sequences:
# we need dLdOuts to have a seq_length dim at axis 1
dLdOuts = np.expand_dims(dLdOuts, axis=1)
for t in range(seq_length):
adjusted_t = seq_length-1 if not self.return_sequences else t
# print("adjusted_t {}".format(adjusted_t))
dOuts_tdZ_out = self.output_activation_function_prime(Z_outs[:,adjusted_t,:])
dLdZ_out = np.multiply(dLdOuts[:, adjusted_t, :], dOuts_tdZ_out)
# Z_state = dot(X_t, self.U) + dot(State_{t-1}, self.W) + self.B_state
# State_t = f(Z_state)
# Z_out = dot(State_t, self.V) + self.B_out
# Out_t = g(Z_out)
dLdV += np.dot(States[:,adjusted_t,:].T, dLdZ_out)
dLdB_out += np.sum(dLdZ_out, axis=0, keepdims=True)
dLdZ_state = np.multiply(np.dot(dLdZ_out, self.V.T),
self.hidden_activation_function_prime(Z_states[:,adjusted_t,:]))
for t_prev in range(max(0, adjusted_t-self.backprop_through_time_limit), adjusted_t+1)[::-1]:
dLdB_state += np.sum(dLdZ_state, axis=0, keepdims=True)
dLdW += np.dot(States[:,t_prev-1,:].T, dLdZ_state)
dLdU += np.dot(X[:,t_prev,:].T, dLdZ_state)
dLdZ_state = np.multiply(np.dot(dLdZ_state, self.W.T),
self.hidden_activation_function_prime(States[:,t_prev-1,:]))
return (dLdU, dLdV, dLdW), (dLdB_state, dLdB_out)
However I am still failing a gradient check for parameters `dLdU, dLdW, dLdB_state`. I have gone through the math about a dozen times now, and I cannot find what is wrong with my implementation.
I assume X and Y both are 3D matrices with X having shape: X.shape := (batch_size, seq_length, input_dim)
while Y having shape: Y.shape := (batch_size, seq_length, output_dim)
Caching the feed_forward operation, I am returning Z_states with shape Z_states.shape := (batch_size, seq_length, hidden_dim), Z_outs and Outs with shape Z_outs.shape, Outs.shape := (batch_size, seq_length, output_dim), and States as States.shape := (batch_size, seq_length+1, hidden_dim). States[:,-1,:] is the original zeros of shape States[:,-1,:].shape := (batch_size, hidden_dim) that the RNN state is initialized with. Could anyone help me?
EDIT
I found my answer. My math is right, but I was calling the wrong variable. When I update dLdZ_state in the 2nd inner loop (the backprop through time part), I am multiplying with self.hidden_activation_function_prime(States[:,t_prev-1,:]) This shoud instead be self.hidden_activation_function_prime(Z_states[:,t_prev-1,:])