Backpropagation trouble: total cost keeps getting higher until it reaches infinity - python

I made a fully connected (FC) neural network with NumPy, based on the Welch Labs videos, but when I try to train it I seem to get exploding gradients right from the start, which is weird. I will put down the whole code, which is testable in Python 3+. Only costFunctionPrime seems to break the gradient descent, but I have no idea what is happening. Can someone smarter than me help?
EDIT: the trng_input and trng_output are not the one I use, I use a big dataset
import numpy as np
import random
# Stand-in data set: 100 samples, each with 7 random inputs and 2 random
# targets drawn from [0, 1).
trng_input = [[random.random() for _ in range(7)] for _ in range(100)]
trng_output = [[random.random() for _ in range(2)] for _ in range(100)]
def relu(x):
    """Apply the rectified linear unit element-wise.

    ``x`` is multiplied by the boolean mask ``x > 0``, so positive entries
    pass through unchanged and all other entries become zero.
    """
    return x * (x > 0)
def reluprime(x):
    """Return the ReLU derivative of ``x``: 1 where ``x > 0`` and 0 elsewhere.

    ``x`` must expose ``dtype`` (for example a NumPy array); the result is
    cast back to that dtype.
    """
    return (x > 0).astype(x.dtype)
class Neural_Net():
    """Fully connected network trained with full-batch gradient descent.

    Layer widths come from ``self.nodes`` and every layer is passed through
    the module-level ``relu``.  Creating an instance initialises the weights
    and immediately runs 1000 training iterations.
    """

    def __init__(self, data_input, data_output):
        """Store the data set, create the weights and train.

        Args:
            data_input: training samples, one row per sample; the row length
                must match ``self.nodes[0]``.
            data_output: training targets, one row per sample; the row length
                must match ``self.nodes[-1]``.
        """
        self.data_input = data_input
        # Use the argument: previously the module-level ``trng_output`` was
        # read here and ``data_output`` was silently ignored.
        self.trng_output = np.asarray(data_output)
        self.bias = 0
        self.nodes = np.array([7, 2])
        self.LR = 0.01
        self.weightinit()
        self.training(1000, self.LR)

    def randomweight(self, n):
        """Return a list of ``n`` floats drawn uniformly from [-1, 1]."""
        return [random.uniform(-1, 1) for _ in range(n)]

    def weightinit(self):
        """Create one random weight matrix per pair of consecutive layers.

        Each matrix has ``nodes[k] + self.bias`` rows and ``nodes[k + 1]``
        columns.
        """
        self.weights = [
            np.array([self.randomweight(n_out) for _ in range(n_in + self.bias)])
            for n_in, n_out in zip(self.nodes[:-1], self.nodes[1:])
        ]

    def forward(self, data):
        """Propagate ``data`` through the network and return the last activation.

        Pre-activations are kept in ``self.Z`` and activations (starting with
        the input itself) in ``self.A`` for use by ``costFunctionPrime``.
        """
        self.Z = []
        self.A = [np.array(data)]
        for layer, weight in enumerate(self.weights):
            self.Z.append(np.dot(self.A[layer], weight))
            # NOTE(review): the output layer goes through ReLU as well, so an
            # output unit whose pre-activation is <= 0 receives zero gradient.
            self.A.append(relu(self.Z[layer]))
        self.output = self.A[-1]
        return self.output

    def costFunction(self):
        """Return half the squared error summed over samples, per output column.

        Uses ``self.output`` from the most recent ``forward`` call.
        """
        residual = np.asarray(self.trng_output) - self.output
        self.totalcost = 0.5 * np.sum(residual ** 2, axis=0)
        return self.totalcost

    def costFunctionPrime(self):
        """Run a forward pass and return the cost gradient for every weight matrix."""
        self.forward(self.data_input)
        n_layers = len(self.weights)
        self.delta = [[] for _ in range(n_layers)]
        self.DcostDw = [[] for _ in range(n_layers)]
        for layer in reversed(range(n_layers)):
            Zprime = reluprime(self.Z[layer])
            if layer == n_layers - 1:
                # Output layer: derivative of the squared error w.r.t. the output.
                error = self.output - np.asarray(self.trng_output)
            else:
                # Hidden layer: push the next layer's delta back through its weights.
                error = np.dot(self.delta[layer + 1], np.asarray(self.weights[layer + 1]).T)
            self.delta[layer] = np.multiply(error, Zprime)
            self.DcostDw[layer] = np.dot(self.A[layer].T, self.delta[layer])
        return self.DcostDw

    def backprop(self, LR):
        """Take one gradient-descent step of size ``LR`` on every weight matrix.

        Layers are updated one by one, so matrices of different shapes are
        fine (stacking them into a single array is not).
        """
        self.weights = [
            np.asarray(weight) - LR * np.asarray(grad)
            for weight, grad in zip(self.weights, self.DcostDw)
        ]

    def training(self, iteration, LR):
        """Run ``iteration`` descent steps, printing the cost every 1000 steps.

        After the loop the mean of the per-output costs is printed.
        """
        for i in range(iteration):
            self.costFunctionPrime()
            self.backprop(LR)
            # ``(i/1000.0) == (i/1000)`` is always true in Python 3, which
            # printed on every step; ``%`` gives the intended cadence.
            if i % 1000 == 0:
                print(self.costFunction())
        cost = self.costFunction()
        print(sum(cost) / len(cost))
# Constructing the network also trains it: __init__ calls training().
NN = Neural_Net(trng_input, trng_output)
as asked, this is the expected result (result I got using the sigmoid activation function):
as you can see, the numbers are going down and thus the network is training.
this is the result using the relu activation function:
Here, the network is stuck and isn't getting trained. It never gets trained when I use the ReLU activation function, and I would like to understand why.

If your cost doesn't decrease with ReLU activation, it seems like your network is stuck in the region where the input of ReLU is negative, so its output is a constant zero and no gradient flows back - the neuron is dead.
You can tackle this problem by using leaky ReLu instead of simple ReLu. You should also start training biases. With ReLu, it is recommended to initialize biases with small positive values, to avoid this dead neuron problem.
For some problems, it would also help to decrease learning rate and make the network deeper. Maybe, you would like to make learning rate adjustable, e.g. if the cost does not decrease, multiply LR by 0.5.
With leaky ReLu, trainable biases, and some refactoring, your model could look like this:
import numpy as np
# 1000 samples with 7 features each, drawn uniformly at random.
trng_input = np.random.uniform(size=(1000, 7))
# Two targets per sample: the sum of the sines and the sum of the cosines of its features.
trng_output = np.column_stack([np.sin(trng_input).sum(axis=1), np.cos(trng_input).sum(axis=1)])
# Slope that the leaky ReLU below applies to negative inputs.
LEAK = 0.0001
def relu(x, leak=None):
    """Apply a leaky ReLU element-wise.

    Positive entries pass through unchanged; negative entries are scaled by
    ``leak``.

    Args:
        x: number or NumPy array.
        leak: slope for negative inputs; defaults to the module-level ``LEAK``.
    """
    if leak is None:
        leak = LEAK
    return x * (x > 0) + leak * x * (x < 0)
def reluprime(x, leak=None):
    """Return the leaky-ReLU derivative of ``x``.

    The result is 1 where ``x > 0``, ``leak`` where ``x < 0`` and 0 where
    ``x == 0``.

    Args:
        x: NumPy array (its ``dtype`` is used for the masks).
        leak: slope for negative inputs; defaults to the module-level ``LEAK``.
    """
    if leak is None:
        leak = LEAK
    return (x > 0).astype(x.dtype) + leak * (x < 0).astype(x.dtype)
class Neural_Net():
    """Fully connected network with trainable biases and full-batch descent.

    Layer widths come from ``self.nodes`` and every layer is passed through
    the module-level ``relu``.  Creating an instance initialises the
    parameters and immediately runs 2000 training iterations.
    """

    def __init__(self, data_input, data_output):
        """Store the data set, create the parameters and train.

        Args:
            data_input: training samples, one row per sample.
            data_output: training targets, one row per sample.
        """
        self.data_input = data_input
        # Use the argument: previously the module-level ``trng_output`` was
        # read here and ``data_output`` was silently ignored.
        self.trng_output = data_output
        self.nodes = np.array([7, 10, 2])
        self.LR = 0.00001
        self.weightinit()
        self.training(2000, self.LR)

    def weightinit(self):
        """Draw weights uniformly from [-1, 1) and biases from [0, 1).

        The biases are non-negative so that units start on the active side
        of the ReLU.
        """
        n_layers = len(self.nodes) - 1
        self.weights = [np.random.uniform(-1, 1, size=self.nodes[i:(i + 2)]) for i in range(n_layers)]
        self.biases = [np.random.uniform(0, 1, size=self.nodes[i + 1]) for i in range(n_layers)]

    def forward(self, data):
        """Propagate ``data`` through the network and return the last activation.

        Pre-activations are kept in ``self.Z`` and activations (starting with
        the input itself) in ``self.A`` for use by ``costFunctionPrime``.
        """
        self.Z = []
        self.A = [np.array(data)]
        for layer, (weight, bias) in enumerate(zip(self.weights, self.biases)):
            self.Z.append(np.dot(self.A[layer], weight) + bias)
            self.A.append(relu(self.Z[layer]))
        self.output = self.A[-1]
        return self.output

    def costFunction(self):
        """Return half the squared error summed over samples, per output column.

        Uses ``self.output`` from the most recent ``forward`` call.
        """
        self.totalcost = 0.5 * np.sum((self.trng_output - self.output) ** 2, axis=0)
        return self.totalcost

    def costFunctionPrime(self):
        """Run a forward pass and fill ``self.DcostDw`` / ``self.DcostDb``."""
        self.forward(self.data_input)
        n_layers = len(self.weights)
        self.delta = [[] for _ in range(n_layers)]
        self.DcostDw = [[] for _ in range(n_layers)]
        self.DcostDb = [[] for _ in range(n_layers)]
        for layer in reversed(range(n_layers)):
            Zprime = reluprime(self.Z[layer])
            if layer == n_layers - 1:
                # Output layer: derivative of the squared error w.r.t. the output.
                error = -(self.trng_output - self.output)
            else:
                # Hidden layer: push the next layer's delta back through its weights.
                error = np.dot(self.delta[layer + 1], self.weights[layer + 1].T)
            self.delta[layer] = np.multiply(error, Zprime)
            self.DcostDw[layer] = np.dot(self.A[layer].T, self.delta[layer])
            # The bias gradient is the delta summed over all samples.
            self.DcostDb[layer] = np.sum(self.delta[layer], axis=0)

    def backprop(self, LR):
        """Take one in-place gradient-descent step of size ``LR``."""
        for layer in range(len(self.weights)):
            self.weights[layer] -= self.DcostDw[layer] * LR
            self.biases[layer] -= self.DcostDb[layer] * LR

    def training(self, iteration, LR):
        """Run ``iteration`` descent steps, printing the cost every 100 steps.

        After the loop the mean of the per-output costs is printed.
        """
        for i in range(iteration):
            self.costFunctionPrime()
            self.backprop(LR)
            # ``(i/100.0) == (i/100)`` is always true in Python 3, which
            # printed on every step; ``%`` gives the intended cadence.
            if i % 100 == 0:
                print(self.costFunction())
        cost = self.costFunction()
        print(sum(cost) / len(cost))
# Constructing the network also trains it: __init__ calls training().
NN = Neural_Net(trng_input, trng_output)

I think the problem lies in your Cost Function.
def costFunction(self):
    """Return half the summed squared error between targets and the last output.

    The errors of all samples are added up (not averaged), and the result is
    also stored on ``self.totalcost``.
    """
    self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
    return self.totalcost
Specifically this line,
self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
You have calculated the cost by summing all the errors. Since you mentioned that you use a very large dataset, self.totalcost will turn out to be very large. In turn, the gradients calculated will also be very large.
Try using stochastic gradient descent or take the mean like so,
self.totalcost = 0.5 * np.mean((self.trng_output-self.output)**2)

Related

Handwritten digit recognition neural network not working

I have written a handwritten CNN neural network from scratch which takes a matrix [784x1] as an input from the mnist dataset and outputs a result of a [10x1] matrix with the index of the matrix representing the digit the neural network believes that the digit was.
I have a variable number of hidden layers, however my code is O(n^2) so having even just two hidden layers drastically slows down the program. The weights of each layer are stored as a matrix [num_outputs_of_layer x num_inputs_of_layer].
I am using the softmax function which I believe the derivative is: f'(x) = f(x)(1-f(x)) ?
MATHS:
I am amending the weights for the final hidden layer by multiplying the delta matrix (output - expected), by o x (1 - o) where o is the output of the final layer. I then matrix multiply this by the transposed inputs of this layer to get a change in weights. The change of bias is just the delta x [o x (1-o)]
For any other layer (including the weights of input layer), I take the delta and multiply it by the derivative of the final hidden layer which gives a [10 x 1] matrix
I then matrix multiply this by the weight matrix of final hidden layer [a x 10] which leaves a [a x 1] matrix, I then multiply this by the derivative of the output of the next layer which is also an [a x 1] matrix and then by the weights [b x 1] to give a [b x 1]. I do this recurrently until I get to the layer I am trying to change, I do not multiply this by the weight matrix but by the transposed input matrix.
This makes logical sense to me however it may well be wrong? Could someone please check my logic and whether I have coded this correctly as the neural network doesn't seem to be learning.
The issue is that currently it learns with one hidden layer but only to roughly 50%. I think this is due to the fact that one hidden layer is not complex enough?
As soon as I add more hidden layers it doesn't learn and averages about 10% correct (so random) eventually it also after a while tends to guess the same number for each photo ie constantly guesses 5. This improved a little with changing the starting weights, however it still does it sometimes and still doesn't learn.
If anyone could provide any suggestions of why it isn't working that would be much appreciated. I assume it is to do with my maths however I can't for the life of me see where Ive gone wrong!!!
Here is my code:
`
import numpy as np
from tensorflow.keras.datasets import mnist
class Run:
    """Stack of ``Layer`` objects plus bookkeeping for online training.

    Hidden-layer widths shrink by a factor of 8/9 per layer, but never below
    ``num_output``; the final layer always has ``num_output`` units.
    """

    def __init__(self, num_hidden_layers, num_input, num_output):
        """Build ``num_hidden_layers`` hidden layers and one output layer."""
        self.layers = []
        self.inputNumbers = []
        self.count = 0
        self.countTrue = 0
        self.num_layers = num_hidden_layers + 1
        self.learningRate = 0.1
        for i in range(num_hidden_layers + 1):
            self.inputNumbers.append(num_input)
            if i == num_hidden_layers:
                self.layers.append(Layer(num_input, num_output))
                continue
            num_out = max(int(num_input * 8/9), num_output)
            self.layers.append(Layer(num_input, num_out))
            # The next layer consumes exactly what this one produces (the
            # fallback used to be a hard-coded 10 instead of num_output).
            num_input = num_out

    def runNN(self, input):
        """Feed ``input`` through every layer; the result lands in ``self.NN_Output``."""
        for layer in self.layers:
            layer.calc_output(input)
            input = layer.fin_outputs
        self.NN_Output = input
        #print(max(self.NN_Output))

    def check_if_correct(self, expected):
        """Compare the predicted class with the expected one and print running accuracy.

        The predicted / expected class is the index of the largest entry of
        ``self.NN_Output`` / ``expected``.  The counters are reset once more
        than 1000 samples have been counted.
        """
        predicted = int(np.argmax(np.asarray(self.NN_Output, dtype=float).ravel()))
        target = int(np.argmax(np.asarray(expected, dtype=float).ravel()))
        print(predicted, target)
        if predicted == target:
            self.countTrue += 1
        self.count += 1
        print(self.countTrue/self.count)
        print('')
        if self.count > 1000:
            self.count = 0
            self.countTrue = 0

    def change_weights(self, expected):
        """Compute and apply one update for every layer, last layer first.

        ``i`` counts layers from the output backwards: ``i == 0`` is the
        output layer, ``i == 1`` the layer before it, and so on.
        """
        weight_changes = []
        bias_changes = []
        delta = self.NN_Output - expected
        for i in range(self.num_layers):
            # Push the output error back through the i layers that sit after
            # the layer being updated.
            total = delta
            for each in range(i):
                later = self.layers[self.num_layers - each - 1]
                weight_mat = np.transpose(later.getter()[0])
                total = np.matmul(weight_mat, np.multiply(total, later.deriv_softmax()))
            if i == 0:
                total = np.multiply(delta, self.layers[self.num_layers - 1].deriv_softmax())
            # NOTE(review): for i > 0 `total` is never multiplied by the
            # derivative of the layer being updated itself; standard backprop
            # includes that factor -- confirm whether the omission is intended.
            target_layer = self.layers[self.num_layers - i - 1]
            change = np.matmul(total, np.transpose(target_layer.inputs)) * self.learningRate
            weight_changes.append(change)
            bias_changes.append(total * self.learningRate)
        # Apply only after all changes are computed, so every gradient is
        # based on the same (old) weights.
        for i in range(self.num_layers):
            self.layers[self.num_layers - i - 1].amend(weight_changes[i], bias_changes[i])
class Layer:
    """One dense layer computing ``softmax(W @ x + b)`` for a column-vector input."""

    def __init__(self, num_inputs, num_outputs):
        """Create a ``(num_outputs, num_inputs)`` weight matrix and a zero bias column."""
        # NOTE(review): weights in [-10, 10) give very large pre-activations
        # for wide inputs, which saturates the softmax -- consider a much
        # smaller range.
        self.__weights = np.random.uniform(-10, 10, (num_outputs, num_inputs))
        self.__bias = np.matrix(np.zeros((num_outputs, 1)))

    def calc_output(self, inputs):
        """Store ``inputs``, compute the pre-activation and apply softmax.

        The activated result is available as ``self.fin_outputs``.
        """
        self.inputs = inputs
        self.__output_1 = np.matmul(self.__weights, inputs) + self.__bias
        self.softmax()

    def softmax(self):
        """Normalise the pre-activation into probabilities and return them.

        The maximum is subtracted before exponentiating; this leaves the
        result mathematically unchanged but keeps ``exp`` from overflowing
        to ``inf`` (and the output from turning into NaN).
        """
        logits = np.asarray(self.__output_1, dtype=float)
        exps = np.exp(logits - logits.max())
        self.fin_outputs = np.matrix(exps / exps.sum())
        return self.fin_outputs

    def deriv_softmax(self):
        """Return ``o * (1 - o)`` element-wise for the last output ``o``.

        NOTE(review): this is only the diagonal of the softmax Jacobian (it
        is the logistic-sigmoid derivative formula).
        """
        self.deriv = np.multiply(self.fin_outputs, 1 - self.fin_outputs)
        return self.deriv

    def amend(self, change_weights, change_bias):
        """Subtract the given changes from the weights and the bias in place."""
        self.__weights -= change_weights
        self.__bias -= change_bias

    def getter(self):
        """Return the ``(weights, bias)`` pair (references, not copies)."""
        return self.__weights, self.__bias
class GetInput:
    """Serves MNIST training samples as column vectors with one-hot labels."""

    def __init__(self):
        """Load MNIST via ``mnist.load_data()`` and keep only the training split."""
        (self.X_train, self.Y_train), _ = mnist.load_data()
        self.X_train = self.X_train.reshape(self.X_train.shape[0], 28, 28, 1)

    def get(self, i):
        """Return ``(input, expected)`` for training sample ``i``.

        ``input`` is the image flattened row by row into a column matrix with
        the pixel values divided by 255.  ``expected`` is a 10x1 one-hot
        column for the label (all zeros if the label is not in 0..9).
        """
        pixels = self.X_train[i].astype('float32') / 255
        column = np.matrix(pixels.reshape(-1, 1).astype(float))
        label = self.Y_train[i]
        one_hot = (np.arange(10) == label).astype(int).reshape(-1, 1)
        return column, np.matrix(one_hot)
# Two hidden layers, 784 input pixels, 10 output classes.
Neural = Run(2, 784, 10)
getinputs = GetInput()
count = 0
# Online training: predict, report accuracy, then update on every sample.
# The loop never terminates on its own.
while True:
    # Wrap around at the end of the training set instead of indexing past it.
    sample, expected = getinputs.get(count % len(getinputs.X_train))
    Neural.runNN(sample)
    Neural.check_if_correct(expected)
    Neural.change_weights(expected)
    count += 1
`
Many thanks
Daniel
I have tried changing:
number of hidden layers
starting weights
learning rate

Neural Network From Scratch - Forward propagation error

I wanna implement the backward propagation concept in python with the next code
class MLP(object):
    """Multilayer perceptron with sigmoid activations (work in progress)."""

    def __init__(self, num_inputs=3, hidden_layers=None, num_outputs=2):
        """Create random weights, biases and zeroed activation buffers.

        Args:
            num_inputs: width of the input layer.
            hidden_layers: list with the width of each hidden layer;
                defaults to ``[3, 3]``.
            num_outputs: width of the output layer.
        """
        # A fresh list per instance avoids sharing one mutable default.
        if hidden_layers is None:
            hidden_layers = [3, 3]
        self.num_inputs = num_inputs
        self.hidden_layers = hidden_layers
        self.num_outputs = num_outputs
        layers = [num_inputs] + hidden_layers + [num_outputs]
        # One (fan_in, fan_out) weight matrix and one (1, fan_out) bias row
        # per pair of consecutive layers.
        self.weights = [
            np.random.rand(n_in, n_out)
            for n_in, n_out in zip(layers[:-1], layers[1:])
        ]
        self.bias = [np.random.randn(n_out).reshape(1, n_out) for n_out in layers[1:]]
        self.activations = [np.zeros(width) for width in layers]

    def forward_propagate(self, inputs):
        """Forward pass as posted in the question.

        NOTE(review): this is the version that raises the ``ValueError``
        discussed in the text.  ``activations`` is never updated inside the
        loop, so every layer is fed the raw input (and the raw input is what
        gets returned), and the inner loop applies every layer's bias to
        every layer.
        """
        activations = inputs
        self.activations[0] = activations
        for i, w in enumerate(self.weights):
            for j, b in enumerate(self.bias):
                net_inputs = self._sigmoid((np.dot(activations, w) + b))
            self.activations[i + 1] = net_inputs
        return activations

    def train(self, inputs, targets, epochs, learning_rate):
        """Run the forward pass over every sample for ``epochs`` epochs.

        No error is computed and no weights are updated yet; ``targets`` and
        ``learning_rate`` are not used for anything so far.
        """
        for i in range(epochs):
            sum_errors = 0
            for j, sample in enumerate(inputs):
                target = targets[j]
                output = self.forward_propagate(sample)

    def _sigmoid(self, x):
        """Return the logistic sigmoid ``1 / (1 + exp(-x))``."""
        y = 1.0 / (1 + np.exp(-x))
        return y
So I created the next dummy data in order to verify everything is correct
# Dummy data: 1000 pairs of values in [0, 0.5); each target is the sum of its pair.
# NOTE(review): `random()` is called unqualified, so this needs `from random import random`.
items = np.array([[random()/2 for _ in range(2)] for _ in range(1000)])
targets = np.array([[i[0] + i[1]] for i in items])
# Network with 2 inputs, one hidden layer of 5 units and 1 output, run for 2 epochs.
mlp = MLP(2, [5], 1)
mlp.train(items, targets, 2, 0.1)
but when I run the code I have the next error
ValueError: shapes (2,) and (5,1) not aligned: 2 (dim 0) != 5 (dim 0)
I understand the error, but how to solve it?
a couple of major problems with forward_propagate:
change net_inputs to activations - otherwise you always compute and return the activations from the first layer
remove for j, b in enumerate(self.bias): - biases from other layers have no business here
use matmul instead of dot
so, something like
def forward_propagate(self, inputs):
    """Feed ``inputs`` through every layer and return the final activation.

    Each layer uses its own weight matrix and its own bias, and its output
    becomes the input of the next layer.  Every intermediate activation is
    stored in ``self.activations``.
    """
    activations = inputs
    self.activations[0] = activations
    for i, w in enumerate(self.weights):
        activations = self._sigmoid((np.matmul(activations, w)+self.bias[i]))
        self.activations[i + 1] = activations
    return activations
Also, note that this method receives a 1-D array, which becomes a 2-D array after the first layer (adding the bias of shape (1, n) broadcasts the result to shape (1, n)). These 2-D arrays are stored in self.activations, and a 2-D array is returned from the method.
This might or might not be what you want.

Listwrapper not allowing multiplication of learning rate and thus no update of weight for Nueral Network

I am new to TensorFlow and neural networks. I am trying to create a NN to estimate y = x^2.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# 50 evenly spaced training inputs in [-10, 10], reshaped into a (50, 1) column.
x_train = tf.constant(value = np.linspace(-10,10,50),dtype='float32')
x_train = tf.reshape(x_train,shape=[50,1])
# Targets: the element-wise square of the inputs (y = x^2).
y_train = x_train**2
# Layer widths: 1 input, hidden layers of 3 and 4 units, 1 output.
layers = [1,3,4,1]
I created a nueral network class to obtain my weights and biases and run forward propagation.
class NN(tf.Module):
    """Stack of affine layers ``Z @ W + b`` with no activation in between."""

    def __init__(self,layers,name=None):
        """Create the weights and biases for the layer widths in ``layers``."""
        super().__init__(name=name)
        self.layers = layers
        self.weights, self.biases = self.initialze(layers)

    def initialze(self,layers) :
        """Return ``(weights, biases)`` for consecutive pairs of ``layers``.

        Weights are drawn from a truncated normal with standard deviation
        ``sqrt(2 / (in_dim + out_dim))``; biases start at zero with shape
        ``(1, out_dim)``.
        """
        weights = []
        biases = []
        for in_dim, out_dim in zip(layers[:-1], layers[1:]):
            stddev = np.sqrt(2/(in_dim + out_dim))
            b = tf.Variable(tf.zeros([1,out_dim], dtype='float32'), dtype='float32')
            W = tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=stddev), dtype='float32')
            weights.append(W)
            biases.append(b)
        return weights, biases

    def __call__(self,x):
        """Apply every affine layer in turn to ``x`` and return the result.

        NOTE(review): there is no non-linearity between layers, so the whole
        network is a single affine map of ``x``.
        """
        Z = x
        for W, b in zip(self.weights, self.biases):
            Z = tf.math.add(tf.linalg.matmul(Z, W), b)
        return Z
# Instantiate the network; weights and biases are created in NN.__init__.
My_NN = NN(layers)
Next I created a class updat to do backward propogation
class updat:
    """Holds the training data and drives gradient-descent updates of a network."""

    def __init__(self,y_train,x_train):
        """Store targets and inputs and set the learning rate to 0.1."""
        self.y_train = y_train
        self.x_train = x_train
        self.l_r = 0.1

    def get_grad(self,My_NN):
        """Compute the mean-squared-error loss and its gradients, then update.

        Returns the loss computed before the update.
        """
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(My_NN.weights)
            tape.watch(My_NN.biases)
            loss = tf.reduce_mean(tf.square(self.y_train-My_NN(self.x_train)))
        dw,db = tape.gradient(loss, [My_NN.weights,My_NN.biases])
        print(dw,'weight')
        print(db,'biases')
        # NOTE(review): these two lines are the ones that raise the TypeError
        # quoted in the question -- `dw`/`db` and `My_NN.weights`/`.biases`
        # are lists, and a list cannot be multiplied by a float.  Each
        # variable has to be updated individually.
        My_NN.weights -= (self.l_r * dw)
        My_NN.biases -=(self.l_r * db)
        del tape
        return loss

    def report(self, loss):
        """Format weights, biases and loss for printing.

        NOTE(review): reads the module-level ``My_NN`` and calls ``.numpy()``
        on its list attributes, which a list does not provide -- needs
        rework before it can run.
        """
        return f"W = {My_NN.weights.numpy():1.2f}, b = {My_NN.biases.numpy():1.2f}, loss={loss:2.5f}"

    def prop(self,epochs,My_NN):
        """Run one update per entry of ``epochs`` and print a report each time."""
        for epoch in epochs:
            loss = self.get_grad(My_NN)
            current_loss = loss
            print(f"Epoch {epoch:2d}:")
            # `report` is a method taking only the loss; the bare two-argument
            # call here referred to a function that does not exist.
            print(" ", self.report(current_loss))
But when I run the code
# Build the trainer and run 10 update steps on My_NN.
model = updat(y_train,x_train)
epochs = range(10)
model.prop(epochs,My_NN)
I get an error saying
My_NN.weights -= (self.l_r * dw)
My_NN.biases -=(self.l_r * db)
TypeError: can't multiply sequence by non-int of type 'float'
I tried substituting My_NN.weights -= (lr*dw)
with My_NN.weights.assign_sub(lr*dw)
still it shows that
'ListWrapper' object has no attribute 'assign_sub'
Is there any solution for this?
TURN
My_NN.weights -= (self.l_r * dw)
My_NN.biases -=(self.l_r * db)
TO
# Update every variable in place, pairing it with its own gradient.
for weight,d_weight in zip(My_NN.weights,dw):
    weight.assign_sub(self.l_r * d_weight)
for bias,d_bias in zip(My_NN.biases,db):
    bias.assign_sub(self.l_r * d_bias)
can solve the problem.
This works because My_NN.weights is a list of references to tf.Variable objects and dw is the corresponding list of gradient tensors. A Python list cannot be multiplied by a float or updated as a whole, so we have to iterate over the list and update each variable individually. Additionally, to update a tf.Variable we should use its assign-family methods (assign, assign_sub, etc.); this is like modifying the content that a pointer variable refers to in C.
More conveniently, we usually use the apply_gradients() method of a tf.keras.optimizers optimizer, or even minimize(), to update variables directly.
For this specific task and your more process oriented coding approach, here I give out some suggestions for stable training:
add activations to constrain the fitting ability of this model:
def __call__(self,x):
    """Apply ReLU after every hidden layer and leave the output layer linear."""
    Z = x
    num_layers = len(self.layers)
    for i in range(num_layers-2):
        y = tf.math.add(tf.linalg.matmul(Z ,self.weights[i]),self.biases[i])
        Z = tf.nn.relu(y)
    # Index the last layer directly: relying on the loop variable fails with
    # a NameError when there are no hidden layers and the loop never runs.
    return tf.math.add(tf.linalg.matmul(Z ,self.weights[-1]),self.biases[-1])
make lower learning_rate:
self.l_r = 0.001 # self.l_r = 0.1
do more epochs:
epochs = range(1000) # epochs = range(10)
Since the initial values of the trainable weights also influence training stability, you may need to re-train several times. In my tests, the above modifications work.

How to Implement Vectorized Backprop in Numpy

I'm working on a school project and am stuck on how to implement backpropagation in Numpy with the current forward prop structure I have. The aim of this script is to make a simple dynamic (meaning any number of layers and nodes) fully connected network using only numpy.
I think that I have to find the derivatives of the activation functions and multipliy it by the original error as well as the derivative of each activation function I encounter moving backward.
However, I'm having trouble figuring out how to implement this correctly in my script.
It'd be a great help if someone could explain in English what exactly I have to do given the complexities of the setup here, or even give a recommendation for a video/post that deals w dynamic size backprop.
Right now all the weights and biases are being stored in lists for future backprop, and I'm able to get the error for each output with the small amount of code currently in the backprop function.
This code block
#initialize a test model w/ batch size 128 and lr of 0.01
model = Model(128, 0.01)
#simple x data input
X = np.array([[1,1],[0,0],[12,5]])
Y = np.array([[1],[0],[-1]])
#adding 2 layers: a 3-unit sigmoid layer and a 1-unit sigmoid output layer
z = model.add(X, 3, "sigmoid")
z = model.add(z, 1, "sigmoid", output=True)
#this is a full forward pass through the layers
z = model.predict(X)
print(z)
#this is the squared error of the predictions
print(model.backprop(z, Y))
Outputs the following vectors:
[[0.50006457]
[0.50006459]
[0.50006431]]
[[0.24993544]
[0.2500646 ]
[2.25019293]]
Like I said, not sure how to move forward ( or backward ;) ) from here.
Below is the full script needed to run the example:
import math
import numpy as np
#everything below is defining activation functions
#--------------------------------------------------------------------------------------------
def b_relu(input):
    """Return ``input`` if it is positive, otherwise zero (scalar ReLU).

    The function receives one scalar at a time from ``np.vectorize``, so the
    inner ``max(input)`` of the old version raised a TypeError.  Zero is
    written as ``0.0`` so that ``np.vectorize`` does not infer an integer
    output type when the first element is negative.
    """
    return max(0.0, input)
def bd_relu(input):
    """Return the scalar ReLU derivative: 1 for positive ``input``, else 0."""
    return 1 if input > 0 else 0
def b_sigmoid(x):
    """Return the logistic sigmoid of the scalar ``x``.

    The two branches are algebraically identical; each one only ever
    exponentiates a non-positive number, so ``math.exp`` cannot overflow.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    e = math.exp(x)
    return e / (1 + e)
def bd_sigmoid(input):
    """Return the sigmoid derivative ``s * (1 - s)`` at the scalar ``input``.

    The sigmoid is evaluated once, in an overflow-safe form, rather than by
    calling the vectorised wrapper twice on a single scalar.
    """
    if input >= 0:
        s = 1 / (1 + math.exp(-input))
    else:
        e = math.exp(input)
        s = e / (1 + e)
    return s * (1 - s)
def b_tanh(input):
    """Return the hyperbolic tangent of the scalar ``input``.

    ``math.tanh`` stays finite for large magnitudes, where computing
    ``exp(input)`` by hand raises OverflowError.
    """
    return math.tanh(input)
#helper functions for tanh
def cosh(input):
    """Return the hyperbolic cosine of the scalar ``input``."""
    return math.cosh(input)
def sinh(input):
    """Return the hyperbolic sine of the scalar ``input``."""
    return math.sinh(input)
def bd_tanh(input):
    """Return the derivative of tanh at the scalar ``input``.

    d/dx tanh(x) = 1 / cosh(x)**2 = 1 - tanh(x)**2.  The earlier version
    divided by ``input**2`` instead of ``cosh(input)**2``, which is wrong
    everywhere and divides by zero at 0.
    """
    return 1.0 - math.tanh(input) ** 2
def b_softmax(z):
    """Return the row-wise softmax of the 2-D array ``z``."""
    # subtracting each row's max adds numerical stability
    shiftx = z - np.max(z, axis=1, keepdims=True)
    exps = np.exp(shiftx)
    return exps / np.sum(exps, axis=1, keepdims=True)
def bd_softmax(Y_hat, Y):
    """Return ``Y_hat - Y``, the difference between predictions and labels."""
    return Y_hat - Y
def b_linear(input):
    """Identity activation: return ``input`` unchanged."""
    return input
def bd_linear(input):
    """Derivative of the identity activation: always 1."""
    return 1
#vectorizing the scalar activation and deriv. activation functions
relu = np.vectorize(b_relu)
d_relu = np.vectorize(bd_relu)
sigmoid = np.vectorize(b_sigmoid)
d_sigmoid = np.vectorize(bd_sigmoid)
tanh = np.vectorize(b_tanh)
d_tanh = np.vectorize(bd_tanh)
linear = np.vectorize(b_linear)
d_linear = np.vectorize(bd_linear)
#softmax normalises across each row of a 2-D array, so it must see the whole
#array at once: np.vectorize would hand it one scalar at a time and the
#axis=1 reductions would fail. bd_softmax is plain array arithmetic already.
softmax = b_softmax
d_softmax = bd_softmax
class Model:
    """Dynamically sized fully connected network built layer by layer with ``add``."""

    def __init__(self, batch, lr):
        """Store the batch size and learning rate and create empty bookkeeping lists."""
        #initializing self lists to keep track of stuff for batches, forward prop & backprop
        self.batch = batch
        self.lr = lr
        self.W = []
        self.B = []
        self.A = []
        self.Z = []
        self.X = []
        self.layers = []
        self.tempW = []
        self.tempB = []
        #store error for backprop
        self.output_error = []

    #initialize the weights during 'model.add' so we can test our network shapes dynamically w/out model.compile
    #added an output bool here so we can make sure the shape of the output network is (1,n)
    def initial_weights(self, input_data, output_shape, output=False):
        """Append a zero bias row and a small random weight matrix for a new layer.

        The weight matrix has shape ``(last dimension of input_data,
        output_shape)``.  ``output`` is accepted but not used yet.
        """
        B = np.zeros((1, output_shape))
        #assigning the shape
        fan_in = np.asarray(input_data).shape[-1]
        W = np.random.uniform(-1e-3, 1e-3, size=(fan_in, output_shape))
        self.B.append(B)
        self.W.append(W)

    def add(self, input_data, output_shape, activation, output=False):
        """Register a layer, run ``input_data`` through it and return the result.

        Args:
            input_data: array-like fed into the new layer.
            output_shape: number of units in the new layer.
            activation: name of the activation, stored per layer.
            output: flag marking the output layer (forwarded to ``initial_weights``).
        """
        #append to layers so we have a correct index value
        self.layers.append(69)
        #making sure our data is a numpy array (np.asarray leaves arrays untouched)
        X = np.asarray(input_data)
        #adding data and activations to self lists
        self.X.append(X)
        self.A.append(activation)
        #keep track of our index & initializing random weights for dynamic compatibility testing
        index = len(self.layers)-1
        self.initial_weights(X, output_shape, output=output)
        X2 = self.forward(X, index)
        #printing layer info
        print("Layer:", index)
        print("Input Shape: ", X.shape)
        print("Weight Shape: ", self.W[index].shape)
        print("Output Shape: ", X2.shape)
        print(" ")
        return(X2)

    def forward(self, input_data, index):
        """Return the activated output of layer ``index`` for ``input_data``."""
        #pulling weights and biases from main lists for operations
        B = self.B[index]
        W = self.W[index]
        #matmul of data @ weights + bias (the matmul already sums each row's inputs per node)
        Z = np.matmul(input_data, W) + B
        #pulling activation from index
        act = str(self.A[index])
        #activating
        # NOTE(review): `activate` is not defined in the visible part of the
        # script; it presumably dispatches on the activation name -- confirm.
        Z = activate(Z, act)
        #keeping track of Z i guess
        # NOTE(review): this sets an attribute called `Zappend`; presumably
        # `self.Z.append(Z)` was intended -- confirm before relying on self.Z.
        self.Zappend = Z
        return(Z)

    def predict(self, input_data):
        """Run ``input_data`` through every registered layer and return the result.

        With no layers registered the input is returned unchanged.
        """
        z = input_data
        for index in range(len(self.layers)):
            # Use this instance, not the module-level `model` variable.
            z = self.forward(z, index)
        return z

    def backprop(self, model_output, ground_truth):
        """Store and return the element-wise squared error of the prediction.

        No gradients are computed and no weights are updated yet.
        """
        #------------------------------
        #now begins the backprop portion
        #let's start with finding the error between predictions and actual values
        #gonna do MSE to keep it simple
        self.output_error = (ground_truth - model_output) ** 2
        #so now we have the error of the output layer, this tells us two things, how wrong we were, and in which direction we should update
        #the outputs of these nodes
        # What to do if this was linear regression (for m & b)
        # 1. Take the error and multiply it by the transpose of the last layer weights
        # (I think the error in this case is where the prime activation function should be if we had activations)
        # 2. The last layer bias is just the error
        # 3. The second to last layer inputs is the bias times the transpose of second layers weights
        # 3. Then I have no idea
        return self.output_error

Softmax neural net works with error in implementation, does not work with correct implementation

I have been trying to fix this problem for several days, with no luck. I have been implementing a simple neural net with a single hidden layer from scratch, just for my own understanding. I have successfully implemented it with sigmoid, tanh and relu activations for binary classifications, and am now attempting to use softmax at the output for multi-class classifications.
In every tutorial I have come across for a softmax implementation, including my lecturer's notes, the derivative of the softmax cross entropy error at the output layer is simplified down to just predictions - labels, thus essentially subtracting 1 from the predicted value at the position of the true label.
However, I found that if this was used, then the error of my network would continuously increase until it converged to always predicting one random class with 100% and the other with 0%. Interestingly, if I change this to labels - predictions, it works perfectly on my simple test of learning the binary XOR function below. Unfortunately, if I then attempt to apply the same network to a more complex problem (hand-written letters - 26 classes), it again converges to outputting one class with 100% probability very quickly when either labels - predictions or predictions - labels is used.
I have no idea why this incorrect line of code works for the simple binary classification, but not for a classification with many classes. I assume that I have something else backwards in my code, and this incorrect change is essentially reversing this other error, but I cannot find where this may be.
import numpy as np
class MLP:
    """Single-hidden-layer network: sigmoid hidden units, softmax output, no biases."""

    def __init__(self, numInputs, numHidden, numOutputs):
        """Create uniform-random weights in [0, 1) and zeroed gradient buffers."""
        # MLP architecture sizes
        self.numInputs = numInputs
        self.numHidden = numHidden
        self.numOutputs = numOutputs
        # MLP weights
        self.IH_weights = np.random.rand(numInputs, numHidden) # Input -> Hidden
        self.HO_weights = np.random.rand(numHidden, numOutputs) # Hidden -> Output
        # Gradients corresponding to weight matrices computed during backprop
        self.IH_w_gradients = np.zeros_like(self.IH_weights)
        self.HO_w_gradients = np.zeros_like(self.HO_weights)

    def sigmoid(self, x):
        """Return the logistic sigmoid of ``x`` element-wise."""
        return 1 / (1 + np.exp(-x))

    def sigmoidDerivative(self, x):
        """Return ``x * (1 - x)``; ``x`` must already be a sigmoid *output*."""
        return x * (1 - x)

    def softmax(self, x):
        """Return the softmax over all entries of ``x``."""
        # exps = np.exp(x)
        exps = np.exp(x - np.max(x)) # Allows for large values
        return exps / np.sum(exps)

    def forward(self, input):
        """Run one sample through the network and return the class probabilities.

        The input, hidden activation and output are kept on ``self.I``,
        ``self.H`` and ``self.O`` for ``backwards``.
        """
        self.I = np.array(input).reshape(1, self.numInputs) # (numInputs, ) -> (1, numInputs)
        self.H = self.sigmoid(self.I.dot(self.IH_weights))
        self.O = self.softmax(self.H.dot(self.HO_weights))
        self.O += 1e-10 # Allows for log(0)
        return self.O

    def backwards(self, label):
        """Accumulate the weight updates for the last forward pass.

        Returns the cross-entropy loss of that pass.
        """
        self.L = np.array(label).reshape(1, self.numOutputs) # (numOutputs, ) -> (1, numOutputs)
        self.O_error = - np.sum(self.L * np.log(self.O))
        # self.O_delta = self.O - self.L # CORRECT (not working)
        # NOTE(review): updateWeights() *adds* the accumulated values to the
        # weights.  L - O is the negative of the loss gradient, so adding it
        # is a descent step; O - L combined with `+=` would be gradient
        # ascent, which matches the diverging loss described in the text.
        self.O_delta = self.L - self.O # INCORRECT (working)
        self.H_error = self.O_delta.dot(self.HO_weights.T)
        self.H_delta = self.H_error * self.sigmoidDerivative(self.H)
        self.IH_w_gradients += self.I.T.dot(self.H_delta)
        self.HO_w_gradients += self.H.T.dot(self.O_delta)
        return self.O_error

    def updateWeights(self, learningRate):
        """Add the accumulated updates, scaled by ``learningRate``, then reset them."""
        self.IH_weights += learningRate * self.IH_w_gradients
        self.HO_weights += learningRate * self.HO_w_gradients
        self.IH_w_gradients = np.zeros_like(self.IH_weights)
        self.HO_w_gradients = np.zeros_like(self.HO_weights)
# XOR data set: each entry is [inputs, one-hot label].
data = [
    [[0, 0], [1, 0]],
    [[0, 1], [0, 1]],
    [[1, 0], [0, 1]],
    [[1, 1], [1, 0]]
]
mlp = MLP(2, 5, 2)
numEpochs = 10000
learningRate = 0.1
for epoch in range(numEpochs):
    epochLosses, epochAccuracies = [], []
    for features, label in data:
        prediction = mlp.forward(features)
        # print(prediction, "\n")
        loss = mlp.backwards(label)
        epochLosses.append(loss)
        epochAccuracies.append(np.argmax(prediction) == np.argmax(label))
    # NOTE(review): assumes one weight update per epoch, after the gradients
    # of all four samples have been accumulated -- confirm against the
    # original indentation.
    mlp.updateWeights(learningRate)
    if epoch % 1000 == 0 or epoch == numEpochs - 1:
        print("EPOCH:", epoch)
        print("LOSS: ", np.mean(epochLosses))
        print("ACC: ", np.mean(epochAccuracies) * 100, "%\n")

Categories

Resources