When I train my neural network on only one training sample my code works just fine, but when I train on any more it doesn't work at all. Does anyone have a clue as to why? I'm pretty sure something's wrong with the update_mini_batch function, but I have no idea what. By the way, this is my first neural network and I'm doing it from scratch, so I don't really know what I'm doing. I'm using the stochastic gradient descent learning algorithm and programming in Python. Thanks so much for helping me out.
import numpy as np
import random as Ran
class Neural_Network:
    def __init__(self, layersizes):
        weight_shapes = [(a, b) for a, b in zip(layersizes[1:], layersizes[:-1])]
        self.weights = [np.random.standard_normal(s)/s[1]**.5 for s in weight_shapes]
        self.biases = [np.zeros((s, 1)) for s in layersizes[1:]]
        self.layersizes = layersizes

    def feedforward(self, I):
        for w, b in zip(self.weights, self.biases):
            I = self.activation(np.matmul(w, I) + b)
        return I

    def backprop(self, input, output):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        Activation = input
        Activations = [input]
        Z_value = 0.0
        Z_values = []
        for b, w in zip(self.biases, self.weights):
            Z_value = np.matmul(w, Activation) + b
            Activation = self.activation(Z_value)
            Activations.append(Activation)
            Z_values.append(Z_value)
        Activation_derivative = self.activation_prime(Z_values[-1])
        Cost_output_delta = (Activations[-1] - output)
        delta = Cost_output_delta * Activation_derivative
        transpose_value = np.transpose(self.weights[-2])
        gradient_b[-1] = delta
        gradient_w[-1] = np.matmul(delta, np.transpose(Activations[-2]))
        for i in range(2, len(self.layersizes) - 1):
            Z_value = Z_values[-i]
            Activation_derivative = self.activation_prime(Z_value)
            transpose_value = np.transpose(self.weights[-i+1])
            delta = [
                (a * b) for a, b in zip(np.dot(transpose_value, delta), Activation_derivative)
            ]
            gradient_b[i] = delta
            gradient_w[i] = np.matmul(np.transpose(Activations[-i-1]), delta)
        return (gradient_b, gradient_w)

    def stochastic_gradient_descent(self, Training_data, Epochs, mini_batch_size, eta):
        for i in range(Epochs):
            Ran.shuffle(Training_data)
            mini_batches = [
                Training_data[k:k+mini_batch_size]
                for k in range(0, len(Training_data))
            ]
            for mini_batch in mini_batches:
                self.Update_mini_batch(mini_batch, eta)
            print("Epoch {0} complete".format(i))

    def Update_mini_batch(self, mini_batch, eta):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        for input, output in mini_batch:
            delta_gradient_pair = self.backprop(input, output)
            delta_gradient_b = delta_gradient_pair[0]
            delta_gradient_w = delta_gradient_pair[1]
            Bias_zip = zip(gradient_b, delta_gradient_b)
            Weight_zip = zip(gradient_w, delta_gradient_w)
            gradient_b = [g_b + d_b for g_b, d_b in Bias_zip]
            gradient_w = [g_w + d_w for g_w, d_w in Weight_zip]
        Bias_zip = zip(self.biases, gradient_b)
        Weight_zip = zip(self.weights, gradient_w)
        self.biases = [b - (eta / len(mini_batch) * g_b) for b, g_b in Bias_zip]
        self.weights = [w - (eta / len(mini_batch) * g_w) for w, g_w in Weight_zip]

    def activation(self, value):
        return 1 / (1 + np.exp(-value))

    def activation_prime(self, value):
        return np.exp(-value) / ((1 + np.exp(-value))**2)
#Test_Program:
with np.load('mnist.npz') as data:
    training_images = data['training_images']
    training_labels = data['training_labels']

data = [(a, b) for a, b in zip(training_images, training_labels)]
layersizes = (784, 32, 10)
nn = Neural_Network(layersizes)
nn.stochastic_gradient_descent(data, 30, 10, 3)
So I've found the problem, but I don't know how to fix it. Apparently my neural network can eventually get it right, but it takes a couple thousand epochs of training. This is because the gradient returned by backpropagation always has zeros in the first bias and weight layer. I believe this is an indexing error, but I'm really not sure, and I still don't know how to solve it. If you have any idea how I could edit my neural network to actually function, that would be great.
UPDATE:
So I finally figured it out (OMG, this feels so good). It turned out to be a whole mixture of problems: first of all, I was indexing the gradients from the front instead of the back, which also caused me to miss some of the layers, AND I was multiplying the activation transpose by the delta in the wrong order, which caused even more problems. Thank god I finally figured this out.
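For anyone hitting the same bugs, here is a minimal sketch of what the fixed inner loop of backprop looks like (a sketch based on my description above, reusing the variable names from the code; the loop now covers every hidden layer and indexes the gradient lists from the back):

for i in range(2, len(self.layersizes)):
    Z_value = Z_values[-i]
    Activation_derivative = self.activation_prime(Z_value)
    # index the weights and gradients from the back with -i
    delta = np.dot(np.transpose(self.weights[-i + 1]), delta) * Activation_derivative
    gradient_b[-i] = delta
    # delta times the activation transpose, not the other way around
    gradient_w[-i] = np.matmul(delta, np.transpose(Activations[-i - 1]))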
Related
I am new to TensorFlow and neural networks. I am trying to create a NN to estimate y = x^2.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
x_train = tf.constant(value = np.linspace(-10,10,50),dtype='float32')
x_train = tf.reshape(x_train,shape=[50,1])
y_train = x_train**2
layers = [1,3,4,1]
I created a neural network class to obtain my weights and biases and run forward propagation.
class NN(tf.Module):
    def __init__(self, layers, name=None):
        super().__init__(name=name)
        self.layers = layers
        self.weights, self.biases = self.initialze(layers)

    def initialze(self, layers):
        num_layers = len(layers)
        weights = []
        biases = []
        for i in range(num_layers - 1):
            in_dim = layers[i]
            out_dim = layers[i+1]
            stddev = np.sqrt(2 / (in_dim + out_dim))
            b = tf.Variable(tf.zeros([1, layers[i+1]], dtype='float32'), dtype='float32')
            W = tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=stddev), dtype='float32')
            weights.append(W)
            biases.append(b)
        return weights, biases

    def __call__(self, x):
        Z = x
        num_layers = len(self.layers)
        for i in range(num_layers - 1):
            Z = tf.math.add(tf.linalg.matmul(Z, self.weights[i]), self.biases[i])
        return Z
My_NN = NN(layers)
Next I created a class updat to do backward propagation.
class updat:
    def __init__(self, y_train, x_train):
        self.y_train = y_train
        self.x_train = x_train
        self.l_r = 0.1

    def get_grad(self, My_NN):
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(My_NN.weights)
            tape.watch(My_NN.biases)
            loss = tf.reduce_mean(tf.square(self.y_train - My_NN(self.x_train)))
        dw, db = tape.gradient(loss, [My_NN.weights, My_NN.biases])
        print(dw, 'weight')
        print(db, 'biases')
        My_NN.weights -= (self.l_r * dw)
        My_NN.biases -= (self.l_r * db)
        del tape
        return loss

    def report(self, loss):
        return f"W = {My_NN.weights.numpy():1.2f}, b = {My_NN.biases.numpy():1.2f}, loss={loss:2.5f}"

    def prop(self, epochs, My_NN):
        for epoch in epochs:
            loss = self.get_grad(My_NN)
            current_loss = loss
            print(f"Epoch {epoch:2d}:")
            print("    ", report(current_loss, My_NN))
But when I run the code
model = updat(y_train,x_train)
epochs = range(10)
model.prop(epochs,My_NN)
I get an error saying
My_NN.weights -= (self.l_r * dw)
My_NN.biases -=(self.l_r * db)
TypeError: can't multiply sequence by non-int of type 'float'
I tried substituting My_NN.weights -= (lr*dw)
with My_NN.weights.assign_sub(lr*dw)
but it still shows
'ListWrapper' object has no attribute 'assign_sub'
Is there any solution for this?
Changing
My_NN.weights -= (self.l_r * dw)
My_NN.biases -=(self.l_r * db)
to
for weight, d_weight in zip(My_NN.weights, dw):
    weight.assign_sub(self.l_r * d_weight)
for bias, d_bias in zip(My_NN.biases, db):
    bias.assign_sub(self.l_r * d_bias)
solves the problem.
This works because My_NN.weights is a list of tf.Variable references and dw is the corresponding list of gradient tensors. We cannot update the variables by reassigning the list itself; we have to iterate over it. Additionally, to update a tf.Variable we should use its assign methods (assign_sub, etc.); this is like modifying the content pointed to by a pointer variable in C.
More conveniently, we usually use a tf.keras.optimizers optimizer's apply_gradients(), or even minimize(), to update the variables directly.
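For example, a minimal sketch with tf.keras.optimizers.SGD (the optimizer choice and the 0.001 learning rate are just placeholders, not from the original code):

optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
# apply_gradients takes (gradient, variable) pairs and updates the variables in place
optimizer.apply_gradients(zip(dw, My_NN.weights))
optimizer.apply_gradients(zip(db, My_NN.biases))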
For this specific task and your more process-oriented coding approach, here are some suggestions for stable training:
add activation functions, since without them the stack of linear layers collapses into a single linear map and cannot fit y = x^2:
def __call__(self, x):
    Z = x
    num_layers = len(self.layers)
    for i in range(num_layers - 2):
        y = tf.math.add(tf.linalg.matmul(Z, self.weights[i]), self.biases[i])
        Z = tf.nn.relu(y)
    i += 1
    return tf.math.add(tf.linalg.matmul(Z, self.weights[i]), self.biases[i])
use a lower learning rate:
self.l_r = 0.001 # self.l_r = 0.1
train for more epochs:
epochs = range(1000) # epochs = range(10)
Since the initial values of the trainable weights also influence training stability, you may need to re-train several times. In my tests, the above modifications work.
I am trying to implement a deep learning model that I made while completing the "Neural Networks and Deep Learning" course on Coursera, using the MNIST dataset for written numbers. During the course, it worked very well identifying cats so I know that the whole model works together, and I have modified all of the input data and the output layer such that the output is an array of size 10 and the array shapes all match what they were during the course.
I have done a little experimentation and have come across a very weird problem. My graph of cost over time looks like this:
[cost plot not shown]
I would usually expect a more gently sloping curve that tends towards a value much closer to zero, and the very sharp turn is odd as well. I should also point out that the x axis is in tens of iterations, not hundreds.
The shape of my NN is [784, 200, 50, 10]. I'm assuming that this isn't the problem, but what I'm really looking for is for someone more experienced in ML to explain why this happens.
My model as it stands goes something like this
# Initialisation of parameters
parameters = initialize_parameters_deep(layers_dims)

for i in range(0, num_iterations):
    # Forward propagation: [LINEAR -> RELU]*(L-1) -> LINEAR -> SIGMOID.
    AL, caches = L_model_forward(train_data, parameters, layers_dims[-1])
    # Compute cost.
    cost = compute_cost(AL, train_labels)
    # Backward propagation.
    grads = L_model_backward(AL, train_labels, caches)
    # Update parameters.
    parameters = update_parameters(parameters, grads, learning_rate)
    # Print the cost every 100 iterations
    if print_cost and i % 100 == 0:
        print("Cost after iteration %i: %f" % (i, cost))
    if print_cost and i % 10 == 0:
        costs.append(cost)
My backward propagation model is as follows
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = 1 / m * np.dot(dZ, A_prev.T)
    db = 1 / m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    return dA_prev, dW, db

def linear_activation_backward(dA, cache, activation):
    linear_cache, activation_cache = cache
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

def L_model_backward(AL, Y, caches):
    grads = {}
    L = len(caches)  # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)  # after this line, Y is the same shape as AL
    # Initializing the backpropagation
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "dAL, current_cache". Outputs: "grads["dAL-1"], grads["dWL"], grads["dbL"]
    current_cache = caches[L - 1]
    grads["dA" + str(L - 1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    # Loop from l=L-2 to l=0
    for l in reversed(range(L - 1)):
        # lth layer: (RELU -> LINEAR) gradients.
        # Inputs: "grads["dA" + str(l + 1)], current_cache". Outputs: "grads["dA" + str(l)], grads["dW" + str(l + 1)], grads["db" + str(l + 1)]
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 1)], current_cache, activation="relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads
If you need any more code or anything from me specifically then I would be happy to provide it.
I made a fully connected neural network with NumPy, based on the Welch Labs videos, but when I try to train it I seem to get exploding gradients at launch, which is weird. I will put down the whole code, which is testable in Python 3+. Only costFunctionPrime seems to break the gradient descent, but I have no idea what is happening. Can someone smarter than me help?
EDIT: the trng_input and trng_output below are not the ones I actually use; I train on a big dataset.
import numpy as np
import random
trng_input = [[random.random() for _ in range(7)] for _ in range(100)]
trng_output = [[random.random() for _ in range(2)] for _ in range(100)]
def relu(x):
    return x * (x > 0)

def reluprime(x):
    return (x > 0).astype(x.dtype)

class Neural_Net():
    def __init__(self, data_input, data_output):
        self.data_input = data_input
        self.trng_output = trng_output
        self.bias = 0
        self.nodes = np.array([7, 2])
        self.LR = 0.01
        self.weightinit()
        self.training(1000, self.LR)

    def randomweight(self, n):
        output = []
        for i in range(n):
            output.append(random.uniform(-1, 1))
        return output

    def weightinit(self):
        self.weights = []
        for n in range(len(self.nodes) - 1):
            temp = []
            for _ in range(self.nodes[n] + self.bias):
                temp.append(self.randomweight(self.nodes[n+1]))
            self.weights.append(temp)
        self.weights = [np.array(tuple(self.weights[i])) for i in range(len(self.weights))]

    def forward(self, data):
        self.Z = []
        self.A = [np.array(data)]
        for layer in range(len(self.weights)):
            self.Z.append(np.dot(self.A[layer], self.weights[layer]))
            self.A.append(relu(self.Z[layer]))
        self.output = self.A[-1]
        return self.output

    def costFunction(self):
        self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
        return self.totalcost

    def costFunctionPrime(self):
        self.forward(self.data_input)
        self.delta = [[] for x in range(len(self.weights))]
        self.DcostDw = [[] for x in range(len(self.weights))]
        for layer in reversed(range(len(self.weights))):
            Zprime = reluprime(self.Z[layer])
            if layer == len(self.weights) - 1:
                self.delta[layer] = np.multiply(-(self.trng_output - self.output), Zprime)
            else:
                self.delta[layer] = np.dot(self.delta[layer+1], self.weights[layer+1].T) * Zprime
            self.DcostDw[layer] = np.dot(self.A[layer].T, self.delta[layer])
        return self.DcostDw

    def backprop(self, LR):
        self.DcostDw = (np.array(self.DcostDw) * LR).tolist()
        self.weights = (np.array(self.weights) - np.array(self.DcostDw)).tolist()

    def training(self, iteration, LR):
        for i in range(iteration):
            self.costFunctionPrime()
            self.backprop(LR)
            if i % 1000 == 0:  # print progress every 1000 iterations
                print(self.costFunction())
                print(sum(self.costFunction()) / len(self.costFunction()))

NN = Neural_Net(trng_input, trng_output)
As asked, this is the expected result (the result I got using the sigmoid activation function):
[output not shown: the cost values steadily decrease]
As you can see, the numbers are going down, and thus the network is training.
This is the result using the ReLU activation function:
[output not shown: the cost values stay constant]
Here the network is stuck and isn't getting trained; it never gets trained using the ReLU activation function, and I would like to understand why.
If your cost doesn't decrease with ReLU activation, it seems like your network is stuck in the region where the input of the ReLU is negative, so its output is a constant zero, and no gradient flows back: the neuron is dead.
You can tackle this problem by using leaky ReLU instead of plain ReLU. You should also start training biases. With ReLU, it is recommended to initialize biases with small positive values to avoid this dead-neuron problem.
For some problems, it would also help to decrease the learning rate and make the network deeper. You may also want to make the learning rate adjustable, e.g. if the cost does not decrease, multiply LR by 0.5 (see the sketch after the refactored model below).
With leaky ReLU, trainable biases, and some refactoring, your model could look like this:
import numpy as np

trng_input = np.random.uniform(size=(1000, 7))
trng_output = np.column_stack([np.sin(trng_input).sum(axis=1), np.cos(trng_input).sum(axis=1)])

LEAK = 0.0001

def relu(x):
    return x * (x > 0) + LEAK * x * (x < 0)

def reluprime(x):
    return (x > 0).astype(x.dtype) + LEAK * (x < 0).astype(x.dtype)

class Neural_Net():
    def __init__(self, data_input, data_output):
        self.data_input = data_input
        self.trng_output = trng_output
        self.nodes = np.array([7, 10, 2])
        self.LR = 0.00001
        self.weightinit()
        self.training(2000, self.LR)

    def weightinit(self):
        self.weights = [np.random.uniform(-1, 1, size=self.nodes[i:(i+2)]) for i in range(len(self.nodes) - 1)]
        self.biases = [np.random.uniform(0, 1, size=self.nodes[i+1]) for i in range(len(self.nodes) - 1)]

    def forward(self, data):
        self.Z = []
        self.A = [np.array(data)]
        for layer in range(len(self.weights)):
            self.Z.append(np.dot(self.A[layer], self.weights[layer]) + self.biases[layer])
            self.A.append(relu(self.Z[layer]))
        self.output = self.A[-1]
        return self.output

    def costFunction(self):
        self.totalcost = 0.5 * np.sum((self.trng_output - self.output)**2, axis=0)
        return self.totalcost

    def costFunctionPrime(self):
        self.forward(self.data_input)
        self.delta = [[] for x in range(len(self.weights))]
        self.DcostDw = [[] for x in range(len(self.weights))]
        self.DcostDb = [[] for x in range(len(self.weights))]
        for layer in reversed(range(len(self.weights))):
            Zprime = reluprime(self.Z[layer])
            if layer == len(self.weights) - 1:
                self.delta[layer] = np.multiply(-(self.trng_output - self.output), Zprime)
            else:
                self.delta[layer] = np.dot(self.delta[layer+1], self.weights[layer+1].T) * Zprime
            self.DcostDw[layer] = np.dot(self.A[layer].T, self.delta[layer])
            self.DcostDb[layer] = np.sum(self.delta[layer], axis=0)

    def backprop(self, LR):
        for layer in range(len(self.weights)):
            self.weights[layer] -= self.DcostDw[layer] * LR
            self.biases[layer] -= self.DcostDb[layer] * LR

    def training(self, iteration, LR):
        for i in range(iteration):
            self.costFunctionPrime()
            self.backprop(LR)
            if i % 100 == 0:  # print progress every 100 iterations
                print(self.costFunction())
                print(sum(self.costFunction()) / len(self.costFunction()))

NN = Neural_Net(trng_input, trng_output)
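If you also want the adjustable learning rate mentioned above, the training loop could be changed along these lines (a rough sketch of one possible heuristic, not something I tested):

def training(self, iteration, LR):
    # halve the learning rate whenever the summed cost stops decreasing
    prev_cost = float('inf')
    for i in range(iteration):
        self.costFunctionPrime()
        self.backprop(LR)
        cost = float(np.sum(self.costFunction()))
        if cost >= prev_cost:
            LR *= 0.5
        prev_cost = cost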
I think the problem lies in your Cost Function.
def costFunction(self):
    self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
    return self.totalcost
Specifically this line,
self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
You have calculated the cost by summing all the errors. Since you mentioned that you use a very large dataset, self.totalcost will turn out to be very large, and in turn the calculated gradients will also be very large.
Try using stochastic gradient descent or take the mean like so,
self.totalcost = 0.5 * np.mean((self.trng_output-self.output)**2)
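A toy check of the scale difference (my own illustration, not from the original post):

import numpy as np
err = np.random.randn(100000, 2)   # pretend residuals on a very large dataset
print(0.5 * np.sum(err ** 2))      # grows linearly with the number of samples
print(0.5 * np.mean(err ** 2))     # stays O(1) no matter how big the dataset is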
I am trying to build an ANN in Python, and I've been able to get as far as the forward pass, but I run into a problem when I try to do backward propagation. In my function nnCostFunction, the gradient grad is defined as:
grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
But this is a problem because I am using scipy.optimize.fmin_cg to calculate nn_params and cost, and fmin_cg accepts only a single return value (the J value for my forward pass); it cannot accept grad...
nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
Is there a way to fix this so I can include backward propagation in my network? I know there is a scipy.optimize.minimize function, but I am having some difficulty understanding how to use it and getting the results I need. Does anyone know what needs to be done?
Your help is greatly appreciated, thanks.
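For reference, scipy.optimize.minimize can consume a function that returns (cost, gradient) in a single call if you pass jac=True; here is a minimal sketch using the names from the code below (method='CG' is my stand-in for fmin_cg):

from scipy import optimize
res = optimize.minimize(
    lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size,
                             num_labels, X, y, lam),
    initial_nn_params,
    jac=True,  # the objective returns (J, grad)
    method='CG',
    options={'gtol': 0.001, 'maxiter': 40})
nn_params, cost = res.x, res.fun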
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
    '''
    Given NN parameters, layer sizes, number of labels, data, and learning rate, returns the cost of traversing NN.
    '''
    Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))],(hidden_layer_size,(input_layer_size+1))))
    Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):],(num_labels, (hidden_layer_size+1))))
    m = X.shape[0]
    n = X.shape[1]

    #forward pass
    y_eye = eye(num_labels)
    y_new = np.zeros((y.shape[0],num_labels))
    for z in range(y.shape[0]):
        y_new[z,:] = y_eye[int(y[z])-1]
    y = y_new

    a_1 = c_[ones((m,1)),X]
    z_2 = tr(Theta1.dot(tr(a_1)))
    a_2 = tr(sigmoid(Theta1.dot(tr(a_1))))
    a_2 = c_[ones((a_2.shape[0],1)), a_2]
    a_3 = tr(sigmoid(Theta2.dot(tr(a_2))))

    J_reg = lam/(2.*m) * (sum(sum(Theta1[:,1:]**2)) + sum(sum(Theta2[:,1:]**2)))
    J = (1./m) * sum(sum(-y*log(a_3) - (1-y)*log(1-a_3))) + J_reg

    #Backprop
    d_3 = a_3 - y
    d_2 = d_3.dot(Theta2[:,1:])*sigmoidGradient(z_2)
    Theta1_grad = 1./m * tr(d_2).dot(a_1)
    Theta2_grad = 1./m * tr(d_3).dot(a_2)

    #Add regularization
    Theta1_grad[:,1:] = Theta1_grad[:,1:] + lam*1.0/m*Theta1[:,1:]
    Theta2_grad[:,1:] = Theta2_grad[:,1:] + lam*1.0/m*Theta2[:,1:]

    #Unroll gradients
    grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
    return J, grad

def nn_train(X, y, lam=1.0, hidden_layer_size=10):
    '''
    Train neural network given the features and class arrays, learning rate, and size of the hidden layer.
    Return parameters Theta1, Theta2.
    '''
    # NN input and output layer sizes
    input_layer_size = X.shape[1]
    num_labels = unique(y).shape[0] #output layer

    # Initialize NN parameters
    initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)

    # Unroll parameters
    initial_nn_params = np.append(initial_Theta1.flatten(1), initial_Theta2.flatten(1))
    initial_nn_params = reshape(initial_nn_params,(len(initial_nn_params),)) #flatten into 1-d array

    # Find and print initial cost:
    J_init = nnCostFunction(initial_nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,lam)[0]
    grad_init = nnCostFunction(initial_nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,lam)[1]
    print 'Initial J cost: ' + str(J_init)
    print 'Initial grad cost: ' + str(grad_init)

    # Implement backprop and train network, run fmin
    print 'Training Neural Network...'
    print 'fmin results:'
    nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
    Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))],(hidden_layer_size,(input_layer_size+1))))
    Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):],(num_labels, (hidden_layer_size+1))))
    return Theta1, Theta2
I am trying to use a backpropagation neural network for multiclass classification. I have found this code and am trying to adapt it. It is based on the lectures of the Machine Learning course on Coursera by Andrew Ng.
I don't understand exactly the implementation of the scipy.optimize.minimize function here. It is used just once in the code. Is it iteratively updating the weights of the network? How can I visualize (plot) its performance to see when it converges?
Using this function, what parameters can I adjust to achieve better performance? I found here a list of common parameters:
Number of neurons in the hidden layer: this is hidden_layer_size=25 in my code
Learning rate: can I still adjust that using built-in minimization function?
Momentum: is that reg_lambda=0 in my case? Regularization parameter to avoid overfitting, right?
Epoch: maxiter=500
Here is my training data (target class is in the last column):
65535, 3670, 65535, 3885, -0.73, 1
65535, 3962, 65535, 3556, -0.72, 1
65535, 3573, 65535, 3529, -0.61, 1
3758, 3123, 4117, 3173, -0.21, 0
3906, 3119, 4288, 3135, -0.28, 0
3750, 3073, 4080, 3212, -0.26, 0
65535, 3458, 65535, 3330, -0.85, 2
65535, 3315, 65535, 3306, -0.87, 2
65535, 3950, 65535, 3613, -0.84, 2
65535, 32576, 65535, 19613, -0.35, 3
65535, 16657, 65535, 16618, -0.37, 3
65535, 16657, 65535, 16618, -0.32, 3
The dependencies are so obvious, I think it should be very easy to classify...
But the results are terrible. I get an accuracy of 0.6 to 0.8, which is absolutely inappropriate for my application. I know I would normally need more data, but I would already be happy if I could at least fit the training data (without taking potential overfitting into account).
Here is the code:
import numpy as np
from scipy import optimize
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
import math
class NN_1HL(object):
    def __init__(self, reg_lambda=0, epsilon_init=0.12, hidden_layer_size=25, opti_method='TNC', maxiter=500):
        self.reg_lambda = reg_lambda
        self.epsilon_init = epsilon_init
        self.hidden_layer_size = hidden_layer_size
        self.activation_func = self.sigmoid
        self.activation_func_prime = self.sigmoid_prime
        self.method = opti_method
        self.maxiter = maxiter

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def sigmoid_prime(self, z):
        sig = self.sigmoid(z)
        return sig * (1 - sig)

    def sumsqr(self, a):
        return np.sum(a ** 2)

    def rand_init(self, l_in, l_out):
        self.epsilon_init = (math.sqrt(6)) / (math.sqrt(l_in + l_out))
        return np.random.rand(l_out, l_in + 1) * 2 * self.epsilon_init - self.epsilon_init

    def pack_thetas(self, t1, t2):
        return np.concatenate((t1.reshape(-1), t2.reshape(-1)))

    def unpack_thetas(self, thetas, input_layer_size, hidden_layer_size, num_labels):
        t1_start = 0
        t1_end = hidden_layer_size * (input_layer_size + 1)
        t1 = thetas[t1_start:t1_end].reshape((hidden_layer_size, input_layer_size + 1))
        t2 = thetas[t1_end:].reshape((num_labels, hidden_layer_size + 1))
        return t1, t2

    def _forward(self, X, t1, t2):
        m = X.shape[0]
        ones = None
        if len(X.shape) == 1:
            ones = np.array(1).reshape(1,)
        else:
            ones = np.ones(m).reshape(m, 1)
        # Input layer
        a1 = np.hstack((ones, X))
        # Hidden Layer
        z2 = np.dot(t1, a1.T)
        a2 = self.activation_func(z2)
        a2 = np.hstack((ones, a2.T))
        # Output layer
        z3 = np.dot(t2, a2.T)
        a3 = self.activation_func(z3)
        return a1, z2, a2, z3, a3

    def function(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
        t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
        m = X.shape[0]
        Y = np.eye(num_labels)[y]
        _, _, _, _, h = self._forward(X, t1, t2)
        costPositive = -Y * np.log(h).T
        costNegative = (1 - Y) * np.log(1 - h).T
        cost = costPositive - costNegative
        J = np.sum(cost) / m
        if reg_lambda != 0:
            t1f = t1[:, 1:]
            t2f = t2[:, 1:]
            reg = (self.reg_lambda / (2 * m)) * (self.sumsqr(t1f) + self.sumsqr(t2f))
            J = J + reg
        return J

    def function_prime(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
        t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
        m = X.shape[0]
        t1f = t1[:, 1:]
        t2f = t2[:, 1:]
        Y = np.eye(num_labels)[y]
        Delta1, Delta2 = 0, 0
        for i, row in enumerate(X):
            a1, z2, a2, z3, a3 = self._forward(row, t1, t2)
            # Backprop
            d3 = a3 - Y[i, :].T
            d2 = np.dot(t2f.T, d3) * self.activation_func_prime(z2)
            Delta2 += np.dot(d3[np.newaxis].T, a2[np.newaxis])
            Delta1 += np.dot(d2[np.newaxis].T, a1[np.newaxis])
        Theta1_grad = (1 / m) * Delta1
        Theta2_grad = (1 / m) * Delta2
        if reg_lambda != 0:
            Theta1_grad[:, 1:] = Theta1_grad[:, 1:] + (reg_lambda / m) * t1f
            Theta2_grad[:, 1:] = Theta2_grad[:, 1:] + (reg_lambda / m) * t2f
        return self.pack_thetas(Theta1_grad, Theta2_grad)

    def fit(self, X, y):
        num_features = X.shape[0]
        input_layer_size = X.shape[1]
        num_labels = len(set(y))
        theta1_0 = self.rand_init(input_layer_size, self.hidden_layer_size)
        theta2_0 = self.rand_init(self.hidden_layer_size, num_labels)
        thetas0 = self.pack_thetas(theta1_0, theta2_0)
        options = {'maxiter': self.maxiter}
        _res = optimize.minimize(self.function, thetas0, jac=self.function_prime, method=self.method,
                                 args=(input_layer_size, self.hidden_layer_size, num_labels, X, y, 0), options=options)
        self.t1, self.t2 = self.unpack_thetas(_res.x, input_layer_size, self.hidden_layer_size, num_labels)
        np.savetxt("weights_t1.txt", self.t1, newline="\n")
        np.savetxt("weights_t2.txt", self.t2, newline="\n")

    def predict(self, X):
        return self.predict_proba(X).argmax(0)

    def predict_proba(self, X):
        _, _, _, _, h = self._forward(X, self.t1, self.t2)
        return h

##################
#    IR data     #
##################
values = np.loadtxt('infrared_data.txt', delimiter=', ', usecols=[0, 1, 2, 3, 4])
targets = np.loadtxt('infrared_data.txt', delimiter=', ', dtype=(int), usecols=[5])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(values, targets, test_size=0.4)
nn = NN_1HL()
nn.fit(values, targets)
print("Accuracy of classification: " + str(accuracy_score(y_test, nn.predict(X_test))))
In the given code, scipy.optimize.minimize iteratively minimizes the function given its derivative (the Jacobian). According to the documentation, you can specify a callback argument: a function that will be called after each iteration, which will let you measure performance, though I'm not sure whether it will let you halt the optimization process.
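For example, a minimal sketch of such a callback (the cost-recording list is my addition, and it assumes the layer sizes and data from the question's fit method are available in scope):

costs = []
def record_cost(thetas):
    # scipy calls this after each iteration with the current parameter vector
    costs.append(nn.function(thetas, input_layer_size, nn.hidden_layer_size,
                             num_labels, X, y, 0))
# pass callback=record_cost to optimize.minimize, then plot costs over iterations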
All parameters you listed are hyperparameters, and it's hard to optimize them directly:
Number of neurons in the hidden layer is a discrete-valued parameter and thus is not optimizable via gradient techniques. Moreover, it affects the network architecture, so you can't optimize it while training the net. What you can do, though, is use some higher-level routine to search over the possible options, like exhaustive grid search with cross-validation (for example, look at GridSearchCV) or other tools for hyperparameter search (hyperopt, spearmint, MOE, etc.).
Learning rate does not seem to be customizable for most of the optimization methods available. But, actually, gradient descent with learning rate eta is just Newton's method with the Hessian "approximated" by (1/eta) I, a diagonal matrix with the inverted learning rate on the major diagonal (in symbols, see the note after this list). So you can try Hessian-based methods with this heuristic.
Momentum is completely unrelated to regularization. It's an optimization technique, and since you use scipy for optimization, it is not available to you.
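In symbols, the learning-rate remark above (my rendering of the claim, not part of the original answer): Newton's method updates

$$x_{k+1} = x_k - H^{-1} \nabla f(x_k),$$

and substituting $H = \frac{1}{\eta} I$ gives back plain gradient descent, $x_{k+1} = x_k - \eta \nabla f(x_k)$.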