I have written a Dense Class for a FC layer in a CNN but to test if it works simply as a FC ANN, I tried to train a dataset over it but the loss never falls. I cannot seem to find the issue.
Here's the code:
class Dense:
# Constructor
def __init__(self, size, in_size, activation = 'relu'):
# Assign vars
self.size = size; self.activation = activation
# Initialize Weights and Biases
weights_dims = (size, in_size)
self.weights = np.random.standard_normal(weights_dims) * 0.01
self.biases = np.zeros([size, 1])
# Initialize Accumulators
self.sigma_acc = self.biases * 0
self.delta_acc = self.weights * 0
# ReLU Activation Function
def relu(self, arr):
return arr * (arr > 0)
# Softmax Activation Function
def softmax(self, arr):
arr -= arr.max()
exp = np.exp(arr)
return exp / np.sum(exp)
# Activation Manager Function
def activate(self, arr):
if self.activation == 'relu': return self.relu(arr)
if self.activation == 'softmax': return self.softmax(arr)
# Forward Propagation
def step(self, vec):
# Assign Input
self._in = vec
# Dot
z = np.dot(self.weights, vec) + self.biases
a = self.activate(z)
# Return
self.out = a
return self.out
# Back Propagation
def back(self, grad):
# Calculate sigma
sigma = grad if self.activation == 'softmax' else grad * (self.out > 0)
# Calculate delta
delta = np.dot(sigma, self._in.T)
# Accumulate
self.sigma_acc += sigma
self.delta_acc += delta
# Return global gradient
global_grad = np.dot(self.weights.T, sigma)
return global_grad
# Train
def update(self, alpha, batch_size):
dw = self.delta_acc / batch_size; self.delta_acc *= 0
db = self.sigma_acc / batch_size; self.sigma_acc *= 0
self.weights -= alpha * dw
self.biases -= alpha * db
To connect them as a model, I just add instances of this Dense class into a list and loop through them forwards and backwards using the step() and back() functions respectively.
Kindly inform me if you see any issue! Thanks.
This is how I created my network maybe could help you.
import numpy as np
X = np.array(([0, 0, 0], [0, 0, 1], [0, 1, 0],
[0, 1, 1], [1, 0, 0], [1, 0, 1],
[1, 1, 0], [1, 1, 1]), dtype=float)
y = np.array(([1], [0], [0], [0], [0], [0], [0], [1]), dtype=float)
xPredicted = np.array(([0, 0, 1]), dtype=float)
X = X/np.amax(X, axis=0)
xPredicted = xPredicted/np.amax(X, axis=0)
lossFile = open("Enter file", "w")
class Neural_Network(object):
def __init__(self, inputLayerSize, outputLayerSize, hiddenLayerSize):
self.inputLayerSize = inputLayerSize
self.outputLayerSize = outputLayerSize
self.hiddenLayerSize = hiddenLayerSize
self.W1 = \
np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
self.W2 = \
np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
def feedForward(self, X):
self.z = np.dot(X, self.W1)
self.z2 = self.activationSigmoid(self.z)
self.z3 = np.dot(self.z2, self.W2)
o = self.activationSigmoid(self.z3)
return o
def backwardPropagate(self, X, y, o):
self.o_error = y - o
self.o_delta = self.o_error*self.activationSigmoidPrime(o)
self.z2_error = self.o_delta.dot(self.W2.T)
self.z2_delta = self.z2_error*self.activationSigmoidPrime(self.z2)
self.W1 += X.T.dot(self.z2_delta)
self.W2 += self.z2.T.dot(self.o_delta)
def trainNetwork(self, X, y):
o = self.feedForward(X)
self.backwardPropagate(X, y, o)
def activationSigmoid(self, s):
return 1/(1+np.exp(-s))
def activationSigmoidPrime(self, s):
return s * (1 - s)
def saveSumSquaredLossList(self, i, error):
lossFile.write(str(i)+","+str(error.tolist())+"\n")
def saveWeights(self):
np.savetxt("Enter file", self.W1, fmt="%s")
np.savetxt("Enter file",
self.W2, fmt="%s")
def predictOutput(self):
print("Predicted XOR output data based on trained weights: ")
print("Expected (X1-X3); \n" + str(X))
print("Output (Y1): \n" + str(self.feedForward(xPredicted)))
myNeuralNetwork = Neural_Network(3, 1, 4)
trainingEpochs = 1000
for i in range(trainingEpochs):
print("Epoch # " + str(i) + "\n")
print("Network Input : \n" + str(X))
print("Expected Output of XOR Gate Neural Network: \n" + str(y))
print("Actual Output from XOR Gate Neural Network: \n" +
str(myNeuralNetwork.feedForward(X)))
Loss = np.mean(np.square(y - myNeuralNetwork.feedForward(X)))
myNeuralNetwork.saveSumSquaredLossList(i, Loss)
print("Sum Squared Loss: \n" + str(Loss))
print("\n")
myNeuralNetwork.trainNetwork(X, y)
myNeuralNetwork.saveWeights()
myNeuralNetwork.predictOutput()
Related
I want to implement a multi-layer perceptron.
I found some code on GitHub that classifies MNIST quite well (96%). However, for some reason, it does not cope with the XOR task.
I want to understand why.
Here is the code:
perceptron.py
import random
import numpy as np
class Perceptron:
def __init__(self, *, layer_sizes, activation_functions, cost_function_deriv):
self.layer_sizes = layer_sizes
if len(self.layer_sizes) - 1 != len(activation_functions):
raise ValueError("...")
self.activation_functions = activation_functions
self.cost_function_deriv = cost_function_deriv
self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]
def train(self, training_data, test_data, epochs, mini_batch_size, lr):
test_data_len = len(test_data)
for epoch in range(epochs):
random.shuffle(training_data)
mini_batches = [training_data[x: x + mini_batch_size]
for x in range(0, len(training_data), mini_batch_size)]
for mini_batch in mini_batches:
mb_len = len(mini_batch)
gradient_weights = [np.zeros(w.shape) for w in self.weights]
gradient_biases = [np.zeros(b.shape) for b in self.biases]
for x, y in mini_batch:
delta_gradient_biases, delta_gradient_weights = self.backpropagation(np.array(x), y)
gradient_weights = [grad + delta for grad, delta in zip(gradient_weights, delta_gradient_weights)]
gradient_biases = [grad + delta for grad, delta in zip(gradient_biases, delta_gradient_biases)]
self.weights = [w - (lr / mb_len) * grad for w, grad in zip(self.weights, gradient_weights)]
self.biases = [b - (lr / mb_len) * grad for b, grad in zip(self.biases, gradient_biases)]
correct_answers = self.how_many_correct_answers(test_data)
print(f"Epoch number {epoch}: {correct_answers}/{test_data_len} correct answers")
def backpropagation(self, x, y):
gradient_b = [np.zeros(b.shape) for b in self.biases]
gradient_w = [np.zeros(w.shape) for w in self.weights]
activations = [x]
prev_activation = x
for i, (b, w) in enumerate(zip(self.biases, self.weights)):
current_activation = self.activation_functions[i](np.dot(w, prev_activation) + b)
activations.append(current_activation)
prev_activation = current_activation
delta = self.cost_function_deriv(activations[-1], y) * self.activation_functions[-1].deriv(activations[-1])
gradient_b[-1] = delta
gradient_w[-1] = np.dot(delta, activations[-2].T)
for i in range(2, len(self.layer_sizes)):
z = activations[-i]
act_der = self.activation_functions[-i + 1].deriv(z)
delta = np.dot(self.weights[-i + 1].T, delta) * act_der
gradient_b[-i] = delta
gradient_w[-i] = np.dot(delta, activations[-i - 1].T)
# Normal indexing variant:
# for i in range(len(self.layers) - 1, 0, -1):
# z = activations[i]
# act_der = self.activation_functions[i].deriv(z)
# delta = np.dot(self.weights[i].T, delta) * act_der
# gradient_b[i - 1] = delta
# gradient_w[i - 1] = np.dot(delta, activations[i - 1].T)
return gradient_b, gradient_w
def feedforward(self, a):
for i, (b, w) in enumerate(zip(self.biases, self.weights)):
a = self.activation_functions[i](np.dot(w, a) + b)
return a
def how_many_correct_answers(self, test_data):
k = 0
for x, y in test_data:
y_predict = np.argmax(self.feedforward(x))
print(y_predict, y)
k += int(y_predict == y)
return k
main.py
from copy import deepcopy
import numpy as np
from perceptron import Perceptron
class Sigmoid:
out_min_max = [0, 1]
def __call__(self, x):
return 1. / (1. + np.exp(-x))
def deriv(self, y):
# t = self(x)
# return t * (1. - t)
return y * (1. - y)
def cost_function_derivative(y_predict, y_true_label):
label_vector = np.zeros(y_predict.shape)
label_vector[y_true_label] = 1.0
return y_predict - label_vector
def main():
training_data = np.asarray([[[[0], [0]], 0],
[[[0], [1]], 1],
[[[1], [0]], 1],
[[[1], [1]], 0]])
layer_sizes = [2, 8, 2]
model = Perceptron(layer_sizes=layer_sizes,
activation_functions=[Sigmoid(), Sigmoid()],
cost_function_deriv=cost_function_derivative)
model.train(deepcopy(training_data),
deepcopy(training_data),
epochs=10000,
mini_batch_size=4,
lr=0.01)
if __name__ == '__main__':
main()
The final output in format 'y_predict y_true' (after each epoch):
0 0
0 1
0 1
0 0
If remove random.shuffle(training_data) then:
1 0
0 1
1 1
0 0
But not 0 1 1 0
I figured it out. It requires the following.
mini_batch_size=1
# random.shuffle(training_data) -- comment
epochs=10000
And it's better to do this:
lr=0.1
The result in most cases is obtained after ~1000 epochs:
0 0
1 1
1 1
0 0
I could use a second set of eyes on my neural network.
This is the mnist number recognition project.
I'm not sure where the issue is.
I previously implemented the ai with tensor flow successfully.
I'm not looking to use an api as a solution.
I would appreciate any help anyone can give.
Here's the project on github, it's only an init file and then the neural_network.
https://github.com/nealchawn/ai_trial_2
class NeuralNetwork(object):
def __init__(self, sizes):
self.activations = []
self.outputs = []
self.weights = []
self.biases = []
self.sizes = sizes
self.set_random_weights()
self.set_random_biases()
def set_random_weights(self):
for layer_index, layer_size in enumerate(self.sizes[1:], start=1):
layer_weights = []
for size in range(layer_size):
for size in range(self.sizes[layer_index-1]):
layer_weights.append(random.uniform(-5.0, 5.0))
self.weights.append(layer_weights)
def set_random_biases(self):
total_biases = 0
# add extra zero bias to help future indexing
#self.biases.append(0)
for index, size in enumerate(self.sizes[0:-1], start=1):
total_biases += 1
for x in range(total_biases):
self.biases.append(random.uniform(-5.0, 5.0))
def train_network(self, training_data, training_labels):
if len(training_data) != len(training_labels):
print("Error data and labels must be the same length")
data = list(zip(training_data, training_labels))
self.sgd(data)
def sgd(self, data, mini_batch_size = 1000):
# first we'll create batches of training data
n = len(data)
data_batches = [
data[k:k + mini_batch_size]
for k in range(0, n, mini_batch_size)
]
print(len(data_batches))
i = 0
for mini_batch in data_batches:
print("Batch: " + str(i))
i += 1
self.update_mini_batch(mini_batch)
self.network_outputs()
print("Finished All training data!")
def update_mini_batch(self, mini_data_batch):
weight_gradients = []
bias_gradients = []
i = 0
for training_input in mini_data_batch:
training_object, training_label = training_input
self.feedforward(training_object)
weights_gradient, bias_gradient = self.backpropogation(training_label)
weight_gradients.append(weights_gradient)
bias_gradients.append(bias_gradient)
# average gradients
weights_gradient = np.average(weight_gradients,axis=0)
biases_gradient = np.average(bias_gradients, axis=0)
# may need to convert to list
weights_gradient_list = []
for weight_gradient in weights_gradient:
weights_gradient_list.append(weight_gradient.tolist())
#weights_gradient = weights_gradient.tolist()
biases_gradient = biases_gradient.tolist()
for x in range(len(self.biases)):
self.biases[x] -= 0.1*biases_gradient[x]
weight_gradient_index = 0
for layer_index, layer_weights in enumerate(self.weights, start=0):
for weight_index, weight in enumerate(layer_weights):
self.weights[layer_index][weight_index] = weight - 0.1*weights_gradient_list[layer_index][weight_index]
weight_gradient_index += 1
def feedforward(self, training_object):
# set inputs
self.outputs = []
self.activations = []
temp_activations = []
for index in range(self.sizes[0]):
temp_activations.append(training_object[index])
self.activations.append(temp_activations)
for layer_index, layer_size in enumerate(self.sizes[1:], start=0):
layer_weights = self.weights[layer_index]
layer_inputs = self.activations[layer_index]
weight_index = 0
layer_outputs = []
layer_activations = []
for node_index in range(layer_size):
node_weights = []
# get node weights
#print(f"layer size: {layer_size}, previous_layer_size: {self.sizes[layer_index]}, layer weights: {len(layer_weights)}")
for x in range(self.sizes[layer_index]):
node_weights.append(layer_weights[weight_index])
weight_index += 1
output = 0
for indx in range(len(node_weights)):
output += layer_inputs[indx]*node_weights[indx]
output = output + self.biases[layer_index]
layer_outputs.append(output)
layer_activations.append(self.sigmoid(output))
self.outputs.append(layer_outputs)
self.activations.append(layer_activations)
def backpropogation(self, training_label):
costs = []
output_layer_activations = self.activations[-1]
output_layer_outputs = self.outputs[-1]
correct_labels = self.translate_label_to_array(training_label)
costs.append(self.compute_cost_derivative(correct_labels, output_layer_activations))
for cost_index, cost in enumerate(costs[0]):
costs[0][cost_index] = cost*self.sigmoid_prime(output_layer_outputs[cost_index])
# calculate costs for layers
for layer_index, layer_size in enumerate(self.sizes[::-1][1:-1], start=1):
layer_costs = []
layer_weights = self.weights[-layer_index]
layer_outputs = self.outputs[-(layer_index+1)]
previous_layer_costs = costs[layer_index-1]
next_layer_size = self.sizes[::-1][1:][layer_index]
layer_weights_formatted = []
for x in range(layer_size):
layer_weights_formatted.append([])
for weight_index, weight in enumerate(layer_weights, start=0):
#print(f"weight index:{weight_index % next_layer_size} layer_index: {weight_index}")
layer_weights_formatted[weight_index%layer_size].append(layer_weights[weight_index])
#print(f"next_layer_size:{layer_size} costs: {len(previous_layer_costs)}, layer_weights_formatted: {layer_weights_formatted}")
for x in range(layer_size):
node_cost = 0
for y, cost in enumerate(previous_layer_costs,start=0):
node_cost += layer_weights_formatted[x][y]*cost
layer_costs.append(node_cost)
# layer_costs same order as next layer's activations
for cost_index, cost in enumerate(layer_costs):
layer_costs[cost_index] = cost * self.sigmoid_prime(layer_outputs[cost_index])
costs.append(layer_costs)
# calculate weight errors
weight_errors = []
bias_errors = []
for layer_index, layer_costs in enumerate(costs[::-1]):
layer_activations = self.activations[layer_index]
layer_weight_errors = []
for cost_index, cost in enumerate(layer_costs,start=0):
for activation in layer_activations:
layer_weight_errors.append(activation * cost)
weight_errors.append(np.array(layer_weight_errors))
bias_errors.append(sum(layer_costs))
return weight_errors, bias_errors
# conversion tool
def translate_label_to_array(self, y):
translated_label = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
translated_label[y] = 1
return np.array(translated_label)
# output tools
def network_outputs(self):
print("Output layer: ")
for x in range(self.sizes[-1]):
print("node " + str(x) + ": " + str(self.activations[-1][x]))
def total_activations(self):
print(len(self.activations))
def compute_cost_derivative(self, y, output_activations):
"""Return the vector of partial derivatives \partial C_x /
\partial a for the output activations."""
return (output_activations - y)
def sigmoid(self, z):
""""The sigmoid function."""
return (1.0 / (1.0 + np.exp(-z)))
def sigmoid_prime(self, z):
return (self.sigmoid(z) * (1 - self.sigmoid(z)))
I'm trying to implement a neural network from scratch in order to gain better insight about it and I run into a weird problem. When I use Relu function for hidden layers as an activation function, the model did not converge whereas it did converge once sigmoid function is used. Here is my vanilla code: When you change first 2 layers' activation function from relu to sigmoid, you can see that it converges, though it may have a problem sometimes. Where could be the problem? It's been three days and I still couldnt find it, though I found some little bugs. Thanks in advance.
Here is the toy dataset I've been using(just paste it to where this code is located).
Dataset
import numpy as np
import pandas as pd
class NeuralNetwork():
def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
# hyperparameters
self.epoch = epoch
self.alpha = alpha
self.algorithm = algorithm
# parameters
self.params = {}
self.layer_no = 1
# logs
self.cost_vals = []
def createLayer(self, size, activation_func, randomness=True):
if randomness == True:
self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
else:
self.params["W" + str(self.layer_no)] = np.zeros(size)
self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
self.params["func" + str(self.layer_no)] = activation_func
self.layer_no += 1
def sigmoid(self, X):
return 1 / (1 + np.exp(-X))
def relu(self, X):
return np.maximum(X, 0) * 0.01
def tanh(self, X):
return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))
def derivative_sigmoid(self, X):
der_x = self.sigmoid(X)
return der_x * (1 - der_x)
def derivative_relu(self, X):
X[X<=0] = 0
X[X>0] = 1
return X
def derivative_tanh(self, X):
tanhx = self.tanh(X)
return 1 - np.power(tanhx, 2)
def activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.sigmoid(Zl)
elif act_func_name == "relu":
return self.relu(Zl)
elif act_func_name == "tanh":
return self.tanh(Zl)
def derivative_activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.derivative_sigmoid(Zl)
elif act_func_name == "relu":
return self.derivative_relu(Zl)
elif act_func_name == "tanh":
return self.derivative_tanh(Zl)
def train(self, X, Y):
m = Y.shape[0] # number of training examples
self.params["A0"] = X
self.params["Z0"] = None
for i in range(self.epoch):
# forward prop
for l in range(1, self.layer_no): # 1,2,3
Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)] # linear function of a layer with vectorization
Al = self.activation_function(Zl, self.params["func" + str(l)]) # activated form of Zl
self.params["Z" + str(l)] = Zl
self.params["A" + str(l)] = Al
# cost function
cost_val = - 1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
cost_val = np.squeeze(cost_val)
if i % 500 == 0:
print(cost_val)
self.cost_vals.append(cost_val)
# backward prop
dAl = - (np.divide(Y, Al) - np.divide(1 - Y, 1 - Al)) # gradiant of last layer of A
for l in reversed(range(1, self.layer_no)): # 3,2,1
# backward prop
dZl = np.multiply(dAl,
self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)])) # gradient of layer l of Z
dAl1 = np.dot(self.params["W" + str(l)].T, dZl) # gradient of previous layer of A
dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T) # gradient of parameters W in layer l
dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True) # gradient of parameters b in layer l
# update parameters
self.params["W" + str(l)] -= self.alpha * dWl
self.params["b" + str(l)] -= self.alpha * dbl
dAl = dAl1 # assign gradient of previous layer of A to the current one so as to use it while back-propagation
def iris_data():
from sklearn.model_selection import train_test_split
datas = pd.read_csv('iris_nn.data').to_numpy()
X = datas[:, 0:4].astype(float)
Y = datas[:, 4:5]
Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
return X_train.T, Y_train.T
X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5,4), "relu")
model.createLayer((7,5), "relu")
model.createLayer((1,7), "sigmoid")
model.train(X,Y)
#
Background:
I am trying to create a MLP using Tensorflow, my first time using tensorflow. It is a simple NN that will do the xor operation. I have 2 input neurons (for the 1s and 0s) a hidden layer that is 2 neurons wide. One output that will give me a 1 or 0. My activation is a simple sigmoid.
The Issue
I am running into an issue with launching the graph. Something I noticed is that when we launch the graph we get all of the batch instead of one at a time. for example I have the following in an array [[1,0],[0,0],[0,1],[1,1]]. When I try to start the graph I do the following:
x_vals = np.array([[1.0, 0.0],[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
result = run_nn(x,y)
with tf.Session() as sess:
sess.run(init)
results = sess.run(result, feed_dict={x: x_vals, y:y_vals})
print results
As we can see I feed in x and y into the neural network. Once I do this I need to multiply the weights * outputs (essentially the input [1,0]) and sum them The issue is that I get a mismatch in size between the x values and the weights array:
tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(l0.weights) , l0.outputs), 1))
InvalidArgumentError: Incompatible shapes: [2,3] vs. [4,3]
[[Node: Mul_6 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](transpose_10, concat_12)]]
What am I doing wrong here, I understand this is not a perfect implementation. But I want to do a NN step by step
Here is my full code:
import math
import numpy as np
momentum = 0.5
learning_rate = 2.0
class layer:
def __init__(self, num_neurons, num_weights, layer_type):#number of weights corresponds to number of neurons in next layer
self.num_neurons = num_neurons
self.num_weights = num_weights
self.layer_type = layer_type
if layer_type == 'hidden':
num_neurons = num_neurons+1#account for bias
self.num_neurons = num_neurons+1
self.weights = tf.random_normal([num_neurons, num_weights])
self.outputs = tf.zeros(num_neurons, tf.float32)
self.sums = tf.zeros(num_neurons, tf.float32)
self.deltas = tf.zeros(num_neurons, tf.float32)
self.gradiants = tf.zeros([num_neurons, num_weights], tf.float32)
self.weight_deltas = tf.zeros_like(self.gradiants)
def calculate_sums(self, p_layer):
self.sums = tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(p_layer.weights) , p_layer.outputs), 1))
return self.sums
def calculate_outputs(self, p_layer):
if self.layer_type == 'hidden':
self.outputs = tf.concat([sigmoid(self.sums, False), tf.constant([1.0])], 0)
else:
self.outputs = sigmoid(self.sums, False)
return self.outputs
def calculate_deltas(self, n_layer = None, y=None):
if self.layer_type == 'hidden':
self.deltas = sigmoid(self.sums, True) * n_layer.deltas * self.weights[:-1,0]
else:#output delta
E = self.outputs[:self.num_neurons]-y
#print 'error: {}'.format(E)
self.deltas = -E* sigmoid(self.sums, True)
return self.deltas
def calculate_gradiants(self, n_layer):
shape = (tf.shape(self.outputs)[0], 1)
self.gradiants += tf.reshape(self.outputs, shape=shape) * tf.transpose(n_layer.deltas)#we add the gradiants for every batch completion then update, dont want to update every time
return self.gradiants
def update_weights(self):
self.weight_deltas = self.gradiants*learning_rate + momentum * self.weight_deltas
self.weights += self.weight_deltas
# for i in range(len(self.gradiants)):
# for j in range(len(self.gradiants[0])):
# self.weight_deltas[i,j] = weight_change(self.gradiants[i,j], self.weight_deltas[i,j])
# self.weights[i,j] += self.weight_deltas[i,j]
def sigmoid(x, derivative = False):
if derivative == True:
return (1.0/(1+tf.exp(-x))) * (1.0 - (1.0/(1+tf.exp(-x))))
return 1.0/(1+tf.exp(-x))
#the output delta is just E*f'i, essentially the error * the derivative of the activation function
def weight_change(g, p_w_delta):#gradiant, previous weight delta
return learning_rate*g + momentum * p_w_delta
def run_nn(x_val, y_val):
l0.outputs = tf.concat([x_val, tf.ones(shape=(tf.shape(x_val)[0],1))], 1)
print 'set output'
#forward pass
# l1.calculate_sums(l0)
# print 'l1 calc sum'
# l1.calculate_outputs(l0)
# print 'l1 calc output'
# ol.calculate_sums(l1)
# print 'ol calc sum'
# ol.calculate_outputs(l1)
# print 'ol calc output'
# #backwards pass
# ol.calculate_deltas(y=y_val)
# print 'ol calc deltas'
# l1.calculate_deltas(ol)
# print 'l1 calc deltas'
# l1.calculate_gradiants(ol)
# print 'l1 calc gradiants'
# l0.calculate_gradiants(l1)
# print 'l0 calc gradiants'
# #we dont want to update the weights every time, just after we have gone through every batch/minibatch
# l1.update_weights()
# print 'l1 update weights'
# l0.update_weights()
# print 'l0 uipdate weights'
# l1.gradiants = tf.zeros_like(l1.gradiants)
# print 'l1 zero gradiants'
# l0.gradiants = tf.zeros_like(l0.gradiants)
# print 'l0 zero gradiants'
# #test
# print 'run test'
# l0.outputs = tf.concat([x, tf.constant([1.0])], 0 )
# #forward pass
# l1.calculate_sums(l0)
# l1.calculate_outputs(l0)
#
# ol.calculate_sums(l1)
# ol.calculate_outputs(l1)
# print 'DONE'
return tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(l0.weights) , l0.outputs), 1))
l0 = layer(2,2,'hidden')#input
l1 = layer(2,1,'hidden')#hidden
ol = layer(1,0,'output')#output
x_vals = np.array([[1.0, 0.0],[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
# initialize variables
init = tf.global_variables_initializer()
x = tf.placeholder('float', None)
y = tf.placeholder('float', None)
result = run_nn(x,y)
with tf.Session() as sess:
sess.run(init)
results = sess.run(result, feed_dict={x: x_vals, y:y_vals})
print results
Here is some equivalent code in pure python/numpy
import math
import numpy as np
momentum = 0.5
learning_rate = 2.0
class layer:
def __init__(self, num_neurons, num_weights, layer_type):#number of weights corresponds to number of neurons in next layer
self.layer_type = layer_type
if layer_type == 'hidden':
num_neurons = num_neurons+1#account for bias
self.weights = np.random.rand(num_neurons,num_weights)
self.outputs = np.zeros(shape=(1,num_neurons))
self.sums = np.zeros(shape=(1,num_neurons))
self.deltas = np.zeros(shape=(1,num_neurons)).T
self.gradiants = np.zeros(shape=(num_neurons,num_weights))
self.weight_deltas = np.zeros_like(self.gradiants)
def calculate_sums(self, p_layer):
self.sums = np.array([(sum(p_layer.weights * p_layer.outputs))]).T
return self.sums;
def calculate_outputs(self, p_layer):
if self.layer_type == 'hidden':
self.outputs = np.concatenate((np.array([[sigmoid(X, False)] for X in self.sums]), np.array([[1.0]])))
else:
self.outputs = np.array([[sigmoid(X, False)] for X in self.sums])
return self.outputs
def calculate_deltas(self, n_layer = None):
if self.layer_type == 'hidden':
self.deltas = np.array([[sigmoid(X, True)] for X in self.sums]) * n_layer.deltas * self.weights[:-1]
else:#output delta
E = self.outputs-y
#print 'error: {}'.format(E)
self.deltas = -E* sigmoid(self.sums, True)
return self.deltas
def calculate_gradiants(self, n_layer):
self.gradiants += self.outputs * n_layer.deltas.T#we add the gradiants for every batch completion then update, dont want to update every time
return self.gradiants
def update_weights(self):
for i in range(len(self.gradiants)):
for j in range(len(self.gradiants[0])):
self.weight_deltas[i,j] = weight_change(self.gradiants[i,j], self.weight_deltas[i,j])
self.weights[i,j] += self.weight_deltas[i,j]
def sigmoid(x, derivative = False):
if derivative == True:
return (1.0/(1+math.exp(-x))) * (1.0 - (1.0/(1+math.exp(-x))))
return 1.0/(1+math.exp(-x))
#the output delta is just E*f'i, essentially the error * the derivative of the activation function
def weight_change(g, p_w_delta):#gradiant, previous weight delta
return learning_rate*g + momentum * p_w_delta
input_layer = layer(3,2, 'hidden')
hidden_layer1 = layer(2,1, 'hidden')
output_layer = layer(1,0, 'output')
x_vals = []
y_vals = []
for i in range(2):
for j in range(2):
for k in range(2):
x_vals.append(np.array([[float(i)],[float(j)],[float(k)]]))
y_vals.append(np.array([float(i ^ j ^ k)]))
#x_vals = [np.array([[1.0], [0.0]]), np.array([[0.0], [0.0]]), np.array([[0.0], [1.0]]),np.array([[1.0], [1.0]])]
#y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
#input_layer.weights = np.array([[-0.06782947598673161,0.9487814395569221],[0.22341077197888182,0.461587116462548], [-0.4635107399577998, 0.09750161997450091]])
#hidden_layer1.weights = np.array([[-0.22791948943117624],[0.581714099641357], [0.7792991203673414]])
Error = []
for n in range(10000):
for x, y in zip(x_vals, y_vals):
input_layer.outputs = np.concatenate((x, np.array([[1.0]])))
#forward pass
hidden_layer1.calculate_sums(input_layer)
hidden_layer1.calculate_outputs(input_layer)
output_layer.calculate_sums(hidden_layer1)
output_layer.calculate_outputs(hidden_layer1)
Error.append(-(output_layer.outputs-y))
#backwards pass
output_layer.calculate_deltas()
hidden_layer1.calculate_deltas(output_layer)
hidden_layer1.calculate_gradiants(output_layer)
input_layer.calculate_gradiants(hidden_layer1)
if n % 1000 == 0:
print 'Epoch #{}; error: {}'.format(n, sum(Error)/len(Error))
Error = []
#we dont want to update the weights every time, just after we have gone through every batch/minibatch
hidden_layer1.update_weights()
input_layer.update_weights()
hidden_layer1.gradiants.fill(0.0)
input_layer.gradiants.fill(0.0)
#test
for x, y in zip(x_vals, y_vals):
input_layer.outputs = np.concatenate((x, np.array([[1.0]])))
#forward pass
hidden_layer1.calculate_sums(input_layer)
hidden_layer1.calculate_outputs(input_layer)
output_layer.calculate_sums(hidden_layer1)
output_layer.calculate_outputs(hidden_layer1)
print 'Y_hat: {}, Y: {}'.format(round(float(output_layer.outputs), 3), float(y))
Can anyone point me in the right direction.
Thanks
I'm using a neural network with 1 hidden layer (2 neurons) and 1 output neuron for solving the XOR problem.
Here's the code I'm using. It contains the main run file xor.py which creates a model defined in model.py. Each neuron is defined by the class Neuron in neuron.py
xor.py
from model import Model
import numpy as np
inputs = [[0,0], [0,1], [1,0], [1,1]]
outputs = [0, 1, 1, 0]
m = Model()
m.train(inputs, outputs)
for i in inputs:
p = m.predict(i)
print str(i) + ' => ' + str(p)
model.py
from neuron import HiddenNeuron, OutputNeuron
import numpy as np
class Model(object):
def __init__(self):
self.hidden = [HiddenNeuron(2) for i in range(2)]
self.output = OutputNeuron(2)
def predict(self, input):
temp = []
for x in range(2):
self.hidden[x].forward(input)
temp.append(self.hidden[x].out)
self.output.forward(temp)
return self.output.out
def train(self, inputs, targets):
it = 0
i = 0
size = len(inputs)
while it < 4:
if i == size:
i = 0
feature = inputs[i]
print '\n\nFeature : ' + str(feature) + '\n'
print 'Output weights : ' + str(self.output.weights)
print 'Hidden 1 weights : ' + str(self.hidden[0].weights)
print 'Hidden 2 weights : ' + str(self.hidden[1].weights)
temp = []
for x in range(2):
self.hidden[x].forward(feature)
temp.append(self.hidden[x].out)
self.output.forward(temp)
self.output.backward(targets[i])
deltas = []
deltas.append(self.output.error)
weights = []
weights.append([self.output.weights[0]])
weights.append([self.output.weights[1]])
for x in range(2):
self.hidden[x].backward(deltas, weights[x])
for x in range(2):
self.hidden[x].update(feature)
self.output.update(temp)
it += 1
i += 1
neuron.py
import numpy as np
from random import uniform
class Neuron(object):
def activation(self, fx):
return 1/(1 + np.exp(-fx))
def __init__(self, dim, lrate):
self.dim = dim
self.weights = np.empty([dim])
self.weights = [uniform(0,1) for x in range(dim)]
self.bias = uniform(0, 1)
self.lrate = lrate
self.out = None
self.error = None
def update(self, input):
j = 0
for i in input:
delta = self.lrate * self.error
self.weights[j] -= (delta*i)
self.bias += delta
j+=1
def forward(self, input):
j = 0
sum = self.bias
for f in input:
sum += f * self.weights[j]
j+=1
self.out = self.activation(sum)
def backward(self):
pass
class OutputNeuron(Neuron):
def __init__(self, dim, lrate=0.2):
super(OutputNeuron, self).__init__(dim, lrate)
def backward(self, target):
self.error = self.out * (1 - self.out) * (self.out - target)
class HiddenNeuron(Neuron):
def __init__(self, dim, lrate=0.2):
super(HiddenNeuron, self).__init__(dim, lrate)
def backward(self, deltas, weights):
sum = 0
size = len(deltas)
for x in range(size):
sum += deltas[x] * weights[x]
self.error = self.out * (1 - self.out) * sum
The final output is
[0, 0] => 0.999999991272
[0, 1] => 0.999999970788
[1, 0] => 0.999999952345
[1, 1] => 0.999715564446
I think the error is in neuron.py in the function update(). If you change self.bias += delta to self.bias -= delta it should work, at least it does for me. Otherwise you would modify your biases to ascend towards a maximum on the error surface.
Below you can see the output after 100000 training epochs.
[0, 0] => 0.0174550173543
[0, 1] => 0.983899954593
[1, 0] => 0.983895388655
[1, 1] => 0.0164172288168