I'm using a neural network with 1 hidden layer (2 neurons) and 1 output neuron for solving the XOR problem.
Here's the code I'm using. It contains the main run file xor.py which creates a model defined in model.py. Each neuron is defined by the class Neuron in neuron.py
xor.py
from model import Model
import numpy as np
inputs = [[0,0], [0,1], [1,0], [1,1]]
outputs = [0, 1, 1, 0]
m = Model()
m.train(inputs, outputs)
for i in inputs:
p = m.predict(i)
print str(i) + ' => ' + str(p)
model.py
from neuron import HiddenNeuron, OutputNeuron
import numpy as np
class Model(object):
def __init__(self):
self.hidden = [HiddenNeuron(2) for i in range(2)]
self.output = OutputNeuron(2)
def predict(self, input):
temp = []
for x in range(2):
self.hidden[x].forward(input)
temp.append(self.hidden[x].out)
self.output.forward(temp)
return self.output.out
def train(self, inputs, targets):
it = 0
i = 0
size = len(inputs)
while it < 4:
if i == size:
i = 0
feature = inputs[i]
print '\n\nFeature : ' + str(feature) + '\n'
print 'Output weights : ' + str(self.output.weights)
print 'Hidden 1 weights : ' + str(self.hidden[0].weights)
print 'Hidden 2 weights : ' + str(self.hidden[1].weights)
temp = []
for x in range(2):
self.hidden[x].forward(feature)
temp.append(self.hidden[x].out)
self.output.forward(temp)
self.output.backward(targets[i])
deltas = []
deltas.append(self.output.error)
weights = []
weights.append([self.output.weights[0]])
weights.append([self.output.weights[1]])
for x in range(2):
self.hidden[x].backward(deltas, weights[x])
for x in range(2):
self.hidden[x].update(feature)
self.output.update(temp)
it += 1
i += 1
neuron.py
import numpy as np
from random import uniform
class Neuron(object):
def activation(self, fx):
return 1/(1 + np.exp(-fx))
def __init__(self, dim, lrate):
self.dim = dim
self.weights = np.empty([dim])
self.weights = [uniform(0,1) for x in range(dim)]
self.bias = uniform(0, 1)
self.lrate = lrate
self.out = None
self.error = None
def update(self, input):
j = 0
for i in input:
delta = self.lrate * self.error
self.weights[j] -= (delta*i)
self.bias += delta
j+=1
def forward(self, input):
j = 0
sum = self.bias
for f in input:
sum += f * self.weights[j]
j+=1
self.out = self.activation(sum)
def backward(self):
pass
class OutputNeuron(Neuron):
def __init__(self, dim, lrate=0.2):
super(OutputNeuron, self).__init__(dim, lrate)
def backward(self, target):
self.error = self.out * (1 - self.out) * (self.out - target)
class HiddenNeuron(Neuron):
def __init__(self, dim, lrate=0.2):
super(HiddenNeuron, self).__init__(dim, lrate)
def backward(self, deltas, weights):
sum = 0
size = len(deltas)
for x in range(size):
sum += deltas[x] * weights[x]
self.error = self.out * (1 - self.out) * sum
The final output is
[0, 0] => 0.999999991272
[0, 1] => 0.999999970788
[1, 0] => 0.999999952345
[1, 1] => 0.999715564446
I think the error is in neuron.py in the function update(). If you change self.bias += delta to self.bias -= delta it should work, at least it does for me. Otherwise you would modify your biases to ascend towards a maximum on the error surface.
Below you can see the output after 100000 training epochs.
[0, 0] => 0.0174550173543
[0, 1] => 0.983899954593
[1, 0] => 0.983895388655
[1, 1] => 0.0164172288168
Related
I have written a simple digit recognition neural network and it does not seem to be learning. It has 2 hidden layers and uses the softmax activation function and whenever it runs it seems to converge on always picking 0. I would just like to check if the code for updating the weight matrices is correct
from cmath import exp
import numpy as np
from tensorflow.keras.datasets import mnist
class Run:
def __init__(self, num_inputs, num_hidden1, num_hidden2, num_outputs):
self.num_inputs = num_inputs
self.num_hidden1 = num_hidden1
self.num_hidden2 = num_hidden2
self.num_outputs = num_outputs
self.learningrate = 0.001
self.get = GetInput()
self.count = 0
self.countTrue = 0
self.count1 = 0
self.sum = 0
self.past = 0
self.inputLayer = Layer(num_inputs, num_hidden1)
self.hiddenLayer1 = Layer(num_hidden1, num_hidden2)
self.hiddenLayer2 = Layer(num_hidden2, num_outputs)
def getinput(self):
input, expected = self.get.get(self.count)
self.count +=1
self.count1 += 1
return input, expected
def runNN(self, input):
self.inputLayer.calc_output_1(input)
self.hiddenLayer1.calc_output_1(self.inputLayer.fin_outputs)
self.hiddenLayer2.calc_output_1(self.hiddenLayer1.fin_outputs)
self.NN_Output = self.hiddenLayer2.fin_outputs
def calculate_cost(self, expected):
error = 0
for i in range(self.num_outputs):
error += (self.NN_Output[i][0] - expected[i][0])**2 / self.num_outputs
list = []
list1 = []
for each in self.NN_Output:
list.append(float(each[0]))
self.sum += list.index(max(list))
for each in expected:
list1.append(float(each[0]))
if list1.index(max(list1)) == list.index(max(list)):
self.countTrue += 1
print(round(self.countTrue/self.count1, 3))
if self.count1 % 1000 == 0:
print(self.sum / 1000)
print('')
self.past = 0
self.sum = 0
self.count1 = 0
self.countTrue = 0
return error
def calc_new_hidden1(self, expected):
delta = self.NN_Output - expected
change = np.multiply(delta, self.hiddenLayer2.fin_outputs)
change_weights = np.matmul(change, np.transpose(self.hiddenLayer2.inputs)) * self.learningrate
change_bias = change * self.learningrate
self.hiddenLayer2.amend(change_weights, change_bias)
def calc_new_hidden2(self, expected):
delta = self.NN_Output - expected
change = np.multiply(np.matmul(np.transpose(self.hiddenLayer2.getter()[0]), delta), self.hiddenLayer1.fin_outputs)
change_weights = np.matmul(change, np.transpose(self.hiddenLayer1.inputs)) * self.learningrate
change_bias = change * self.learningrate
self.hiddenLayer1.amend(change_weights, change_bias)
def calc_new_input(self, expected):
delta = (self.NN_Output - expected)
change = np.multiply(np.matmul(np.transpose(self.hiddenLayer1.getter()[0]), np.matmul(np.transpose(self.hiddenLayer2.getter()[0]), delta)), self.inputLayer.fin_outputs)
change_weights = np.matmul(change, np.transpose(self.inputLayer.inputs)) * self.learningrate
change_bias = change * self.learningrate
self.inputLayer.amend(change_weights, change_bias)
class Layer:
def __init__(self, num_inputs, num_outputs):
self.__weights = np.random.uniform(-0.5, 0.5, (num_outputs, num_inputs))
self.__bias = np.matrix([[float(0)] for x in range(num_outputs)])
def calc_output_1(self, inputs):
self.inputs = inputs
self.__output_1 = np.matmul(self.__weights, inputs) + self.__bias
self.softmax()
def softmax(self):
sum = 0
for each in self.__output_1:
sum += np.exp(float(each[0]))
list1 = []
for each in self.__output_1:
list1.append([float(np.exp(each[0])/sum)])
self.fin_outputs = np.matrix(list1)
def amend(self, change_weights, change_bias):
self.__weights -= change_weights
self.__bias -= change_bias
def getter(self):
return self.__weights, self.__bias
class GetInput:
def __init__(self):
(self.X_train, self.Y_train), (X_test, Y_test) = mnist.load_data()
self.X_train = self.X_train.reshape(self.X_train.shape[0], 28, 28, 1)
x_test = X_test.reshape(X_test.shape[0], 28, 28, 1)
def get(self, i):
list = []
newPhoto = self.X_train[i].astype('float32')/255
for each in newPhoto:
for n in each:
list.append([float(n)])
input = np.matrix(list)
list = []
expect = self.Y_train[i]
for each in range(10):
if each == expect:
list.append([1])
else:
list.append([0])
expected = np.matrix(list)
return input, expected
if __name__ == "__main__":
initiate = Run(784, 600, 400, 10)
while True:
input, expected = initiate.getinput()
initiate.runNN(input)
initiate.calculate_cost(expected)
initiate.calc_new_hidden1(expected)
initiate.calc_new_hidden2(expected)
initiate.calc_new_input(expected)`
Here is the code I have created. The maths for updating the weight matrices is in the Run class: calc_new_hidden1(), calc_new_hidden2(), calc_new_inputs()
I think the error will probably be in the calc_new_inputs() function
I have written a Dense Class for a FC layer in a CNN but to test if it works simply as a FC ANN, I tried to train a dataset over it but the loss never falls. I cannot seem to find the issue.
Here's the code:
class Dense:
# Constructor
def __init__(self, size, in_size, activation = 'relu'):
# Assign vars
self.size = size; self.activation = activation
# Initialize Weights and Biases
weights_dims = (size, in_size)
self.weights = np.random.standard_normal(weights_dims) * 0.01
self.biases = np.zeros([size, 1])
# Initialize Accumulators
self.sigma_acc = self.biases * 0
self.delta_acc = self.weights * 0
# ReLU Activation Function
def relu(self, arr):
return arr * (arr > 0)
# Softmax Activation Function
def softmax(self, arr):
arr -= arr.max()
exp = np.exp(arr)
return exp / np.sum(exp)
# Activation Manager Function
def activate(self, arr):
if self.activation == 'relu': return self.relu(arr)
if self.activation == 'softmax': return self.softmax(arr)
# Forward Propagation
def step(self, vec):
# Assign Input
self._in = vec
# Dot
z = np.dot(self.weights, vec) + self.biases
a = self.activate(z)
# Return
self.out = a
return self.out
# Back Propagation
def back(self, grad):
# Calculate sigma
sigma = grad if self.activation == 'softmax' else grad * (self.out > 0)
# Calculate delta
delta = np.dot(sigma, self._in.T)
# Accumulate
self.sigma_acc += sigma
self.delta_acc += delta
# Return global gradient
global_grad = np.dot(self.weights.T, sigma)
return global_grad
# Train
def update(self, alpha, batch_size):
dw = self.delta_acc / batch_size; self.delta_acc *= 0
db = self.sigma_acc / batch_size; self.sigma_acc *= 0
self.weights -= alpha * dw
self.biases -= alpha * db
To connect them as a model, I just add instances of this Dense class into a list and loop through them forwards and backwards using the step() and back() functions respectively.
Kindly inform me if you see any issue! Thanks.
This is how I created my network maybe could help you.
import numpy as np
X = np.array(([0, 0, 0], [0, 0, 1], [0, 1, 0],
[0, 1, 1], [1, 0, 0], [1, 0, 1],
[1, 1, 0], [1, 1, 1]), dtype=float)
y = np.array(([1], [0], [0], [0], [0], [0], [0], [1]), dtype=float)
xPredicted = np.array(([0, 0, 1]), dtype=float)
X = X/np.amax(X, axis=0)
xPredicted = xPredicted/np.amax(X, axis=0)
lossFile = open("Enter file", "w")
class Neural_Network(object):
def __init__(self, inputLayerSize, outputLayerSize, hiddenLayerSize):
self.inputLayerSize = inputLayerSize
self.outputLayerSize = outputLayerSize
self.hiddenLayerSize = hiddenLayerSize
self.W1 = \
np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
self.W2 = \
np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
def feedForward(self, X):
self.z = np.dot(X, self.W1)
self.z2 = self.activationSigmoid(self.z)
self.z3 = np.dot(self.z2, self.W2)
o = self.activationSigmoid(self.z3)
return o
def backwardPropagate(self, X, y, o):
self.o_error = y - o
self.o_delta = self.o_error*self.activationSigmoidPrime(o)
self.z2_error = self.o_delta.dot(self.W2.T)
self.z2_delta = self.z2_error*self.activationSigmoidPrime(self.z2)
self.W1 += X.T.dot(self.z2_delta)
self.W2 += self.z2.T.dot(self.o_delta)
def trainNetwork(self, X, y):
o = self.feedForward(X)
self.backwardPropagate(X, y, o)
def activationSigmoid(self, s):
return 1/(1+np.exp(-s))
def activationSigmoidPrime(self, s):
return s * (1 - s)
def saveSumSquaredLossList(self, i, error):
lossFile.write(str(i)+","+str(error.tolist())+"\n")
def saveWeights(self):
np.savetxt("Enter file", self.W1, fmt="%s")
np.savetxt("Enter file",
self.W2, fmt="%s")
def predictOutput(self):
print("Predicted XOR output data based on trained weights: ")
print("Expected (X1-X3); \n" + str(X))
print("Output (Y1): \n" + str(self.feedForward(xPredicted)))
myNeuralNetwork = Neural_Network(3, 1, 4)
trainingEpochs = 1000
for i in range(trainingEpochs):
print("Epoch # " + str(i) + "\n")
print("Network Input : \n" + str(X))
print("Expected Output of XOR Gate Neural Network: \n" + str(y))
print("Actual Output from XOR Gate Neural Network: \n" +
str(myNeuralNetwork.feedForward(X)))
Loss = np.mean(np.square(y - myNeuralNetwork.feedForward(X)))
myNeuralNetwork.saveSumSquaredLossList(i, Loss)
print("Sum Squared Loss: \n" + str(Loss))
print("\n")
myNeuralNetwork.trainNetwork(X, y)
myNeuralNetwork.saveWeights()
myNeuralNetwork.predictOutput()
I could use a second set of eyes on my neural network.
This is the mnist number recognition project.
I'm not sure where the issue is.
I previously implemented the ai with tensor flow successfully.
I'm not looking to use an api as a solution.
I would appreciate any help anyone can give.
Here's the project on github, it's only an init file and then the neural_network.
https://github.com/nealchawn/ai_trial_2
class NeuralNetwork(object):
def __init__(self, sizes):
self.activations = []
self.outputs = []
self.weights = []
self.biases = []
self.sizes = sizes
self.set_random_weights()
self.set_random_biases()
def set_random_weights(self):
for layer_index, layer_size in enumerate(self.sizes[1:], start=1):
layer_weights = []
for size in range(layer_size):
for size in range(self.sizes[layer_index-1]):
layer_weights.append(random.uniform(-5.0, 5.0))
self.weights.append(layer_weights)
def set_random_biases(self):
total_biases = 0
# add extra zero bias to help future indexing
#self.biases.append(0)
for index, size in enumerate(self.sizes[0:-1], start=1):
total_biases += 1
for x in range(total_biases):
self.biases.append(random.uniform(-5.0, 5.0))
def train_network(self, training_data, training_labels):
if len(training_data) != len(training_labels):
print("Error data and labels must be the same length")
data = list(zip(training_data, training_labels))
self.sgd(data)
def sgd(self, data, mini_batch_size = 1000):
# first we'll create batches of training data
n = len(data)
data_batches = [
data[k:k + mini_batch_size]
for k in range(0, n, mini_batch_size)
]
print(len(data_batches))
i = 0
for mini_batch in data_batches:
print("Batch: " + str(i))
i += 1
self.update_mini_batch(mini_batch)
self.network_outputs()
print("Finished All training data!")
def update_mini_batch(self, mini_data_batch):
weight_gradients = []
bias_gradients = []
i = 0
for training_input in mini_data_batch:
training_object, training_label = training_input
self.feedforward(training_object)
weights_gradient, bias_gradient = self.backpropogation(training_label)
weight_gradients.append(weights_gradient)
bias_gradients.append(bias_gradient)
# average gradients
weights_gradient = np.average(weight_gradients,axis=0)
biases_gradient = np.average(bias_gradients, axis=0)
# may need to convert to list
weights_gradient_list = []
for weight_gradient in weights_gradient:
weights_gradient_list.append(weight_gradient.tolist())
#weights_gradient = weights_gradient.tolist()
biases_gradient = biases_gradient.tolist()
for x in range(len(self.biases)):
self.biases[x] -= 0.1*biases_gradient[x]
weight_gradient_index = 0
for layer_index, layer_weights in enumerate(self.weights, start=0):
for weight_index, weight in enumerate(layer_weights):
self.weights[layer_index][weight_index] = weight - 0.1*weights_gradient_list[layer_index][weight_index]
weight_gradient_index += 1
def feedforward(self, training_object):
# set inputs
self.outputs = []
self.activations = []
temp_activations = []
for index in range(self.sizes[0]):
temp_activations.append(training_object[index])
self.activations.append(temp_activations)
for layer_index, layer_size in enumerate(self.sizes[1:], start=0):
layer_weights = self.weights[layer_index]
layer_inputs = self.activations[layer_index]
weight_index = 0
layer_outputs = []
layer_activations = []
for node_index in range(layer_size):
node_weights = []
# get node weights
#print(f"layer size: {layer_size}, previous_layer_size: {self.sizes[layer_index]}, layer weights: {len(layer_weights)}")
for x in range(self.sizes[layer_index]):
node_weights.append(layer_weights[weight_index])
weight_index += 1
output = 0
for indx in range(len(node_weights)):
output += layer_inputs[indx]*node_weights[indx]
output = output + self.biases[layer_index]
layer_outputs.append(output)
layer_activations.append(self.sigmoid(output))
self.outputs.append(layer_outputs)
self.activations.append(layer_activations)
def backpropogation(self, training_label):
costs = []
output_layer_activations = self.activations[-1]
output_layer_outputs = self.outputs[-1]
correct_labels = self.translate_label_to_array(training_label)
costs.append(self.compute_cost_derivative(correct_labels, output_layer_activations))
for cost_index, cost in enumerate(costs[0]):
costs[0][cost_index] = cost*self.sigmoid_prime(output_layer_outputs[cost_index])
# calculate costs for layers
for layer_index, layer_size in enumerate(self.sizes[::-1][1:-1], start=1):
layer_costs = []
layer_weights = self.weights[-layer_index]
layer_outputs = self.outputs[-(layer_index+1)]
previous_layer_costs = costs[layer_index-1]
next_layer_size = self.sizes[::-1][1:][layer_index]
layer_weights_formatted = []
for x in range(layer_size):
layer_weights_formatted.append([])
for weight_index, weight in enumerate(layer_weights, start=0):
#print(f"weight index:{weight_index % next_layer_size} layer_index: {weight_index}")
layer_weights_formatted[weight_index%layer_size].append(layer_weights[weight_index])
#print(f"next_layer_size:{layer_size} costs: {len(previous_layer_costs)}, layer_weights_formatted: {layer_weights_formatted}")
for x in range(layer_size):
node_cost = 0
for y, cost in enumerate(previous_layer_costs,start=0):
node_cost += layer_weights_formatted[x][y]*cost
layer_costs.append(node_cost)
# layer_costs same order as next layer's activations
for cost_index, cost in enumerate(layer_costs):
layer_costs[cost_index] = cost * self.sigmoid_prime(layer_outputs[cost_index])
costs.append(layer_costs)
# calculate weight errors
weight_errors = []
bias_errors = []
for layer_index, layer_costs in enumerate(costs[::-1]):
layer_activations = self.activations[layer_index]
layer_weight_errors = []
for cost_index, cost in enumerate(layer_costs,start=0):
for activation in layer_activations:
layer_weight_errors.append(activation * cost)
weight_errors.append(np.array(layer_weight_errors))
bias_errors.append(sum(layer_costs))
return weight_errors, bias_errors
# conversion tool
def translate_label_to_array(self, y):
translated_label = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
translated_label[y] = 1
return np.array(translated_label)
# output tools
def network_outputs(self):
print("Output layer: ")
for x in range(self.sizes[-1]):
print("node " + str(x) + ": " + str(self.activations[-1][x]))
def total_activations(self):
print(len(self.activations))
def compute_cost_derivative(self, y, output_activations):
"""Return the vector of partial derivatives \partial C_x /
\partial a for the output activations."""
return (output_activations - y)
def sigmoid(self, z):
""""The sigmoid function."""
return (1.0 / (1.0 + np.exp(-z)))
def sigmoid_prime(self, z):
return (self.sigmoid(z) * (1 - self.sigmoid(z)))
I have tried constructing a neural network for the XOR-function. The network has 1 hidden layer with 2 nodes in addition to a bias node. The Sigmoid function is used as activation function. I have tested the network for multiple learning rates. The result is always the same: the network give the value 0.5 for all inputs. There are some other threads where people report the same problem, but as far as I can see, other mistakes were done in those cases.
The following code shows my network and the results.
import numpy as np
import matplotlib.pyplot as plt
class NN:
""" XOR function test. 1 hidden layer with 2 hidden nodes in addition to bias node."""
def __init__(self, nodeNumbers, learningRate, targetMatrix, inputMatrix, errorTolerance, \
maxIterations):
self.nodeNumbers, self.learningRate, self.targetMatrix, \
self.inputMatrix, self.errorTolerance, self.maxIterations = \
nodeNumbers, learningRate, targetMatrix, inputMatrix, errorTolerance, \
maxIterations
self.numberOfInputs = np.shape(self.inputMatrix)[1]
self.weightMatrices = []
for nodeNumber in range(len(nodeNumbers[1:])):
self.weightMatrices.append(np.random.random_sample((nodeNumbers[nodeNumber+1], \
nodeNumbers[nodeNumber]+1)).T - .5)
def activationFunction(self, x):
return 1./(1+np.exp(-x))
def derivative(self, weightedInputs):
return self.activationFunction(weightedInputs)*(1 - self.activationFunction(weightedInputs))
def run(self):
self.iterationNumber = 0
numberOfAdjustmentsDuringIteration = 1
while (self.iterationNumber < self.maxIterations and numberOfAdjustmentsDuringIteration != 0):
self.iterationNumber += 1
numberOfAdjustmentsDuringIteration = 0
for inputNumber in range(self.numberOfInputs):
self.inputs = self.inputMatrix[:,inputNumber]
self.targets = self.targetMatrix[inputNumber]
self.forward()
self.calculateError()
if abs(self.error2) > self.errorTolerance:
numberOfAdjustmentsDuringIteration +=1
self.backward()
print('Iterations: ', self.iterationNumber, '|Error|: ', self.error2)
def forward(self):
self.u1 = self.weightMatrices[0].T # self.inputMatrix.T[0,:]
z1 = self.activationFunction(self.u1)
self.z1 = np.concatenate([[-1], z1])
self.u2 = self.weightMatrices[1].T # self.z1
self.z2 = self.activationFunction(self.u2)
def calculateError(self):
self.error2 = (self.targets - self.z2)**2
def backward(self, inputs=False, targets=False):
self.delta2 = (self.z2 - self.targets)*self.derivative(self.u2)
delta11 = self.derivative(self.u1[0])*self.delta2* self.weightMatrices[1][0]
delta12 = self.derivative(self.u1[1])*self.delta2* self.weightMatrices[1][1]
self.delta1 = np.concatenate([delta11, delta12])
self.weightMatrices[1][0,0] -= self.learningRate*self.delta2*self.z1[0]
self.weightMatrices[1][1,0] -= self.learningRate*self.delta2*self.z1[1]
self.weightMatrices[1][2,0] -= self.learningRate*self.delta2*self.z1[2]
self.weightMatrices[0][0,0] -= self.learningRate*self.delta1[0]*self.inputs[0]
self.weightMatrices[0][1,0] -= self.learningRate*self.delta1[0]*self.inputs[1]
self.weightMatrices[0][0,1] -= self.learningRate*self.delta1[1]*self.inputs[0]
self.weightMatrices[0][1,1] -= self.learningRate*self.delta1[1]*self.inputs[1]
def predict(self, newInput):
self.inputs = newInput
self.forward()
print('Input: ', newInput, 'Predicted output: ', self.z2)
nodeNumbers = [2,2,1]
activationFunction = activationFunction
derivative = differentiateActivationFunction
learningRate = 0.3
targetMatrix = np.array(((0), (1), (1), (0))).T
inputMatrix = np.array(((-1,0, 0), (-1, 0, 1), (-1,1, 0), (-1,1,1))).T
errorTolerance = 1e-3
maxIterations= 500
nn=NN(nodeNumbers, learningRate, targetMatrix, inputMatrix, errorTolerance, maxIterations)
nn.run()
The results from the above
Iterations: 500 |Error|: [0.26341771]
Making predictions
inputs = np.array(((-1,0, 0), (-1, 0, 1), (-1,1, 0), (-1,1,1))).T
for inp in inputs:
nn.predict(inp)
The results
Input: [-1 0 0] Predicted output: [0.49987204]
Input: [-1 0 1] Predicted output: [0.49987204]
Input: [-1 1 0] Predicted output: [0.49987204]
Input: [-1 1 1] Predicted output: [0.49987204]
Does anybody spot any errors?
My model training speed becomes slower over time. Every epoch take longer time to train.
Here is the full source code with my preprocess sentiment tree bank data (put glove.840B.300d.txt into data/glove).
Install some python packages:
pip install meowlogtool
pip install tqdm
Command to run:
python sentiment.py --emblr 0 --rel_dim 0 --tag_dim 0 --optim adagrad --name basic --lr 0.05 --wd 1e-4 --at_hid_dim 0
Model source code for you to read
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable as Var
import utils
import Constants
from model import SentimentModule
from embedding_model import EmbeddingModel
class SimpleGRU(nn.Module):
"""
w[i] : (300, 1)
h[i] : (150, 1)
p[i] : (20, 1)
r[i] : (20, 1)
k[i] : (150, 1)
x[i] : (20 + 150 + 300 + 20 = 490, 1) (490, 1)
Uz, Ur, Uh : (150, 150) => 67500 => (450, 450)
Wz, Wr, Wh : (150, 20 + 150 + 300 + 20) (150, 490)
"""
def __init__(self, cuda, in_dim, hid_dim):
super(SimpleGRU, self).__init__()
self.cudaFlag = cuda
self.Uz = nn.Linear(hid_dim, hid_dim)
self.Ur = nn.Linear(hid_dim, hid_dim)
self.Uh = nn.Linear(hid_dim, hid_dim)
self.Wz = nn.Linear(in_dim, hid_dim)
self.Wr = nn.Linear(in_dim, hid_dim)
self.Wh = nn.Linear(in_dim, hid_dim)
if self.cudaFlag:
self.Uz = self.Uz.cuda()
self.Ur = self.Uz.cuda()
self.Uh = self.Uz.cuda()
self.Wz = self.Wz.cuda()
self.Wr = self.Wr.cuda()
self.Wh = self.Wh.cuda()
def forward(self, x, h_prev):
"""
Simple-GRU(compress_x[v], h[t-1]) :
z[t] := s(Wz *compress_x[t]+ Uz * h[t-1] + bz)
r[t] := s(Wr * compress_x[t] + Ur * h[t-1] + br)
h_temp[t] := g(Wh * compress_x[t] + Uh * h[t-1] + bh)
h[t] := r[t] .* h[t-1] + (1 - z[t]) .* h_temp[t]
return h[t]
:param x: compress_x[t]
:param h_prev: h[t-1]
:return:
"""
z = F.sigmoid(self.Wz(x) + self.Uz(h_prev))
r = F.sigmoid(self.Wr(x) + self.Ur(h_prev))
h_temp = F.tanh(self.Wh(x) + self.Uh(h_prev))
h = r*h_prev + (1-z)*h_temp
return h
class TreeSimpleGRU(nn.Module):
def __init__(self, cuda, word_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, criterion, leaf_h = None):
super(TreeSimpleGRU, self).__init__()
self.cudaFlag = cuda
# self.gru_cell = nn.GRUCell(word_dim + tag_dim, mem_dim)
self.gru_cell = SimpleGRU(self.cudaFlag, word_dim+tag_dim, mem_dim)
self.gru_at = GRU_AT(self.cudaFlag, word_dim + tag_dim + rel_dim + mem_dim, at_hid_dim ,mem_dim)
self.mem_dim = mem_dim
self.in_dim = word_dim
self.tag_dim = tag_dim
self.rel_dim = rel_dim
self.leaf_h = leaf_h # init h for leaf node
if self.leaf_h == None:
self.leaf_h = Var(torch.rand(1, self.mem_dim))
torch.save(self.leaf_h, 'leaf_h.pth')
if self.cudaFlag:
self.leaf_h = self.leaf_h.cuda()
self.criterion = criterion
self.output_module = None
def getParameters(self):
"""
Get flatParameters
note that getParameters and parameters is not equal in this case
getParameters do not get parameters of output module
:return: 1d tensor
"""
params = []
for m in [self.gru_cell, self.gru_at]:
# we do not get param of output module
l = list(m.parameters())
params.extend(l)
one_dim = [p.view(p.numel()) for p in params]
params = F.torch.cat(one_dim)
return params
def set_output_module(self, output_module):
self.output_module = output_module
def forward(self, tree, w_emb, tag_emb, rel_emb, training = False):
loss = Var(torch.zeros(1)) # init zero loss
if self.cudaFlag:
loss = loss.cuda()
for idx in xrange(tree.num_children):
_, child_loss = self.forward(tree.children[idx], w_emb, tag_emb, rel_emb, training)
loss = loss + child_loss
if tree.num_children > 0:
child_rels, child_k = self.get_child_state(tree, rel_emb)
if self.tag_dim > 0:
tree.state = self.node_forward(w_emb[tree.idx - 1], tag_emb[tree.idx -1], child_rels, child_k)
else:
tree.state = self.node_forward(w_emb[tree.idx - 1], None, child_rels, child_k)
elif tree.num_children == 0:
if self.tag_dim > 0:
tree.state = self.leaf_forward(w_emb[tree.idx - 1], tag_emb[tree.idx -1])
else:
tree.state = self.leaf_forward(w_emb[tree.idx - 1], None)
if self.output_module != None:
output = self.output_module.forward(tree.state, training)
tree.output = output
if training and tree.gold_label != None:
target = Var(utils.map_label_to_target_sentiment(tree.gold_label))
if self.cudaFlag:
target = target.cuda()
loss = loss + self.criterion(output, target)
return tree.state, loss
def leaf_forward(self, word_emb, tag_emb):
"""
Forward function for leaf node
:param word_emb: word embedding of current node u
:param tag_emb: tag embedding of current node u
:return: k of current node u
"""
h = self.leaf_h
if self.cudaFlag:
h = h.cuda()
if self.tag_dim > 0:
x = F.torch.cat([word_emb, tag_emb], 1)
else:
x = word_emb
k = self.gru_cell(x, h)
return k
def node_forward(self, word_emb, tag_emb, child_rels, child_k):
"""
Foward function for inner node
:param word_emb: word embedding of current node u
:param tag_emb: tag embedding of current node u
:param child_rels (tensor): rels embedding of child node v
:param child_k (tensor): k of child node v
:return:
"""
n_child = child_k.size(0)
h = Var(torch.zeros(1, self.mem_dim))
if self.cudaFlag:
h = h.cuda()
for i in range(0, n_child):
k = child_k[i]
x_list = [word_emb, k]
if self.rel_dim >0:
rel = child_rels[i]
x_list.append(rel)
if self.tag_dim > 0:
x_list.append(tag_emb)
x = F.torch.cat(x_list, 1)
h = self.gru_at(x, h)
k = h
return k
def get_child_state(self, tree, rels_emb):
"""
Get child rels, get child k
:param tree: tree we need to get child
:param rels_emb (tensor):
:return:
"""
if tree.num_children == 0:
assert False # never get here
else:
child_k = Var(torch.Tensor(tree.num_children, 1, self.mem_dim))
if self.rel_dim>0:
child_rels = Var(torch.Tensor(tree.num_children, 1, self.rel_dim))
else:
child_rels = None
if self.cudaFlag:
child_k = child_k.cuda()
if self.rel_dim > 0:
child_rels = child_rels.cuda()
for idx in xrange(tree.num_children):
child_k[idx] = tree.children[idx].state
if self.rel_dim > 0:
child_rels[idx] = rels_emb[tree.children[idx].idx - 1]
return child_rels, child_k
class AT(nn.Module):
"""
AT(compress_x[v]) := sigmoid(Wa * tanh(Wb * compress_x[v] + bb) + ba)
"""
def __init__(self, cuda, in_dim, hid_dim):
super(AT, self).__init__()
self.cudaFlag = cuda
self.in_dim = in_dim
self.hid_dim = hid_dim
self.Wa = nn.Linear(hid_dim, 1)
self.Wb = nn.Linear(in_dim, hid_dim)
if self.cudaFlag:
self.Wa = self.Wa.cuda()
self.Wb = self.Wb.cuda()
def forward(self, x):
out = F.sigmoid(self.Wa(F.tanh(self.Wb(x))))
return out
class GRU_AT(nn.Module):
def __init__(self, cuda, in_dim, at_hid_dim ,mem_dim):
super(GRU_AT, self).__init__()
self.cudaFlag = cuda
self.in_dim = in_dim
self.mem_dim = mem_dim
self.at_hid_dim = at_hid_dim
if at_hid_dim > 0:
self.at = AT(cuda, in_dim, at_hid_dim)
self.gru_cell = SimpleGRU(self.cudaFlag, in_dim, mem_dim)
if self.cudaFlag:
if at_hid_dim > 0:
self.at = self.at.cuda()
self.gru_cell = self.gru_cell.cuda()
def forward(self, x, h_prev):
"""
:param x:
:param h_prev:
:return: a * m + (1 - a) * h[t-1]
"""
m = self.gru_cell(x, h_prev)
if self.at_hid_dim > 0:
a = self.at.forward(x)
h = torch.mm(a, m) + torch.mm((1-a), h_prev)
else:
h = m
return h
class TreeGRUSentiment(nn.Module):
def __init__(self, cuda, in_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, num_classes, criterion):
super(TreeGRUSentiment, self).__init__()
self.cudaFlag = cuda
self.tree_module = TreeSimpleGRU(cuda, in_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, criterion)
self.output_module = SentimentModule(cuda, mem_dim, num_classes, dropout=True)
self.tree_module.set_output_module(self.output_module)
def get_tree_parameters(self):
return self.tree_module.getParameters()
def forward(self, tree, sent_emb, tag_emb, rel_emb, training = False):
# sent_emb = F.torch.unsqueeze(self.word_embedding.forward(sent_inputs), 1)
# tag_emb = F.torch.unsqueeze(self.tag_emb.forward(tag_inputs), 1)
# rel_emb = F.torch.unsqueeze(self.rel_emb.forward(rel_inputs), 1)
# sent_emb, tag_emb, rel_emb = self.embedding_model(sent_inputs, tag_inputs, rel_inputs)
tree_state, loss = self.tree_module(tree, sent_emb, tag_emb, rel_emb, training)
output = tree.output
return output, loss
Why does neural network learning slow down as the error gets lower?
The reasons for the slowdown are not fully understood, but we have some basic ideas.
For classifiers, most training examples start out as incorrectly classified. Over time, more of them become correctly classified. Early in learning, you might have a nearly 100% error rate, so every example in the minibatch contributes to learning. Late in learning, you might have nearly a 0% error rate, so almost none of the examples in the minibatch contribute to learning. This problem can be resolved to some extent by using hard example mining or importance sampling. Both of these are just techniques for training on more difficult examples more often.
There are other more complicated reasons. One of them is that the condition number of the Hessian tends to worsen a lot as learning progresses, so that the optimal step size becomes smaller and smaller.