I have written a simple digit recognition neural network and it does not seem to be learning. It has 2 hidden layers and uses the softmax activation function and whenever it runs it seems to converge on always picking 0. I would just like to check if the code for updating the weight matrices is correct
from cmath import exp
import numpy as np
from tensorflow.keras.datasets import mnist
class Run:
def __init__(self, num_inputs, num_hidden1, num_hidden2, num_outputs):
self.num_inputs = num_inputs
self.num_hidden1 = num_hidden1
self.num_hidden2 = num_hidden2
self.num_outputs = num_outputs
self.learningrate = 0.001
self.get = GetInput()
self.count = 0
self.countTrue = 0
self.count1 = 0
self.sum = 0
self.past = 0
self.inputLayer = Layer(num_inputs, num_hidden1)
self.hiddenLayer1 = Layer(num_hidden1, num_hidden2)
self.hiddenLayer2 = Layer(num_hidden2, num_outputs)
def getinput(self):
input, expected = self.get.get(self.count)
self.count +=1
self.count1 += 1
return input, expected
def runNN(self, input):
self.inputLayer.calc_output_1(input)
self.hiddenLayer1.calc_output_1(self.inputLayer.fin_outputs)
self.hiddenLayer2.calc_output_1(self.hiddenLayer1.fin_outputs)
self.NN_Output = self.hiddenLayer2.fin_outputs
def calculate_cost(self, expected):
error = 0
for i in range(self.num_outputs):
error += (self.NN_Output[i][0] - expected[i][0])**2 / self.num_outputs
list = []
list1 = []
for each in self.NN_Output:
list.append(float(each[0]))
self.sum += list.index(max(list))
for each in expected:
list1.append(float(each[0]))
if list1.index(max(list1)) == list.index(max(list)):
self.countTrue += 1
print(round(self.countTrue/self.count1, 3))
if self.count1 % 1000 == 0:
print(self.sum / 1000)
print('')
self.past = 0
self.sum = 0
self.count1 = 0
self.countTrue = 0
return error
def calc_new_hidden1(self, expected):
delta = self.NN_Output - expected
change = np.multiply(delta, self.hiddenLayer2.fin_outputs)
change_weights = np.matmul(change, np.transpose(self.hiddenLayer2.inputs)) * self.learningrate
change_bias = change * self.learningrate
self.hiddenLayer2.amend(change_weights, change_bias)
def calc_new_hidden2(self, expected):
delta = self.NN_Output - expected
change = np.multiply(np.matmul(np.transpose(self.hiddenLayer2.getter()[0]), delta), self.hiddenLayer1.fin_outputs)
change_weights = np.matmul(change, np.transpose(self.hiddenLayer1.inputs)) * self.learningrate
change_bias = change * self.learningrate
self.hiddenLayer1.amend(change_weights, change_bias)
def calc_new_input(self, expected):
delta = (self.NN_Output - expected)
change = np.multiply(np.matmul(np.transpose(self.hiddenLayer1.getter()[0]), np.matmul(np.transpose(self.hiddenLayer2.getter()[0]), delta)), self.inputLayer.fin_outputs)
change_weights = np.matmul(change, np.transpose(self.inputLayer.inputs)) * self.learningrate
change_bias = change * self.learningrate
self.inputLayer.amend(change_weights, change_bias)
class Layer:
def __init__(self, num_inputs, num_outputs):
self.__weights = np.random.uniform(-0.5, 0.5, (num_outputs, num_inputs))
self.__bias = np.matrix([[float(0)] for x in range(num_outputs)])
def calc_output_1(self, inputs):
self.inputs = inputs
self.__output_1 = np.matmul(self.__weights, inputs) + self.__bias
self.softmax()
def softmax(self):
sum = 0
for each in self.__output_1:
sum += np.exp(float(each[0]))
list1 = []
for each in self.__output_1:
list1.append([float(np.exp(each[0])/sum)])
self.fin_outputs = np.matrix(list1)
def amend(self, change_weights, change_bias):
self.__weights -= change_weights
self.__bias -= change_bias
def getter(self):
return self.__weights, self.__bias
class GetInput:
def __init__(self):
(self.X_train, self.Y_train), (X_test, Y_test) = mnist.load_data()
self.X_train = self.X_train.reshape(self.X_train.shape[0], 28, 28, 1)
x_test = X_test.reshape(X_test.shape[0], 28, 28, 1)
def get(self, i):
list = []
newPhoto = self.X_train[i].astype('float32')/255
for each in newPhoto:
for n in each:
list.append([float(n)])
input = np.matrix(list)
list = []
expect = self.Y_train[i]
for each in range(10):
if each == expect:
list.append([1])
else:
list.append([0])
expected = np.matrix(list)
return input, expected
if __name__ == "__main__":
initiate = Run(784, 600, 400, 10)
while True:
input, expected = initiate.getinput()
initiate.runNN(input)
initiate.calculate_cost(expected)
initiate.calc_new_hidden1(expected)
initiate.calc_new_hidden2(expected)
initiate.calc_new_input(expected)`
Here is the code I have created. The maths for updating the weight matrices is in the Run class: calc_new_hidden1(), calc_new_hidden2(), calc_new_inputs()
I think the error will probably be in the calc_new_inputs() function
Related
The error is saying this is not the case - your InputLayer object does not have an attribute Input but I don't know how to fix it thank you for your helps
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
import random
class MultiLayerPerceptron(BaseEstimator, ClassifierMixin):
def __init__(self, params=None):
if (params == None):
self.inputLayer = 14 # Input Layer
self.hiddenLayer = 100 # Hidden Layer
self.outputLayer = 2 # Outpuy Layer
self.learningRate = 0.005 # Learning rate
self.max_epochs = 600 # Epochs
self.iasHiddenValue = -1 # Bias HiddenLayer
self.BiasOutputValue = -1 # Bias OutputLayer
self.activation = self.ativacao['sigmoid'] # Activation function
self.deriv = self.derivada['sigmoid']
else:
self.inputLayer = params['InputLayer']
self.hiddenLayer = params['HiddenLayer']
self.OutputLayer = params['OutputLayer']
self.learningRate = params['LearningRate']
self.max_epochs = params['Epocas']
self.BiasHiddenValue = params['BiasHiddenValue']
self.BiasOutputValue = params['BiasOutputValue']
self.activation = self.ativacao[params['ActivationFunction']]
self.deriv = self.derivada[params['ActivationFunction']]
'Starting Bias and Weights'
self.WEIGHT_hidden = self.starting_weights(self.hiddenLayer, self.inputLayer)
self.WEIGHT_output = self.starting_weights(self.OutputLayer, self.hiddenLayer)
self.BIAS_hidden = np.array([self.BiasHiddenValue for i in range(self.hiddenLayer)])
self.BIAS_output = np.array([self.BiasOutputValue for i in range(self.OutputLayer)])
self.classes_number = 2
pass
def starting_weights(self, x, y):
return [[2 * random.random() - 1 for i in range(x)] for j in range(y)]
ativacao = {
'sigmoid': (lambda x: 1/(1 + np.exp(-x))),
'tanh': (lambda x: np.tanh(x)),
'Relu': (lambda x: x*(x > 0)),
}
derivada = {
'sigmoid': (lambda x: x*(1-x)),
'tanh': (lambda x: 1-x**2),
'Relu': (lambda x: 1 * (x>0))
}
def Backpropagation_Algorithm(self, x):
DELTA_output = []
'Stage 1 - Error: OutputLayer'
ERROR_output = self.output - self.OUTPUT_L2
DELTA_output = ((-1)*(ERROR_output) * self.deriv(self.OUTPUT_L2))
arrayStore = []
'Stage 2 - Update weights OutputLayer and HiddenLayer'
for i in range(self.hiddenLayer):
for j in range(self.OutputLayer):
self.WEIGHT_output[i][j] -= (self.learningRate * (DELTA_output[j] * self.OUTPUT_L1[i]))
self.BIAS_output[j] -= (self.learningRate * DELTA_output[j])
'Stage 3 - Error: HiddenLayer'
delta_hidden = np.matmul(self.WEIGHT_output, DELTA_output)* self.deriv(self.OUTPUT_L1)
'Stage 4 - Update weights HiddenLayer and InputLayer(x)'
for i in range(self.OutputLayer):
for j in range(self.hiddenLayer):
self.WEIGHT_hidden[i][j] -= (self.learningRate * (delta_hidden[j] * x[i]))
self.BIAS_hidden[j] -= (self.learningRate * delta_hidden[j])
def show_err_graphic(self,v_erro,v_epoca):
plt.figure(figsize=(4,14))
plt.plot(v_epoca, v_erro, "m-",color="b", marker=11)
plt.xlabel("Number of Epochs")
plt.ylabel("Squared error (MSE) ");
plt.title("Error Minimization")
plt.show()
def predict(self, X, y):
'Returns the predictions for every element of X'
my_predictions = []
'Forward Propagation'
forward = np.matmul(X,self.WEIGHT_hidden) + self.BIAS_hidden
forward = np.matmul(forward, self.WEIGHT_output) + self.BIAS_output
for i in forward:
my_predictions.append(max(enumerate(i), key=lambda x:x[1])[0])
array_score = []
for i in range(len(my_predictions)):
if my_predictions[i] == 0:
array_score.append([i, 'No', my_predictions[i], y[i]])
elif my_predictions[i] == 1:
array_score.append([i, 'Yes', my_predictions[i], y[i]])
dataframe = pd.DataFrame(array_score, columns=['_id', 'class', 'output', 'hoped_output'])
return my_predictions, dataframe
def fit(self, X, y):
count_epoch = 1
total_error = 0
n = len(X);
epoch_array = []
error_array = []
W0 = []
W1 = []
while(count_epoch <= self.max_epochs):
for idx,inputs in enumerate(X):
self.output = np.zeros(self.classes_number)
'Stage 1 - (Forward Propagation)'
self.OUTPUT_L1 = self.activation((np.dot(self.InputLayer, self.WEIGHT_hidden) + self.BIAS_hidden.T))
self.OUTPUT_L2 = self.activation((np.dot(self.OUTPUT_L1, self.WEIGHT_output) + self.BIAS_output.T))
'Stage 2 - One-Hot-Encoding'
if(y[idx] == 0):
self.output = np.array([1,0,0]) #Class1 {1,0,0}
elif(y[idx] == 1):
self.output = np.array([0,1,0]) #Class2 {0,1,0}
square_error = 0
for i in range(self.OutputLayer):
erro = (self.output[i] - self.OUTPUT_L2[i])**2
square_error = (square_error + (0.05 * erro))
total_error = total_error + square_error
'Backpropagation : Update Weights'
self.Backpropagation_Algorithm(inputs)
total_error = (total_error / n)
if((count_epoch % 50 == 0)or(count_epoch == 1)):
print("Epoch ", count_epoch, "- Total Error: ",total_error)
error_array.append(total_error)
epoch_array.append(count_epoch)
W0.append(self.WEIGHT_hidden)
W1.append(self.WEIGHT_output)
count_epoch += 1
self.show_err_graphic(error_array,epoch_array)
plt.plot(W0[0])
plt.title('Weight Hidden update during training')
plt.legend(['neuron1', 'neuron2', 'neuron3', 'neuron4', 'neuron5'])
plt.ylabel('Value Weight')
plt.show()
plt.plot(W1[0])
plt.title('Weight Output update during training')
plt.legend(['neuron1', 'neuron2', 'neuron3'])
plt.ylabel('Value Weight')
plt.show()
return self
dictionary = {'InputLayer':14, 'HiddenLayer':100, 'OutputLayer':2,
'Epocas':700, 'LearningRate':0.005,'BiasHiddenValue':-1,
'BiasOutputValue':-1, 'ActivationFunction':'sigmoid'}
Perceptron = MultiLayerPerceptron(dictionary)
Perceptron.fit(train_X,train_y)
AttributeError: 'MultiLayerPerceptron' object has no attribute 'InputLayer'
I want after splitting the data to work on MLP Classifier but I found this error "AttributeError: 'MultiLayerPerceptron' object has no attribute 'InputLayer'
I am not good with PyTorch so I would appreciate some help in converting this code to TensorFlow. I have trying going through some articles but it was a bit intensive so a little explanation would also be worthwhile so that the whole community can benefit from this.
"""
import torch
import copy
class PESG(torch.optim.Optimizer):
def __init__(self, model, a=None, b=None, alpha=None, imratio=0.1, margin=1.0, lr=0.1, gamma=500, clip_value=1.0, weight_decay=1e-5, **kwargs):
assert a is not None, 'Found no variable a!'
assert b is not None, 'Found no variable b!'
assert alpha is not None, 'Found no variable alpha!'
self.p = imratio
self.margin = margin
self.model = model
self.lr = lr
self.gamma = gamma
self.clip_value = clip_value
self.weight_decay = weight_decay
self.a = a
self.b = b
self.alpha = alpha
# TODO!
self.model_ref = []
for var in list(self.model.parameters())+[self.a, self.b]:
self.model_ref.append(torch.empty(var.shape).normal_(mean=0, std=0.01).cuda())
self.model_acc = []
for var in list(self.model.parameters())+[self.a, self.b]:
self.model_acc.append(torch.zeros(var.shape, dtype=torch.float32, device="cuda", requires_grad=False).cuda())
self.T = 0
self.step_counts = 0
def get_parameters(params):
for p in params:
yield p
self.params = get_parameters(list(model.parameters())+[a,b])
self.defaults = dict(lr=self.lr,
margin=margin,
gamma=gamma,
p=imratio,
a=self.a,
b=self.b,
alpha=self.alpha,
clip_value=clip_value,
weight_decay=weight_decay,
model_ref = self.model_ref,
model_acc = self.model_acc
)
super(PESG, self).__init__(self.params, self.defaults)
#property
def optim_steps(self):
return self.step_counts
def update_lr(self, lr):
self.param_groups[0]['lr']=lr
#torch.no_grad()
def step(self):
"""Performs a single optimization step.
"""
for group in self.param_groups:
weight_decay = group['weight_decay']
clip_value = group['clip_value']
self.lr = group['lr']
p = group['p']
gamma = group['gamma']
m = group['margin']
model_ref = group['model_ref']
model_acc = group['model_acc']
a = group['a']
b = group['b']
alpha = group['alpha']
# updates
for i, p in enumerate(group['params']):
if p.grad is None:
continue
p.data = p.data - group['lr']*( torch.clamp(p.grad.data , -clip_value, clip_value) + 1/gamma*(p.data - model_ref[i].data) ) - group['lr']*weight_decay*p.data
model_acc[i].data = model_acc[i].data + p.data
alpha.data = alpha.data + group['lr']*(2*(m + b.data - a.data)-2*alpha.data)
alpha.data = torch.clamp(alpha.data, 0, 999)
self.T += 1
self.step_counts += 1
def zero_grad(self):
self.model.zero_grad()
self.a.grad = None
self.b.grad = None
self.alpha.grad =None
def update_regularizer(self, decay_factor=None):
if decay_factor != None:
self.param_groups[0]['lr'] = self.param_groups[0]['lr']/decay_factor
print ('Reducing learning rate to %.5f # T=%s!'%(self.param_groups[0]['lr'], self.T))
print ('Updating regularizer # T=%s!'%(self.T))
for i, param in enumerate(self.model_ref):
self.model_ref[i].data = self.model_acc[i].data/self.T
for i, param in enumerate(self.model_acc):
self.model_acc[i].data = torch.zeros(param.shape, dtype=torch.float32, device="cuda", requires_grad=False).cuda()
self.T = 0
"""
I could use a second set of eyes on my neural network.
This is the mnist number recognition project.
I'm not sure where the issue is.
I previously implemented the ai with tensor flow successfully.
I'm not looking to use an api as a solution.
I would appreciate any help anyone can give.
Here's the project on github, it's only an init file and then the neural_network.
https://github.com/nealchawn/ai_trial_2
class NeuralNetwork(object):
def __init__(self, sizes):
self.activations = []
self.outputs = []
self.weights = []
self.biases = []
self.sizes = sizes
self.set_random_weights()
self.set_random_biases()
def set_random_weights(self):
for layer_index, layer_size in enumerate(self.sizes[1:], start=1):
layer_weights = []
for size in range(layer_size):
for size in range(self.sizes[layer_index-1]):
layer_weights.append(random.uniform(-5.0, 5.0))
self.weights.append(layer_weights)
def set_random_biases(self):
total_biases = 0
# add extra zero bias to help future indexing
#self.biases.append(0)
for index, size in enumerate(self.sizes[0:-1], start=1):
total_biases += 1
for x in range(total_biases):
self.biases.append(random.uniform(-5.0, 5.0))
def train_network(self, training_data, training_labels):
if len(training_data) != len(training_labels):
print("Error data and labels must be the same length")
data = list(zip(training_data, training_labels))
self.sgd(data)
def sgd(self, data, mini_batch_size = 1000):
# first we'll create batches of training data
n = len(data)
data_batches = [
data[k:k + mini_batch_size]
for k in range(0, n, mini_batch_size)
]
print(len(data_batches))
i = 0
for mini_batch in data_batches:
print("Batch: " + str(i))
i += 1
self.update_mini_batch(mini_batch)
self.network_outputs()
print("Finished All training data!")
def update_mini_batch(self, mini_data_batch):
weight_gradients = []
bias_gradients = []
i = 0
for training_input in mini_data_batch:
training_object, training_label = training_input
self.feedforward(training_object)
weights_gradient, bias_gradient = self.backpropogation(training_label)
weight_gradients.append(weights_gradient)
bias_gradients.append(bias_gradient)
# average gradients
weights_gradient = np.average(weight_gradients,axis=0)
biases_gradient = np.average(bias_gradients, axis=0)
# may need to convert to list
weights_gradient_list = []
for weight_gradient in weights_gradient:
weights_gradient_list.append(weight_gradient.tolist())
#weights_gradient = weights_gradient.tolist()
biases_gradient = biases_gradient.tolist()
for x in range(len(self.biases)):
self.biases[x] -= 0.1*biases_gradient[x]
weight_gradient_index = 0
for layer_index, layer_weights in enumerate(self.weights, start=0):
for weight_index, weight in enumerate(layer_weights):
self.weights[layer_index][weight_index] = weight - 0.1*weights_gradient_list[layer_index][weight_index]
weight_gradient_index += 1
def feedforward(self, training_object):
# set inputs
self.outputs = []
self.activations = []
temp_activations = []
for index in range(self.sizes[0]):
temp_activations.append(training_object[index])
self.activations.append(temp_activations)
for layer_index, layer_size in enumerate(self.sizes[1:], start=0):
layer_weights = self.weights[layer_index]
layer_inputs = self.activations[layer_index]
weight_index = 0
layer_outputs = []
layer_activations = []
for node_index in range(layer_size):
node_weights = []
# get node weights
#print(f"layer size: {layer_size}, previous_layer_size: {self.sizes[layer_index]}, layer weights: {len(layer_weights)}")
for x in range(self.sizes[layer_index]):
node_weights.append(layer_weights[weight_index])
weight_index += 1
output = 0
for indx in range(len(node_weights)):
output += layer_inputs[indx]*node_weights[indx]
output = output + self.biases[layer_index]
layer_outputs.append(output)
layer_activations.append(self.sigmoid(output))
self.outputs.append(layer_outputs)
self.activations.append(layer_activations)
def backpropogation(self, training_label):
costs = []
output_layer_activations = self.activations[-1]
output_layer_outputs = self.outputs[-1]
correct_labels = self.translate_label_to_array(training_label)
costs.append(self.compute_cost_derivative(correct_labels, output_layer_activations))
for cost_index, cost in enumerate(costs[0]):
costs[0][cost_index] = cost*self.sigmoid_prime(output_layer_outputs[cost_index])
# calculate costs for layers
for layer_index, layer_size in enumerate(self.sizes[::-1][1:-1], start=1):
layer_costs = []
layer_weights = self.weights[-layer_index]
layer_outputs = self.outputs[-(layer_index+1)]
previous_layer_costs = costs[layer_index-1]
next_layer_size = self.sizes[::-1][1:][layer_index]
layer_weights_formatted = []
for x in range(layer_size):
layer_weights_formatted.append([])
for weight_index, weight in enumerate(layer_weights, start=0):
#print(f"weight index:{weight_index % next_layer_size} layer_index: {weight_index}")
layer_weights_formatted[weight_index%layer_size].append(layer_weights[weight_index])
#print(f"next_layer_size:{layer_size} costs: {len(previous_layer_costs)}, layer_weights_formatted: {layer_weights_formatted}")
for x in range(layer_size):
node_cost = 0
for y, cost in enumerate(previous_layer_costs,start=0):
node_cost += layer_weights_formatted[x][y]*cost
layer_costs.append(node_cost)
# layer_costs same order as next layer's activations
for cost_index, cost in enumerate(layer_costs):
layer_costs[cost_index] = cost * self.sigmoid_prime(layer_outputs[cost_index])
costs.append(layer_costs)
# calculate weight errors
weight_errors = []
bias_errors = []
for layer_index, layer_costs in enumerate(costs[::-1]):
layer_activations = self.activations[layer_index]
layer_weight_errors = []
for cost_index, cost in enumerate(layer_costs,start=0):
for activation in layer_activations:
layer_weight_errors.append(activation * cost)
weight_errors.append(np.array(layer_weight_errors))
bias_errors.append(sum(layer_costs))
return weight_errors, bias_errors
# conversion tool
def translate_label_to_array(self, y):
translated_label = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
translated_label[y] = 1
return np.array(translated_label)
# output tools
def network_outputs(self):
print("Output layer: ")
for x in range(self.sizes[-1]):
print("node " + str(x) + ": " + str(self.activations[-1][x]))
def total_activations(self):
print(len(self.activations))
def compute_cost_derivative(self, y, output_activations):
"""Return the vector of partial derivatives \partial C_x /
\partial a for the output activations."""
return (output_activations - y)
def sigmoid(self, z):
""""The sigmoid function."""
return (1.0 / (1.0 + np.exp(-z)))
def sigmoid_prime(self, z):
return (self.sigmoid(z) * (1 - self.sigmoid(z)))
My model training speed becomes slower over time. Every epoch take longer time to train.
Here is the full source code with my preprocess sentiment tree bank data (put glove.840B.300d.txt into data/glove).
Install some python packages:
pip install meowlogtool
pip install tqdm
Command to run:
python sentiment.py --emblr 0 --rel_dim 0 --tag_dim 0 --optim adagrad --name basic --lr 0.05 --wd 1e-4 --at_hid_dim 0
Model source code for you to read
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable as Var
import utils
import Constants
from model import SentimentModule
from embedding_model import EmbeddingModel
class SimpleGRU(nn.Module):
"""
w[i] : (300, 1)
h[i] : (150, 1)
p[i] : (20, 1)
r[i] : (20, 1)
k[i] : (150, 1)
x[i] : (20 + 150 + 300 + 20 = 490, 1) (490, 1)
Uz, Ur, Uh : (150, 150) => 67500 => (450, 450)
Wz, Wr, Wh : (150, 20 + 150 + 300 + 20) (150, 490)
"""
def __init__(self, cuda, in_dim, hid_dim):
super(SimpleGRU, self).__init__()
self.cudaFlag = cuda
self.Uz = nn.Linear(hid_dim, hid_dim)
self.Ur = nn.Linear(hid_dim, hid_dim)
self.Uh = nn.Linear(hid_dim, hid_dim)
self.Wz = nn.Linear(in_dim, hid_dim)
self.Wr = nn.Linear(in_dim, hid_dim)
self.Wh = nn.Linear(in_dim, hid_dim)
if self.cudaFlag:
self.Uz = self.Uz.cuda()
self.Ur = self.Uz.cuda()
self.Uh = self.Uz.cuda()
self.Wz = self.Wz.cuda()
self.Wr = self.Wr.cuda()
self.Wh = self.Wh.cuda()
def forward(self, x, h_prev):
"""
Simple-GRU(compress_x[v], h[t-1]) :
z[t] := s(Wz *compress_x[t]+ Uz * h[t-1] + bz)
r[t] := s(Wr * compress_x[t] + Ur * h[t-1] + br)
h_temp[t] := g(Wh * compress_x[t] + Uh * h[t-1] + bh)
h[t] := r[t] .* h[t-1] + (1 - z[t]) .* h_temp[t]
return h[t]
:param x: compress_x[t]
:param h_prev: h[t-1]
:return:
"""
z = F.sigmoid(self.Wz(x) + self.Uz(h_prev))
r = F.sigmoid(self.Wr(x) + self.Ur(h_prev))
h_temp = F.tanh(self.Wh(x) + self.Uh(h_prev))
h = r*h_prev + (1-z)*h_temp
return h
class TreeSimpleGRU(nn.Module):
def __init__(self, cuda, word_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, criterion, leaf_h = None):
super(TreeSimpleGRU, self).__init__()
self.cudaFlag = cuda
# self.gru_cell = nn.GRUCell(word_dim + tag_dim, mem_dim)
self.gru_cell = SimpleGRU(self.cudaFlag, word_dim+tag_dim, mem_dim)
self.gru_at = GRU_AT(self.cudaFlag, word_dim + tag_dim + rel_dim + mem_dim, at_hid_dim ,mem_dim)
self.mem_dim = mem_dim
self.in_dim = word_dim
self.tag_dim = tag_dim
self.rel_dim = rel_dim
self.leaf_h = leaf_h # init h for leaf node
if self.leaf_h == None:
self.leaf_h = Var(torch.rand(1, self.mem_dim))
torch.save(self.leaf_h, 'leaf_h.pth')
if self.cudaFlag:
self.leaf_h = self.leaf_h.cuda()
self.criterion = criterion
self.output_module = None
def getParameters(self):
"""
Get flatParameters
note that getParameters and parameters is not equal in this case
getParameters do not get parameters of output module
:return: 1d tensor
"""
params = []
for m in [self.gru_cell, self.gru_at]:
# we do not get param of output module
l = list(m.parameters())
params.extend(l)
one_dim = [p.view(p.numel()) for p in params]
params = F.torch.cat(one_dim)
return params
def set_output_module(self, output_module):
self.output_module = output_module
def forward(self, tree, w_emb, tag_emb, rel_emb, training = False):
loss = Var(torch.zeros(1)) # init zero loss
if self.cudaFlag:
loss = loss.cuda()
for idx in xrange(tree.num_children):
_, child_loss = self.forward(tree.children[idx], w_emb, tag_emb, rel_emb, training)
loss = loss + child_loss
if tree.num_children > 0:
child_rels, child_k = self.get_child_state(tree, rel_emb)
if self.tag_dim > 0:
tree.state = self.node_forward(w_emb[tree.idx - 1], tag_emb[tree.idx -1], child_rels, child_k)
else:
tree.state = self.node_forward(w_emb[tree.idx - 1], None, child_rels, child_k)
elif tree.num_children == 0:
if self.tag_dim > 0:
tree.state = self.leaf_forward(w_emb[tree.idx - 1], tag_emb[tree.idx -1])
else:
tree.state = self.leaf_forward(w_emb[tree.idx - 1], None)
if self.output_module != None:
output = self.output_module.forward(tree.state, training)
tree.output = output
if training and tree.gold_label != None:
target = Var(utils.map_label_to_target_sentiment(tree.gold_label))
if self.cudaFlag:
target = target.cuda()
loss = loss + self.criterion(output, target)
return tree.state, loss
def leaf_forward(self, word_emb, tag_emb):
"""
Forward function for leaf node
:param word_emb: word embedding of current node u
:param tag_emb: tag embedding of current node u
:return: k of current node u
"""
h = self.leaf_h
if self.cudaFlag:
h = h.cuda()
if self.tag_dim > 0:
x = F.torch.cat([word_emb, tag_emb], 1)
else:
x = word_emb
k = self.gru_cell(x, h)
return k
def node_forward(self, word_emb, tag_emb, child_rels, child_k):
"""
Foward function for inner node
:param word_emb: word embedding of current node u
:param tag_emb: tag embedding of current node u
:param child_rels (tensor): rels embedding of child node v
:param child_k (tensor): k of child node v
:return:
"""
n_child = child_k.size(0)
h = Var(torch.zeros(1, self.mem_dim))
if self.cudaFlag:
h = h.cuda()
for i in range(0, n_child):
k = child_k[i]
x_list = [word_emb, k]
if self.rel_dim >0:
rel = child_rels[i]
x_list.append(rel)
if self.tag_dim > 0:
x_list.append(tag_emb)
x = F.torch.cat(x_list, 1)
h = self.gru_at(x, h)
k = h
return k
def get_child_state(self, tree, rels_emb):
"""
Get child rels, get child k
:param tree: tree we need to get child
:param rels_emb (tensor):
:return:
"""
if tree.num_children == 0:
assert False # never get here
else:
child_k = Var(torch.Tensor(tree.num_children, 1, self.mem_dim))
if self.rel_dim>0:
child_rels = Var(torch.Tensor(tree.num_children, 1, self.rel_dim))
else:
child_rels = None
if self.cudaFlag:
child_k = child_k.cuda()
if self.rel_dim > 0:
child_rels = child_rels.cuda()
for idx in xrange(tree.num_children):
child_k[idx] = tree.children[idx].state
if self.rel_dim > 0:
child_rels[idx] = rels_emb[tree.children[idx].idx - 1]
return child_rels, child_k
class AT(nn.Module):
"""
AT(compress_x[v]) := sigmoid(Wa * tanh(Wb * compress_x[v] + bb) + ba)
"""
def __init__(self, cuda, in_dim, hid_dim):
super(AT, self).__init__()
self.cudaFlag = cuda
self.in_dim = in_dim
self.hid_dim = hid_dim
self.Wa = nn.Linear(hid_dim, 1)
self.Wb = nn.Linear(in_dim, hid_dim)
if self.cudaFlag:
self.Wa = self.Wa.cuda()
self.Wb = self.Wb.cuda()
def forward(self, x):
out = F.sigmoid(self.Wa(F.tanh(self.Wb(x))))
return out
class GRU_AT(nn.Module):
def __init__(self, cuda, in_dim, at_hid_dim ,mem_dim):
super(GRU_AT, self).__init__()
self.cudaFlag = cuda
self.in_dim = in_dim
self.mem_dim = mem_dim
self.at_hid_dim = at_hid_dim
if at_hid_dim > 0:
self.at = AT(cuda, in_dim, at_hid_dim)
self.gru_cell = SimpleGRU(self.cudaFlag, in_dim, mem_dim)
if self.cudaFlag:
if at_hid_dim > 0:
self.at = self.at.cuda()
self.gru_cell = self.gru_cell.cuda()
def forward(self, x, h_prev):
"""
:param x:
:param h_prev:
:return: a * m + (1 - a) * h[t-1]
"""
m = self.gru_cell(x, h_prev)
if self.at_hid_dim > 0:
a = self.at.forward(x)
h = torch.mm(a, m) + torch.mm((1-a), h_prev)
else:
h = m
return h
class TreeGRUSentiment(nn.Module):
def __init__(self, cuda, in_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, num_classes, criterion):
super(TreeGRUSentiment, self).__init__()
self.cudaFlag = cuda
self.tree_module = TreeSimpleGRU(cuda, in_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, criterion)
self.output_module = SentimentModule(cuda, mem_dim, num_classes, dropout=True)
self.tree_module.set_output_module(self.output_module)
def get_tree_parameters(self):
return self.tree_module.getParameters()
def forward(self, tree, sent_emb, tag_emb, rel_emb, training = False):
# sent_emb = F.torch.unsqueeze(self.word_embedding.forward(sent_inputs), 1)
# tag_emb = F.torch.unsqueeze(self.tag_emb.forward(tag_inputs), 1)
# rel_emb = F.torch.unsqueeze(self.rel_emb.forward(rel_inputs), 1)
# sent_emb, tag_emb, rel_emb = self.embedding_model(sent_inputs, tag_inputs, rel_inputs)
tree_state, loss = self.tree_module(tree, sent_emb, tag_emb, rel_emb, training)
output = tree.output
return output, loss
Why does neural network learning slow down as the error gets lower?
The reasons for the slowdown are not fully understood, but we have some basic ideas.
For classifiers, most training examples start out as incorrectly classified. Over time, more of them become correctly classified. Early in learning, you might have a nearly 100% error rate, so every example in the minibatch contributes to learning. Late in learning, you might have nearly a 0% error rate, so almost none of the examples in the minibatch contribute to learning. This problem can be resolved to some extent by using hard example mining or importance sampling. Both of these are just techniques for training on more difficult examples more often.
There are other more complicated reasons. One of them is that the condition number of the Hessian tends to worsen a lot as learning progresses, so that the optimal step size becomes smaller and smaller.
I'm using a neural network with 1 hidden layer (2 neurons) and 1 output neuron for solving the XOR problem.
Here's the code I'm using. It contains the main run file xor.py which creates a model defined in model.py. Each neuron is defined by the class Neuron in neuron.py
xor.py
from model import Model
import numpy as np
inputs = [[0,0], [0,1], [1,0], [1,1]]
outputs = [0, 1, 1, 0]
m = Model()
m.train(inputs, outputs)
for i in inputs:
p = m.predict(i)
print str(i) + ' => ' + str(p)
model.py
from neuron import HiddenNeuron, OutputNeuron
import numpy as np
class Model(object):
def __init__(self):
self.hidden = [HiddenNeuron(2) for i in range(2)]
self.output = OutputNeuron(2)
def predict(self, input):
temp = []
for x in range(2):
self.hidden[x].forward(input)
temp.append(self.hidden[x].out)
self.output.forward(temp)
return self.output.out
def train(self, inputs, targets):
it = 0
i = 0
size = len(inputs)
while it < 4:
if i == size:
i = 0
feature = inputs[i]
print '\n\nFeature : ' + str(feature) + '\n'
print 'Output weights : ' + str(self.output.weights)
print 'Hidden 1 weights : ' + str(self.hidden[0].weights)
print 'Hidden 2 weights : ' + str(self.hidden[1].weights)
temp = []
for x in range(2):
self.hidden[x].forward(feature)
temp.append(self.hidden[x].out)
self.output.forward(temp)
self.output.backward(targets[i])
deltas = []
deltas.append(self.output.error)
weights = []
weights.append([self.output.weights[0]])
weights.append([self.output.weights[1]])
for x in range(2):
self.hidden[x].backward(deltas, weights[x])
for x in range(2):
self.hidden[x].update(feature)
self.output.update(temp)
it += 1
i += 1
neuron.py
import numpy as np
from random import uniform
class Neuron(object):
def activation(self, fx):
return 1/(1 + np.exp(-fx))
def __init__(self, dim, lrate):
self.dim = dim
self.weights = np.empty([dim])
self.weights = [uniform(0,1) for x in range(dim)]
self.bias = uniform(0, 1)
self.lrate = lrate
self.out = None
self.error = None
def update(self, input):
j = 0
for i in input:
delta = self.lrate * self.error
self.weights[j] -= (delta*i)
self.bias += delta
j+=1
def forward(self, input):
j = 0
sum = self.bias
for f in input:
sum += f * self.weights[j]
j+=1
self.out = self.activation(sum)
def backward(self):
pass
class OutputNeuron(Neuron):
def __init__(self, dim, lrate=0.2):
super(OutputNeuron, self).__init__(dim, lrate)
def backward(self, target):
self.error = self.out * (1 - self.out) * (self.out - target)
class HiddenNeuron(Neuron):
def __init__(self, dim, lrate=0.2):
super(HiddenNeuron, self).__init__(dim, lrate)
def backward(self, deltas, weights):
sum = 0
size = len(deltas)
for x in range(size):
sum += deltas[x] * weights[x]
self.error = self.out * (1 - self.out) * sum
The final output is
[0, 0] => 0.999999991272
[0, 1] => 0.999999970788
[1, 0] => 0.999999952345
[1, 1] => 0.999715564446
I think the error is in neuron.py in the function update(). If you change self.bias += delta to self.bias -= delta it should work, at least it does for me. Otherwise you would modify your biases to ascend towards a maximum on the error surface.
Below you can see the output after 100000 training epochs.
[0, 0] => 0.0174550173543
[0, 1] => 0.983899954593
[1, 0] => 0.983895388655
[1, 1] => 0.0164172288168