The code below gives about 95% accuracy if I do not use dropout in training.
The accuracy drops to 11% if I use dropout.
The network is built using NumPy.
I have used a Neural_Network class that contains a list of layer objects.
The last layer has sigmoid activation and the rest have ReLU.
The code is:
import numpy as np
import idx2numpy as idx
import matplotlib.pyplot as plt
np.random.seed(0)
img = r"C:\Users\Aaditya\OneDrive\Documents\ML\train-image"
lbl = r'C:\Users\Aaditya\OneDrive\Documents\ML\train-labels-idx1-ubyte'
t_lbl = r'C:\Users\Aaditya\OneDrive\Documents\ML\t10k-labels.idx1-ubyte'
t_img = r'C:\Users\Aaditya\OneDrive\Documents\ML\t10k-images.idx3-ubyte'
image = idx.convert_from_file(img)
iput = np.reshape(image, (60000,784))/255
otput = np.eye(10)[idx.convert_from_file(lbl)]
test_image = idx.convert_from_file(t_img)
test_input = np.reshape(test_image, (10000,784))/255
test_output = idx.convert_from_file(t_lbl)
def sigmoid(x):
sigmoid = 1/(1+ np.exp(-x))
return sigmoid
def tanh(x):
return np.tanh(x)
def relu(x):
return np.where(x>0,x,0)
def reluprime(x):
return (x>0).astype(x.dtype)
def sigmoid_prime(x):
return sigmoid(x)*(1-sigmoid(x))
def tanh_prime(x):
return 1 - tanh(x)**2
class Layer_Dense:
def __init__(self,n_inputs,n_neurons,activation="sigmoid",keep_prob=1):
self.n_neurons=n_neurons
if activation == "sigmoid":
self.activation = sigmoid
self.a_prime = sigmoid_prime
elif activation == "tanh":
self.activation = tanh
self.a_prime = tanh_prime
else :
self.activation = relu
self.a_prime = reluprime
self.keep_prob = keep_prob
self.weights = np.random.randn(n_inputs ,n_neurons)*0.1
self.biases = np.random.randn(1,n_neurons)*0.1
def cal_output(self,input,train=False):
output = np.array(np.dot(input,self.weights) + self.biases,dtype="float128")
if train == True:
D = np.random.randn(1,self.n_neurons)
self.D = (D>self.keep_prob).astype(int)
output = output * self.D
return output
def forward(self,input):
return self.activation(self.cal_output(input))
def back_propagate(self,delta,ap,lr=1,keep_prob=1):
dz = delta
self.weights -= 0.001*lr*(np.dot(ap.T,dz)*self.D)
self.biases -= 0.001*lr*(np.sum(dz,axis=0,keepdims=True)*self.D)
return np.multiply(np.dot(dz,self.weights.T),(1-ap**2))
class Neural_Network:
def __init__(self,input,output):
self.input=input
self.output=output
self.layers = []
def Add_layer(self,n_neurons,activation="relu",keepprob=1):
if len(self.layers) != 0:
newL = Layer_Dense(self.layers[-1].n_neurons,n_neurons,activation,keep_prob=keepprob)
else:
newL = Layer_Dense(self.input.shape[1],n_neurons,activation,keep_prob=keepprob)
self.layers.append(newL)
def predict(self,input):
output = input
for layer in self.layers:
output = layer.forward(output)
return output
def cal_zs(self,input):
self.activations = []
self.activations.append(input)
output = input
for layer in self.layers:
z = layer.cal_output(output,train=True)
activation = layer.activation(z)
self.activations.append(activation)
output = activation
def train(self,input=None,output=None,lr=10):
if input is None:
input=self.input
output=self.output
if len(input)>1000:
indices = np.arange(input.shape[0])
np.random.shuffle(indices)
input = input[indices]
output = output[indices]
for _ in range(100):
self.lr = lr
for i in range(int(len(input)/100)):
self.lr *=0.99
self.train(input[i*100:i*100+100],output[i*100:i*100+100],self.lr)
return
self.cal_zs(input)
for i in range(1,len(self.layers)+1):
if i==1:
delta = self.activations[-1] - output
self.delta = self.layers[-1].back_propagate(delta,self.activations[-2],lr)
else:
self.delta = self.layers[-i].back_propagate(self.delta,self.activations[-i-1],lr)
def MSE(self):
predict = self.predict(self.input)
error = (predict - self.output)**2
mse = sum(sum(error))
print(mse)
def Logloss(self):
predict = self.predict(self.input)
error = np.multiply(self.output,np.log(predict)) + np.multiply(1-self.output,np.log(1-predict))
logloss = -1*sum(sum(error))
print(logloss)
def accuracy(self):
predict = self.predict(test_input)
prediction = np.argmax(predict,axis=1)
correct = np.mean(prediction == test_output)
print(correct*100)
# def train(self,input,output):
model = Neural_Network(iput,otput)
# model.Add_layer(4)
model.Add_layer(64)
model.Add_layer(16)
model.Add_layer(10,"sigmoid")
lrc= 6
for _ in range(10):
model.accuracy()
model.Logloss()
model.train(lr=lrc)
model.accuracy()
I have used the MNIST database; the link is THIS.
One of the reasons could be that you are dropping too many neurons. In the code below
D = np.random.randn(1,self.n_neurons)
self.D = (D>self.keep_prob).astype(int)
The matrix generated in the first line is drawn from a standard normal, so most of its values are less than 1. Because of that, when it is compared with self.keep_prob (which has the value 1), most of the neurons get dropped.
Please try this one change:
self.D = (D < self.keep_prob).astype(int)
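Note that np.random.randn draws from a standard normal distribution, so it is not the usual choice for a dropout mask. A more conventional sketch (an illustration only, going slightly beyond the one-line change above) draws the mask from a uniform distribution and rescales the kept activations so their expected magnitude is unchanged ("inverted" dropout):
D = np.random.rand(1, self.n_neurons)        # uniform values in [0, 1)
self.D = (D < self.keep_prob).astype(int)    # keep each neuron with probability keep_prob
output = output * self.D / self.keep_prob    # rescale so the expected activation is unchanged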
There could be various reasons for that. One was specified by #anuragal.
Basically, dropout is used to reduce overfitting and to help the network correct errors. But when you use dropout right before your final layer, the network may be unable to correct itself, leading to lower accuracy.
Another reason could be that your network is small. Shallow networks usually don't benefit much from dropout.
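With the mask fixed as above, dropout would normally be enabled only on the hidden layers and left off the output layer. A sketch using the question's own Add_layer signature (the keep probabilities are purely illustrative):
model = Neural_Network(iput, otput)
model.Add_layer(64, keepprob=0.8)      # hidden layer with dropout
model.Add_layer(16, keepprob=0.8)      # hidden layer with dropout
model.Add_layer(10, "sigmoid")         # output layer, keepprob defaults to 1 (no dropout)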
Hi everyone! When I was doing DQN programming, I encountered some problems. The error says:
“UserWarning: Using a target size (torch.Size([32, 32])) that is different to the input size (torch.Size([32, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input,target,reduction=self.reduction)"
I don't know where the mistake is because I am new to RL, and some of this code is borrowed from other people's code, so I don't understand some parts.
Here is the code:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# hyperparameters
gamma = 0.9
TARGET_REPLACE_ITER = 20
memory_capability = 100
batch_size = 32
learning_rate = 0.001
n_state = 5
n_action = 32
Neural network code:
class NN(nn.Module):
def __init__(self, ):
super(NN,self).__init__()
self.fc1 = nn.Linear(n_state, 32)
self.fc1.weight.data.normal_(0, 0.1)
self.fc2 = nn.Linear(32,64)
self.out = nn.Linear(64, n_action)
self.out.weight.data.normal_(0, 0.1)
def forward(self, x):
x = self.fc1(x)
x = F.relu(x)
x = self.fc2(x)
x = F.relu(x)
action_value = self.out(x)
return action_value
Agent code:
class Agent(object):
def __init__(self,):
self.learn_step_counter = 0
self.memory = np.zeros((memory_capability, n_state * 2 + 2))
self.memory_cntr = 0
self.eval_net, self.target_net = NN(), NN()
self.loss_func = nn.MSELoss()
self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
def choose_action(self, state):
state = torch.unsqueeze(torch.FloatTensor(state),0) # state is 1-Dim np.array,shape = (5,)
if random.random() < epsilon:
action = random.randint(0,len(stringlist) - 1)
else:
action_value = self.eval_net.forward(state)
action = torch.max(action_value, 1)[1].numpy()[0]
return action
def learn(self):
if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
self.target_net.load_state_dict(self.eval_net.state_dict())
self.learn_step_counter += 1
sample_index = np.random.choice(memory_capability, batch_size)
b_memory = self.memory[sample_index, :]
b_s = torch.FloatTensor(b_memory[:, :n_state])
b_a = torch.LongTensor(b_memory[:, n_state:n_state + 1].astype(int))
b_r = torch.FloatTensor(b_memory[:, n_state + 1:n_state + 2])
b_s_ = torch.FloatTensor(b_memory[:, -n_state:])
q_eval = self.eval_net(b_s).gather(1, b_a) # shape (batch, 1)
q_next = self.target_net(b_s_).detach()
q_target = b_r + gamma * q_next.max(1)[0] # other people's code says the shape is (batch, 1) = (32, 1), but when I run it, it is (batch, batch) = (32, 32); I don't know why
loss = self.loss_func(q_eval, q_target)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
def store_transition(self,state,action,reward,state_):
transition = np.hstack((state,action,reward,state_))
index = self.memory_cntr % memory_capability
self.memory[index,:] = transition
self.memory_cntr += 1
The problem is probably in learn(), but I don't know how to fix it. I would appreciate it if someone could help me. Thanks a lot.
The bug is exactly at the line you pointed out:
q_target = b_r + gamma * q_next.max(1)[0]
Here q_next is of shape [batch_size, n_action], so q_next.max(1)[0] is of shape [batch_size]. We also have b_r with a shape of [batch_size, 1]. Adding those two does not throw an error because PyTorch does automatic shape broadcasting, silently producing a [batch_size, batch_size] result. The fix is to reshape b_r from [batch_size, 1] to [batch_size] using b_r.squeeze(1).
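In code, the suggested fix might look like the sketch below. Note that q_eval is of shape [batch_size, 1], so it needs the same treatment before the MSE loss (alternatively, keep everything 2-D with q_next.max(1)[0].unsqueeze(1)):
q_target = b_r.squeeze(1) + gamma * q_next.max(1)[0]   # both terms are now [batch_size]
loss = self.loss_func(q_eval.squeeze(1), q_target)     # shapes match, no broadcasting warning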
I am training a neural network for regression, but it predicts a constant value during testing. That is why I want to visualize how the weights of the neural network change during training, and watch them change dynamically in a Jupyter notebook.
Currently, my model looks like this:
import torch
from torch import nn
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.inp = nn.Linear(2, 40)
self.act1 = nn.Tanh()
self.h1 = nn.Linear(40, 40)
self.act2 = nn.Tanh()
self.h2 = nn.Linear(40, 2)
self.act3 = nn.Tanh()
#self.h3 = nn.Linear(20, 20)
#self.act4=nn.Tanh()
self.h4 = nn.Linear(2, 1)
def forward_one_pt(self, x):
out = self.inp(x)
out = self.act1(out)
out = self.h1(out)
out = self.act2(out)
out = self.h2(out)
out = self.act3(out)
#out = self.h3(out)
#out = self.act4(out)
out = self.h4(out)
return out
def forward(self, config):
E = torch.zeros([config.shape[0], 1])
for i in range(config.shape[0]):
E[i] = self.forward_one_pt(config[i])
# print("config[",i,"] = ",config[i],"E[",i,"] = ",E[i])
return torch.sum(E, 0)
and my main function looks like this:
def main() :
learning_rate = 0.5
n_pts = 1000
t_pts = 100
epochs = 15
coords,E = load_data(n_pts,t_pts)
#generating my data to NN
G = get_symm(coords,save,load_symmetry,symmtery_pickle_file,eeta1,eeta2,Rs,ex,lambdaa,zeta,boxl,Rc,pi,E,scale)
net = Net()
if(cuda_flag):
net.cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
net_trained = train(save,text_output,epochs,n_pts,G,E,net,t_pts,optimizer,criterion,out,cuda_flag)
test(save,n_pts,t_pts,G,E,net_trained,out,criterion,cuda_flag)
torch.save(net,save_model)
Any tutorials or answers would be helpful.
You can use model.state_dict() to see if your weights are updating across epochs:
old_state_dict = {}
for key in model.state_dict():
old_state_dict[key] = model.state_dict()[key].clone()
output = model(input)
new_state_dict = {}
for key in model.state_dict():
new_state_dict[key] = model.state_dict()[key].clone()
for key in old_state_dict:
if not (old_state_dict[key] == new_state_dict[key]).all():
print('Diff in {}'.format(key))
else:
print('NO Diff in {}'.format(key))
On a side note, you can vectorize your forward function instead of looping over it. The following would do the same job as your original forward function, but much faster:
def forward(self, config):
out= self.forward_one_pt(config)
return torch.sum(out, 0)
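Since the original goal was to watch the weights change dynamically in a Jupyter notebook, here is a minimal sketch of one way to do that, assuming matplotlib is available and reusing the net and epochs objects from the question's main(); the commented line stands in for your real training step:
import matplotlib.pyplot as plt

history = {name: [] for name, _ in net.named_parameters()}
for epoch in range(epochs):
    # ... run your usual training step(s) for this epoch here ...
    for name, param in net.named_parameters():
        history[name].append(param.detach().norm().item())   # record the L2 norm of each weight tensor

for name, values in history.items():
    plt.plot(values, label=name)
plt.xlabel("epoch")
plt.ylabel("L2 norm of parameter")
plt.legend()
plt.show()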
I've posted the following to the PyTorch discussion board too. I'm trying to keep the per-parameter learning rates of the already existing parameters when adding more neurons (to existing layers, not new layers) to a network. I've written the following class, which allows me to add neurons to hidden layers during training:
import torch
import torch.nn as nn
import numpy as np  # used in act() below
class DQN(nn.Module):
def __init__(self, num_inputs, hidden, num_actions, non_linearity):
super(DQN, self).__init__()
self.num_inputs = num_inputs
self.hidden = hidden
self.num_actions = num_actions
self.non_linearity = non_linearity
self.layers = nn.ModuleList()
self.layers.append(nn.Linear(num_inputs, self.hidden[0]))
previous = self.hidden[0]
for hidden_layer_size in self.hidden[1:]:
self.layers.append(nn.Linear(previous, hidden_layer_size))
previous = hidden_layer_size
self.layers.append(nn.Linear(previous, num_actions))
def forward(self, x):
for i in range(len(self.layers) - 1):
x = self.non_linearity(self.layers[i](x))
return self.layers[-1](x)
def increase_capacity(self, increment):
for i in range(len(self.hidden)):
self.hidden[i] += increment[i]
bias = self.layers[0].bias.data
weight = self.layers[0].weight.data
self.layers[0] = nn.Linear(self.num_inputs, self.hidden[0])
if increment[0]>0:
self.layers[0].weight.data[0:-increment[0],:] = weight
self.layers[0].bias.data[0:-increment[0]] = bias
else:
self.layers[0].weight.data[0:,:] = weight
self.layers[0].weight.data = bias
for i in range(1, len(self.layers) - 1):
bias = self.layers[i].bias.data
weight = self.layers[i].weight.data
self.layers[i] = nn.Linear(self.hidden[i-1], self.hidden[i])
if increment[i] > 0:
if increment[i-1] >0:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:-increment[i-1]] = weight
else:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:] = weight
else:
if increment[i-1] >0:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:-increment[i-1]] = weight
else:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:] = weight
bias = self.layers[-1].bias.data
weight = self.layers[-1].weight.data
self.layers[-1] = nn.Linear(self.hidden[-1], self.num_actions)
if increment[-1] >0:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:-increment[-1]] = weight
else:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:] = weight
def act(self, state, epsilon, mask):
if np.random.rand() > epsilon:
state = torch.tensor([state], dtype=torch.float32, device=device)
mask = torch.tensor([mask], dtype=torch.float32, device=device)
q_values = self.forward(state) + mask
action = q_values.max(1)[1].view(1, 1).item()
else:
action = np.random.randint(self.num_actions)
return action
Now I’ve written a little sanity check (whether it leads to sanity is questionable at this point): a network with 2 hidden layers of 1 neuron each should fail to learn the XOR function, whereas a network to which 4 neurons have been added per layer should succeed. If I initialise a new optimiser, this indeed works. The optimiser I use is Adam, which keeps track of per-parameter learning rates. I’d like to keep Adam's learning rates for the weights and biases that already existed before I add the additional neurons. The following is my failed attempt at doing so:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import copy  # needed for copy.deepcopy / copy.copy below
# Credits to Alvations
def generate_zero():
return random.uniform(0, 49) / 100
def generate_one():
return random.uniform(50, 100) / 100
def generate_xor_XY(num_data_points):
Xs, Ys = [], []
for _ in range(num_data_points):
# xor(0, 0) -> 0
Xs.append([generate_zero(), generate_zero()]); Ys.append([0])
# xor(1, 0) -> 1
Xs.append([generate_one(), generate_zero()]); Ys.append([1])
# xor(0, 1) -> 1
Xs.append([generate_zero(), generate_one()]); Ys.append([1])
# xor(1, 1) -> 0
Xs.append([generate_one(), generate_one()]); Ys.append([0])
return Xs, Ys
# Initialisation
network = DQN(2,[1,1],1,F.relu)
# optimizer = optim.Adam(network.parameters(), amsgrad=False)
optimizer = optim.Adam(network.parameters(), amsgrad=True)
criterion = nn.MSELoss()
# Train 50000 steps to show 1 neuron cannot solve x-or task
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
# Add 4 neurons to each hidden layer
capacity = [4,4]
network.increase_capacity(capacity)
# Uncomment the following line and comment the lines following it for normal initialisation.
# optimizer = optim.Adam(network.parameters(), amsgrad=True)
nw_param = [p for p in network.parameters()]
new_param_group = []
layer_idx = 0
for idx, group in enumerate(optimizer.param_groups):
for idx_p, p in enumerate(group['params']):
# Save previous information
prev_grad = p.grad
old_p = copy.deepcopy(p)
old_state = copy.copy(optimizer.state[p])
old_step = old_state['step']
old_exp_avg = old_state['exp_avg']
old_exp_avg_sq = old_state['exp_avg_sq']
old_max_exp_avg_sq = old_state['max_exp_avg_sq']
# Remove old parameter from state
optimizer.state.pop(p)
# Weights
if p.dim()>1:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.torch.zeros_like(p)
new_exp_avg_sq = torch.torch.zeros_like(p)
new_max_exp_avg_sq = torch.torch.zeros_like(p)
p.grad[0:prev_grad.size(0),0:prev_grad.size(1)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_max_exp_avg_sq
new_param_group.append(p)
# Biases
else:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.zeros_like(p)
new_exp_avg_sq = torch.zeros_like(p)
new_max_exp_avg_sq = torch.zeros_like(p)
p.grad[0:prev_grad.size(0)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0)] = old_max_exp_avg_sq
new_param_group.append(p)
layer_idx += 1
optimizer.param_groups[0]['params'] = new_param_group
print(network)
# Train 50000 steps to show by adding neurons the task can be solved
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
I’m trying to get the same optimizer state, but with additional parameters for the added neurons. This seems like a convoluted way of doing it (and it doesn’t work :p). Does anyone know of an (easier) way to do this, or see where I’m going wrong?
I have built this LSTM class:
import tensorflow as tf
import Constants
class LSTM():
def __init__(self,
inputShape,
outputShape,
numLayers=Constants.numLayers,
numHidden=Constants.numHidden,
learningRate=Constants.learningRate,
forgetBias=Constants.forgetBias):
self.inputs = tf.placeholder(tf.float32, [None] + inputShape)
self.labels = tf.placeholder(tf.float32, [None] + outputShape)
self.inputTensors = tf.unstack(self.inputs, axis=1)
self.weights = tf.Variable(tf.random_normal([numHidden] + outputShape))
self.bias = tf.Variable(tf.random_normal(outputShape))
layers = [tf.contrib.rnn.LSTMCell(numHidden, forget_bias=forgetBias, state_is_tuple=True)] * numLayers
self.cell = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)
self.optimiser = tf.train.GradientDescentOptimizer(learningRate)
self.forgetBias = forgetBias
self.batchDict = None
self.outputs = None
self.finalStates = None
self.predictions = None
self.loss = None
self.accuracy = None
self.optimise = None
self.session = tf.Session()
self.__buildGraph()
def __buildGraph(self):
outputs, finalStates = tf.nn.static_rnn(self.cell, self.inputTensors, dtype=tf.float32)
predictions = tf.add(tf.matmul(outputs[-1], self.weights), self.bias)
self.predictions = tf.minimum(tf.maximum(predictions, 0), 1)
self.loss = tf.losses.mean_squared_error(predictions=self.predictions, labels=self.labels)
self.accuracy = tf.reduce_mean(1 - tf.abs(self.labels - self.predictions) / 1.0)
self.optimise = self.optimiser.minimize(self.loss)
self.session.run(tf.global_variables_initializer())
def __execute(self, operation):
return self.session.run(operation, self.batchDict)
def setBatch(self, inputs, labels):
self.batchDict = {self.inputs: inputs, self.labels: labels}
def batchLabels(self):
return self.__execute(self.labels)
def batchPredictions(self):
return self.__execute(self.predictions)
def batchLoss(self):
return self.__execute(self.loss)
def batchAccuracy(self):
return self.__execute(self.accuracy)
def processBatch(self):
self.__execute(self.optimise)
def kill(self):
self.session.close()
and I run it like so:
import DataWorker
import Constants
from Model import LSTM
inputShape = [Constants.sequenceLength, DataWorker.numFeatures]
outputShape = [1]
LSTM = LSTM(inputShape, outputShape)
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer)
LSTM.setBatch(batchX, batchY)
LSTM.processBatch()
if batchNum % Constants.printStep == 0 or epochComplete:
print("Batch:\t\t", batchNum)
print("Last Pred:\t", LSTM.batchPredictions()[-1][0])
print("Last Label:\t", LSTM.batchLabels()[-1][0])
print("Loss:\t\t", LSTM.batchLoss())
print("Accuracy:\t", str("%.2f" % (LSTM.batchAccuracy() * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY = DataWorker.generateTestBatch()
LSTM.setBatchDict(testX, testY)
testAccuracy = LSTM.batchAccuracy()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
LSTM.kill()
This all works well as it should. However, I am using time-series data consisting of financial stocks that span ranges of timestamps far greater than the number of time steps my LSTM is unrolled for - Constants.sequenceLength. Because of this, it takes many sequential batches for a single stock to be processed, so the state/memory of my LSTM needs to be passed between batches. In addition, after a batch that completes the lifespan of an ID, the next batch passes in a new ID starting from the initial timestamp of my dataset, so I would want to reset the memory.
There are many questions asking something similar, and all of the answers are adequate; however, none seem to address the issue of variable batch sizes - batch sizes initialised to None and then inferred when a batch is passed in. My batches are usually a constant size, but they do change under certain circumstances, and I cannot change this. How can I control passing the state between batches, as well as resetting the state, if I have not specified the batch size?
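For reference, one common TF 1.x pattern is to expose the initial state as a placeholder whose batch dimension is None, hand it to static_rnn, and at run time feed either the previous batch's evaluated final state (to carry memory over) or zeros of the current batch size (to reset for a new ID). This is only a sketch against the class above; it assumes numLayers and numHidden are available where the graph is built, and that the evaluated final state is stacked back into the same [numLayers, 2, batch, numHidden] layout before being fed in:
# in __buildGraph, before calling static_rnn
self.initialState = tf.placeholder(tf.float32, [numLayers, 2, None, numHidden])
perLayer = tf.unstack(self.initialState, axis=0)
rnnState = tuple(tf.contrib.rnn.LSTMStateTuple(perLayer[i][0], perLayer[i][1])
                 for i in range(numLayers))
outputs, self.finalStates = tf.nn.static_rnn(self.cell, self.inputTensors,
                                              initial_state=rnnState, dtype=tf.float32)

# at run time, to reset the memory for a new ID:
resetState = np.zeros((Constants.numLayers, 2, len(batchX), Constants.numHidden))
feed = {self.inputs: batchX, self.labels: batchY, self.initialState: resetState}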
Background:
I am trying to create an MLP using TensorFlow; this is my first time using TensorFlow. It is a simple NN that will do the XOR operation. I have 2 input neurons (for the 1s and 0s), a hidden layer that is 2 neurons wide, and one output that will give me a 1 or 0. My activation is a simple sigmoid.
The Issue
I am running into an issue with launching the graph. Something I noticed is that when we launch the graph, we get the whole batch instead of one sample at a time. For example, I have the following in an array: [[1,0],[0,0],[0,1],[1,1]]. When I try to start the graph I do the following:
x_vals = np.array([[1.0, 0.0],[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
result = run_nn(x,y)
with tf.Session() as sess:
sess.run(init)
results = sess.run(result, feed_dict={x: x_vals, y:y_vals})
print results
As you can see, I feed x and y into the neural network. Once I do this, I need to multiply the weights by the outputs (essentially the input [1,0]) and sum them. The issue is that I get a size mismatch between the x values and the weights array:
tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(l0.weights) , l0.outputs), 1))
InvalidArgumentError: Incompatible shapes: [2,3] vs. [4,3]
[[Node: Mul_6 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](transpose_10, concat_12)]]
What am I doing wrong here? I understand this is not a perfect implementation, but I want to build the NN step by step.
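For reference on the mismatch itself: the pure NumPy version further down processes one sample at a time, while the graph is fed the whole 4-sample batch, so the element-wise multiply-and-sum in calculate_sums no longer lines up (the transposed weights are [2, 3] while the batched outputs are [4, 3]). A sketch of a batched alternative (not a full fix for the class, just the shape idea):
# l0.outputs: [4, 3]  (batch of 4 samples, 2 inputs + bias column)
# l0.weights: [3, 2], so the matmul yields one row of sums per sample: [4, 2]
self.sums = tf.matmul(p_layer.outputs, p_layer.weights)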
Here is my full code:
import math
import numpy as np
import tensorflow as tf
momentum = 0.5
learning_rate = 2.0
class layer:
def __init__(self, num_neurons, num_weights, layer_type):#number of weights corresponds to number of neurons in next layer
self.num_neurons = num_neurons
self.num_weights = num_weights
self.layer_type = layer_type
if layer_type == 'hidden':
num_neurons = num_neurons+1#account for bias
self.num_neurons = num_neurons+1
self.weights = tf.random_normal([num_neurons, num_weights])
self.outputs = tf.zeros(num_neurons, tf.float32)
self.sums = tf.zeros(num_neurons, tf.float32)
self.deltas = tf.zeros(num_neurons, tf.float32)
self.gradiants = tf.zeros([num_neurons, num_weights], tf.float32)
self.weight_deltas = tf.zeros_like(self.gradiants)
def calculate_sums(self, p_layer):
self.sums = tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(p_layer.weights) , p_layer.outputs), 1))
return self.sums
def calculate_outputs(self, p_layer):
if self.layer_type == 'hidden':
self.outputs = tf.concat([sigmoid(self.sums, False), tf.constant([1.0])], 0)
else:
self.outputs = sigmoid(self.sums, False)
return self.outputs
def calculate_deltas(self, n_layer = None, y=None):
if self.layer_type == 'hidden':
self.deltas = sigmoid(self.sums, True) * n_layer.deltas * self.weights[:-1,0]
else:#output delta
E = self.outputs[:self.num_neurons]-y
#print 'error: {}'.format(E)
self.deltas = -E* sigmoid(self.sums, True)
return self.deltas
def calculate_gradiants(self, n_layer):
shape = (tf.shape(self.outputs)[0], 1)
self.gradiants += tf.reshape(self.outputs, shape=shape) * tf.transpose(n_layer.deltas)#we add the gradiants for every batch completion then update, dont want to update every time
return self.gradiants
def update_weights(self):
self.weight_deltas = self.gradiants*learning_rate + momentum * self.weight_deltas
self.weights += self.weight_deltas
# for i in range(len(self.gradiants)):
# for j in range(len(self.gradiants[0])):
# self.weight_deltas[i,j] = weight_change(self.gradiants[i,j], self.weight_deltas[i,j])
# self.weights[i,j] += self.weight_deltas[i,j]
def sigmoid(x, derivative = False):
if derivative == True:
return (1.0/(1+tf.exp(-x))) * (1.0 - (1.0/(1+tf.exp(-x))))
return 1.0/(1+tf.exp(-x))
#the output delta is just E*f'i, essentially the error * the derivative of the activation function
def weight_change(g, p_w_delta):#gradiant, previous weight delta
return learning_rate*g + momentum * p_w_delta
def run_nn(x_val, y_val):
l0.outputs = tf.concat([x_val, tf.ones(shape=(tf.shape(x_val)[0],1))], 1)
print 'set output'
#forward pass
# l1.calculate_sums(l0)
# print 'l1 calc sum'
# l1.calculate_outputs(l0)
# print 'l1 calc output'
# ol.calculate_sums(l1)
# print 'ol calc sum'
# ol.calculate_outputs(l1)
# print 'ol calc output'
# #backwards pass
# ol.calculate_deltas(y=y_val)
# print 'ol calc deltas'
# l1.calculate_deltas(ol)
# print 'l1 calc deltas'
# l1.calculate_gradiants(ol)
# print 'l1 calc gradiants'
# l0.calculate_gradiants(l1)
# print 'l0 calc gradiants'
# #we dont want to update the weights every time, just after we have gone through every batch/minibatch
# l1.update_weights()
# print 'l1 update weights'
# l0.update_weights()
# print 'l0 uipdate weights'
# l1.gradiants = tf.zeros_like(l1.gradiants)
# print 'l1 zero gradiants'
# l0.gradiants = tf.zeros_like(l0.gradiants)
# print 'l0 zero gradiants'
# #test
# print 'run test'
# l0.outputs = tf.concat([x, tf.constant([1.0])], 0 )
# #forward pass
# l1.calculate_sums(l0)
# l1.calculate_outputs(l0)
#
# ol.calculate_sums(l1)
# ol.calculate_outputs(l1)
# print 'DONE'
return tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(l0.weights) , l0.outputs), 1))
l0 = layer(2,2,'hidden')#input
l1 = layer(2,1,'hidden')#hidden
ol = layer(1,0,'output')#output
x_vals = np.array([[1.0, 0.0],[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
# initialize variables
init = tf.global_variables_initializer()
x = tf.placeholder('float', None)
y = tf.placeholder('float', None)
result = run_nn(x,y)
with tf.Session() as sess:
sess.run(init)
results = sess.run(result, feed_dict={x: x_vals, y:y_vals})
print results
Here is some equivalent code in pure Python/NumPy:
import math
import numpy as np
momentum = 0.5
learning_rate = 2.0
class layer:
def __init__(self, num_neurons, num_weights, layer_type):#number of weights corresponds to number of neurons in next layer
self.layer_type = layer_type
if layer_type == 'hidden':
num_neurons = num_neurons+1#account for bias
self.weights = np.random.rand(num_neurons,num_weights)
self.outputs = np.zeros(shape=(1,num_neurons))
self.sums = np.zeros(shape=(1,num_neurons))
self.deltas = np.zeros(shape=(1,num_neurons)).T
self.gradiants = np.zeros(shape=(num_neurons,num_weights))
self.weight_deltas = np.zeros_like(self.gradiants)
def calculate_sums(self, p_layer):
self.sums = np.array([(sum(p_layer.weights * p_layer.outputs))]).T
return self.sums;
def calculate_outputs(self, p_layer):
if self.layer_type == 'hidden':
self.outputs = np.concatenate((np.array([[sigmoid(X, False)] for X in self.sums]), np.array([[1.0]])))
else:
self.outputs = np.array([[sigmoid(X, False)] for X in self.sums])
return self.outputs
def calculate_deltas(self, n_layer = None):
if self.layer_type == 'hidden':
self.deltas = np.array([[sigmoid(X, True)] for X in self.sums]) * n_layer.deltas * self.weights[:-1]
else:#output delta
E = self.outputs-y
#print 'error: {}'.format(E)
self.deltas = -E* sigmoid(self.sums, True)
return self.deltas
def calculate_gradiants(self, n_layer):
self.gradiants += self.outputs * n_layer.deltas.T#we add the gradiants for every batch completion then update, dont want to update every time
return self.gradiants
def update_weights(self):
for i in range(len(self.gradiants)):
for j in range(len(self.gradiants[0])):
self.weight_deltas[i,j] = weight_change(self.gradiants[i,j], self.weight_deltas[i,j])
self.weights[i,j] += self.weight_deltas[i,j]
def sigmoid(x, derivative = False):
if derivative == True:
return (1.0/(1+math.exp(-x))) * (1.0 - (1.0/(1+math.exp(-x))))
return 1.0/(1+math.exp(-x))
#the output delta is just E*f'i, essentially the error * the derivative of the activation function
def weight_change(g, p_w_delta):#gradiant, previous weight delta
return learning_rate*g + momentum * p_w_delta
input_layer = layer(3,2, 'hidden')
hidden_layer1 = layer(2,1, 'hidden')
output_layer = layer(1,0, 'output')
x_vals = []
y_vals = []
for i in range(2):
for j in range(2):
for k in range(2):
x_vals.append(np.array([[float(i)],[float(j)],[float(k)]]))
y_vals.append(np.array([float(i ^ j ^ k)]))
#x_vals = [np.array([[1.0], [0.0]]), np.array([[0.0], [0.0]]), np.array([[0.0], [1.0]]),np.array([[1.0], [1.0]])]
#y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
#input_layer.weights = np.array([[-0.06782947598673161,0.9487814395569221],[0.22341077197888182,0.461587116462548], [-0.4635107399577998, 0.09750161997450091]])
#hidden_layer1.weights = np.array([[-0.22791948943117624],[0.581714099641357], [0.7792991203673414]])
Error = []
for n in range(10000):
for x, y in zip(x_vals, y_vals):
input_layer.outputs = np.concatenate((x, np.array([[1.0]])))
#forward pass
hidden_layer1.calculate_sums(input_layer)
hidden_layer1.calculate_outputs(input_layer)
output_layer.calculate_sums(hidden_layer1)
output_layer.calculate_outputs(hidden_layer1)
Error.append(-(output_layer.outputs-y))
#backwards pass
output_layer.calculate_deltas()
hidden_layer1.calculate_deltas(output_layer)
hidden_layer1.calculate_gradiants(output_layer)
input_layer.calculate_gradiants(hidden_layer1)
if n % 1000 == 0:
print 'Epoch #{}; error: {}'.format(n, sum(Error)/len(Error))
Error = []
#we dont want to update the weights every time, just after we have gone through every batch/minibatch
hidden_layer1.update_weights()
input_layer.update_weights()
hidden_layer1.gradiants.fill(0.0)
input_layer.gradiants.fill(0.0)
#test
for x, y in zip(x_vals, y_vals):
input_layer.outputs = np.concatenate((x, np.array([[1.0]])))
#forward pass
hidden_layer1.calculate_sums(input_layer)
hidden_layer1.calculate_outputs(input_layer)
output_layer.calculate_sums(hidden_layer1)
output_layer.calculate_outputs(hidden_layer1)
print 'Y_hat: {}, Y: {}'.format(round(float(output_layer.outputs), 3), float(y))
Can anyone point me in the right direction?
Thanks