I have a model that takes a tensor representing the difference between two images and outputs coordinates used to make them more alike. I then calculate the loss as MSE of the created image and the original image, but when I run a backward pass no weights seems to update and the loss remains constant (although not none) throughout all epochs.
Is this because the loss isn't calculated directly on the model output?
When reading other post the requires_grad was sometimes responsible for the problem, I've tried with requires_grad = True but I'm still a bit unsure if it's used correctly.
class Net(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(10 * 10 * 3, 240)
self.fc2 = nn.Linear(240, 240)
self.fc3 = nn.Linear(240, 240)
self.fc4 = nn.Linear(240, 7) # [x1, y1, x2, y2, r, g, b]
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = self.fc4(x)
return x
def array_to_flatten_tensor(array_to_reshape):
prepared_image = torch.FloatTensor(array_to_reshape)
prepared_image.requires_grad = True
prepared_image.retains_grad = True
return torch.flatten(prepared_image)
image = cv2.imread("flower.png")
image = cv2.resize(image, (10, 10))
image = torch.FloatTensor(image)
image.requires_grad = True
image.retains_grad = True
image = torch.flatten(image)
for image_pass in range(1000):
running_loss = 0
# Create empty image
drawn_image = array_to_png_test.generate_empty_RGB_array(10, 10)
for _ in range(10):
optimizer.zero_grad()
# Pass (image - net drawn) image to net
difference_array = image - array_to_flatten_tensor(drawn_image)
output = net_to_train(difference_array)
# Converts the output to two points and colour
point_1 = (float(output[0]), 0, float(output[1]))
point_2 = (float(output[2]), float(output[3]))
colour = round(max(float(output[4]), 0)), round(max(float(output[5]), 0)), round(max(float(output[6]), 0))
point_1, point_2 = sorted([point_1, point_2])
# Draws a line to the empty image
drawn_image = array_to_png_test.draw_line(point_1, point_2, colour, drawn_image)
# Calculate loss as mean square difference of image and drawn image
drawn_image_tensor = array_to_flatten_tensor(drawn_image)
current_loss = criterion(drawn_image_tensor, image)
running_loss += float(current_loss)
current_loss.backward()
optimizer.step()
print("epoch " + str(image_pass) + " : " + str(running_loss / 250))
if __name__ == "__main__":
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.1, betas=(0.9, 0.999))
training(optimizer, criterion, net)
Related
I have an encoder model which was working fine with single channel 1024,1024 images, I'm trying to patch the original images (mega pixel images) to 256, 256, 64 images. I've changed my encoder input to match the images input that the model will get. The model call function is working fine, loss is getting calculated fine, but I'm getting the following error with tape.gradient:
2023-01-29 17:11:01.868555: F tensorflow/stream_executor/cuda/cuda_dnn.cc:593] Check failed: cudnnSetTensorNdDescriptor(handle_.get(), elem_type, nd, dims.data(), strides.data()) == CUDNN_STATUS_SUCCESS (3 vs. 0)batch_descriptor: {count: 10 feature_map_count: 64 spatial: 0 0 value_min: 0.000000 value_max: 0.000000 layout: BatchYXDepth} C:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\backend\resource_tracker.py:318: UserWarning: resource_tracker: There appear to be 2 leaked folder objects to clean up at shutdown warnings.warn('resource_tracker: There appear to be %d ' C:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\backend\resource_tracker.py:333: UserWarning: resource_tracker: C:\Users\kjhan\AppData\Local\Temp\joblib_memmapping_folder_12248_772bbeeeccff43089fa0e6d75271eebd_97f2f7c6edd04b468a4360bf96b91b84: FileNotFoundError(2, 'The system cannot find the path specified') warnings.warn('resource_tracker: %s: %r' % (name, e)) C:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\backend\resource_tracker.py:333: UserWarning: resource_tracker: C:\Users\kjhan\AppData\Local\Temp\joblib_memmapping_folder_12248_29db2f1e8ff54416b9a78c6f69dcff23_40a85063390f46d38d15c1877f99acc8: FileNotFoundError(2, 'The system cannot find the path specified') warnings.warn('resource_tracker: %s: %r' % (name, e)) [I 17:11:10.131 NotebookApp] KernelRestarter: restarting kernel (1/5), keep random ports kernel 286d1cc6-8ddd-46f9-baf7-5e1b05a2d033 restarted
My code is as below
class encoder(tf.keras.layers.Layer):
def __init__(self,size:tuple):
super(encoder, self).__init__()
#encoder Module
self.input_cnn = keras.layers.InputLayer(input_shape=(size[0],size[1],size[2]))
# Ex0panding features for computation
self.conv_1 = keras.layers.Conv2D(input_shape=(size[0],size[1],size[2]),filters=16,kernel_size=(3,3),padding='same',activation='relu')
# 1/4 size reduction
self.conv_2 = keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2))
self.conv_3 = keras.layers.Conv2D(filters = 16,kernel_size=(4,4),strides=(2,2),padding='same',activation='relu')
self.conv_4 = keras.layers.Conv2D(filters = 32,kernel_size=(4,4),strides=(4,4),padding='same',activation='relu')
self.conv_5 = keras.layers.BatchNormalization()
# 1/2 size reduction
self.conv_6 = keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2))
# 3x3 Filter third application
self.conv_7 = keras.layers.Conv2D(filters = 64,kernel_size=(8,8),strides=(8,8),padding='same',activation='relu')
# 1/4 size reduction
self.conv_8 = keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2))
# 3x3 Filter third application
self.conv_9 = keras.layers.BatchNormalization()
self.conv_10 = keras.layers.Conv2D(filters = 1 ,kernel_size=(3,3),strides=(1,1),padding='same',activation='relu')
def call(self,inputs,training = True):
x = self.input_cnn(inputs)
x = self.conv_1(x)
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
if training == True:
x = self.conv_5(x,training = True)
else:
x = self.conv_5(x,training = False)
x = self.conv_6(x)
x = self.conv_7(x)
x = self.conv_8(x)
if training == True:
x = self.conv_9(x,training = True)
else:
x = self.conv_9(x,training = False)
x = self.conv_10(x)
return x
size 0 is 256
size 1 is 256
size 2 is 64
Train_step from main model:
def __init__(self, size: tuple, optimizer = keras.optimizers.Adam(learning_rate=1e-3),loss_fn = keras.losses.BinaryCrossentropy(from_logits=False),metric = tf.keras.metrics.Accuracy()):
super(BCDClassifier, self).__init__()
#Input for catagorical data
self.input_cat = keras.layers.InputLayer(input_shape = (2,))
#Encoder Layer for each view
self.encode = encoder(size)
#flatten encoded output
self.flatten = keras.layers.Flatten()
#Concatenate Layer
self.concat = keras.layers.Concatenate(axis = 1)
#Classifier layer
self.classify = classifier(32)
#deffine model parameters
self.optimizer = optimizer
self.loss_fn = loss_fn
self.loss_tracker = keras.metrics.Mean(name="loss")
self.acc_tracker = metric
self.f1_tracker = tfa.metrics.F1Score(num_classes=2, threshold=0.5, average = 'micro')
self.sk_metric_acc = accuracy_score
self.sk_metric_f1 = f1_score
self.acc_history = []
self.loss_history = []
self.f1_history = []
# Forward pass of model - order does matter.
def call(self, cat_batch, view_batch, images_batch, training = True):
x1 = self.encode(images_batch,training)
x2 = self.input_cat(cat_batch)
x1 = self.flatten(x1)
x12 = self.concat([x1,x2])
x12 = self.classify(x12)
return x12
def train_step(self,cat_batch, views_batch, images_batch, target_batch, training = True):
with tf.GradientTape() as tape:
logits = self(cat_batch, views_batch, images_batch,training)
loss_value = self.loss_fn(target_batch, logits)
grads = tape.gradient(loss_value, self.trainable_weights)
self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
self.loss_tracker.update_state(loss_value)
pred = []
target = []
threshold = 0.5
for val in logits.numpy():
if isinstance(val,np.ndarray):
for v_1 in val:
if isinstance(v_1,np.ndarray):
for v_2 in v_1:
if v_2 > threshold:
pred.append(1.0)
else:
pred.append(0.0)
else:
if v_1 > threshold:
pred.append(1.0)
else:
pred.append(0.0)
else:
if val > threshold:
pred.append(1.0)
else:
pred.append(0.0)
for val in target_batch:
if isinstance(val,np.ndarray):
for v_1 in val:
if isinstance(v_1,np.ndarray):
for v_2 in v_1:
target.append(v_2)
else:
target.append(v_1)
else:
target.append(val)
acc = self.sk_metric_acc(target,pred)
f1 = self.sk_metric_f1(target,pred)
self.f1_tracker.update_state(target_batch,logits)
return {"Loss": self.loss_tracker.result(), "Accuracy": acc, 'F1-score':f1}
everyone!When I was doing dqn programming, I encountered some problems. This error says
“ Userwarning: Using a target size (torch.Size([32,32])) that is different to the input size (torch.Size([32,1])).This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input,target,reduction=self.reduction)"
And I don't know where the mistake is because I am new to RL . And some of these codes are borrowed from other people's codes, so I don't understand some places.
here are codes:
# hyperparameters
gamma = 0.9
TARGET_REPLACE_ITER = 20
memory_capability = 100
batch_size = 32
learning_rate = 0.001
n_state = 5
n_action = 32
neural network code:
class NN(nn.Module):
def __init__(self, ):
super(NN,self).__init__()
self.fc1 = nn.Linear(n_state, 32)
self.fc1.weight.data.normal_(0, 0.1)
self.fc2 = nn.Linear(32,64)
self.out = nn.Linear(64, n_action)
self.out.weight.data.normal_(0, 0.1)
def forward(self, x):
x = self.fc1(x)
x = F.relu(x)
x = self.fc2(x)
x = F.relu(x)
action_value = self.out(x)
return action_value
agent code:
class Agent(object):
def __init__(self,):
self.learn_step_counter = 0
self.memory = np.zeros((memory_capability, n_state * 2 + 2))
self.memory_cntr = 0
self.eval_net, self.target_net = NN(), NN()
self.loss_func = nn.MSELoss()
self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
def choose_action(self, state):
state = torch.unsqueeze(torch.FloatTensor(state),0) # state is 1-Dim np.array,shape = (5,)
if random.random() < epsilon:
action = random.randint(0,len(stringlist) - 1)
else:
action_value = self.eval_net.forward(state)
action = torch.max(action_value, 1)[1].numpy()[0]
return action
def learn(self):
if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
self.target_net.load_state_dict(self.eval_net.state_dict())
self.learn_step_counter += 1
sample_index = np.random.choice(memory_capability, batch_size)
b_memory = self.memory[sample_index, :]
b_s = torch.FloatTensor(b_memory[:, :n_state])
b_a = torch.LongTensor(b_memory[:, n_state:n_state + 1].astype(int))
b_r = torch.FloatTensor(b_memory[:, n_state + 1:n_state + 2])
b_s_ = torch.FloatTensor(b_memory[:, -n_state:])
q_eval = self.eval_net(b_s).gather(1, b_a) # shape (batch, 1)
q_next = self.target_net(b_s_).detach()
q_target = b_r + gamma * q_next.max(1)[0] # other people's code said the shape is (batch, 1)=(32,1),but when i ran ,it was (batch,batch)=(32,32),i don't know why
loss = self.loss_func(q_eval, q_target)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
def store_transition(self,state,action,reward,state_):
transition = np.hstack((state,action,reward,state_))
index = self.memory_cntr % memory_capability
self.memory[index,:] = transition
self.memory_cntr += 1
the problem is probably in learn(),but i don't know how to modify.I will appreciate it if someone can help me,thanks a lot
The bug is exactly at the line you pointed out:
q_target = b_r + gamma * q_next.max(1)[0]
Here q_next is of shape [batch_size, n_action], so q_next.max(1)[0] is of shape [batch_size]. We also have b_r with a shape of [batch_size,1]. Now adding those two entities will not throw an error as PyTorch is doing some automatic shape broadcasting. So the fix for this is to reshape b_r to [batch_size] from [batch_size,1] by using b_r.squeeze(1)
I am training a neural network by regression but it is predicting a constant value during testing. Which is why I want to visualize the weights of the neural network change during training and see the weights change dynamically in the jupyter notebook.
Currently, my model looks like this:
import torch
from torch import nn
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.inp = nn.Linear(2, 40)
self.act1 = nn.Tanh()
self.h1 = nn.Linear(40, 40)
self.act2 = nn.Tanh()
self.h2 = nn.Linear(40, 2)
self.act3 = nn.Tanh()
#self.h3 = nn.Linear(20, 20)
#self.act4=nn.Tanh()
self.h4 = nn.Linear(2, 1)
def forward_one_pt(self, x):
out = self.inp(x)
out = self.act1(out)
out = self.h1(out)
out = self.act2(out)
out = self.h2(out)
out = self.act3(out)
#out = self.h3(out)
#out = self.act4(out)
out = self.h4(out)
return out
def forward(self, config):
E = torch.zeros([config.shape[0], 1])
for i in range(config.shape[0]):
E[i] = self.forward_one_pt(config[i])
# print("config[",i,"] = ",config[i],"E[",i,"] = ",E[i])
return torch.sum(E, 0)
and my main function looks like this:
def main() :
learning_rate = 0.5
n_pts = 1000
t_pts = 100
epochs = 15
coords,E = load_data(n_pts,t_pts)
#generating my data to NN
G = get_symm(coords,save,load_symmetry,symmtery_pickle_file,eeta1,eeta2,Rs,ex,lambdaa,zeta,boxl,Rc,pi,E,scale)
net = Net()
if(cuda_flag):
net.cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
net_trained = train(save,text_output,epochs,n_pts,G,E,net,t_pts,optimizer,criterion,out,cuda_flag)
test(save,n_pts,t_pts,G,E,net_trained,out,criterion,cuda_flag)
torch.save(net,save_model)
any tutorials or answers would be helpful
You can use model.state_dict() to see if your weights are updating across epochs:
old_state_dict = {}
for key in model.state_dict():
old_state_dict[key] = model.state_dict()[key].clone()
output = model(input)
new_state_dict = {}
for key in model.state_dict():
new_state_dict[key] = model.state_dict()[key].clone()
for key in old_state_dict:
if not (old_state_dict[key] == new_state_dict[key]).all():
print('Diff in {}'.format(key))
else:
print('NO Diff in {}'.format(key))
On a side note, you can vectorize your forward function instead of looping over it. Following would do the same job as your original forward function but much faster:
def forward(self, config):
out= self.forward_one_pt(config)
return torch.sum(out, 0)
I've posted the following to the Pytorch discussion board too. I'm trying to keep the learning-rates per parameter for the already existing parameters when adding more neurons (to existing layers, not new layers) to a network. I’ve written the following class which allows me to add neurons to hidden layers during training:
import torch
import torch.nn as nn
class DQN(nn.Module):
def __init__(self, num_inputs, hidden, num_actions, non_linearity):
super(DQN, self).__init__()
self.num_inputs = num_inputs
self.hidden = hidden
self.num_actions = num_actions
self.non_linearity = non_linearity
self.layers = nn.ModuleList()
self.layers.append(nn.Linear(num_inputs, self.hidden[0]))
previous = self.hidden[0]
for hidden_layer_size in self.hidden[1:]:
self.layers.append(nn.Linear(previous, hidden_layer_size))
previous = hidden_layer_size
self.layers.append(nn.Linear(previous, num_actions))
def forward(self, x):
for i in range(len(self.layers) - 1):
x = self.non_linearity(self.layers[i](x))
return self.layers[-1](x)
def increase_capacity(self, increment):
for i in range(len(self.hidden)):
self.hidden[i] += increment[i]
bias = self.layers[0].bias.data
weight = self.layers[0].weight.data
self.layers[0] = nn.Linear(self.num_inputs, self.hidden[0])
if increment[0]>0:
self.layers[0].weight.data[0:-increment[0],:] = weight
self.layers[0].bias.data[0:-increment[0]] = bias
else:
self.layers[0].weight.data[0:,:] = weight
self.layers[0].weight.data = bias
for i in range(1, len(self.layers) - 1):
bias = self.layers[i].bias.data
weight = self.layers[i].weight.data
self.layers[i] = nn.Linear(self.hidden[i-1], self.hidden[i])
if increment[i] > 0:
if increment[i-1] >0:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:-increment[i-1]] = weight
else:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:] = weight
else:
if increment[i-1] >0:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:-increment[i-1]] = weight
else:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:] = weight
bias = self.layers[-1].bias.data
weight = self.layers[-1].weight.data
self.layers[-1] = nn.Linear(self.hidden[-1], self.num_actions)
if increment[-1] >0:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:-increment[-1]] = weight
else:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:] = weight
def act(self, state, epsilon, mask):
if np.random.rand() > epsilon:
state = torch.tensor([state], dtype=torch.float32, device=device)
mask = torch.tensor([mask], dtype=torch.float32, device=device)
q_values = self.forward(state) + mask
action = q_values.max(1)[1].view(1, 1).item()
else:
action = np.random.randint(self.num_actions)
return action
Now I’ve written a little sanity check (whether it leads to sanity is questionable at this point): a network with 2 layers with both 1 neuron should fail to learn the x-or function, whereas a network where 4 neurons have been added should. If I initialise a new optimiser this indeed works. The optimiser I use is Adam, which keeps track of learning-rates per parameter. I’d like to keep the learning-rates of Adam for the weights and biases that already existed before I add additional neurons. The following is my failed attempt to doing so:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
# Credits to Alvations
def generate_zero():
return random.uniform(0, 49) / 100
def generate_one():
return random.uniform(50, 100) / 100
def generate_xor_XY(num_data_points):
Xs, Ys = [], []
for _ in range(num_data_points):
# xor(0, 0) -> 0
Xs.append([generate_zero(), generate_zero()]); Ys.append([0])
# xor(1, 0) -> 1
Xs.append([generate_one(), generate_zero()]); Ys.append([1])
# xor(0, 1) -> 1
Xs.append([generate_zero(), generate_one()]); Ys.append([1])
# xor(1, 1) -> 0
Xs.append([generate_one(), generate_one()]); Ys.append([0])
return Xs, Ys
# Initialisation
network = DQN(2,[1,1],1,F.relu)
# optimizer = optim.Adam(network.parameters(), amsgrad=False)
optimizer = optim.Adam(network.parameters(), amsgrad=True)
criterion = nn.MSELoss()
# Train 50000 steps to show 1 neuron cannot solve x-or task
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
# Add 5 neurons to first layer
capacity = [4,4]
network.increase_capacity(capacity)
# Uncomment the following line and comment the lines following it for normal initialisation.
# optimizer = optim.Adam(network.parameters(), amsgrad=True)
nw_param = [p for p in network.parameters()]
new_param_group = []
layer_idx = 0
for idx, group in enumerate(optimizer.param_groups):
for idx_p, p in enumerate(group['params']):
# Save previous information
prev_grad = p.grad
old_p = copy.deepcopy(p)
old_state = copy.copy(optimizer.state[p])
old_step = old_state['step']
old_exp_avg = old_state['exp_avg']
old_exp_avg_sq = old_state['exp_avg_sq']
old_max_exp_avg_sq = old_state['max_exp_avg_sq']
# Remove old parameter from state
optimizer.state.pop(p)
# Weights
if p.dim()>1:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.torch.zeros_like(p)
new_exp_avg_sq = torch.torch.zeros_like(p)
new_max_exp_avg_sq = torch.torch.zeros_like(p)
p.grad[0:prev_grad.size(0),0:prev_grad.size(1)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_max_exp_avg_sq
new_param_group.append(p)
# Biases
else:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.zeros_like(p)
new_exp_avg_sq = torch.zeros_like(p)
new_max_exp_avg_sq = torch.zeros_like(p)
p.grad[0:prev_grad.size(0)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0)] = old_max_exp_avg_sq
new_param_group.append(p)
layer_idx += 1
optimizer.param_groups[0]['params'] = new_param_group
print(network)
# Train 50000 steps to show by adding neurons the task can be solved
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
I’m trying to get the same optimizer state, but with additional parameters for the added neurons. This seems like a convoluted way of doing it (and it doesn’t work:p). Does anyone know of an (easier) way to do this or see where I’m going wrong?
I have built this LSTM class:
import tensorflow as tf
import Constants
class LSTM():
def __init__(self,
inputShape,
outputShape,
numLayers=Constants.numLayers,
numHidden=Constants.numHidden,
learningRate=Constants.learningRate,
forgetBias=Constants.forgetBias):
self.inputs = tf.placeholder(tf.float32, [None] + inputShape)
self.labels = tf.placeholder(tf.float32, [None] + outputShape)
self.inputTensors = tf.unstack(self.inputs, axis=1)
self.weights = tf.Variable(tf.random_normal([numHidden] + outputShape))
self.bias = tf.Variable(tf.random_normal(outputShape))
layers = [tf.contrib.rnn.LSTMCell(numHidden, forget_bias=forgetBias, state_is_tuple=True)] * numLayers
self.cell = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)
self.optimiser = tf.train.GradientDescentOptimizer(learningRate)
self.forgetBias = forgetBias
self.batchDict = None
self.outputs = None
self.finalStates = None
self.predictions = None
self.loss = None
self.accuracy = None
self.optimise = None
self.session = tf.Session()
self.__buildGraph()
def __buildGraph(self):
outputs, finalStates = tf.nn.static_rnn(self.cell, self.inputTensors, dtype=tf.float32)
predictions = tf.add(tf.matmul(outputs[-1], self.weights), self.bias)
self.predictions = tf.minimum(tf.maximum(predictions, 0), 1)
self.loss = tf.losses.mean_squared_error(predictions=self.predictions, labels=self.labels)
self.accuracy = tf.reduce_mean(1 - tf.abs(self.labels - self.predictions) / 1.0)
self.optimise = self.optimiser.minimize(self.loss)
self.session.run(tf.global_variables_initializer())
def __execute(self, operation):
return self.session.run(operation, self.batchDict)
def setBatch(self, inputs, labels):
self.batchDict = {self.inputs: inputs, self.labels: labels}
def batchLabels(self):
return self.__execute(self.labels)
def batchPredictions(self):
return self.__execute(self.predictions)
def batchLoss(self):
return self.__execute(self.loss)
def batchAccuracy(self):
return self.__execute(self.accuracy)
def processBatch(self):
self.__execute(self.optimise)
def kill(self):
self.session.close()
and I run it like so:
import DataWorker
import Constants
from Model import LSTM
inputShape = [Constants.sequenceLength, DataWorker.numFeatures]
outputShape = [1]
LSTM = LSTM(inputShape, outputShape)
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer)
LSTM.setBatch(batchX, batchY)
LSTM.processBatch()
if batchNum % Constants.printStep == 0 or epochComplete:
print("Batch:\t\t", batchNum)
print("Last Pred:\t", LSTM.batchPredictions()[-1][0])
print("Last Label:\t", LSTM.batchLabels()[-1][0])
print("Loss:\t\t", LSTM.batchLoss())
print("Accuracy:\t", str("%.2f" % (LSTM.batchAccuracy() * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY = DataWorker.generateTestBatch()
LSTM.setBatchDict(testX, testY)
testAccuracy = LSTM.batchAccuracy()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
LSTM.kill()
This all works well as it should. However, I am using time series data which consists of financial stocks spanning over ranges of timestamps far greater than the number of time steps that my LSTM is unrolled for - Constants.sequenceLength. Because of this, it takes many sequential batches for a single stock t be processed, and so the state/memory of my LSTM needs to be passed between batches. As well as this, after a batch that completes the lifespan of an ID, the next batch would be passing in a new ID from the initial timestamp of my dataset, and so I would want to reset the memory.
There are many questions asking something similar, and all of the answers are adequate, however, none seem to address the issue of using variable batch sizes - batch sizes initialised to None and then inferred when a batch is passed in. My batches are usually a constant size, but do change under certain circumstances and I cannot change this. How can I have control over passing the state between batches, as well as resetting the state, if I have not specified the batch size?