I have a GAN that aims to reproduce the paper CycleGAN. However, even though everything is implemented correctly (as for some basic dataset it works), with the Facade Dataset the discriminator predicts the following:
print(f"Segm: {tf.reduce_mean(discriminator_segm(segmented_test[0:10])).numpy()}, {tf.reduce_mean(discriminator_segm(generator_real_to_segm(real_test[0:10]))).numpy()}")
print(f"Real: {tf.reduce_mean(discriminator_real(real_test[0:10])).numpy()}, {tf.reduce_mean(discriminator_real(generator_segm_to_real(segmented_test[0:10]))).numpy()}")
# Segm: 0.9463781714439392, 0.9564124941825867
# Real: 0.9635561108589172, 0.8240727782249451
In other words, it considers the generated output as good as the ground truth
This makes no sense, as the best guess, if the generated images were identical to the ground truth (which are not, are very far from being similar), should be to predict 0.5
At this point, I tried to train the discriminator many more times than the generator, with no difference (still get the accuracy near 90%)
If you need, even though it's a bit long, this is the training loop:
def train(EPOCHS = 400, batch_size = 4):
real_train_batched = real_train.reshape((-1, batch_size, *real_train.shape[1:]))
segmented_train_batched = segmented_train.reshape((-1, batch_size, *segmented_train.shape[1:]))
for e in range(EPOCHS):
print(f"Epoch: {e+1}/{EPOCHS}")
gen_perm = lambda : np.random.permutation(range(len(real_train_batched)))
perm = gen_perm()
real_train_batched_gen = real_train_batched[perm]
segmented_train_batched_gen = segmented_train_batched[perm]
real_train_batched_disc = real_train_batched[perm]
segmented_train_batched_disc = segmented_train_batched[perm]
pack = zip(real_train_batched_gen, segmented_train_batched_gen, real_train_batched_disc, segmented_train_batched_disc)
for i, (b_real_gen, b_segm_gen, b_real_disc, b_segm_disc) in enumerate(pack):
print(".", end="")
let_the_magic_happen(b_segm_gen, b_real_gen, b_segm_disc, b_real_disc, alpha=tf.constant(0.00000005))
#tf.function
def let_the_magic_happen_discriminator(batch_segmented_gen, batch_real_gen, batch_segmented_disc, batch_real_disc):
fake_real_images = generator_segm_to_real(batch_segmented_gen, training = False)
fake_segmented_images = generator_real_to_segm(batch_real_gen, training = False)
with tf.GradientTape(persistent=True) as disc_tape:
real_real_pred = discriminator_real(batch_real_disc, training=True)
real_fake_pred = discriminator_real(fake_real_images, training=True)
segm_real_pred = discriminator_segm(batch_segmented_disc, training=True)
segm_fake_pred = discriminator_segm(fake_segmented_images, training=True)
segm_disc_loss = discriminator_loss(segm_real_pred, segm_fake_pred)
real_disc_loss = discriminator_loss(real_real_pred, real_fake_pred)
segm_disc_grad = disc_tape.gradient(segm_disc_loss, discriminator_segm.trainable_weights)
real_disc_grad = disc_tape.gradient(real_disc_loss, discriminator_real.trainable_weights)
segm_disc_optimizer.apply_gradients(zip(segm_disc_grad , discriminator_segm.trainable_weights))
real_disc_optimizer.apply_gradients(zip(real_disc_grad , discriminator_real.trainable_weights))
#tf.function
def let_the_magic_happen_generator(batch_segmented_gen, batch_real_gen, alpha):
with tf.GradientTape(persistent=True) as gen_tape:
fake_real_images = generator_segm_to_real(batch_segmented_gen, training = True)
fake_segmented_images = generator_real_to_segm(batch_real_gen, training = True)
fake_real_images_pred = discriminator_real(fake_real_images, training=False)
fake_segmented_images_pred = discriminator_segm(fake_segmented_images, training=False)
batch_real_gen_reconstruction = generator_segm_to_real(fake_segmented_images, training=True)
batch_segmented_gen_reconstruction = generator_real_to_segm(fake_real_images, training=True)
reconstruction_loss = alpha * (
tf.keras.losses.MeanAbsoluteError()(batch_segmented_gen, batch_segmented_gen_reconstruction) +
tf.keras.losses.MeanAbsoluteError()(batch_real_gen, batch_real_gen_reconstruction)
)
# tf.print(reconstruction_loss)
# tf.print(generator_loss(fake_real_images_pred))
# tf.print(generator_loss(fake_segmented_images_pred))
segm_to_real_gen_loss = generator_loss(fake_real_images_pred) + reconstruction_loss
real_to_segm_gen_loss = generator_loss(fake_segmented_images_pred) + reconstruction_loss
real_to_segm_gen_grad = gen_tape.gradient(real_to_segm_gen_loss, generator_real_to_segm.trainable_weights)
segm_to_real_gen_grad = gen_tape.gradient(segm_to_real_gen_loss, generator_segm_to_real.trainable_weights)
real_to_segm_gen_optimizer.apply_gradients(zip(real_to_segm_gen_grad , generator_real_to_segm.trainable_weights))
segm_to_real_gen_optimizer.apply_gradients(zip(segm_to_real_gen_grad , generator_segm_to_real.trainable_weights))
def let_the_magic_happen(batch_segmented_gen, batch_real_gen, batch_segmented_disc, batch_real_disc, alpha):
let_the_magic_happen_discriminator(batch_segmented_gen, batch_real_gen, batch_segmented_disc, batch_real_disc)
if tf.random.uniform([1]) < 0.2 :
let_the_magic_happen_generator(batch_segmented_gen, batch_real_gen, alpha = alpha)
let_the_magic_happen_discriminator(batch_segmented_gen, batch_real_gen, batch_segmented_disc, batch_real_disc)
#tf.function
def generator_loss(fake_pred) :
return tf.keras.losses.BinaryCrossentropy()(tf.ones_like(fake_pred), fake_pred)
#tf.function
def discriminator_loss(real_pred, fake_pred):
return tf.keras.losses.BinaryCrossentropy()(
tf.concat((tf.ones_like(real_pred) - 5e-2, tf.zeros_like(fake_pred)),axis=0),
tf.concat((real_pred, fake_pred),axis=0),
)
The generators are 2 U-Net where the discriminator are 2 LeNet (also tried with PatchGAN, same thing happens)
I've also already tried with MAE instead of BCE
Related
I have an issue when calculating one of the losses of my GAN models in Tensorflow, when attempting to calculate it using MeanAbsoluteError(). I am well aware that the shapes need to match to be added together to produce a loss, but the confusing part is that after calling model.fit(), it proceeds well and doesn't throw an error, until the very end of the epoch.
I've been doing logging and verified that the shapes indeed do match:
07:40:43,483 root DEBUG Mean loss for base XY: (None, 64, 64, 64) and (None, 64, 64, 64)
07:40:43,500 root DEBUG Mean loss for base YX: (None, 64, 64, 64) and (None, 64, 64, 64)
07:40:43,516 root DEBUG Base MAE success
07:40:43,516 root DEBUG Actual value sample: Tensor("mean_absolute_error_1/weighted_loss/value:0", shape=(), dtype=float32)
This tells me that it indeed succeeded, and produced a single float32 value. After that, however, it appears to crash somewhere, and I'm guessing it's at the part where all the values are added together to a single gen_XY loss for the $X \rightarrow Y$ generator. But since we confirmed it is indeed a 1D value, this shouldn't be the case.
I've tried commenting out this specific loss (out of the several I'm using) and the issue doesn't persist, which confirms to me it's related to this specific loss.
The loss is a loss between the latent feature maps in the middle between the encoder and decoder part of the Generator network.
Here is my model definition, excluding individual definitions of Up, Down and ResNet blocks:
class LGan(Model):
def __init__(self, gen_XY, gen_YX, disc_X, disc_Y, lambda_cycle=10.0, lambda_identity=2.0, lambda_base=5.0, lambda_sim=2.0, lambda_adv=2.0):
super(LGan, self).__init__(name="BokorGan")
self.gen_XY = gen_XY
self.gen_YX = gen_YX
self.disc_X = disc_X
self.disc_Y = disc_Y
self.lambda_cycle = lambda_cycle
self.lambda_identity = lambda_identity
self.lambda_base = lambda_base
self.lambda_sim = lambda_sim
self.lambda_adv = lambda_adv
self.XY_loss = []
self.YX_loss = []
self.X_loss = []
self.Y_loss = []
self.epoch = 0
self.cc_loss = CycleLoss()
self.id_loss = IdentityLoss()
self.base_loss = MeanAbsoluteError()
self.sim_loss = SimilarityLoss()
self.gen_loss = None
self.disc_loss = None
def compile(self, gen_XY_optim, gen_YX_optim, disc_X_optim, disc_Y_optim, gen_loss, disc_loss):
super(LGan, self).compile()
self.gen_XY_optim = gen_XY_optim
self.gen_YX_optim = gen_YX_optim
self.disc_X_optim = disc_X_optim
self.disc_Y_optim = disc_Y_optim
self.gen_loss = gen_loss
self.disc_loss = disc_loss
def train_step(self, input_pair):
input_x, input_y = input_pair
with tf.GradientTape(persistent=True) as tape:
# Generator outputs
gen_x, gen_latent_y = self.gen_YX(input_y)
gen_y, gen_latent_x = self.gen_XY(input_x)
cycle_x, cycled_latent_y = self.gen_YX(gen_y)
cycle_y, cycled_latent_x = self.gen_XY(gen_x)
id_x, _ = self.gen_YX(input_x)
id_y, _ = self.gen_XY(input_y)
# Discriminator outputs
disc_true_x = self.disc_X(input_x)
disc_fake_x = self.disc_X(gen_x)
disc_true_y = self.disc_Y(input_y)
disc_fake_y = self.disc_Y(gen_y)
# Adversarial loss
adv_XY_loss = self.gen_loss(disc_fake_y)
adv_YX_loss = self.gen_loss(disc_fake_x)
# Cycle loss
cycle_XY_loss = self.cc_loss(input_y, cycle_y)
cycle_YX_loss = self.cc_loss(input_x, cycle_x)
# Identity loss
id_XY_loss = self.id_loss(input_y, id_y)
id_YX_loss = self.id_loss(input_x, id_x)
# Similarity loss
sim_XY_loss = self.sim_loss(input_y, cycle_y)
logger.debug(f"Sample inputs to similarity loss: {input_y.shape} and {cycle_y.shape}")
sim_YX_loss = self.sim_loss(input_x, cycle_x)
logger.debug(f"Actual value sample: {sim_XY_loss}")
# Base loss
logger.debug(f"Mean loss for base XY: {gen_latent_y.shape} and {cycled_latent_y.shape}")
base_XY_loss = self.base_loss(gen_latent_y, cycled_latent_y)
logger.debug(f"Mean loss for base YX: {gen_latent_x.shape} and {cycled_latent_x.shape}")
base_YX_loss = self.base_loss(gen_latent_x, cycled_latent_x)
logger.debug("Base success")
logger.debug(f"Actual value sample: {base_YX_loss}")
# Total XY loss
total_XY_loss = (
adv_XY_loss * self.lambda_adv
+ cycle_XY_loss * self.lambda_cycle
+ id_XY_loss * self.lambda_identity
+ sim_XY_loss * self.lambda_sim
+ base_XY_loss * self.lambda_base
)
# Total YX loss
total_YX_loss = (
adv_YX_loss * self.lambda_adv
+ cycle_YX_loss * self.lambda_cycle
+ id_YX_loss * self.lambda_identity
+ sim_YX_loss * self.lambda_sim
+ base_YX_loss * self.lambda_base
)
# Discriminator (X) loss
disc_X_loss = self.disc_loss(disc_true_x, disc_fake_x)
# Discriminator (Y) loss
disc_Y_loss = self.disc_loss(disc_true_y, disc_fake_y)
grads_XY = tape.gradient(total_XY_loss, self.gen_XY.trainable_variables)
grads_YX = tape.gradient(total_YX_loss, self.gen_YX.trainable_variables)
# Get the gradients for the discriminators
disc_X_grads = tape.gradient(disc_X_loss, self.disc_X.trainable_variables)
disc_Y_grads = tape.gradient(disc_Y_loss, self.disc_Y.trainable_variables)
# Update the weights of the generators
self.gen_XY_optim.apply_gradients(
zip(grads_XY, self.gen_XY.trainable_variables)
)
self.gen_YX_optim.apply_gradients(
zip(grads_YX, self.gen_YX.trainable_variables)
)
# Update the weights of the discriminators
self.disc_X_optim.apply_gradients(
zip(disc_X_grads, self.disc_X.trainable_variables)
)
self.disc_Y_optim.apply_gradients(
zip(disc_Y_grads, self.disc_Y.trainable_variables)
)
return {
"XY_loss": total_XY_loss,
"YX_loss": total_YX_loss,
"D_X_loss": disc_X_loss,
"D_Y_loss": disc_Y_loss,
}
I'm not sure why this error happens as it only happens on the very end of the epoch. According to the error, the part where the losses are calculated happens only once, and that is on the end of the epoch. I'm new to NN so I'm not sure why this happens since they're part of the same training step, and even part of the same gradient tape context.
I have build an image colorizer. It generates image quiet good, but I am afraid it generates the grayscale image. I don't know why, maybe the problem is with the model? Perhaps the loss ? Or maybe something else. The autoencoding part in Generator model works well, but doesn't produce colored image. Here is the code that you might need
def train(self,gray_scale_image_dataset,color_image_dataset,test_image):
generator = self.generator_model()
discriminator = self.discriminator_model()
gen_optimizer = tf.train.AdamOptimizer(self.learning_rate)
dis_optimizer = tf.train.AdamOptimizer(self.learning_rate)
for eachEpoch in range(self.epochs):
with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
for i in range(20):
random.shuffle(gray_scale_image_dataset)
random.shuffle(color_image_dataset)
gray_scale_dataset_image = gray_scale_image_dataset[:self.batch_size]
color_dataset_image_batch = color_image_dataset[:self.batch_size]
generated_image = generator(gray_scale_dataset_image)
real_output = discriminator(color_dataset_image_batch)
fake_output = discriminator(generated_image)
gen_loss = self.generator_loss(fake_output,generated_image,color_dataset_image_batch)
dis_loss = self.discriminator_loss(fake_output,real_output)
print("generator = {} discriminator = {}".format(gen_loss,dis_loss))
gen_gradients = gen_tape.gradient(gen_loss,generator.trainable_variables)
disc_gradients = disc_tape.gradient(dis_loss,discriminator.trainable_variables)
print("APPLYING GRADENTS")
gen_optimizer.apply_gradients(zip(gen_gradients, generator.trainable_variables))
dis_optimizer.apply_gradients(zip(disc_gradients, discriminator.trainable_variables))
print ("EPOCHS COMPLETED = {} ".format(eachEpoch))
#for drawing test_image
self.draw_images(generator,test_image)
The generator loss function has MSE and sigmoid cross entropy
def generator_loss(self,fake_output,generated_image, actual_image,regularizer_lambda=100):
mse = tf.reduce_mean(regularizer_lambda*tf.keras.losses.mean_absolute_error(generated_image, actual_image))
return tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.ones_like(fake_output)*0.9,logits = fake_output) + mse
The output is not colored ,here is the sample of output image from given grayscale image:
Short Description of my model
I am trying to write my own DQN algorithm in Python, using Tensorflow following the paper(Mnih et al., 2015). In train_DQN function, I have defined the training procedure, and DQN_CartPole is for defining the function approximation(simple 3-layered Neural Network). For loss function, Huber loss or MSE is implemented followed by the gradient clipping(between -1 and 1). Then, I have implemented soft-update method instead of hard-update of the target network by copying the weights in the main network.
Question
I am trying it on the CartPole environment(OpenAI gym), but the rewards does not improve as it does in other people's algorithms, such as keras-rl. Any help will be appreciated.
reward over timestep
If possible, could you have a look at the source code?
DQN model: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_model.py
Training Script: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_train.py
Reddit post: https://www.reddit.com/r/reinforcementlearning/comments/ba7o55/question_dqn_algorithm_does_not_work_well_on/?utm_source=share&utm_medium=web2x
class Parameters:
def __init__(self, mode=None):
assert mode != None
print("Loading Params for {} Environment".format(mode))
if mode == "Atari":
self.state_reshape = (1, 84, 84, 1)
self.num_frames = 1000000
self.memory_size = 10000
self.learning_start = 10000
self.sync_freq = 1000
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 1000
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
elif mode == "CartPole":
self.state_reshape = (1, 4)
self.num_frames = 10000
self.memory_size = 20000
self.learning_start = 100
self.sync_freq = 100
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 500
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
class _DQN:
"""
Boilerplate for DQN Agent
"""
def __init__(self):
"""
define the deep learning model here!
"""
pass
def predict(self, sess, state):
"""
predict q-values given a state
:param sess:
:param state:
:return:
"""
return sess.run(self.pred, feed_dict={self.state: state})
def update(self, sess, state, action, Y):
feed_dict = {self.state: state, self.action: action, self.Y: Y}
_, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
# print(action, Y, sess.run(self.idx_flattened, feed_dict=feed_dict))
return loss
class DQN_CartPole(_DQN):
"""
DQN Agent for CartPole game
"""
def __init__(self, scope, env, loss_fn ="MSE"):
self.scope = scope
self.num_action = env.action_space.n
with tf.variable_scope(scope):
self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="X")
self.Y = tf.placeholder(shape=[None], dtype=tf.float32, name="Y")
self.action = tf.placeholder(shape=[None], dtype=tf.int32, name="action")
fc1 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(self.state)
fc2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc1)
fc3 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc2)
self.pred = tf.keras.layers.Dense(self.num_action, activation=tf.nn.relu)(fc3)
# indices of the executed actions
self.idx_flattened = tf.range(0, tf.shape(self.pred)[0]) * tf.shape(self.pred)[1] + self.action
# passing [-1] to tf.reshape means flatten the array
# using tf.gather, associate Q-values with the executed actions
self.action_probs = tf.gather(tf.reshape(self.pred, [-1]), self.idx_flattened)
if loss_fn == "huber_loss":
# use huber loss
self.losses = tf.subtract(self.Y, self.action_probs)
self.loss = huber_loss(self.losses)
elif loss_fn == "MSE":
# use MSE
self.losses = tf.squared_difference(self.Y, self.action_probs)
self.loss = tf.reduce_mean(self.losses)
else:
assert False
# you can choose whatever you want for the optimiser
# self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
self.optimizer = tf.train.AdamOptimizer()
# to apply Gradient Clipping, we have to directly operate on the optimiser
# check this: https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#processing_gradients_before_applying_them
self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
self.clipped_grads_and_vars = [(ClipIfNotNone(grad, -1., 1.), var) for grad, var in self.grads_and_vars]
self.train_op = self.optimizer.apply_gradients(self.clipped_grads_and_vars)
def train_DQN(main_model, target_model, env, replay_buffer, policy, params):
"""
Train DQN agent which defined above
:param main_model:
:param target_model:
:param env:
:param params:
:return:
"""
# log purpose
losses, all_rewards, cnt_action = [], [], []
episode_reward, index_episode = 0, 0
with tf.Session() as sess:
# initialise all variables used in the model
sess.run(tf.global_variables_initializer())
state = env.reset()
start = time.time()
for frame_idx in range(1, params.num_frames + 1):
action = policy.select_action(sess, target_model, state.reshape(params.state_reshape))
cnt_action.append(action)
next_state, reward, done, _ = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
state = next_state
episode_reward += reward
if done:
index_episode += 1
state = env.reset()
all_rewards.append(episode_reward)
if frame_idx > params.learning_start and len(replay_buffer) > params.batch_size:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = target_model.predict(sess, next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = main_model.update(sess, states, actions, Y)
# Logging and refreshing log purpose values
losses.append(np.mean(loss))
logging(frame_idx, params.num_frames, index_episode, time.time()-start, episode_reward, np.mean(loss), cnt_action)
episode_reward = 0
cnt_action = []
start = time.time()
if frame_idx > params.learning_start and frame_idx % params.sync_freq == 0:
# soft update means we partially add the original weights of target model instead of completely
# sharing the weights among main and target models
if params.update_hard_or_soft == "hard":
sync_main_target(sess, main_model, target_model)
elif params.update_hard_or_soft == "soft":
soft_target_model_update(sess, main_model, target_model, tau=params.soft_update_tau)
return all_rewards, losses
Modification
dones -> np.logical_not(dones)
np.argmax -> np.max
separating MSE from huber_loss
Briefly looking over, it seems that the dones variable is a binary vector where 1 denotes done, and 0 denotes not-done.
You then use dones here:
Y = rewards + params.gamma * np.argmax(next_Q, axis=1) * dones
So for all terminating transitions, you add the expected cumulative reward when following the policy for the rest of the episode (which is zero). For all non-terminating transitions, you do not add the expect cumulative reward.
I think you mean to do this the other way around, perhaps swap dones in the above line of code with np.logical_not(dones)?
Also, now that I look at it, it seems there is another major problem with this line. np.argmax(next_Q, axis=1) returns the index of the maximum value in next_Q vector, not the actual maximum value. You need np.maximum(next_Q, axis=1) (IIRC) to get the maximum expected reward of the next state's actions.
EDIT: The loss function is also very strangely defined. You are mixing Huber Loss with Mean-Squared-Error. If you want to use either huber_loss or MSE, you just compute them on the difference between the expected and predicted values. You appear to be doing both, which is certainly not a commonly defined loss function. For example, your model loss to use Huber Loss should just be:
self.loss = tf.reduce_mean(huber_loss(abs(self.Y - self.action_probs)))
I have been using TensorFlow for a reasonable length of time now. and believed I had a thorough understanding of how a TensorFlow graph works and executes within a session. However, I have written all of my TensorFlow models in a script-like fashion as such:
import tensorflow as tf
import DataWorker
import Constants
x = tf.placeholder(tf.float32, [None, Constants.sequenceLength, DataWorker.numFeatures])
y = tf.placeholder(tf.float32, [None, 1])
xTensors = tf.unstack(x, axis=1) # [seqLength tensors of shape (batchSize, numFeatures)]
W = tf.Variable(tf.random_normal([Constants.numHidden, 1])) # Weighted matrix
b = tf.Variable(tf.random_normal([1])) # Bias
cell = tf.contrib.rnn.BasicLSTMCell(Constants.numHidden, forget_bias=Constants.forgetBias)
outputs, finalState = tf.nn.static_rnn(cell, xTensors, dtype=tf.float32)
# predictions = [tf.add(tf.matmul(output, W), b) for output in outputs] # List of predictions after each time step
prediction = tf.add(tf.matmul(outputs[-1], W), b) # Prediction after final time step
prediction = tf.tanh(prediction) # Activation
mse = tf.losses.mean_squared_error(predictions=prediction, labels=y) # Mean loss over entire batch
accuracy = tf.reduce_mean(1 - (tf.abs(y - prediction) / DataWorker.labelRange)) # Accuracy over entire batch
optimiser = tf.train.AdamOptimizer(Constants.learningRate).minimize(mse) # Backpropagation
with tf.Session() as session:
session.run(tf.global_variables_initializer())
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
dict = {x: batchX, y: batchY}
session.run(optimiser, dict)
if batchNum % 1000 == 0 or epochComplete:
batchLoss = session.run(mse, dict)
batchAccuracy = session.run(accuracy, dict)
print("Iteration:", batchNum)
print(batchLoss)
print(str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
testAccuracy = session.run(accuracy, {x: testX, y: testY})
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
But now, for practicality and readability, I want to implement my model as a class, but have encountered many problems with initializing my variables, etc.
This is the closest I have got to implementing the above example using my own LSTM class
Model.py
import tensorflow as tf
import Constants
import DataWorker # Remove this dependency
class LSTM():
"""docstring."""
def __init__(self,
inputDimensionList,
outputDimensionList,
numLayers=Constants.numLayers,
numHidden=Constants.numHidden,
learningRate=Constants.learningRate,
forgetBias=Constants.forgetBias
):
"""docstring."""
self.batchInputs = tf.placeholder(tf.float32, [None] + inputDimensionList)
self.batchLabels = tf.placeholder(tf.float32, [None] + outputDimensionList)
self.weightedMatrix = tf.Variable(tf.random_normal([numHidden] + outputDimensionList))
self.biasMatrix = tf.Variable(tf.random_normal(outputDimensionList))
self.cell = tf.contrib.rnn.BasicLSTMCell(numHidden, forget_bias=forgetBias)
self.numLayers = numLayers
self.numHidden = numHidden
self.learningRate = learningRate
self.forgetBias = forgetBias
self.batchDict = {}
self.batchInputTensors = None
self.batchOutputs = None # All needed as instance variables?
self.batchFinalStates = None
self.batchPredictions = None
self.batchLoss = None
self.batchAccuracy = None
self.initialised = False
self.session = tf.Session()
# Take in activation, loss and optimiser FUNCTIONS as args
def execute(self, command):
"""docstring."""
return self.session.run(command, self.batchDict)
def setBatchDict(self, inputs, labels):
"""docstring."""
self.batchDict = {self.batchInputs: inputs, self.batchLabels: labels}
self.batchInputTensors = tf.unstack(self.batchInputs, axis=1)
def processBatch(self):
"""docstring."""
self.batchOutputs, self.batchFinalState = tf.nn.static_rnn(self.cell, self.batchInputTensors, dtype=tf.float32)
pred = tf.tanh(tf.add(tf.matmul(self.batchOutputs[-1], self.weightedMatrix), self.biasMatrix))
mse = tf.losses.mean_squared_error(predictions=pred, labels=self.batchLabels)
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
if not self.initialised:
self.session.run(tf.global_variables_initializer())
self.initialised = True
with tf.variable_scope("model") as scope:
if self.initialised:
scope.reuse_variables()
self.execute(optimiser)
self.batchPredictions = self.execute(pred)
self.batchLoss = self.execute(tf.losses.mean_squared_error(predictions=self.batchPredictions, labels=self.batchLabels))
self.batchAccuracy = self.execute(tf.reduce_mean(1 - (tf.abs(self.batchLabels - self.batchPredictions) / DataWorker.labelRange)))
return self.batchPredictions, self.batchLabels, self.batchLoss, self.batchAccuracy
def kill(self):
"""docstring."""
self.session.close()
This class is quite messy, especially processBatch() as I have just been trying to get it to work before refining it.
I then run my model here:
Main.py
import DataWorker
import Constants
from Model import LSTM
inputDim = [Constants.sequenceLength, DataWorker.numFeatures]
outputDim = [1]
lstm = LSTM(inputDimensionList=inputDim, outputDimensionList=outputDim)
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
lstm.setBatchDict(batchX, batchY)
batchPredictions, batchLabels, batchLoss, batchAccuracy = lstm.runBatch()
if batchNum % 1000 == 0 or epochComplete:
print("Iteration:", batchNum)
print("Pred:", batchPredictions[-1], "\tLabel:", batchLabels[-1])
print("Loss:", batchLoss)
print("Accuracy:", str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
lstm.setBatchDict(testX, testY)
_, _, _, testAccuracy = lstm.runBatch()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
lstm.kill()
A single passthrough of the graph is executed fine, when all the variables are initialized, but it is on the second iteration where I get the error
ValueError: Variable rnn/basic_lstm_cell/kernel/Adam/ already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
I Googled this problem and learned that using scope.reuse_variables() should stop it trying to initialize the AdamOptimizer a second time, but cleary this isn't working how I have implemented it. How can I fix this issue?
As a side note, is my method of creating the TensorFlow session as an instance variable within my LSTM class acceptable, or should I create the session in Main and then pass it into the LSTM instance?
In general I wrap anything that creates variables under the hood with tf.make_template when doing object oriented model building.
However, you should avoid adding ops to the graph in a training loop, which looks like it's happening here. They will build up and cause problems, and likely give you incorrect results. Instead, define the graph (with inputs from tf.data, placeholders, or queues) and only loop over a session.run call. Even better, structure your code as an Estimator and this will be enforced.
I've setup a print statement and I've noticed that for the first batch when feeding an RNN, the embeddings exist, but after the second batch they don't and I get the following error:
ValueError: Variable RNNLM/RNNLM/Embedding/Adam_2/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
Here is my code for generating the embeddings:
def add_embedding(self):
with tf.device('/gpu:0'):
embedding = tf.get_variable("Embedding", [len(self.vocab), self.config.embed_size])
e_x = tf.nn.embedding_lookup(embedding, self.input_placeholder)
inputs = [tf.squeeze(s, [1]) for s in tf.split(1, self.config.num_steps, e_x)]
return inputs
Here is how the model is seutp, this is where I suspect the problem lies
def model(self, inputs):
with tf.variable_scope("input_drop"):
inputs_drop = [tf.nn.dropout(i, self.dropout_placeholder) for i in inputs]
with tf.variable_scope("RNN") as scope:
self.initial_state = tf.zeros([self.config.batch_size, self.config.hidden_size], tf.float32)
state = self.initial_state
states = []
for t, e in enumerate(inputs_drop):
print "t is {0}".format(t)
if t > 0:
scope.reuse_variables()
H = tf.get_variable("Hidden", [self.config.hidden_size, self.config.hidden_size])
I = tf.get_variable("I", [self.config.embed_size, self.config.hidden_size])
b_1 = tf.get_variable("b_1", (self.config.hidden_size,))
state = tf.sigmoid(tf.matmul(state, H) + tf.matmul(e, I) + b_1)
states.append(state)
with tf.variable_scope("output_dropout"):
rnn_outputs = [tf.nn.dropout(o, self.dropout_placeholder) for o in states]
return rnn_outputs
The issue arises when I get to the loss function, defined as follows
def add_training_op(self, loss):
opt = tf.train.AdamOptimizer(self.config.lr)
train_op = opt.minimize(loss)
return train_op
EDIT: Here is some updated code to help everyone out
def __init__(self, config):
self.config = config
self.load_data(debug=False)
self.add_placeholders()
self.inputs = self.add_embedding()
self.rnn_outputs = self.add_model(self.inputs)
self.outputs = self.add_projection(self.rnn_outputs)
self.predictions = [tf.nn.softmax(tf.cast(o, 'float64')) for o in self.outputs]
output = tf.reshape(tf.concat(1, self.outputs), [-1, len(self.vocab)])
self.calculate_loss = self.add_loss_op(output)
self.train_step = self.add_training_op(self.calculate_loss)
Here are the other methods here, pertaining to add_projection and calculate_loss so we can rule them out.
def add_loss_op(self, output):
weights = tf.ones([self.config.batch_size * self.config.num_steps], tf.int32)
seq_loss = tf.python.seq2seq.sequence_loss(
[output],
tf.reshape(self.labels_placeholder, [-1]),
weights
)
tf.add_to_collection('total_loss', seq_loss)
loss = tf.add_n(tf.get_collection('total_loss'))
return loss
def add_projection(self, rnn_outputs):
with tf.variable_scope("Projection", initializer=tf.contrib.layers.xavier_initializer()) as scope:
U = tf.get_variable("U", [self.config.hidden_size, len(self.vocab)])
b_2 = tf.get_variable("b_2", [len(self.vocab)])
outputs = [tf.matmul(x, U) + b_2 for x in rnn_outputs]
return outputs
def train_RNNLM():
config = Config()
gen_config = deepcopy(config)
gen_config.batch_size = gen_config.num_steps = 1
with tf.variable_scope('RNNLM') as scope:
model = RNNLM_Model(config)
# This instructs gen_model to reuse the same variables as the model above
scope.reuse_variables()
gen_model = RNNLM_Model(gen_config)
init = tf.initialize_all_variables()
saver = tf.train.Saver()
with tf.Session() as session:
best_val_pp = float('inf')
best_val_epoch = 0
session.run(init)
for epoch in xrange(config.max_epochs):
print 'Epoch {}'.format(epoch)
start = time.time()
###
train_pp = model.run_epoch(
session, model.encoded_train,
train_op=model.train_step)
valid_pp = model.run_epoch(session, model.encoded_valid)
print 'Training perplexity: {}'.format(train_pp)
print 'Validation perplexity: {}'.format(valid_pp)
if valid_pp < best_val_pp:
best_val_pp = valid_pp
best_val_epoch = epoch
saver.save(session, './ptb_rnnlm.weights')
if epoch - best_val_epoch > config.early_stopping:
break
print 'Total time: {}'.format(time.time() - start)
Seems that the code is trying to create a new Adam Variable in each batch.
Possible that the add_training_op is called twice?
Also, the snippet of def add_training_op is incomplete since there is no return statement.
The problem turned out to be the following line of code:
model = RNNLM_Model(config)
# This instructs gen_model to reuse the same variables as the model above
scope.reuse_variables()
gen_model = RNNLM_Model(gen_config)
It turns out that the second model was an issue by using reuse_variables(). By removing this line by issues went away.