So far I have been experimenting with TensorFlow and Keras. I took code from image_ocr.py, which let me train a printed-text OCR model. I want to watch the training progress as it goes, and I have successfully visualized the accuracy and loss of the training model. However, from what I have heard, an OCR RNN should not be validated on plain accuracy; the mean edit distance is used instead to judge the quality of the predicted words. So I have been trying to get the variables mean_ed and mean_norm_ed from the VizCallback class visualized in TensorBoard. I have tried the method from this link, but it still does not work. Can anyone help me with visualizing the mean edit distance variables? Here are the code snippets from my code:
class VizCallback(keras.callbacks.Callback):

    def __init__(self, run_name, test_func, text_img_gen, num_display_words=6):
        self.test_func = test_func
        self.output_dir = os.path.join(OUTPUT_DIR, run_name)
        self.text_img_gen = text_img_gen
        self.num_display_words = num_display_words
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

    def on_train_begin(self, logs={}):
        self.med = []
        self.nmed = []

    def show_edit_distance(self, num, logs={}):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        # Create scalar summaries for both mean edit distance and normalized mean edit distance
        tf_med_ph = tf.placeholder(tf.float32, shape=None, name='med_summary')
        tf_nmed_ph = tf.placeholder(tf.float32, shape=None, name='nmed_summary')
        tf_med = tf.summary.scalar('med', tf_med_ph)
        tf_nmed = tf.summary.scalar('nmed', tf_nmed_ph)
        performance_summaries = tf.summary.merge([tf_med, tf_nmed])
        # Create a session for displaying the summary
        config = tf.ConfigProto(allow_soft_placement=True)
        session = tf.InteractiveSession(config=config)
        summ_writer = tf.summary.FileWriter(os.path.join('summaries', 'first'), session.graph)
        # Execute the summaries defined above
        summ = session.run(performance_summaries, feed_dict={tf_med_ph: mean_ed, tf_nmed_ph: mean_norm_ed})
        # Write the obtained summaries to the file, so they can be displayed in TensorBoard
        # (note: `epoch` is not defined in this method's scope, which is part of my problem)
        summ_writer.add_summary(summ, epoch)
        session.close()
        print('\nOut of %d samples:  Mean edit distance: %.3f  Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))

    def on_epoch_end(self, epoch, logs={}):
        self.model.save_weights(os.path.join(self.output_dir, 'weights%02d.h5' % (epoch)))
        self.show_edit_distance(256)
        word_batch = next(self.text_img_gen)[0]
        res = decode_batch(self.test_func, word_batch['the_input'][0:self.num_display_words])
        if word_batch['the_input'][0].shape[0] < 256:
            cols = 2
        else:
            cols = 1
        for i in range(self.num_display_words):
            plt.subplot(self.num_display_words // cols, cols, i + 1)
            if K.image_data_format() == 'channels_first':
                the_input = word_batch['the_input'][i, 0, :, :]
            else:
                the_input = word_batch['the_input'][i, :, :, 0]
            plt.imshow(the_input.T, cmap='Greys_r')
            plt.xlabel('Truth = \'%s\'\nDecoded = \'%s\'' % (word_batch['source_str'][i], res[i]))
        fig = plt.gcf()
        fig.set_size_inches(10, 13)
        plt.savefig(os.path.join(self.output_dir, 'e%02d.png' % (epoch)))
        plt.close()
def train(run_name, start_epoch, stop_epoch, img_w):
    # Input Parameters
    img_h = 64
    words_per_epoch = 16000
    val_split = 0.2
    val_words = int(words_per_epoch * (val_split))
    # Network parameters
    conv_filters = 16
    kernel_size = (3, 3)
    pool_size = 2
    time_dense_size = 32
    rnn_size = 512
    minibatch_size = 32
    if K.image_data_format() == 'channels_first':
        input_shape = (1, img_w, img_h)
    else:
        input_shape = (img_w, img_h, 1)
    fdir = os.path.dirname(get_file('wordlists.tgz',
                                    origin='http://test.com/wordlist.tgz', untar=True))
    img_gen = TextImageGenerator(monogram_file=os.path.join(fdir, 'wordlist_mono_clean.txt'),
                                 bigram_file=os.path.join(fdir, 'wordlist_bi_clean.txt'),
                                 minibatch_size=minibatch_size,
                                 img_w=img_w,
                                 img_h=img_h,
                                 downsample_factor=(pool_size ** 2),
                                 val_split=words_per_epoch - val_words
                                 )
    act = 'relu'
    input_data = Input(name='the_input', shape=input_shape, dtype='float32')
    inner = Conv2D(conv_filters, kernel_size, padding='same',
                   activation=act, kernel_initializer='he_normal',
                   name='conv1')(input_data)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(inner)
    inner = Conv2D(conv_filters, kernel_size, padding='same',
                   activation=act, kernel_initializer='he_normal',
                   name='conv2')(inner)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner)
    conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters)
    inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)
    # cuts down input size going into RNN:
    inner = Dense(time_dense_size, activation=act, name='dense1')(inner)
    # Two layers of bidirectional GRUs
    # GRU seems to work as well, if not better, than LSTM:
    gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner)
    gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(inner)
    gru1_merged = add([gru_1, gru_1b])
    gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)
    # transforms RNN output to character activations:
    inner = Dense(img_gen.get_output_size(), kernel_initializer='he_normal',
                  name='dense2')(concatenate([gru_2, gru_2b]))
    y_pred = Activation('softmax', name='softmax')(inner)
    Model(inputs=input_data, outputs=y_pred).summary()
    labels = Input(name='the_labels', shape=[img_gen.absolute_max_string_len], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    # Keras doesn't currently support loss funcs with extra parameters,
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
    # clipnorm seems to speed up convergence
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
    # Make TensorBoard instance
    init_op = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init_op)
    tbname = "tensorboard-of-{}".format(int(time.time()))
    tensorboard = keras.callbacks.TensorBoard(
        log_dir="logs/{}".format(tbname),
        histogram_freq=0,
        write_images=True)
    # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd,
                  metrics=['accuracy'])
    if start_epoch > 0:
        weight_file = os.path.join(OUTPUT_DIR, os.path.join(run_name, 'weights%02d.h5' % (start_epoch - 1)))
        model.load_weights(weight_file)
    # captures output of softmax so we can decode the output during visualization
    test_func = K.function([input_data], [y_pred])
    viz_cb = VizCallback(run_name, test_func, img_gen.next_val())
    model.fit_generator(generator=img_gen.next_train(),
                        steps_per_epoch=(words_per_epoch - val_words) // minibatch_size,
                        epochs=stop_epoch,
                        validation_data=img_gen.next_val(),
                        validation_steps=val_words // minibatch_size,
                        callbacks=[tensorboard, viz_cb, img_gen],
                        initial_epoch=start_epoch)
Any help would be much appreciated. Thank you!
P.S. I am using TensorFlow 1.9.0 and Python 3.6.8
UPDATE
Now it is just a matter of passing the variable performance_summaries from the VizCallback class to the metrics in the train function. Any help here?
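For reference, here is the direction I am exploring (an untested sketch; the idea is to build the summary ops once in __init__ instead of recreating them on every call, and to reuse a single FileWriter):

class VizCallback(keras.callbacks.Callback):

    def __init__(self, run_name, test_func, text_img_gen, num_display_words=6):
        ...
        # Build the summary graph once, not on every call:
        self.tf_med_ph = tf.placeholder(tf.float32, shape=None, name='med_summary')
        self.tf_nmed_ph = tf.placeholder(tf.float32, shape=None, name='nmed_summary')
        med_summ = tf.summary.scalar('med', self.tf_med_ph)
        nmed_summ = tf.summary.scalar('nmed', self.tf_nmed_ph)
        self.performance_summaries = tf.summary.merge([med_summ, nmed_summ])
        self.summ_writer = tf.summary.FileWriter(os.path.join('summaries', run_name))

    def show_edit_distance(self, num, epoch):
        ...  # compute mean_ed and mean_norm_ed as before
        # Evaluate the pre-built ops in Keras' own session instead of opening a new one:
        summ = K.get_session().run(self.performance_summaries,
                                   feed_dict={self.tf_med_ph: mean_ed,
                                              self.tf_nmed_ph: mean_norm_ed})
        self.summ_writer.add_summary(summ, epoch)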
You could modify show_edit_distance to add the summaries every time the function is called:
def show_edit_distance(self, num, epoch):
    ...
    summary = tf.Summary()
    summary.value.add(tag='mean_ed', simple_value=mean_ed)
    summ_writer.add_summary(summary, epoch)
    summary = tf.Summary()
    summary.value.add(tag='mean_norm_ed', simple_value=mean_norm_ed)
    summ_writer.add_summary(summary, epoch)
    ...
Note that you will need an extra argument epoch:
def on_epoch_end(self, epoch, logs={}):
    ...
    self.show_edit_distance(256, epoch)
    ...
TensorBoard should pick up these summaries from the event files written by summ_writer; just point it at that log directory.
NOTE: Unfortunately, I couldn't test the solution. Please let me know if there is something I am missing.
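For completeness, the same idea written out as a single method (under the same untested caveat; self.summ_writer would be a tf.summary.FileWriter created once, e.g. in __init__):

def show_edit_distance(self, num, epoch):
    ...  # compute mean_ed and mean_norm_ed exactly as in the question
    # A tf.Summary proto needs no placeholders, summary ops, or extra session:
    summary = tf.Summary()
    summary.value.add(tag='mean_ed', simple_value=mean_ed)
    summary.value.add(tag='mean_norm_ed', simple_value=mean_norm_ed)
    self.summ_writer.add_summary(summary, epoch)
    self.summ_writer.flush()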
Related
I tried to implement a conditional variational autoencoder, based on the variational autoencoder example on the Keras website: https://keras.io/examples/generative/vae/
I added a second input to the model, but I don't know how to fit two inputs to the encoder. I think the custom train_step() function has a problem and should be updated to match the encoder, which has two inputs (see my sketch after the script below).
I got this error:
ValueError: Layer decoder expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'encoder/12_enc_sampling/add:0' shape=(8, 8) dtype=float32>]
and here is my script:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.utils import plot_model
from VAEfunctions import track_loss, sample_decoding, test, latent_space_mds
# Add Graphviz to windows PATH for plot_model
import os
# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
print("tensorflow version:",tf.__version__)
# =====================================Set initial parameters ==============================
epoch = 3
batch_size = 8
latent_dim = 8
random_sample_points = 3 #Number of random sample points from latent space
# ==========================================================================
# for seed_value in range(7, 8): # change the seed to find the best seed_value
# Seed value
seed_value = 7 #7 #8good
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)
# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)
# for later versions:
# tf.compat.v1.set_random_seed(seed_value)
# 5. Configure a new global `tensorflow` session
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)
# ========================================Load data===============================
x_train = np.load('data_x_train_3D_all_par_3D_800_64_64_64_1.npy') # load data, #shape:(800,64,64,64,1)
x_test = np.load('data_x_test_3D_all_par_3D_200_64_64_64_1.npy')
x = np.load('data_x_3D_all_par_3D_1000_64_64_64_1.npy')
input_num = x_train.shape[0]
input_d1 = x_train.shape[1]
input_d2 = x_train.shape[2]
input_d3 = x_train.shape[3]
input_channels = x_train.shape[4]
# ================================== Load one_hot_vector (Parameters) ===================
one_hot_vec_train = np.load('one_hot_vec.npy')[:800]
print(one_hot_vec_train.shape)
# ======================View a few images=======================
print("min of x_train is:", np.min(x_train), "max of x_train is:", np.max(x_train))
print("min of x is:", np.min(x), "max of x is:", np.max(x))
plt.figure(10)
fig, axes = plt.subplots(2, 2)
# fig.tight_layout()
for ax in axes.flat:
    i = 1
    im = ax.imshow(x_train[100*i][31][:][:], vmin=0, vmax=1)
    i = i + 1
cbar_ax = fig.add_axes([0.9, 0.15, 0.03, 0.70])
fig.colorbar(im, cax=cbar_ax)
plt.suptitle('Slide[31] of four input samples')
plt.show()
""""****************************** Variational Auto Encoder Model *************************"""
# ======================================= sampling layer ===============================================
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    # tf.random.set_seed(0)

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon
# ===================== Define the VAE as a `Model` with a custom `train_step` ==============================
class CVAE(keras.Model):
    # tf.random.set_seed(0)

    def __init__(self, encoder, decoder, **kwargs):
        super(CVAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
        self.rec_tracker = keras.metrics.Mean(name="rec")  ####

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
            self.rec_tracker,  ####
        ]

    def train_step(self, data):
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    keras.losses.mean_squared_error(data, reconstruction), axis=(1, 2, 3)
                )
            )
            rec = tf.reduce_mean(  #####
                tf.reduce_mean(
                    keras.losses.mean_squared_error(data, reconstruction), axis=(1, 2, 3)
                )
            )
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        self.rec_tracker.update_state(rec)  ####
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
            "rec": self.rec_tracker.result()  #####
        }
# ===================================== Build the encoder ====================================
input_shape = (input_d1, input_d2, input_d3, input_channels) # input_shape=(64, 64, 64, 1)
encoder_input = keras.Input(shape=input_shape, name="enc_input")
input_one_hot_vec = keras.Input(shape=(27, ), name="one_hot_vec_input") # one_hot_vect input layer shape=(27)
x = layers.Conv3D(32, 3, activation="relu", padding="same", name="1_enc_conv")(encoder_input) # delete strides=2
x = layers.MaxPooling3D(name="2_enc_MaxPool")(x)
x = layers.Dropout(0.2)(x)
x = layers.Conv3D(32, 3, activation="relu", padding="same", name="3_enc_conv")(x)
x = layers.MaxPooling3D(name="4_enc_MaxPool")(x)
x = layers.Dropout(0.2)(x)
x = layers.Conv3D(32, 3, activation="relu", padding="same", name="5_enc_conv")(x)
x = layers.MaxPooling3D(name="6_enc_MaxPool")(x)
x = layers.Dropout(0.2)(x)
x = layers.Conv3D(32, 3, activation="relu", padding="same", name="7_en_conv")(x)
enc_last_conv = x.shape
x = layers.Flatten(name="8_enc_flat")(x)
# ============Add one_hot_vector here: ==============
concat_layer_enc = layers.concatenate([x, input_one_hot_vec], axis=1) # ====== concat one_hot_vec here
x = layers.Dense(16, activation="relu", name="9_enc_dense")(concat_layer_enc)
z_mean = layers.Dense(latent_dim, name="10_enc_z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="11_enc_z_log_var")(x)
z = Sampling(name="12_enc_sampling")([z_mean, z_log_var])
encoder = keras.Model([encoder_input, input_one_hot_vec], [z_mean, z_log_var, z], name="encoder")
print(encoder.summary())
plot_model(encoder, to_file='Plot_Model/CVAE/encoder.jpg', show_shapes=True)
print("encoder model created")
# ===================================== Build the decoder ====================
latent_input = keras.Input(shape=(latent_dim,), name="dec_input")
concat_layer_dec = layers.concatenate([latent_input, input_one_hot_vec], axis=1) # ======= concat one_hot_vec here
x = layers.Dense(enc_last_conv[1] * enc_last_conv[2] * enc_last_conv[3] * enc_last_conv[4], activation="relu", name="1_dec_dense")(concat_layer_dec)
x = layers.Reshape((enc_last_conv[1], enc_last_conv[2], enc_last_conv[3], enc_last_conv[4]), name="2_dec")(x)
x = layers.Conv3DTranspose(32, 3, activation="relu", padding="same", name="3_dec_conv")(x)
x = layers.UpSampling3D(name="4_dec_MaxPool")(x)
x = layers.Conv3DTranspose(32, 3, activation="relu", padding="same", name="5_dec_conv")(x)
x = layers.UpSampling3D(name="6_dec_MaxPool")(x)
x = layers.Conv3DTranspose(32, 3, activation="relu", padding="same", name="7_dec_conv")(x)
x = layers.UpSampling3D(name="8_dec_MaxPool")(x)
x = layers.Conv3DTranspose(32, 3, activation="relu", padding="same", name="9_dec_conv")(x)
decoder_outputs = layers.Conv3DTranspose(1, 3, activation="sigmoid", padding="same", name="10_dec_conv_out")(x)
decoder = keras.Model([latent_input, input_one_hot_vec], decoder_outputs, name="decoder")
print(decoder.summary())
plot_model(decoder, to_file='Plot_Model/CVAE/decoder.jpg', show_shapes=True)
# =========================================== **Train the CVAE** ==================================================
random.seed(seed_value)
cvae = CVAE(encoder, decoder)
cvae.compile(optimizer=keras.optimizers.Adam())
history = cvae.fit([x_train, one_hot_vec_train], epochs=epoch, batch_size=batch_size)
track_loss(history, epoch, batch_size, latent_dim, random_sample_points, seed_value)
random_sample_list = latent_space_mds(cvae, x_train, epoch, latent_dim, random_sample_points, seed_value)
sample_decoding(cvae, epoch, latent_dim, random_sample_points, random_sample_list, seed_value)
test(cvae, x_test, 4, epoch, latent_dim)
# end of seed loop
print("finish")
Here are the encoder and decoder models I created with plot_model (the plots are omitted here). Thanks!
I'm a newbie in PyTorch. Can someone help? I am trying to teach a neural network to play Tetris, but I can't understand why the weights don't change.
Neural Network:
class CNN(Module):
    # define model elements
    def __init__(self, n_channels):
        super(CNN, self).__init__()
        self.number_of_actions = 5
        self.gamma = 0.01
        self.final_epsilon = 0.0001
        self.initial_epsilon = 0.1
        self.number_of_iterations = 2000000
        self.replay_memory_size = 10000
        self.minibatch_size = 32
        # input to first hidden layer
        # self.hidden1 = Linear(n_channels, 50)
        self.hidden1 = Conv2d(1, 1, (22, 10))
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        # self.act1 = ReLU()
        self.pool1 = MaxPool2d(2, 2)
        # second hidden layer
        self.hidden2 = Linear(1, 30)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # fully connected layer
        self.hidden3 = Linear(30, 10)
        kaiming_uniform_(self.hidden3.weight, nonlinearity='relu')
        self.act3 = ReLU()
        # output layer
        self.hidden4 = Linear(10, 1)
        xavier_uniform_(self.hidden4.weight)
        self.act4 = Softmax(dim=1)

    # forward propagate input
    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.pool1(X)
        # second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # third hidden layer
        X = self.hidden3(X)
        X = self.act3(X)
        # output layer
        X = self.hidden4(X)
        X = self.act4(X)
        return X
Learning loop:
reward = torch.tensor([[0.]], requires_grad=True)
inputs = torch.tensor([[self.BoardMatrix]])
outputs = model(inputs)
# compute the model output
state_action_values = model(inputs)
replay_memory.append(self.BoardMatrix)
if len(replay_memory) > model.replay_memory_size:
    replay_memory.pop(0)
if self.isWaitingAfterLine:
    self.isWaitingAfterLine = False
    self.newPiece()
else:
    # learning
    if state_action_values[0][0][0][0] > 0.6:
        self.action("left")
        action_batch = 0.8
    elif state_action_values[0][0][0][0] > 0.2:
        self.action("right")
        action_batch = 0.6
    elif state_action_values[0][0][0][0] > -0.2:
        self.action("down")
        action_batch = 0.4
    elif state_action_values[0][0][0][0] > -0.6:
        self.action("up")
        action_batch = 0.2
    elif state_action_values[0][0][0][0] > -1:
        self.action("space")
        action_batch = 0
    self.oneLineDown()
    minibatch = random.sample(replay_memory, min(len(replay_memory), model.minibatch_size))
    criterion: MSELoss = nn.MSELoss()
    reward_batch = reward
    y_batch = torch.cat(tuple(reward_batch if minibatch[i][4]
                              else reward_batch + model.gamma * torch.max(state_action_values[i])
                              for i in range(len(minibatch))))
    y_batch = y_batch.detach()
    q_value = torch.sum(outputs * action_batch, dim=1)
    optimizer.zero_grad()
    loss = criterion(q_value, y_batch[0])
    # credit assignment
    loss.backward()
    optimizer.step()
    print(model.hidden1.weight)
# this trailing `else` pairs with an enclosing `if` in timerEvent that is not shown here:
else:
    super(Board, self).timerEvent(event)
NN and optimizer initialization:
model = CNN(boardSize)
criterion = CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-6)
I am working from this tutorial: https://www.toptal.com/deep-learning/pytorch-reinforcement-learning-tutorial, but I tried to adapt it for Tetris.
P.S. I also found out that y_batch never changes, but I don't know why.
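While debugging I put together this standalone check, which may be related (it is separate from the training code above): hidden4 leaves a single unit, and a Softmax over a dimension of size 1 always returns 1.0, so the output cannot depend on the weights at all:

import torch
from torch.nn import Softmax

# After hidden4 the tensor has one unit in dim 1, and Softmax(dim=1)
# normalizes over that size-1 dimension, so every element becomes
# exp(x) / exp(x) == 1.0.
x = torch.randn(1, 1, 3, 3, requires_grad=True)
y = Softmax(dim=1)(x)
print(y)            # all ones, no matter what x is
y.sum().backward()
print(x.grad)       # all zeros -> no gradient flows back to the weights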
I am trying to build a model that classifies images from the Belgium traffic signs dataset, but Spyder freezes when I try to run the optimizer and train operation in sess.run(). I have attached the code below.
graph = tf.Graph()
with graph.as_default():
    images_X = tf.compat.v1.placeholder(tf.float32, shape=[None, 32, 32, 3])
    labels_X = tf.compat.v1.placeholder(tf.int32, shape=[None])
    # biasInit = tf.constant_initializer(0.1, dtype=tf.float32)
    # Initializer
    biasInit = tf.initializers.GlorotUniform()
    # conv layer 1 - num_filters = 128, kernel size = [6,6]
    conv_1 = Conv2D(filters=128, kernel_size=[6, 6], bias_initializer=biasInit)(images_X)
    # batch normalization
    bn_1 = BatchNormalization(center=True, scale=True)(conv_1)
    # max pooling
    pool_1 = MaxPooling2D(pool_size=(2, 2))(bn_1)
    # conv layer 2
    conv_2 = Conv2D(filters=256, kernel_size=[6, 6], strides=(2, 2),
                    bias_initializer=biasInit)(pool_1)
    # batch normalization 2
    bn_2 = BatchNormalization(center=True, scale=True)(conv_2)
    pool_2 = MaxPooling2D(pool_size=(2, 2))(bn_2)
    # flatten
    images_flat = Flatten()(pool_2)
    # dense layer - units = 512
    fc_1 = Dense(units=512, activation='relu')(images_flat)
    bn_3 = BatchNormalization(center=True, scale=True)(fc_1)
    dropout = Dropout(0.25)(bn_3)
    # logits will be of the size [None, 62]
    logits = Dense(units=62, activation='relu')(dropout)
    # converting the logits - [None, 62] - to labels - [None]
    predicted_labels = tf.argmax(logits, axis=1)
    loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                            labels=labels_X))
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        # Create an optimizer, which acts as the training op.
        train = tf.compat.v1.train.AdamOptimizer(learning_rate=0.10).minimize(loss_op)
    init_op = tf.compat.v1.global_variables_initializer()
print("images_flat: ", images_flat)
print("logits: ", logits)
print("loss: ", loss_op)
print("predicted_labels: ", predicted_labels)
# Create a session to run the graph we created.
session = tf.compat.v1.Session(graph=graph, config=tf.compat.v1.ConfigProto(log_device_placement=True))
session.run(init_op)
for i in range(10):
    _, loss_value = session.run([train, loss_op], feed_dict={images_X: images_array, labels_X: labels_array})
    print("Loss: ", loss_value)
When I run the program line by line, it runs well until the last for loop; as soon as it reaches that loop, memory usage goes to 99% and the whole PC freezes.
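One thing I still want to try is feeding mini-batches instead of the full arrays, since a feed_dict holding the entire dataset may be what exhausts memory (a sketch; the batch size is arbitrary):

batch_size = 128  # arbitrary; anything that fits in memory
num_examples = images_array.shape[0]
for i in range(10):
    for start in range(0, num_examples, batch_size):
        # Feed one slice at a time instead of the whole dataset, so each
        # session.run call has a bounded memory footprint.
        batch_images = images_array[start:start + batch_size]
        batch_labels = labels_array[start:start + batch_size]
        _, loss_value = session.run([train, loss_op],
                                    feed_dict={images_X: batch_images,
                                               labels_X: batch_labels})
    print("Loss: ", loss_value)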
I am kind of a beginner with RNNs, so I coded an LSTM architecture using PyTorch, but I always run out of RAM in the 3rd epoch. I am already using a DataLoader, and I tried to detach the gradient from the input tensor, but that doesn't solve the problem.
This is my training loop:
writer = SummaryWriter()
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=0)
optimizer = optim.Adam(lstm.parameters(), lr=1e-5)
gradient_clip = clip_grad_norm_(lstm.parameters(), max_norm=5)
num_epochs = 20
epoch_loss = -1.0
loss = -1
t = trange(num_epochs, desc="Epoch loss", leave=True)
for epoch in t:
    trainLoader = iter(DataLoader(dataset, batch_size=batch_size))
    tt = trange(len(trainLoader) - 1, desc="Batch loss", leave=True)
    for i in tt:
        text, embedding = next(trainLoader)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        y = lstm.forward(embedding.transpose(1, 0))
        labels = text.transpose(0, 1)[1:].transpose(0, 1).flatten()
        loss = criterion(y.reshape(-1, y.shape[-1]), labels)
        tt.set_description("Batch loss : %.4f" % loss)
        tt.refresh()
        loss.backward(retain_graph=True)  # retain_graph keeps the whole graph alive across batches
        optimizer.step()
        epoch_loss += loss  # accumulating the tensor (rather than loss.item()) also keeps each batch's graph in memory
    epoch_loss = epoch_loss / (len(trainLoader) - 1)
    # Saving model
    save_date = datetime.now().strftime("%d%m%Y-%H:%M:%S")
    PATH = './save/lstm_model_' + save_date
    torch.save(lstm, PATH)
    # Updating progression bar
    t.set_description("Epoch loss : %.4f" % epoch_loss)
    t.refresh()
    # Plotting gradient histograms in TensorBoard
    writer.add_scalar('Text_generation_Loss/train', epoch_loss, epoch)
    for tag, parm in lstm.named_parameters():
        with torch.no_grad():
            writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)
    writer.flush()
print('Finished Training')
writer.close()
And this is the LSTM class that I built:
class LSTM(nn.Module):

    def __init__(self, in_size: int, hidden_size: int):
        super().__init__()
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.W_fi = nn.Linear(in_size, hidden_size)
        self.W_fh = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_ii = nn.Linear(in_size, hidden_size)
        self.W_ih = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_Ci = nn.Linear(in_size, hidden_size)
        self.W_Ch = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_oi = nn.Linear(in_size, hidden_size)
        self.W_oh = nn.Linear(hidden_size, hidden_size, bias=False)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

    def one_step(self, x, h, C):
        f_t = self.sigmoid(self.W_fi(x) + self.W_fh(h))
        i_t = self.sigmoid(self.W_ii(x) + self.W_ih(h))
        g_t = self.tanh(self.W_Ci(x) + self.W_Ch(h))
        C_t = torch.mul(f_t, C) + torch.mul(i_t, g_t)
        o_t = self.sigmoid(self.W_oi(x) + self.W_oh(h))
        h_t = torch.mul(o_t, self.tanh(C_t))
        return h_t, C_t

    def forward(self, X):
        h_out = []
        h = -torch.ones(X.shape[1], self.hidden_size)
        C = -torch.ones(X.shape[1], self.hidden_size)
        h_t, C_t = self.one_step(X[0], h, C)
        h_out.append(h_t)
        for i in range(1, X.shape[0] - 1):
            h_t, C_t = self.one_step(X[i], h_t, C_t)
            h_out.append(h_t)
        h_out = torch.cat(h_out)
        return h_out  # h_out.reshape(-1, batch_size, num_embeddings)
I already searched for a similar case but wasn't able to find a solution.
I don't know if it will help somebody, but I solved the problem. I perhaps wasn't clear about the task: the goal was text generation. The first thing I was doing was embedding the sentences using torch.nn.Embedding, which was defined outside my LSTM. The solution was to include it as a layer of my network, since the embedding is not a pretrained one and should be learned too.
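A minimal sketch of what that change looks like (the name, vocab_size and embedding_dim are illustrative and depend on the dataset):

class LSTMWithEmbedding(nn.Module):

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_size: int):
        super().__init__()
        # The embedding is now a layer of the model itself, so its weights
        # are returned by .parameters() and trained with everything else.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = LSTM(embedding_dim, hidden_size)  # the LSTM class above

    def forward(self, tokens):                 # tokens: (seq_len, batch)
        embedded = self.embedding(tokens)      # (seq_len, batch, embedding_dim)
        return self.lstm(embedded)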
I am trying to implement the deep deterministic policy gradient (DDPG) method in TensorFlow and Keras; however, I seem to be stuck. There seems to be no learning occurring: the actions taken by the model do not change at all, and the gradient applied to the actor network is very small as well (on the order of 1e-5). I have used another implementation as a reference, and it runs very well with the exact same hyperparameters and network architectures (except that it is implemented with tflearn and includes batch normalization layers), which makes me believe that there is a mistake somewhere in my code. Maybe someone here can spot it.
Thank you for your time!
Edit: I think the reason for the bad performance is that the critic network's gradient with respect to the action is vanishing. I can't, however, figure out why. Maybe my use of the Concatenate layer is wrong? (I put a small diagnostic sketch after the code below.)
class AIInterface(object):

    def __init__(self, sim):
        self.sim = sim
        self.pedal_pos = 0
        self.steering_pos = 0
        self.sess = tf.Session()
        self.learning_rate = 10e-4
        self.BATCH_SIZE = 64
        self.epsilon = .75          # amount of random exploration
        self.epsilon_decay = .997
        self.gamma = .99            # reward discount factor
        self.tau = .00125           # target update factor
        self.rewards = deque(maxlen=100000)
        self.memory = deque(maxlen=100000)

        # Actor stuff
        self.actor_model, self.actor_var_in = self.initialize_actor()
        self.target_actor, _ = self.initialize_actor()
        self.actor_critic_grad = tf.placeholder(tf.float32, [None, 1])
        self.actor_model_weights = self.actor_model.trainable_weights
        with tf.name_scope("actor_gradients"):
            self.actor_grads = tf.gradients(self.actor_model.output, self.actor_model_weights, -self.actor_critic_grad)
        self.normalized_actor_grads = list(map(lambda x: tf.div(x, self.BATCH_SIZE), self.actor_grads))
        grads = zip(self.normalized_actor_grads, self.actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

        # Critic stuff
        self.critic_model, self.critic_var_in, self.critic_action_in = self.initialize_critic()
        self.target_critic, _, _ = self.initialize_critic()
        with tf.name_scope("CriticGrads"):
            self.critic_grads = tf.gradients(self.critic_model.output, self.critic_action_in)
        self.sess.run(tf.global_variables_initializer())
        self.target_actor.set_weights(self.actor_model.get_weights())
        self.target_critic.set_weights(self.critic_model.get_weights())
        self.global_step = 0

    def initialize_actor(self):
        state_variable_input = Input(shape=(3,))
        init = TruncatedNormal(mean=0.0, stddev=0.02)
        dense = Dense(128, activation="relu", kernel_initializer=init)(state_variable_input)
        dense2 = Dense(128, activation="relu", kernel_initializer=init)(dense)
        output = Dense(1, activation="tanh", kernel_initializer=RandomUniform(-3e-3, 3e-3))(dense2)
        model = Model(inputs=state_variable_input, outputs=output)
        model.compile(optimizer="adam", loss="mse")
        return model, state_variable_input

    def initialize_critic(self):
        state_variable_input = Input(shape=(3,))
        action_input = Input(shape=(1,))
        init = TruncatedNormal(mean=0.0, stddev=0.02)
        dense_state = Dense(128, activation="relu", kernel_initializer=init)(state_variable_input)
        merge = Concatenate()([dense_state, action_input])
        dense2 = Dense(128, activation="relu", kernel_initializer=init)(merge)
        output = Dense(1, activation="linear", kernel_initializer=RandomUniform(-3e-3, 3e-3))(dense2)
        model = Model(inputs=[state_variable_input, action_input], outputs=output)
        model.compile(optimizer="adam", loss="mse")
        return model, state_variable_input, action_input

    def train(self):
        if len(self.memory) < self.BATCH_SIZE:
            return
        samples = random.sample(self.memory, self.BATCH_SIZE)
        samples = [np.concatenate(x) for x in zip(*samples)]
        self.train_critic(samples)
        self.train_actor(samples)
        self.global_step += 1

    def train_critic(self, samples):
        cur_state_var, action, reward, new_state_var = samples
        predicted_action = self.target_actor.predict([new_state_var])
        future_reward = self.target_critic.predict([new_state_var, predicted_action])
        Q = reward + self.gamma * future_reward
        self.critic_model.train_on_batch([cur_state_var, action], Q)

    def train_actor(self, samples):
        cur_state_var, action, reward, new_state_var = samples
        predicted_action = self.actor_model.predict([cur_state_var])
        grads = self.sess.run([self.critic_grads], feed_dict={
            self.critic_var_in: cur_state_var,
            self.critic_action_in: predicted_action})
        self.sess.run(self.optimize, feed_dict={
            self.actor_var_in: cur_state_var,
            self.actor_critic_grad: grads[0]
        })

    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        actor_target_weights = self.target_actor.get_weights()
        for i in range(len(actor_target_weights)):
            actor_target_weights[i] = self.tau * actor_model_weights[i] + (1 - self.tau) * actor_target_weights[i]
        self.target_actor.set_weights(actor_target_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        critic_target_weights = self.target_critic.get_weights()
        for i in range(len(critic_target_weights)):
            critic_target_weights[i] = self.tau * critic_model_weights[i] + (1 - self.tau) * critic_target_weights[i]
        self.target_critic.set_weights(critic_target_weights)

    def update_model(self):
        self.update_actor_target()
        self.update_critic_target()

    def act(self, cur_state_var, noise=None, env=None):
        if env:
            if np.random.random() < self.epsilon:
                return env.action_space.sample()
            else:
                sh = cur_state_var.shape
                action = self.actor_model.predict([cur_state_var], batch_size=1)[0]
                return action
        elif not noise:
            if np.random.random() < self.epsilon:
                return self.sample_action_space()
            return self.actor_model.predict([cur_state_var], batch_size=1)[0]
        else:
            no = noise()
            pred = self.actor_model.predict([cur_state_var], batch_size=1)[0]
            return pred + no

    def sample_action_space(self):
        return np.array([random.uniform(-0.5, 0.5), random.uniform(-1.0, 1.0)]).reshape(2,)

    def remember(self, cur_state_var, action, reward, new_state_var):
        self.memory.append([cur_state_var, action, reward, new_state_var])
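To check the vanishing-gradient suspicion concretely, here is the kind of diagnostic I would add to the class (a sketch that only uses attributes already defined in __init__ above; samples is a batch in the same format that train() produces):

# inside class AIInterface:
def debug_critic_action_grad(self, samples):
    # Print the norm of dQ/da for one batch; if this is near zero, the
    # critic's gradient with respect to the action really is vanishing.
    cur_state_var, action, reward, new_state_var = samples
    grads = self.sess.run(self.critic_grads, feed_dict={
        self.critic_var_in: cur_state_var,
        self.critic_action_in: action})
    print("||dQ/da||:", np.linalg.norm(grads[0]))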