Related
The following training curve is generated using the same Tensorflow + Keras script written in Python:
RED line uses five features.
GREEN line uses seven features.
BLUE line uses nine features.
Can anyone tell me the probable cause of the oscillation of the GREEN line so that I can troubleshoot my script?
Source code:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Use both gpus for training.
import sys, random
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
from lxml import etree, objectify
# <editor-fold desc="GPU">
# resolve GPU related issues.
try:
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_instance in physical_devices:
tf.config.experimental.set_memory_growth(gpu_instance, True)
except Exception as e:
pass
# END of try
# </editor-fold>
# <editor-fold desc="Lxml helper">
class LxmlHelper:
#classmethod
def objectify_xml(cls, input_path_dir):
file_dom = etree.parse(input_path_dir) # parse xml and convert it into DOM
file_xml_bin = etree.tostring(file_dom, pretty_print=False, encoding="ascii") # encode DOM into ASCII object
file_xml_text = file_xml_bin.decode() # convert binary ASCII object into ASCII text
objectified_xml = objectify.fromstring(file_xml_text) # convert text into a Doxygen object
return objectified_xml
# </editor-fold>
# <editor-fold desc="def encode(letter)">
def encode(letter: str):
if letter == 'H':
return [1.0, 0.0, 0.0]
elif letter == 'E':
return [0.0, 1.0, 0.0]
elif letter == 'C':
return [0.0, 0.0, 1.0]
elif letter == '-':
return [0.0, 0.0, 0.0]
# END of function
def encode_string_1(pattern_str: str):
# Iterate over the string
one_hot_binary_str = []
for ch in pattern_str:
try:
one_hot_binary_str = one_hot_binary_str + encode(ch)
except Exception as e:
print(pattern_str, one_hot_binary_str, ch)
# END of for loop
return one_hot_binary_str
# END of function
def encode_string_2(pattern_str: str):
# Iterate over the string
one_hot_binary_str = []
for ch in pattern_str:
temp_encoded_vect = [encode(ch)]
one_hot_binary_str = one_hot_binary_str + temp_encoded_vect
# END of for loop
return one_hot_binary_str
# END of function
# </editor-fold>
# <editor-fold desc="def load_data()">
def load_data_k(fname: str, class_index: int, feature_start_index: int, **selection):
"""Loads data for training and validation
:param fname: (``string``) - name of the file with the data
:param selection: (``kwargs``) - see below
:return: four tensorflow tensors: training input, training output, validation input and validation output
:Keyword Arguments:
* *top_n_lines* (``number``) --
take top N lines of the input and disregard the rest
* *random_n_lines* (``number``) --
take random N lines of the input and disregard the rest
* *validation_part* (``float``) --
separate N_lines * given_fraction of the input lines from the training set and use
them for validation. When the given_fraction = 1.0, then the same input set of
N_lines is used both for training and validation (this is the default)
"""
i = 0
file = open(fname)
if "top_n_lines" in selection:
lines = [next(file) for _ in range(int(selection["top_n_lines"]))]
elif "random_n_lines" in selection:
tmp_lines = file.readlines()
lines = random.sample(tmp_lines, int(selection["random_n_lines"]))
else:
lines = file.readlines()
data_x, data_y, data_z = [], [], []
for l in lines:
row = l.strip().split() # return a list of words from the line.
x = [float(ix) for ix in row[feature_start_index:]] # convert 3rd to 20th word into a vector of float numbers.
y = encode(row[class_index]) # convert the 3rd word into binary.
z = encode_string_1(row[class_index+1])
data_x.append(x) # append the vector into 'data_x'
data_y.append(y) # append the vector into 'data_y'
data_z.append(z) # append the vector into 'data_z'
# END for l in lines
num_rows = len(data_x)
given_fraction = selection.get("validation_part", 1.0)
if given_fraction > 0.9999:
valid_x, valid_y, valid_z = data_x, data_y, data_z
else:
n = int(num_rows * given_fraction)
data_x, data_y, data_z = data_x[n:], data_y[n:], data_z[n:]
valid_x, valid_y, valid_z = data_x[:n], data_y[:n], data_z[:n]
# END of if-else block
tx = tf.convert_to_tensor(data_x, np.float32)
ty = tf.convert_to_tensor(data_y, np.float32)
tz = tf.convert_to_tensor(data_z, np.float32)
vx = tf.convert_to_tensor(valid_x, np.float32)
vy = tf.convert_to_tensor(valid_y, np.float32)
vz = tf.convert_to_tensor(valid_z, np.float32)
return tx, ty, tz, vx, vy, vz
# END of the function
# </editor-fold>
# <editor-fold desc="def create_model()">
def create_model(n_hidden_1, n_hidden_2, num_classes, num_features):
# create the model
model = Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(num_features,)))
model.add(tf.keras.layers.Dense(n_hidden_1, activation='sigmoid'))
model.add(tf.keras.layers.Dense(n_hidden_2, activation='sigmoid'))
###model.add(tf.keras.layers.Dense(n_hidden_3, activation='sigmoid'))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
# instantiate the optimizer
opt = keras.optimizers.SGD(learning_rate=LEARNING_RATE)
# compile the model
model.compile(
optimizer=opt,
loss="categorical_crossentropy",
metrics="categorical_accuracy"
)
# return model
return model
# </editor-fold>
if __name__ == "__main__":
# <editor-fold desc="(input/output parameters)">
my_project_routine = LxmlHelper.objectify_xml("my_project_evaluate.xml")
# input data
INPUT_DATA_FILE = str(my_project_routine.input.input_data_file)
INPUT_PATH = str(my_project_routine.input.input_path)
CLASS_INDEX = int(my_project_routine.input.class_index)
FEATURE_INDEX = int(my_project_routine.input.feature_index)
# output data
OUTPUT_PATH = str(my_project_routine.output.output_path)
MODEL_FILE = str(my_project_routine.output.model_file)
TRAINING_PROGRESS_FILE = str(my_project_routine.output.training_progress_file)
# Learning parameters
LEARNING_RATE = float(my_project_routine.training_params.learning_rate)
EPOCH_SIZE = int(my_project_routine.training_params.epoch_size)
BATCH_SIZE = int(my_project_routine.training_params.batch_size)
INPUT_LINES_COUNT = int(my_project_routine.input.input_lines_count)
VALIDATION_PART = float(my_project_routine.training_params.validation_part)
SAVE_PERIOD = str(my_project_routine.output.save_period)
# NN parameters
HIDDEN_LAYER_1_NEURON_COUNT = int(my_project_routine.hidden_layers.one)
HIDDEN_LAYER_2_NEURON_COUNT = int(my_project_routine.hidden_layers.two)
###HIDDEN_LAYER_3_NEURON_COUNT = int(my_project_routine.hidden_layers.three)
CLASS_COUNT = int(my_project_routine.class_count)
FEATURES_COUNT = int(my_project_routine.features_count)
input_file_path_str = os.path.join(INPUT_PATH, INPUT_DATA_FILE)
training_progress_file_path_str = os.path.join(OUTPUT_PATH, TRAINING_PROGRESS_FILE)
model_file_path = os.path.join(OUTPUT_PATH, MODEL_FILE)
# command-line arg processing
input_file_name_str = None
if len(sys.argv) > 1:
input_file_name_str = sys.argv[1]
else:
input_file_name_str = input_file_path_str
# END of if-else
# </editor-fold>
# <editor-fold desc="(load data from file)">
# load training data from the disk
train_x, train_y, _, validate_x, validate_y, _ = \
load_data_k(
fname=input_file_name_str,
class_index=CLASS_INDEX,
feature_start_index=FEATURE_INDEX,
random_n_lines=INPUT_LINES_COUNT,
validation_part=VALIDATION_PART
)
print("training data size : ", len(train_x))
print("validation data size : ", len(validate_x))
# </editor-fold>
### STEPS_PER_EPOCH = len(train_x) // BATCH_SIZE
### VALIDATION_STEPS = len(validate_x) // BATCH_SIZE
# <editor-fold desc="(model creation)">
# load previously saved NN model
model = None
try:
model = keras.models.load_model(model_file_path)
print("Loading NN model from file.")
model.summary()
except Exception as ex:
print("No NN model found for loading.")
# END of try-except
# </editor-fold>
# <editor-fold desc="(model run)">
# # if there is no model loaded, create a new model
if model is None:
csv_logger = keras.callbacks.CSVLogger(training_progress_file_path_str)
checkpoint = ModelCheckpoint(
model_file_path,
monitor='loss',
verbose=1,
save_best_only=True,
mode='auto',
save_freq='epoch'
)
callbacks_vector = [
csv_logger,
checkpoint
]
# Set mirror strategy
#strategy = tf.distribute.MirroredStrategy(devices=["/device:GPU:0","/device:GPU:1"])
#with strategy.scope():
print("New NN model created.")
# create sequential NN model
model = create_model(
n_hidden_1=HIDDEN_LAYER_1_NEURON_COUNT,
n_hidden_2=HIDDEN_LAYER_2_NEURON_COUNT,
##n_hidden_3=HIDDEN_LAYER_3_NEURON_COUNT,
num_classes=CLASS_COUNT,
num_features=FEATURES_COUNT
)
# Train the model with the new callback
history = model.fit(
train_x, train_y,
validation_data=(validate_x, validate_y),
batch_size=BATCH_SIZE,
epochs=EPOCH_SIZE,
callbacks=[callbacks_vector],
shuffle=True,
verbose=2
)
print(history.history.keys())
# END of ... with
# END of ... if
# </editor-fold>
Plotting Script
import os
from argparse import ArgumentParser
import random
from typing import List
import matplotlib.pyplot as plt
import numpy as np
import math
import sys
import datetime
class Quad:
def __init__(self, x_vector, y_vector, color_char, label_str):
self.__x_vector = x_vector
self.__y_vector = y_vector
self.__color_char = color_char
self.__label_str = label_str
def get_x_vector(self):
return self.__x_vector
def get_y_vector(self):
return self.__y_vector
def get_color_char(self):
return self.__color_char
def get_label_str(self):
return self.__label_str
class HecaPlotClass:
def __init__(self):
self.__x_label_str: str = None
self.__y_label_str: str = None
self.__title_str: str = None
self.__trio_vector: List[Quad] = []
self.__plotter = plt
#property
def x_label_str(self):
return self.__x_label_str
#x_label_str.setter
def x_label_str(self, t):
self.__x_label_str = t
#property
def y_label_str(self):
return self.__y_label_str
#y_label_str.setter
def y_label_str(self, t):
self.__y_label_str = t
#property
def title_str(self):
return self.__title_str
#title_str.setter
def title_str(self, t):
self.__title_str = t
def add_y_axes(self, trio_obj: Quad):
self.__trio_vector.append(trio_obj)
def generate_plot(self):
for obj in self.__trio_vector:
x_vector = obj.get_x_vector()
y_vector = obj.get_y_vector()
label_str = obj.get_label_str()
# print(label_str)
# print(len(x_vector))
# print(len(y_vector))
self.__plotter.plot(
x_vector,
y_vector,
color=obj.get_color_char(),
label=label_str
)
# END of ... for loop
# Naming the x-axis, y_1_vector-axis and the whole graph
self.__plotter.xlabel(self.__x_label_str)
self.__plotter.ylabel(self.__y_label_str)
self.__plotter.title(self.__title_str)
# Adding legend, which helps us recognize the curve according to it's color
self.__plotter.legend()
# To load the display window
#self.__plotter.show()
def save_png(self, output_directory_str):
output_file_str = os.path.join(output_directory_str, self.__title_str + '.png')
self.__plotter.savefig(output_file_str)
def save_pdf(self, output_directory_str):
output_file_str = os.path.join(output_directory_str, self.__title_str + '.pdf')
self.__plotter.savefig(output_file_str)
class MainClass(object):
__colors_vector = ['red', 'green', 'blue', 'cyan', 'magenta', 'yellow', 'orange', 'lightgreen', 'crimson']
__working_dir = r"."
__file_names_vector = ["training_progress-32.txt", "training_progress-64.txt", "training_progress-128.txt"]
__input_files_vector = []
__output_directory = None
__column_no_int = 0
__split_percentage_at_tail_int = 100
__is_pdf_output = False
__is_png_output = False
# <editor-fold desc="def load_data()">
#classmethod
def __load_data(cls, fname: str, percetage_int:int, column_no_int:int):
np_array = np.loadtxt(
fname,
# usecols=range(1,11),
dtype=np.float32,
skiprows=1,
delimiter=","
)
size_vector = np_array.shape
array_len_int = size_vector[0]
rows_count_int = int(percetage_int * array_len_int / 100)
np_array = np_array[-rows_count_int:]
x = np_array[:, 0]
y = np_array[:, column_no_int]
return x, y
# END of the function
# </editor-fold>
# <editor-fold desc="(__parse_args())">
#classmethod
def __parse_args(cls):
# initialize argument parser
my_parser = ArgumentParser()
my_parser.add_argument("-c", help="column no.", type=int)
my_parser.add_argument('-i', nargs='+', help='a list of input files', required=True)
my_parser.add_argument("-o", help="output directory", type=str)
my_parser.add_argument("-n", help="percentage of data to split from tail", type=float)
my_parser.add_argument("--pdf", help="PDF output", action='store_true')
my_parser.add_argument("--png", help="PNG output", action='store_true')
# parse the argument
args = my_parser.parse_args()
cls.__input_files_vector = args.i
cls.__output_directory = args.o
cls.__split_percentage_at_tail_int = args.n
cls.__column_no_int = args.c
cls.__is_pdf_output = args.pdf
cls.__is_png_output = args.png
# </editor-fold>
#classmethod
def main(cls):
cls.__parse_args()
if cls.__input_files_vector is None:
cls.__input_files_vector = cls.__file_names_vector
if cls.__output_directory is None:
cls.__output_directory = cls.__working_dir
if cls.__split_percentage_at_tail_int is None:
cls.__split_percentage_at_tail_int = 100
if cls.__column_no_int is None:
cls.__column_no_int = 1
my_project_plot_obj = HecaPlotClass()
i = 0
for file_path_str in cls.__input_files_vector:
print(file_path_str)
x_vector, y_vector = cls.__load_data(os.path.join(cls.__working_dir, file_path_str), cls.__split_percentage_at_tail_int, cls.__column_no_int)
my_project_plot_obj.x_label_str = "Epoch"
my_project_plot_obj.y_label_str = "Accuracy"
my_project_plot_obj.title_str = "training_plot-{date:%Y-%m-%d_%H:%M:%S}".format(date=datetime.datetime.now())
my_project_plot_obj.x_axis_vector = x_vector
if i == 0:
random_int = 0
else:
random_int = i % (len(cls.__colors_vector)-1)
# END of ... if
print("random_int : ", random_int)
my_project_plot_obj.add_y_axes(Quad(x_vector, y_vector, cls.__colors_vector[random_int], file_path_str))
i = i + 1
# END of ... for loop
my_project_plot_obj.generate_plot()
my_project_plot_obj.save_png(cls.__output_directory)
my_project_plot_obj.save_pdf(cls.__output_directory)
if __name__ == "__main__":
MainClass.main()
The primary reason could be improper (non-random ~ ordered) distribution of data.
If you notice the accuracy beyond epoch 180, there is a orderly switching between the accuracy between ~0.43 (approx.) and ~0.33 (~approx.), and occasionally ~0.23 (approx.). The more important thing to notice is that the accuracy is decreasing (there's no improvement in validation accuracy) as we increase the epochs.
The accuracy can increase in such cases if you (1) reduce batch size, or (2) use a better optimizer like Adam. And check the learning rate.
These changes can help the shift and oscillation, as well.
Additionally, Running average of the accuracy can be plotted to avoid the oscillation. This is again a mitigation scheme rather than a correction scheme. But, what it does is removes the order (partition of the data) and mixes the nearby data.
Lastly, I would also reshuffle the data and normalize after each layer. See if that helps.
Generally, sharp jumps and flat lines in the accuracy usually mean that a group of examples is classified as a given class at a same time. If your dataset contains, say, 50 examples with the same combination of 7 features then they would go into the same class at the same time. This is what probably causes sharp jumps - identical or similar examples clustered together.
So for example, if you have 50 men aged 64, and a decision boundary to classify them as more prone to an illness shifts from >65 to >63, then accuracy changes rapidly as all of them change classification at the same time.
Regarding the oscillation of the curve - due to the fact above, oscillation will be amplified by small changes in learning. Your network learns based on cross entropy, which means that it minimizes the difference between target and your predictions. This means that it operates on the difference between probability and target (say, 0.3 vs class 0) instead of class and target like accuracy (so, 0 vs 0) in the same example. Cross entropy is much more smooth as it is not affected by the issue outlined above.
I'm trying to solve the 'BipedalWalker-v2' problem from Open AI, by using python and Tensorflow. In order to solve it I'm implementing an episodic policy gradient algorithms. Because the 'BipedalWalker-v2' actions are continuous my policy is approximated by a multivariate Gaussian distribution. The mean of this distribution is approximated using a fully connected neural network. My neural network has the following layers: [input:24,hidden:5,hidden:5,output:4]. My problem is that when I train the agent, the training process gets slower and slower until it almost freeze. My guess is that I'm misusing sess.run, I'm not feeding the batches in an efficient way. But is just a guess. My question is: Is my guess correct? if it is correct, how can I improve it? and if it is something else, what it is? I'm not looking for a literal solution I just want to get some lights about how to improve the training.
Thanks in advance,
my computer is a Inspiron 15 7000 Gaming, GeForce nvidia gtx 1050, 8 gb ram,cpu: I5
My CODE:
Libraries:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
Agent class:
class agent_episodic_continuous_action():
def __init__(self, lr, s_size,a_size,batch_size,dist_type):
self.stuck = False
self.gamma = 0.99
self.dist_type = dist_type
self.is_brain_present = False
self.s_size = s_size
self.batch_size=batch_size
self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
self.a_size=a_size
self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
self.cov = tf.eye(a_size)
self.reduction = 0.01
if a_size > 1:
self.action_holder = tf.placeholder(shape=[None,a_size],dtype=tf.float32)
else:
self.action_holder = tf.placeholder(shape=[None],dtype=tf.float32)
self.gradient_holders = []
self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
def save_model(self,path,sess):
self.saver.save(sess, path)
def load_model(self,path,sess):
self.saver.restore(sess, path)
def create_brain(self,hidd_layer,hidd_layer_act_fn,output_act_fn):
self.is_brain_present = True
hidden_output=slim.stack(self.state_in,slim.fully_connected,hidd_layer,activation_fn=hidd_layer_act_fn)
self.output = slim.fully_connected(hidden_output,self.a_size,activation_fn=output_act_fn,biases_initializer=None)
def create_pi_dist(self):
if self.dist_type == "normal":
# amplify= tf.pow(slim.fully_connected(self.output,1,activation_fn=None,biases_initializer=None),2)
mean= self.output
#cov =tf.eye(self.a_size,batch_shape=[self.batch_size])*amplify
normal = tf.contrib.distributions.MultivariateNormalFullCovariance(
loc=mean,
covariance_matrix=self.cov*self.reduction)
self.dist = normal
def create_loss(self):
self.loss = -tf.reduce_mean(tf.log(self.dist.prob(self.action_holder))*self.reward_holder)
def get_gradients_holder(self):
for idx,var in enumerate(self.tvars):
placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
self.gradient_holders.append(placeholder)
def sample_action(self,sess,state):
sample_action= sess.run(self.dist.sample(),feed_dict={self.state_in:state})
return sample_action
def calculate_loss_gradient(self):
self.gradients = tf.gradients(self.loss,self.tvars)
def update_weights(self):
self.update_batch = self.optimizer.apply_gradients(zip(self.gradients,self.tvars))
return self.update_batch
def memorize_data(self,episode,first):
if first:
self.episode_history = episode
self.stuck = False
else:
self.episode_history = np.vstack((self.episode_history,episode))
def shuffle_memories(self):
np.random.shuffle(self.episode_history)
def create_graph_connections(self):
if self.is_brain_present:
self.create_pi_dist()
self.create_loss()
self.tvars = tf.trainable_variables()
self.calculate_loss_gradient()
self.saver = tf.train.Saver()
self.update_weights()
else:
print("initialize brain first")
self.init = tf.global_variables_initializer()
def memory_batch_generator(self):
total=self.episode_history.shape[0]
amount_of_batches= int(total/self.batch_size)
for i in range(amount_of_batches+1):
if i < amount_of_batches:
top=(i+1)*self.batch_size
bottom =i*self.batch_size
yield (self.episode_history[bottom:top,0:self.s_size],self.episode_history[bottom:top,self.s_size:self.s_size+self.a_size],self.episode_history[bottom:top,self.s_size+self.a_size:self.s_size+self.a_size+1],self.episode_history[bottom:top,self.s_size+self.a_size+1:])
else:
yield (self.episode_history[top:,0:self.s_size],self.episode_history[top:,self.s_size:self.s_size+self.a_size],self.episode_history[top:,self.s_size+self.a_size:self.s_size+self.a_size+1],self.episode_history[top:,self.s_size+self.a_size+1:])
def train_with_current_memories(self,sess):
self.sess = sess
for step_sample_batch in self.memory_batch_generator():
sess.run(self.update_weights(), feed_dict={self.state_in:step_sample_batch[0],self.action_holder:step_sample_batch[1],self.reward_holder:step_sample_batch[2].reshape([step_sample_batch[2].shape[0]])})
def get_returns(self):
self.episode_history[:,self.s_size+self.a_size:self.s_size+self.a_size+1] = self.discount_rewards(self.episode_history[:,self.s_size+self.a_size:self.s_size+self.a_size+1])
def discount_rewards(self,r):
""" take 1D float array of rewards and compute discounted reward """
discounted_r = np.zeros_like(r)
running_add = 0
for t in reversed(range(0, r.size)):
running_add = running_add * self.gamma + r[t]
discounted_r[t] = running_add
return discounted_r
def prob_action(self,sess,action,state):
prob = sess.run(self.dist.prob(action),feed_dict={self.state_in:state})
return prob
def check_movement(self):
ep_back = 5
jump = 3
threshold = 3
if len(self.episode_history) > ep_back*2:
difference = sum(abs(self.episode_history[-ep_back:-1,:]-self.episode_history[-ep_back-jump:-1-jump,:]).flatten())
print(difference)
if difference < threshold:
self.stuck = True
def print_last_n_returns(self,n):
if len(self.episode_history[:,self.s_size+self.a_size:self.s_size+self.a_size+1])>n:
n_returns = sum(self.episode_history[-n:,self.s_size+self.a_size:self.s_size+self.a_size+1])/float(n)
print(n_returns)
return n_returns
Training loops:
tf.reset_default_graph()
agent_2= agent_episodic_continuous_action(1e-2,s_size=24,a_size=4,batch_size=30,dist_type="normal")
agent_2.create_brain([5,5],tf.nn.relu,None)
agent_2.create_graph_connections()
env = gym.make('BipedalWalker-v2')
with tf.Session() as sess:
sess.run(agent_2.init)
for i in range(200):
s = env.reset()
d = False
a=agent_2.sample_action(sess,[s])[0]
print(a)
if None in a:
print("None in a! inside for")
print(s)
s1,r,d,_ = env.step(a)
episode = np.hstack((s,a,r,s1))
agent_2.memorize_data(episode=episode,first=True)
count = 0
while not d:
count = count + 1
s = s1
a=agent_2.sample_action(sess,[s])[0]
s1,r,d,_ = env.step(a)
episode = np.hstack((s,a,r,s1))
# env.render()
agent_2.memorize_data(episode=episode,first=False)
# print(s1)
if count % 5 == 0 :
agent_2.check_movement()
if agent_2.stuck:
d = True
agent_2.get_returns()
agent_2.print_last_n_returns(20)
agent_2.shuffle_memories()
agent_2.train_with_current_memories(sess)
env.close()
For each batch of 30 samples I execute Agent.update_weights()
def update_weights(self):
self.update_batch = self.optimizer.apply_gradients(zip(self.gradients,self.tvars))
When I execute:
def train_with_current_memories(self,sess):
self.sess = sess
for step_sample_batch in self.memory_batch_generator():
sess.run(self.update_weights(), feed_dict={self.state_in:step_sample_batch[0],self.action_holder:step_sample_batch[1],self.reward_holder:step_sample_batch[2].reshape([step_sample_batch[2].shape[0]])})
Or maybe this sluggishness is an expected behavior.
The code was slowing down after each iteration because the graph was getting bigger at each iteration. This is because I was creating new graph elements inside the iteration loop.
during each iteration the following function was being called:
def update_weights(self):
self.update_batch = self.optimizer.apply_gradients(zip(self.gradients,self.tvars))
return self.update_batch
This function was creating a new element to the graph.
The best way to avoid "graph leaking" is to add the line
sess.graph.finalize()
as soon as you create your session. In this way, if there is a graph leaking, Tensorflow will raise an exception.
I have been using TensorFlow for a reasonable length of time now. and believed I had a thorough understanding of how a TensorFlow graph works and executes within a session. However, I have written all of my TensorFlow models in a script-like fashion as such:
import tensorflow as tf
import DataWorker
import Constants
x = tf.placeholder(tf.float32, [None, Constants.sequenceLength, DataWorker.numFeatures])
y = tf.placeholder(tf.float32, [None, 1])
xTensors = tf.unstack(x, axis=1) # [seqLength tensors of shape (batchSize, numFeatures)]
W = tf.Variable(tf.random_normal([Constants.numHidden, 1])) # Weighted matrix
b = tf.Variable(tf.random_normal([1])) # Bias
cell = tf.contrib.rnn.BasicLSTMCell(Constants.numHidden, forget_bias=Constants.forgetBias)
outputs, finalState = tf.nn.static_rnn(cell, xTensors, dtype=tf.float32)
# predictions = [tf.add(tf.matmul(output, W), b) for output in outputs] # List of predictions after each time step
prediction = tf.add(tf.matmul(outputs[-1], W), b) # Prediction after final time step
prediction = tf.tanh(prediction) # Activation
mse = tf.losses.mean_squared_error(predictions=prediction, labels=y) # Mean loss over entire batch
accuracy = tf.reduce_mean(1 - (tf.abs(y - prediction) / DataWorker.labelRange)) # Accuracy over entire batch
optimiser = tf.train.AdamOptimizer(Constants.learningRate).minimize(mse) # Backpropagation
with tf.Session() as session:
session.run(tf.global_variables_initializer())
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
dict = {x: batchX, y: batchY}
session.run(optimiser, dict)
if batchNum % 1000 == 0 or epochComplete:
batchLoss = session.run(mse, dict)
batchAccuracy = session.run(accuracy, dict)
print("Iteration:", batchNum)
print(batchLoss)
print(str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
testAccuracy = session.run(accuracy, {x: testX, y: testY})
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
But now, for practicality and readability, I want to implement my model as a class, but have encountered many problems with initializing my variables, etc.
This is the closest I have got to implementing the above example using my own LSTM class
Model.py
import tensorflow as tf
import Constants
import DataWorker # Remove this dependency
class LSTM():
"""docstring."""
def __init__(self,
inputDimensionList,
outputDimensionList,
numLayers=Constants.numLayers,
numHidden=Constants.numHidden,
learningRate=Constants.learningRate,
forgetBias=Constants.forgetBias
):
"""docstring."""
self.batchInputs = tf.placeholder(tf.float32, [None] + inputDimensionList)
self.batchLabels = tf.placeholder(tf.float32, [None] + outputDimensionList)
self.weightedMatrix = tf.Variable(tf.random_normal([numHidden] + outputDimensionList))
self.biasMatrix = tf.Variable(tf.random_normal(outputDimensionList))
self.cell = tf.contrib.rnn.BasicLSTMCell(numHidden, forget_bias=forgetBias)
self.numLayers = numLayers
self.numHidden = numHidden
self.learningRate = learningRate
self.forgetBias = forgetBias
self.batchDict = {}
self.batchInputTensors = None
self.batchOutputs = None # All needed as instance variables?
self.batchFinalStates = None
self.batchPredictions = None
self.batchLoss = None
self.batchAccuracy = None
self.initialised = False
self.session = tf.Session()
# Take in activation, loss and optimiser FUNCTIONS as args
def execute(self, command):
"""docstring."""
return self.session.run(command, self.batchDict)
def setBatchDict(self, inputs, labels):
"""docstring."""
self.batchDict = {self.batchInputs: inputs, self.batchLabels: labels}
self.batchInputTensors = tf.unstack(self.batchInputs, axis=1)
def processBatch(self):
"""docstring."""
self.batchOutputs, self.batchFinalState = tf.nn.static_rnn(self.cell, self.batchInputTensors, dtype=tf.float32)
pred = tf.tanh(tf.add(tf.matmul(self.batchOutputs[-1], self.weightedMatrix), self.biasMatrix))
mse = tf.losses.mean_squared_error(predictions=pred, labels=self.batchLabels)
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
if not self.initialised:
self.session.run(tf.global_variables_initializer())
self.initialised = True
with tf.variable_scope("model") as scope:
if self.initialised:
scope.reuse_variables()
self.execute(optimiser)
self.batchPredictions = self.execute(pred)
self.batchLoss = self.execute(tf.losses.mean_squared_error(predictions=self.batchPredictions, labels=self.batchLabels))
self.batchAccuracy = self.execute(tf.reduce_mean(1 - (tf.abs(self.batchLabels - self.batchPredictions) / DataWorker.labelRange)))
return self.batchPredictions, self.batchLabels, self.batchLoss, self.batchAccuracy
def kill(self):
"""docstring."""
self.session.close()
This class is quite messy, especially processBatch() as I have just been trying to get it to work before refining it.
I then run my model here:
Main.py
import DataWorker
import Constants
from Model import LSTM
inputDim = [Constants.sequenceLength, DataWorker.numFeatures]
outputDim = [1]
lstm = LSTM(inputDimensionList=inputDim, outputDimensionList=outputDim)
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
lstm.setBatchDict(batchX, batchY)
batchPredictions, batchLabels, batchLoss, batchAccuracy = lstm.runBatch()
if batchNum % 1000 == 0 or epochComplete:
print("Iteration:", batchNum)
print("Pred:", batchPredictions[-1], "\tLabel:", batchLabels[-1])
print("Loss:", batchLoss)
print("Accuracy:", str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
lstm.setBatchDict(testX, testY)
_, _, _, testAccuracy = lstm.runBatch()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
lstm.kill()
A single passthrough of the graph is executed fine, when all the variables are initialized, but it is on the second iteration where I get the error
ValueError: Variable rnn/basic_lstm_cell/kernel/Adam/ already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
I Googled this problem and learned that using scope.reuse_variables() should stop it trying to initialize the AdamOptimizer a second time, but cleary this isn't working how I have implemented it. How can I fix this issue?
As a side note, is my method of creating the TensorFlow session as an instance variable within my LSTM class acceptable, or should I create the session in Main and then pass it into the LSTM instance?
In general I wrap anything that creates variables under the hood with tf.make_template when doing object oriented model building.
However, you should avoid adding ops to the graph in a training loop, which looks like it's happening here. They will build up and cause problems, and likely give you incorrect results. Instead, define the graph (with inputs from tf.data, placeholders, or queues) and only loop over a session.run call. Even better, structure your code as an Estimator and this will be enforced.
This is part of my current python code for NN training in python using CNTK module
batch_axis = C.Axis.default_batch_axis()
input_seq_axis = C.Axis.default_dynamic_axis()
input_dynamic_axes = [batch_axis, input_seq_axis]
input_dynamic_axes2 = [batch_axis, input_seq_axis]
input = C.input_variable(n_ins, dynamic_axes=input_dynamic_axes, dtype=numpy.float32)
output = C.input_variable(n_outs, dynamic_axes=input_dynamic_axes2, dtype=numpy.float32)
dnn_model = cntk_model.create_model(input, hidden_layer_type, hidden_layer_size, n_outs)
loss = C.squared_error(dnn_model, output)
error = C.squared_error(dnn_model, output)
lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch)
momentum_schedule = C.momentum_schedule(current_momentum)
learner = C.adam(dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain = False, l1_regularization_weight=l1_reg, l2_regularization_weight= l2_reg)
trainer = C.Trainer(dnn_model, (loss, error), [learner])
And here is code for creating NN model
def create_model(features, hidden_layer_type, hidden_layer_size, n_out):
logger.debug('Creating cntk model')
assert len(hidden_layer_size) == len(hidden_layer_type)
n_layers = len(hidden_layer_size)
my_layers = list()
for i in xrange(n_layers):
if(hidden_layer_type[i] == 'TANH'):
my_layers.append(C.layers.Dense(hidden_layer_size[i], activation=C.tanh, init=C.layers.glorot_uniform()))
elif (hidden_layer_type[i] == 'LSTM'):
my_layers.append(C.layers.Recurrence(C.layers.LSTM(hidden_layer_size[i])))
else:
raise Exception('Unknown hidden layer type')
my_layers.append(C.layers.Dense(n_out, activation=None))
my_model = C.layers.Sequential([my_layers])
my_model = my_model(features)
return my_model
Now, I would like to change a backpropagation, so when the error is calculated not direct network output is used, but the output after some additional calculation. I tried to define something like this
def create_error_function(self, prediction, target):
prediction_denorm = C.element_times(prediction, self.std_vector)
prediction_denorm = C.plus(prediction_denorm, self.mean_vector)
prediction_denorm_rounded = C.round(C.element_times(prediction_denorm[0:5], C.round(prediction_denorm[5])))
prediction_denorm_rounded = C.element_divide(prediction_denorm_rounded, C.round(prediction_denorm[5]))
prediction_norm = C.minus(prediction_denorm_rounded, self.mean_vector[0:5])
prediction_norm = C.element_divide(prediction_norm, self.std_vector[0:5])
first = C.squared_error(prediction_norm, target[0:5])
second = C.minus(C.round(prediction_denorm[5]), self.mean_vector[5])
second = C.element_divide(second, self.std_vector[5])
return C.plus(first, C.squared_error(second, target[5]))
and use it instead standard squared_error.
And the part for NN training
dnn_model = cntk_model.create_model(input, hidden_layer_type, hidden_layer_size, n_outs)
error_function = cntk_model.ErrorFunction(cmp_mean_vector, cmp_std_vector)
loss = error_function.create_error_function(dnn_model, output)
error = error_function.create_error_function(dnn_model, output)
lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch)
momentum_schedule = C.momentum_schedule(current_momentum)
learner = C.adam(dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain = False, l1_regularization_weight=l1_reg,
l2_regularization_weight= l2_reg)
trainer = C.Trainer(dnn_model, (loss, error), [learner])
trainer.train_minibatch({input: temp_train_x, output: temp_train_y})
But after two epochs I start gettting always the same average loss, as my network is not learning
Every time you want to change how backprop works, you need to use stop_gradient. This is the only function whose gradient is different from the gradient of the operation of the forward pass. In the forward pass stop_gradient acts as identity. In the backward pass it blocks the gradient from propagating.
To do an operation f(x) on some x in the forward pass and pretend as if it never happened in the backward pass you need to do something like:
C.stop_gradient(f(x) - x) + x. In your case that would be
norm_features = C.stop_gradient(features/normalization - features) + features
I've setup a print statement and I've noticed that for the first batch when feeding an RNN, the embeddings exist, but after the second batch they don't and I get the following error:
ValueError: Variable RNNLM/RNNLM/Embedding/Adam_2/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
Here is my code for generating the embeddings:
def add_embedding(self):
with tf.device('/gpu:0'):
embedding = tf.get_variable("Embedding", [len(self.vocab), self.config.embed_size])
e_x = tf.nn.embedding_lookup(embedding, self.input_placeholder)
inputs = [tf.squeeze(s, [1]) for s in tf.split(1, self.config.num_steps, e_x)]
return inputs
Here is how the model is seutp, this is where I suspect the problem lies
def model(self, inputs):
with tf.variable_scope("input_drop"):
inputs_drop = [tf.nn.dropout(i, self.dropout_placeholder) for i in inputs]
with tf.variable_scope("RNN") as scope:
self.initial_state = tf.zeros([self.config.batch_size, self.config.hidden_size], tf.float32)
state = self.initial_state
states = []
for t, e in enumerate(inputs_drop):
print "t is {0}".format(t)
if t > 0:
scope.reuse_variables()
H = tf.get_variable("Hidden", [self.config.hidden_size, self.config.hidden_size])
I = tf.get_variable("I", [self.config.embed_size, self.config.hidden_size])
b_1 = tf.get_variable("b_1", (self.config.hidden_size,))
state = tf.sigmoid(tf.matmul(state, H) + tf.matmul(e, I) + b_1)
states.append(state)
with tf.variable_scope("output_dropout"):
rnn_outputs = [tf.nn.dropout(o, self.dropout_placeholder) for o in states]
return rnn_outputs
The issue arises when I get to the loss function, defined as follows
def add_training_op(self, loss):
opt = tf.train.AdamOptimizer(self.config.lr)
train_op = opt.minimize(loss)
return train_op
EDIT: Here is some updated code to help everyone out
def __init__(self, config):
self.config = config
self.load_data(debug=False)
self.add_placeholders()
self.inputs = self.add_embedding()
self.rnn_outputs = self.add_model(self.inputs)
self.outputs = self.add_projection(self.rnn_outputs)
self.predictions = [tf.nn.softmax(tf.cast(o, 'float64')) for o in self.outputs]
output = tf.reshape(tf.concat(1, self.outputs), [-1, len(self.vocab)])
self.calculate_loss = self.add_loss_op(output)
self.train_step = self.add_training_op(self.calculate_loss)
Here are the other methods here, pertaining to add_projection and calculate_loss so we can rule them out.
def add_loss_op(self, output):
weights = tf.ones([self.config.batch_size * self.config.num_steps], tf.int32)
seq_loss = tf.python.seq2seq.sequence_loss(
[output],
tf.reshape(self.labels_placeholder, [-1]),
weights
)
tf.add_to_collection('total_loss', seq_loss)
loss = tf.add_n(tf.get_collection('total_loss'))
return loss
def add_projection(self, rnn_outputs):
with tf.variable_scope("Projection", initializer=tf.contrib.layers.xavier_initializer()) as scope:
U = tf.get_variable("U", [self.config.hidden_size, len(self.vocab)])
b_2 = tf.get_variable("b_2", [len(self.vocab)])
outputs = [tf.matmul(x, U) + b_2 for x in rnn_outputs]
return outputs
def train_RNNLM():
config = Config()
gen_config = deepcopy(config)
gen_config.batch_size = gen_config.num_steps = 1
with tf.variable_scope('RNNLM') as scope:
model = RNNLM_Model(config)
# This instructs gen_model to reuse the same variables as the model above
scope.reuse_variables()
gen_model = RNNLM_Model(gen_config)
init = tf.initialize_all_variables()
saver = tf.train.Saver()
with tf.Session() as session:
best_val_pp = float('inf')
best_val_epoch = 0
session.run(init)
for epoch in xrange(config.max_epochs):
print 'Epoch {}'.format(epoch)
start = time.time()
###
train_pp = model.run_epoch(
session, model.encoded_train,
train_op=model.train_step)
valid_pp = model.run_epoch(session, model.encoded_valid)
print 'Training perplexity: {}'.format(train_pp)
print 'Validation perplexity: {}'.format(valid_pp)
if valid_pp < best_val_pp:
best_val_pp = valid_pp
best_val_epoch = epoch
saver.save(session, './ptb_rnnlm.weights')
if epoch - best_val_epoch > config.early_stopping:
break
print 'Total time: {}'.format(time.time() - start)
Seems that the code is trying to create a new Adam Variable in each batch.
Possible that the add_training_op is called twice?
Also, the snippet of def add_training_op is incomplete since there is no return statement.
The problem turned out to be the following line of code:
model = RNNLM_Model(config)
# This instructs gen_model to reuse the same variables as the model above
scope.reuse_variables()
gen_model = RNNLM_Model(gen_config)
It turns out that the second model was an issue by using reuse_variables(). By removing this line by issues went away.