I've extended the python implementation of WGAN-GP from here: https://keras.io/examples/generative/wgan_gp/
Basically, I added a callback to the fit function:
class GANCheckpoint(keras.callbacks.Callback):
    def __init__(self, cpkt=None, manager=None):
        self.cpkt = cpkt
        self.manager = manager

    def on_epoch_begin(self, epoch, logs=None):
        if self.manager.latest_checkpoint:
            self.cpkt.restore(self.manager.latest_checkpoint)
            print("Restored from {}".format(self.manager.latest_checkpoint))
        else:
            print("Initializing from scratch.")

    def on_epoch_end(self, epoch, logs=None):
        save_path = self.manager.save()
        self.cpkt.step.assign_add(1)
        print("\nSaved checkpoint for step {}: {}".format(int(self.cpkt.step), save_path))
And the checkpoint manager is initialized as:
# Checkpoint manager
checkpoint_dir = './training_checkpoints/GAN/'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                 d_model=d_model, g_model=g_model,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator_optimizer=generator_optimizer)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=None)
cbk = GANCheckpoint(cpkt=checkpoint, manager=manager)
Finally I have the fit call:
wgan.fit(X, batch_size=BATCH_SIZE, epochs=epochs, verbose=True, callbacks=[cbk])
I'm using checkpoint.restore(manager.latest_checkpoint) to restore the weights in another Python file.
However, my generator's results are way off compared to what they are supposed to be.
I'm using the following code:
for i in range(10):
    a = tf.random.normal(shape=(1, 128))
    sample = checkpoint.g_model.predict(a)
    print(sample)
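For completeness, this is roughly what the restore side looks like in the second file. It is only a sketch: the get_generator_model() / get_discriminator_model() builders and the optimizer settings stand in for my actual code, following the Keras WGAN-GP example.
# Restore sketch (assumed builders and optimizers, not the exact training code).
# The Checkpoint must wrap freshly built objects with the same keys
# (step, d_model, g_model and both optimizers) as the one that was saved.
import tensorflow as tf
from tensorflow import keras

g_model = get_generator_model()        # assumed: same architecture as in training
d_model = get_discriminator_model()    # assumed: same architecture as in training
generator_optimizer = keras.optimizers.Adam(0.0002, beta_1=0.5, beta_2=0.9)
discriminator_optimizer = keras.optimizers.Adam(0.0002, beta_1=0.5, beta_2=0.9)

checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                 d_model=d_model, g_model=g_model,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator_optimizer=generator_optimizer)
manager = tf.train.CheckpointManager(checkpoint, './training_checkpoints/GAN/', max_to_keep=None)

status = checkpoint.restore(manager.latest_checkpoint)
# status.assert_existing_objects_matched() would raise if the rebuilt models
# do not line up with the variables stored in the checkpoint.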
I checked the weights of the generator and the optimizer; they're coherent and seem identical.
Are checkpoints tied to a specific Python file?
Additionally, even when I try to restore a checkpoint in the original Python file without fitting the model first, it does not work either.
Do you have any idea?
Thanks in advance
Related
I have a very simple model which consists of one tf.Variable(), and here is the whole code:
import tensorflow as tf

save_path = "model1/model1.ckpt"
num_input = 2
n_nodes_hl1 = 2

with tf.variable_scope("model1"):
    hidden_1_layer = {
        'weights': tf.Variable(tf.random_normal([num_input, n_nodes_hl1]), name='Weight1')
    }

def train_model():
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        save_model(sess)

def save_model(sess):
    saver = tf.train.Saver(tf.global_variables(), save_path)
    saver.save(sess, save_path)

def load_model(sess):
    saver = tf.train.Saver(tf.global_variables(), save_path)
    saver.restore(sess, save_path)

def run_model():
    print("model1 running...")
    with tf.Session() as sess:
        load_model(sess)
        x = sess.run(hidden_1_layer)
        print(x)

#train_model()
The second model is exactly the same, but with "model1" changed to "model2". Both models are trained, saved, and work perfectly on their own. So now I want to test them using the following script:
import model1 as m1
import model2 as m2
m1.run_model()
m2.run_model()
And here I got an error message:
NotFoundError (see above for traceback): Key model2/Weight2 not found in checkpoint
So it looks like importing both modules adds all variables to a common graph (even though they are in separate variable scopes), and then the saver cannot find the model2 variable in the checkpoint saved by model1.
Can anyone solve my problem?
Is it possible in TensorFlow to run a few different models in one script?
EDIT - PROBLEM SOLVED
The solution is very easy. What you have to do is create a separate graph for each model. This means that all tensors you declare or compute must live inside that graph. You also must pass it as an argument to the Session, like tf.Session(graph=self.graph).
The whole example is below:
import tensorflow as tf

save_path = "model1/model1.ckpt"

class model1:
    num_input = 2
    n_nodes_hl1 = 2

    def __init__(self):
        # Build the model inside its own graph so its variables do not
        # collide with those of other models in the default graph.
        self.graph = tf.Graph()
        with self.graph.as_default():
            with tf.variable_scope("model1"):
                self.hidden_1_layer = {
                    'weights': tf.Variable(tf.random_normal([self.num_input, self.n_nodes_hl1]),
                                           name='Weight1')
                }

    def train_model(self):
        with self.graph.as_default():
            init = tf.global_variables_initializer()
        with tf.Session(graph=self.graph) as sess:
            sess.run(init)
            self.save_model(sess)

    def save_model(self, sess):
        with self.graph.as_default():
            saver = tf.train.Saver(tf.global_variables(), save_path)
        saver.save(sess, save_path)

    def load_model(self, sess):
        with self.graph.as_default():
            saver = tf.train.Saver(tf.global_variables(), save_path)
        saver.restore(sess, save_path)

    def run_model(self):
        print("model1 running...")
        with tf.Session(graph=self.graph) as sess:
            self.load_model(sess)
            x = sess.run(self.hidden_1_layer)
            print(x)
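With that, a small test script like the following should run both models side by side. This is just a sketch: it assumes model1.py contains the class above, that model2.py contains an analogous model2 class, and that both models have already been trained and saved.
import model1
import model2

m1 = model1.model1()   # each instance owns its own tf.Graph
m2 = model2.model2()   # assumed analogous class with "model1" replaced by "model2"

m1.run_model()
m2.run_model()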
Oh, the common "I want to use several models" question! Just make sure that you reset the graph after each model:
tf.reset_default_graph()
Your code would look like:
import tensorflow as tf
import model1 as m1
m1.run_model()
tf.reset_default_graph()
import model2 as m2
m2.run_model()
Why? The moment you create a variable in TensorFlow using tf.Variable, that variable is added to the default graph. If you import both models one after the other, you have just created all the variables in the default graph! This is by far the easiest solution. Consider the default graph as a blackboard: you can draw your fancy ML model on it, but you need to wipe it clean before reusing it!
NOTE: If you are wondering, the alternative is to create separate graphs for each of the models, but it is much more worrisome and I only recommend it for the times when you must have both models available at the same time.
EXTRA: Encapsulating your model in a TensorFlow class
A fancier way to do it while avoiding several graphs (seriously, it is horrible!) is to encapsulate the whole model in a class. Thus, your code would look like this:
import tensorflow as tf

class model():
    def __init__(self, new_save_path):
        self.num_input = 2
        self.n_nodes_hl1 = 2
        self.save_path = new_save_path
        tf.reset_default_graph()
        with tf.variable_scope("model1"):
            self.hidden_1_layer = {
                'weights': tf.Variable(tf.random_normal([self.num_input,
                                                         self.n_nodes_hl1]), name='Weight1')
            }
        self.saver = tf.train.Saver(tf.global_variables(), self.save_path)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def save_model(self):
        self.saver.save(self.sess, self.save_path)

    def load_model(self):
        self.saver.restore(self.sess, self.save_path)

    def run_model(self):
        print("model1 running...")
        self.load_model()
        x = self.sess.run(self.hidden_1_layer)
        print(x)
        # train_model(self)
This way you could simply do:
from model import model

m1 = model('model1/model1.ckpt')  # These two lines could be put into one
m1.run_model()                    # m1 = model('model1/model1.ckpt').run_model()
m2 = model('model2/model2.ckpt')
m2.run_model()
You still want it in a for loop?
from model import model

model_file_list = ['model1/model1.ckpt', 'model2/model2.ckpt']
for model_file in model_file_list:
    m = model(model_file).run_model()
    # Run tests, print stuff, save stuff here!
I was trying to understand how DataLoader works.
This is how I applied it:
# DATASET
class Word2VecDataset(torch_data.Dataset):
    def __init__(self, vocabulary):
        super(Word2VecDataset, self).__init__()
        self.data_list = []
        self.vocab = vocabulary
        self.generate_batch_list()

    def __getitem__(self, index):
        return self.data_list[index]

    def __len__(self):
        return len(self.data_list)

    def generate_batch_list(self):
        training_data = self.vocab.get_training_phrases()
        for query in training_data.Query:
            query = utils.skip_gram_tokenize(vocab=self.vocab, sentence=query)
            for entry in query:
                self.data_list.append(entry)
        for response in training_data.Response:
            response = utils.skip_gram_tokenize(vocab=self.vocab, sentence=response)
            for entry in response:
                self.data_list.append(entry)
And this is the actual dataloader part:
dataset = Word2VecDataset(self.vocab)
data_loader = torch_data.DataLoader(dataset, self.batch_size, True, num_workers=4)
print('Model Initialized')
for epo in range(self.num_epochs):
    loss_val = None
    for i_batch, sample_batched in enumerate(data_loader):  # This seems to be causing issues. For some reason this is the part that 'reboots' the whole model, making it print twice (more info under the code)
        loss_val = 0
        for data, target in sample_batched:
            ....
Now, weirdly enough, both the initialization phase (which you don't see here) that prints 'This is the gpu detected: xxx' and the print('Model Initialized') get printed twice.
Finally here (pastebin) is the full console log (with the error).
I had the same issue. I solved it by using if __name__ == '__main__': in my Python code, but I was unable to solve the broken pipe error in my Jupyter notebook...
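For reference, here is a sketch of how that guard looks in the script from the question (the train() wrapper and its arguments are placeholders). With num_workers > 0, the DataLoader workers are started via spawn on Windows and macOS, which re-imports the main module, so any top-level code such as the model initialization runs again in every worker unless it is protected like this:
# Sketch of the __main__ guard (Word2VecDataset is the class from the question;
# train() and its arguments are placeholders).
import torch.utils.data as torch_data

def train(vocab, batch_size, num_epochs):
    dataset = Word2VecDataset(vocab)
    data_loader = torch_data.DataLoader(dataset, batch_size,
                                        shuffle=True, num_workers=4)
    print('Model Initialized')   # now printed only once, in the main process
    for epo in range(num_epochs):
        for i_batch, sample_batched in enumerate(data_loader):
            pass                 # training step goes here

if __name__ == '__main__':
    vocab = ...                  # build the vocabulary here, not at import time
    train(vocab, batch_size=32, num_epochs=5)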
I'm trying to fit multiple small Keras models in parallel on a single GPU. For various reasons I need to get them out of a list and train them one step at a time. Since I was not lucky with the standard multiprocessing module, I use pathos.
What I tried to do is something like this:
from pathos.multiprocessing import ProcessPool as Pool
import tensorflow as tf
import keras.backend as K

def multiprocess_step(self, model):
    K.set_session(sess)
    with sess.graph.as_default():
        model = step(model, sess)
    return model

def step(model, sess):
    K.set_session(sess)
    with sess.graph.as_default():
        model.fit(x=data['X_train'], y=data['y_train'],
                  batch_size=batch_size,
                  validation_data=(data['X_test'], data['y_test']),
                  verbose=verbose,
                  shuffle=True,
                  initial_epoch=self.step_num - 1)
    return model

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = "0"
sess = tf.Session(config=config)
K.set_session(sess)

with sess.graph.as_default():
    pool = Pool(8).map
    model_list = pool(multiprocess_step, model_list)
but whatever I try, I keep getting an error claiming that the models don't seem to be on the same graph...
ValueError: Tensor("training/RMSprop/Variable:0", shape=(25, 352), dtype=float32_ref) must be from the same graph as Tensor("RMSprop/rho/read:0", shape=(), dtype=float32).
The exception originates in the model.fit() row so I must have done something wrong with the assignment of the session graph even though I tried to set that in every possible location?
Does anyone have experience with something similar?
The following was suggested on the Keras issue tracker. I'm not sure about the relative merits of the approach compared to using multiprocessing.
in_1 = Input()
lstm_1 = LSTM(...)(in_1)
out_1 = Dense(...)(lstm_1)
in_2 = Input()
lstm_2 = LSTM(...)(in_2)
out_2 = Dense(...)(lstm_2)
model_1 = Model(input=in_1, output=out_1)
model_2 = Model(input=in_2, output=out_2)
model = Model(input = [in_1, in_2], output = [out_1, out_2])
model.compile(...)
model.fit(...)
model_1.predict(...)
model_2.predict(...)
Assuming the Keras backend is set to TensorFlow, you can use the following code to do parallel processing for multiple model invocation / multiple model loading.
import os
from tensorflow import Graph, Session
from keras.models import model_from_json

# config: a tf.ConfigProto defined elsewhere (e.g. with the GPU options you need)

def model1(dir_model):
    model = os.path.join(dir_model, 'model.json')
    dir_weights = os.path.join(dir_model, 'model.h5')
    graph1 = Graph()
    with graph1.as_default():
        session1 = Session(graph=graph1, config=config)
        with session1.as_default():
            with open(model, 'r') as data:
                model_json = data.read()
            model_1 = model_from_json(model_json)
            model_1.load_weights(dir_weights)
    return model_1, session1, graph1

def model_2(dir_model):
    model = os.path.join(dir_model, 'model.json')
    dir_weights = os.path.join(dir_model, 'model.h5')
    graph2 = Graph()
    with graph2.as_default():
        session2 = Session(graph=graph2, config=config)
        with session2.as_default():
            with open(model, 'r') as data:
                model_json = data.read()
            model_2 = model_from_json(model_json)
            model_2.load_weights(dir_weights)
    return model_2, session2, graph2
And to invoke a specific model, do the following.
For a model 1 prediction:
K.set_session(session1)
with graph1.as_default():
    img_pred[img_name] = patch_dict[np.argmax(np.squeeze(model_1.predict(img_invoke)))]
and for model 2 it follows the same pattern:
K.set_session(session2)
with graph2.as_default():
    img_pred[img_name] = patch_dict[np.argmax(np.squeeze(model_2.predict(img_invoke)))]
This article illustrates how to add runtime statistics to TensorBoard:
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
summary, _ = sess.run([merged, train_step],
                      feed_dict=feed_dict(True),
                      options=run_options,
                      run_metadata=run_metadata)
train_writer.add_run_metadata(run_metadata, 'step%d' % i)
train_writer.add_summary(summary, i)
print('Adding run metadata for', i)
which creates the following details in TensorBoard:
This is fairly straightforward on a single machine. How could one do this in a distributed environment using Estimators?
I use the following hook, based on ProfilerHook, to have the estimator output the run metadata into the model directory for later inspection with TensorBoard.
import tensorflow as tf
from tensorflow.python.training.session_run_hook import SessionRunHook, SessionRunArgs
from tensorflow.python.training import training_util
from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer

class MetadataHook(SessionRunHook):
    def __init__(self,
                 save_steps=None,
                 save_secs=None,
                 output_dir=""):
        self._output_tag = "step-{}"
        self._output_dir = output_dir
        self._timer = SecondOrStepTimer(
            every_secs=save_secs, every_steps=save_steps)

    def begin(self):
        self._next_step = None
        self._global_step_tensor = training_util.get_global_step()
        self._writer = tf.summary.FileWriter(self._output_dir, tf.get_default_graph())
        if self._global_step_tensor is None:
            raise RuntimeError("Global step should be created to use ProfilerHook.")

    def before_run(self, run_context):
        self._request_summary = (
            self._next_step is None or
            self._timer.should_trigger_for_step(self._next_step)
        )
        requests = {"global_step": self._global_step_tensor}
        opts = (tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                if self._request_summary else None)
        return SessionRunArgs(requests, options=opts)

    def after_run(self, run_context, run_values):
        stale_global_step = run_values.results["global_step"]
        global_step = stale_global_step + 1
        if self._request_summary:
            global_step = run_context.session.run(self._global_step_tensor)
            self._writer.add_run_metadata(
                run_values.run_metadata, self._output_tag.format(global_step))
            self._writer.flush()
        self._next_step = global_step + 1

    def end(self, session):
        self._writer.close()
To use it, one creates the estimator instance (my_estimator) as usual, whether it is a pre-made or a custom estimator, and calls the desired operation passing an instance of the class above as a hook. For example:
hook = MetadataHook(save_steps=1, output_dir=<model dir>)
my_estimator.train( train_input_fn, hooks=[hook] )
The run metadata will be placed in the model dir and can be inspected by TensorBoard.
You may use tf.train.ProfilerHook. However, the catch is that it was released in 1.14.
Example usage:
estimator = tf.estimator.LinearClassifier(...)
hooks = [tf.train.ProfilerHook(output_dir=model_dir, save_secs=600, show_memory=False)]
estimator.train(input_fn=train_input_fn, hooks=hooks)
Executing the hook will generate files timeline-xx.json in output_dir.
Then open chrome://tracing/ in the Chrome browser and load the file. You will get a time usage timeline like the one below.
I am trying to restore the graph from a model that I trained following the TensorFlow tutorials. Here is how I try to restore the model:
import tensorflow as tf
import reader
from ptb_word_lm import PTBInput, PTBModel, get_config, run_epoch

def main(_):
    checkpoint_path = "/Users/roger/data/ptb_out"
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
    raw_data = reader.ptb_raw_data("/Users/roger/data/simple-examples/small_data")
    train_data, valid_data, test_data, _ = raw_data

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        saver = tf.train.import_meta_graph(checkpoint_path + ".meta")
        saver.restore(session, checkpoint_path)

        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)

        test_perplexity = run_epoch(session, mtest)
        print("Test Perplexity: %.3f" % test_perplexity)

if __name__ == "__main__":
    tf.app.run()
However, I find that the variable Model/embedding which is created here is not restored from the graph, so I get an error like this:
ValueError: Variable Model/embedding does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
So how can I restore the model correctly?
I think that, since you set reuse=True in your variable scope, it tries to find that variable instead of creating it when you call PTBModel(). If you use get_variable() with reuse=True in a scope, it will never create a new variable.
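A toy illustration of that behaviour (a minimal sketch with made-up names, not the PTB code): with the default reuse=None, get_variable() creates the variable, while with reuse=True it only looks up an existing one and raises a ValueError if the name was never created in the current graph.
# Minimal sketch of tf.get_variable() reuse semantics (TF 1.x); names are made up.
import tensorflow as tf

with tf.variable_scope("Model"):              # reuse defaults to None: creates the variable
    v = tf.get_variable("embedding", shape=[4, 2])

with tf.variable_scope("Model", reuse=True):  # reuse=True: only looks up existing variables
    v_again = tf.get_variable("embedding")    # returns the same variable as v
    # tf.get_variable("other") would raise:
    # ValueError: Variable Model/other does not exist, or was not created with tf.get_variable().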