I am trying to implement a suggestion from the answers to:
Tensorflow: how to save/restore a model?
I have a class that wraps a TensorFlow model in sklearn style.
import tensorflow as tf
import numpy as np

class tflasso():
    saver = tf.train.Saver()

    def __init__(self,
                 learning_rate = 2e-2,
                 training_epochs = 5000,
                 display_step = 50,
                 BATCH_SIZE = 100,
                 ALPHA = 1e-5,
                 checkpoint_dir = "./",
                 ):
        ...

    def _create_network(self):
        ...

    def _load_(self, sess, checkpoint_dir = None):
        if checkpoint_dir:
            self.checkpoint_dir = checkpoint_dir
        print("loading a session")
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise Exception("no checkpoint found")
        return

    def fit(self, train_X, train_Y, load = True):
        self.X = train_X
        self.xlen = train_X.shape[1]
        # n_samples = y.shape[0]
        self._create_network()
        tot_loss = self._create_loss()
        optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(tot_loss)
        # Initializing the variables
        init = tf.initialize_all_variables()
        " training per se"
        getb = batchgen(self.BATCH_SIZE)
        yvar = train_Y.var()
        print(yvar)
        # Launch the graph
        NUM_CORES = 3  # Choose how many cores to use.
        sess_config = tf.ConfigProto(inter_op_parallelism_threads=NUM_CORES,
                                     intra_op_parallelism_threads=NUM_CORES)
        with tf.Session(config=sess_config) as sess:
            sess.run(init)
            if load:
                self._load_(sess)
            # Fit all training data
            for epoch in range(self.training_epochs):
                for (_x_, _y_) in getb(train_X, train_Y):
                    _y_ = np.reshape(_y_, [-1, 1])
                    sess.run(optimizer, feed_dict={self.vars.xx: _x_, self.vars.yy: _y_})
                # Display logs per epoch step
                if (1 + epoch) % self.display_step == 0:
                    cost = sess.run(tot_loss,
                                    feed_dict={self.vars.xx: train_X,
                                               self.vars.yy: np.reshape(train_Y, [-1, 1])})
                    rsq = 1 - cost / yvar
                    logstr = "Epoch: {:4d}\tcost = {:.4f}\tR^2 = {:.4f}".format((epoch + 1), cost, rsq)
                    print(logstr)
                    self.saver.save(sess, self.checkpoint_dir + 'model.ckpt',
                                    global_step=1 + epoch)
            print("Optimization Finished!")
            return self
When I run:
tfl = tflasso()
tfl.fit( train_X, train_Y , load = False)
I get output:
Epoch: 50 cost = 38.4705 R^2 = -1.2036
b1: 0.118122
Epoch: 100 cost = 26.4506 R^2 = -0.5151
b1: 0.133597
Epoch: 150 cost = 22.4330 R^2 = -0.2850
b1: 0.142261
Epoch: 200 cost = 20.0361 R^2 = -0.1477
b1: 0.147998
However, when I try to recover the parameters (even without killing the object):
tfl.fit( train_X, train_Y , load = True)
I get strange results. First of all, the loaded value does not correspond to the saved one.
loading a session
loaded b1: 0.1 <------- Loaded another value than saved
Epoch: 50 cost = 30.8483 R^2 = -0.7670
b1: 0.137484
What is the right way to load, and perhaps first inspect, the saved variables?
TL;DR: You should try to rework this class so that self._create_network() is called (i) only once, and (ii) before the tf.train.Saver() is constructed.
There are two subtle issues here, which stem from the code structure and from the default behavior of the tf.train.Saver constructor. When you construct a saver with no arguments (as in your code), it collects the current set of variables in your program and adds ops to the graph for saving and restoring them. In your code, when you call tflasso(), it will construct a saver, and there will be no variables (because _create_network() has not yet been called). As a result, the checkpoint should be empty.
The second issue is that—by default—the format of a saved checkpoint is a map from the name property of a variable to its current value. If you create two variables with the same name, they will be automatically "uniquified" by TensorFlow:
v = tf.Variable(..., name="weights")
assert v.name == "weights"
w = tf.Variable(..., name="weights")
assert v.name == "weights_1" # The "_1" is added by TensorFlow.
The consequence of this is that, when you call self._create_network() in the second call to tfl.fit(), the variables will all have different names from the names stored in the checkpoint (or the names that would have been stored, had the saver been constructed after the network). You can avoid this behavior by passing a name-to-Variable dictionary to the saver constructor, but this is usually quite awkward.
There are two main workarounds:
1. In each call to tflasso.fit(), create the whole model afresh: define a new tf.Graph, then build the network and create a tf.train.Saver inside that graph.
2. (Recommended) Create the network, and then the tf.train.Saver, in the tflasso constructor, and reuse this graph on each call to tflasso.fit() (see the sketch below). Note that you might need to do some more work to reorganize things (in particular, I'm not sure what you do with self.X and self.xlen), but it should be possible to achieve this with placeholders and feeding.
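A minimal sketch of the recommended option could look like the following. It assumes a hypothetical _create_network() that returns the input/target placeholders and the total loss, which is not exactly the interface in your code, so treat it as an illustration rather than a drop-in replacement:

import tensorflow as tf

class tflasso():
    def __init__(self, learning_rate = 2e-2, checkpoint_dir = "./"):
        self.learning_rate = learning_rate
        self.checkpoint_dir = checkpoint_dir
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Build the network exactly once, so the variable names stay stable...
            self._xx, self._yy, self.tot_loss = self._create_network()
            self.optimizer = tf.train.AdagradOptimizer(
                self.learning_rate).minimize(self.tot_loss)
            self.init = tf.initialize_all_variables()
            # ...and construct the Saver only after the variables exist.
            self.saver = tf.train.Saver()

    def fit(self, train_X, train_Y, load = True):
        with tf.Session(graph=self.graph) as sess:
            sess.run(self.init)
            if load:
                ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    self.saver.restore(sess, ckpt.model_checkpoint_path)
            # ... training loop as before, feeding self._xx / self._yy, then:
            # self.saver.save(sess, self.checkpoint_dir + 'model.ckpt', global_step=epoch + 1)
        return self

To inspect what a checkpoint actually contains before restoring, you can read it directly; the file name below is just an example based on the output you posted:

reader = tf.train.NewCheckpointReader("./model.ckpt-200")
print(reader.get_variable_to_shape_map())  # names and shapes of the saved variables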
Related
I want to use torch.save() to save a trained model for inference. However, with either load_state_dict() or torch.load(), I can't recover the saved model: the loss computed by the loaded model differs from the loss computed by the model that was saved.
The relevant libraries:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
The model:
class nn_block(nn.Module):
    def __init__(self, feats_dim):
        super(nn_block, self).__init__()
        self.linear = nn.Linear(feats_dim, feats_dim)
        self.bn = nn.BatchNorm1d(feats_dim)
        self.softplus1 = nn.Softplus()
        self.softplus2 = nn.Softplus()

    def forward(self, rep_mat):
        transformed_mat = self.linear(rep_mat)
        transformed_mat = self.bn(transformed_mat)
        transformed_mat = self.softplus1(transformed_mat)
        transformed_mat = self.softplus2(transformed_mat + rep_mat)
        return transformed_mat

class test_nn(nn.Module):
    def __init__(self, in_feats, feats_dim, num_conv, num_classes):
        super(test_nn, self).__init__()
        self.linear1 = nn.Linear(in_feats, feats_dim)
        self.convs = [nn_block(feats_dim) for _ in range(num_conv)]
        self.linear2 = nn.Linear(feats_dim, num_classes)
        self.softmax = nn.Softmax()

    def forward(self, rep_mat):
        h = self.linear1(rep_mat)
        for conv_func in self.convs:
            h = conv_func(h)
        h = self.linear2(h)
        h = self.softmax(h)
        return h
Train, save, and reload a model:
# fake a classification task
num_classes = 2; input_dim = 8
one = np.random.multivariate_normal(np.zeros(input_dim), np.eye(input_dim), 20)
two = np.random.multivariate_normal(np.ones(input_dim), np.eye(input_dim), 20)
inputs = np.concatenate([one, two], axis=0)
labels = np.concatenate([np.zeros(20), np.ones(20)])
inputs = Variable(torch.Tensor(inputs))
labels = torch.LongTensor(labels)

# build a model
net = test_nn(input_dim, 5, 2, num_classes)
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
net.train()

losses = []
best_score = 1e10
for epoch in range(25):
    preds = net(inputs)
    loss = F.cross_entropy(preds, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    state_dict = {'state_dict': net.state_dict()}
    if loss.item() - best_score < -1e-4:
        # save only parameters
        torch.save(state_dict, 'model_params.torch')
        # save the whole model
        torch.save(net, 'whole_model.torch')
    best_score = np.min([best_score, loss.item()])
    losses.append(loss.item())

net_params = test_nn(input_dim, 5, 2, num_classes)
net_params.load_state_dict(torch.load('model_params.torch')['state_dict'])
net_params.eval()
preds_params = net_params(inputs)
loss_params = F.cross_entropy(preds_params, labels)
print('reloaded params %.4f %.4f' % (loss_params.item(), np.min(losses)))

net_whole = torch.load('whole_model.torch')
net_whole.eval()
preds_whole = net_whole(inputs)
loss_whole = F.cross_entropy(preds_whole, labels)
print('reloaded whole %.4f %.4f' % (loss_whole.item(), np.min(losses)))
As you can see by running the code, the losses computed by the two loaded models are different, even though the two loaded models should be exactly the same. Not only are the two losses different from each other, they also differ from the loss computed by the best model that was saved in the first place.
Why can this happen?
The state dict contains every parameter (nn.Parameter) and buffer (similar to a parameter, but not trained/optimised) that has been registered on the module and all of its submodules. Everything else is not included in the state dict.
Your test_nn module uses a plain Python list for convs, therefore its contents are not included in the state dict:
self.convs = [nn_block(feats_dim) for _ in range(num_conv)]
Not only are they not contained in the state dict, they are also not visible to net.parameters(), which means they are not trained/optimised at all.
To register the modules from the list you can wrap it in nn.ModuleList, which is a module that acts like a list, while correctly registering the modules it contains:
self.convs = nn.ModuleList([nn_block(feats_dim) for _ in range(num_conv)])
With that change both models produce the same result.
Since you are calling the convs modules sequentially in the for-loop (the output of one module is the input of the next), you may consider using nn.Sequential, which you can call directly instead of having to use the for-loop; see the sketch below. nn.Sequential is used a lot and it just makes things a little simpler: for example, if you want to replace the sequence of modules with a single module, you don't need to change anything in the forward method.
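A minimal sketch of that suggestion, using the same nn_block as above (only the two marked lines change):

# in test_nn.__init__: the blocks are registered and kept in order
self.convs = nn.Sequential(*[nn_block(feats_dim) for _ in range(num_conv)])

# in test_nn.forward: call the whole sequence directly, no for-loop needed
h = self.convs(h)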
Not only are the two losses different from each other, they also differ from the loss computed by the best model that was saved in the first place.
When you are training, you calculate the loss for the current input (batch) and then you optimise the parameters based on that input. This means your parameters differ from the ones used to calculate the loss. Because you are saving the model after that, it will also have a different loss (the one that would occur in the next iteration).
preds = net(inputs)
# Calculating the loss of the current model
loss = F.cross_entropy(preds, labels)
optimizer.zero_grad()
loss.backward()
# Updating the model's parameters based on the loss
optimizer.step()

# State of the model after it has been updated
state_dict = {'state_dict': net.state_dict()}

# Comparing the loss from BEFORE the update
# But saving the model from AFTER the update
if loss.item() - best_score < -1e-4:
    # save only parameters
    torch.save(state_dict, 'model_params.torch')
    # save the whole model
    torch.save(net, 'whole_model.torch')
It's important to evaluate the model after the updates have been made. For this reason a validation set should be used, which is run after each epoch to assess the model's accuracy.
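A rough sketch of that idea, reusing the names from the training loop above (and assuming, for brevity, that the loss itself is the score being tracked; ideally the evaluation would run on a held-out validation set rather than on inputs):

# after optimizer.step(): re-evaluate with the UPDATED parameters
net.eval()
with torch.no_grad():
    val_preds = net(inputs)
    val_loss = F.cross_entropy(val_preds, labels)
net.train()

# the recorded score and the saved checkpoint now refer to the same parameters
if val_loss.item() - best_score < -1e-4:
    torch.save({'state_dict': net.state_dict()}, 'model_params.torch')
    best_score = min(best_score, val_loss.item())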
I am using TensorFlow and I have developed a deep multilayer feedforward model. To be sure about the performance of the model, I decided to evaluate it with 10-fold cross-validation. In each fold I create a new instance of the neural network and call the train and predict functions.
In each fold I call the following code:
for each fold:
    nn = ffNN(hidden_nodes, epochs, learning_rate, saveFrequency, save_path, decay, decay_step, decay_factor, stop_loss, keep_probability, regularization_factor, minimum_cost, activation_function, batch_size, shuffle, stopping_iteration)
    nn.initialize(x_size)
    nn.train(X, y)
    nn.predict(X_test)
In the ffNN file I have the initialization, train, and predict functions as follows:
nn.train:
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()
for each epoch:
    for each batch:
        _, loss = session.run([self.optimizer, self.loss], feed_dict={self.X: X1, self.y: y})
    if epoch % save_frequency == 0:
        saver.save(session, save_path)
sess.close()
The problem is with saver.save: in each fold it takes longer and longer to save. Although I create all of the variables from scratch, I don't know what makes saving depend on the fold and take longer and longer.
Thanks in advance.
Edit:
The code for building the model in nn.initialize is as follows:
self.X = tf.placeholder("float", shape=[None, x_size], name='XValue')
self.y = tf.placeholder("float", shape=[None, y_size], name='yValue')
with tf.variable_scope("initialization", reuse=tf.AUTO_REUSE):
    w_in, b_in = init_weights((x_size, self.hidden_nodes))
    h_out = self.forwardprop(self.X, w_in, b_in, self.keep_prob, self.activation_function)
    l2_norm = tf.add(tf.nn.l2_loss(w_in), tf.nn.l2_loss(b_in))
    w_out, b_out = init_weights((self.hidden_nodes, y_size))
    l2_norm = tf.add(tf.nn.l2_loss(w_out), l2_norm)
    l2_norm = tf.add(tf.nn.l2_loss(b_out), l2_norm)
    self.yhat = tf.add(tf.matmul(h_out, w_out), b_out)
    self.mse = tf.losses.mean_squared_error(labels=self.y, predictions=self.yhat)
    self.loss = tf.add(self.mse, self.regularization_factor * l2_norm)
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
Based on what you described in the question, the problem is not in saver.save; rather, the computational graph keeps getting bigger and bigger, which is why saving takes more and more time. Make sure to structure the code in the following way:
for each fold:
    # Clear the previous computational graph
    tf.reset_default_graph()
    # Then build the graph
    nn = ffNN()
    # Create the saver
    saver = tf.train.Saver()
    # Create a session
    with tf.Session() as sess:
        # Initialize the variables in the graph
        sess.run(tf.global_variables_initializer())
        # Train the model
        for each epoch:
            for each batch:
                nn.train_on_batch()
            if epoch % save_frequency == 0:
                saver.save(sess, save_path)
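An equivalent alternative, if you prefer not to rely on the default graph, is to build each fold in its own explicit tf.Graph. This is only a sketch that assumes the same hypothetical ffNN interface as in the snippets above:

for fold in range(10):
    graph = tf.Graph()
    with graph.as_default():
        nn = ffNN()  # placeholders, weights and ops live only in this fold's graph
        saver = tf.train.Saver()
        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            # training loop as above, calling saver.save(sess, save_path) periodically

Because each fold's graph is released when the loop moves on, nothing accumulates across folds.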
I have written a program with TensorFlow that identifies a number of figures in an image. The model is trained with one function and then used by another function to label the figures. The training has been done on my computer and the resulting model uploaded to AWS together with the solve function.
On my computer it works well, but when I create a Lambda in AWS it behaves strangely and starts giving different answers for the same test data.
The model in the solve function is this:
# Recreate neural network from model file generated during training
# input
x = tf.placeholder(tf.float32, [None, size_of_image])
# weights
W = tf.Variable(tf.zeros([size_of_image, num_chars]))
# biases
b = tf.Variable(tf.zeros([num_chars]))
The solve function code to label the figures is this:
for testi in range(captcha_letters_num):
    # load model from file
    saver = tf.train.import_meta_graph(model_path + '.meta',
                                       clear_devices=True)
    saver.restore(sess, model_path)
    # Data to label
    test_x = np.asarray(char_imgs[testi], dtype=np.float32)
    predict_op = model(test_x, W, b)
    op = sess.run(predict_op, feed_dict={x: test_x})
    # find max probability from the probability distribution returned by softmax
    max_probability = op[0][0]
    max_probability_index = -1
    for i in range(num_chars):
        if op[0][i] > max_probability:
            max_probability = op[0][i]
            max_probability_index = i
    # append it to final output
    final_text += char_map_list[max_probability_index]
    # Reset the model so it can be used again
    tf.reset_default_graph()
With the same test data it gives different answers, and I don't know why.
Solved!
What I finally did was keep the Session outside the loop and initialize the variables there. After the loop ends, I reset the graph.
saver = tf.train.Saver()
sess = tf.Session()
# Initialize variables
sess.run(tf.global_variables_initializer())
.
.
.
# passing each of the 5 characters through the NNet
for testi in range(captcha_letters_num):
    # Data to label
    test_x = np.asarray(char_imgs[testi], dtype=np.float32)
    predict_op = model(test_x, W, b)
    op = sess.run(predict_op, feed_dict={x: test_x})
    # find max probability from the probability distribution returned by softmax
    max_probability = op[0][0]
    max_probability_index = -1
    for i in range(num_chars):
        if op[0][i] > max_probability:
            max_probability = op[0][i]
            max_probability_index = i
    # append it to final output
    final_text += char_map_list[max_probability_index]

# Reset the model so it can be used again
tf.reset_default_graph()
sess.close()
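One further simplification, not strictly needed for the fix: in the loop above, model(test_x, W, b) rebuilds the prediction op on every iteration, so the graph keeps growing. Assuming model() simply wires up graph ops for whatever tensor it is given, you could build the op once from the placeholder x before the loop and reuse it, roughly like this:

# build the prediction op once, from the placeholder, before the loop
predict_op = model(x, W, b)

for testi in range(captcha_letters_num):
    test_x = np.asarray(char_imgs[testi], dtype=np.float32)
    op = sess.run(predict_op, feed_dict={x: test_x})
    # np.argmax replaces the manual max-probability search
    final_text += char_map_list[int(np.argmax(op[0]))]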
I'm still a newbie in TensorFlow, so I'm sorry if this is a naive question. I'm trying to use the inception_V4 model pretrained on the ImageNet dataset, as published on this site. I'm also using their network as it is, meaning the one published on their site.
Here is how I call the network:
def network(images_op, keep_prob):
    width_needed_InceptionV4Net = 342
    shape = images_op.get_shape().as_list()
    H = int(round(width_needed_InceptionV4Net * shape[1] / shape[2], 2))
    resized_images = tf.image.resize_images(images_op, [width_needed_InceptionV4Net, H], tf.image.ResizeMethod.BILINEAR)
    with slim.arg_scope(inception.inception_v4_arg_scope()):
        logits, _ = inception.inception_v4(resized_images, num_classes=20, is_training=True, dropout_keep_prob=keep_prob)
    return logits
Since I need to retrain the Inception_V4's final layer for my categories, I modified the number of classes to be 20 as you can see in the method call (inception.inception_v4).
Here is the train method I have so far:
def optimistic_restore(session, save_file, flags):
    reader = tf.train.NewCheckpointReader(save_file)
    saved_shapes = reader.get_variable_to_shape_map()
    var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                        if var.name.split(':')[0] in saved_shapes])
    restore_vars = []
    name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()), tf.global_variables()))
    if flags.checkpoint_exclude_scopes is not None:
        exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
    with tf.variable_scope('', reuse=True):
        variables_to_init = []
        for var_name, saved_var_name in var_names:
            curr_var = name2var[saved_var_name]
            var_shape = curr_var.get_shape().as_list()
            if var_shape == saved_shapes[saved_var_name]:
                print(saved_var_name)
                excluded = False
                for exclusion in exclusions:
                    if saved_var_name.startswith(exclusion):
                        variables_to_init.append(var)
                        excluded = True
                        break
                if not excluded:
                    restore_vars.append(curr_var)
    saver = tf.train.Saver(restore_vars)
    saver.restore(session, save_file)
def train(images, ids, labels, total_num_examples, batch_size, train_dir, network, flags,
          optimizer, log_periods, resume):
    """!#brief Trains the network for a number of steps.

    #param images               image tensor
    #param ids                  id tensor
    #param labels               label tensor
    #param total_num_examples   total number of training examples
    #param batch_size           batch size
    #param train_dir            directory where checkpoints should be saved
    #param network              pointer to a function describing the network
    #param flags                command-line arguments
    #param optimizer            pointer to the optimization class
    #param log_periods          list containing the step intervals at which 1) logs should be printed,
                                2) logs should be saved for TensorBoard and 3) variables should be saved
    #param resume               should training be resumed (or restarted from scratch)?
    #return the number of training steps performed since the first call to 'train'
    """
    # clearing the training directory
    if not resume:
        if tf.gfile.Exists(train_dir):
            tf.gfile.DeleteRecursively(train_dir)
        tf.gfile.MakeDirs(train_dir)
    print('Training the network in directory %s...' % train_dir)
    global_step = tf.Variable(0, trainable = False)
    # creating a placeholder, set to ones, used to assess the importance of each pixel
    mask, ones = _mask(images, batch_size, flags)
    # building a Graph that computes the logits predictions from the inference model
    keep_prob = tf.placeholder_with_default(0.5, [])
    logits = network(images * mask, keep_prob)
    # creating the optimizer
    if optimizer == tf.train.MomentumOptimizer:
        opt = optimizer(flags.learning_rate, flags.momentum)
    else:
        opt = optimizer(flags.learning_rate)
    # calculating the semantic loss, defined as the classification or regression loss
    if flags.boosting_weights is not None and os.path.isfile(flags.boosting_weights):
        boosting_weights_value = np.loadtxt(flags.boosting_weights, dtype = np.float32,
                                            delimiter = ',')
        boosting_weights = tf.placeholder_with_default(boosting_weights_value,
                                                       list(boosting_weights_value.shape),
                                                       name = 'boosting_weights')
        semantic_loss = _boosting_loss(logits, ids, boosting_weights, flags)
    else:
        semantic_loss = _loss(logits, labels, flags)
    tf.add_to_collection('losses', semantic_loss)
    # computing the loss gradient with respect to the mask (i.e. the insight tensor) and
    # penalizing its L1-norm
    # replace 'semantic_loss' with 'tf.reduce_sum(logits)'?
    insight = tf.gradients(semantic_loss, [mask])[0]
    insight_loss = tf.reduce_sum(tf.abs(insight))
    if flags.insight_loss > 0.0:
        with tf.control_dependencies([semantic_loss]):
            tf.add_to_collection('losses', tf.multiply(flags.insight_loss, insight_loss,
                                                       name = 'insight_loss'))
    else:
        tf.summary.scalar('insight_loss_raw', insight_loss)
    # summing all loss factors and computing the moving average of all individual losses and of
    # the sum
    loss = tf.add_n(tf.get_collection('losses'), name = 'total_loss')
    loss_averages_op = tf.train.ExponentialMovingAverage(0.9, name = 'avg')
    losses = tf.get_collection('losses')
    loss_averages = loss_averages_op.apply(losses + [loss])
    # attaching a scalar summary to all individual losses and the total loss;
    # do the same for the averaged version of the losses
    for l in losses + [loss]:
        tf.summary.scalar(l.op.name + '_raw', l)
        tf.summary.scalar(l.op.name + '_avg', loss_averages_op.average(l))
    # computing and applying gradients
    with tf.control_dependencies([loss_averages]):
        grads = opt.compute_gradients(loss)
        apply_gradient = opt.apply_gradients(grads, global_step = global_step)
    # adding histograms for trainable variables and gradients
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)
    tf.summary.histogram('insight', insight)
    # tracking the moving averages of all trainable variables
    variable_averages_op = tf.train.ExponentialMovingAverage(flags.moving_average_decay,
                                                             global_step)
    variable_averages = variable_averages_op.apply(tf.trainable_variables())
    # building a Graph that trains the model with one batch of examples and
    # updates the model parameters
    with tf.control_dependencies([apply_gradient, variable_averages]):
        train_op = tf.no_op(name = 'train')
    # creating a saver
    saver = tf.train.Saver(tf.global_variables())
    # building the summary operation based on the TF collection of Summaries
    summary_op = tf.summary.merge_all()
    # creating a session
    current_global_step = -1
    with tf.Session(config = tf.ConfigProto(log_device_placement = False,
                                            inter_op_parallelism_threads = flags.num_cpus,
                                            device_count = {'GPU': flags.num_gpus})) as sess:
        # initializing variables
        if flags.checkpoint_exclude_scopes is not None:
            optimistic_restore(sess, os.path.join(train_dir, 'inception_V4.ckpt'), flags)
        # starting the queue runners
        ..
        # creating a summary writer
        ..
        # training itself
        ..
        # saving the model checkpoint
        checkpoint_path = os.path.join(train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step = current_global_step)
        # stopping the queue runners
        ..
    return current_global_step
I added a flag called checkpoint_exclude_scopes to the Python script, where I specify which tensors should not be restored. This is required to change the number of classes in the last layer of the network. Here is how I call the Python script:
./toolDetectionInceptions.py --batch_size=32 --running_mode=resume --checkpoint_exclude_scopes=InceptionV4/Logits,InceptionV4/AuxLogits
My first tests were terrible because I ran into too many problems, something like:
tensorflow.python.framework.errors.NotFoundError: Tensor name "InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/weights/read:0" not found in checkpoint files
After some googling I found a workaround on this site, where they propose to use the optimistic_restore function presented in the code above, with some modifications.
But now the problem is something else:
W tensorflow/core/framework/op_kernel.cc:993] Failed precondition: Attempting to use uninitialized value Variable
[[Node: Variable/read = Identity[T=DT_INT32, _class=["loc:#Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable)]]
It seems there is a local variable that is not initialized, but I could not find it. Can you please help?
EDITED:
To debug this problem, I checked the number of variables that should be initialized and restored by adding some logging to the optimistic_restore function. Here is a brief summary:
# saved_shapes 609
# var_names 608
# name2var 1519
# variables_to_init: 7
# restore_vars: 596
# global_variables: 1519
For your information, CheckpointReader.get_variable_to_shape_map() returns a dict mapping tensor names to lists of ints, representing the shape of the corresponding tensor in the checkpoint. This means the number of variables in this checkpoint is 609, while the total number of variables needed for the restore is 1519.
It seems there is a huge gap between the pretrained checkpoint tensors and the variables used by the network architecture (it's actually their network as well). Is there any kind of compression done on the checkpoint? Is what I'm saying accurate?
I know now what's missing: it's just the initialization of the variables that have not been restored. Still, I need to know why there is such a huge difference between their InceptionV4 network architecture and the pretrained checkpoint.
Variables that are not restored with the saver need to be initialized. To this end, you could run v.initializer.run() for each variable v that you don't restore.
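Equivalently, you can collect the non-restored variables and initialize them in one call. A small sketch, assuming restore_vars is the list of variables passed to the Saver as in the function above:

# initialize only the variables that were NOT restored from the checkpoint
restored_names = set(v.name for v in restore_vars)
unrestored = [v for v in tf.global_variables() if v.name not in restored_names]
session.run(tf.variables_initializer(unrestored))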
Here is how the optimistic_restore function should be defined so that it works as expected:
def optimistic_restore(session, save_file, flags):
    if flags.checkpoint_exclude_scopes is not None:
        exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
    reader = tf.train.NewCheckpointReader(save_file)
    saved_shapes = reader.get_variable_to_shape_map()
    print('saved_shapes %d' % len(saved_shapes))
    var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                        if var.name.split(':')[0] in saved_shapes])
    var_names_to_be_initialized = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                                          if var.name.split(':')[0] not in saved_shapes])
    print('var_names %d' % len(var_names))
    print('var_names_to_be_initialized %d' % len(var_names_to_be_initialized))
    restore_vars = []
    name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()), tf.global_variables()))
    print('name2var %d' % len(name2var))
    with tf.variable_scope('', reuse=True):
        variables_to_init = []
        for var_name, saved_var_name in var_names:
            curr_var = name2var[saved_var_name]
            var_shape = curr_var.get_shape().as_list()
            if var_shape == saved_shapes[saved_var_name]:
                excluded = False
                for exclusion in exclusions:
                    if saved_var_name.startswith(exclusion):
                        variables_to_init.append(curr_var)
                        excluded = True
                        break
                if not excluded:
                    restore_vars.append(curr_var)
            else:
                variables_to_init.append(curr_var)
        for var_name, saved_var_name in var_names_to_be_initialized:
            curr_var = name2var[saved_var_name]
            variables_to_init.append(curr_var)
    print('variables_to_init: %d ' % len(variables_to_init))
    print('global_variables: %d ' % len(tf.global_variables()))
    print('restore_vars: %d ' % len(restore_vars))
    saver = tf.train.Saver(restore_vars)
    saver.restore(session, save_file)
    session.run(tf.variables_initializer(variables_to_init))
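For reference, this is roughly how the corrected function is used from the train() method shown earlier (the checkpoint path is the one already used there):

with tf.Session() as sess:
    # restores every variable that matches the checkpoint (minus the excluded scopes)
    # and initializes everything else, i.e. variables created by the training code
    # such as global_step, optimizer slot variables and the moving-average copies
    optimistic_restore(sess, os.path.join(train_dir, 'inception_V4.ckpt'), flags)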
I am trying to use the transfer learning approach. Here is a snapshot of the code where my model is learning over the training data:
max_accuracy = 0.0
saver = tf.train.Saver()
for epoch in range(epocs):
    shuffledRange = np.random.permutation(n_train)
    y_one_hot_train = encode_one_hot(len(classes), Y_input)
    y_one_hot_validation = encode_one_hot(len(classes), Y_validation)
    shuffledX = X_input[shuffledRange,:]
    shuffledY = y_one_hot_train[shuffledRange]
    for Xi, Yi in iterate_mini_batches(shuffledX, shuffledY, mini_batch_size):
        sess.run(train_step,
                 feed_dict={bottleneck_tensor: Xi,
                            ground_truth_tensor: Yi})
        # Every so often, print out how well the graph is training.
        is_last_step = (i + 1 == FLAGS.how_many_training_steps)
        if (i % FLAGS.eval_step_interval) == 0 or is_last_step:
            train_accuracy, cross_entropy_value = sess.run(
                [evaluation_step, cross_entropy],
                feed_dict={bottleneck_tensor: Xi,
                           ground_truth_tensor: Yi})
            validation_accuracy = sess.run(
                evaluation_step,
                feed_dict={bottleneck_tensor: X_validation,
                           ground_truth_tensor: y_one_hot_validation})
            print('%s: Step %d: Train accuracy = %.1f%%, Cross entropy = %f, Validation accuracy = %.1f%%' %
                  (datetime.now(), i, train_accuracy * 100, cross_entropy_value, validation_accuracy * 100))
            result_tensor = sess.graph.get_tensor_by_name(ensure_name_has_port(FLAGS.final_tensor_name))
            probs = sess.run(result_tensor, feed_dict={'pool_3/_reshape:0': Xi[0].reshape(1, 2048)})
            if validation_accuracy > max_accuracy:
                saver.save(sess, 'models/superheroes_model')
                max_accuracy = validation_accuracy
                print(probs)
        i += 1
Here is the code where I am loading the model:
def load_model():
    sess = tf.Session()
    # First let's load meta graph and restore weights
    saver = tf.train.import_meta_graph('models/superheroes_model.meta')
    saver.restore(sess, tf.train.latest_checkpoint('models/'))
    sess.run(tf.global_variables_initializer())
    result_tensor = sess.graph.get_tensor_by_name(ensure_name_has_port(FLAGS.final_tensor_name))
    X_feature = features[0].reshape(1, 2048)
    probs = sess.run(result_tensor,
                     feed_dict={'pool_3/_reshape:0': X_feature})
    print probs
    return sess
So now, for the same data point, I am getting totally different results during training and testing. It's not even close. During testing, my probabilities are close to 25%, as I have 4 classes. But during training the highest class probability is 90%.
Is there any issue while saving or restoring the model?
Be careful -- you are calling
sess.run(tf.global_variables_initializer())
after calling
saver.restore(sess,tf.train.latest_checkpoint('models/'))
I've done similar before, and I think that resets all your trained weights/biases/etc. in the restored model.
If you must, call the initializer prior to restoring the model, and if you need to initialize something specific from the restored model, do it individually.
Delete sess.run(tf.global_variables_initializer()) in your load_model function. If you keep it, all your trained parameters will be replaced with their initial values, which produce a probability of roughly 1/4 for each class.
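A sketch of the corrected load_model, based on the code in the question (ensure_name_has_port, FLAGS and features are taken from there):

def load_model():
    sess = tf.Session()
    # load the meta graph and restore the trained weights;
    # no global_variables_initializer() afterwards, so the restored values are kept
    saver = tf.train.import_meta_graph('models/superheroes_model.meta')
    saver.restore(sess, tf.train.latest_checkpoint('models/'))
    result_tensor = sess.graph.get_tensor_by_name(ensure_name_has_port(FLAGS.final_tensor_name))
    X_feature = features[0].reshape(1, 2048)
    probs = sess.run(result_tensor, feed_dict={'pool_3/_reshape:0': X_feature})
    print(probs)
    return sess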