I am new to TensorFlow and I can't figure out why I am getting this error, since I think I've initialized all my variables.
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value Variable_2
[[Node: Variable_2/read = Identity[T=DT_FLOAT, _class=["loc:@Variable_2"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable_2)]]
It seems to belong to the following summary_ops:
File "/Users/ldg/PycharmProjects/TF", line 274, in train
summary_ops = setup_summaries()
File "/Users/ldg/PycharmProjects/TF.py", line 238, in setup_summaries
logged_epsilon = tf.Variable(0.)
I am putting the dependent code below in order to make it clear.
g = tf.Graph()
session = tf.InteractiveSession(graph=g)
with g.as_default(), session.as_default():
    K.set_session(session)
    num_actions = get_num_actions()
    graph_ops = build_graph(num_actions)
    saver = tf.train.Saver()

    session.run(tf.global_variables_initializer())
    # session.run(init_op)

    # Initialize target network weights
    session.run(graph_ops["reset_target_network_params"])

    # Set up game environments (one per thread)
    envs = [gym.make(FLAGS.game) for i in range(FLAGS.num_concurrent)]

    summary_ops = setup_summaries()
    summary_op = summary_ops[-1]

    # Initialize variables
    summary_save_path = summary_dir + "/" + experiment
    writer = tf.summary.FileWriter(summary_save_path, session.graph)

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Show the agents training and write summary statistics
    last_summary_time = 0
    while True:
        now = time.time()
        if now - last_summary_time > FLAGS.summary_interval:
            summary_str = session.run(summary_op)
            writer.add_summary(summary_str, float(T))
            last_summary_time = now
with the encapsulated setup_summaries():
def setup_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Episode Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Max Q Value", episode_ave_max_q)
    logged_epsilon = tf.Variable(0.)
    tf.summary.scalar("Epsilon", logged_epsilon)
    logged_T = tf.Variable(0.)
    summary_vars = [episode_reward, episode_ave_max_q, logged_epsilon]
    summary_placeholders = [tf.placeholder("float") for i in range(len(summary_vars))]
    update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op
I had a look everywhere at similar posts on Stack Overflow, but I could not figure out a solution and really can't understand where I fail to initialize my variable.
Thank you in advance for your help.
You need to run the global variable initializer after setup_summaries(). The problem is caused by the fact that you are declaring tf.Variables after you run the initializer. The following code snippet works:
import tensorflow as tf

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        summary_ops = setup_summaries()
        summary_op = summary_ops[-1]
        sess.run(tf.global_variables_initializer())
        sess.run(summary_op)
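If reordering is inconvenient in a larger program, an alternative sketch (untested against your exact setup) is to initialize only the variables that setup_summaries() creates, by diffing the global variable list before and after the call:

before = set(tf.global_variables())   # variables that already exist
summary_ops = setup_summaries()       # creates the summary variables
new_vars = [v for v in tf.global_variables() if v not in before]
session.run(tf.variables_initializer(new_vars))  # initialize just the new ones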
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

class DQNagent:
    def __init__(self, session, structure, input_dim, output_dim):
        self.session = session
        self.structure = structure
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_vec = tf.placeholder(tf.float32, shape=[None, self.input_dim])
        self.network = self.create_model()

    def create_model(self):
        for i in range(0, len(self.structure)-1):
            if i == 0:
                network = tf.layers.Dense(self.structure[i], tf.keras.activations.relu)(self.input_vec)
                network = tf.layers.BatchNormalization()(network)
            else:
                network = tf.layers.Dense(self.structure[i], tf.keras.activations.relu)(network)
                network = tf.layers.BatchNormalization()(network)
        network = tf.layers.Dense(self.output_dim, tf.keras.activations.linear)(network)
        return network

    def temp_func(self, input_data):
        temp_input_data = np.expand_dims(input_data, 0)
        output_vec = self.session.run(self.network, feed_dict={self.input_vec: temp_input_data})
        print(output_vec)
So in the above code I implemented a simple, sequential, fully connected NN,
and temp_func is meant to print the output of the NN given an np array.
I tested it by creating an artificial example with an np array via
data = []
for _ in range(10):
    data.append(np.random.normal(0.0, 1.0))

NNstructure = []
for _ in range(2):
    NNstructure.append(10)
So NNstructure[i] represents the number of neurons in the i-th layer.
Then I created a session,
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    dqn = DQNagent(sess, NNstructure, len(data), 5)
    dqn.temp_func(data)
and in theory, it should print out the output of the NN for this artificially generated "data".
If I write all of this without using classes, it runs fine.
The error message I'm getting is
tensorflow.python.framework.errors_impl.FailedPreconditionError: Attempting to use uninitialized value dense_1/kernel
[[{{node dense_1/kernel/read}}]]
You are creating your variables after running the variable initializer (so there is nothing to initialize). You need to change the order of the following lines
sess.run(tf.global_variables_initializer())
dqn = DQNagent(sess, NNstructure, len(data), 5)
to
dqn = DQNagent(sess, NNstructure, len(data), 5)
sess.run(tf.global_variables_initializer())
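For reference, the corrected session block from the question would then read (same code, only the order changed):

with tf.Session() as sess:
    dqn = DQNagent(sess, NNstructure, len(data), 5)  # build the graph first
    sess.run(tf.global_variables_initializer())      # then initialize its variables
    dqn.temp_func(data)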
I am currently playing around with and learning about distributed TensorFlow.
I recently created a cluster with one GPU server (two cards) and one CPU server.
I was browsing through various articles, and in the TensorFlow distributed guide I saw that distribution happens across cards by explicitly referencing them by name:
https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py
but no cluster is being created there.
Can I create a TensorFlow cluster and then specify which card the code should run on?
If yes, does the code below look correct?
In one GitHub issue, whose link I don't have right now, the card is specified under with tf.device(replica_device_setter), but when I try to do that my code throws an error stating "Cannot assign a device for operation 'dummy_queue_Close_1': Could not satisfy explicit device specification '/job:ps/task:0/device:GPU:0' because no supported kernel for GPU devices is available."
Is this because I am assigning tasks that were supposed to happen on a CPU, but since I wrote with tf.device('/gpu:0') they get pinned to the GPU, which then throws the error?
Also, I can't share my official code, but it looks very similar to the code below, which I used as a reference.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy
import tensorflow as tf

tf.app.flags.DEFINE_string("ps_hosts", "localhost:2222", "...")
tf.app.flags.DEFINE_string("worker_hosts", "localhost:2223", "...")
tf.app.flags.DEFINE_string("job_name", "", "...")
tf.app.flags.DEFINE_integer("task_index", 0, "...")
tf.app.flags.DEFINE_integer('gpu_cards', 4, 'Number of GPU cards in a machine to use.')
FLAGS = tf.app.flags.FLAGS

def dense_to_one_hot(labels_dense, num_classes = 10) :
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = numpy.arange(num_labels) * num_classes
    labels_one_hot = numpy.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot

def run_training(server, cluster_spec, num_workers) :
    is_chief = (FLAGS.task_index == 0)
    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(cluster = cluster_spec)) :
            with tf.device('/cpu:0') :
                global_step = tf.get_variable('global_step', [],
                    initializer = tf.constant_initializer(0), trainable = False)
            with tf.device('/gpu:%d' % (FLAGS.task_index % FLAGS.gpu_cards)) :
                # Create the model
                x = tf.placeholder("float", [None, 784])
                W = tf.Variable(tf.zeros([784, 10]))
                b = tf.Variable(tf.zeros([10]))
                y = tf.nn.softmax(tf.matmul(x, W) + b)
                # Define loss and optimizer
                y_ = tf.placeholder("float", [None, 10])
                cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
                opt = tf.train.GradientDescentOptimizer(0.01)
                opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate = num_workers,
                    replica_id = FLAGS.task_index, total_num_replicas = num_workers)
                train_step = opt.minimize(cross_entropy, global_step = global_step)
                # Test trained model
                correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            init_token_op = opt.get_init_tokens_op()
            chief_queue_runner = opt.get_chief_queue_runner()

        init = tf.initialize_all_variables()
        sv = tf.train.Supervisor(is_chief = is_chief,
                                 init_op = init,
                                 global_step = global_step)

        # Create a session for running Ops on the Graph.
        config = tf.ConfigProto(allow_soft_placement = True)
        sess = sv.prepare_or_wait_for_session(server.target, config = config)
        if is_chief:
            sv.start_queue_runners(sess, [chief_queue_runner])
            sess.run(init_token_op)

        for i in range(100000):
            source_data = numpy.random.normal(loc = 0.0, scale = 1.0, size = (100, 784))
            labels_dense = numpy.clip(numpy.sum(source_data, axis = 1) / 5 + 5, 0, 9).astype(int)
            labels_one_hot = dense_to_one_hot(labels_dense)
            _, cost, acc, step = sess.run([train_step, cross_entropy, accuracy, global_step], feed_dict = { x: source_data, y_ : labels_one_hot })
            print("[%d]: cost=%.2f, accuracy=%.2f" % (step, cost, acc))

def main(_) :
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    num_workers = len(worker_hosts)
    print("gpu_cards=%d; num_workers=%d" % (FLAGS.gpu_cards, num_workers))
    cluster_spec = tf.train.ClusterSpec({ "ps" : ps_hosts, "worker" : worker_hosts })
    server = tf.train.Server(cluster_spec, job_name = FLAGS.job_name, task_index = FLAGS.task_index)
    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker" :
        run_training(server, cluster_spec, num_workers)

if __name__ == '__main__' :
    tf.app.run()
I found a way to do this, and it is very simple.
I created the TensorFlow cluster in the same way, passed the n_workers parameter to the cluster, and launched the different instances of the code with an extra parameter for CUDA_VISIBLE_DEVICES.
CUDA_VISIBLE_DEVICES is an environment variable that restricts which cards TensorFlow (or any DL framework) can see.
Its value is a comma-separated list of GPU indices between 0 and n-1 (where n is the number of GPUs in the machine):
-1 (or an empty string) hides all cards
a single index k exposes only the k-th card (counting from 0), and a list like "0,2" exposes several
I hope someone who is looking for a similar answer can find this useful.
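For example, to pin one instance of the worker code to the second card, you can set the variable before TensorFlow is imported (a minimal sketch; the variable can equally be set on the shell command line when launching each instance):

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # this process only sees GPU 1
import tensorflow as tf                   # import after setting the variable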
I can't enable epoch limits on my string_input_producer without getting an OutOfRangeError (requested x, current size 0). It doesn't seem to matter how many elements I request; there are always 0 available.
Here is my FileQueue builder:
def get_queue(base_directory):
    files = [f for f in os.listdir(base_directory) if f.endswith('.bin')]
    shuffle(files)
    file = [os.path.join(base_directory, files[0])]
    fileQueue = tf.train.string_input_producer(file, shuffle=False, num_epochs=1)
    return fileQueue
If I remove num_epochs=1 from the string_input_producer it can create samples fine.
My input pipeline:
def input_pipeline(instructions, fileQueue):
    example, label, feature_name_list = read_binary_format(fileQueue, instructions)
    num_preprocess_threads = 16
    capacity = 20
    example, label = tf.train.batch(
        [example, label],
        batch_size=50000,  # set the batch size way bigger so we always return the full amount of samples from the file
        allow_smaller_final_batch=True,
        capacity=capacity,
        num_threads=num_preprocess_threads)
    return example, label
And lastly my session:
with tf.Session(graph=tf.Graph()) as sess:
    train_inst_set = sf.DeserializationInstructions.from_filename(os.path.join(input_dir, "Train/config.json"))
    fileQueue = sf.get_queue(os.path.join(input_dir, "Train"))
    features_train, labels_train = sf.input_pipeline(train_inst_set, fileQueue)
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)
    train_feature_batch, train_label_batch = sess.run([features_train, labels_train])
The issue was caused by this: Issue #1045
tf.global_variables_initializer does not initialize all variables: the num_epochs argument of string_input_producer creates a local variable (the epoch counter), and local variables are not covered by the global initializer. You need to initialize the local variables too.
Add
sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))
to your session.
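With that change, the session from the question would start like this (a sketch of the fix applied to the code above):

with tf.Session(graph=tf.Graph()) as sess:
    # ... build fileQueue and the input pipeline as above ...
    sess.run(tf.group(tf.global_variables_initializer(),
                      tf.local_variables_initializer()))  # local vars include the epoch counter
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)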
I'm trying to generate text using a previously trained LSTM. I found an existing solution, but the problem is that it throws some exceptions; as I understand it, this happens because it uses an older version of the library. After some fixes, here's my final function for text generation:
def generate_text(train_path, num_sentences, rnn_data):
    gen_config = get_config()
    gen_config.num_steps = 1
    gen_config.batch_size = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-gen_config.init_scale,
                                                    gen_config.init_scale)
        with tf.name_scope("Generate"):
            rnn_input = PTBInput(config=gen_config, data=rnn_data, name="GenOut")
            with tf.variable_scope("OutModel", reuse=None, initializer=initializer):
                mout = PTBModel(is_training=False, config=gen_config, input_=rnn_input)

            # Restore variables from disk. TODO: save/load trained models
            # saver = tf.train.Saver()
            # saver.restore(session, model_path)
            # print("Model restored from file " + model_path)

            print('Getting Vocabulary')
            words = reader.get_vocab(train_path)

            mout.initial_state = tf.convert_to_tensor(mout.initial_state)
            state = mout.initial_state.eval()
            # state = session.run(mout.initial_state)
            x = 0  # the id for '<eos>' from the training set //TODO: fix this
            word_input = np.matrix([[x]])  # a 2D numpy matrix

            text = ""
            count = 0
            while count < num_sentences:
                output_probs, state = session.run([mout.output_probs, mout.final_state],
                                                  {mout.input.input_data: word_input,
                                                   mout.initial_state: state})
                print('Output Probs = ' + str(output_probs[0]))
                x = sample(output_probs[0], 0.9)
                if words[x] == "<eos>":
                    text += ".\n\n"
                    count += 1
                else:
                    text += " " + words[x]
                # now feed this new word as input into the next iteration
                word_input = np.matrix([[x]])
            print(text)
    return
But I get an exception:
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value OutModel/softmax_b
[[Node: OutModel/softmax_b/read = Identity[T=DT_FLOAT, _class=["loc:@OutModel/softmax_b"], _device="/job:localhost/replica:0/task:0/cpu:0"](OutModel/softmax_b)]]
How can I fix it? And are there any other problems with my code?
The problem is an uninitialized variable. You can fix this either by initializing all the variables individually or by using the helper tf.global_variables_initializer(). In your code the saver.restore call is commented out, so nothing ever assigns values to the model's variables before you evaluate them.
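Concretely, adding one line after the model is built (and before the first eval/run) should clear the error; a sketch:

with tf.Graph().as_default(), tf.Session() as session:
    # ... build rnn_input and mout as in the question ...
    session.run(tf.global_variables_initializer())  # gives OutModel/softmax_b a value
    state = session.run(mout.initial_state)

Note that the initializer only writes the variables' initial (random) values; to generate meaningful text you will still want to restore the trained weights with the Saver instead.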
I've set up a print statement, and I've noticed that for the first batch when feeding an RNN the embeddings exist, but after the second batch they don't and I get the following error:
ValueError: Variable RNNLM/RNNLM/Embedding/Adam_2/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
Here is my code for generating the embeddings:
def add_embedding(self):
    with tf.device('/gpu:0'):
        embedding = tf.get_variable("Embedding", [len(self.vocab), self.config.embed_size])
        e_x = tf.nn.embedding_lookup(embedding, self.input_placeholder)
        inputs = [tf.squeeze(s, [1]) for s in tf.split(1, self.config.num_steps, e_x)]
        return inputs
Here is how the model is set up; this is where I suspect the problem lies:
def model(self, inputs):
    with tf.variable_scope("input_drop"):
        inputs_drop = [tf.nn.dropout(i, self.dropout_placeholder) for i in inputs]
    with tf.variable_scope("RNN") as scope:
        self.initial_state = tf.zeros([self.config.batch_size, self.config.hidden_size], tf.float32)
        state = self.initial_state
        states = []
        for t, e in enumerate(inputs_drop):
            print "t is {0}".format(t)
            if t > 0:
                scope.reuse_variables()
            H = tf.get_variable("Hidden", [self.config.hidden_size, self.config.hidden_size])
            I = tf.get_variable("I", [self.config.embed_size, self.config.hidden_size])
            b_1 = tf.get_variable("b_1", (self.config.hidden_size,))
            state = tf.sigmoid(tf.matmul(state, H) + tf.matmul(e, I) + b_1)
            states.append(state)
    with tf.variable_scope("output_dropout"):
        rnn_outputs = [tf.nn.dropout(o, self.dropout_placeholder) for o in states]
    return rnn_outputs
The issue arises when I get to the training op, defined as follows:
def add_training_op(self, loss):
    opt = tf.train.AdamOptimizer(self.config.lr)
    train_op = opt.minimize(loss)
    return train_op
EDIT: Here is some updated code to help everyone out
def __init__(self, config):
    self.config = config
    self.load_data(debug=False)
    self.add_placeholders()
    self.inputs = self.add_embedding()
    self.rnn_outputs = self.add_model(self.inputs)
    self.outputs = self.add_projection(self.rnn_outputs)
    self.predictions = [tf.nn.softmax(tf.cast(o, 'float64')) for o in self.outputs]
    output = tf.reshape(tf.concat(1, self.outputs), [-1, len(self.vocab)])
    self.calculate_loss = self.add_loss_op(output)
    self.train_step = self.add_training_op(self.calculate_loss)
Here are the other methods, add_loss_op and add_projection, so we can rule them out:
def add_loss_op(self, output):
    weights = tf.ones([self.config.batch_size * self.config.num_steps], tf.int32)
    seq_loss = tf.python.seq2seq.sequence_loss(
        [output],
        tf.reshape(self.labels_placeholder, [-1]),
        weights
    )
    tf.add_to_collection('total_loss', seq_loss)
    loss = tf.add_n(tf.get_collection('total_loss'))
    return loss

def add_projection(self, rnn_outputs):
    with tf.variable_scope("Projection", initializer=tf.contrib.layers.xavier_initializer()) as scope:
        U = tf.get_variable("U", [self.config.hidden_size, len(self.vocab)])
        b_2 = tf.get_variable("b_2", [len(self.vocab)])
        outputs = [tf.matmul(x, U) + b_2 for x in rnn_outputs]
        return outputs
def train_RNNLM():
    config = Config()
    gen_config = deepcopy(config)
    gen_config.batch_size = gen_config.num_steps = 1

    with tf.variable_scope('RNNLM') as scope:
        model = RNNLM_Model(config)
        # This instructs gen_model to reuse the same variables as the model above
        scope.reuse_variables()
        gen_model = RNNLM_Model(gen_config)

    init = tf.initialize_all_variables()
    saver = tf.train.Saver()

    with tf.Session() as session:
        best_val_pp = float('inf')
        best_val_epoch = 0
        session.run(init)
        for epoch in xrange(config.max_epochs):
            print 'Epoch {}'.format(epoch)
            start = time.time()
            ###
            train_pp = model.run_epoch(
                session, model.encoded_train,
                train_op=model.train_step)
            valid_pp = model.run_epoch(session, model.encoded_valid)
            print 'Training perplexity: {}'.format(train_pp)
            print 'Validation perplexity: {}'.format(valid_pp)
            if valid_pp < best_val_pp:
                best_val_pp = valid_pp
                best_val_epoch = epoch
                saver.save(session, './ptb_rnnlm.weights')
            if epoch - best_val_epoch > config.early_stopping:
                break
            print 'Total time: {}'.format(time.time() - start)
It seems that the code is trying to create a new set of Adam variables for each model instance. Is it possible that add_training_op is called twice, once per model? The second call happens inside a scope where reuse_variables() has been set, and creating new Adam slot variables under a reuse scope fails.
Also, make sure add_training_op actually returns the train_op it builds.
The problem turned out to be the following lines of code:
model = RNNLM_Model(config)
# This instructs gen_model to reuse the same variables as the model above
scope.reuse_variables()
gen_model = RNNLM_Model(gen_config)
It turns out that the second model was the issue: building it with reuse_variables() is what made the optimizer fail. By removing this line, my issues went away.
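If you do need the second model to share weights with the first, a common workaround (a sketch, assuming you can add a constructor flag; build_train_op is a hypothetical parameter) is to skip creating the training op, and with it the Adam slot variables, for the reused copy:

class RNNLM_Model(object):
    def __init__(self, config, build_train_op=True):  # hypothetical flag
        # ... placeholders, embedding, model, projection, loss as before ...
        if build_train_op:
            self.train_step = self.add_training_op(self.calculate_loss)

with tf.variable_scope('RNNLM') as scope:
    model = RNNLM_Model(config)                        # creates weights and Adam slots
    scope.reuse_variables()
    gen_model = RNNLM_Model(gen_config, build_train_op=False)  # reuses weights only

This way nothing tries to create fresh Adam variables inside a scope where reuse is set.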