I have written a simple version bidirectional lstm for sentence classification. But it keeps giving me "You must feed a value for placeholder tensor 'train_x'" error and it seems this come from the variable initialization step.
data = load_data(FLAGS.data)
model = RNNClassifier(FLAGS)
init = tf.initialize_all_variables()
with tf.Session() as sess:
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
sess.run(init)
print("Graph initialized..")
print()
np.random.seed(FLAGS.random_state)
for epoch in range(FLAGS.max_max_epoch):
loss = sess.run(model.cost, feed_dict={model.train_x: data.train_x, model.train_y: data.train_y,
model.embedding_placeholder: data.glove_vec})
print("Epoch {:2d}: Loss = {:.6f} = {:.5f}".format(epoch+1, loss))
coord.request_stop()
coord.join(threads)
And the RNNClassifier class code (in a different directory):
class RNNClassifier:
def __init__(self, FLAGS):
self.params = FLAGS
with tf.device("/cpu:0"):
self.train_x = tf.placeholder(tf.int32, [6248, 42], name='train_x')
self.train_y = tf.placeholder(tf.int32, [6248, 3], name='train_y')
self.embedding_placeholder = tf.placeholder(tf.float32, [1193515, 100])
with tf.variable_scope('forward_lstm'):
lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.params.num_hidden, use_peepholes=False,
activation=tf.nn.relu, forget_bias=0.0,
state_is_tuple=True)
with tf.variable_scope('backward_lstm'):
lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.params.num_hidden, use_peepholes=False,
activation=tf.nn.relu, forget_bias=0.0,
state_is_tuple=True)
fw_initial_state = lstm_fw_cell.zero_state(self.params.batch_size, tf.float32)
bw_initial_state = lstm_bw_cell.zero_state(self.params.batch_size, tf.float32)
self._initial_state = [fw_initial_state, bw_initial_state]
with tf.device("/cpu:0"), tf.variable_scope('softmax'):
self.W = tf.get_variable('W', [self.params.num_hidden*2, self.params.num_classes])
self.b = tf.get_variable('b', [self.params.num_classes], initializer=tf.constant_initializer(0.0))
batched_inputs, batched_labels = self.batch_data()
embed_inputs = self.use_embedding(batched_inputs)
rnn_outputs, output_state_fw, output_state_bw = tf.nn.bidirectional_rnn(
cell_fw=lstm_fw_cell,
cell_bw=lstm_bw_cell,
inputs=embed_inputs,
initial_state_fw=fw_initial_state,
initial_state_bw=bw_initial_state
)
logits = tf.matmul(rnn_outputs[-1], self.W) + self.b
self._cost = cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf.cast(batched_labels, tf.float32)))
optimizer = tf.train.AdamOptimizer(learning_rate=0.05).minimize(cost)
def batch_data(self):
# inputs = tf.convert_to_tensor(train_x, dtype=tf.int32)
# labels = tf.convert_to_tensor(train_y, dtype=tf.int32)
batched_inputs, batched_labels = tf.train.batch(
tensors=[self._train_x, self._train_y],
batch_size=self.params.batch_size,
dynamic_pad=True,
enqueue_many=True,
name='batching'
)
return batched_inputs, batched_labels
def use_embedding(self, batched_inputs):
with tf.device("/cpu:0"), tf.name_scope("input_embedding"):
embedding = tf.get_variable("embedding", shape=[1193515, 100], trainable=False)
embedding_init = embedding.assign(self.embedding_placeholder)
embed_inputs = tf.split(1, self.params.seq_len, tf.nn.embedding_lookup(embedding_init, batched_inputs))
embed_inputs = [tf.squeeze(input_, [1]) for input_ in embed_inputs]
return embed_inputs
#property
def cost(self):
return self._cost
The output (including the error):
I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties:
name: GeForce GTX 750 Ti
major: 5 minor: 0 memoryClockRate (GHz) 1.0845
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.41GiB
I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:839] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 750 Ti, pci bus id: 0000:01:00.0)
E tensorflow/core/client/tensor_c_api.cc:485] You must feed a value for placeholder tensor 'train_x' with dtype int32 and shape [6248,42]
[[Node: train_x = Placeholder[dtype=DT_INT32, shape=[6248,42], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Graph initialized..
W tensorflow/core/framework/op_kernel.cc:936] Out of range: PaddingFIFOQueue '_0_batching/padding_fifo_queue' is closed and has insufficient elements (requested 50, current size 0)
[[Node: batching = QueueDequeueMany[_class=["loc:#batching/padding_fifo_queue"], component_types=[DT_INT32, DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batching/padding_fifo_queue, batching/n)]]
W tensorflow/core/framework/op_kernel.cc:936] Out of range: PaddingFIFOQueue '_0_batching/padding_fifo_queue' is closed and has insufficient elements (requested 50, current size 0)
[[Node: batching = QueueDequeueMany[_class=["loc:#batching/padding_fifo_queue"], component_types=[DT_INT32, DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batching/padding_fifo_queue, batching/n)]]
E tensorflow/core/client/tensor_c_api.cc:485] PaddingFIFOQueue '_0_batching/padding_fifo_queue' is closed and has insufficient elements (requested 50, current size 0)
[[Node: batching = QueueDequeueMany[_class=["loc:#batching/padding_fifo_queue"], component_types=[DT_INT32, DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batching/padding_fifo_queue, batching/n)]]
[[Node: batching/_9 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_1191_batching", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
Traceback (most recent call last):
File "train_lstm.py", line 66, in <module>
model.embedding_placeholder: data.glove_vec})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 382, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 655, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 723, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 743, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors.OutOfRangeError: PaddingFIFOQueue '_0_batching/padding_fifo_queue' is closed and has insufficient elements (requested 50, current size 0)
[[Node: batching = QueueDequeueMany[_class=["loc:#batching/padding_fifo_queue"], component_types=[DT_INT32, DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batching/padding_fifo_queue, batching/n)]]
[[Node: batching/_9 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_1191_batching", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
Caused by op u'batching', defined at:
File "train_lstm.py", line 49, in <module>
model = RNNClassifier(FLAGS)
File "/home/ccrmad/Code/TDLSTM/models/rnn_classifier.py", line 34, in __init__
batched_inputs, batched_labels = self.batch_data()
File "/home/ccrmad/Code/TDLSTM/models/rnn_classifier.py", line 74, in batch_data
name='batching'
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/input.py", line 595, in batch
dequeued = queue.dequeue_many(batch_size, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/data_flow_ops.py", line 435, in dequeue_many
self._queue_ref, n=n, component_types=self._dtypes, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 867, in _queue_dequeue_many
timeout_ms=timeout_ms, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 703, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2310, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1232, in __init__
self._traceback = _extract_stack()
I have tried move the train_x and train_y placeholder initialization before init = tf.initialize_all_variables() and feed them to RNNClassifier() as two args but it still give the same error. Why?
It looks like the problem is in this method, in RNNClassifier:
def batch_data(self):
# ...
batched_inputs, batched_labels = tf.train.batch(
tensors=[self._train_x, self._train_y],
batch_size=self.params.batch_size,
dynamic_pad=True,
enqueue_many=True,
name='batching')
The two tensor arguments to tf.train.batch() are self._train_x and self._train_y. In the RNNClassifier constructor, it seems like you create these as tf.placeholder() tensors:
def __init__(self, FLAGS):
# ...
with tf.device("/cpu:0"):
self.train_x = tf.placeholder(tf.int32, [6248, 42], name='train_x')
self.train_y = tf.placeholder(tf.int32, [6248, 3], name='train_y')
...though I'm assuming that the difference between self._train_x and self.train_x is a copy-paste error, because self._train_x doesn't seem to be defined anywhere else.
Now, one of the surprising things about tf.train.batch() is that it consumes its inputs in a completely separate thread, called a "queue runner", and started when you call tf.train.start_queue_runners(). This thread calls Session.run() on a subgraph that depends on your placeholders, but doesn't know how to feed these placeholders, so it is this call that fails, leading to the error you are seeing.
How then should you fix it? One option would be to use a "feeding queue runner", for which there is experimental support. A simpler option would be to use the tf.train.slice_input_producer() to produce slices from your input data, as follows:
def batch_data(self, train_x, train_y):
input_slice, label_slice = tf.train.slice_input_producer([train_x, train_y])
batched_inputs, batched_labels = tf.train.batch(
tensors=[input_slice, label_slice],
batch_size=self.params.batch_size,
dynamic_pad=False, # All rows are the same shape.
enqueue_many=False, # tf.train.slice_input_producer() produces one row at a time.
name='batching')
return batched_inputs, batched_labels
# ...
# Create batches from the entire training data, where `array_input` and
# `array_labels` are two NumPy arrays.
batched_inputs, batched_labels = model.batch_data(array_input, array_labels)
This is what I did..
I could change the way I initialised my input variables as:
data = load_data(FLAGS.data)
model = RNNClassifier(FLAGS, data)
init = tf.initialize_all_variables()
with tf.Session() as sess:
sess.run(init)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for epoch in range(FLAGS.max_max_epoch):
sess.run(model.train_step)
loss, acc = sess.run([model.mean_cost, model.accuracy])
print("Epoch {:2d}: Loss = {:.6f}; Training Accuracy = {:.5f}".format(epoch+1, loss, acc))
print()
coord.request_stop()
coord.join(threads)
And in RNNClassifier class I could replace
self.train_x = tf.placeholder(tf.int32, [6248, 42], name='train_x')
self.train_y = tf.placeholder(tf.int32, [6248, 3], name='train_y')
self.embedding_placeholder = tf.placeholder(tf.float32, [1193515, 100])
(and remove use_embedding()) to
def __init__(self, FLAGS, data):
self._train_x = tf.convert_to_tensor(data.train_x, dtype=tf.int32)
self._train_y = tf.convert_to_tensor(data.train_y, dtype=tf.int32)
embedding = tf.get_variable("embedding", shape=self.embedding_shape, trainable=False)
self.embedding_init = embedding.assign(data.glove_vec)
This way everything is initialised with RNNClassifier(FLAGS, data) before making the call to the queue runners.
I still don't know why the previous way aka using placeholders, didn't work. I have checked the dtype and data shape, they all match. This post seems to have a similar problem and the "answer" suggests to replace placeholder with tf.Variable. I have tried this -> input values can now be fed but obvs it doesn't work as placeholders do and only feed the initialised values..
UPDATE: I think the error might be caused by operating inputs batching in the RNNClassifier class, that somehow effected feeding input data into the graph (again please correct me if you know the cause to the error, thanks!). I have instead relocated my batching and embedding functions onto my main code before initialising my session, this way I think it makes sure all inputs are defined to the main pipeline and batching&prefetching would be handled inside the tf graph (by not using placeholders).
Here's my code:
def batch(x, y):
with tf.device("/cpu:0"):
x = tf.convert_to_tensor(x, dtype=tf.int32)
y = tf.convert_to_tensor(y, dtype=tf.int32)
batched_x, batched_y = tf.train.batch(
tensors=[x, y],
batch_size=50,
dynamic_pad=True,
enqueue_many=True,
name='batching'
)
return (batched_x, batched_y)
def embed(df):
with tf.device("/cpu:0"), tf.variable_scope("embed"):
embedding = tf.get_variable("embedding", shape=df.embed_shape, trainable=False)
embedding_init = embedding.assign(df.embed_vec)
return embedding_init
data = load_data(FLAGS.data)
pretrained_embed = embed(data)
batched_train_x, batched_train_y = batch(data.train_x, data.train_y)
batched_test_x, batched_test_y = batch(data.train_x, data.train_y)
model = RNNClassifier(FLAGS, pretrained_embed)
logits = model.inference(batched_train_x)
test_pred = model.inference(batched_test_x, reuse=True)
loss = model.loss(logits, batched_train_y)
test_loss = model.loss(test_pred, batched_test_y)
train_op = model.training(loss[0])
init = tf.group(tf.initialize_all_variables(),
tf.initialize_local_variables())
with tf.Session() as sess:
t0 = time.time()
sess.run(init)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
np.random.seed(FLAGS.random_state)
for epoch in range(FLAGS.max_max_epoch):
_, output = sess.run([train_op, loss])
loss_value, acc = output
print("Epoch {:2d}: Loss = {:.6f}; Training Accuracy = {:.5f}".format(epoch+1, loss_value, acc))
print()
coord.request_stop()
coord.join(threads)
And the RNNClassifier:
class RNNClassifier:
def __init__(self, FLAGS, embedding_init):
self.batch_size = FLAGS.batch_size
self.num_hidden = FLAGS.num_hidden
self.num_classes = FLAGS.num_classes
self.seq_len = FLAGS.seq_len
self.embedding_init = embedding_init
def inference(self, batched_inputs, reuse=None):
embed_inputs = tf.nn.embedding_lookup(self.embedding_init, tf.transpose(batched_inputs))
with tf.variable_scope('hidden', reuse=reuse):
with tf.variable_scope('forward_lstm'):
lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.num_hidden, use_peepholes=False,
activation=tf.nn.relu, forget_bias=0.0,
initializer=tf.random_uniform_initializer(-1.0, 1.0),
state_is_tuple=True)
lstm_fw_cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_fw_cell, input_keep_prob=0.7)
with tf.variable_scope('backward_lstm'):
lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.num_hidden, use_peepholes=False,
activation=tf.nn.relu, forget_bias=0.0,
initializer=tf.random_uniform_initializer(-1.0, 1.0),
state_is_tuple=True)
lstm_bw_cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_bw_cell, input_keep_prob=0.7)
fw_initial_state = lstm_fw_cell.zero_state(self.batch_size, tf.float32)
bw_initial_state = lstm_bw_cell.zero_state(self.batch_size, tf.float32)
rnn_outputs, output_state_fw, output_state_bw = tf.nn.bidirectional_rnn(
cell_fw=lstm_fw_cell,
cell_bw=lstm_bw_cell,
inputs=tf.unpack(embed_inputs),
# sequence_length=self.seq_len,
initial_state_fw=fw_initial_state,
initial_state_bw=bw_initial_state
)
with tf.variable_scope('output', reuse=reuse):
with tf.variable_scope('softmax'):
W = tf.get_variable('W', [self.num_hidden*2, self.num_classes],
initializer=tf.truncated_normal_initializer(stddev=0.1))
b = tf.get_variable('b', [self.num_classes], initializer=tf.constant_initializer(0.1))
logits = tf.matmul(rnn_outputs[-1], W) + b
return logits
def loss(self, logits, labels):
cost = tf.nn.softmax_cross_entropy_with_logits(logits, tf.cast(labels, tf.float32))
# self.total_cost = tf.reduce_sum(self.cost)
mean_cost = tf.reduce_mean(cost)
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
return mean_cost, accuracy
def training(self, cost):
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(cost)
return train_op
On a side note, I wish the Tensorflow community on stackoverflow would be more active..
Related
I have around 550K samples, each sample being 200x50x1. The size of this dataset is around 57GB.
I want to train a network on this set but I am having trouble reading it.
batch_size=8
def _read_py_function(filename,labels_slice):
with h5py.File(filename, 'r') as f:
data_slice = np.asarray(f['feats'])
print(data_slice.shape)
return data_slice, labels_slice
placeholder_files = tf.placeholder(tf.string, [None])
placeholder_labels = tf.placeholder(tf.int32, [None])
dataset = tf.data.Dataset.from_tensor_slices((placeholder_files,placeholder_labels))
dataset = dataset.map(
lambda filename, label: tuple(tf.py_func(
_read_py_function, [filename,label], [tf.uint8, tf.int32])))
dataset = dataset.shuffle(buffer_size=50000)
dataset = dataset.batch(batch_size)
iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
data_X, data_y = iterator.get_next()
data_y = tf.cast(data_y, tf.int32)
net = conv_layer(inputs=data_X,num_outputs=8, kernel_size=3, stride=2, scope='rcl_0')
net = pool_layer(inputs=net,kernel_size=2,scope='pl_0')
net = dropout_layer(inputs=net,scope='dl_0')
net = flatten_layer(inputs=net,scope='flatten_0')
net = dense_layer(inputs=net,num_outputs=256,scope='dense_0')
net = dense_layer(inputs=net,num_outputs=64,scope='dense_1')
out = dense_layer(inputs=net,num_outputs=10,scope='dense_2')
And I run the session using :
sess.run(train_iterator, feed_dict = {placeholder_files: filenames, placeholder_labels: ytrain})
try:
while True:
_, loss, acc = sess.run([train_op, loss_op, accuracy_op])
train_loss += loss
train_accuracy += acc
except tf.errors.OutOfRangeError:
pass
But I am getting the error even before running the session :
Traceback (most recent call last):
File "SFCC-trial-134.py", line 297, in <module>
net = rcnn_layer(inputs=data_X,num_outputs=8, kernel_size=3, stride=2, scope='rcl_0')
File "SFCC-trial-134.py", line 123, in rcnn_layer
reuse=False)
File "SFCC-trial-134.py", line 109, in conv_layer
reuse = reuse
File "/home/priyam.jain/tensorflow-gpu-python3/lib/python3.5/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
return func(*args, **current_args)
File "/home/priyam.jain/tensorflow-gpu-python3/lib/python3.5/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1154, in convolution2d
conv_dims=2)
File "/home/priyam.jain/tensorflow-gpu-python3/lib/python3.5/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
return func(*args, **current_args)
File "/home/priyam.jain/tensorflow-gpu-python3/lib/python3.5/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1025, in convolution
(conv_dims + 2, input_rank))
TypeError: %d format: a number is required, not NoneType
I though about using TFRecords but had a hard time creating those. Couldn't find a good post where I learn to create them for my kind of dataset.
conv_layer is defined as follows :
def conv_layer(inputs, num_outputs, kernel_size, stride, normalizer_fn=None, activation_fn=nn.relu, trainable=True, scope='noname', reuse=False):
net = slim.conv2d(inputs = inputs,
num_outputs = num_outputs,
kernel_size = kernel_size,
stride = stride,
normalizer_fn = normalizer_fn,
activation_fn = activation_fn,
trainable = trainable,
scope = scope,
reuse = reuse
)
return net
Do not pass tf.py_func inside your map function. You can read the file image by passing the function name directly inside your map function. I am posing only the relevant parts of the code.
def _read_py_function(filename, label):
return tf.zeros((224, 224, 3), dtype=tf.float32), tf.ones((1,), dtype=tf.int32)
dataset = dataset.map(lambda filename, label: _read_py_function(filename, label))
Another change is your iterator will expect only floating point of input. So you will have to change your tf.uint8 type of output to float.
I was try to copy a code which implement adaptive gradient
error comes from this code
for i, n_hidden in enumerate(n_hiddens):
if i == 0:
input = x
input_dim = n_in
else:
input = output
input_dim = n_hiddens[i - 1]
W = weight_variable([input_dim, n_hidden])
b = bias_variable([n_hidden])
h = tf.nn.relu(tf.matmul(input, W) + b)
output = tf.nn.dropout(h, keep_prob)
at
h = tf.nn.relu(tf.matmul(input, w) + b)
and the interpreter said
AbortedError (see above for traceback): Operation received an exception:Status: 3, message: could not initialize a memory primitive descriptor, in file tensorflow/core/kernels/mkl_relu_op.cc:873
[[{{node Relu}} = _MklRelu[T=DT_FLOAT, _kernel="MklOp", _device="/job:localhost/replica:0/task:0/device:CPU:0"](add, DMT/_0)]]
there's other errors too, but error occur same like that.
I was really good at copying this code, so I think there's nothing special mistake at code but I can't confirm it.
I think that something wrong at 'relu' function which comes with tensorflow, but I don't know what to do.
here is my full code, if you need
import numpy as np
import tensorflow as tf
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
np.random.seed(0)
tf.set_random_seed(1234)
def inference(x, keep_prob, n_in, n_hiddens, n_out):
def weight_variable(shape):
initial = np.sqrt(2.0 / shape[0]) * tf.truncated_normal(shape)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.zeros(shape)
return tf.Variable(initial)
for i, n_hidden in enumerate(n_hiddens):
if i == 0:
input = x
input_dim = n_in
else:
input = output
input_dim = n_hiddens[i - 1]
W = weight_variable([input_dim, n_hidden])
b = bias_variable([n_hidden])
h = tf.nn.relu(tf.matmul(input, W) + b)
output = tf.nn.dropout(h, keep_prob)
W_out = weight_variable([n_hiddens[-1], n_out])
b_out = bias_variable([n_out])
y = tf.nn.softmax(tf.matmul(output, W_out) + b_out)
return y
def loss(y, t):
cross_entropy = tf.reduce_mean(-tf.reduce_sum(t * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), reduction_indices=[1]))
return cross_entropy
def training(loss):
optimizer = tf.train.AdagradOptimizer(0.01)
train_step = optimizer.minimize(loss)
return train_step
def accuracy(y, t):
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(t, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
return accuracy
if __name__ == '__main__':
mnist = datasets.fetch_mldata('MNIST original')
n = len(mnist.data)
N = 30000
N_train = 20000
N_validation = 4000
indices = np.random.permutation(range(n))[:N]
X = mnist.data[indices]
X = X / 255.0
X = X - X.mean(axis=1).reshape(len(X), 1)
y = mnist.target[indices]
Y = np.eye(10)[y.astype(int)]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=N_train)
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, Y_train, test_size=N_validation)
n_in = len(X[0])
n_hiddens = [200, 200, 200]
n_out = len(Y[0])
p_keep = 0.5
x = tf.placeholder(tf.float32, shape=[None, n_in])
t = tf.placeholder(tf.float32, shape=[None, n_out])
keep_prob = tf.placeholder(tf.float32)
y = inference(x, keep_prob, n_in=n_in, n_hiddens=n_hiddens, n_out=n_out)
loss = loss(y, t)
train_step = training(loss)
accuracy = accuracy(y, t)
history = {'val_loss' : [], 'val_acc' : []}
epochs = 50
batch_size = 200
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
n_batches = N_train // batch_size
for epoch in range(epochs):
X_, Y_ = shuffle(X_train, Y_train)
for i in range(n_batches):
start = i * batch_size
end = start + batch_size
sess.run(train_step, feed_dict = {x : X_[start:end], t : Y_[start:end], keep_prob : p_keep})
val_loss = loss.eval(session=sess, feed_dict = {x : X_validation, t : Y_validation, keep_prob : 1.0})
val_acc = accuracy.eval(session = sess, feed_dict = {x : X_validation, t : Y_validation, keep_prob : 1.0})
history['val_loss'].append(val_loss)
history['val_acc'].append(val_acc)
print('epoch : ', epoch, ' validation loss : ', val_loss, ' validation accuracy : ', val_acc)
plt.rc('font', family = 'serif')
fig = plt.figure()
plt.plot(range(epochs), history['val_loss'], label = 'loss', color = 'black')
plt.xlabel('epochs')
plt.show()
accuracy_rate = accuracy.eval(session = sess, feed_dict = {x : X_test, t : Y_test, keep_prob : 1.0})
print('accuracy : ', accuracy_rate)
and here is my whole error log
/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2069: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
FutureWarning)
2018-11-05 18:11:21.403663: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2018-11-05 18:11:21.406748: I tensorflow/core/common_runtime/process_util.cc:69] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2018-11-05 18:11:22.406667: W tensorflow/core/framework/op_kernel.cc:1273] OP_REQUIRES failed at mkl_relu_op.cc:876 : Aborted: Operation received an exception:Status: 3, message: could not initialize a memory primitive descriptor, in file tensorflow/core/kernels/mkl_relu_op.cc:873
Traceback (most recent call last):
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1292, in _do_call
return fn(*args)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1277, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1367, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.AbortedError: Operation received an exception:Status: 3, message: could not initialize a memory primitive descriptor, in file tensorflow/core/kernels/mkl_relu_op.cc:873
[[{{node Relu}} = _MklRelu[T=DT_FLOAT, _kernel="MklOp", _device="/job:localhost/replica:0/task:0/device:CPU:0"](add, DMT/_0)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/shin/pythonClass/AdaGrad.py", line 105, in <module>
sess.run(train_step, feed_dict = {x : X_[start:end], t : Y_[start:end], keep_prob : p_keep})
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 887, in run
run_metadata_ptr)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1110, in _run
feed_dict_tensor, options, run_metadata)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1286, in _do_run
run_metadata)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1308, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.AbortedError: Operation received an exception:Status: 3, message: could not initialize a memory primitive descriptor, in file tensorflow/core/kernels/mkl_relu_op.cc:873
[[{{node Relu}} = _MklRelu[T=DT_FLOAT, _kernel="MklOp", _device="/job:localhost/replica:0/task:0/device:CPU:0"](add, DMT/_0)]]
Caused by op 'Relu', defined at:
File "/home/shin/pythonClass/AdaGrad.py", line 81, in <module>
y = inference(x, keep_prob, n_in=n_in, n_hiddens=n_hiddens, n_out=n_out)
File "/home/shin/pythonClass/AdaGrad.py", line 31, in inference
h = tf.nn.relu(tf.matmul(input, W) + b)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 6780, in relu
"Relu", features=features, name=name)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3272, in create_op
op_def=op_def)
File "/home/shin/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1768, in __init__
self._traceback = tf_stack.extract_stack()
AbortedError (see above for traceback): Operation received an exception:Status: 3, message: could not initialize a memory primitive descriptor, in file tensorflow/core/kernels/mkl_relu_op.cc:873
[[{{node Relu}} = _MklRelu[T=DT_FLOAT, _kernel="MklOp", _device="/job:localhost/replica:0/task:0/device:CPU:0"](add, DMT/_0)]]
I have the following code
flags = tf.flags
logging = tf.logging
flags.DEFINE_string('model', 'small',
'A type of model. Possible options are: small, medium, large.'
)
flags.DEFINE_string('data_path', None, 'data_path')
flags.DEFINE_string('checkpoint_dir', 'ckpt', 'checkpoint_dir')
flags.DEFINE_bool('use_fp16', False,
'Train using 16-bit floats instead of 32bit floats')
flags.DEFINE_bool('train', False, 'should we train or test')
FLAGS = flags.FLAGS
def data_type():
return tf.float16 if FLAGS.use_fp16 else tf.float32
class PTBModel(object):
"""The PTB model."""
def __init__(self, is_training, config):
self.batch_size = batch_size = config.batch_size
self.num_steps = num_steps = config.num_steps
size = config.hidden_size
vocab_size = config.vocab_size
self._input_data = tf.placeholder(tf.float32, [batch_size,
num_steps])
self._targets = tf.placeholder(tf.int32, [batch_size,
num_steps])
# Slightly better results can be obtained with forget gate biases
# initialized to 1 but the hyperparameters of the model would need to be
# different than reported in the paper.
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0,
state_is_tuple=True)
if is_training and config.keep_prob < 1:
lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell,
output_keep_prob=config.keep_prob)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell]
* config.num_layers, state_is_tuple=True)
self._initial_state = cell.zero_state(batch_size, data_type())
with tf.device('/cpu:0'):
embedding = tf.get_variable('embedding', [vocab_size,
size], dtype=data_type())
inputs = tf.nn.embedding_lookup(embedding, self._input_data)
if is_training and config.keep_prob < 1:
inputs = tf.nn.dropout(inputs, config.keep_prob)
# Simplified version of tensorflow.models.rnn.rnn.py's rnn().
# This builds an unrolled LSTM for tutorial purposes only.
# In general, use the rnn() or state_saving_rnn() from rnn.py.
#
# The alternative version of the code below is:
#
# from tensorflow.models.rnn import rnn
inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(inputs, num_steps, axis=1)]
(outputs, state) = tf.nn.rnn(cell, inputs, initial_state=self._initial_state)
# outputs = []
# state = self._initial_state
# with tf.variable_scope("RNN"):
# for time_step in range(num_steps):
# if time_step > 0: tf.get_variable_scope().reuse_variables()
# (cell_output, state) = cell(inputs[:, time_step, :], state)
# outputs.append(cell_output)
output = tf.reshape(tf.concat(outputs, axis=1), [-1, size])
softmax_w = tf.get_variable('softmax_w', [size, vocab_size],
dtype=data_type())
softmax_b = tf.get_variable('softmax_b', [vocab_size],
dtype=data_type())
logits = tf.matmul(output, softmax_w) + softmax_b
loss = tf.nn.seq2seq.sequence_loss_by_example([logits],
[tf.reshape(self._targets, [-1])], [tf.ones([batch_size
* num_steps],
dtype=data_type())])
self._cost = cost = tf.reduce_sum(loss) / batch_size
self._final_state = state
# RANI
self.logits = logits
if not is_training:
return
self._lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
(grads, _) = tf.clip_by_global_norm(tf.gradients(cost, tvars),
config.max_grad_norm)
optimizer = tf.train.GradientDescentOptimizer(self._lr)
self._train_op = optimizer.apply_gradients(zip(grads, tvars))
self._new_lr = tf.placeholder(tf.float32, shape=[],
name='new_learning_rate')
self._lr_update = tf.assign(self._lr, self._new_lr)
def assign_lr(self, session, lr_value):
session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
...
However, When I run it, I get the following errors
File "ptb_word_lm.py", line 349, in <module>
tf.app.run()
File "C:\Users\Josh Goldman\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\platform\app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "ptb_word_lm.py", line 299, in main
m = PTBModel(is_training=True, config=config)
File "ptb_word_lm.py", line 60, in __init__
inputs = tf.nn.embedding_lookup(embedding, self._input_data)
File "C:\Users\Josh Goldman\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\embedding_ops.py", line 122, in embedding_lookup
return maybe_normalize(_do_gather(params[0], ids, name=name))
File "C:\Users\Josh Goldman\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\embedding_ops.py", line 42, in _do_gather
return array_ops.gather(params, ids, name=name)
File "C:\Users\Josh Goldman\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 1179, in gather
validate_indices=validate_indices, name=name)
File "C:\Users\Josh Goldman\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 589, in apply_op
param_name=input_name)
File "C:\Users\Josh Goldman\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 60, in _SatisfiesTypeConstraint
", ".join(dtypes.as_dtype(x).name for x in allowed_list)))
TypeError: Value passed to parameter 'indices' has DataType float32 not in list of allowed values: int32, int64
Someone, please help me. I have all my packages upgraded to the newest version. I'm using the correct interpreter. I'm sorry if the error is very simple. I'm only 13 and am very new to programming. By the way, this code is not mine; I got it from Github.
The error is due to tensorflow version, syntax of tf.split is changed in the newer version. there is another same problem with tf.concat
# replace this line with the following one
inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_steps, inputs)]
# this support `tensorflow >= 1.0.0`
inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(inputs, num_steps, axis=1)]
# Also use dtype float32 for inputs
self._input_data = tf.placeholder(tf.float32, [batch_size,
num_steps])
# replace this line
output = tf.reshape(tf.concat(1, outputs), [-1, size])
# with this one
output = tf.reshape(tf.concat(outputs, axis=1), [-1, size])
I'm trying to make a convolutional neural network in tensorflow.
I've trained it and saved the model with
saver = tf.tain.Saver()
saver.save(sess, "model.ckpt")
Then I have restored the model with
saver.restore(sess, "model.ckpt")
When I try to predict the label of one image, there's no problem.
But when I try to predict the label of two or more images, the network predicts only the first and then I get an error.
Here's the code for training the network:
import tensorflow as tf
import pickle
import numpy as np
with open('X_train.pickle', 'rb') as y:
u = pickle._Unpickler(y)
u.encoding = 'latin1'
X_train = u.load()
with open('X_test.pickle', 'rb') as y:
u = pickle._Unpickler(y)
u.encoding = 'latin1'
X_test = u.load()
X_test = np.array(X_test).reshape(-1, 2500)
with open('y_train.pickle', 'rb') as y:
u = pickle._Unpickler(y)
u.encoding = 'latin1'
y_train = u.load()
with open('y_test.pickle', 'rb') as y:
u = pickle._Unpickler(y)
u.encoding = 'latin1'
y_test = u.load()
n_classes = 3
batch_size = 100
x = tf.placeholder('float', [None, 2500])
y = tf.placeholder('float')
keep_rate = 0.8
keep_prob = tf.placeholder(tf.float32)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding='SAME')
def maxpool2d(x):
return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
def convolutional_neural_network(x):
weights = {'W_conv1':tf.Variable(tf.random_normal([5,5,1,32])),
'W_conv2':tf.Variable(tf.random_normal([5,5,32,64])),
'W_fc':tf.Variable(tf.random_normal([13*13*64,1024])),
'out':tf.Variable(tf.random_normal([1024, n_classes]))}
biases = {'b_conv1':tf.Variable(tf.random_normal([32])),
'b_conv2':tf.Variable(tf.random_normal([64])),
'b_fc':tf.Variable(tf.random_normal([1024])),
'out':tf.Variable(tf.random_normal([n_classes]))}
x = tf.reshape(x, shape=[-1, 50, 50, 1])
conv1 = tf.nn.relu(conv2d(x, weights['W_conv1']) + biases['b_conv1'])
conv1 = maxpool2d(conv1)
conv2 = tf.nn.relu(conv2d(conv1, weights['W_conv2']) + biases['b_conv2'])
conv2 = maxpool2d(conv2)
fc = tf.reshape(conv2,[-1, 13*13*64])
fc = tf.nn.relu(tf.matmul(fc, weights['W_fc'])+biases['b_fc'])
fc = tf.nn.dropout(fc, keep_rate)
output = tf.matmul(fc, weights['out'])+biases['out']
return output
def train_neural_network(x):
prediction = convolutional_neural_network(x)
cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits = prediction, labels = y) )
optimizer = tf.train.AdamOptimizer().minimize(cost)
saver = tf.train.Saver()
hm_epochs = 3
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(hm_epochs):
if epoch != 0:
saver.restore(sess, "model.ckpt")
epoch_loss = 0
i = 0
while i < len(X_train):
start = 1
end = i + batch_size
batch_x = np.array(X_train[start:end]).reshape(-1, 2500)
batch_y = np.array(y_train[start:end])
_, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
epoch_loss += c
i += batch_size
print(i)
saver.save(sess, "model.ckpt")
print('Epoch', epoch + 1, 'completed out of',hm_epochs,'loss:',epoch_loss)
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
print('Accuracy:',accuracy.eval({x:X_test, y:y_test}))
And here's the code for using the network:
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle
import numpy as np
with open('X_train.pickle', 'rb') as y:
u = pickle._Unpickler(y)
u.encoding = 'latin1'
X_train = u.load()
with open('X_test.pickle', 'rb') as y:
u = pickle._Unpickler(y)
u.encoding = 'latin1'
X_test = u.load()
X_test = np.array(X_test).reshape(-1, 2500)
with open('y_train.pickle', 'rb') as y:
u = pickle._Unpickler(y)
u.encoding = 'latin1'
y_train = u.load()
with open('y_test.pickle', 'rb') as y:
u = pickle._Unpickler(y)
u.encoding = 'latin1'
y_test = u.load()
n_classes = 3
batch_size = 100
x = tf.placeholder('float', [None, 2500])
y = tf.placeholder('float')
keep_rate = 0.8
keep_prob = tf.placeholder(tf.float32)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding='SAME')
def maxpool2d(x):
return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
def convolutional_neural_network(x):
weights = {'W_conv1':tf.Variable(tf.random_normal([5,5,1,32])),
'W_conv2':tf.Variable(tf.random_normal([5,5,32,64])),
'W_fc':tf.Variable(tf.random_normal([13*13*64,1024])),
'out':tf.Variable(tf.random_normal([1024, n_classes]))}
biases = {'b_conv1':tf.Variable(tf.random_normal([32])),
'b_conv2':tf.Variable(tf.random_normal([64])),
'b_fc':tf.Variable(tf.random_normal([1024])),
'out':tf.Variable(tf.random_normal([n_classes]))}
x = tf.reshape(x, shape=[-1, 50, 50, 1])
conv1 = tf.nn.relu(conv2d(x, weights['W_conv1']) + biases['b_conv1'])
conv1 = maxpool2d(conv1)
conv2 = tf.nn.relu(conv2d(conv1, weights['W_conv2']) + biases['b_conv2'])
conv2 = maxpool2d(conv2)
fc = tf.reshape(conv2,[-1, 13*13*64])
fc = tf.nn.relu(tf.matmul(fc, weights['W_fc'])+biases['b_fc'])
fc = tf.nn.dropout(fc, keep_rate)
output = tf.matmul(fc, weights['out'])+biases['out']
return output
def use_neural_network(input_data):
prediction = convolutional_neural_network(x)
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
saver.restore(sess, "model.ckpt")
result = (sess.run(tf.argmax(prediction.eval(feed_dict={x:[input_data]}),1)))
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
print('Accuracy:',accuracy.eval({x:X_test, y:y_test}))
return result
sample1 = X_train[432].reshape(2500)
res1 =use_neural_network(sample1)
if res == [0]: print('Go straight')
elif res == [1]: print('Turn right')
else: print('Turn left')
img = sample.reshape(50,50)
plt.imshow(img)
plt.show()
#till now there is no problem
sample2 = X_train[1222].reshape(2500)
res =use_neural_network(sample2)
#ERROR
if res2 == [0]: print('Go straight')
elif res2 == [1]: print('Turn right')
else: print('Turn left')
img2 = sample2.reshape(50,50)
plt.imshow(img2)
plt.show()
Here's the error:
2017-07-24 14:22:21.277633: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key Variable_10 not found in checkpoint
2017-07-24 14:22:21.278563: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key Variable_11 not found in checkpoint
2017-07-24 14:22:21.278577: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key Variable_12 not found in checkpoint
2017-07-24 14:22:21.278995: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key Variable_9 not found in checkpoint
2017-07-24 14:22:21.279316: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key Variable_13 not found in checkpoint
2017-07-24 14:22:21.279325: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key Variable_14 not found in checkpoint
2017-07-24 14:22:21.279997: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key Variable_15 not found in checkpoint
2017-07-24 14:22:21.284003: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key Variable_8 not found in checkpoint
Traceback (most recent call last):
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1139, in _do_call
return fn(*args)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1121, in _run_fn
status, run_metadata)
File "/anaconda/lib/python3.6/contextlib.py", line 89, in __exit__
next(self.gen)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.NotFoundError: Key Variable_10 not found in checkpoint
[[Node: save_1/RestoreV2_2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save_1/Const_0_0, save_1/RestoreV2_2/tensor_names, save_1/RestoreV2_2/shape_and_slices)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "USE_NEURAL_NETWORK.py", line 103, in <module>
res =use_neural_network(sample)
File "USE_NEURAL_NETWORK.py", line 80, in use_neural_network
saver.restore(sess, "model.ckpt")
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1548, in restore
{self.saver_def.filename_tensor_name: save_path})
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 789, in run
run_metadata_ptr)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 997, in _run
feed_dict_string, options, run_metadata)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1132, in _do_run
target_list, options, run_metadata)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1152, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.NotFoundError: Key Variable_10 not found in checkpoint
[[Node: save_1/RestoreV2_2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save_1/Const_0_0, save_1/RestoreV2_2/tensor_names, save_1/RestoreV2_2/shape_and_slices)]]
Caused by op 'save_1/RestoreV2_2', defined at:
File "USE_NEURAL_NETWORK.py", line 103, in <module>
res =use_neural_network(sample)
File "USE_NEURAL_NETWORK.py", line 75, in use_neural_network
saver = tf.train.Saver()
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1139, in __init__
self.build()
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1170, in build
restore_sequentially=self._restore_sequentially)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 691, in build
restore_sequentially, reshape)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 407, in _AddRestoreOps
tensors = self.restore_op(filename_tensor, saveable, preferred_shard)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 247, in restore_op
[spec.tensor.dtype])[0])
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/ops/gen_io_ops.py", line 640, in restore_v2
dtypes=dtypes, name=name)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/anaconda/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
self._traceback = _extract_stack()
NotFoundError (see above for traceback): Key Variable_10 not found in checkpoint
[[Node: save_1/RestoreV2_2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save_1/Const_0_0, save_1/RestoreV2_2/tensor_names, save_1/RestoreV2_2/shape_and_slices)]]
How can I avoid this error and predict as many images as I want ?
When you do prediction = convolutional_neural_network(x) in use_neural_network(), you create your networks, with all variables having specific names.
If you do it twice, you will create 2 networks, identical in every point but the name of the variables, the second network will have other names for the variables (as you can't have the same name for 2 variables).
Then, problems arise when you use the saver to restore your model, as it tries to find values for every variables in your graph, but your checkpoint will only contain values for the variables in the first network, as they are properly named.
The Saver raise an exception because it failed to restore every variables.
So, your mistake is to restore your model everytime you want to process a new image.
You should have two functions, one for restoring, and one for processing.
I have one bug I cannot find out the reason. Here is the code:
with tf.Graph().as_default():
global_step = tf.Variable(0, trainable=False)
images = tf.placeholder(tf.float32, shape = [FLAGS.batch_size,33,33,1])
labels = tf.placeholder(tf.float32, shape = [FLAGS.batch_size,21,21,1])
logits = inference(images)
losses = loss(logits, labels)
train_op = train(losses, global_step)
saver = tf.train.Saver(tf.all_variables())
summary_op = tf.merge_all_summaries()
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
for step in xrange(FLAGS.max_steps):
start_time = time.time()
data_batch, label_batch = SRCNN_inputs.next_batch(np_data, np_label,
FLAGS.batch_size)
_, loss_value = sess.run([train_op, losses], feed_dict={images: data_batch, labels: label_batch})
duration = time.time() - start_time
def next_batch(np_data, np_label, batchsize,
training_number = NUM_EXAMPLES_PER_EPOCH_TRAIN):
perm = np.arange(training_number)
np.random.shuffle(perm)
data = np_data[perm]
label = np_label[perm]
data_batch = data[0:batchsize,:]
label_batch = label[0:batchsize,:]
return data_batch, label_batch
where np_data is the whole training samples read from hdf5 file, and the same to np_label.
After I run the code, I got the error like this :
2016-07-07 11:16:36.900831: step 0, loss = 55.22 (218.9 examples/sec; 0.585 sec/batch)
Traceback (most recent call last):
File "<ipython-input-1-19672e1f8f12>", line 1, in <module>
runfile('/home/kang/Documents/work_code_PC1/tf_SRCNN/SRCNN_train.py', wdir='/home/kang/Documents/work_code_PC1/tf_SRCNN')
File "/usr/lib/python3/dist-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 685, in runfile
execfile(filename, namespace)
File "/usr/lib/python3/dist-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 85, in execfile
exec(compile(open(filename, 'rb').read(), filename, 'exec'), namespace)
File "/home/kang/Documents/work_code_PC1/tf_SRCNN/SRCNN_train.py", line 155, in <module>
train_test()
File "/home/kang/Documents/work_code_PC1/tf_SRCNN/SRCNN_train.py", line 146, in train_test
summary_str = sess.run(summary_op)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py", line 372, in run
run_metadata_ptr)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py", line 636, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py", line 708, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py", line 728, in _do_call
raise type(e)(node_def, op, message)
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [128,33,33,1]
[[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[128,33,33,1], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
[[Node: truediv/_74 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_56_truediv", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op 'Placeholder', defined at:
So, It shows that for the step 0 it has the result, which means that the data has been fed into the Placeholders.
But why does it come the error of feeding data into Placeholder in the next time?
When I try to comment the code summary_op = tf.merge_all_summaries() and the code works fine. why is it the case?
When I try to comment the code summary_op = tf.merge_all_summaries() and the code works fine. why is it the case?
summary_op is an operation. If there exists (and this is true in your case) a summary operation related to the result of another operation that depends upon the values of the placeholders, you have to feed the graph the required values.
So, your line summary_str = sess.run(summary_op) needs the dictionary of the values to store.
Usually, instead of re-executing the operations to log the values, you run the operations and the summary_op once.
Do something like
if step % LOGGING_TIME_STEP == 0:
_, loss_value, summary_str = sess.run([train_op, losses, summary_op], feed_dict={images: data_batch, labels: label_batch})
else:
_, loss_value = sess.run([train_op, losses], feed_dict={images: data_batch, labels: label_batch})