error when implementing a tensorflow input pipeline with tf.data - python

I have an issue implementing an input pipeline with the new tf.data tensorflow class.
Specifically, when I include a convolution operation to the preprocessing - which I add to the pipeline with the map method - I get the following error
tensorflow.python.framework.errors_impl.UnimplementedError: Generic conv implementation only supports NHWC tensor format for now.
[[{{node conv_debug}} = Conv2D[T=DT_FLOAT, data_format="NCHW", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](conv_debug-0-TransposeNHWCToNCHW-LayoutOptimizer, ArithmeticOptimizer/FoldMultiplyIntoConv_scaled_conv_debug_Const)]]
When I exclude the convolution from the pipeline, everything works as expected.
I attach below the minimal code needed to reproduce the problem.
Tested with 3 configurations:
Tensorflow 1.12.0, CUDA 10.0, CUDnn 7.4.1, got the error.
Tensorflow 1.11.0, CUDA 9.0, CUDnn 7.3.1, got the error.
Tensorflow 1.8.0, CUDA 8.0, CUDnn 6.0, it works.
Am I doing it wrong or is it a CUDA/CUDnn related issue?
Thanks!
import numpy as np
import tensorflow as tf
image_height, image_width = 100, 200
def _bytes_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` holding a bytes list.

    Args:
        value: The payload to store. Byte strings are stored unchanged; text
            strings are encoded as UTF-8 first, since the protobuf bytes
            field does not accept text on Python 3.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains exactly one item.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so only real
    # text is encoded and existing byte-string callers are unaffected.
    if isinstance(value, type(u'')):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def serialize_to_record(record_name, label, image):
    """Create a data record and store it.

    Args:
        record_name: Path of the TFRecord file to write.
        label: Label payload, forwarded to ``_bytes_feature``.
        image: Array-like object exposing ``tobytes()`` (e.g. a numpy array);
            its raw buffer is stored under the ``'image_raw'`` key.

    Returns:
        None. The record is written to ``record_name`` as a side effect.
    """
    # `tobytes()` replaces the deprecated `tostring()` alias; both return the
    # same raw buffer.
    sample = tf.train.Example(features=tf.train.Features(feature={
        'image_raw': _bytes_feature(image.tobytes()),
        'label_raw': _bytes_feature(label)}))
    # The context manager closes the writer even if the write fails.
    with tf.python_io.TFRecordWriter(record_name) as writer:
        writer.write(sample.SerializeToString())
def _dataset_parser(record):
    """Deserialize one TFRecord entry into an image/label dictionary.

    The image bytes are decoded as uint8, reshaped to
    (image_height, image_width, 3) and scaled by 1/255 into float32; the label
    is returned as the raw string feature.
    """
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label_raw': tf.FixedLenFeature([], tf.string),
    }
    example = tf.parse_single_example(record, features=feature_spec)

    flat_pixels = tf.decode_raw(example['image_raw'], tf.uint8)
    # Pin the static length so the reshape below has a fully known shape.
    flat_pixels.set_shape(image_height * image_width * 3)
    pixels = tf.reshape(flat_pixels, (image_height, image_width, 3))

    return {
        'image': tf.cast(pixels, tf.float32) / 255.0,
        'label': example['label_raw'],
    }
def _dataset_preprocessor(datum):
    """Dummy preprocessor: convolve the image with a random 5x5x3x3 kernel.

    The ``'image'`` entry of ``datum`` is replaced in place and the same
    dictionary is returned.
    """
    weights = tf.constant(np.random.rand(5, 5, 3, 3), dtype=tf.float32)
    # conv2d works on batched input, so add a leading axis and drop it after.
    batched = tf.expand_dims(datum['image'], axis=0)
    filtered = tf.nn.conv2d(batched, weights, [1, 1, 1, 1],
                            padding='SAME', name='conv_debug')
    datum['image'] = tf.squeeze(filtered, axis=0)
    return datum
def _dataset_operator(record):
    """Parse a serialized record, then run the convolution preprocessor on it."""
    return _dataset_preprocessor(_dataset_parser(record))
def _dataset_operator_noconv(record):
    """Parse a serialized record without applying the convolution step."""
    return _dataset_parser(record)
if __name__ == '__main__':
    # Generate one random uint8 image and store it on disk as a TFRecord.
    image = (255.0 * np.random.rand(image_height, image_width, 3)).astype(np.uint8)
    record_path = 'example.tfrecord'
    serialize_to_record(record_path, label='example', image=image)

    # The dummy dataset reads the same record file N times.
    N = 32
    dataset_filenames = [record_path] * N
    dataset = tf.data.TFRecordDataset(dataset_filenames)

    # Choose whether the convolution is part of the mapped function.
    include_convolution_to_pipeline = True
    dataset = dataset.map(
        _dataset_operator if include_convolution_to_pipeline
        else _dataset_operator_noconv)

    # Shuffle, group into batches of 8 samples and iterate indefinitely.
    dataset = dataset.shuffle(buffer_size=100).batch(8).repeat()
    iterator = dataset.make_initializable_iterator()
    next_data = iterator.get_next()

    # Start a session and fetch the first batch.
    sess = tf.Session()
    sess.run(iterator.initializer)
    next_data_ = sess.run(next_data)
    print('***')

As the error message states, the generic convolution implementation only supports the NHWC data format. Regardless of which data format you use, the input still needs batch_size as one of its dimensions, but you are applying the map function before batching. Mapping after batching is not the usual order, but if you need the convolution, you have to apply the map function after batch.
# Map applied per record, before batching.
dataset = dataset.map(_dataset_operator)
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.batch(8)
# NOTE(review): the same function is mapped a second time after batching;
# presumably only the convolution step is meant to run here - confirm.
dataset = dataset.map(_dataset_operator)
dataset = dataset.repeat()

It is tensorflow's layout optimizer problem.
TensorFlow's "map" function executes its graph on the CPU, and creating tensors inside the map function without an explicit CPU placement confuses the layout optimizer.
Wrapping the tensor creation inside the map function in tf.device("/cpu:0") resolves the layout optimizer confusion. Another option is to disable the layout optimizer, which may cost extra training time (giving up layout optimization for the whole graph just to execute the "map" phase may not be feasible).
There is already an open issue regarding this problem :
https://github.com/tensorflow/tensorflow/issues/26411
As this is a workaround, I think more robust solutions (executing "map" tensors on the GPU, fixes for the layout optimizer, etc.) may come in future releases of TF. For now, the suggested workaround solves my problem without the hassle of disabling layout optimization.

Related

How does tf.dataset interact with keras.conv1D?

I'm using TF 1.15 and I'm trying to solve a regression task on signals.
First I load my signals into the pipeline. I have several files; here I simulate the loading with np.zeros so that you can run the code.
Every file has the shape (?, 75000, 3), where ? is a varying number of elements, 75000 is the number of samples in each element, and 3 is the number of signals.
Using tf.data I unbatch them and get a dataset that outputs signals of shape (75000,), which I use in my Keras model.
Everything seems fine until I create the Keras model. I copied my input pipeline here because during my tests I got different errors depending on whether I used a generic tf.data.Dataset or the dataset built in this way.
import numpy as np
import tensorflow as tf
# called in the dataset pipeline
def my_func(x):
    """Simulate loading one file: return three all-zero (86, 75000) signals.

    The argument is ignored; a zero array of shape (86, 75000, 3) stands in
    for the file contents and is split along its last axis.
    """
    signals = np.zeros([86, 75000, 3])
    return signals[:, :, 0], signals[:, :, 1], signals[:, :, 2]
# called in the dataset pipeline
def load_sign(path):
    """Wrap ``my_func`` as a graph op that yields three float64 tensors."""
    output_types = [tf.float64, tf.float64, tf.float64]
    return tf.compat.v1.numpy_function(my_func, [path], output_types)
# Dataset pipeline
# Dataset source: stand-ins for the real file paths (simulated with numbers).
s = [1, 2] # here i have the file paths, i simulate it with numbers
AUTOTUNE = tf.data.experimental.AUTOTUNE
ds = tf.data.Dataset.from_tensor_slices(s)
# ds = ds.map(load_sign, num_parallel_calls=AUTOTUNE)
# Each element is mapped through load_sign and then unbatched, so the leading
# axis of every loaded array becomes separate dataset elements.
# NOTE(review): tensors produced via numpy_function presumably carry no
# static shape unless one is set explicitly - confirm.
ds = ds.map(load_sign, num_parallel_calls=AUTOTUNE).unbatch()
itera = tf.data.make_one_shot_iterator(ds)
ABP, ECG, PLETH = itera.get_next()
# Up to here everything should be fine.
# Build the convolutional network.
# The input is declared as float32, while load_sign declares float64 outputs.
signal = tf.keras.layers.Input(shape=(None,75000), dtype='float32')
x = tf.compat.v1.keras.layers.Conv1D(64, (1), strides=1, padding='same')(signal)
x = tf.keras.layers.Dense(75000)(x)
model = tf.keras.Model(inputs=signal, outputs=x, name='resnet18')
# Finally, feed one dataset tensor into the model.
logits = model(PLETH)
I get this error:
ValueError: Input 0 of layer conv1d is incompatible with the layer: its rank is undefined, but the layer requires a defined rank.
Why? And how can i make it works?
Also the input size of my net should be this one according the documentation:
3D tensor with shape: (batch_size, steps, input_dim)
What is the steps? In my case i assume it should be (batch_size, 1, 75000), right?

How to import jpg and npy files in pair for deep learning with tensorflow?

I have 1500 RGB files(.jpg) and 1500 feature map values(.npy). I want to use them as a dataset for my deep learning project. I am using tensorflow 1.12.
I wrote them into a .tfrecords file using the tf.Example. Here is the code I used to import this file with tf.data(Thanks to Uday's comment).
import tensorflow as tf
import numpy as np
import pdb
IMAGE_HEIGHT = 228
IMAGE_WIDTH = 304
def tfdata_generator(tfrname, is_training, batch_size):
    '''Construct a data generator using tf.Dataset.

    Args:
        tfrname: Path (or paths) of the TFRecord file(s) to read.
        is_training: When truthy, records are shuffled with a buffer of 100.
        batch_size: Number of examples per batch.

    Returns:
        A repeating, prefetching dataset of (image, hint) float32 batches.
    '''

    ## You can write your own parse function
    def parse_function(example):
        features = tf.parse_single_example(example, features={
            'image_raw': tf.FixedLenFeature([], tf.string, default_value=""),
            'hint_raw': tf.FixedLenFeature([], tf.string, default_value="")
        })
        image = tf.decode_raw(features['image_raw'], tf.uint8)
        image = tf.cast(image, tf.float32)
        image = tf.reshape(image, [IMAGE_HEIGHT, IMAGE_WIDTH, 3])
        # NOTE(review): the hint is decoded as uint8; if the .npy feature maps
        # were serialized with another dtype this must match - confirm.
        hint = tf.decode_raw(features['hint_raw'], tf.uint8)
        hint = tf.cast(hint, tf.float32)
        hint = tf.reshape(hint, [8, 10, 1024])
        return image, hint

    dataset = tf.data.TFRecordDataset(tfrname)
    if is_training:
        dataset = dataset.shuffle(100)  # depends on sample size
    # Transform and batch data at the same time. The caller's batch_size is
    # honoured here instead of a hard-coded 8.
    dataset = dataset.apply(tf.data.experimental.map_and_batch(
        parse_function, batch_size, num_parallel_batches=4))  # cpu cores
    dataset = dataset.repeat(-1)
    dataset = dataset.prefetch(2)
    return dataset
I set batch_size to 8, but when debugging, the shape of the dataset is
((?, 228, 304, 3), (?, 8, 10, 1024)), types: (tf.float32, tf.float32)
Is this correct? Is the code wrong, or did I make a mistake when writing the tfrecords?
you can use code like below,
def tfdata_generator(images, labels, is_training, batch_size=32,
                     prefetch_buffer_size=1):
    '''Construct a data generator using tf.Dataset.

    Args:
        images: Sequence of JPEG file names.
        labels: Sequence of integer class labels, parallel to ``images``.
        is_training: When truthy, shuffle the data and drop the final partial
            batch.
        batch_size: Number of examples per batch.
        prefetch_buffer_size: Number of batches to prefetch. Replaces the
            undefined ``no_of_prefetch_needed`` placeholder of the original
            snippet.

    Returns:
        A repeating dataset of (image, one-hot label) batches.
    '''

    ## You can write your own parse function
    def parse_function(filename, label):
        image_string = tf.read_file(filename)
        image = tf.image.decode_jpeg(image_string)
        image = tf.image.convert_image_dtype(image, tf.float32)
        # Labels are one-hot encoded over 16 classes.
        y = tf.one_hot(tf.cast(label, tf.uint8), 16)
        return image, y

    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    if is_training:
        dataset = dataset.shuffle(1000)  # depends on sample size
    # Transform and batch data at the same time
    dataset = dataset.apply(tf.data.experimental.map_and_batch(
        parse_function,
        batch_size,
        num_parallel_batches=6,  # cpu cores
        drop_remainder=bool(is_training)))
    dataset = dataset.repeat()
    dataset = dataset.prefetch(prefetch_buffer_size)
    return dataset

TensorFlow use dataset to replace function feed_dict

when I learn a tensorflow project,find one line code:
cls_prob, box_pred = sess.run([output_cls_prob, output_box_pred], feed_dict={input_img: blob})
However, this line of code takes a lot of time (about 15 seconds on the CPU... ┭┮﹏┭┮).
From what I have read, using a 'dataset' could solve this slowness. How should I use it?
source of 'blob':
# Load the image, rescale it so that the short side is 600 px (capping the
# long side at 1200 px), and pack it into a float32 batch of size one.
img = cv2.imread('./imgs/001.jpg')
if img is None:
    # cv2.imread signals a missing or unreadable file by returning None.
    raise IOError("could not read image './imgs/001.jpg'")
# The remaining code works on `img_data`; the original snippet never defined
# it and would fail with a NameError.
img_data = img
img_scale = float(600) / min(img_data.shape[0], img_data.shape[1])
if np.round(img_scale * max(img_data.shape[0], img_data.shape[1])) > 1200:
    img_scale = float(1200) / max(img_data.shape[0], img_data.shape[1])
img_data = cv2.resize(img_data, None, None, fx=img_scale, fy=img_scale, interpolation=cv2.INTER_LINEAR)
img_orig = img_data.astype(np.float32, copy=True)
blob = np.zeros((1, img_data.shape[0], img_data.shape[1], 3), dtype=np.float32)
blob[0, 0:img_data.shape[0], 0:img_data.shape[1], :] = img_orig
source of 'output_cls_prob'&'output_box_pred'&'input_img':
# The graph comes from a loaded PB model; the input placeholder and the two
# output tensors are looked up by name in the session's graph.
input_img = sess.graph.get_tensor_by_name('Placeholder:0')
output_cls_prob = sess.graph.get_tensor_by_name('Reshape_2:0')
output_box_pred = sess.graph.get_tensor_by_name('rpn_bbox_pred/Reshape_1:0')
Parameter type:
blob:type 'numpy.ndarray'
output_cls_prob:class 'tensorflow.python.framework.ops.Tensor'
output_box_pred:class 'tensorflow.python.framework.ops.Tensor'
input_img:class 'tensorflow.python.framework.ops.Tensor'
tf.data is the recommended API for tensorflow input pipelines. Here is a tutorial on tensorflow.org. For your example, the section "Decoding image data and resizing it" could be most useful. For example, you could do something like:
# Reads an image from a file, decodes it into a dense tensor, and resizes it
# to a fixed shape.
def _parse_function(filename):
    """Load the JPEG at ``filename`` and return it resized, with a batch axis.

    ``new_height`` and ``new_width`` are expected to be defined by the caller's
    module.
    """
    image_string = tf.read_file(filename)
    image_decoded = tf.image.decode_jpeg(image_string)
    # resize_images takes the target size as [new_height, new_width].
    image_resized = tf.image.resize_images(image_decoded, [new_height, new_width])
    image_resized = tf.expand_dims(image_resized, 0)  # Adds size 1 dimension
    return image_resized
# A vector of filenames. The `...` is a placeholder for the remaining paths
# and must be replaced before running.
filenames = tf.constant(["./imgs/001.jpg", ...])
dataset = tf.data.Dataset.from_tensor_slices(filenames)
# Each filename is turned into a resized image tensor by _parse_function.
dataset = dataset.map(_parse_function)
And instead of having input_img be a placeholder, change:
input_img = tf.placeholder(tf.float32)
output_class_prob, output_class_pred = (... use input_img ...)
to:
iterator = dataset.make_one_shot_iterator()
input_img = iterator.get_next()
output_class_prob, output_class_pred = (... use input_img ...)
First of all, you should know that using the Dataset API has a large impact on performance when multiple GPUs are used; otherwise it is almost identical to feed_dict. I recommend reading this other answer from a TF developer, which has almost everything one needs to know to form a mental picture of the benefits of this new API.

Invalid character found in base64 while using a deployed model on cloudml

For better context, I have uploaded a pre-trained model on cloud ml. It's an inceptionV3 model converted from keras to acceptable format in tensorflow.
# Load the pre-trained Keras InceptionV3 and expose the output of layer 311
# as the model to export.
from keras.applications.inception_v3 import InceptionV3
model = InceptionV3(weights='imagenet')
from keras.models import Model
intermediate_layer_model = Model(inputs=model.input,outputs=model.layers[311].output)
# Input graph: a one-element string placeholder that is base64-decoded inside
# the graph, decoded as an image, converted to float32 and given a batch axis.
with tf.Graph().as_default() as g_input:
input_b64 = tf.placeholder(shape=(1,),
dtype=tf.string,
name='input')
input_bytes = tf.decode_base64(input_b64[0])
image = tf.image.decode_image(input_bytes)
image_f = tf.image.convert_image_dtype(image, dtype=tf.float32)
input_image = tf.expand_dims(image_f, 0)
output = tf.identity(input_image, name='input_image')
g_input_def = g_input.as_graph_def()
# Freeze the Keras graph: learning phase set to 0, variables converted to
# constants up to the intermediate model's output op.
K.set_learning_phase(0)
sess = K.get_session()
from tensorflow.python.framework import graph_util
g_trans = sess.graph
g_trans_def = graph_util.convert_variables_to_constants(sess,
g_trans.as_graph_def(),
[intermediate_layer_model.output.name.replace(':0','')])
# Combined graph: a new string placeholder feeds the imported input graph,
# whose image output feeds the imported frozen model graph.
with tf.Graph().as_default() as g_combined:
x = tf.placeholder(tf.string, name="input_b64")
im, = tf.import_graph_def(g_input_def,
input_map={'input:0': x},
return_elements=["input_image:0"])
pred, = tf.import_graph_def(g_trans_def,
input_map={intermediate_layer_model.input.name: im,
'batch_normalization_1/keras_learning_phase:0': False},
return_elements=[intermediate_layer_model.output.name])
# Build a predict signature mapping "inputs" -> x and "outputs" -> pred.
with tf.Session() as sess2:
inputs = {"inputs": tf.saved_model.utils.build_tensor_info(x)}
outputs = {"outputs":tf.saved_model.utils.build_tensor_info(pred)}
signature =tf.saved_model.signature_def_utils.build_signature_def(
inputs=inputs,
outputs=outputs,
method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME
)
# save as SavedModel
# NOTE(review): the export directory is named 'inceptionv4/' although the
# model loaded above is InceptionV3.
b = tf.saved_model.builder.SavedModelBuilder('inceptionv4/')
b.add_meta_graph_and_variables(sess2,
[tf.saved_model.tag_constants.SERVING],
signature_def_map={'serving_default': signature})
b.save()
The generated pb file works fine when I use it locally. But when I deploy it on cloud ml I get the following error.
RuntimeError: Prediction failed: Error during model execution: AbortionError(code=StatusCode.INVALID_ARGUMENT, details="Invalid character found in base64.
[[Node: import/DecodeBase64 = DecodeBase64[_output_shapes=[<unknown>], _device="/job:localhost/replica:0/task:0/device:CPU:0"](import/strided_slice)]]")
Following is the code I use for getting local predictions.
import base64
import json
# Read the image and encode it with the URL-safe base64 alphabet, as text.
with open('MEL_BE_0.jpg', 'rb') as image_file:
encoded_string = str(base64.urlsafe_b64encode(image_file.read()),'ascii')
import tensorflow as tf
# Load the exported SavedModel into a fresh graph and run it locally.
with tf.Session(graph=tf.Graph()) as sess:
MetaGraphDef=tf.saved_model.loader.load(
sess,
[tf.saved_model.tag_constants.SERVING],
'inceptionv4')
input_tensor = tf.get_default_graph().get_tensor_by_name('input_b64:0')
print(input_tensor)
avg_tensor = tf.get_default_graph().get_tensor_by_name('import_1/avg_pool/Mean:0')
print(avg_tensor)
# The base64 string is fed as a one-element list; decoding happens in-graph.
predictions = sess.run(avg_tensor, {input_tensor: [encoded_string]})
And finally following is the code snippet that I use for wrapping the encoded string in the request that is sent to the cloud-ml engine.
request_body= json.dumps({"key":"0", "image_bytes": {"b64": [encoded_string]}})
It looks like you are trying to do the base64 decoding in TensorFlow and use the {"b64": ...} JSON format. You need to do one or the other; we typically recommend the latter.
As a side note, your input placeholder must have an outer dimension of None. That can make some things tricky, e.g., you'll either have to reshape the dimensions to be size 1 (which will prevent you from using the batch prediction service in its current state) or you'll have to use tf.map_fn to apply the same set of transformations to each element of the input "batch". You can find an example of that technique in this example.
Finally, I recommend the use of tf.saved_model.simple_save.
Putting it altogether, here is some modified code. Note that I'm inlining your input function (as opposed to serializing it to a graph def and reimporting):
# Target size used when resizing the decoded input images.
HEIGHT = 299
WIDTH = 299
# Get Keras Model
from keras.applications.inception_v3 import InceptionV3
model = InceptionV3(weights='imagenet')
from keras.models import Model
# Export the output of layer 311 rather than the final classification layer.
intermediate_layer_model = Model(inputs=model.input,outputs=model.layers[311].output)
# Freeze the Keras graph: learning phase set to 0, variables converted to
# constants up to the intermediate model's output op.
K.set_learning_phase(0)
sess = K.get_session()
from tensorflow.python.framework import graph_util
g_trans = sess.graph
g_trans_def = graph_util.convert_variables_to_constants(sess,
g_trans.as_graph_def(),
[intermediate_layer_model.output.name.replace(':0','')])
# Create inputs to model and export
with tf.Graph().as_default() as g_combined:

    def decode_and_resize(image_bytes):
        """Decode one encoded image and resize it to HEIGHT x WIDTH as uint8."""
        image = tf.image.decode_image(image_bytes)
        # Note resize expects a batch_size, but tf_map suppresses that index,
        # thus we have to expand then squeeze. Resize returns float32 in the
        # range [0, uint8_max]
        image = tf.expand_dims(image, 0)
        image = tf.image.resize_bilinear(
            image, [HEIGHT, WIDTH], align_corners=False)
        image = tf.squeeze(image, squeeze_dims=[0])
        image = tf.cast(image, dtype=tf.uint8)
        return image

    # Outer dimension is None so a whole batch of encoded images can be fed.
    # (The original snippet misspelled this name as `input_byes`, which left
    # `input_bytes` undefined below.)
    input_bytes = tf.placeholder(shape=(None,),
                                 dtype=tf.string,
                                 name='input')
    images = tf.map_fn(
        decode_and_resize, input_bytes, back_prop=False, dtype=tf.uint8)
    images = tf.image.convert_image_dtype(images, dtype=tf.float32)
    pred, = tf.import_graph_def(
        g_trans_def,
        input_map={intermediate_layer_model.input.name: images,
                   'batch_normalization_1/keras_learning_phase:0': False},
        return_elements=[intermediate_layer_model.output.name])
    with tf.Session() as sess2:
        # simple_save names its destination argument `export_dir`.
        tf.saved_model.simple_save(
            sess2,
            export_dir='inceptionv4/',
            inputs={"inputs": input_bytes},
            outputs={"outputs": pred})
Note: I'm not 100% certain that the shapes of intermediate_layer_model and images are compatible. The shape of images will be [None, height, width, num_channels].
Also note that your local prediction code will change a bit. You don't base64 encode the images and you need to send a "batch"/list of images rather than single images. Something like:
# Read the raw image bytes; no base64 encoding is applied here.
with open('MEL_BE_0.jpg', 'rb') as image_file:
encoded_string = image_file.read()
input_tensor = tf.get_default_graph().get_tensor_by_name('input:0')
print(input_tensor)
# NOTE(review): the 'import_1/' prefix comes from the earlier two-import
# graph; with a single import_graph_def call the name may differ - confirm.
avg_tensor = tf.get_default_graph().get_tensor_by_name('import_1/avg_pool/Mean:0')
print(avg_tensor)
# The placeholder takes a batch, so the single image is wrapped in a list.
predictions = sess.run(avg_tensor, {input_tensor: [encoded_string]})
You didn't specify whether you're doing batch prediction or online prediction, which have similar but slightly different "formats" for the inputs. In either case, your model is not exporting a "key" field (did you mean to? It's probably helpful for batch prediction, but not for online).
For batch prediction, the file format is JSON lines; each line contains one example. Each line can be generated like so from Python:
example = json.dumps({"image_bytes": {"b64": ENCODED_STRING}})
(Note the omission of "key" for now). Since you only have one input, there is a shorthand:
example = json.dumps({"b64": ENCODED_STRING})
If you want to do online prediction, you'll note that if you are using gcloud to send requests, you actually use the same file format as for batch prediction.
In fact, we highly recommend using gcloud ml-engine local predict --json-instances=FILE --model-dir=... before deploying to the cloud to help debug.
If you intend to use some other client besides gcloud, e.g., in a web app, mobile app, frontend server, etc., then you won't be sending a file and you need to construct the full request yourself. It's very similar to the file format above. Basically, take each line of the JSON lines file and put them in an array called "instances", i.e.,
request_body= json.dumps({"instances": [{"image_bytes": {"b64": [encoded_string]}}]})
You can use the same syntactic sugar if you'd like:
request_body= json.dumps({"instances": [{"b64": [encoded_string]}]})
I hope this helps!

ValueError: Tensor A must be from same graph as Tensor B

I am trying to run this ResNet with a few alterations.https://github.com/tensorflow/models/tree/master/official/resnet
After looking up the error, I understand the problem to be either :
That the tensors belong to different graphs, but I can't figure out how that came to be, as I'm not creating any graphs myself.
I have uninitialized variables in the parser function replacement.
If it's the initialization - How should I initialize them when using Estimator, which auto initializes and creates sessions?
This is the error:
ValueError: Tensor("IsVariableInitialized:0", shape=(), dtype=bool) must be from the same graph as Tensor("report_uninitialized_variables/IsVariableInitialized:0", shape=(), dtype=bool).
The entire code is very voluminous, so I'll only supply what I've made changes to (as it runs without these changes). Rest of the code is untouched (repo in the link above)
this is the original parser function(reads from binary files):
def parse_record(raw_record, is_training):
    """Decode a raw CIFAR-10 record into a preprocessed image and its label.

    The first byte of the record is the label (returned one-hot over
    _NUM_CLASSES); the following bytes, up to _RECORD_BYTES, are the image in
    [depth, height, width] order.
    """
    record_bytes = tf.decode_raw(raw_record, tf.uint8)

    # Label: uint8 -> int32 -> one-hot.
    label = tf.one_hot(tf.cast(record_bytes[0], tf.int32), _NUM_CLASSES)

    # Image: reshape to channel-major, then move channels last as float32.
    channels_first = tf.reshape(record_bytes[1:_RECORD_BYTES],
                                [_NUM_CHANNELS, _HEIGHT, _WIDTH])
    channels_last = tf.cast(tf.transpose(channels_first, [1, 2, 0]), tf.float32)

    return preprocess_image(channels_last, is_training), label
and this is my replacement to read from TFRecords:
def parse_record(raw_record, is_training):
    """Parse an image and one-hot label from a TFRecord example.

    Args:
        raw_record: Value handed to ``tf.train.string_input_producer`` as the
            single input string.
        is_training: Selects the ``'train/...'`` feature keys when truthy and
            the ``'val/...'`` keys otherwise; also forwarded to
            ``preprocess_image``.

    Returns:
        An (image, label) tuple: the preprocessed image and a one-hot label
        over ``_NUM_CLASSES``.
    """
    mode = 'train' if is_training else 'val'
    image_key = mode + '/image'
    label_key = mode + '/label'
    feature = {image_key: tf.FixedLenFeature([], tf.string),
               label_key: tf.FixedLenFeature([], tf.int64)}
    # NOTE(review): a queue-based reader is created on every call here; when
    # this function runs inside a tf.data map, presumably raw_record is
    # already a serialized example and this reader is a likely source of the
    # "must be from the same graph" error - confirm.
    filename_queue = tf.train.string_input_producer([raw_record], num_epochs=1)
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(serialized_example, features=feature)
    # Use the keys declared above; hard-coding 'train/...' breaks eval mode.
    label = tf.cast(features[label_key], tf.int32)
    label = tf.one_hot(label, _NUM_CLASSES)
    image = tf.decode_raw(features[image_key], tf.float32)
    image = tf.reshape(image, [_HEIGHT, _WIDTH, _NUM_CHANNELS])
    image = preprocess_image(image, is_training)
    return image, label
This is where the Estimator is created (I've not modified this bit)
def resnet_main(flags, model_function, input_function):
"""Shared main loop for ResNet Models.

Builds a ``tf.estimator.Estimator`` around ``model_function`` and then
alternates training and evaluation for
``flags.train_epochs // flags.epochs_between_evals`` cycles.

Args:
flags: Object whose attributes configure the run. Attributes read here:
multi_gpu, batch_size, inter_op_parallelism_threads,
intra_op_parallelism_threads, model_dir, resnet_size, data_format,
version, train_epochs, epochs_between_evals, hooks, benchmark_log_dir,
data_dir, num_parallel_calls and max_train_steps.
model_function: Estimator model_fn; wrapped with ``replicate_model_fn``
when ``flags.multi_gpu`` is set.
input_function: Callable producing the input; called with True for
training and False for evaluation as its first argument, followed by
data_dir, batch_size, an epoch count, num_parallel_calls and multi_gpu.
"""
# Using the Winograd non-fused algorithms provides a small performance boost.
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
if flags.multi_gpu:
validate_batch_size_for_multi_gpu(flags.batch_size)
# There are two steps required if using multi-GPU: (1) wrap the model_fn,
# and (2) wrap the optimizer. The first happens here, and (2) happens
# in the model_fn itself when the optimizer is defined.
model_function = tf.contrib.estimator.replicate_model_fn(
model_function,
loss_reduction=tf.losses.Reduction.MEAN)
# Create session config based on values of inter_op_parallelism_threads and
# intra_op_parallelism_threads. Note that we default to having
# allow_soft_placement = True, which is required for multi-GPU and not
# harmful for other modes.
session_config = tf.ConfigProto(
inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
allow_soft_placement=True)
# Set up a RunConfig to save checkpoint and set session config.
run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9,
session_config=session_config)
# These params are passed to the Estimator alongside the model_fn.
classifier = tf.estimator.Estimator(
model_fn=model_function, model_dir=flags.model_dir, config=run_config,
params={
'resnet_size': flags.resnet_size,
'data_format': flags.data_format,
'batch_size': flags.batch_size,
'multi_gpu': flags.multi_gpu,
'version': flags.version,
})
# One iteration = train for epochs_between_evals epochs, then evaluate once.
for _ in range(flags.train_epochs // flags.epochs_between_evals):
train_hooks = hooks_helper.get_train_hooks(
flags.hooks,
batch_size=flags.batch_size,
benchmark_log_dir=flags.benchmark_log_dir)
print('Starting a training cycle.')
# Zero-argument wrapper, since train() is given an input_fn without arguments.
def input_fn_train():
return input_function(True, flags.data_dir, flags.batch_size,
flags.epochs_between_evals,
flags.num_parallel_calls, flags.multi_gpu)
classifier.train(input_fn=input_fn_train, hooks=train_hooks,
max_steps=flags.max_train_steps)
print('Starting to evaluate.')
# Evaluate the model and print results
def input_fn_eval():
return input_function(False, flags.data_dir, flags.batch_size,
1, flags.num_parallel_calls, flags.multi_gpu)
# flags.max_train_steps is generally associated with testing and profiling.
# As a result it is frequently called with synthetic data, which will
# iterate forever. Passing steps=flags.max_train_steps allows the eval
# (which is generally unimportant in those circumstances) to terminate.
# Note that eval will run for max_train_steps each loop, regardless of the
# global_step count.
eval_results = classifier.evaluate(input_fn=input_fn_eval,
steps=flags.max_train_steps)
print(eval_results)
# Evaluation results are logged only when a benchmark log dir is configured.
if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
benchmark_logger.log_estimator_evaluation_result(eval_results)

Categories

Resources