Tensorflow parsing and reshaping float list in Dataset.map() - python

I am trying to write a 3D float list into a TFRecord. I write it successfully by flattening it first, and I can parse it back, but reshaping it raises an error:
Error: ValueError: Shapes () and (8,) are not compatible
This is how I write the TFRecord file:
def _floats_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value.flatten()))

def write(output_path, data_rgb, data_depth, data_decalib):
    with tf.python_io.TFRecordWriter(output_path) as writer:
        feature = {'data_rgb': _floats_feature(data_rgb),
                   'data_depth': _floats_feature(data_depth),
                   'data_decalib': _floats_feature(data_decalib)}
        sample = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(sample.SerializeToString())
And this is how I read the TFRecord file:
def get_batches(date, drives, batch_size=1):
    """
    Create a generator that returns batches of tuples
    rgb, depth and calibration
    :param date: date of the drive
    :param drives: array of the drive_numbers within the drive date
    :return: batch generator
    """
    filenames = get_paths_drives(date, drives)
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(input_parser)  # Parse the record into tensors.
    dataset = dataset.repeat()  # Repeat the input indefinitely.
    dataset = dataset.batch(batch_size)
    return dataset
config = configparser.ConfigParser()
config.read(path_helpers.get_config_file_path())
IMAGE_WIDTH = int(config['DATA_INFORMATION']['IMAGE_WIDTH'])
IMAGE_HEIGHT = int(config['DATA_INFORMATION']['IMAGE_HEIGHT'])
INPUT_RGB_SHAPE = [IMAGE_HEIGHT, IMAGE_WIDTH, 3]
INPUT_DEPTH_SHAPE = [IMAGE_HEIGHT, IMAGE_WIDTH, 1]
LABEL_CALIB_SHAPE = [8]
def input_parser(example_proto):
    features = {'data_rgb': tf.FixedLenFeature([], tf.float32),
                'data_depth': tf.FixedLenFeature([], tf.float32),
                'data_decalib': tf.FixedLenFeature([], tf.float32)}
    parsed_features = tf.parse_single_example(example_proto, features)
    data_rgb = parsed_features['data_rgb']
    data_rgb.set_shape(np.prod(INPUT_RGB_SHAPE))
    img_rgb = tf.reshape(data_rgb, INPUT_RGB_SHAPE)
    data_depth = parsed_features['data_depth']
    data_depth.set_shape(np.prod(INPUT_DEPTH_SHAPE))
    img_depth = tf.reshape(data_depth, INPUT_DEPTH_SHAPE)
    data_decalib = parsed_features['data_decalib']
    data_decalib.set_shape(LABEL_CALIB_SHAPE)
    return img_rgb, img_depth, data_decalib

Turns out I needed to change my input parser as follows:
def input_parser(example_proto):
    features = {'data_rgb': tf.FixedLenFeature(shape=[np.prod(INPUT_RGB_SHAPE)], dtype=tf.float32),
                'data_depth': tf.FixedLenFeature(shape=[np.prod(INPUT_DEPTH_SHAPE)], dtype=tf.float32),
                'data_decalib': tf.FixedLenFeature(shape=LABEL_CALIB_SHAPE, dtype=tf.float32)}
    parsed_features = tf.parse_single_example(example_proto, features)
as the documentation for tf.FixedLenFeature (now tf.io.FixedLenFeature) dictates. The first argument is the shape, which I had set to [], hence the error ValueError: Shapes () and (8,) are not compatible. Setting it to the real flattened shapes fixed it.
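For reference, here is a minimal round-trip sketch of the fix (same TF 1.x API as above; the file path and the toy 2x4 shape are mine) that writes one flattened float array and parses it back with an explicit shape:
import numpy as np
import tensorflow as tf

# Toy example: one 2x4 float array, written flattened.
arr = np.arange(8, dtype=np.float32).reshape(2, 4)

with tf.python_io.TFRecordWriter('/tmp/toy.tfrecord') as writer:
    feature = {'data': tf.train.Feature(
        float_list=tf.train.FloatList(value=arr.flatten()))}
    sample = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(sample.SerializeToString())

def parse(example_proto):
    # shape must be the flattened length; [] would mean "a single scalar"
    # and triggers the ValueError above.
    features = {'data': tf.FixedLenFeature(shape=[8], dtype=tf.float32)}
    parsed = tf.parse_single_example(example_proto, features)
    return tf.reshape(parsed['data'], [2, 4])

dataset = tf.data.TFRecordDataset(['/tmp/toy.tfrecord']).map(parse)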

Related

extracting numpy value from tensorflow object during transformation

I am trying to get word embeddings using TensorFlow, and I have created adjacent word lists from my corpus.
The number of unique words in my vocabulary is 8000 and the number of adjacent word lists is around 1.6 million.
[Sample image of the word lists]
Since the data is very large, I am trying to write the word lists in batches to TFRecords files.
def save_tfrecords_wordlist(toprocess_word_lists, path):
    writer = tf.io.TFRecordWriter(path)
    for word_list in toprocess_word_lists:
        features = tf.train.Features(
            feature={
                'word_list_X': tf.train.Feature(bytes_list=tf.train.BytesList(value=[word_list[0].encode('utf-8')])),
                'word_list_Y': tf.train.Feature(bytes_list=tf.train.BytesList(value=[word_list[1].encode('utf-8')]))
            }
        )
        example = tf.train.Example(features=features)
        writer.write(example.SerializeToString())
    writer.close()
Defining the batches:
batches = [0, 250000, 500000, 750000, 1000000, 1250000, 1500000, 1641790]

for i in range(len(batches) - 1):
    batches_start = batches[i]
    batches_end = batches[i + 1]
    print(str(batches_start) + " -- " + str(batches_end))
    toprocess_word_lists = word_lists[batches_start:batches_end]
    save_tfrecords_wordlist(toprocess_word_lists, path + "/TFRecords/data_" + str(i) + ".tfrecords")
##############################
def _parse_function(example_proto):
    features = {"word_list_X": tf.io.FixedLenFeature((), tf.string),
                "word_list_Y": tf.io.FixedLenFeature((), tf.string)}
    parsed_features = tf.io.parse_single_example(example_proto, features)
    """
    word_list_X = parsed_features['word_list_X'].numpy()
    word_list_Y = parsed_features['word_list_Y'].numpy()
    ## need help in getting the numpy values from the parsed_features variable so that I can get the one-hot encoding matrix which can be sent directly to TensorFlow for training
    sample word_list_X value is <tf.Tensor: shape=(10,), dtype=string, numpy=array([b'for', b'for', b'for', b'you', b'you', b'you', b'you', b'to', b'to', b'to'], dtype=object)>
    sample word_list_Y value is <tf.Tensor: shape=(10,), dtype=string, numpy=array([b'is', b'to', b'recommend', b'to', b'for', b'contact', b'is', b'contact', b'you', b'the'], dtype=object)>
    """
    return parsed_features['word_list_X'], parsed_features['word_list_Y']
filenames = [path + "/JustEat_TFRecords/data.tfrecords"]
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(_parse_function)
dataset = dataset.batch(10)

# Defining the size of the embedding
embed_size = 100

# Defining the neural network
inp = tf.keras.Input(shape=(7958,))
x = tf.keras.layers.Dense(units=embed_size, activation='linear')(inp)
x = tf.keras.layers.Dense(units=7958, activation='softmax')(x)

model = tf.keras.Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Optimizing the network weights
# model.fit(x=X, y=Y, batch_size=256, epochs=100)
model.fit(dataset, epochs=2)
It appears that you can't call the .numpy() function from inside the mapping function (1, 2), although I was able to manage by using tf.py_function (doc).
In the example below I have mapped my parsed dataset to a function that converts my images to np.uint8 in order to plot them using matplotlib.
records_path = data_directory + 'TFRecords' + '/data_0.tfrecord'

# Create a dataset
dataset = tf.data.TFRecordDataset(filenames=records_path)

# Map our dataset to the parsing function
parsed_dataset = dataset.map(parsing_fn)

converted_dataset = parsed_dataset.map(lambda image, label:
                                       tf.py_function(func=converting_function,
                                                      inp=[image, label],
                                                      Tout=[np.uint8, tf.int64]))

# Get the iterator
iterator = tf.compat.v1.data.make_one_shot_iterator(converted_dataset)

for i in range(5):
    image, label = iterator.get_next()
    plt.imshow(image)
    plt.show()
    print('label: ', label)
Output: [plotted images with their printed labels]
Parsing Function:
def parsing_fn(serialized):
    # Define a dict with the data-names and types we expect to
    # find in the TFRecords file.
    features = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }
    # Parse the serialized data so we get a dict with our data.
    parsed_example = tf.io.parse_single_example(serialized=serialized,
                                                features=features)
    # Get the image as raw bytes.
    image_raw = parsed_example['image']
    # Decode the raw bytes so it becomes a tensor with type.
    image = tf.io.decode_jpeg(image_raw)
    # Get the label associated with the image.
    label = parsed_example['label']
    # The image and label are now correct TensorFlow types.
    return image, label
Related issue: TF.data.dataset.map(map_func) with Eager Mode
Update: I haven't actually checked it out, but tf.shape() also seems to be a promising alternative.
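For the original word-list question, here is a minimal sketch of the same tf.py_function trick, applied per example before .batch(10). The word_to_id vocabulary below is a placeholder of mine; in practice it would map all 8000 unique words to ids:
import numpy as np
import tensorflow as tf

# Hypothetical vocabulary; build the real one from your corpus.
word_to_id = {b'for': 0, b'you': 1, b'to': 2, b'is': 3}
vocab_size = len(word_to_id)

def to_one_hot(word_x, word_y):
    # Runs eagerly inside tf.py_function, so .numpy() is available here.
    x = np.zeros(vocab_size, dtype=np.float32)
    y = np.zeros(vocab_size, dtype=np.float32)
    x[word_to_id[word_x.numpy()]] = 1.0
    y[word_to_id[word_y.numpy()]] = 1.0
    return x, y

def encode(word_x, word_y):
    x, y = tf.py_function(to_one_hot, inp=[word_x, word_y],
                          Tout=[tf.float32, tf.float32])
    x.set_shape([vocab_size])  # py_function drops static shape info
    y.set_shape([vocab_size])
    return x, y

dataset = dataset.map(encode)  # then .batch(10) as before
A graph-native alternative that avoids the single-threaded Python overhead of py_function would be a tf.lookup table plus tf.one_hot, but the sketch above stays closest to the .numpy() approach the question asked for.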

How to import jpg and npy files in pair for deep learning with tensorflow?

I have 1500 RGB files (.jpg) and 1500 feature-map values (.npy). I want to use them as a dataset for my deep learning project. I am using TensorFlow 1.12.
I wrote them into a .tfrecords file using tf.Example. Here is the code I used to import this file with tf.data (thanks to Uday's comment).
import tensorflow as tf
import numpy as np
import pdb

IMAGE_HEIGHT = 228
IMAGE_WIDTH = 304

def tfdata_generator(tfrname, is_training, batch_size):
    '''Construct a data generator using tf.Dataset'''

    ## You can write your own parse function
    def parse_function(example):
        features = tf.parse_single_example(example, features={
            'image_raw': tf.FixedLenFeature([], tf.string, default_value=""),
            'hint_raw': tf.FixedLenFeature([], tf.string, default_value="")
        })
        image = features['image_raw']
        hint = features['hint_raw']
        image = tf.decode_raw(image, tf.uint8)
        image = tf.cast(image, tf.float32)
        image = tf.reshape(image, [IMAGE_HEIGHT, IMAGE_WIDTH, 3])
        hint = tf.decode_raw(hint, tf.uint8)
        hint = tf.cast(hint, tf.float32)
        hint = tf.reshape(hint, [8, 10, 1024])
        return image, hint

    dataset = tf.data.TFRecordDataset(tfrname)
    #pdb.set_trace()
    if is_training:
        dataset = dataset.shuffle(100)  # depends on sample size
        #pdb.set_trace()
    # Transform and batch data at the same time
    dataset = dataset.apply(tf.data.experimental.map_and_batch(
        parse_function, 8, num_parallel_batches=4))  # cpu cores
    dataset = dataset.repeat(-1)
    dataset = dataset.prefetch(2)
    return dataset
I set the batch_size to 8, but when I was debugging, the shape of the dataset was
((?, 228, 304, 3), (?, 8, 10, 1024)), types: (tf.float32, tf.float32)
Is this correct? Is this code wrong, or did I make mistakes when creating the tfrecords?
You can use code like the one below:
def tfdata_generator(images, labels, is_training, batch_size=32):
    '''Construct a data generator using tf.Dataset'''

    ## You can write your own parse function
    def parse_function(filename, label):
        image_string = tf.read_file(filename)
        image = tf.image.decode_jpeg(image_string)
        image = tf.image.convert_image_dtype(image, tf.float32)
        y = tf.one_hot(tf.cast(label, tf.uint8), 16)
        return image, y

    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    if is_training:
        dataset = dataset.shuffle(1000)  # depends on sample size
    # Transform and batch data at the same time
    dataset = dataset.apply(tf.data.experimental.map_and_batch(
        parse_function, batch_size,
        num_parallel_batches=6,  # cpu cores
        drop_remainder=True if is_training else False))
    dataset = dataset.repeat()
    dataset = dataset.prefetch(no_of_prefetch_needed)
    return dataset
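As for the ((?, 228, 304, 3), (?, 8, 10, 1024)) shape in the question: that looks correct. The ? is just the dynamic batch dimension, left unknown because the last batch of a file can hold fewer than 8 examples. If a static batch dimension is needed, the drop_remainder flag used in the answer above does exactly that; applied to the question's own call it would be a one-line change:
# Drop the short final batch so the batch dimension is a static 8, not ?.
dataset = dataset.apply(tf.data.experimental.map_and_batch(
    parse_function, 8, num_parallel_batches=4, drop_remainder=True))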

tensorflow performance bottleneck on IteratorGetNext and less efficient than tf.FIFOQueue

I am training a speech recognition network with TensorFlow and previously used tf.FIFOQueue to feed data, but the training speed didn't meet the requirements. Training slowed down further when I switched to tf.data.TFRecordDataset after transforming the binary files to tfrecords. I have consulted https://www.tensorflow.org/guide/performance/datasets; the pre-processing code is as follows:
def read_and_decode(loader, handle, num_epochs=1):
    """Read tfrecord-format data."""
    batch_size = loader.batch_size()
    feature_size = model_settings['fingerprint_size']

    def parse_exmp(serialized_example):
        features = tf.parse_single_example(serialized_example, features={
            'feature': tf.VarLenFeature(tf.float32),
            'label': tf.VarLenFeature(tf.int64),
            'mask': tf.VarLenFeature(tf.int64),
            'length': tf.FixedLenFeature((), tf.int64)
        })
        length = tf.cast(features['length'], tf.int32)
        feature = tf.sparse_tensor_to_dense(features['feature'])
        feature = tf.reshape(feature, [length, feature_size])
        label = tf.sparse_tensor_to_dense(features['label'])
        mask = tf.sparse_tensor_to_dense(features['mask'])
        return feature, label, mask, length

    '''
    filenames = tf.data.Dataset.list_files("./train_input/tfrecords_file/train_dataset_*.tfrecords")
    dataset = filenames.apply(tf.contrib.data.parallel_interleave(
        lambda filename: tf.data.TFRecordDataset(filename), cycle_length=4))
    '''
    filenames = ['./train_input/tfrecords_file/train_dataset_%d.tfrecords' % i for i in range(cpu_count() - 1)]
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=10)
    dataset = dataset.map(parse_exmp, num_parallel_calls=48)
    dataset = dataset.prefetch(buffer_size=batch_size)
    dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(10000, num_epochs))
    dataset = dataset.padded_batch(batch_size, padded_shapes=([None, feature_size], [None], [None], []))

    train_iterator = dataset.make_initializable_iterator()
    iterator = tf.data.Iterator.from_string_handle(
        handle, dataset.output_types, dataset.output_shapes)
    batch_data, batch_label, batch_mask, batch_length = iterator.get_next()
    batch_data = [tf.transpose(data, (1, 0, 2)) for data in tf.split(batch_data, FLAGS.gpu_num, axis=0)]
    batch_label = tf.split(batch_label, FLAGS.gpu_num, axis=0)
    batch_mask = tf.split(batch_mask, FLAGS.gpu_num, axis=0)
    return (train_iterator, batch_data,
            [tf.transpose(label, (1, 0)) for label in batch_label],
            [tf.transpose(mask, (1, 0)) for mask in batch_mask],
            batch_length)
Here is my speed chart:
[speed chart image]
We can see that tf.data.Dataset is slower than tf.FIFOQueue. The timeline for tf.FIFOQueue: [timeline image]. The timeline for tf.data.Dataset:
[timeline image]
We can see that IteratorGetNext takes too long, about 200 ms, and tf.data.Dataset is also slow even outside of IteratorGetNext.
My question is: why is tf.data.Dataset so much slower than tf.FIFOQueue even though I have optimized the code?
Thanks.
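One thing worth trying, following the performance guide linked in the question: shuffle and repeat the cheap serialized strings before the expensive parse (so the 10000-element shuffle buffer holds small byte strings instead of large parsed tensors), and move prefetch to the very end so whole padded batches are prepared while the GPUs train. A hedged reordering sketch of the pipeline above (buffer sizes are illustrative; cpu_count comes from multiprocessing, as in the original code):
dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=10)
# Shuffle/repeat raw serialized records: much cheaper to buffer than
# parsed [length, feature_size] tensors.
dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(10000, num_epochs))
# Matching num_parallel_calls to the physical core count usually beats
# oversubscription (48 parallel calls on fewer cores adds scheduling cost).
dataset = dataset.map(parse_exmp, num_parallel_calls=cpu_count())
dataset = dataset.padded_batch(
    batch_size, padded_shapes=([None, feature_size], [None], [None], []))
# Prefetch whole batches last, so IteratorGetNext overlaps with training.
dataset = dataset.prefetch(buffer_size=2)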

TensorFlow Dataset API tensor evaluation within map function

I have the following dataset input function to create a Dataset generator.
def dataset_input_fn(filenames, shuffle, batch_size, sample):
    def parser(record):
        features = {
            'mean_rgb': tf.FixedLenFeature([1024], tf.float32),
            'category': tf.FixedLenFeature([], tf.int64)
        }
        parsed = tf.parse_single_example(record, features)
        vrv = parsed['mean_rgb']
        label = tf.cast(parsed['category'], tf.int32)
        return {"mean_rgb": vrv}, label

    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(parser)
    if sample:
        dataset = dataset.flat_map(
            lambda x, y: tf.data.Dataset.from_tensors((x, y)).repeat(oversample_classes(y))
        )
        dataset = dataset.filter(undersampling_filter)
    dataset = dataset.shuffle(buffer_size=100 * batch_size)
    dataset = dataset.batch(batch_size).repeat(1)
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels
I am trying to follow this code to over/undersample data based on the label. Within my dataset.flat_map function I iterate over each label and would like to determine how often to repeat it. However, y is a Tensor, and I am unable to evaluate it as an integer. When I try sess.run(label) I get:
ValueError: Fetch argument cannot be interpreted as a Tensor. (Tensor Tensor("arg1:0", shape=(), dtype=int32) is not an element of this graph.)
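The way out is to never evaluate y at all: Dataset.repeat() accepts a scalar int64 tensor, so oversample_classes only has to compute the repeat count with graph ops. A minimal sketch, with a hypothetical per-class rate table standing in for whatever policy is needed:
# Hypothetical oversampling factors, indexed by class id.
oversample_rates = tf.constant([1, 4, 2, 8], dtype=tf.int64)

def oversample_classes(label):
    # Returns a scalar int64 *tensor*; no sess.run() is needed because
    # Dataset.repeat() takes tensors directly.
    return oversample_rates[tf.cast(label, tf.int64)]

dataset = dataset.flat_map(
    lambda x, y: tf.data.Dataset.from_tensors((x, y)).repeat(oversample_classes(y))
)
The sess.run(label) attempt fails because the map function is traced into the dataset's own function graph, which is not the graph the session was created for, hence the "not an element of this graph" message.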

ValueError: Tensor A must be from same graph as Tensor B

I am trying to run this ResNet with a few alterations: https://github.com/tensorflow/models/tree/master/official/resnet
After looking up the error, I understand the problem to be one of the following:
That the tensors belong to different graphs, but I can't figure out how that came to be, as I'm not creating any graphs myself.
That I have uninitialized variables in my parser-function replacement.
If it's the initialization, how should I initialize the variables when using Estimator, which auto-initializes and creates sessions itself?
This is the error:
ValueError: Tensor("IsVariableInitialized:0", shape=(), dtype=bool) must be from the same graph as Tensor("report_uninitialized_variables/IsVariableInitialized:0", shape=(), dtype=bool).
The entire code is very voluminous, so I'll only supply what I've made changes to (the code runs without these changes); the rest is untouched (repo in the link above).
This is the original parser function (it reads from binary files):
def parse_record(raw_record, is_training):
    """Parse CIFAR-10 image and label from a raw record."""
    # Convert bytes to a vector of uint8 that is record_bytes long.
    record_vector = tf.decode_raw(raw_record, tf.uint8)
    # The first byte represents the label, which we convert from uint8 to int32
    # and then to one-hot.
    label = tf.cast(record_vector[0], tf.int32)
    label = tf.one_hot(label, _NUM_CLASSES)
    # The remaining bytes after the label represent the image, which we reshape
    # from [depth * height * width] to [depth, height, width].
    depth_major = tf.reshape(record_vector[1:_RECORD_BYTES],
                             [_NUM_CHANNELS, _HEIGHT, _WIDTH])
    # Convert from [depth, height, width] to [height, width, depth], and cast as
    # float32.
    image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
    image = preprocess_image(image, is_training)
    return image, label
And this is my replacement, which reads from TFRecords:
def parse_record(raw_record, is_training):
    mode = 'train' if is_training else 'val'
    feature = {mode + '/image': tf.FixedLenFeature([], tf.string),
               mode + '/label': tf.FixedLenFeature([], tf.int64)}
    filename_queue = tf.train.string_input_producer([raw_record], num_epochs=1)
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(serialized_example, features=feature)
    label = tf.cast(features['train/label'], tf.int32)
    label = tf.one_hot(label, _NUM_CLASSES)
    image = tf.decode_raw(features['train/image'], tf.float32)
    image = tf.reshape(image, [_HEIGHT, _WIDTH, _NUM_CHANNELS])
    image = preprocess_image(image, is_training)
    return image, label
This is where the Estimator is created (I've not modified this bit):
def resnet_main(flags, model_function, input_function):
    """Shared main loop for ResNet Models."""
    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)
        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function,
            loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9,
                                                  session_config=session_config)
    classifier = tf.estimator.Estimator(
        model_fn=model_function, model_dir=flags.model_dir, config=run_config,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': flags.data_format,
            'batch_size': flags.batch_size,
            'multi_gpu': flags.multi_gpu,
            'version': flags.version,
        })

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu)

        classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                         max_steps=flags.max_train_steps)

        print('Starting to evaluate.')

        # Evaluate the model and print results
        def input_fn_eval():
            return input_function(False, flags.data_dir, flags.batch_size,
                                  1, flags.num_parallel_calls, flags.multi_gpu)

        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps)
        print(eval_results)

        if flags.benchmark_log_dir is not None:
            benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
            benchmark_logger.log_estimator_evaluation_result(eval_results)
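A likely culprit is the replacement parse_record itself: it builds queue-based input ops (tf.train.string_input_producer creates local epoch-counter variables, and tf.TFRecordReader is a stateful op) inside a function that the Estimator calls while constructing its own fresh graph, which is a classic way to end up with tensors from two different graphs. Since the official pipeline hands parse_record one already-serialized tf.train.Example, no reader or queue is needed. A hedged sketch of that fix, assuming the input_fn feeds serialized Examples via tf.data.TFRecordDataset (note the original also hard-coded 'train/...' keys even in val mode):
def parse_record(raw_record, is_training):
    """Parse one serialized tf.train.Example; no readers or queues needed."""
    mode = 'train' if is_training else 'val'
    feature = {mode + '/image': tf.FixedLenFeature([], tf.string),
               mode + '/label': tf.FixedLenFeature([], tf.int64)}
    # raw_record is already a single serialized Example from the dataset.
    features = tf.parse_single_example(raw_record, features=feature)
    label = tf.cast(features[mode + '/label'], tf.int32)
    label = tf.one_hot(label, _NUM_CLASSES)
    image = tf.decode_raw(features[mode + '/image'], tf.float32)
    image = tf.reshape(image, [_HEIGHT, _WIDTH, _NUM_CHANNELS])
    image = preprocess_image(image, is_training)
    return image, label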
