Why would this dataset implementation run out of memory?

I followed this instruction and wrote the following code to create a Dataset for images (the COCO2014 training set):
from pathlib import Path
import tensorflow as tf

def image_dataset(filepath, image_size, batch_size, norm=True):
    def preprocess_image(image):
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, image_size)
        if norm:
            image /= 255.0  # normalize to [0,1] range
        return image

    def load_and_preprocess_image(path):
        image = tf.read_file(path)
        return preprocess_image(image)

    all_image_paths = [str(f) for f in Path(filepath).glob('*')]
    path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)
    ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.shuffle(buffer_size=len(all_image_paths))
    ds = ds.repeat()
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

ds = image_dataset(train2014_dir, (256, 256), 4, False)
image = ds.make_one_shot_iterator().get_next('images')
# image is then fed to the network
This code always runs out of both main memory (32 GB) and GPU memory (11 GB) and the process gets killed. I also noticed that the program gets stuck at sess.run(opt_op). Where is this going wrong, and how can I fix it?

The problem is this line:
ds = ds.shuffle(buffer_size=len(all_image_paths))
The buffer that Dataset.shuffle() uses is an in-memory buffer, so you are effectively trying to load the whole dataset into memory.
You have a couple of options (which you can combine) to fix this:
Option 1:
Reduce the buffer size to a much smaller number.
Option 2:
Move the shuffle() statement before the map() statement.
This means we would be shuffling before we load the images, so we'd only be storing the filenames in the shuffle buffer rather than huge decoded image tensors, as in the sketch below.
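A minimal sketch combining both fixes, reusing load_and_preprocess_image and all_image_paths from the question (the buffer size of 1000 is an arbitrary illustration):

path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)
# shuffle the filenames (cheap strings) with a bounded buffer,
# before the map that decodes them into large image tensors
path_ds = path_ds.shuffle(buffer_size=1000)
ds = path_ds.map(load_and_preprocess_image,
                 num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds = ds.repeat()
ds = ds.batch(batch_size)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)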

Related

tf.data.Dataset caching causes memory error

I've been trying to use cache() to speed up my training, but every time after ~300 iterations I get an error saying I tried to allocate more memory than is available (I'm doing the training in a Kaggle notebook, so my resources are 13 GB RAM and 16 GB GPU memory). My dataset is about 5 GB in total, and I'm loading it like this:
paths = glob(str(Path(BASE_TRAIN) / '*' / '*'), recursive=True)
ds_train = tf.data.Dataset.list_files(str(Path(BASE_TRAIN) / '*' / '*'))
ds_train = (ds_train.shuffle(len(paths))
            .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .cache()
            .batch(64)
            .prefetch(tf.data.experimental.AUTOTUNE))
This is my load_image function:
def load_image(file_path):
    image = tf.io.read_file(file_path)
    image = tf.io.decode_jpeg(image, channels=3, dct_method='INTEGER_ACCURATE')
    image = tf.image.resize(image, (224, 224), method='nearest')
    image = tf.cast(image, tf.float32) / 255.0
    return image, image
(I'm returning image, image because I'm working on a Convolutional Autoencoder)
So my question is: is it simply that I don't have enough resources to use cache() with a dataset of my size, or am I doing something wrong that can be corrected so that I can use it?
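As a hedged note (my reading, not a confirmed answer from the thread): cache() with no argument keeps every decoded element in RAM, and decoded float32 tensors are typically much larger than the compressed JPEGs on disk, so 5 GB of files can easily exceed 13 GB of RAM once cached. tf.data also supports caching to a file; a minimal sketch, assuming a writable path in the Kaggle working directory and the same load_image as above:

ds_train = (tf.data.Dataset.list_files(str(Path(BASE_TRAIN) / '*' / '*'))
            .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .cache('/kaggle/working/train.cache')  # hypothetical path: spills the cache to disk
            .shuffle(1000)  # bounded buffer; shuffling after cache() reshuffles each epoch
            .batch(64)
            .prefetch(tf.data.experimental.AUTOTUNE))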

Problem reading and augmenting images in tf.data API using CSV / pandas DataFrames

I'm trying to (pre)process and augment my data and target variables when reading in the data each epoch/batch using the tf.data API. My unprocessed data is a CSV/pandas DataFrame with the format
index, img_id, c1, ..., c5, where img_id contains the path to an image and c1, ..., c5 are run-length encodings of different defects in the image; both are strings. To increase the amount of data, I want to augment (e.g. flip) the images (and therefore the masks of defects as well) with a certain probability for each image when reading it each batch/epoch. I want to read each image from my drive to save memory, and because this seems to still yield good performance within the API (due to prefetching etc.).
I'm familiar with doing this using PyTorch's DataLoader API (version 1.8.1+cu111), but as this is for a course where I have to use TensorFlow (version 2.4.1), I read up on the tf.data API and came to the conclusion that I should do this augmentation and reading of the image using the map function. However, even reading the images throws different errors. The following is a mix of the code I've tried; most lines for reading the images are commented out, with an extra comment on the line above giving the error message it produces.
import cv2
import numpy as np
import tensorflow as tf

test = tf.data.experimental.make_csv_dataset("data/mini_formatted.csv", batch_size=4)

def map_fn(df_):
    img_path = df_["img_id"]
    masks = restore_masks(df_)  # get masks from RLE with same shape as images
    imgs = []
    # has to be declared before loop with correct shape, used for reading imgs later
    img = np.empty(shape=(256, 1600, 1), dtype=np.float32)
    # produces TypeError: Can't convert object of type 'Tensor' to 'str' for 'filename'
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    for i in img_path:
        # produces TypeError: Can't convert object of type 'Tensor' to 'str' for 'filename'
        #img = cv2.imread(i, cv2.IMREAD_GRAYSCALE)
        # produces AttributeError: 'NoneType' object has no attribute 'shape'
        #img = cv2.imread(str(i), cv2.IMREAD_GRAYSCALE)
        # produces ValueError: 'img' has shape (256, 1600, 1) before the loop, but shape <unknown> after one iteration. Use tf.autograph.experimental.set_loop_options to set shape invariants.
        #img_file = tf.io.read_file(i)
        #img = tf.io.decode_image(img_file, dtype=tf.float32, channels=1)
        #imgs.append(img)
        pass
    # since img_path is a list, this doesn't work either
    # ValueError: Shape must be rank 0 but is rank 1 for '{{node ReadFile}} = ReadFile[](args_6)' with input shapes: [4].
    img_file = tf.io.read_file(img_path)
    img = tf.io.decode_image(img_file, dtype=tf.float32)
    ##########################################
    #
    # DO AUGMENTING PER BATCH HERE
    #
    ##########################################
    # return augmented images and masks
    return imgs, class_masks

proc_ds = test.map(map_fn)
As you can see, reading the image throws different errors that I do not quite understand, especially because reading the image as follows (i.e. with the exact same commands, after getting the first batch from the dataset without applying the map function) works without problems:
it = test.as_numpy_iterator()
x_proc = it.next()
img_files = [tf.io.read_file(i) for i in x_proc["img_id"]]
imgs = [tf.io.decode_image(img_file, dtype=tf.float32, channels=1) for img_file in img_files]
From my understanding, using the map function on a dataset should execute the code on each example once per epoch, but in the example given it seems the function is executed once per batch, which is what I tried to work around. This doesn't explain to me why the same code doesn't work inside the map function while working fine outside of it.
To help understand what I want to do, I've written a short Dataset/DataLoader in torch as an example of what my desired outputs are.
import cv2
import numpy as np
import torch
import pandas as pd

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, df, mode="train", shuffle=True, augment=False, union=False,
                 greyscale=False, normalize=True):
        self.df = df
        self.length = len(df)
        self.mode = mode
        self.shuffle = shuffle
        self.augment = augment
        self.union = union
        self.greyscale = greyscale
        self.normalize = normalize

    def __len__(self):
        return self.length

    def __getitem__(self, idx_):
        # gets called for a single item when added to batch -> one line of the dataframe
        # in the tf example, these are grouped in an OrderedDict with arrays of length (BATCH_SIZE) as values
        df_ = self.df.loc[idx_]
        img = self._load_img(df_["img_id"])
        if self.union:
            masks = build_masks(df_["c1":"c_all"], union_only=True)
        else:
            masks = build_masks(df_["c1":"c_all"])
        # could also add augmentation here instead of in collate_ds
        if self.mode == "train":
            return {"img": img, "masks": masks}
        return {"img": img, "masks": None}

    def _load_img(self, img_path):
        if self.greyscale:
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        else:
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.normalize:
            img = img.astype(np.float32) / 255.
        else:
            img = img.astype(np.float32)
        return img

    def collate_ds(self, batch):
        # gets called with BATCH_SIZE examples that were processed using __getitem__
        imgs = [d["img"] for d in batch]
        masks = [d["masks"] for d in batch]
        if self.augment:
            # augmentation steps for each image
            pass
        imgs = torch.tensor(imgs, dtype=torch.float32)
        masks = torch.tensor(masks, dtype=torch.float32)
        res = (imgs, masks)
        return res

mini_df = pd.read_csv("data/mini_formatted.csv", index_col=0)
torch_ds = MyDataset(mini_df, mode="train", shuffle=True, augment=False, union=False,
                     greyscale=False, normalize=True)
dataloader = torch.utils.data.DataLoader(torch_ds, batch_size=8, shuffle=True,
                                         collate_fn=torch_ds.collate_ds)
batch = next(iter(dataloader))
print(batch[0].shape, batch[1].shape)
# output: (torch.Size([8, 256, 1600, 3]), torch.Size([8, 256, 1600, 5]))
I still don't understand why even reading the images inside the map function doesn't work: with cv2, neither imread(img_path) (TypeError: Can't convert object of type 'Tensor' to 'str' for 'filename') nor imread(str(i)) (AttributeError: 'NoneType' object has no attribute 'shape', i.e. the image wasn't found) works, and the tf.io.* functions work outside the map function but throw errors when the exact same code is executed inside it.
I would be very thankful for any help on what I'm misunderstanding/doing wrong when using the map function with the tf.data API, and on how I could achieve the same results as the provided torch DataLoader using the tf.data API.
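For what it's worth, a hedged sketch of one reading of the problem (not a confirmed answer): make_csv_dataset yields batches, so map_fn receives a whole batch at once, and Dataset.map traces the function into a TensorFlow graph, so img_path is a symbolic string Tensor rather than a Python str, which cv2.imread cannot consume. Unbatching first lets tf.io.read_file see one scalar path at a time; a plain-Python helper like restore_masks would additionally need to be wrapped in tf.py_function. A minimal sketch under those assumptions:

import tensorflow as tf

test = tf.data.experimental.make_csv_dataset("data/mini_formatted.csv", batch_size=4)

def map_fn(row):
    # row is a single CSV record after unbatch(), so row["img_id"] is a scalar path
    img_file = tf.io.read_file(row["img_id"])
    img = tf.io.decode_image(img_file, dtype=tf.float32, channels=1)
    img.set_shape((256, 1600, 1))  # decode_image returns an unknown static shape
    return img

proc_ds = (test.unbatch()  # back to single rows
           .map(map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
           .batch(4))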

Google colab not loading image files while using tensorflow 2.0 batched dataset

A little bit of background, I am loading about 60,000 images to colab to train a GAN. I have already uploaded them to Drive and the directory structure contains folders for different classes (about 7-8) inside root. I am loading them to colab as follows:
root = "drive/My Drive/data/images"
root = pathlib.Path(root)
list_ds = tf.data.Dataset.list_files(str(root/'*/*'))
for f in list_ds.take(3):
print(f.numpy())
which gives the output:
b'drive/My Drive/data/images/folder_1/2994.jpg'
b'drive/My Drive/data/images/folder_1/6628.jpg'
b'drive/My Drive/data/images/folder_2/37872.jpg'
I am further processing them as follows:
def process_path(file_path):
    label = tf.strings.split(file_path, '/')[-2]
    image = tf.io.read_file(file_path)
    image = tf.image.decode_jpeg(image)
    image = tf.image.convert_image_dtype(image, tf.float32)
    return image  #, label

ds = list_ds.map(process_path)
BUFFER_SIZE = 60000
BATCH_SIZE = 128
train_dataset = ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
Each image is of size 128x128. Now to the problem: when I try to view a batch in Colab, the execution goes on forever and never stops, for example with this code:
for batch in train_dataset.take(4):
    print([arr.numpy() for arr in batch])
Earlier I thought that batch_size might be an issue and tried changing it, but the problem remains. Could it be a problem with Colab, since I am loading a large number of files?
Or could it be due to the size of the images, since it was working with MNIST (28x28)? If so, what are the possible solutions?
Thanks in advance.
EDIT:
After removing the shuffle statement, the last line executes within a few seconds. So I thought the problem might be the BUFFER_SIZE of shuffle, but even with a reduced BUFFER_SIZE it again takes a very long time to execute. Any workaround?
Here is how I load a 1.12 GB zipped FLICKR image dataset from my personal Google Drive. First, I unzip the dataset in the Colab environment. Features that can speed up performance are prefetching and autotuning. Additionally, I use the local Colab cache to store the processed images. This takes ~20 seconds to execute the first time (assuming you have unzipped the dataset). The cache then allows subsequent calls to load very fast.
Assuming you have authorized the Google Drive API, I start by unzipping the folder(s):
!unzip /content/drive/My\ Drive/Flickr8k
!unzip Flickr8k_Dataset
!ls
I then used your code with the addition of prefetch(), autotune, and a cache file.
import pathlib
import tensorflow as tf

def prepare_for_training(ds, cache, BUFFER_SIZE, BATCH_SIZE):
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
        else:
            ds = ds.cache()
    ds = ds.shuffle(buffer_size=BUFFER_SIZE)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds

AUTOTUNE = tf.data.experimental.AUTOTUNE
root = "Flicker8k_Dataset"
root = pathlib.Path(root)
list_ds = tf.data.Dataset.list_files(str(root/'**'))
for f in list_ds.take(3):
    print(f.numpy())

def process_path(file_path):
    label = tf.strings.split(file_path, '/')[-2]
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img)
    img = tf.image.convert_image_dtype(img, tf.float32)
    # resize the image to the desired size
    img = tf.image.resize(img, [128, 128])
    return img  #, label

ds = list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
train_dataset = prepare_for_training(ds, cache="./custom_ds.tfcache", BUFFER_SIZE=600000, BATCH_SIZE=128)
for batch in train_dataset.take(4):
    print([arr.numpy() for arr in batch])
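As a hedged aside (echoing the accepted fix in the first question on this page, not part of this answer): since the hang appears once shuffle() has to buffer decoded images, another sketch is to shuffle the filename dataset before the map, so the buffer only ever holds strings:

list_ds = tf.data.Dataset.list_files(str(root/'*/*'), shuffle=False)
list_ds = list_ds.shuffle(60000)  # 60k filename strings is a small buffer
ds = list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
train_dataset = ds.batch(128).prefetch(AUTOTUNE)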
Here is a way to do it with Keras's flow_from_directory(). The benefit of this approach is that you avoid the TensorFlow shuffle(), which, depending on the buffer size, may require processing the whole dataset. Keras gives you an iterator which you can call to fetch a data batch, and it has random shuffling built in.
import pathlib
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

root = "Flicker8k_Dataset"
BATCH_SIZE = 128

train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
    directory=root,           # this is the source directory for training images
    target_size=(128, 128),   # all images will be resized
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=42,                  # for the shuffle
    classes=[''])

i = 4
for batch in range(i):
    [print(x[0]) for x in next(train_generator)]

How do I create image sequence samples using tf.data?

I want to create image sequence samples using the tf.data API. But as of now, it seems there is no easy way to concatenate multiple images to form a single sample. I have tried to use dataset.window, which groups my images right, but I don't know how to concatenate them.
import tensorflow as tf
from glob import glob

IMG_WIDTH = 256
IMG_HEIGHT = 256

def load_and_process_image(path):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [IMG_WIDTH, IMG_HEIGHT])
    img = tf.reshape(img, shape=(IMG_WIDTH, IMG_HEIGHT, 1, 3))
    return img

def create_dataset(files, time_distance=8, frame_step=1):
    dataset = tf.data.Dataset.from_tensor_slices(files)
    dataset = dataset.map(load_and_process_image)
    dataset = dataset.window(time_distance, 1, frame_step, True)
    # TODO: Concatenate elements from dataset.window
    return dataset

files = sorted(glob('some/path/*.jpg'))
images = create_dataset(files)
I know that I could save my image sequences as TFRecords, but that would make my data pipeline much less flexible and would cost tons of memory.
My input batches should have the form N x W x H x T x C
(N: Number of samples
W: Image Width
H: Image Height
T: Image Sequence length
C: Image Channels).
You can use batching to create batches of size N:
iterations = ...  # the number of batches you want to generate
batched_dataset = dataset.batch(N)
for batch in batched_dataset.take(iterations):
    pass  # process your batch
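The window() call itself yields a dataset of sub-datasets, so as a hedged addition (not part of the answer above): the usual way to collapse each window into a single tensor is flat_map with a batch of the window length. A sketch under that assumption, reusing load_and_process_image from the question:

def create_dataset(files, time_distance=8, frame_step=1):
    dataset = tf.data.Dataset.from_tensor_slices(files)
    dataset = dataset.map(load_and_process_image)                 # (W, H, 1, C) each
    dataset = dataset.window(time_distance, 1, frame_step, True)
    # each window is itself a Dataset; batching it by the window length
    # collapses it into one (T, W, H, 1, C) tensor
    dataset = dataset.flat_map(lambda w: w.batch(time_distance))
    # reorder axes and drop the singleton dim to get W x H x T x C
    dataset = dataset.map(lambda seq: tf.reshape(
        tf.transpose(seq, [1, 2, 0, 3, 4]),
        (IMG_WIDTH, IMG_HEIGHT, time_distance, 3)))
    return dataset

Batching this dataset with dataset.batch(N), as above, then yields the desired N x W x H x T x C samples.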

TensorFlow use dataset to replace function feed_dict

While learning from a TensorFlow project, I found this line of code:
cls_prob, box_pred = sess.run([output_cls_prob, output_box_pred], feed_dict={input_img: blob})
But this line of code takes a lot of time (on the CPU it needs 15 seconds... ┭┮﹏┭┮).
From what I've read, using the Dataset API could solve this problem of it taking a lot of time. How should I use it?
Source of 'blob':
import cv2
import numpy as np

img_data = cv2.imread('./imgs/001.jpg')
img_scale = float(600) / min(img_data.shape[0], img_data.shape[1])
if np.round(img_scale * max(img_data.shape[0], img_data.shape[1])) > 1200:
    img_scale = float(1200) / max(img_data.shape[0], img_data.shape[1])
img_data = cv2.resize(img_data, None, None, fx=img_scale, fy=img_scale, interpolation=cv2.INTER_LINEAR)
img_orig = img_data.astype(np.float32, copy=True)
blob = np.zeros((1, img_data.shape[0], img_data.shape[1], 3), dtype=np.float32)
blob[0, 0:img_data.shape[0], 0:img_data.shape[1], :] = img_orig
Source of 'output_cls_prob', 'output_box_pred' and 'input_img':
# Actually,read PB model...
input_img = sess.graph.get_tensor_by_name('Placeholder:0')
output_cls_prob = sess.graph.get_tensor_by_name('Reshape_2:0')
output_box_pred = sess.graph.get_tensor_by_name('rpn_bbox_pred/Reshape_1:0')
Parameter types:
blob: numpy.ndarray
output_cls_prob: tensorflow.python.framework.ops.Tensor
output_box_pred: tensorflow.python.framework.ops.Tensor
input_img: tensorflow.python.framework.ops.Tensor
tf.data is the recommended API for tensorflow input pipelines. Here is a tutorial on tensorflow.org. For your example, the section "Decoding image data and resizing it" could be most useful. For example, you could do something like:
# Reads an image from a file, decodes it into a dense tensor, and resizes it
# to a fixed shape.
def _parse_function(filename):
    image_string = tf.read_file(filename)
    image_decoded = tf.image.decode_jpeg(image_string)
    image_resized = tf.image.resize_images(image_decoded, [new_width, new_height])
    image_resized = tf.expand_dims(image_resized, 0)  # adds a size-1 batch dimension
    return image_resized

# A vector of filenames.
filenames = tf.constant(["./imgs/001.jpg", ...])
dataset = tf.data.Dataset.from_tensor_slices(filenames)
dataset = dataset.map(_parse_function)
And instead of having input_img be a placeholder, change:
input_img = tf.placeholder(tf.float32)
output_class_prob, output_class_pred = (... use input_img ...)
to:
iterator = dataset.make_one_shot_iterator()
input_img = iterator.get_next()
output_class_prob, output_class_pred = (... use input_img ...)
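With the iterator wired in, the sess.run call no longer needs a feed_dict; a minimal usage sketch:

# the iterator now feeds input_img directly, so no feed_dict is required
cls_prob, box_pred = sess.run([output_cls_prob, output_box_pred])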
First of all, you should know that the Dataset API has a great impact on performance when multiple GPUs are used... otherwise it is almost identical to feed_dict. I recommend you read this other answer from a TF developer; it has almost everything one needs to know to form a mental image of the benefits of the new API.
