TensorFlow 2.0 - Beginner implementing a simple CNN - Python

I just finished the DL specialization on Coursera and I am trying to implement a CNN with TensorFlow 2.0 and my own collected data. I followed the guide and documentation from tensorflow.org and was able to set up a pipeline to load my images. However, when I ran the model I kept running into memory/resource-related issues.
My model should do multi-label classification with 30 categories. Below is my code:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
#Import helper modules
import os
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as display
from PIL import Image

AUTOTUNE = tf.data.experimental.AUTOTUNE
#Define path to directories
train_dir = pathlib.Path.cwd() / 'train'
validation_dir = pathlib.Path.cwd() / 'validation'
test_dir = pathlib.Path.cwd() / 'test'
#Read csv file containing filename and label
train_csv = pd.read_csv(pathlib.Path.cwd() / 'train.csv')
validation_csv = pd.read_csv(pathlib.Path.cwd() / 'validation.csv')
#Define total number of training and validation set
total_train = train_csv.shape[0]
total_val = validation_csv.shape[0]
print(f'Total training images: {total_train}')
print(f'Total validation images: {total_val}')
The code below is from https://www.tensorflow.org/tutorials/load_data/images:
CLASS_NAMES = np.array([item.name for item in train_dir.glob('*')])
#set up variables
BATCH_SIZE = 128
TRAIN_STEPS_PER_EPOCH = np.ceil(total_train/BATCH_SIZE)
VAL_STEPS_PER_EPOCH = np.ceil(total_val/BATCH_SIZE)
IMG_HEIGHT = 150
IMG_WIDTH = 150
#using tf.data.Dataset
train_list_ds = tf.data.Dataset.list_files(str(train_dir/'*/*'))
valid_list_ds = tf.data.Dataset.list_files(str(validation_dir/'*/*'))
def get_label(file_path):
    # convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # the second-to-last component is the class directory
    return parts[-2] == CLASS_NAMES
def decode_img(img):
    # convert the compressed string to a 3D uint8 tensor
    img = tf.image.decode_jpeg(img, channels=3)
    # use `convert_image_dtype` to convert to floats in the [0,1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # resize the image to the desired size
    return tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH])
def process_path(file_path):
    label = get_label(file_path)
    # load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img, label
def show_batch(image_batch, label_batch):
    plt.figure(figsize=(10, 10))
    for n in range(25):
        ax = plt.subplot(5, 5, n + 1)
        plt.imshow(image_batch[n])
        plt.title(CLASS_NAMES[label_batch[n] == 1][0].title())
        plt.axis('off')
# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
train_labeled_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
valid_labeled_ds = valid_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    # use `.cache(filename)` to cache preprocessing work for datasets
    # that don't fit in memory
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
        else:
            ds = ds.cache()
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    # repeat forever
    ds = ds.repeat()
    ds = ds.batch(BATCH_SIZE)
    # `prefetch` lets the dataset fetch batches in the background
    # while the model is training
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds
train_ds = prepare_for_training(train_labeled_ds)
x_train, y_train = next(iter(train_ds))
valid_ds = prepare_for_training(valid_labeled_ds)
The model is from the tutorial https://www.tensorflow.org/tutorials/images/classification:
model = Sequential([
    Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    MaxPooling2D(),
    Conv2D(32, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Conv2D(64, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(512, activation='relu'),
    Dense(30, activation='softmax')
])
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.summary()
model.fit(train_ds.repeat(),
          epochs=15, batch_size=BATCH_SIZE,
          validation_data=valid_ds.repeat(),
          steps_per_epoch=TRAIN_STEPS_PER_EPOCH,
          validation_steps=VAL_STEPS_PER_EPOCH)
At first I ran into "Input ran out of data", so I changed my input to train_ds.repeat() instead of x=x_train, y=y_train.
The next issue I encountered is:
100/741 [===>..........................] - ETA: 18:07 - loss: 3.7188 - accuracy: 0.0494
2020-06-23 14:52:29.232604: E tensorflow/core/lib/jpeg/jpeg_mem.cc:323] Premature end of JPEG data. Stopped at line 910/1000
Traceback (most recent call last):
File "product_detection.py", line 123, in <module>
model.fit(train_ds.repeat(), epochs=15, batch_size= BATCH_SIZE, validation_data=valid_ds.repeat(), steps_per_epoch= TRAIN_STEPS_PER_EPOCH, validation_steps=VAL_STEPS_PER_EPOCH)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
tmp_logs = train_function(iterator)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 2420, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1665, in _filtered_call
self.captured_inputs)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1746, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 598, in call
ctx=ctx)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
(1) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_2]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1113]
Function call stack:
train_function -> train_function
2020-06-23 14:52:29.411288: W tensorflow/core/kernels/data/cache_dataset_ops.cc:794] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
At this point I am completely lost. I suspect the prepare_for_training function I copied is not suitable for my application, but I don't understand enough to make changes. The tutorial explicitly says it is for a dataset of 1000+ images, while I'm working with 90k training and 10k validation images. I tried changing the batch_size, yet the issue persists.
I am using tensorflow-gpu with a GTX 1050 Ti. May I ask for a pointer on how to proceed? Thank you in advance.
Edit 1: Changed my batch_size to 10 and this error appeared:
3537/9484 [==========>...................] - ETA: 12:31 - loss: 3.7519 - accuracy: 0.0260
2020-06-23 16:10:13.148245: W tensorflow/core/framework/op_kernel.cc:1753] OP_REQUIRES failed at cast_op.cc:109 : Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
2020-06-23 16:10:13.159005: W tensorflow/core/framework/op_kernel.cc:1753] OP_REQUIRES failed at cast_op.cc:109 : Resource exhausted: OOM when allocating tensor with shape[678,678,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
3538/9484 [==========>...................] - ETA: 12:31 - loss: 3.7519 - accuracy: 0.0260
Resource exhausted: OOM when allocating tensor with shape[956,956,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
tensorflow.python.framework.errors_impl.ResourceExhaustedError: 2 root error(s) found.
(0) Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[{{node convert_image/Cast}}]]
[[IteratorGetNext]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[IteratorGetNext/_2]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
(1) Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[{{node convert_image/Cast}}]]
[[IteratorGetNext]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

According to this section of the output:
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
(1) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_2]]
It seems like some of your data is corrupted. Check your data and remove the corrupted entries; this is a common error that you can read more about HERE and THERE.
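For "Premature end of JPEG data" / "Invalid JPEG data or crop window", a minimal sketch of such a check, assuming the same train/ directory layout as in the question: decode every file eagerly, the same way the input pipeline does, and flag the ones that raise. (Some merely truncated files only produce a warning rather than an error, so treat the output as a starting point.)
import pathlib
import tensorflow as tf

for path in pathlib.Path('train').glob('*/*'):
    try:
        data = tf.io.read_file(str(path))
        tf.io.decode_jpeg(data, channels=3)  # raises InvalidArgumentError on corrupt JPEGs
    except tf.errors.InvalidArgumentError:
        print(f'Corrupt JPEG, consider removing: {path}')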

Related

Loading a testing set into a Dvector generator that works for the training set

So I am having this issue: I created D-vectors from a checkpoint, and they work for the training set but not for the testing set.
model.load_state_dict(checkpoint['state_dict'])
model.eval()
with torch.no_grad():
    dvectors = model.generateDVec(x)
Here is generateDVec():
def generateDVec(self, x):
    batch_size, num_chunks, features = x.shape
    x = x.reshape((batch_size * num_chunks, 1, features))
    embeddings = self.encoder(x)  # TODO
    embeddings = embeddings.reshape(batch_size, num_chunks, -1)
    return self.classifier.generateVector(embeddings)  # dvectors
However I am getting this error:
Traceback (most recent call last):
    model.load_state_dict(checkpoint['state_dict'])
  File "C:\Users\me\AppData\Roaming\Python\Python38\site-packages\torch\nn\modules\module.py", line 1051, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for Model:
    size mismatch for classifier.proj.weight: copying a param with shape torch.Size([462, 1024]) from checkpoint, the shape in current model is torch.Size([168, 1024]).
    size mismatch for classifier.proj.bias: copying a param with shape torch.Size([462]) from checkpoint, the shape in current model is torch.Size([168]).
The 462 is there because there were 462 people in my training file path, and 168 in testing... What could be going wrong there?
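If the classifier projection head is only needed during training and the D-vectors come from the encoder, one hedged workaround is to load only the checkpoint entries whose shapes match the current model. This is a sketch under that assumption, not a confirmed fix for this setup (the checkpoint path is illustrative):
import torch

checkpoint = torch.load('checkpoint.pt')
state = checkpoint['state_dict']
model_state = model.state_dict()
# keep only parameters whose shapes match the current model, then load
# non-strictly so the mismatched classifier head is simply skipped
filtered = {k: v for k, v in state.items()
            if k in model_state and v.shape == model_state[k].shape}
model.load_state_dict(filtered, strict=False)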

MemoryError: Unable to allocate 64.0 KiB for an array with shape (1, 128, 128) and data type float32

I want to perform segmentation using a CNN approach, so I used the Keras ImageDataGenerator to generate more data and fed it into my network.
Whenever I run the code I get this error:
File "C:\Users\abirf\AppData\Local\Continuum\anaconda3\envs\deep_learning\lib\site-packages\numpy\core\shape_base.py", line 434, in stack
return _nx.concatenate(expanded_arrays, axis=axis, out=out)
File "<array_function internals>", line 6, in concatenate
MemoryError: Unable to allocate 64.0 KiB for an array with shape (1, 128, 128) and data type float32
What's the problem exactly?
This is a snippet of my code:
X_path = os.path.join('.........../train_data/', 'images')  # input images
Y_path = os.path.join('........./train_data/', 'masks')     # ground-truth labels
# we create two instances with the same arguments
data_gen_args = dict(featurewise_center=True,
                     featurewise_std_normalization=True,
                     rotation_range=45.,
                     width_shift_range=0.1,
                     height_shift_range=0.1,
                     zoom_range=[0.2])
seed = 1
image_datagen = ImageDataGenerator(**data_gen_args)
mask_datagen = ImageDataGenerator(**data_gen_args)
image_generator = image_datagen.flow_from_directory(X_path, class_mode=None, batch_size=16, seed=seed, target_size=(img_col, img_row), color_mode='grayscale')
mask_generator = mask_datagen.flow_from_directory(Y_path, class_mode=None, batch_size=16, seed=seed, target_size=(img_col, img_row), color_mode='grayscale')
train_generator = zip(image_generator, mask_generator)
num_train = len(image_generator)
#########################################################
this contains the architecture used to perform the training
#########################################################
history = model.fit(list(train_generator), steps_per_epoch=num_train, shuffle=True, validation_split=0.1 , batch_size=16, epochs=50,callbacks=[earlystopper, checkpointer])
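One hedged observation on the snippet above: zip() of two infinite Keras generators never terminates, so list(train_generator) tries to materialize an endless stream of batches, which on its own can exhaust memory. A sketch of passing a generator to fit() instead (with tf.keras 2.x; older Keras would use fit_generator, and validation_split is not supported with generators, so a separate validation generator would be needed):
def combine_generators(image_gen, mask_gen):
    # yield (image_batch, mask_batch) pairs lazily instead of materializing them
    for images, masks in zip(image_gen, mask_gen):
        yield images, masks

history = model.fit(combine_generators(image_generator, mask_generator),
                    steps_per_epoch=num_train,
                    epochs=50,
                    callbacks=[earlystopper, checkpointer])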

How to fix "ResourceExhaustedError: OOM when allocating tensor"

I want to make a model with multiple inputs, so I tried to build a model like this:
# imports added for completeness (assuming tf.keras)
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# define two sets of inputs
inputA = Input(shape=(32, 64, 1))
inputB = Input(shape=(32, 1024))

# CNN
x = layers.Conv2D(32, kernel_size=(3, 3), activation='relu')(inputA)
x = layers.Conv2D(32, (3, 3), activation='relu')(x)
x = layers.MaxPooling2D(pool_size=(2, 2))(x)
x = layers.Dropout(0.2)(x)
x = layers.Flatten()(x)
x = layers.Dense(500, activation='relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(500, activation='relu')(x)
x = Model(inputs=inputA, outputs=x)

# DNN
y = layers.Flatten()(inputB)
y = Dense(64, activation="relu")(y)
y = Dense(250, activation="relu")(y)
y = Dense(500, activation="relu")(y)
y = Model(inputs=inputB, outputs=y)

# combine the output of the two models
combined = concatenate([x.output, y.output])

# combined outputs (each Dense feeds the next)
z = Dense(300, activation="relu")(combined)
z = Dense(100, activation="relu")(z)
z = Dense(1, activation="softmax")(z)

model = Model(inputs=[x.input, y.input], outputs=z)
model.summary()
opt = Adam(lr=1e-3, decay=1e-3 / 200)
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt,
              metrics=['accuracy'])
and the summary: [the model.summary() output was posted as an image and is not reproduced here]
But when I try to train this model,
history = model.fit([trainimage, train_product_embd], train_label,
                    validation_data=([validimage, valid_product_embd], valid_label),
                    epochs=10, steps_per_epoch=100, validation_steps=10)
the problem happens:
ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input-18-2b79f16d63c0> in <module>()
----> 1 history = model.fit([trainimage, train_product_embd],train_label, validation_data=([validimage,valid_product_embd],valid_label), epochs=10, steps_per_epoch=100, validation_steps=10)

4 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in __call__(self, *args, **kwargs)
   1470       ret = tf_session.TF_SessionRunCallable(self._session._session,
   1471                                              self._handle, args,
-> 1472                                              run_metadata_ptr)
   1473       if run_metadata:
   1474         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted: OOM when allocating tensor with shape[800000,32,30,62] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
     [[{{node conv2d_1/convolution}}]]
  Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
     [[metrics/acc/Mean_1/_185]]
  Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
  (1) Resource exhausted: OOM when allocating tensor with shape[800000,32,30,62] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
     [[{{node conv2d_1/convolution}}]]
  Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
0 successful operations. 0 derived errors ignored.
Thanks for reading and hopefully helping me :)
OOM stands for "out of memory". Your GPU is running out of memory, so it can't allocate memory for this tensor. There are a few things you can do:
Decrease the number of filters in your Dense, Conv2D layers
Use a smaller batch_size (or increase steps_per_epoch and validation_steps)
Use grayscale images (you can use tf.image.rgb_to_grayscale)
Reduce the number of layers
Use MaxPooling2D layers after convolutional layers
Reduce the size of your images (you can use tf.image.resize for that)
Use smaller float precision for your input, e.g. np.float32 instead of np.float64
If you're using a pre-trained model, freeze the first layers (like this)
There is more useful information about this error:
OOM when allocating tensor with shape[800000,32,30,62]
This is a weird shape. If you're working with images, you should normally have 3 or 1 channel. On top of that, it seems like you are passing your entire dataset at once; you should instead pass it in batches.
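A minimal sketch of the grayscale and resize remedies from the list above, assuming images is a float tensor of shape [N, H, W, 3] (the 64x64 target size is arbitrary):
import tensorflow as tf

# Convert RGB to one channel and downsample: both shrink the activation
# tensors that the OOM message is complaining about.
images = tf.image.rgb_to_grayscale(images)  # [N, H, W, 3] -> [N, H, W, 1]
images = tf.image.resize(images, [64, 64])  # [N, H, W, 1] -> [N, 64, 64, 1]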
From [800000,32,30,62] it seems your model is putting all the data in one batch.
Try a specified batch size, like:
history = model.fit([trainimage, train_product_embd], train_label,
                    validation_data=([validimage, valid_product_embd], valid_label),
                    epochs=10, steps_per_epoch=100, validation_steps=10, batch_size=32)
If it still OOMs, try reducing the batch_size further.
Happened to me as well.
You can try reducing the number of trainable parameters by using some form of transfer learning: freeze the initial few layers and use lower batch sizes.
I think the most common reason for this case to arise is the absence of MaxPooling layers.
Use the same architecture, but add at least one MaxPool layer after the Conv2D layers; this might even improve the overall performance of the model.
You can also try reducing the depth of the model, i.e., removing unnecessary layers and optimizing.
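A hedged sketch of the transfer-learning suggestion, with an illustrative backbone, input size, and head (none of these come from the question): freeze a pre-trained base so only the small head is trained.
import tensorflow as tf

base = tf.keras.applications.MobileNetV2(input_shape=(96, 96, 3),
                                         include_top=False,
                                         weights='imagenet')
base.trainable = False  # freeze all pre-trained layers

model = tf.keras.Sequential([
    base,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(1, activation='sigmoid'),  # illustrative binary head
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
Freezing the base cuts both the trainable-parameter count and the gradient memory, which is usually what makes the difference for OOM.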

Training with multiple GPUs with TensorFlow 2.0 gets error: Out of range: End of sequence

I am training with TensorFlow 2.0 on multiple GPUs and get the following errors, but if I use only one GPU it runs without any error. My TensorFlow version is tensorflow-gpu-2.0.0:
tensorflow.python.framework.errors_impl.CancelledError: 4 root error(s) found.
(0) Cancelled: Operation was cancelled
[[{{node cond_6/else/_59/IteratorGetNext}}]]
(1) Out of range: End of sequence
[[{{node cond_4/else/_37/IteratorGetNext}}]]
(2) Out of range: End of sequence
[[{{node cond_7/else/_70/IteratorGetNext}}]]
[[metrics/accuracy/div_no_nan/ReadVariableOp_6/_154]]
(3) Out of range: End of sequence
[[{{node cond_7/else/_70/IteratorGetNext}}]]
0 successful operations.
1 derived errors ignored. [Op:__inference_distributed_function_83325]
Function call stack:
distributed_function -> distributed_function -> distributed_function -> distributed_function
This is my code. You can try it with the environment variable CUDA_VISIBLE_DEVICES=0 or CUDA_VISIBLE_DEVICES=0,1; you will get different results:
import tensorflow as tf
import tensorflow_datasets as tfds
data_name = 'uc_merced'
dataset = tfds.load(data_name)
train_data, test_data = dataset['train'], dataset['train']
def parse(img_dict):
    img = tf.image.resize_with_pad(img_dict['image'], 256, 256)
    label = img_dict['label']
    return img, label
train_data = train_data.map(parse)
train_data = train_data.batch(96)
test_data = test_data.map(parse)
test_data = test_data.batch(96)
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = tf.keras.applications.ResNet50(weights=None, classes=21, input_shape=(256, 256, 3))
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

model.fit(train_data, epochs=50, verbose=2, validation_data=test_data)
model.save('model/resnet_{}.h5'.format(data_name))
Instead of selecting GPUs using CUDA_VISIBLE_DEVICES, can you try the following? (Note that the devices argument belongs to the MirroredStrategy constructor, not to scope().)
strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
with strategy.scope():
    ...
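If the error persists, one hedged workaround for end-of-sequence errors under MirroredStrategy is to make the dataset repeat and give Keras an explicit number of steps per epoch, so no replica runs the iterator dry mid-epoch (2100 is the size of uc_merced's train split, if I recall correctly; adjust to your data):
steps_per_epoch = 2100 // 96  # dataset size // batch size
train_data = train_data.repeat()  # infinite stream; steps_per_epoch bounds each epoch
model.fit(train_data, epochs=50, verbose=2, steps_per_epoch=steps_per_epoch)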

Keras: Exception when training YOLO model: OOM when allocating tensor

I want to run an implementation of the YOLO algorithm (object detection) with Keras. The code I use comes mostly from here.
I am trying to train my model with a sample of the Open Images Dataset V4 from Google.
The problem is that, when I try to train my model, I get the following warnings and exception:
W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 831.81MiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 380.25MiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
W tensorflow/core/common_runtime/bfc_allocator.cc:267] Allocator (GPU_0_bfc) ran out of memory trying to allocate 84.50MiB. Current allocation summary follows.
...
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[8,64,208,208] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node conv2d_3/Conv2D}} = Conv2D[T=DT_FLOAT, _class=["loc:#training/Adam/gradients/conv2d_3/Conv2D_grad/Conv2DBackpropInput"], data_format="NHWC", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](leaky_re_lu_2/LeakyRelu, conv2d_3/Conv2D/ReadVariableOp)]]
(Here I am using the tensorflow-gpu lib, but I get a similar error with the non-GPU tensorflow.)
At first I thought it was because of the size of my dataset (200,000 pictures => ~60GB), but when running the code with a minimal sample (500 pictures => ~150MB) I get exactly the same error.
So I guess there is a problem with my code.
Here is a minimal example of the problematic part (I guess):
def _main():
    input_shape = [416, 416]
    model = ###        # create YOLO model
    anchors = ###      # collection of 9 anchors
    num_classes = 601
    train_data = ###   # a collection of the form [PathToImage, X1, X2, Y1, Y2, class], where the X, Y values define the bounding box
    valid_data = ###   # a collection of the form [PathToImage, X1, X2, Y1, Y2, class], where the X, Y values define the bounding box
    batch_size = 8
    model.fit_generator(data_generator(train_data, batch_size, input_shape, anchors, num_classes),
                        steps_per_epoch=max(1, len(train_data)//batch_size),
                        validation_data=data_generator(valid_data, batch_size, input_shape, anchors, num_classes),
                        validation_steps=max(1, len(valid_data)//batch_size),
                        epochs=50,
                        initial_epoch=0)
    # unfreeze and continue training, to fine-tune
    for i in range(len(model.layers)):
        model.layers[i].trainable = True
    model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred})  # recompile to apply the change
    print('Unfreeze all of the layers.')
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
    model.fit_generator(data_generator(train_data, batch_size, input_shape, anchors, num_classes),
                        steps_per_epoch=max(1, len(train_data)//batch_size),
                        validation_data=data_generator(valid_data, batch_size, input_shape, anchors, num_classes),
                        validation_steps=max(1, len(valid_data)//batch_size),
                        epochs=100,
                        initial_epoch=50)

def data_generator(lines, batch_size, input_shape, anchors, num_classes):
    '''data generator for fit_generator'''
    n = len(lines)
    i = 0
    while True:
        image_data = []
        box_data = []
        for b in range(batch_size):
            if i == 0:
                np.random.shuffle(lines)
            image, box = get_data(lines[i], input_shape)  # retrieve the image from the path and return it with the bounding box (the object class is in the box object)
            image_data.append(image)
            box_data.append(box)
            i = (i + 1) % n
        image_data = np.array(image_data)
        box_data = np.array(box_data)
        y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes)  # for each box, find the best anchor
        yield [image_data, *y_true], np.zeros(batch_size)
The OOM exception is raised on the second call to fit_generator().
Following an answer to a similar question, I added the gpu_options allow_growth setting to my TensorFlow session:
K.clear_session() # get a new session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)
But it did not solve the problem.
So I am a bit stuck here. What am I doing wrong?
Notes:
I have a Quadro P1000 GPU with 20GB GPU Memory (according to the Windows task manager)
I have 32GB RAM
I haven't changed the model architecture; you can find it here
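One hedged observation: the OOM is raised on the second fit_generator() call, i.e. right after every layer has been unfrozen, which roughly doubles the activation and gradient memory the GPU must hold. A sketch of the usual mitigation, shrinking the batch size for the fine-tuning stage only (the value 4 is arbitrary):
# after unfreezing all layers, retrain with a smaller batch so the extra
# gradient/activation memory still fits on the GPU
batch_size = 4  # was 8 for the frozen stage
model.fit_generator(data_generator(train_data, batch_size, input_shape, anchors, num_classes),
                    steps_per_epoch=max(1, len(train_data)//batch_size),
                    validation_data=data_generator(valid_data, batch_size, input_shape, anchors, num_classes),
                    validation_steps=max(1, len(valid_data)//batch_size),
                    epochs=100,
                    initial_epoch=50)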
