Related
Im having problems when I use the library tensorflow_model_optimization.
I am developing a code to prune an already trained neural network.
I imported the weights from an h5 file and so I use tensorflor_model_optimization to prune my neural network.
I have this error when I call the fit method:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-26-ce9759e4dd53> in <module>
----> 1 model_for_pruning.fit_generator(base_treinamento, steps_per_epoch = 6000 /64, epochs = 5, validation_data = base_teste, validation_steps = 30, callbacks=callbacks)
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1859 use_multiprocessing=use_multiprocessing,
1860 shuffle=shuffle,
-> 1861 initial_epoch=initial_epoch)
1862
1863 def evaluate_generator(self,
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1098 _r=1):
1099 callbacks.on_train_batch_begin(step)
-> 1100 tmp_logs = self.train_function(iterator)
1101 if data_handler.should_sync:
1102 context.async_wait()
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
826 tracing_count = self.experimental_get_tracing_count()
827 with trace.Trace(self._name) as tm:
--> 828 result = self._call(*args, **kwds)
829 compiler = "xla" if self._experimental_compile else "nonXla"
830 new_tracing_count = self.experimental_get_tracing_count()
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
853 # In this case we have created variables on the first call, so we run the
854 # defunned version which is guaranteed to never create variables.
--> 855 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
856 elif self._stateful_fn is not None:
857 # Release the lock early so that multiple threads can perform the call
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
2941 filtered_flat_args) = self._maybe_define_function(args, kwargs)
2942 return graph_function._call_flat(
-> 2943 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
2944
2945 #property
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1917 # No tape is watching; skip to running the function.
1918 return self._build_call_outputs(self._inference_function.call(
-> 1919 ctx, args, cancellation_manager=cancellation_manager))
1920 forward_backward = self._select_forward_and_backward_functions(
1921 args,
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
558 inputs=args,
559 attrs=attrs,
--> 560 ctx=ctx)
561 else:
562 outputs = execute.execute_with_cancellation(
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: Conv2DCustomBackpropFilterOp only supports NHWC.
[[node gradient_tape/sequential_3/prune_low_magnitude_conv2d_18/Conv2D/Conv2DBackpropFilter (defined at <ipython-input-24-fc85f8818d30>:1) ]] [Op:__inference_train_function_10084]
Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/sequential_3/prune_low_magnitude_conv2d_18/Conv2D/Conv2DBackpropFilter:
sequential_3/prune_low_magnitude_activation_18/Relu (defined at C:\Users\Pichau\anaconda3\envs\supernova\lib\site-packages\tensorflow_model_optimization\python\core\sparsity\keras\pruning_wrapper.py:270)
Function call stack:
train_function
enter code here
My code:
from keras.models import model_from_json
from keras.preprocessing.image import ImageDataGenerator
import numpy as np
arquivo = open('model.json', 'r')
estrutura_rede = arquivo.read()
arquivo.close()
model = model_from_json(estrutura_rede)
model.load_weights('model.h5')
gerador_treinamento = ImageDataGenerator(rescale=None)
base_treinamento = gerador_treinamento.flow_from_directory('data/train', target_size = (51,51), batch_size = 64, class_mode = 'binary')
gerador_teste = ImageDataGenerator(rescale=None)
base_teste = gerador_teste.flow_from_directory('data/test', target_size = (51,51), batch_size = 64, class_mode = 'binary')
import tempfile
import tensorflow as tf
import tensorflow_model_optimization as tfmot
prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude
batch_size = 128
epochs = 2
validation_split = 0.1
num_images = int(len(base_treinamento) * (1 - validation_split))
end_step = np.ceil(num_images / batch_size).astype(np.int32) * epochs
model_for_pruning.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
model_for_pruning.summary()
logdir = tempfile.mkdtemp()
callbacks = [
tfmot.sparsity.keras.UpdatePruningStep(),
tfmot.sparsity.keras.PruningSummaries(log_dir=logdir),
]
model_for_pruning.fit_generator(base_treinamento, steps_per_epoch = 6000 /64, epochs = 5, validation_data = base_teste, validation_steps = 30, callbacks=callbacks)
python: 3.6.12
tensorflow: 2.2.0
tensorflow-model-optimization: 0.5.0
Can someone help me?
Changing the runtime from CPU to GPU worked for me.
If you are running it on CPU consider changing it to GPU.
This does not seem like a pruning issue, but instead a problem with the data format of the model. It looks like one Conv2D layer should be using NHWC format (data_format="channels_last").
Could you share the code for the model?
I am trying to evaluate my deep learning RNN model but I keep getting this one error. I have absolutely no idea what the error means and I am unable to solve it. Any help is appreciated. Thanks.
This is the code I am using to evaluate my model:
model = keras.models.load_model('D:/Semester 3.2 OFFICIAL/Deep Learning/Assignment 2/test_model_14 files/Model14Checkpoint-188-0.60.h5')
model.load_weights('D:/Semester 3.2 OFFICIAL/Deep Learning/Assignment 2/test_model_14 files/Model14Checkpoint-188-0.60.h5')
evaluate = model.evaluate(X_test_seq_padded, y_test, batch_size=128)
loss = evaluate[0]
acc = evaluate[1] * 100
print("Loss: {:0.3f} - Accuracy: {:0.3f}%".format(loss,acc))
And this is the error that it returns:
128/8510 [..............................] - ETA: 1:05
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-8-8b3cba2991da> in <module>
13 model = keras.models.load_model('D:/Semester 3.2 OFFICIAL/Deep Learning/Assignment 2/test_model_14 files/Model14Checkpoint-188-0.60.h5')
14 model.load_weights('D:/Semester 3.2 OFFICIAL/Deep Learning/Assignment 2/test_model_14 files/Model14Checkpoint-188-0.60.h5')
---> 15 evaluate = model.evaluate(X_test_seq_padded, y_test, batch_size=128)
16
17 loss = evaluate[0]
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\keras\engine\training.py in evaluate(self, x, y, batch_size, verbose, sample_weight, steps, callbacks, max_queue_size, workers, use_multiprocessing)
928 max_queue_size=max_queue_size,
929 workers=workers,
--> 930 use_multiprocessing=use_multiprocessing)
931
932 def predict(self,
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in evaluate(self, model, x, y, batch_size, verbose, sample_weight, steps, callbacks, max_queue_size, workers, use_multiprocessing, **kwargs)
488 sample_weight=sample_weight, steps=steps, callbacks=callbacks,
489 max_queue_size=max_queue_size, workers=workers,
--> 490 use_multiprocessing=use_multiprocessing, **kwargs)
491
492 def predict(self, model, x, batch_size=None, verbose=0, steps=None,
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in _model_iteration(self, model, mode, x, y, batch_size, verbose, sample_weight, steps, callbacks, max_queue_size, workers, use_multiprocessing, **kwargs)
473 mode=mode,
474 training_context=training_context,
--> 475 total_epochs=1)
476 cbks.make_logs(model, epoch_logs, result, mode)
477
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
126 step=step, mode=mode, size=current_batch_size) as batch_logs:
127 try:
--> 128 batch_outs = execution_function(iterator)
129 except (StopIteration, errors.OutOfRangeError):
130 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in execution_function(input_fn)
96 # `numpy` translates Tensors to values in Eager mode.
97 return nest.map_structure(_non_none_constant_value,
---> 98 distributed_function(input_fn))
99
100 return execution_function
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
566 xla_context.Exit()
567 else:
--> 568 result = self._call(*args, **kwds)
569
570 if tracing_count == self._get_tracing_count():
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
636 *args, **kwds)
637 # If we did not create any variables the trace we have is good enough.
--> 638 return self._concrete_stateful_fn._filtered_call(canon_args, canon_kwds) # pylint: disable=protected-access
639
640 def fn_with_cond(*inner_args, **inner_kwds):
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\eager\function.py in _filtered_call(self, args, kwargs)
1609 if isinstance(t, (ops.Tensor,
1610 resource_variable_ops.BaseResourceVariable))),
-> 1611 self.captured_inputs)
1612
1613 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1690 # No tape is watching; skip to running the function.
1691 return self._build_call_outputs(self._inference_function.call(
-> 1692 ctx, args, cancellation_manager=cancellation_manager))
1693 forward_backward = self._select_forward_and_backward_functions(
1694 args,
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\eager\function.py in call(self, ctx, args, cancellation_manager)
543 inputs=args,
544 attrs=("executor_type", executor_type, "config_proto", config),
--> 545 ctx=ctx)
546 else:
547 outputs = execute.execute_with_cancellation(
~\anaconda3\envs\three point seven\lib\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
~\anaconda3\envs\three point seven\lib\site-packages\six.py in raise_from(value, from_value)
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: indices[28,27] = 10792 is not in [0, 10000)
[[node sequential_3/embedding_3/embedding_lookup (defined at <ipython-input-8-8b3cba2991da>:15) ]]
[[sequential_3/embedding_3/embedding_lookup/_17]]
(1) Invalid argument: indices[28,27] = 10792 is not in [0, 10000)
[[node sequential_3/embedding_3/embedding_lookup (defined at <ipython-input-8-8b3cba2991da>:15) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_function_8770]
Errors may have originated from an input operation.
Input Source operations connected to node sequential_3/embedding_3/embedding_lookup:
sequential_3/embedding_3/embedding_lookup/7696 (defined at C:\Users\acer\anaconda3\envs\three point seven\lib\contextlib.py:112)
Input Source operations connected to node sequential_3/embedding_3/embedding_lookup:
sequential_3/embedding_3/embedding_lookup/7696 (defined at C:\Users\acer\anaconda3\envs\three point seven\lib\contextlib.py:112)
Function call stack:
distributed_function -> distributed_function
This question already has answers here:
Failed to get convolution algorithm. This is probably because cuDNN failed to initialize,
(30 answers)
Closed 2 years ago.
Here is my code:
model = Sequential()
# CONVOLUTIONAL LAYER
model.add(Conv2D(filters=32, kernel_size=(4,4),input_shape=(28, 28, 1), activation='relu',))
# POOLING LAYER
model.add(MaxPool2D(pool_size=(2, 2)))
# FLATTEN IMAGES FROM 28 by 28 to 764 BEFORE FINAL LAYER
model.add(Flatten())
# 128 NEURONS IN DENSE HIDDEN LAYER (YOU CAN CHANGE THIS NUMBER OF NEURONS)
model.add(Dense(128, activation='relu'))
# LAST LAYER IS THE CLASSIFIER, THUS 10 POSSIBLE CLASSES
model.add(Dense(10, activation='softmax'))
# https://keras.io/metrics/
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy']) # we can add in additional metrics https://keras.io/metrics/
model.fit(x_train,y_cat_train,epochs=10,validation_data=(x_test,y_cat_test))
And I get the following when I run the model.fit line:
UnknownError Traceback (most recent call last)
<ipython-input-37-0eafbb732ade> in <module>
----> 1 model.fit(x_train,y_cat_train,epochs=10,validation_data=(x_test,y_cat_test))
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
340 mode=ModeKeys.TRAIN,
341 training_context=training_context,
--> 342 total_epochs=epochs)
343 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
344
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
126 step=step, mode=mode, size=current_batch_size) as batch_logs:
127 try:
--> 128 batch_outs = execution_function(iterator)
129 except (StopIteration, errors.OutOfRangeError):
130 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in execution_function(input_fn)
96 # `numpy` translates Tensors to values in Eager mode.
97 return nest.map_structure(_non_none_constant_value,
---> 98 distributed_function(input_fn))
99
100 return execution_function
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
566 xla_context.Exit()
567 else:
--> 568 result = self._call(*args, **kwds)
569
570 if tracing_count == self._get_tracing_count():
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
597 # In this case we have created variables on the first call, so we run the
598 # defunned version which is guaranteed to never create variables.
--> 599 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
600 elif self._stateful_fn is not None:
601 # Release the lock early so that multiple threads can perform the call
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\eager\function.py in __call__(self, *args, **kwargs)
2361 with self._lock:
2362 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2363 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2364
2365 #property
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\eager\function.py in _filtered_call(self, args, kwargs)
1609 if isinstance(t, (ops.Tensor,
1610 resource_variable_ops.BaseResourceVariable))),
-> 1611 self.captured_inputs)
1612
1613 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1690 # No tape is watching; skip to running the function.
1691 return self._build_call_outputs(self._inference_function.call(
-> 1692 ctx, args, cancellation_manager=cancellation_manager))
1693 forward_backward = self._select_forward_and_backward_functions(
1694 args,
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\eager\function.py in call(self, ctx, args, cancellation_manager)
543 inputs=args,
544 attrs=("executor_type", executor_type, "config_proto", config),
--> 545 ctx=ctx)
546 else:
547 outputs = execute.execute_with_cancellation(
~\anaconda3\envs\GPU\lib\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
~\anaconda3\envs\GPU\lib\site-packages\six.py in raise_from(value, from_value)
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node sequential/conv2d/Conv2D (defined at <ipython-input-36-6ec5ba2e7ade>:1) ]] [Op:__inference_distributed_function_788]
Function call stack:
distributed_function
I would just like to clarify that I have run this CNN before on regular tensorflow without issues, and this newly installed tensorflow-gpu works on ANN's and RNN's. Additionally here are my specs and installation versions:
Tensorflow 2.1
Keras 2.3.1
CudNN 7.6.5
cudatoolkit 10.1.243
GPU: Nvidia 2070
Let me know if any other information is necessary, thank you!
I have had this error happen periodically. Not sure of the details of the cause but this happens when I have to many instances of jupyter notebook or python running at the same time that are running tensorflow. If you are on windows go task manager and check how many instances of notebook.exe or python are running and shut down all of them then reload the notebook.This always fixes the problem for me
I am using Microsoft Azure to train a CNN (Convolutional Neural Network) to recognize 11 classes of food using 16k images. The Virtual Machine I'm using is a "STANDARD_NC24_PROMO" with the following specs:
24 vCPUs, 4 GPUs, 224 GB memory, 1440 GB storage.
The problem is that at a simple run of the program I get the following error about Resource Exhaustion:
2-conv-256-nodes-0-dense-1576530179
Train on 10636 samples, validate on 2660 samples
Epoch 1/10
32/10636 [..............................] - ETA: 57:51
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-10-ee913a07a18b> in <module>
86 model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
87 ### TRAIN
---> 88 model.fit(train_images, train_labels,validation_split=0.20, epochs=10,use_multiprocessing=True)
89
90 loss, acc = model.evaluate(test_images, test_labels, verbose = 0)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
726 max_queue_size=max_queue_size,
727 workers=workers,
--> 728 use_multiprocessing=use_multiprocessing)
729
730 def evaluate(self,
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
322 mode=ModeKeys.TRAIN,
323 training_context=training_context,
--> 324 total_epochs=epochs)
325 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
326
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
121 step=step, mode=mode, size=current_batch_size) as batch_logs:
122 try:
--> 123 batch_outs = execution_function(iterator)
124 except (StopIteration, errors.OutOfRangeError):
125 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
84 # `numpy` translates Tensors to values in Eager mode.
85 return nest.map_structure(_non_none_constant_value,
---> 86 distributed_function(input_fn))
87
88 return execution_function
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
518 # Lifting succeeded, so variables are initialized and we can run the
519 # stateless function.
--> 520 return self._stateless_fn(*args, **kwds)
521 else:
522 canon_args, canon_kwds = \
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in __call__(self, *args, **kwargs)
1821 """Calls a graph function specialized to the inputs."""
1822 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1823 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
1824
1825 #property
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _filtered_call(self, args, kwargs)
1139 if isinstance(t, (ops.Tensor,
1140 resource_variable_ops.BaseResourceVariable))),
-> 1141 self.captured_inputs)
1142
1143 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1222 if executing_eagerly:
1223 flat_outputs = forward_function.call(
-> 1224 ctx, args, cancellation_manager=cancellation_manager)
1225 else:
1226 gradient_name = self._delayed_rewrite_functions.register()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in call(self, ctx, args, cancellation_manager)
509 inputs=args,
510 attrs=("executor_type", executor_type, "config_proto", config),
--> 511 ctx=ctx)
512 else:
513 outputs = execute.execute_with_cancellation(
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/six.py in raise_from(value, from_value)
ResourceExhaustedError: OOM when allocating tensor with shape[32,256,98,98] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[node sequential_7/conv2d_14/Conv2D (defined at /anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1751) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[Op:__inference_distributed_function_7727]
Function call stack:
distributed_function
I will attach below the bit of code that does the training:
for dense_layer in dense_layers:
for layer_size in layer_sizes:
for conv_layer in conv_layers:
NAME="{}-conv-{}-nodes-{}-dense-{}".format(conv_layer,
layer_size, dense_layer, int(time.time()))
print(NAME)
model = Sequential()
model.add(Conv2D(layer_size,(3,3),input_shape=(IMG_SIZE, IMG_SIZE, 1)))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.5))
for l in range(conv_layer-1):
model.add(Conv2D(layer_size,(3,3)))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.5))
model.add(Flatten())
for l in range(dense_layer):
model.add(Dense(layer_size))
model.add(Activation("relu"))
#The output layer with 11 neurons
model.add(Dense(11))
model.add(Activation("softmax"))
### COMPILE MODEL
model.compile(loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"])
### TRAIN
model.fit(train_images, train_labels,validation_split=0.20, epochs=10)
loss, acc = model.evaluate(test_images, test_labels, verbose = 0)
print(acc * 100)
if maxacc<acc*100:
maxacc=acc*100
maxname=NAME
maxdict[maxacc]=maxname
print("\n\n",maxacc," ",maxname)
My laptop which is nowhere near as good has no problem executing this, yet running it on azure gives me that error. The iteration variables don't matter as I still get the error no matter what their values are.
Any help would be greatly appreciated, thank you for your time!
I would like to add that the program is not even working with this small amount of layers:
dense_layers = [0]
layer_sizes = [32]
conv_layers = [1]
Unfortunately, I never used azure for training some kind of networks. But I would try:
simplify your network and setup, maybe use a powerful single gpu first. Also, figure out what hyperparameter has to change to make it fail after you got it to work with a simpler approach
reduce the batch size. Most of gpu OOM exceptions are due to too many data that is processed at once.
There is a lot of optimization happening that might cause it to work locally but that works slightly different for multi gpu machines.
When i reading Google tensorflow2.0 tutorials, i meet an surprised error when i try to test in my jupyter. It is so strange!It run fluently in Google colab! The tutorials is this
My computer GPU is gtx1060 6G, and memory is 16G, I think my computer is ok to run this tutorials.
I try run it on Jupyter, and it run error.But run fluently on Google colab!
You can see the error code following or go to the web:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
# tfds.disable_progress_bar()
(train_data, test_data), info = tfds.load(
'imdb_reviews/subwords8k',
split = (tfds.Split.TRAIN, tfds.Split.TEST),
with_info=True, as_supervised=True)
encoder = info.features['text'].encoder
padded_shapes = ([None],())
train_batches = train_data.shuffle(1000).padded_batch(10, padded_shapes = padded_shapes)
test_batches = test_data.shuffle(1000).padded_batch(10, padded_shapes = padded_shapes)
embedding_dim=16
model = keras.Sequential([
layers.Embedding(encoder.vocab_size, embedding_dim,mask_zero=True),
layers.Bidirectional(tf.keras.layers.LSTM(32)),
layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
history = model.fit(
train_batches,
epochs=10,
validation_data=test_batches, validation_steps=20,verbose=2)
It is my first time meeting this error, and I don't know how to fix it, but it run fluently on Google colab, I don't know why?
The error following:
Epoch 1/10
---------------------------------------------------------------------------
CancelledError Traceback (most recent call last)
<ipython-input-2-8f27353fef79> in <module>
31 train_batches,
32 epochs=10,
---> 33 validation_data=test_batches, validation_steps=20,verbose=2)
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
726 max_queue_size=max_queue_size,
727 workers=workers,
--> 728 use_multiprocessing=use_multiprocessing)
729
730 def evaluate(self,
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
322 mode=ModeKeys.TRAIN,
323 training_context=training_context,
--> 324 total_epochs=epochs)
325 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
326
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
121 step=step, mode=mode, size=current_batch_size) as batch_logs:
122 try:
--> 123 batch_outs = execution_function(iterator)
124 except (StopIteration, errors.OutOfRangeError):
125 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in execution_function(input_fn)
84 # `numpy` translates Tensors to values in Eager mode.
85 return nest.map_structure(_non_none_constant_value,
---> 86 distributed_function(input_fn))
87
88 return execution_function
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
485 # In this case we have created variables on the first call, so we run the
486 # defunned version which is guaranteed to never create variables.
--> 487 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
488 elif self._stateful_fn is not None:
489 # Release the lock early so that multiple threads can perform the call
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\function.py in __call__(self, *args, **kwargs)
1821 """Calls a graph function specialized to the inputs."""
1822 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1823 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
1824
1825 #property
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\function.py in _filtered_call(self, args, kwargs)
1139 if isinstance(t, (ops.Tensor,
1140 resource_variable_ops.BaseResourceVariable))),
-> 1141 self.captured_inputs)
1142
1143 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1222 if executing_eagerly:
1223 flat_outputs = forward_function.call(
-> 1224 ctx, args, cancellation_manager=cancellation_manager)
1225 else:
1226 gradient_name = self._delayed_rewrite_functions.register()
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\function.py in call(self, ctx, args, cancellation_manager)
509 inputs=args,
510 attrs=("executor_type", executor_type, "config_proto", config),
--> 511 ctx=ctx)
512 else:
513 outputs = execute.execute_with_cancellation(
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\six.py in raise_from(value, from_value)
CancelledError: [_Derived_]RecvAsync is cancelled.
[[{{node Reshape_11/_38}}]] [Op:__inference_distributed_function_16087]
Function call stack:
distributed_function
Thanks for anyone help me!
Try to reduce the batch_size, it should work.