A relatively simple block of code running in Jupyter notebook (because I cannot for the life of me get it working in Google Colab) but I'm getting the below error when running the model. The linked post shows some more of the detail around the inputs I'm working with.
I think it may be a GPU related error. I've followed several similarly titled posts and CUDA/Visual Studio installation tutorials, to no avail.
import gensim
import os
from gensim.models import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
glove_file = datapath('glove.6B.50d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.50d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)
model = KeyedVectors.load_word2vec_format(word2vec_glove_file, binary=False)
# get the vector for each word in the glove file
emb_dict = {}
glove = open(glove_file, encoding="utf8")
for line in glove:
values = line.split()
word = values[0]
vector = numpy.asarray(values[1:], dtype='float32')
emb_dict[word] = vector
glove.close()
# get the top 10k words in the tweets, and find their vector from the glove files
num_words = 10000
glove_dims = 50
emb_matrix = numpy.zeros((num_words, glove_dims))
for w, i in token.word_index.items():
if i < num_words:
vect = emb_dict.get(w)
if vect is not None:
emb_matrix[i] = vect
else:
break
from keras import models, layers
from keras.layers import Embedding, Flatten, Dense
glove_model = models.Sequential()
glove_model.add(Embedding(num_words, glove_dims, input_length=max_tweet_len))
glove_model.add(layers.LSTM(100))
#glove_model.add(Flatten())
glove_model.add(Dense(3, activation='softmax'))
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False
glove_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = glove_model.fit(train_seq_x, y_train, epochs=10, batch_size=64)
The error returned is:
Epoch 1/10
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-11-224ee1a00f0e> in <module>
13 glove_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
14
---> 15 history = glove_model.fit(train_seq_x, y_train, epochs=10, batch_size=64)
~\anaconda3\lib\site-packages\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
1237 steps_per_epoch=steps_per_epoch,
1238 validation_steps=validation_steps,
-> 1239 validation_freq=validation_freq)
1240
1241 def evaluate(self,
~\anaconda3\lib\site-packages\keras\engine\training_arrays.py in fit_loop(model, fit_function, fit_inputs, out_labels, batch_size, epochs, verbose, callbacks, val_function, val_inputs, shuffle, initial_epoch, steps_per_epoch, validation_steps, validation_freq)
194 ins_batch[i] = ins_batch[i].toarray()
195
--> 196 outs = fit_function(ins_batch)
197 outs = to_list(outs)
198 for l, o in zip(out_labels, outs):
~\anaconda3\lib\site-packages\tensorflow\python\keras\backend.py in __call__(self, inputs)
3790 value = math_ops.cast(value, tensor.dtype)
3791 converted_inputs.append(value)
-> 3792 outputs = self._graph_fn(*converted_inputs)
3793
3794 # EagerTensor.numpy() will often make a copy to ensure memory safety.
~\anaconda3\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
1603 TypeError: For invalid positional/keyword argument combinations.
1604 """
-> 1605 return self._call_impl(args, kwargs)
1606
1607 def _call_impl(self, args, kwargs, cancellation_manager=None):
~\anaconda3\lib\site-packages\tensorflow\python\eager\function.py in _call_impl(self, args, kwargs, cancellation_manager)
1643 raise TypeError("Keyword arguments {} unknown. Expected {}.".format(
1644 list(kwargs.keys()), list(self._arg_keywords)))
-> 1645 return self._call_flat(args, self.captured_inputs, cancellation_manager)
1646
1647 def _filtered_call(self, args, kwargs):
~\anaconda3\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1744 # No tape is watching; skip to running the function.
1745 return self._build_call_outputs(self._inference_function.call(
-> 1746 ctx, args, cancellation_manager=cancellation_manager))
1747 forward_backward = self._select_forward_and_backward_functions(
1748 args,
~\anaconda3\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
596 inputs=args,
597 attrs=attrs,
--> 598 ctx=ctx)
599 else:
600 outputs = execute.execute_with_cancellation(
~\anaconda3\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: indices[40,19] = 11263 is not in [0, 10000)
[[node embedding_1/embedding_lookup (defined at C:\Users\Jorda\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:3009) ]] [Op:__inference_keras_scratch_graph_1707]
Function call stack:
keras_scratch_graph
As requested in the comments, the method of tokenisation is as follows:
from keras.preprocessing import text, sequence
# create a tokenizer
token = text.Tokenizer()
token.fit_on_texts(tweets)
word_index = token.word_index
max_tweet_len = 30
# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(x_train), maxlen=max_tweet_len)
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(x_test), maxlen=max_tweet_len)
print(train_seq_x)
print(test_seq_x)
try to fit your tokenizer in this way
num_words = 10000
token = text.Tokenizer(num_words=num_words)
and when you define your model, change num_words with word_index+1 in Embedding layer
Related
Im having problems when I use the library tensorflow_model_optimization.
I am developing a code to prune an already trained neural network.
I imported the weights from an h5 file and so I use tensorflor_model_optimization to prune my neural network.
I have this error when I call the fit method:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-26-ce9759e4dd53> in <module>
----> 1 model_for_pruning.fit_generator(base_treinamento, steps_per_epoch = 6000 /64, epochs = 5, validation_data = base_teste, validation_steps = 30, callbacks=callbacks)
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1859 use_multiprocessing=use_multiprocessing,
1860 shuffle=shuffle,
-> 1861 initial_epoch=initial_epoch)
1862
1863 def evaluate_generator(self,
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1098 _r=1):
1099 callbacks.on_train_batch_begin(step)
-> 1100 tmp_logs = self.train_function(iterator)
1101 if data_handler.should_sync:
1102 context.async_wait()
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
826 tracing_count = self.experimental_get_tracing_count()
827 with trace.Trace(self._name) as tm:
--> 828 result = self._call(*args, **kwds)
829 compiler = "xla" if self._experimental_compile else "nonXla"
830 new_tracing_count = self.experimental_get_tracing_count()
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
853 # In this case we have created variables on the first call, so we run the
854 # defunned version which is guaranteed to never create variables.
--> 855 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
856 elif self._stateful_fn is not None:
857 # Release the lock early so that multiple threads can perform the call
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
2941 filtered_flat_args) = self._maybe_define_function(args, kwargs)
2942 return graph_function._call_flat(
-> 2943 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
2944
2945 #property
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1917 # No tape is watching; skip to running the function.
1918 return self._build_call_outputs(self._inference_function.call(
-> 1919 ctx, args, cancellation_manager=cancellation_manager))
1920 forward_backward = self._select_forward_and_backward_functions(
1921 args,
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
558 inputs=args,
559 attrs=attrs,
--> 560 ctx=ctx)
561 else:
562 outputs = execute.execute_with_cancellation(
~\anaconda3\envs\supernova\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: Conv2DCustomBackpropFilterOp only supports NHWC.
[[node gradient_tape/sequential_3/prune_low_magnitude_conv2d_18/Conv2D/Conv2DBackpropFilter (defined at <ipython-input-24-fc85f8818d30>:1) ]] [Op:__inference_train_function_10084]
Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/sequential_3/prune_low_magnitude_conv2d_18/Conv2D/Conv2DBackpropFilter:
sequential_3/prune_low_magnitude_activation_18/Relu (defined at C:\Users\Pichau\anaconda3\envs\supernova\lib\site-packages\tensorflow_model_optimization\python\core\sparsity\keras\pruning_wrapper.py:270)
Function call stack:
train_function
enter code here
My code:
from keras.models import model_from_json
from keras.preprocessing.image import ImageDataGenerator
import numpy as np
arquivo = open('model.json', 'r')
estrutura_rede = arquivo.read()
arquivo.close()
model = model_from_json(estrutura_rede)
model.load_weights('model.h5')
gerador_treinamento = ImageDataGenerator(rescale=None)
base_treinamento = gerador_treinamento.flow_from_directory('data/train', target_size = (51,51), batch_size = 64, class_mode = 'binary')
gerador_teste = ImageDataGenerator(rescale=None)
base_teste = gerador_teste.flow_from_directory('data/test', target_size = (51,51), batch_size = 64, class_mode = 'binary')
import tempfile
import tensorflow as tf
import tensorflow_model_optimization as tfmot
prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude
batch_size = 128
epochs = 2
validation_split = 0.1
num_images = int(len(base_treinamento) * (1 - validation_split))
end_step = np.ceil(num_images / batch_size).astype(np.int32) * epochs
model_for_pruning.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
model_for_pruning.summary()
logdir = tempfile.mkdtemp()
callbacks = [
tfmot.sparsity.keras.UpdatePruningStep(),
tfmot.sparsity.keras.PruningSummaries(log_dir=logdir),
]
model_for_pruning.fit_generator(base_treinamento, steps_per_epoch = 6000 /64, epochs = 5, validation_data = base_teste, validation_steps = 30, callbacks=callbacks)
python: 3.6.12
tensorflow: 2.2.0
tensorflow-model-optimization: 0.5.0
Can someone help me?
Changing the runtime from CPU to GPU worked for me.
If you are running it on CPU consider changing it to GPU.
This does not seem like a pruning issue, but instead a problem with the data format of the model. It looks like one Conv2D layer should be using NHWC format (data_format="channels_last").
Could you share the code for the model?
I'm just working through my own sandbox project wanting to try and implement NLP but with a linear regression as an outcome. As reference, the dataset I am working with comes Kaggle wine-reviews which has the wine reviews and a corresponding score of 1 through 100 hence why I'm using linear regression instead of classification.
But I am getting an error message and I'm not sure if it's a result of a data type or dimensionality problem, and I'm not sue why or how to resolve it.
I'll provide the code bellow, and some intermediate outcomes displaying the dimensions of some of the objects as I'm assuming that that might be useful in solving this.
df = pd.read_csv('winemag-data_first150k.csv', encoding='ISO-8859-1')
y = df['points'].astype(int)
X = df['description'].astype(str)
# split up the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
MAX_VOCAB_SIZE = 35000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
word2idx = tokenizer.word_index
V = len(word2idx)
print('Found %s unique tokens.' % V)
Found 33012 unique tokens.
data_train = pad_sequences(sequences_train)
print('Shape of data train tensor:', data_train.shape)
# get sequence length
T = data_train.shape[1]
Shape of data train tensor: (101123, 136)
data_test = pad_sequences(sequences_test, maxlen=T)
print('Shape of data test tensor:', data_test.shape)
Shape of data test tensor: (49807, 136)
# Create the model
# We get to choose embedding dimensionality
D = 20
# Hidden state dimensionality
M = 15
i = Input(shape=(T,))
x = Embedding(V + 1, D)(i)
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1)(x)
model = Model(i, x)
model.compile(optimizer='adam', loss='mse')
# learning rate scheduler
def schedule(epoch, lr):
if epoch >= 50:
return 0.0001
return 0.001
scheduler = tf.keras.callbacks.LearningRateScheduler(schedule)
# Train the model
r = model.fit(X, y, epochs=200, callbacks=[scheduler])
And then I get the error message along with a warning about dimensionality:
Epoch 1/200
WARNING:tensorflow:Model was constructed with shape (None, 136) for input Tensor("input_10:0", shape=(None, 136), dtype=float32), but it was called on an input with incompatible shape (None, 1).
WARNING:tensorflow:Model was constructed with shape (None, 136) for input Tensor("input_10:0", shape=(None, 136), dtype=float32), but it was called on an input with incompatible shape (None, 1).
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
<ipython-input-121-0f68916ec23b> in <module>
13
14 # Train the model
---> 15 r = model.fit(X, y, epochs=200, callbacks=[scheduler])
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\keras\engine\training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator)
1099 if data_handler.should_sync:
1100 context.async_wait()
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
838 # Lifting succeeded, so variables are initialized and we can run the
839 # stateless function.
--> 840 return self._stateless_fn(*args, **kwds)
841 else:
842 canon_args, canon_kwds = \
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
2827 with self._lock:
2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2830
2831 #property
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\eager\function.py in _filtered_call(self, args, kwargs, cancellation_manager)
1841 `args` and `kwargs`.
1842 """
-> 1843 return self._call_flat(
1844 [t for t in nest.flatten((args, kwargs), expand_composites=True)
1845 if isinstance(t, (ops.Tensor,
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1921 and executing_eagerly):
1922 # No tape is watching; skip to running the function.
-> 1923 return self._build_call_outputs(self._inference_function.call(
1924 ctx, args, cancellation_manager=cancellation_manager))
1925 forward_backward = self._select_forward_and_backward_functions(
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
543 with _InterpolateFunctionError(self):
544 if cancellation_manager is None:
--> 545 outputs = execute.execute(
546 str(self.signature.name),
547 num_outputs=self._num_outputs,
~\anaconda3\envs\newenvt\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
57 try:
58 ctx.ensure_initialized()
---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
UnimplementedError: Cast string to float is not supported
[[node functional_11/Cast (defined at <ipython-input-121-0f68916ec23b>:15) ]] [Op:__inference_train_function_17206]
Function call stack:
train_function
I'm not entirely sure what needs to be changed, but any suggestions would be appreciated.
From comments
Passing X to model.fit, which literally has string values, a neural
network cannot be input string values (paraphrased from Dr. Snoopy)
I am a tad new to Tensorflow and I am having trouble running this simple CNN.
I have my images separated into separate directories for each class, which I load into train_dataset using image_dataset_from_directory.
from the documentation, this should yield a tuple (images, labels), where images has shape (batch_size, image_size[0], image_size[1], num_channels), and labels are a float32 tensor of shape (batch_size, num_classes). num_channels is 3 as the images are rgb
However when I try to fit using my model, I get an error saying that the predictions are [32,5] and labels shape [160]. It seems to me the batches in the labels have 'collapsed'.
Here's some snippets:
BATCH_SIZE = 32
EPOCHS = 1
IMG_SIZE=(300, 300)
SEED = 1
train_dataset = tf.keras.preprocessing.image_dataset_from_directory(
directory='train/train_images/', label_mode='categorical', class_names=class_names, color_mode='rgb', batch_size=BATCH_SIZE, image_size=IMG_SIZE)
IMG_SHAPE = IMG_SIZE + (3,)
n_classes = len(train_dataset.class_names)
def build_model():
model = tf.keras.Sequential([
tf.keras.layers.Conv2D(input_shape=IMG_SHAPE, kernel_size=(5, 5), filters=32, activation='relu'),
tf.keras.layers.MaxPool2D(pool_size=(3, 3)),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(256, activation='relu'),
tf.keras.layers.Dropout(0.25),
tf.keras.layers.Dense(units=n_classes, activation='softmax')
])
return model
model = build_model()
model.compile(optimizer=tf.keras.optimizers.Adam(),
loss=tf.keras.losses.sparse_categorical_crossentropy,
metrics=['accuracy'])
model.fit(train_dataset, epochs = EPOCHS, batch_size = BATCH_SIZE)
Error Message:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-19-86d96e744ef0> in <module>
----> 1 model.fit(train_dataset, epochs = EPOCHS, batch_size = BATCH_SIZE)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator)
1099 if data_handler.should_sync:
1100 context.async_wait()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
805 # In this case we have created variables on the first call, so we run the
806 # defunned version which is guaranteed to never create variables.
--> 807 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
808 elif self._stateful_fn is not None:
809 # Release the lock early so that multiple threads can perform the call
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
2827 with self._lock:
2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2830
2831 #property
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _filtered_call(self, args, kwargs, cancellation_manager)
1846 resource_variable_ops.BaseResourceVariable))],
1847 captured_inputs=self.captured_inputs,
-> 1848 cancellation_manager=cancellation_manager)
1849
1850 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1922 # No tape is watching; skip to running the function.
1923 return self._build_call_outputs(self._inference_function.call(
-> 1924 ctx, args, cancellation_manager=cancellation_manager))
1925 forward_backward = self._select_forward_and_backward_functions(
1926 args,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
548 inputs=args,
549 attrs=attrs,
--> 550 ctx=ctx)
551 else:
552 outputs = execute.execute_with_cancellation(
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: logits and labels must have the same first dimension, got logits shape [32,5] and labels shape [160]
[[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-18-1904262c6a7b>:1) ]] [Op:__inference_train_function_928]
Function call stack:
train_function
I think you should explicitly compile your model before executing. For that, you may omit 'input_shape' parameter in first layer. See model.compile in keras documentation. make sure that you keep loss as "categorical_crossentropy" or tf.keras.losses.CategoricalCrossentroy() . Then try again. I hope this helps.
Also it would help if you share your file structure.
I am using Microsoft Azure to train a CNN (Convolutional Neural Network) to recognize 11 classes of food using 16k images. The Virtual Machine I'm using is a "STANDARD_NC24_PROMO" with the following specs:
24 vCPUs, 4 GPUs, 224 GB memory, 1440 GB storage.
The problem is that at a simple run of the program I get the following error about Resource Exhaustion:
2-conv-256-nodes-0-dense-1576530179
Train on 10636 samples, validate on 2660 samples
Epoch 1/10
32/10636 [..............................] - ETA: 57:51
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-10-ee913a07a18b> in <module>
86 model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
87 ### TRAIN
---> 88 model.fit(train_images, train_labels,validation_split=0.20, epochs=10,use_multiprocessing=True)
89
90 loss, acc = model.evaluate(test_images, test_labels, verbose = 0)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
726 max_queue_size=max_queue_size,
727 workers=workers,
--> 728 use_multiprocessing=use_multiprocessing)
729
730 def evaluate(self,
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
322 mode=ModeKeys.TRAIN,
323 training_context=training_context,
--> 324 total_epochs=epochs)
325 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
326
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
121 step=step, mode=mode, size=current_batch_size) as batch_logs:
122 try:
--> 123 batch_outs = execution_function(iterator)
124 except (StopIteration, errors.OutOfRangeError):
125 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
84 # `numpy` translates Tensors to values in Eager mode.
85 return nest.map_structure(_non_none_constant_value,
---> 86 distributed_function(input_fn))
87
88 return execution_function
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
518 # Lifting succeeded, so variables are initialized and we can run the
519 # stateless function.
--> 520 return self._stateless_fn(*args, **kwds)
521 else:
522 canon_args, canon_kwds = \
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in __call__(self, *args, **kwargs)
1821 """Calls a graph function specialized to the inputs."""
1822 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1823 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
1824
1825 #property
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _filtered_call(self, args, kwargs)
1139 if isinstance(t, (ops.Tensor,
1140 resource_variable_ops.BaseResourceVariable))),
-> 1141 self.captured_inputs)
1142
1143 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1222 if executing_eagerly:
1223 flat_outputs = forward_function.call(
-> 1224 ctx, args, cancellation_manager=cancellation_manager)
1225 else:
1226 gradient_name = self._delayed_rewrite_functions.register()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in call(self, ctx, args, cancellation_manager)
509 inputs=args,
510 attrs=("executor_type", executor_type, "config_proto", config),
--> 511 ctx=ctx)
512 else:
513 outputs = execute.execute_with_cancellation(
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/six.py in raise_from(value, from_value)
ResourceExhaustedError: OOM when allocating tensor with shape[32,256,98,98] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[node sequential_7/conv2d_14/Conv2D (defined at /anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1751) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[Op:__inference_distributed_function_7727]
Function call stack:
distributed_function
I will attach below the bit of code that does the training:
for dense_layer in dense_layers:
for layer_size in layer_sizes:
for conv_layer in conv_layers:
NAME="{}-conv-{}-nodes-{}-dense-{}".format(conv_layer,
layer_size, dense_layer, int(time.time()))
print(NAME)
model = Sequential()
model.add(Conv2D(layer_size,(3,3),input_shape=(IMG_SIZE, IMG_SIZE, 1)))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.5))
for l in range(conv_layer-1):
model.add(Conv2D(layer_size,(3,3)))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.5))
model.add(Flatten())
for l in range(dense_layer):
model.add(Dense(layer_size))
model.add(Activation("relu"))
#The output layer with 11 neurons
model.add(Dense(11))
model.add(Activation("softmax"))
### COMPILE MODEL
model.compile(loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"])
### TRAIN
model.fit(train_images, train_labels,validation_split=0.20, epochs=10)
loss, acc = model.evaluate(test_images, test_labels, verbose = 0)
print(acc * 100)
if maxacc<acc*100:
maxacc=acc*100
maxname=NAME
maxdict[maxacc]=maxname
print("\n\n",maxacc," ",maxname)
My laptop which is nowhere near as good has no problem executing this, yet running it on azure gives me that error. The iteration variables don't matter as I still get the error no matter what their values are.
Any help would be greatly appreciated, thank you for your time!
I would like to add that the program is not even working with this small amount of layers:
dense_layers = [0]
layer_sizes = [32]
conv_layers = [1]
Unfortunately, I never used azure for training some kind of networks. But I would try:
simplify your network and setup, maybe use a powerful single gpu first. Also, figure out what hyperparameter has to change to make it fail after you got it to work with a simpler approach
reduce the batch size. Most of gpu OOM exceptions are due to too many data that is processed at once.
There is a lot of optimization happening that might cause it to work locally but that works slightly different for multi gpu machines.
When i reading Google tensorflow2.0 tutorials, i meet an surprised error when i try to test in my jupyter. It is so strange!It run fluently in Google colab! The tutorials is this
My computer GPU is gtx1060 6G, and memory is 16G, I think my computer is ok to run this tutorials.
I try run it on Jupyter, and it run error.But run fluently on Google colab!
You can see the error code following or go to the web:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
# tfds.disable_progress_bar()
(train_data, test_data), info = tfds.load(
'imdb_reviews/subwords8k',
split = (tfds.Split.TRAIN, tfds.Split.TEST),
with_info=True, as_supervised=True)
encoder = info.features['text'].encoder
padded_shapes = ([None],())
train_batches = train_data.shuffle(1000).padded_batch(10, padded_shapes = padded_shapes)
test_batches = test_data.shuffle(1000).padded_batch(10, padded_shapes = padded_shapes)
embedding_dim=16
model = keras.Sequential([
layers.Embedding(encoder.vocab_size, embedding_dim,mask_zero=True),
layers.Bidirectional(tf.keras.layers.LSTM(32)),
layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
history = model.fit(
train_batches,
epochs=10,
validation_data=test_batches, validation_steps=20,verbose=2)
It is my first time meeting this error, and I don't know how to fix it, but it run fluently on Google colab, I don't know why?
The error following:
Epoch 1/10
---------------------------------------------------------------------------
CancelledError Traceback (most recent call last)
<ipython-input-2-8f27353fef79> in <module>
31 train_batches,
32 epochs=10,
---> 33 validation_data=test_batches, validation_steps=20,verbose=2)
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
726 max_queue_size=max_queue_size,
727 workers=workers,
--> 728 use_multiprocessing=use_multiprocessing)
729
730 def evaluate(self,
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
322 mode=ModeKeys.TRAIN,
323 training_context=training_context,
--> 324 total_epochs=epochs)
325 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
326
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
121 step=step, mode=mode, size=current_batch_size) as batch_logs:
122 try:
--> 123 batch_outs = execution_function(iterator)
124 except (StopIteration, errors.OutOfRangeError):
125 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in execution_function(input_fn)
84 # `numpy` translates Tensors to values in Eager mode.
85 return nest.map_structure(_non_none_constant_value,
---> 86 distributed_function(input_fn))
87
88 return execution_function
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
485 # In this case we have created variables on the first call, so we run the
486 # defunned version which is guaranteed to never create variables.
--> 487 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
488 elif self._stateful_fn is not None:
489 # Release the lock early so that multiple threads can perform the call
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\function.py in __call__(self, *args, **kwargs)
1821 """Calls a graph function specialized to the inputs."""
1822 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1823 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
1824
1825 #property
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\function.py in _filtered_call(self, args, kwargs)
1139 if isinstance(t, (ops.Tensor,
1140 resource_variable_ops.BaseResourceVariable))),
-> 1141 self.captured_inputs)
1142
1143 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1222 if executing_eagerly:
1223 flat_outputs = forward_function.call(
-> 1224 ctx, args, cancellation_manager=cancellation_manager)
1225 else:
1226 gradient_name = self._delayed_rewrite_functions.register()
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\function.py in call(self, ctx, args, cancellation_manager)
509 inputs=args,
510 attrs=("executor_type", executor_type, "config_proto", config),
--> 511 ctx=ctx)
512 else:
513 outputs = execute.execute_with_cancellation(
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
c:\users\sha\anaconda3\envs\tensorflow2\lib\site-packages\six.py in raise_from(value, from_value)
CancelledError: [_Derived_]RecvAsync is cancelled.
[[{{node Reshape_11/_38}}]] [Op:__inference_distributed_function_16087]
Function call stack:
distributed_function
Thanks for anyone help me!
Try to reduce the batch_size, it should work.