When trying to fit a Keras model written with the tensorflow.keras API and fed through an iterator made from a tf.data.Dataset, training complains about the steps_per_epoch argument, even though I've set it to a concrete value.
Below is my model class:
import tensorflow as tf
import numpy as np
from typing import Union, List
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import layers
from tftools import TFTools
class TestServe():
    def __init__(self, tfrecords: Union[List[tf.train.Example], tf.train.Example],
                 batch_size: int = 10, input_shape: tuple = (64, 23)) -> None:
        self.tfrecords = tfrecords
        self.batch_size = batch_size
        self.input_shape = input_shape

    def get_model(self):
        ins = layers.Input(shape=(64, 23))
        l = layers.Reshape((*self.input_shape, 1))(ins)
        l = layers.Conv2D(8, (30, 23), padding='same', activation='relu')(l)
        l = layers.MaxPool2D((4, 5), strides=(4, 5))(l)
        l = layers.Conv2D(16, (3, 3), padding='same', activation='relu')(l)
        l = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(l)
        l = layers.MaxPool2D((2, 2), strides=(2, 2))(l)
        l = layers.Flatten()(l)
        out = layers.Dense(1, activation='softmax')(l)
        return tf.keras.models.Model(ins, out)

    def train(self):
        # Create Dataset
        dataset = TFTools.create_dataset(self.tfrecords)
        dataset = dataset.repeat(6).batch(self.batch_size)
        val_iterator = dataset.take(300).make_one_shot_iterator()
        train_iterator = dataset.skip(300).make_one_shot_iterator()

        model = self.get_model()
        model.summary()
        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(train_iterator, validation_data=val_iterator,
                  epochs=10, verbose=1, steps_per_epoch=20)

    def predict(self, X: np.array) -> np.array:
        pass

ts = TestServe(['./ok.tfrecord', './nok.tfrecord'])
ts.train()
But as soon as I start training, before the first epoch is finished, I get an exception from TensorFlow:
2019-06-13 14:22:25.393398: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 1995445000 Hz
2019-06-13 14:22:25.393681: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x2f7d120 executing computations on platform Host. Devices:
2019-06-13 14:22:25.393708: I tensorflow/compiler/xla/service/service.cc:158] StreamExecutor device (0): <undefined>, <undefined>
Epoch 1/2
19/20 [===========================>..] - ETA: 0s - loss: 1.1921e-07 - acc: 1.0000Traceback (most recent call last):
  File "TestServe.py", line 62, in <module>
    ts.train()
  File "TestServe.py", line 56, in train
    epochs=2, verbose=1, callbacks=callbacks, steps_per_epoch=20) #The steps_per_epoch is typically samples_per_epoch / batch_size
  File "/home/josef/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py", line 880, in fit
    validation_steps=validation_steps)
  File "/home/josef/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_arrays.py", line 364, in model_iteration
    validation_in_fit=True)
  File "/home/josef/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_arrays.py", line 202, in model_iteration
    steps_per_epoch)
  File "/home/josef/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_arrays.py", line 76, in _get_num_samples_or_steps
    'steps_per_epoch')
  File "/home/josef/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_utils.py", line 230, in check_num_samples
    if check_steps_argument(ins, steps, steps_name):
  File "/home/josef/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_utils.py", line 960, in check_steps_argument
    input_type=input_type_str, steps_name=steps_name))
ValueError: When using data tensors as input to a model, you should specify the `steps_per_epoch` argument.
The original dataset contains around 1500 samples, but I want to join multiple TFRecord files into one TFRecordDataset, so I won't have information about the length.
Has anyone seen something similar before? I don't know where to go for help, since the tf.keras API is relatively new. The create_dataset function just returns the dataset mapped with the right parse function.
Found the solution.
There is not only a steps_per_epoch parameter but also a validation_steps parameter, which you have to specify as well.
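A minimal sketch of the corrected call, assuming the validation iterator yields the 300 batches taken above (adjust validation_steps to your data):

model.fit(train_iterator, validation_data=val_iterator,
          epochs=10, verbose=1,
          steps_per_epoch=20,     # batches drawn from train_iterator per epoch
          validation_steps=300)   # batches drawn from val_iterator per validation pass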
I saw this error when I tried to run a TensorFlow 2.0 model while I actually had an older version (TensorFlow 1.14) installed locally.
To upgrade to the latest TensorFlow version, run:
python -m pip install --upgrade pip
python -m pip install --upgrade tensorflow
Related
I am using Python 3.8, TensorFlow 2.5.0 and Keras 2.3.1, and I am trying to build a model, but I get an error from Keras.
This is my code:
import cv2
import os
import numpy as np
from keras.layers import Conv2D, Dropout, Flatten, Dense, MaxPooling2D, MaxPool2D
import keras.layers.normalization
#from tensorflow.keras.layers import Conv2D, Dropout, Flatten, Dense, MaxPooling2D, MaxPool2D
from keras_preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from keras.models import Sequential
import pandas as pd
import random
from tensorflow.python.keras.utils.np_utils import to_categorical

count = 0
images = []
classNo = []
labelFile = 'signnames.csv'
classes = 43
testRatio = 0.2  # if 1000 images, the split keeps 200 for testing
validationRatio = 0.2  # if 1000 images, 20% of the remaining 800 (160) are kept for validation
path_current = os.getcwd()
imageDim = (32, 32, 3)

#### IMPORTING THE IMAGES FROM THE TRAIN FOLDER
for j in range(classes):
    path = os.path.join(path_current, 'train', str(j))
    imagesList = os.listdir(path)
    for i in imagesList:
        image = cv2.imread(path + '\\' + i)
        imageResized = cv2.resize(image, (32, 32))
        imageResized = np.array(imageResized)
        images.append(imageResized)
        classNo.append(count)
    count += 1

images = np.array(images)
classNo = np.array(classNo)
print(images.shape, classNo.shape)

##### Split Data - make the train set
X_train, X_test, y_train, y_test = train_test_split(images, classNo, test_size=testRatio)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validationRatio)

##### preprocess all the images from train, test, validation
# (preprocessing() is defined elsewhere in the full script)
X_train = np.array(list(map(preprocessing, X_train)))  # for all the images
X_validation = np.array(list(map(preprocessing, X_validation)))
X_test = np.array(list(map(preprocessing, X_test)))
cv2.imshow("GrayScale Images", X_train[random.randint(0, len(X_train) - 1)])  # just to verify the train data
# cv2.waitKey(5000)

##### add a depth of 1 - for better lines
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_validation = X_validation.reshape(X_validation.shape[0], X_validation.shape[1], X_validation.shape[2], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

#### augmentation of images: create more, similar images from the existing ones to make the model more generic
dataGen = ImageDataGenerator(width_shift_range=0.1,  # 10%
                             height_shift_range=0.1,
                             zoom_range=0.2,
                             shear_range=0.1,  # distorted along an axis (tilted)
                             rotation_range=10)  # degrees
dataGen.fit(X_train)
batches = dataGen.flow(X_train, y_train, batch_size=20)  # generates 20 images per call
X_batch, y_batch = next(batches)

####### from labels to one-hot encoding (a matrix of 0s and 1s based on the number of classes)
y_test = to_categorical(y_test, classes)
y_train = to_categorical(y_train, classes)
y_validation = to_categorical(y_validation, classes)

########### convolutional neural network model
def myModel():
    nodesNr = 500
    filterNr = 60  # chosen relative to the image size so the filter doesn't remove pixels
    filterSize = (5, 5)  # the kernel that moves over the image to extract features
    # making padding
    filterSize2 = (3, 3)
    poolSize = (2, 2)  # generalizes more, to reduce overfitting (fitting detail and noise in training)

    model = Sequential()
    model.add(Conv2D(filterNr, filterSize, activation='relu', input_shape=X_train.shape[1:]))
    model.add(Conv2D(filterNr, filterSize, activation='relu'))
    model.add(MaxPooling2D(pool_size=poolSize))
    model.add(Conv2D(filterNr // 2, filterSize2, activation='relu'))
    model.add(Conv2D(filterNr // 2, filterSize2, activation='relu'))
    model.add(MaxPool2D(pool_size=poolSize))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(nodesNr, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(classes, activation='softmax'))  # output layer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

#### TRAIN
model = myModel()
print(model.summary())
model.save('traffic_classifier.h5')
I am using PyCharm, and I get the error from the first Keras import, at line 8 of my code.
These are the errors:
Using TensorFlow backend.
2021-05-15 20:43:16.281415: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cudart64_110.dll
Traceback (most recent call last):
  File "E:/FACULTATE ANUL 3 SEMESTRUL 2/Procesarea Imaginilor/proiect/main.py", line 8, in <module>
    from keras.layers import Conv2D,Dropout, Flatten, Dense,MaxPooling2D, MaxPool2D
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\__init__.py", line 3, in <module>
    from . import utils
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\utils\__init__.py", line 6, in <module>
    from . import conv_utils
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\utils\conv_utils.py", line 9, in <module>
    from .. import backend as K
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\backend\__init__.py", line 1, in <module>
    from .load_backend import epsilon
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\backend\load_backend.py", line 90, in <module>
    from .tensorflow_backend import *
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\backend\tensorflow_backend.py", line 5, in <module>
    import tensorflow as tf
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\__init__.py", line 41, in <module>
    from tensorflow.python.tools import module_util as _module_util
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\__init__.py", line 48, in <module>
    from tensorflow.python import keras
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\__init__.py", line 25, in <module>
    from tensorflow.python.keras import models
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\models.py", line 20, in <module>
    from tensorflow.python.keras import metrics as metrics_module
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\metrics.py", line 37, in <module>
    from tensorflow.python.keras import activations
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\activations.py", line 18, in <module>
    from tensorflow.python.keras.layers import advanced_activations
  File "C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\layers\__init__.py", line 146, in <module>
    from tensorflow.python.keras.layers.normalization import LayerNormalization
ImportError: cannot import name 'LayerNormalization' from 'tensorflow.python.keras.layers.normalization' (C:\Users\My-Pc\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\layers\normalization\__init__.py)
Try importing your modules from the tensorflow.keras namespace instead of the standalone keras package.
For example:
from tensorflow.keras.models import Sequential
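Applied to the imports in the question, the tensorflow.keras equivalents would look roughly like this (a sketch; these paths exist in TensorFlow 2.5):

from tensorflow.keras.layers import Conv2D, Dropout, Flatten, Dense, MaxPooling2D, MaxPool2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical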
I had the same error with Python 3.8, TensorFlow 2.5.0 and Keras 2.3.1. I went through numerous solutions from all sources. What fixed it for me was downgrading Python to 3.7. For some reason Keras LayerNormalization seems to be incompatible with Python 3.8 locally on my computer, even though I was unable to replicate the problem on Colab.
If you use Anaconda, you could create a new environment just for Tensorflow. Here is what worked for me:
conda create -n tensorflow_env tensorflow
conda activate tensorflow_env
which installed Python 3.7.10 and Tensorflow 2.0.0. You can then upgrade Tensorflow to 2.5.0.
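For example, from inside the activated environment (assuming pip targets that environment):

pip install --upgrade tensorflow==2.5.0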
The module has been renamed. This import worked for me:
from keras.layers.normalization import layer_normalization
It seems to be a version mismatch between Python, TensorFlow, and Keras. Here is the combination that worked for me and got rid of the layer_normalization error: Python 3.8.6 / tensorflow==2.5.0 / keras==2.4.3.
Might not be 100% related to the original question, but I landed here trying to solve this on a MacBook Pro M1 in a conda environment.
Joining bits and pieces from multiple places, my ultimate fix was:
pip uninstall -y tensorflow keras tf-nightly keras-nightly
python -m pip install tensorflow-macos
If you want to install TensorFlow in your base environment, use the following command:
pip install tensorflow==2.2.0 --user
Chiming in with the 2022 solution - I had the same issue with
tensorflow == 2.5.0
python == 3.9
An upgrade to tensorflow == 2.7.0 did it for me
pip install --upgrade tensorflow
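Or, to pin the exact version that worked rather than whatever is latest:

pip install tensorflow==2.7.0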
I use Keras for training a model (Theano backend). I've created a dataset of images from Google, using a browser extension to download them. But I get the error below.
When I start running my code, some time passes and then the error appears. If target_size is smaller than the width and height of a picture, the first error appears; if target_size equals the width and height, the second.
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
import os
import keras
import joblib
from keras.preprocessing.image import ImageDataGenerator
train_images = 'C:\\Users\\Администратор\\AppData\\Local\\Programs\\Python\\Python36-32\\train_images'
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(36, 36, 3)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

datagen = ImageDataGenerator(rescale=1. / 255)
train_generator = datagen.flow_from_directory(
    train_images,
    target_size=(36, 36),
    batch_size=4,
    class_mode='binary')

model.fit(np.array(train_generator), epochs=10, validation_split=0.1)
The error (let me know if you need more details):
C:\Users\Администратор>C:\Users\Администратор\AppData\Local\Programs\Python\Python36-32\image_guess.py
Using Theano backend.
WARNING (theano.configdefaults): g++ not available, if using conda: `conda install m2w64-toolchain`
C:\Users\Администратор\AppData\Local\Programs\Python\Python36-32\lib\site-packages\theano\configdefaults.py:560: UserWarning: DeprecationWarning: there is no c++ compiler. This is deprecated and with Theano 0.11 a c++ compiler will be mandatory
  warnings.warn("DeprecationWarning: there is no c++ compiler."
WARNING (theano.configdefaults): g++ not detected ! Theano will be unable to execute optimized C-implementations (for both CPU and GPU) and will default to Python implementations. Performance will be severely degraded. To remove this warning, set Theano flags cxx to an empty string.
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
Found 336 images belonging to 2 classes.
Traceback (most recent call last):
  File "C:\Users\Администратор\AppData\Local\Programs\Python\Python36-32\image_guess.py", line 32, in <module>
    model.fit(np.array(train_generator), epochs=10, validation_split = 0.1)
  File "C:\Users\Администратор\AppData\Local\Programs\Python\Python36-32\lib\site-packages\keras_preprocessing\image\iterator.py", line 104, in __next__
    return self.next(*args, **kwargs)
  File "C:\Users\Администратор\AppData\Local\Programs\Python\Python36-32\lib\site-packages\keras_preprocessing\image\iterator.py", line 116, in next
    return self._get_batches_of_transformed_samples(index_array)
  File "C:\Users\Администратор\AppData\Local\Programs\Python\Python36-32\lib\site-packages\keras_preprocessing\image\iterator.py", line 231, in _get_batches_of_transformed_samples
    x = img_to_array(img, data_format=self.data_format)
  File "C:\Users\Администратор\AppData\Local\Programs\Python\Python36-32\lib\site-packages\keras_preprocessing\image\utils.py", line 309, in img_to_array
    x = np.asarray(img, dtype=dtype)
  File "C:\Users\Администратор\AppData\Local\Programs\Python\Python36-32\lib\site-packages\numpy\core\_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)
TypeError: float() argument must be a string or a number, not 'JpegImageFile'
When the model takes sufficiently long to infer (i.e. it has enough parameters and the data is big enough), and when profile_batch is on, the TensorBoard callback fails to write the training metrics to the log events (at least they are not visible in TensorBoard).
Here is the code used to get that failure:
import os.path as op
import time
import numpy as np
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Conv2D, Input
from tensorflow.keras.models import Model
size = 512
im = Input((size, size, 1))
im_conv = Conv2D(512, 3, padding='same', activation='relu')(im)
im_conv = Conv2D(1, 3, padding='same', activation='linear')(im_conv)
model = Model(im, im_conv)
model.compile(loss='mse', optimizer='adam', metrics=['mae'])
data = np.random.rand(1, size, size, 1)
run_id = f'{int(time.time())}'
log_dir = op.join('logs', run_id)
tboard_cback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=0,
    write_graph=False,
    write_images=False,
    profile_batch=2,
)
model.fit(
    x=data,
    y=data,
    validation_data=[data, data],
    callbacks=[tboard_cback,],
    epochs=100,
    verbose=0,
)
Here is the TensorBoard viz I have (screenshot omitted).
Is there something wrong with the way I am using this callback?
I use Python 3.6.8, tensorflow 2.0.0 on GPU (but the behaviour is the same on CPU).
So apparently this is due to the profiling done in the callback. You can disable it via profile_batch=0. The issue is ongoing and can be followed here: https://github.com/tensorflow/tensorboard/issues/2084
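Concretely, that is the same callback construction as above with profiling turned off:

tboard_cback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=0,
    write_graph=False,
    write_images=False,
    profile_batch=0,  # 0 disables the profiler, so training metrics are written to the logs
)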
I have an issue running a Keras model on a Google Cloud Platform instance.
The model is the following:
n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
verbose, epochs, batch_size = 1, 1, 64  # low number of epochs just for testing purposes

with tf.device('/cpu:0'):
    m = Sequential()
    m.add(CuDNNLSTM(20, input_shape=(n_timesteps, n_features)))
    m.add(LeakyReLU(alpha=0.1))
    m.add(RepeatVector(n_outputs))
    m.add(CuDNNLSTM(20, return_sequences=True))
    m.add(LeakyReLU(alpha=0.1))
    m.add(TimeDistributed(Dense(20)))
    m.add(LeakyReLU(alpha=0.1))
    m.add(TimeDistributed(Dense(1)))

self.model = multi_gpu_model(m, gpus=8)
self.model.compile(loss='mse', optimizer='adam')
self.model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose)
As you can see from the code above, I run the model on a machine with 8 GPUs (Nvidia Tesla K80).
Training works well, without any errors. However, prediction fails and returns the following error:
W tensorflow/core/framework/op_kernel.cc:1502] OP_REQUIRES failed at cudnn_rnn_ops.cc:1336 : Unknown: CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1285): 'cudnnSetTensorNdDescriptor( tensor_desc.get(), data_type, sizeof(dims) / sizeof(dims[0]), dims, strides)'
Here is the code used to run the prediction:
self.model.predict(input_x)
What I've noticed is that if I remove the code for multi-GPU data parallelism, the code works well using a single GPU.
To be more precise, if I comment out this line, the code works without error:
self.model = multi_gpu_model(m, gpus=8)
What am I missing?
virtualenv information
cudatoolkit - 10.0.130
cudnn - 7.6.4
keras - 2.2.4
keras-applications - 1.0.8
keras-base - 2.2.4
keras-gpu - 2.2.4
python - 3.6
UPDATE
train_x.shape = (1441, 288, 1)
train_y.shape = (1441, 288, 1)
input_x.shape = (1, 288, 1)
After Olivier Dehaene's reply I tried his suggestion and it worked.
I tried to modify the input_x shape in order to obtain (8, 288, 1).
In order to do that I also modified train_x and train_y shapes.
Here is a recap:
train_x.shape = (8065, 288, 1)
train_y.shape = (8065, 288, 1)
input_x.shape = (8, 288, 1)
But now I get the same error in the training phase, on this line:
self.model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose)
From the tf.keras.utils.multi_gpu_model documentation we can see that it works in the following way:
Divide the model's input(s) into multiple sub-batches.
Apply a model copy on each sub-batch. Every model copy is executed on a dedicated GPU.
Concatenate the results (on CPU) into one big batch.
You are triggering the error because the input of the CuDNNLSTM layer is empty for at least one of the model copies. This is because the divide operation requires that the batch size split across GPUs is non-zero: batch_size // n_gpus > 0. (This likely also explains the training failure in the update: with 8065 samples and batch_size=64, the last batch contains 8065 % 64 = 1 sample, which cannot be split across 8 GPUs.)
Try this code out:
input_x = np.random.randn(8, n_timesteps, n_features)
model.predict(input_x)
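If the same constraint is what breaks training (see the note above), one possible workaround is to trim the training set so no partial final batch remains; a sketch under that assumption:

batch_size = 64  # with 8 GPUs, each replica receives batch_size // 8 = 8 samples
n_usable = (train_x.shape[0] // batch_size) * batch_size  # drops the trailing partial batch (8065 -> 8064)
train_x, train_y = train_x[:n_usable], train_y[:n_usable]
self.model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose)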
I have installed TensorFlow and TFLearn on my Jetson TX1. TensorFlow works, and the program I'm trying to run works on my Mac, but I get this error when I run it on the Jetson.
Traceback (most recent call last):
  File "net.py", line 164, in <module>
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.00001)
  File "/usr/local/lib/python3.5/dist-packages/tflearn/layers/estimator.py", line 174, in regression
    loss = objectives.get(loss)(incoming, placeholder)
  File "/usr/local/lib/python3.5/dist-packages/tflearn/objectives.py", line 66, in categorical_crossentropy
    keepdims=True)
TypeError: reduce_sum() got an unexpected keyword argument 'keepdims'
The code for the neural net:
# Network building
net = tflearn.input_data([None, 25])
net = tflearn.embedding(net, input_dim=len(words), output_dim=256) #Embedding instead of one hot encoding.
net = tflearn.lstm(net, 256, dropout=0.9) #0.9, 0.00001, 30 was good -->63%
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy', learning_rate=0.00001)
# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(x_train, y_train, n_epoch=15, validation_set=(x_test, y_test), show_metric=True, batch_size=30)
model.save('mod.model')
For TensorFlow v1.4 or below, the parameter to preserve dimensions is written keep_dims (with an underscore). The change to keepdims (currently with backward compatibility) was introduced in v1.5.
It is thus possible that your TFLearn version is too recent for your TensorFlow. Upgrading the latter may solve your problem.
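For illustration, here is the same reduction spelled for each API generation (a sketch; x stands for any tensor):

# TensorFlow <= 1.4
total = tf.reduce_sum(x, axis=-1, keep_dims=True)
# TensorFlow >= 1.5 (keep_dims remained for a while as a deprecated alias)
total = tf.reduce_sum(x, axis=-1, keepdims=True)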