When the model takes sufficiently long to run (i.e. it has enough parameters and the data is big enough), and when profile_batch is enabled, the TensorBoard callback fails to write the training metrics to the log events (at least they are not visible in TensorBoard).
Here is the code used to get that failure:
import os.path as op
import time
import numpy as np
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Conv2D, Input
from tensorflow.keras.models import Model
size = 512
im = Input((size, size, 1))
im_conv = Conv2D(512, 3, padding='same', activation='relu')(im)
im_conv = Conv2D(1, 3, padding='same', activation='linear')(im_conv)
model = Model(im, im_conv)
model.compile(loss='mse', optimizer='adam', metrics=['mae'])
data = np.random.rand(1, size, size, 1)
run_id = f'{int(time.time())}'
log_dir = op.join('logs', run_id)
tboard_cback = TensorBoard(
log_dir=log_dir,
histogram_freq=0,
write_graph=False,
write_images=False,
profile_batch=2,
)
model.fit(
x=data,
y=data,
validation_data=[data, data],
callbacks=[tboard_cback,],
epochs=100,
verbose=0,
);
Here is the TensorBoard visualization I get:
Is there something wrong with the way I am using this callback?
I use Python 3.6.8, tensorflow 2.0.0 on GPU (but the behaviour is the same on CPU).
So apparently this is due to the profiling done in the callback. It can be disabled with profile_batch=0. The issue is ongoing and can be followed here: https://github.com/tensorflow/tensorboard/issues/2084
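For reference, a minimal sketch of the workaround, assuming the same setup as in the question (only profile_batch changes):
tboard_cback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=0,
    write_graph=False,
    write_images=False,
    profile_batch=0,  # disable profiling so the scalar metrics get written
)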
Related
I want to learn how to prepare training data for a neural network in Python. I found a simple example of a neural network that predicts the stock price. At the moment I am not interested in the accuracy of the trained network; I am interested in how to take arbitrary data and prepare it for feeding into the neural network.
As an example, I took these stocks over the past 5 years. As planned, the neural network accepts data for the last 50 days as input and predicts the price for the next 5 days. To do this, I read the .csv file and processed the data so that after the transformation I got two dataframes: the first one holds the input data and the second one the output.
The problem is that no matter what I do, I keep getting errors and so I cannot complete the training. What am I doing wrong? The code is shown below:
import matplotlib.pylab as plt
import torch
import random
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
import pandas_profiling as pprf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, BatchNormalization, LeakyReLU
from tensorflow.keras.layers import Activation, Input, MaxPooling1D, Dropout
from tensorflow.keras.layers import AveragePooling1D, Conv1D, Flatten
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.utils import plot_model
from IPython.display import display, Image
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
data = pd.read_csv('F:\\YNDX_ME.csv')[::]
data = data.drop('Date',axis=1)
data = data.drop('Adj Close',axis=1)
data = data.drop(np.where(data['Volume'] == 0)[0])
data = data.reset_index(drop=True)
#profiler = pprf.ProfileReport(data)
#profiler.to_file(r'F:\profiling.html')
days_edu = 50
days_pred = 5
df_edu_list = []
for i in range(len(data.index)-days_edu-days_pred+1):
    df_temp = []
    for j in range(days_edu):
        df_temp.extend(data.loc[i+j,:].tolist())
    df_edu_list.append(df_temp)
df_edu_out_list = []
for i in range(len(data.index)-days_edu-days_pred+1):
    df_temp = []
    for j in range(5):
        df_temp.extend(data.loc[i+j+days_edu,:].tolist())
    df_edu_out_list.append(df_temp)
df_edu_train = pd.DataFrame(df_edu_list[:int(len(df_edu_list)*0.8)])
df_edu_val = pd.DataFrame(df_edu_list[int(len(df_edu_list)*0.8):])
df_edu_train_out = pd.DataFrame(df_edu_out_list[:int(len(df_edu_out_list)*0.8)])
df_edu_val_out = pd.DataFrame(df_edu_out_list[int(len(df_edu_out_list)*0.8):])
df_edu_train = normalize(df_edu_train.values)
df_edu_val = normalize(df_edu_val.values)
df_edu_train_out = normalize(df_edu_train_out.values)
df_edu_val_out = normalize(df_edu_val_out.values)
df_edu_train = np.expand_dims(df_edu_train,axis=0)
df_edu_train_out = np.expand_dims(df_edu_train_out,axis=0)
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=5, padding="same", strides=1, input_shape= (959,250),data_format='channels_first'))
model.add(Conv1D(32, 5))
model.add(Dropout(0.3))
model.add(Conv1D(16, 5))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(25, activation=None))
optimizer = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(optimizer=optimizer, loss='mae', metrics=['accuracy'])
EPOCHS = 1000
model.fit(df_edu_train, df_edu_train_out, epochs=EPOCHS)
Error:
InvalidArgumentError: Conv2DCustomBackpropFilterOp only supports NHWC.
[[node gradient_tape/sequential/conv1d/Conv1D/Conv2DBackpropFilter
(defined at C:\Users\nick0\anaconda3\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:464)
]] [Op:__inference_train_function_1046]
Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/sequential/conv1d/Conv1D/Conv2DBackpropFilter:
In[0] sequential/conv1d/Conv1D/ExpandDims (defined at C:\Users\nick0\anaconda3\lib\site-packages\keras\layers\convolutional.py:231)
In[1] gradient_tape/sequential/conv1d/Conv1D/ShapeN:
In[2] gradient_tape/sequential/conv1d/Conv1D/Reshape:
Update:
I changed data_format='channels_first' to data_format='channels_last'. The training started, but as far as I understand it trained on the entire training set as a single example, i.e. the neural network thought there was only one sample and was trained on it specifically. How do I make the neural network take each row in turn? Is each row essentially a separate example?
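For what it's worth, here is a minimal sketch of one way to do that, assuming 5 features per day (so 250 = 50 * 5 columns per window) and dropping the two np.expand_dims calls from the code above. In Keras the first axis of the input arrays is the sample axis, so each 50-day window stays its own row:
# sketch only: reshape each row of 250 values into 50 timesteps with 5 channels
n_features = 5
x_train = df_edu_train.reshape(-1, days_edu, n_features)   # shape (n_samples, 50, 5)
x_val = df_edu_val.reshape(-1, days_edu, n_features)

model = Sequential()
model.add(Conv1D(32, 5, padding='same', input_shape=(days_edu, n_features)))  # channels_last
model.add(Conv1D(16, 5, padding='same'))
model.add(Flatten())
model.add(Dense(25, activation=None))
model.compile(optimizer=Adam(learning_rate=0.0001), loss='mae')
model.fit(x_train, df_edu_train_out, validation_data=(x_val, df_edu_val_out), epochs=EPOCHS)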
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np
model = ResNet50(weights='imagenet')
In this code there is the "wrapper" (that's what it's called) ResNet50. What other types of weights can I use for this? I tried looking around, but I don't even understand the source code; there is nothing conclusive there either.
You can find it in the Keras doc, or in the GitHub code.
There are only two options: either None, if you just want the architecture without the weights, or 'imagenet' to load the ImageNet weights.
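For illustration, a minimal sketch of the two options (using the same import as in the question):
model_pretrained = ResNet50(weights='imagenet')  # loads the pretrained ImageNet weights
model_untrained = ResNet50(weights=None)         # same architecture, randomly initialised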
Edit: how to use your own weights:
# Take a DenseNET201
backbone = tf.keras.applications.DenseNet201(input_shape=input_shape, weights=None, include_top=False)
# Change the model a little bit, because why not
input_image = tf.keras.layers.Input(shape=input_shape)
x = backbone(input_image)
x = tf.keras.layers.Conv2D(classes, (3, 3), padding='same', name='final_conv')(x)
x = tf.keras.layers.Activation(activation, name=activation)(x)
model = tf.keras.Model(input_image, x)
#... some additional code
# training part
optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
model.compile(loss=loss,
optimizer=optimizer,
metrics=['accuracy', f1_m, recall_m, precision_m])
callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_name)]
model.fit(train_generator, validation_data=validation_generator, validation_freq=1, epochs=10, callbacks=callbacks)
# using the callback, the weights will be saved in ckpt_name each epoch
# Inference part, just need to reinstance the model (lines after #Change part comment)
model.load_weights(ckpt_name)
results = model.predict(test_generator, verbose=1)
Obviously you don't need to change the model: you could simply have used x = backbone(input_image) and then model = tf.keras.Model(input_image, x).
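For completeness, a minimal sketch of that simpler variant (same assumptions as the snippet above: input_shape is defined and backbone is the DenseNet201 instance):
input_image = tf.keras.layers.Input(shape=input_shape)
x = backbone(input_image)
model = tf.keras.Model(input_image, x)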
This is a copy-paste of an issue I posted on the tensorflow Github.
System information
Have I written custom code: yes
OS Platform and Distribution: Linux Ubuntu 16.04
TensorFlow installed from: pip
TensorFlow version: 2.0.0b1
Python version: 3.6.8
CUDA/cuDNN version: V10.0.130
GPU model and memory: Quadro P5000 (16GB)
Describe the current behavior
I have a very complicated model solving an image-to-image problem. I also use a custom callback which at some point generates some noise using numpy.
When I use fit_generator on this model, it manages to do the first epoch, then on the second, third or fourth it hangs at the beginning of the epoch. I managed to see where the problem was happening, and it happens here: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/utils/data_utils.py#L875
Basically, if I put a timeout on the second get, it times out after a few successful epochs (sometimes just one). There is no error thrown, so I don't know why it hangs. Furthermore, if I debug at that point in the code, I can just execute the function synchronously and everything works just fine.
Code to reproduce the issue
I didn't manage to get a minimal example using fit_generator (it relies too much on my model, which is complex). However, I have a minimal example which reproduces the bug when I mimic the model_iteration function.
You need to install the following to make it work: pip install tensorflow-gpu==2.0.0b1 numpy tqdm
# imports
import time
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import callbacks as cbks
from tensorflow.keras.callbacks import Callback
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.engine import training_utils
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.utils import data_utils
from tensorflow.python.keras.utils import generic_utils
from tqdm import tqdm_notebook
# helper function (taken from https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/engine/training_generator.py#L500)
def _make_enqueued_generator(generator,
                             workers=1,
                             use_multiprocessing=False,
                             max_queue_size=10,
                             shuffle=False):
    enqueuer = data_utils.OrderedEnqueuer(
        generator, use_multiprocessing=use_multiprocessing, shuffle=shuffle)
    enqueuer.start(workers=workers, max_queue_size=max_queue_size)
    output_generator = enqueuer.get()
    return output_generator, enqueuer
# My silly callback
class Noise(Callback):
    def on_batch_end(self, batch, logs={}):
        image_shape = [1, 2**7, 2**7, 1]
        noise = np.random.normal(scale=1.0, size=image_shape)
# My data
batch_size = 8
n_samples_train = 720
x = np.random.rand(n_samples_train, 256, 256, 1)
im_gen_train = ImageDataGenerator().flow(x, batch_size=batch_size)
# My training set up (to mimic https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/engine/training_generator.py#L41)
data = im_gen_train
steps_per_epoch = int(n_samples_train / batch_size)
epochs = 20
max_queue_size=35
workers=35
use_multiprocessing=True
shuffle=False
initial_epoch=0
mode=1
steps_name='steps'
noise_cb = Noise()
noise_cb.on_train_batch_end = noise_cb.on_batch_end
callbacks=[noise_cb]
generator, enqueuer = _make_enqueued_generator(
im_gen_train,
workers=workers,
use_multiprocessing=use_multiprocessing,
max_queue_size=max_queue_size,
shuffle=shuffle)
callbacks = cbks.configure_callbacks(
callbacks,
Model(),
do_validation=False,
epochs=epochs,
steps_per_epoch=steps_per_epoch,
batch_size=batch_size,
samples=n_samples_train,
verbose=0, # Handle ProgBar as part of Callbacks once hooks are ready.
mode=mode,
)
callbacks._call_begin_hook(mode)
for epoch in tqdm_notebook(range(initial_epoch, epochs)):
    callbacks.on_epoch_begin(epoch, {})
    for step in tqdm_notebook(range(steps_per_epoch), leave=False):
        callbacks._call_batch_hook('train', 'begin', step, {})
        batch_data = next(generator)
        # I don't actually train a model, so I just sleep for this time, this would be the backprop
        time.sleep(0.1)
        callbacks._call_batch_hook('train', 'end', step, {})
If you leave it as such, it will hang after about 1, 2, 3, or 4 iterations.
You can comment out the noise = np.random.normal(scale=1.0, size=image_shape) line and see that it doesn't hang.
You can also modify tensorflow's source code and timeout here in the second get so you can debug.
Note also that if the sleep time is not long enough, the hang doesn't appear.
EDIT
I finally managed to put together a minimal example involving fit_generator directly:
# imports
import time
from keras_tqdm import TQDMNotebookCallback
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Input, Conv2D, Lambda, concatenate
from tensorflow.python.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import Sequence
# My silly callback
class Noise(Callback):
    def on_batch_end(self, batch, logs={}):
        image_shape = [1, 2**7, 2**7, 1]
        noise = np.random.normal(scale=1.0, size=image_shape)
# my metrics
def keras_psnr(y_true, y_pred):
    max_pixel = tf.math.reduce_max(y_true)
    min_pixel = tf.math.reduce_min(y_true)
    return tf.image.psnr(y_true, y_pred, max_pixel - min_pixel)

def keras_ssim(y_true, y_pred):
    max_pixel = tf.math.reduce_max(y_true)
    min_pixel = tf.math.reduce_min(y_true)
    return tf.image.ssim(y_true, y_pred, max_pixel - min_pixel)
# My data
class MergedGenerators(Sequence):
    def __init__(self, *generators):
        self.generators = generators
        # TODO add a check to verify that all generators have the same length

    def __len__(self):
        return len(self.generators[0])

    def __getitem__(self, index):
        return tuple([generator[index] for generator in self.generators])
batch_size = 8
n_samples_train = 720
size = 256
x = np.random.rand(n_samples_train, size, size, 1)
im_gen_train_1 = ImageDataGenerator().flow(x, batch_size=batch_size, seed=0)
im_gen_train_2 = ImageDataGenerator().flow(x, batch_size=batch_size, seed=0)
im_gen_train = MergedGenerators(im_gen_train_1, im_gen_train_2)
# my fake model
im = Input((None, None, 1))
conv = Conv2D(256, 3, padding='same')(im)
conv = Conv2D(256, 3, padding='same')(conv)
conv = Conv2D(1, 3, padding='same')(conv)
ident = Lambda(lambda x: x)(conv)
model = Model(im, ident)
model.compile(loss='mse', optimizer='adam', metrics=[keras_psnr, keras_ssim])
print(model.summary(line_length=150))
# My training set up
noise_cb = Noise()
noise_cb.on_train_batch_end = noise_cb.on_batch_end
tqdm_cb = TQDMNotebookCallback(metric_format="{name}: {value:e}")
tqdm_cb.on_train_batch_begin = tqdm_cb.on_batch_begin
tqdm_cb.on_train_batch_end = tqdm_cb.on_batch_end
model.fit_generator(
im_gen_train,
steps_per_epoch=int(n_samples_train / batch_size),
epochs=20,
max_queue_size=35,
workers=35,
use_multiprocessing=True,
shuffle=False,
callbacks=[noise_cb, tqdm_cb],
verbose=0,
)
It's not very bare, but at least it's directly usable (you just need to install keras-tqdm additionally).
This issue has been resolved in version 2.1.
Another fix is to use the new random number generation API of numpy, as advised here. That changes the line noise = np.random.normal(scale=1.0, size=image_shape) to noise = np.random.default_rng().normal(scale=1.0, size=image_shape). This fix works even in version 2.0.
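For reference, a minimal sketch of the callback with that fix applied (each call draws from a fresh numpy Generator instead of the global legacy state):
import numpy as np
from tensorflow.keras.callbacks import Callback

class Noise(Callback):
    def on_batch_end(self, batch, logs={}):
        image_shape = [1, 2**7, 2**7, 1]
        noise = np.random.default_rng().normal(scale=1.0, size=image_shape)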
This is a copy-paste of the answer I gave on Github.
I am trying to increment the per_process_gpu_memory_fraction value in my tf.GPUOptions() and then change the Keras session with set_session(); however, the memory fraction never actually changes. After the first run of the while loop, 319 MB is reserved, as shown in nvidia-smi, which
a) never gets released when clear_session() is called, and
b) doesn't go up on the next iteration of the while loop.
import GPUtil
import time
import tensorflow as tf
import numpy as np
from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
def model_trainer():
    y_pred = None
    errors = 0
    total_ram = GPUtil.getGPUs()[0].memoryTotal
    total_ram_allowed = GPUtil.getGPUs()[0].memoryTotal * 0.90
    mem_amount = 0.005  # intentionally allocated a small amount so it needs to
                        # increment the mem_amount
    x_train = np.empty((10000, 100))
    y_train = np.random.randint(0, 9, size=10000)
    y_train = to_categorical(y_train, 10)

    while y_pred is None:
        print("mem", mem_amount)
        if total_ram_allowed > total_ram * mem_amount and GPUtil.getGPUs()[0].memoryFree > total_ram * mem_amount:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
            config = tf.ConfigProto(
                intra_op_parallelism_threads=2,
                inter_op_parallelism_threads=2,
                gpu_options=gpu_options)
            sess = tf.Session(config=config)
            set_session(sess)

            model = Sequential()
            model.add(Dense(units=64, activation='relu', input_dim=100))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=10, activation='softmax'))

            model.compile(loss='categorical_crossentropy',
                          optimizer='sgd',
                          metrics=['accuracy'])

            try:
                print(sess)
                model.fit(x_train, y_train, epochs=5, batch_size=32)
                y_pred = model.predict(x_train)
            except (ResourceExhaustedError, UnknownError) as e:
                if mem_amount > 1.0:
                    raise ValueError('model too large for vram')
                else:
                    mem_amount += 0.05
                clear_session()
                errors += 1
                pass
            else:
                clear_session()


if __name__ == "__main__":
    model_trainer()
The puzzling thing is that Keras willingly takes the new session (as shown by a get_session() call), but won't apply the new GPUOptions.
In addition to the example above I have tried doing:
clear_session()
del model
clear_session()
del model
gc.collect()
None of this has worked in releasing the VRAM.
My overall goal is to use "trial and error" until the process has enough VRAM to train on (there seems to be no good way of figuring out how much VRAM a Keras model needs without just running it), so that I can run multiple models in parallel on a single GPU. When a ResourceExhaustedError occurs, I want to release the VRAM held by Keras and then try again with a larger amount of VRAM. Is there any way to accomplish this?
After searching for a while, I found that TensorFlow will only take VRAM and will never release it until it dies, even if del model and clear_session() are used. I also tried the method shown here (https://github.com/keras-team/keras/issues/9379), which uses:
from keras import backend as K
K.clear_session()
from numba import cuda
cuda.select_device(0)
cuda.close()
This resulted in an error for me, because when TensorFlow tried to access the GPU again, its pointer to the memory space was invalid (it had been killed with cuda.close()). Thus the only way around it is to use processes, not threads (I tried threads too, with the same issue as before).
The other thing I found is that, while there are methods to try to estimate the amount of VRAM a Keras model will use, they are not very accurate (see: How to determine needed memory of Keras model?). I also tried computing it directly from the Keras layers, and that varied wildly, so it wasn't accurate either. So that really only leaves you with trial and error, catching the ResourceExhaustedError and trying again.
Below is my code for running multiple different Keras models on a single GPU.
import GPUtil
import time
import multiprocessing
import tensorflow as tf
import numpy as np
from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
def model_trainer():
    mem_amount = 0.05
    x_train = np.empty((100000, 100))
    y_train = np.random.randint(0, 9, size=100000)
    y_train = to_categorical(y_train, 10)

    manager = multiprocessing.Manager()
    return_dict = manager.dict()

    def worker(mem_amount, return_dict):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
        config = tf.ConfigProto(
            intra_op_parallelism_threads=2,
            inter_op_parallelism_threads=2,
            gpu_options=gpu_options)
        sess = tf.Session(config=config)
        set_session(sess)

        model = Sequential()
        model.add(Dense(units=64, activation='relu', input_dim=100))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=2048, activation='relu'))
        model.add(Dense(units=10, activation='softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='sgd',
                      metrics=['accuracy'])

        try:
            get_session()
            model.fit(x_train, y_train, epochs=5, batch_size=1000)
            return_dict["valid"] = True
        except (ResourceExhaustedError, UnknownError) as e:
            return

    while "valid" not in list(return_dict.keys()):
        print("mem", mem_amount)
        total_ram = GPUtil.getGPUs()[0].memoryTotal
        total_ram_allowed = GPUtil.getGPUs()[0].memoryTotal * 0.90
        # can add in a for loop to have multiple models
        if total_ram_allowed > total_ram * mem_amount and GPUtil.getGPUs()[0].memoryFree > total_ram * mem_amount:
            p = multiprocessing.Process(target=worker, args=(mem_amount, return_dict))
            p.start()
            p.join()
            print(return_dict.values())
            if "valid" not in list(return_dict.keys()):
                if mem_amount > 1.0:
                    raise ValueError('model too large for vram')
                else:
                    mem_amount += 0.05
            else:
                break
        else:
            time.sleep(10)


if __name__ == "__main__":
    model_trainer()
I am running a keras script (no direct call to theano in my script) and I get the following error:
TypeError: ('An update must have the same type as the original shared
variable (shared_var=<TensorType(float32, matrix)>,
shared_var.type=TensorType(float32, matrix),
update_val=Elemwise{add,no_inplace}.0,
update_val.type=TensorType(float64, matrix)).',
'If the difference is related to the broadcast pattern,
you can call the tensor.unbroadcast(var, axis_to_unbroadcast[, ...])
function to remove broadcastable dimensions.')
I have seen the error from folks running theano directly, but not through keras. Not sure what I should do, since I am not dealing with tensors directly.
The problem was that there is a change in the keras version (I am currently using keras 0.3.2 with theano 0.8.0), and what used to be fine does not work well with the new keras version.
The following was the original code; see the fix below.
from keras.models import Sequential
import keras.optimizers
from keras.layers.core import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Activation
from keras.optimizers import SGD, Adam
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, RegressorMixin
class NnRegression(BaseEstimator, RegressorMixin):
    def __init__(self, apply_standart_scaling=True,
                 dropx=[0.2, 0.5, 0.5], nb_neuronx=[50, 30], nb_epoch=105, validation_split=0.,
                 verbose=1):
        self.apply_standart_scaling = apply_standart_scaling
        self.dropx = dropx
        self.nb_neuronx = nb_neuronx
        self.nb_epoch = nb_epoch
        self.validation_split = validation_split
        self.verbose = verbose

    def fit(self, X, y):
        nb_features = X.shape[1]
        self.standart_scaling = StandardScaler() if self.apply_standart_scaling else None
        if self.standart_scaling:
            X = self.standart_scaling.fit_transform(X)

        model = Sequential()
        model.add(Dropout(input_shape=(nb_features,), p=self.dropx[0]))
        model.add(Dense(output_dim=self.nb_neuronx[0], init='glorot_uniform'))
        model.add(PReLU())
        model.add(BatchNormalization(self.nb_neuronx[0],))
        model.add(Dropout(self.dropx[1]))
        model.add(Dense(self.nb_neuronx[1], init='glorot_uniform'))
        model.add(PReLU())
        model.add(BatchNormalization(self.nb_neuronx[0],))
        model.add(Dropout(self.dropx[2]))
        model.add(Dense(1, init='glorot_uniform'))

        nn_verbose = 1 if self.verbose > 0 else 0
        optz = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.compile(optimizer=Adam(), loss='mse')
        model.fit(X, y, batch_size=16,
                  nb_epoch=self.nb_epoch, validation_split=self.validation_split, verbose=nn_verbose)
        self.model = model

    def predict(self, X):
        if self.standart_scaling:
            X = self.standart_scaling.transform(X)
        return self.model.predict_proba(X, verbose=0)
Well, it turns out that the problem is this single line of code:
model.add(BatchNormalization(self.nb_neuronx[0],))
It should actually be:
model.add(BatchNormalization())
because the number of neurons has no business being passed to the normalization layer (however, this did not cause problems in a previous keras version).
This apparently causes theano to generate new weights that are float64 instead of float32, which triggers the error message above.