In TF 2, I have tried and failed to make Keras model.fit() work on my multi-output model with a custom loss that uses the targets and predictions of all outputs (specifically, 2 outputs).
When I tried this on a model built with the Keras functional API, I got the error: "SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found ..."
meaning I can't use my loss function, because it returns an eager tensor into a Keras DAG that works with symbolic tensors (the functional API model). To get around this, I used model.add_loss() instead of passing my loss function into model.compile(), but I believe this hogged GPU memory and caused OOM errors.
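Roughly, my add_loss() attempt looked like the sketch below. This is a reconstruction rather than my exact code; it uses the make_model() and discriminative_loss() shown further down, with hard-coded stand-ins for my feature/sequence sizes and loss_const:
# Rough reconstruction of the add_loss() attempt (not the exact code)
model = make_model(2049, 1847)    # (features, sequences)
input_layer, piano_true, noise_true = model.inputs
piano_pred, noise_pred = model.outputs
# Attach the symbolic loss tensor to the model instead of passing a loss to compile()
model.add_loss(tf.math.reduce_mean(
    discriminative_loss(piano_true, noise_true, piano_pred, noise_pred, 0.05)))
model.compile(optimizer=tf.keras.optimizers.RMSprop(clipvalue=0.9))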
I've tried two workarounds: putting my functional API model inside a Keras subclassed model, and making a completely new Keras subclassed model.
Workaround 1 is shown in the code below; it runs, but gives me NaNs across the training epochs (under a variety of gradient-clipping values) and produces 0-valued outputs.
Workaround 2 gives me an error inside the overridden call() method, because the inputs param has different shapes at compile time and at run time: my model (somewhat unusually) has 3 inputs, one being the actual input to the network and the other two being the targets for that sample. This is so I can get each sample's targets into the loss function.
from scipy.io import wavfile
import scipy.signal as sg
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, SimpleRNN, Dense, Lambda, TimeDistributed, Layer, LSTM, Bidirectional, BatchNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.activations import relu
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import datetime
import math
import random
import json
import os
import sys
# Loss function
def discriminative_loss(piano_true, noise_true, piano_pred, noise_pred, loss_const):
    last_dim = piano_pred.shape[1] * piano_pred.shape[2]
    return (
        tf.math.reduce_mean(tf.reshape(noise_pred - noise_true, shape=(-1, last_dim)) ** 2, axis=-1) -
        (loss_const * tf.math.reduce_mean(tf.reshape(noise_pred - piano_true, shape=(-1, last_dim)) ** 2, axis=-1)) +
        tf.math.reduce_mean(tf.reshape(piano_pred - piano_true, shape=(-1, last_dim)) ** 2, axis=-1) -
        (loss_const * tf.math.reduce_mean(tf.reshape(piano_pred - noise_true, shape=(-1, last_dim)) ** 2, axis=-1))
    )
def make_model(features, sequences, name='Model'):
    input_layer = Input(shape=(sequences, features), dtype='float32',
                        name='piano_noise_mixed')
    piano_true = Input(shape=(sequences, features), dtype='float32',
                       name='piano_true')
    noise_true = Input(shape=(sequences, features), dtype='float32',
                       name='noise_true')

    x = SimpleRNN(features // 2,
                  activation='relu',
                  return_sequences=True)(input_layer)
    piano_pred = TimeDistributed(Dense(features), name='piano_hat')(x)    # source 1 branch
    noise_pred = TimeDistributed(Dense(features), name='noise_hat')(x)    # source 2 branch

    model = Model(inputs=[input_layer, piano_true, noise_true],
                  outputs=[piano_pred, noise_pred])
    return model
# Model "wrapper" for many-input loss function
class RestorationModel2(Model):
def __init__(self, model, loss_const):
super(RestorationModel2, self).__init__()
self.model = model
self.loss_const = loss_const
def call(self, inputs):
return self.model(inputs)
def compile(self, optimizer, loss):
super(RestorationModel2, self).compile()
self.optimizer = optimizer
self.loss = loss
def train_step(self, data):
# Unpack data - what generator yeilds
x, piano_true, noise_true = data
with tf.GradientTape() as tape:
piano_pred, noise_pred = self.model((x, piano_true, noise_true), training=True)
loss = self.loss(piano_true, noise_true, piano_pred, noise_pred, self.loss_const)
trainable_vars = self.model.trainable_variables
gradients = tape.gradient(loss, trainable_vars)
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
return {'loss': loss}
def test_step(self, data):
x, piano_true, noise_true = data
piano_pred, noise_pred = self.model((x, piano_true, noise_true), training=False)
loss = self.loss(piano_true, noise_true, piano_pred, noise_pred, self.loss_const)
return {'loss': loss}
def make_imp_model(features, sequences, loss_const=0.05,
                   optimizer=tf.keras.optimizers.RMSprop(clipvalue=0.7),
                   name='Restoration Model', epsilon=10 ** (-10)):
    # NEW semi-imperative model
    model = RestorationModel2(make_model(features, sequences, name='Training Model'),
                              loss_const=loss_const)
    model.compile(optimizer=optimizer, loss=discriminative_loss)
    return model
# MODEL TRAIN & EVAL FUNCTION
def evaluate_source_sep(train_generator, validation_generator,
                        num_train, num_val, n_feat, n_seq, batch_size,
                        loss_const, epochs=20,
                        optimizer=tf.keras.optimizers.RMSprop(clipvalue=0.75),
                        patience=10, epsilon=10 ** (-10)):
    print('Making model...')    # IMPERATIVE MODEL - customize fit
    model = make_imp_model(n_feat, n_seq, loss_const=loss_const, optimizer=optimizer, epsilon=epsilon)

    print('Going into training now...')
    hist = model.fit(train_generator,
                     steps_per_epoch=math.ceil(num_train / batch_size),
                     epochs=epochs,
                     validation_data=validation_generator,
                     validation_steps=math.ceil(num_val / batch_size),
                     callbacks=[EarlyStopping('val_loss', patience=patience, mode='min')])
    print(model.summary())
# NEURAL NETWORK DATA GENERATOR
def my_dummy_generator(num_samples, batch_size, train_seq, train_feat):
    while True:
        for offset in range(0, num_samples, batch_size):
            # Initialise x, y1 and y2 arrays for this batch
            x, y1, y2 = (np.empty((batch_size, train_seq, train_feat)),
                         np.empty((batch_size, train_seq, train_feat)),
                         np.empty((batch_size, train_seq, train_feat)))
            yield (x, y1, y2)
def main():
    epsilon = 10 ** (-10)
    train_batch_size = 5
    loss_const, epochs, val_split = 0.05, 10, 0.25
    optimizer = tf.keras.optimizers.RMSprop(clipvalue=0.9)

    TRAIN_SEQ_LEN, TRAIN_FEAT_LEN = 1847, 2049
    TOTAL_SMPLS = 60

    # Validation & training split
    indices = list(range(TOTAL_SMPLS))
    val_indices = indices[:math.ceil(TOTAL_SMPLS * val_split)]
    num_val = len(val_indices)
    num_train = TOTAL_SMPLS - num_val

    train_seq, train_feat = TRAIN_SEQ_LEN, TRAIN_FEAT_LEN
    print('Train Input Stats:')
    print('N Feat:', train_feat, 'Seq Len:', train_seq, 'Batch Size:', train_batch_size)

    # Create data generators and evaluate the model with them
    train_generator = my_dummy_generator(num_train,
                                         batch_size=train_batch_size, train_seq=train_seq,
                                         train_feat=train_feat)
    validation_generator = my_dummy_generator(num_val,
                                              batch_size=train_batch_size, train_seq=train_seq,
                                              train_feat=train_feat)

    evaluate_source_sep(train_generator, validation_generator, num_train, num_val,
                        n_feat=train_feat, n_seq=train_seq,
                        batch_size=train_batch_size,
                        loss_const=loss_const, epochs=epochs,
                        optimizer=optimizer, epsilon=epsilon)

if __name__ == '__main__':
    main()
Thanks for the help!
Solution: don't pass your loss into model.add_loss(). Instead, concatenate your outputs together, which lets you pass your custom loss into model.compile(), and then pull the individual outputs apart inside the custom loss function.
class TimeFreqMasking(Layer):
    # Init is for input-independent variables
    def __init__(self, epsilon, **kwargs):
        super(TimeFreqMasking, self).__init__(**kwargs)
        self.epsilon = epsilon

    # No build method, b/c passing in multiple inputs to layer (no single shape)

    def call(self, inputs):
        y_hat_self, y_hat_other, x_mixed = inputs
        mask = tf.abs(y_hat_self) / (tf.abs(y_hat_self) + tf.abs(y_hat_other) + self.epsilon)
        y_tilde_self = mask * x_mixed
        return y_tilde_self
def discrim_loss(y_true, y_pred):
    # The generator concatenates the two targets along the last (feature) axis
    piano_true, noise_true = tf.split(y_true, num_or_size_splits=2, axis=-1)
    # The last "sample" along axis 0 of y_pred is the broadcast loss_const
    loss_const = y_pred[-1, :, :][0][0]
    # The remaining rows are the piano and noise predictions stacked along axis 0
    piano_pred, noise_pred = tf.split(y_pred[:-1, :, :], num_or_size_splits=2, axis=0)

    last_dim = piano_pred.shape[1] * piano_pred.shape[2]
    return (
        tf.math.reduce_mean(tf.reshape(noise_pred - noise_true, shape=(-1, last_dim)) ** 2) -
        (loss_const * tf.math.reduce_mean(tf.reshape(noise_pred - piano_true, shape=(-1, last_dim)) ** 2)) +
        tf.math.reduce_mean(tf.reshape(piano_pred - piano_true, shape=(-1, last_dim)) ** 2) -
        (loss_const * tf.math.reduce_mean(tf.reshape(piano_pred - noise_true, shape=(-1, last_dim)) ** 2))
    )
def make_model(features, sequences, epsilon, loss_const,
               optimizer=tf.keras.optimizers.RMSprop(clipvalue=0.9)):
    input_layer = Input(shape=(sequences, features), name='piano_noise_mixed')
    x = SimpleRNN(features // 2,
                  activation='relu',
                  return_sequences=True)(input_layer)
    x = SimpleRNN(features // 2,
                  activation='relu',
                  return_sequences=True)(x)
    piano_hat = TimeDistributed(Dense(features), name='piano_hat')(x)    # source 1 branch
    noise_hat = TimeDistributed(Dense(features), name='noise_hat')(x)    # source 2 branch
    piano_pred = TimeFreqMasking(epsilon=epsilon,
                                 name='piano_pred')((piano_hat, noise_hat, input_layer))
    noise_pred = TimeFreqMasking(epsilon=epsilon,
                                 name='noise_pred')((noise_hat, piano_hat, input_layer))

    preds_and_gamma = Concatenate(axis=0)([piano_pred,
                                           noise_pred,
                                           # loss_const tensor
                                           tf.broadcast_to(tf.constant(loss_const, dtype='float32'),
                                                           [1, sequences, features])
                                           ])
    model = Model(inputs=input_layer, outputs=preds_and_gamma)
    model.compile(optimizer=optimizer, loss=discrim_loss)
    return model
def dummy_generator(num_samples, batch_size, num_seq, num_feat):
    while True:
        for _ in range(0, num_samples, batch_size):
            x, y1, y2 = (np.random.rand(batch_size, num_seq, num_feat),
                         np.random.rand(batch_size, num_seq, num_feat),
                         np.random.rand(batch_size, num_seq, num_feat))
            # Concatenate the two targets along the feature axis so fit() sees a single y
            yield (x, np.concatenate((y1, y2), axis=-1))
total_samples = 6
batch_size = 2
time_steps = 3
features = 4
loss_const = 2
epochs = 10
val_split = 0.25
epsilon = 10 ** (-10)

model = make_model(features, time_steps, epsilon, loss_const)
print(model.summary())

num_val = math.ceil(total_samples * val_split)
num_train = total_samples - num_val

train_dataset = dummy_generator(num_train, batch_size, time_steps, features)
val_dataset = dummy_generator(num_val, batch_size, time_steps, features)

model.fit(train_dataset,
          steps_per_epoch=math.ceil(num_train / batch_size),
          epochs=epochs,
          validation_data=val_dataset,
          validation_steps=math.ceil(num_val / batch_size))
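If you want the two sources back at inference time, the stacked output can be pulled apart the same way discrim_loss does it. A minimal sketch (my assumed usage, not part of the original solution code):
# Splitting the stacked prediction back into its parts (assumed usage)
x_batch, _ = next(train_dataset)          # the generator yields (x, concatenated targets)
stacked = model.predict(x_batch)          # shape: (2 * batch_size + 1, time_steps, features)
piano_pred = stacked[:batch_size]
noise_pred = stacked[batch_size:2 * batch_size]
# stacked[-1] is just the broadcast loss_const and can be discarded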
I'm trying to implement linear regression with an RMSprop optimizer from scratch.
Code:
EPOCHS = 100
w3 = tf.Variable(w_vector, dtype=tf.float32)
w4 = tf.Variable(0, dtype=tf.float32)
lr = 1e-5
beta = 0.9
epsilon = 1e-7
momentum = 0.0

for epoch in range(1, EPOCHS + 1):
    mom_w = 0
    mom_b = 0
    mean_square_w = 0
    mean_gradient_w = 0
    mean_square_b = 0
    mean_gradient_b = 0

    y_pred1 = tf.squeeze(tf.matmul(w3, x, transpose_a=True, transpose_b=True) + w4)
    dw3, dw4 = gradients_mse(x, y, y_pred1)

    # My equations for RMSprop
    mean_square_w = beta * mean_square_w + (1 - beta) * dw3 ** 2
    mean_gradient_w = beta * mean_gradient_w + (1 - beta) * dw3
    mom_w = momentum * mom_w + lr * (dw3 / (tf.sqrt(mean_square_w + epsilon - mean_gradient_w ** 2)))

    mean_square_b = beta * mean_square_b + (1 - beta) * dw4 ** 2
    mean_gradient_b = beta * mean_gradient_b + (1 - beta) * dw4
    mom_b = momentum * mom_b + lr * (dw4 / (tf.sqrt(mean_square_b + epsilon - mean_gradient_b ** 2)))

    w3.assign_sub(mom_w)
    w4.assign_sub(mom_b)

print('w3 : {}'.format(w3.numpy()))
print('w4 : {}'.format(w4.numpy()))
Output:
w3 : [[-1.2507935]]
w4 : 0.0033333366736769676
Now I create a single-layer, single-neuron neural network with no activation function, assign the same initial weights to its neuron, and use RMSprop as the optimizer; I get different final weights. However, this was not the case with the SGD optimizer.
Code:
# using keras to get same results
def create_model():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(units=1, name='d1', input_shape=(x.shape[1],)))
    model.compile(optimizer=tf.keras.optimizers.RMSprop(
                      learning_rate=1e-5, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False),
                  loss="mse")
    return model

model = create_model()
d1 = model.get_layer('d1')
d1_weights = [tf.constant(w_vector, dtype=tf.float32), tf.constant(np.array([0]), dtype=tf.float32)]
d1.set_weights(d1_weights)

model.fit(x, y, epochs=100)

d1 = model.get_layer('d1')
print('w3 = {}'.format(d1.weights[0].numpy()))
print('w4 = {}'.format(d1.weights[1].numpy()[0]))
Output:
w3 = [[-1.2530397]]
w4 = 0.0010913893347606063
My gradients are calculated correctly for the MSE loss function; I have cross-checked them against TensorFlow's built-in gradient computation (tf.GradientTape).
Code:
# Computing gradients
def gradients_mse(X, Y, Y_PREDS):
    DW1 = tf.matmul(X, tf.reshape(Y - Y_PREDS, (X.shape[0], 1)), transpose_a=True) * (-2 / X.shape[0])
    DW0 = (-2 / X.shape[0]) * tf.reduce_sum(Y - Y_PREDS)
    return DW1, DW0
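For reference, the cross-check looked roughly like this (a reconstruction, assuming the same x, y and w_vector used above; not the exact code):
# Rough reconstruction of the GradientTape cross-check
with tf.GradientTape() as tape:
    y_pred1 = tf.squeeze(tf.matmul(w3, x, transpose_a=True, transpose_b=True) + w4)
    loss = tf.reduce_mean(tf.square(y - y_pred1))
dw3_tape, dw4_tape = tape.gradient(loss, [w3, w4])
# dw3_tape and dw4_tape matched the values returned by gradients_mse(x, y, y_pred1)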
The only thing I think could be wrong in this implementation is my calculation of mom_w and mom_b, i.e. that I am using incorrect update equations.
x.shape = [10, 1]
The default batch size is 32, which is larger than my 10 samples, so batching has no effect on the weight updates. The same code gives perfectly matching output when I use plain gradient descent instead of RMSprop.
I am trying to build a custom loss function that takes the network's previous output (the output from the previous iteration) and uses it together with the current output.
Here is what I am trying to do, but I don't know how to complete it:
def l_loss(prev_output):

    def loss(y_true, y_pred):
        pix_loss = K.mean(K.square(y_pred - y_true), axis=-1)
        pase = K.variable(100)
        diff = K.mean(K.abs(prev_output - y_pred))
        movement_loss = K.abs(pase - diff)
        total_loss = pix_loss + movement_loss
        return total_loss

    return loss

self.model.compile(optimizer=Adam(0.001, beta_1=0.5, beta_2=0.9),
                   loss=l_loss(?))
I hope you can help me.
This is what I tried:
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras import backend as K

class MovementLoss(object):
    def __init__(self):
        self.var = None

    def __call__(self, y_true, y_pred, sample_weight=None):
        mse = K.mean(K.square(y_true - y_pred), axis=-1)
        if self.var is None:
            z = np.zeros((32,))
            self.var = K.variable(z)
        delta = K.update(self.var, mse - self.var)
        return mse + delta

def make_model():
    model = Sequential()
    model.add(Dense(1, input_shape=(4,)))
    loss = MovementLoss()
    model.compile('adam', loss)
    return model

model = make_model()
model.summary()
Using some example test data:
import numpy as np

X = np.random.rand(32, 4)
POLY = [1.0, 2.0, 0.5, 3.0]

def test_fn(xi):
    return np.dot(xi, POLY)

Y = np.apply_along_axis(test_fn, 1, X)

history = model.fit(X, Y, epochs=4)
I do see the loss oscillate in a way that appears to be influenced by the last batch's delta. Note that the details of this loss function are not tailored to your application.
The crucial step is that the K.update step must be part of the graph (as far as I understand it).
That is achieved by:
delta = K.update(var, delta)
return x + delta
I am trying to code up an implementation of a variational autoencoder; however, I am facing some difficulties regarding the loss function:
def vae_loss(sigma, mu):
    def loss(y_true, y_pred):
        recon = K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)
        kl = 0.5 * K.sum(K.exp(sigma) + K.square(mu) - 1. - sigma, axis=-1)
        return recon + kl
    return loss
The binary crossentropy part works fine, but whenever I return only the divergence term kl for testing, I get the following error:
ValueError: "Tried to convert 'x' to a tensor and failed. Error: None values not supported.".
I am looking forward to possible hints as to what I have done wrong. You will find my entire code below. Thank you for your time!
import numpy as np
from keras import Model
from keras.layers import Input, Dense, Lambda
import keras.backend as K
from keras.datasets import mnist
from matplotlib import pyplot as plt

class VAE(object):

    def __init__(self, n_latent, batch_size):
        self.encoder, self.encoder_input, self.mu, self.sigma = self.create_encoder(n_latent, batch_size)
        self.decoder, self.decoder_input, self.decoder_output = self.create_decoder(n_latent, batch_size)
        pipeline = self.decoder(self.encoder.outputs[0])

        def vae_loss(sigma, mu):
            def loss(y_true, y_pred):
                recon = K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)
                kl = 0.5 * K.sum(K.exp(sigma) + K.square(mu) - 1. - sigma, axis=-1)
                return recon + kl
            return loss

        self.VAE = Model(self.encoder_input, pipeline)
        self.VAE.compile(optimizer="adadelta", loss=vae_loss(self.sigma, self.mu))

    def create_encoder(self, n_latent, batch_size):
        input_layer = Input(shape=(784,))
        #net = Dense(512, activation="relu")(input_layer)
        mu = Dense(n_latent, activation="linear")(input_layer)
        print(mu)
        sigma = Dense(n_latent, activation="linear")(input_layer)

        def sample_z(args):
            mu, log_sigma = args
            eps = K.random_normal(shape=(K.shape(input_layer)[0], n_latent), mean=0., stddev=1.)
            K.print_tensor(K.shape(eps))
            return mu + K.exp(log_sigma / 2) * eps

        sample_z = Lambda(sample_z)([mu, sigma])

        model = Model(inputs=input_layer, outputs=[sample_z, mu, sigma])
        return model, input_layer, mu, sigma

    def create_decoder(self, n_latent, batch_size):
        input_layer = Input(shape=(n_latent,))
        #net = Dense(512, activation="relu")(input_layer)
        reconstruct = Dense(784, activation="linear")(input_layer)

        model = Model(inputs=input_layer, outputs=reconstruct)
        return model, input_layer, reconstruct
I am going to assume the error appears when you are "testing"/debugging your training phase, during backpropagation (let me know if I am wrong).
If so, the problem is that you are asking Keras to optimize your whole network (model.VAE.fit(...)) while using a loss (kl) covering only the encoder part. The gradients for the decoder stay undefined (without a loss like recon covering it), causing the optimization error.
For debugging purposes, the error would disappear if you compiled and fit only the encoder with this amputated loss (kl), or if you came up with a dummy (differentiable) loss that also covers the decoder (e.g. K.sum(y_pred - y_pred, axis=-1) + kl). A sketch of that second option follows.
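A minimal sketch of such a debugging loss, following the same sigma/mu closure pattern as your vae_loss (kl_debug_loss is a name made up for this example):
# Only the KL term carries a real signal; y_pred - y_pred is zero everywhere
# but keeps the decoder connected to the loss, so its gradients stay defined.
def kl_debug_loss(sigma, mu):
    def loss(y_true, y_pred):
        kl = 0.5 * K.sum(K.exp(sigma) + K.square(mu) - 1. - sigma, axis=-1)
        return K.sum(y_pred - y_pred, axis=-1) + kl
    return loss

# self.VAE.compile(optimizer="adadelta", loss=kl_debug_loss(self.sigma, self.mu))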