keras seems not using my custom designed loss function - python

I defined a triplet loss function as follows:
def triplet_loss(self, alpha):
    """Build a Keras-style triplet loss closure with margin ``alpha``.

    The returned ``loss(y_true, y_pred)`` ignores ``y_true``. It slices
    ``y_pred`` column-wise into anchor, positive and negative embeddings
    using ``emb_size``, which is NOT a parameter here: it is looked up in
    the enclosing scope when the loss runs.
    """

    def loss(y_true, y_pred):
        # Columns of y_pred: first emb_size -> anchor, next emb_size ->
        # positive, everything after 2 * emb_size -> negative.
        anc, pos, neg = (
            y_pred[:, :emb_size],
            y_pred[:, emb_size : 2 * emb_size],
            y_pred[:, 2 * emb_size :],
        )
        # Euclidean distances anchor-positive and anchor-negative, per row.
        distance1 = tf.sqrt(tf.reduce_sum(tf.pow(anc - pos, 2), 1, keepdims=True))
        distance2 = tf.sqrt(tf.reduce_sum(tf.pow(anc - neg, 2), 1, keepdims=True))
        # Hinge on the margin, averaged over the batch.
        return tf.reduce_mean(tf.maximum(distance1 - distance2 + alpha, 0.0))

    return loss
and then I compile it with my model:
optimizer = SGD(
# lr_multipliers=learning_rate_multipliers,
# emb_dim=emb_size
Then the model starts the training process using self.model.train_on_batch(data):
Iteration 1/10000: Train loss: 4.558698, lr = 0.001000
Iteration 2/10000: Train loss: 4.558681, lr = 0.001000
Iteration 3/10000: Train loss: 4.558654, lr = 0.001000
Iteration 4/10000: Train loss: 4.558623, lr = 0.001000
However, the problem is that I didn't even pass emb_size to the loss function. I then tried removing everything inside the inner loss function, leaving only:
def triplet_loss(self, alpha):
    """Return a placeholder loss closure used to check whether it gets called.

    ``alpha`` is accepted but unused. The inner ``loss`` only prints a marker
    line and implicitly returns ``None``.
    """

    def loss(y_true, y_pred):
        print("My loss function")

    return loss
and ran the training program again; the training still works:
Iteration 1/10000: Train loss: 4.556189, lr = 0.001000
Iteration 2/10000: Train loss: 4.556171, lr = 0.001000
Iteration 3/10000: Train loss: 4.556145, lr = 0.001000
So I think Keras is not using my loss function at all. Why would this happen, and what should I do to fix it?


VAE autoencoder training with fit_generator doesn't seem to reduce the loss

I have tried to use the fit_generator function. Strangely, when I use fit_generator the loss goes down only very slowly: after about 5 days of training, my loss has decreased only from 129 to 119. From what I have read online, training this kind of model should take only about 8 hours. I think the problem is on my side, but I don't know what it is, so I hope somebody can help me. Note that my dataset contains only 100 images.
Below is my training code
vae_model.compile(optimizer=adam_optimizer, loss = total_loss, metrics = [r_loss, kl_loss],experimental_run_tf_function=False)
N_EPOCHS = 3000
zz = '/weights.h5'
checkpoint_vae = ModelCheckpoint(zz, save_weights_only = True, verbose=1)
epochs = N_EPOCHS,
initial_epoch = 0,
steps_per_epoch=tr_num / BATCH_SIZE,
def r_loss(y_true, y_pred):
    """Reconstruction loss: squared error averaged over axes 1, 2 and 3."""
    squared_error = K.square(y_true - y_pred)
    return K.mean(squared_error, axis=[1, 2, 3])
def kl_loss(y_true, y_pred):
    """KL term computed from ``mean_mu`` and ``log_var``, summed over axis 1.

    ``y_true`` and ``y_pred`` are not used. ``mean_mu`` and ``log_var`` are
    read from the enclosing scope.
    """
    # Local is named differently from the function so the function name is
    # not rebound inside its own body.
    divergence = -0.5 * K.sum(1 + log_var - K.square(mean_mu) - K.exp(log_var), axis=1)
    return divergence
def total_loss(y_true, y_pred):
    """Return ``LOSS_FACTOR`` times the reconstruction loss plus the KL term."""
    reconstruction = r_loss(y_true, y_pred)
    divergence = kl_loss(y_true, y_pred)
    return LOSS_FACTOR * reconstruction + divergence
adam_optimizer = Adam(lr = LEARNING_RATE, beta_1=0.5, beta_2=0.9)

If-Else Statement in Custom Training Loop in Tensorflow

I created a model class which is a subclass of keras.Model. While training the model, I want to change the weights of the loss functions after some epochs. To do that, I added a boolean attribute to my model indicating that the model should start training with an additional loss function. Below is pseudo-code that shows what I am trying to achieve.
class MyModel(keras.Model):
    """Pseudo-code model that enables a second loss term after some epochs.

    ``loss_1`` and ``loss_2`` in ``train_step`` are placeholders for values
    the real implementation computes; they are not defined in this sketch.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Flipped to True by epoch_finised() once loss_2 should contribute.
        self.start_loss_2 = False

    def train_step(self):
        # Check if training with loss_2 started
        weight_loss_2 = 0.0
        if self.start_loss_2:
            weight_loss_2 = 0.5
        # Pass the data through model
        # Calculate two loss values
        total_loss = loss_1 + weight_loss_2 * loss_2
        # Calculate gradients with tf.Tape
        # Update variables

    # This is called via Callback after each epoch
    def epoch_finised(self, epoch_num):
        if epoch_num > START_LOSS_2:
            self.start_loss_2 = True
My question is:
Is it valid to use an if-else statement whose condition changes after some time? If it is not, how can I achieve this?
Yes. You can create a tf.Variable and then assign a new value to it based on some training criteria.
import numpy as np
import tensorflow as tf
# simple toy network
x_in = tf.keras.Input((10,))  # shape must be a tuple; (10) is just the int 10
x = tf.keras.layers.Dense(25)(x_in)
x_out = tf.keras.layers.Dense(1)(x)
# model
m = tf.keras.Model(x_in, x_out)
# fake data: one input tensor and two alternative target vectors
X = tf.random.normal((100, 10))
y0 = tf.random.normal((100, ))
y1 = tf.random.normal((100, ))
# optimizer
m_opt = tf.keras.optimizers.Adam(1e-2)
# prep data: slice the three tensors row-wise into (X, y0, y1) examples,
# repeat indefinitely and serve batches of 5
ds = tf.data.Dataset.from_tensor_slices((X, y0, y1))
ds = ds.repeat().batch(5)
train_iter = iter(ds)
# toy loss function that uses a weight
def loss_fn(y_true0, y_true1, y_pred, weight):
    """Return MSE against ``y_true0`` plus ``weight`` times MSE against ``y_true1``."""
    mse = tf.keras.losses.MSE
    mse_0 = tf.math.reduce_mean(mse(y_true0, y_pred))
    mse_1 = tf.math.reduce_mean(mse(y_true1, y_pred))
    return mse_0 + weight * mse_1
# the weight variable set to 0 initially and then
# will be changed after a certain number of steps
# (or some other training criteria)
w = tf.Variable(0.0, trainable=False)
# NUM_EPOCHS, NUM_BATCHES_PER_EPOCH and START_NEW_LOSS_AT_GLOBAL_STEP must be
# defined before this loop runs.
for epoch in range(NUM_EPOCHS):
    losses = []
    for batch in range(NUM_BATCHES_PER_EPOCH):
        X_train, y0_train, y1_train = next(train_iter)
        with tf.GradientTape() as tape:
            y_hat = m(X_train)
            loss = loss_fn(y0_train, y1_train, y_hat, w)
        # Record the batch loss so the per-epoch mean below has data.
        losses.append(loss.numpy())
        m_vars = m.trainable_variables
        m_grads = tape.gradient(loss, m_vars)
        m_opt.apply_gradients(zip(m_grads, m_vars))
    print(f"epoch: {epoch}\tloss: {np.mean(losses):.4f}")
    # if the criteria is met assign a huge number to see if the
    # loss spikes up
    if (epoch + 1) * (batch + 1) >= START_NEW_LOSS_AT_GLOBAL_STEP:
        # Any large value works; it only has to make the second term dominate.
        w.assign(10000.0)
# epoch: 0 loss: 1.8226
# epoch: 1 loss: 1.1143
# epoch: 2 loss: 8788.2227 <= looks like assign worked
# epoch: 3 loss: 10999.5449

can keras use sklearn in custom metrics to create micro f1_score

I found a version in stackoverflow
from keras import backend as K
def f1(y_true, y_pred):
    """Return the batch-wise F1 score of ``y_pred`` against ``y_true``.

    Combines the nested ``precision`` and ``recall`` helpers as
    ``2 * p * r / (p + r + K.epsilon())``.
    """

    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        # K.epsilon() keeps the division defined when there are no positives.
        return true_positives / (possible_positives + K.epsilon())

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    # Distinct names so the helper functions are not rebound to their results.
    precision_value = precision(y_true, y_pred)
    recall_value = recall(y_true, y_pred)
    return 2 * ((precision_value * recall_value) / (precision_value + recall_value + K.epsilon()))
optimizer= "adam",
But could I use sklearn's f1_score to create a custom metric?
I want to use the average of the macro and micro f1_score. Could anybody help me? Thanks.
I think you can use the code you have shown above during training, because it calculates the F1 score of each batch, and you can see it in the log printed in your terminal.
1/13 [=>............................] - ETA: 4s - loss: 0.2646 - f1: 0.2927
2/13 [===>..........................] - ETA: 4s - loss: 0.2664 - f1: 0.1463
13/13 [==============================] - 7s 505ms/step - loss: 0.2615 - f1: 0.1008 - val_loss: 0.2887 - val_f1: 0.1464
If you use the fit method and want to calculate the F1 score for each epoch, you should try code like the following.
class Metrics(Callback):
    """Callback that prints the macro-averaged validation F1 after each epoch."""

    def on_train_begin(self, logs=None):
        # Per-epoch histories; only val_f1s is filled in by on_epoch_end.
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        # NOTE(review): relies on self.validation_data being populated as
        # (inputs, one-hot targets) — confirm for the Keras version in use.
        val_predict = np.argmax(np.asarray(self.model.predict(self.validation_data[0])), axis=1)
        val_targ = np.argmax(self.validation_data[1], axis=1)
        _val_f1 = f1_score(val_targ, val_predict, average='macro')
        # Keep the history so it can be inspected after training.
        self.val_f1s.append(_val_f1)
        # recall_score / precision_score can be computed on the same
        # (val_targ, val_predict) pair and appended to the other two lists.
        print(' — val_f1:', _val_f1)
use your callback fit method.
metrics = Metrics()
Some tips to note:
If you use the fit_generator() method to train, you can only use the code you showed. Otherwise, if you use the fit() method, you can try the Callback approach.
That's all!

Simple Machine learning model training returning Nan

I am trying to start learning ML.
I wrote a simple example:
import numpy as np
# Prepare the data
input = np.array(list(range(100)))
output = np.array([x**2 + 2 for x in list(range(100))])
# Visualize Data
import matplotlib.pyplot as plt
plt.plot(input, output, 'ro')
# Define your Model
a = 1
b = 1
# y = ax + b # we put a bias in the model based on our knowledge
# Train your model == Optimize the parameters so that they give very less loss
# NOTE(review): for loss = 0.5*(y_hat-y)**2 the derivative wrt a is
# x*(y_hat-y). The updates below add that gradient instead of subtracting it,
# and a 0.1 step with x up to 99 diverges either way. This is the behaviour
# the question asks about, so the arithmetic is left exactly as posted.
for e in range(10):
    for x, y in zip(input, output):
        y_hat = a*x + b
        loss = 0.5*(y_hat-y)**2
        # Now that we have loss, we want gradient of the parameters a and b
        # derivative of loss wrt a = (-x)(y-ax+b)
        # so gradient descent: a = a - (learning_rate)*(derivative wrt a)
        a = a - 0.1*(-x)*(y_hat-y)
        b = b - 0.1*(-1)*(y_hat-y)
    # Printed once per epoch: the loss of the last sample seen.
    print("Epoch {0} Training loss = {1}".format(e, loss))
# Make Prections on new data
test_input = np.array(list(range(101,150)))
test_output = np.array([x**2.0 + 2 for x in list(range(101,150))])
model_predictions = np.array([a*x + b for x in list(range(101,150))])
plt.plot(test_input, test_output, 'ro')
plt.plot(test_input, model_predictions, '-')
Now when I run the code: RuntimeWarning: overflow encountered in double_scalars
loss = 0.5*(y_hat-y)**2
Epoch 0 Training loss = inf RuntimeWarning: overflow encountered in double_scalars
y_hat = a*x + b
Epoch 1 Training loss = inf RuntimeWarning: invalid value encountered in double_scalars
y_hat = a*x + b
Epoch 2 Training loss = nan
Epoch 3 Training loss = nan
Epoch 4 Training loss = nan
Epoch 5 Training loss = nan
Epoch 6 Training loss = nan
Epoch 7 Training loss = nan
Epoch 8 Training loss = nan
Epoch 9 Training loss = nan
Why is the loss nan? I wrote the simplest possible model, but with plain Python I was getting:
Traceback (most recent call last):
File "", line 20, in <module>
loss = (y_hat-y)**2
OverflowError: (34, 'Result too large')
Then I converted all Python lists to numpy arrays. Now I get nan instead, and I just don't understand why such small values are giving these errors.
With Daniele's answer to replace the loss with mean squared loss, i.e. dividing the loss by total number of inputs, I get this output:
Epoch 0 Training loss = 1.7942781420994678e+36
Epoch 1 Training loss = 9.232837400842652e+70
Epoch 2 Training loss = 4.751367833814119e+105
Epoch 3 Training loss = 2.4455835946216386e+140
Epoch 4 Training loss = 1.2585275201812707e+175
Epoch 5 Training loss = 6.4767849625200624e+209
Epoch 6 Training loss = 3.331617554363007e+244
Epoch 7 Training loss = 1.714758503849272e+279 RuntimeWarning: overflow encountered in double_scalars
loss = 0.5*(y-y_hat)**2
Epoch 8 Training loss = inf
Epoch 9 Training loss = inf
At least it runs, but I am trying to learn the linear function using stochastic gradient descent, which updates the parameters after each point's loss.
I still don't understand how people work with these models: the loss should decrease, so why is it increasing with gradient descent?
You got the math wrong. When you compute the gradient update for GD you have to divide by the number of samples in your dataset: that's why it is called mean squared error and not just squared error.
Also, you might want to use smaller inputs: your target is quadratic in x, so it (and the gradients) grow very quickly as x increases.
Look at this post for a good intro to LR and GD.
I took the liberty of rewriting your code a bit, this should work:
import numpy as np
import matplotlib.pyplot as plt
# Prepare the data
input_ = np.linspace(0, 10, 100)  # Don't assign user data to Python's input builtin
output = np.array([x**2 + 2 for x in input_])
# Define model
a = 1
b = 1
# Train model
N = input_.shape[0]  # Number of samples
for e in range(10):
    loss = 0.
    for x, y in zip(input_, output):
        y_hat = a * x + b
        # Parameters are updated after every sample, with the step scaled by 2/N.
        a = a - 0.1 * (2. / N) * (-x) * (y - y_hat)
        b = b - 0.1 * (2. / N) * (-1) * (y - y_hat)
        loss += 0.5 * ((y - y_hat) ** 2)
    # Average the accumulated per-sample losses over the epoch.
    loss /= N
    print("Epoch {:2d}\tLoss: {:4f}".format(e, loss))
# Predict on test data
test_input = np.linspace(0, 15, 150) # Training data [0-10] + test data [10 - 15]
test_output = np.array([x**2.0 + 2 for x in test_input])
model_predictions = np.array([a*x + b for x in test_input])
plt.plot(test_input, test_output, 'ro')
plt.plot(test_input, model_predictions, '-')
This should give you as output something along these lines:
Epoch 0 Loss: 33.117127
Epoch 1 Loss: 42.949756
Epoch 2 Loss: 40.733332
Epoch 3 Loss: 38.657764
Epoch 4 Loss: 36.774646
Epoch 5 Loss: 35.067299
Epoch 6 Loss: 33.520409
Epoch 7 Loss: 32.119958
Epoch 8 Loss: 30.853112
Epoch 9 Loss: 29.708126
And this is the output plot:
EDIT: OP was asking about SGD. The above answer is still valid code, but it's for standard GD (where you iterate on the whole dataset at the same time).
For SGD, the main loop must be slightly changed:
for e in range(10):
    for x, y in zip(input_, output):
        y_hat = a * x + b
        loss = 0.5 * ((y - y_hat) ** 2)
        # One update per sample, no 1/N scaling.
        a = a - 0.01 * (2.) * (-x) * (y - y_hat)
        b = b - 0.01 * (2.) * (-1) * (y - y_hat)
    # Printed once per epoch: the loss of the last sample processed.
    print("Epoch {:2d}\tLoss: {:4f}".format(e, loss))
Note that I had to lower the learning rate to avoid divergence. When you train with a batch size of 1 it becomes really important to avoid this kind of gradient explosions, because a single sample may substantially mess up your descent towards the optimum.
Example output:
Epoch 0 Loss: 0.130379
Epoch 1 Loss: 0.123007
Epoch 2 Loss: 0.117352
Epoch 3 Loss: 0.112991
Epoch 4 Loss: 0.109615
Epoch 5 Loss: 0.106992
Epoch 6 Loss: 0.104948
Epoch 7 Loss: 0.103353
Epoch 8 Loss: 0.102105
Epoch 9 Loss: 0.101127

TensorFlow for binary classification

I am trying to adapt this MNIST example to binary classification.
But when changing my NLABELS from NLABELS=2 to NLABELS=1, the loss function always returns 0 (and accuracy 1).
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# Import data
mnist = input_data.read_data_sets('data', one_hot=True)
sess = tf.InteractiveSession()
# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
W = tf.Variable(tf.zeros([784, NLABELS]), name='weights')
b = tf.Variable(tf.zeros([NLABELS], name='bias'))
y = tf.nn.softmax(tf.matmul(x, W) + b)
# Add summary ops to collect data
_ = tf.histogram_summary('weights', W)
_ = tf.histogram_summary('biases', b)
_ = tf.histogram_summary('y', y)
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, NLABELS], name='y-input')
# More name scopes will clean up the graph representation
with tf.name_scope('cross_entropy'):
    cross_entropy = -tf.reduce_mean(y_ * tf.log(y))
    _ = tf.scalar_summary('cross entropy', cross_entropy)
with tf.name_scope('train'):
    train_step = tf.train.GradientDescentOptimizer(10.).minimize(cross_entropy)
with tf.name_scope('test'):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    _ = tf.scalar_summary('accuracy', accuracy)
# Merge all the summaries and write them out to the 'logs' directory
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter('logs', sess.graph_def)
# Variables must be initialised before the first sess.run below.
tf.initialize_all_variables().run()
# Train the model, and feed in test data and record summaries every 10 steps
for i in range(1000):
    if i % 10 == 0:  # Record summary data and the accuracy
        labels = mnist.test.labels[:, 0:NLABELS]
        feed = {x: mnist.test.images, y_: labels}
        result = sess.run([merged, accuracy, cross_entropy], feed_dict=feed)
        summary_str = result[0]
        acc = result[1]
        loss = result[2]
        writer.add_summary(summary_str, i)
        print('Accuracy at step %s: %s - loss: %f' % (i, acc, loss))
    else:
        # Every other step: one gradient step on a training mini-batch of 100.
        batch_xs, batch_ys = mnist.train.next_batch(100)
        batch_ys = batch_ys[:, 0:NLABELS]
        feed = {x: batch_xs, y_: batch_ys}
        sess.run(train_step, feed_dict=feed)
I have checked the dimensions of both batch_ys (fed into y) and _y and they are both 1xN matrices when NLABELS=1 so the problem seems to be prior to that. Maybe something to do with the matrix multiplication?
I actually have got this same problem in a real project, so any help would be appreciated... Thanks!
The original MNIST example uses a one-hot encoding to represent the labels in the data: this means that if there are NLABELS = 10 classes (as in MNIST), the target output is [1 0 0 0 0 0 0 0 0 0] for class 0, [0 1 0 0 0 0 0 0 0 0] for class 1, etc. The tf.nn.softmax() operator converts the logits computed by tf.matmul(x, W) + b into a probability distribution across the different output classes, which is then compared to the fed-in value for y_.
If NLABELS = 1, this acts as if there were only a single class, and the tf.nn.softmax() op would compute a probability of 1.0 for that class, leading to a cross-entropy of 0.0, since tf.log(1.0) is 0.0 for all of the examples.
There are (at least) two approaches you could try for binary classification:
The simplest would be to set NLABELS = 2 for the two possible classes, and encode your training data as [1 0] for label 0 and [0 1] for label 1. This answer has a suggestion for how to do that.
You could keep the labels as integers 0 and 1 and use tf.nn.sparse_softmax_cross_entropy_with_logits(), as suggested in this answer.
I've been looking for good examples of how to implement binary classification in TensorFlow in a similar manner to the way it would be done in Keras. I didn't find any, but after digging through the code a bit, I think I have it figured out. I modified the problem here to implement a solution that uses sigmoid_cross_entropy_with_logits the way Keras does under the hood.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# Import data
mnist = input_data.read_data_sets('data', one_hot=True)
sess = tf.InteractiveSession()
# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
W = tf.get_variable('weights', [784, NLABELS],
initializer=tf.truncated_normal_initializer()) * 0.1
b = tf.Variable(tf.zeros([NLABELS], name='bias'))
logits = tf.matmul(x, W) + b
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, NLABELS], name='y-input')
# More name scopes will clean up the graph representation
with tf.name_scope('cross_entropy'):
    # manual calculation: under-the-hood math, don't use this, it will have gradient problems
    # entropy = tf.multiply(tf.log(tf.sigmoid(logits)), y_) + tf.multiply((1 - y_), tf.log(1 - tf.sigmoid(logits)))
    # loss = -tf.reduce_mean(entropy, name='loss')
    entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=logits)
    loss = tf.reduce_mean(entropy, name='loss')
with tf.name_scope('train'):
    # Using Adam instead
    # train_step = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss)
    train_step = tf.train.AdamOptimizer(learning_rate=0.002).minimize(loss)
with tf.name_scope('test'):
    # Threshold the probability, not the raw logit: sigmoid(logit) > 0.5 is
    # the same as logit > 0, whereas "logits > 0.5" is a stricter cut-off.
    preds = tf.cast(tf.sigmoid(logits) > 0.5, tf.float32)
    correct_prediction = tf.equal(preds, y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Variables must be initialised before the first sess.run below.
tf.global_variables_initializer().run()
# Train the model, and feed in test data and record summaries every 100 steps
for i in range(2000):
    if i % 100 == 0:  # Record summary data and the accuracy
        labels = mnist.test.labels[:, 0:NLABELS]
        feed = {x: mnist.test.images, y_: labels}
        result = sess.run([loss, accuracy], feed_dict=feed)
        print('Accuracy at step %s: %s - loss: %f' % (i, result[1], result[0]))
    else:
        # Every other step: one optimiser step on a training mini-batch of 100.
        batch_xs, batch_ys = mnist.train.next_batch(100)
        batch_ys = batch_ys[:, 0:NLABELS]
        feed = {x: batch_xs, y_: batch_ys}
        sess.run(train_step, feed_dict=feed)
Accuracy at step 0: 0.7373 - loss: 0.758670
Accuracy at step 100: 0.9017 - loss: 0.423321
Accuracy at step 200: 0.9031 - loss: 0.322541
Accuracy at step 300: 0.9085 - loss: 0.255705
Accuracy at step 400: 0.9188 - loss: 0.209892
Accuracy at step 500: 0.9308 - loss: 0.178372
Accuracy at step 600: 0.9453 - loss: 0.155927
Accuracy at step 700: 0.9507 - loss: 0.139031
Accuracy at step 800: 0.9556 - loss: 0.125855
Accuracy at step 900: 0.9607 - loss: 0.115340
Accuracy at step 1000: 0.9633 - loss: 0.106709
Accuracy at step 1100: 0.9667 - loss: 0.099286
Accuracy at step 1200: 0.971 - loss: 0.093048
Accuracy at step 1300: 0.9714 - loss: 0.087915
Accuracy at step 1400: 0.9745 - loss: 0.083300
Accuracy at step 1500: 0.9745 - loss: 0.079019
Accuracy at step 1600: 0.9761 - loss: 0.075164
Accuracy at step 1700: 0.9768 - loss: 0.071803
Accuracy at step 1800: 0.9777 - loss: 0.068825
Accuracy at step 1900: 0.9788 - loss: 0.066270

