I am trying to code up an implementation of a variational autoencoder; however, I am facing some difficulties regarding the loss function:
def vae_loss(sigma, mu):
    def loss(y_true, y_pred):
        recon = K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)
        kl = 0.5 * K.sum(K.exp(sigma) + K.square(mu) - 1. - sigma, axis=-1)
        return recon + kl
    return loss
The binary crossentropy part works fine, but whenever I return only the divergence term kl for testing I get the following error:
ValueError: "Tried to convert 'x' to a tensor and failed. Error: None values not supported.".
Any hints as to what I have done wrong would be appreciated. You will find my entire code below. Thank you for your time!
import numpy as np
from keras import Model
from keras.layers import Input, Dense, Lambda
import keras.backend as K
from keras.datasets import mnist
from matplotlib import pyplot as plt

class VAE(object):

    def __init__(self, n_latent, batch_size):
        self.encoder, self.encoder_input, self.mu, self.sigma = self.create_encoder(n_latent, batch_size)
        self.decoder, self.decoder_input, self.decoder_output = self.create_decoder(n_latent, batch_size)
        pipeline = self.decoder(self.encoder.outputs[0])

        def vae_loss(sigma, mu):
            def loss(y_true, y_pred):
                recon = K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)
                kl = 0.5 * K.sum(K.exp(sigma) + K.square(mu) - 1. - sigma, axis=-1)
                return recon + kl
            return loss

        self.VAE = Model(self.encoder_input, pipeline)
        self.VAE.compile(optimizer="adadelta", loss=vae_loss(self.sigma, self.mu))

    def create_encoder(self, n_latent, batch_size):
        input_layer = Input(shape=(784,))
        #net = Dense(512, activation="relu")(input_layer)
        mu = Dense(n_latent, activation="linear")(input_layer)
        print(mu)
        sigma = Dense(n_latent, activation="linear")(input_layer)

        def sample_z(args):
            mu, log_sigma = args
            eps = K.random_normal(shape=(K.shape(input_layer)[0], n_latent), mean=0., stddev=1.)
            K.print_tensor(K.shape(eps))
            return mu + K.exp(log_sigma / 2) * eps

        sample_z = Lambda(sample_z)([mu, sigma])

        model = Model(inputs=input_layer, outputs=[sample_z, mu, sigma])
        return model, input_layer, mu, sigma

    def create_decoder(self, n_latent, batch_size):
        input_layer = Input(shape=(n_latent,))
        #net = Dense(512, activation="relu")(input_layer)
        reconstruct = Dense(784, activation="linear")(input_layer)
        model = Model(inputs=input_layer, outputs=reconstruct)
        return model, input_layer, reconstruct
I am going to assume the error appears when you are "testing"/debugging your training phase, during backpropagation (let me know if I am wrong).
If so, the problem is that you are asking Keras to optimize your whole network (model.VAE.fit(...)) while using a loss (kl) that covers only the encoder part. The gradients for the decoder stay undefined (without a loss like recon covering it), causing the optimization error.
For your debugging purposes, the error would disappear if you compiled and fit only the encoder with this amputated loss (kl), or if you came up with a dummy (differentiable) loss that also covers the decoder (e.g. K.sum(y_pred - y_pred, axis=-1) + kl).
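For example, a debugging-only variant of the wrapper could look roughly like this (a minimal sketch of the workaround described above, not a fix for the VAE itself; the y_pred term is identically zero and exists only to give the decoder weights a gradient path):

def kl_only_loss(sigma, mu):
    def loss(y_true, y_pred):
        kl = 0.5 * K.sum(K.exp(sigma) + K.square(mu) - 1. - sigma, axis=-1)
        # dummy, identically-zero term that keeps y_pred (and thus the decoder)
        # connected to the loss so its gradients are defined
        dummy = K.sum(y_pred - y_pred, axis=-1)
        return kl + dummy
    return loss

self.VAE.compile(optimizer="adadelta", loss=kl_only_loss(self.sigma, self.mu))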
Related
I am new to TensorFlow 2.9 and I have finished writing a function to implement linear regression, but I faced some problems when I wanted to visualize this function with TensorBoard. I know how to record data, but I don't know how to generate a graph with tf.summary.trace_on.
Here is my code.
import tensorflow as tf

def linear_regression_1():
    writer = tf.summary.create_file_writer("./tmp/linear")
    x = tf.random.normal(shape=[100, 1])
    y_true = tf.matmul(x, [[0.8]]) + 0.7
    weights = tf.Variable(initial_value=tf.random.normal(shape=[1, 1]))
    bias = tf.Variable(initial_value=tf.random.normal(shape=[1, 1]))
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
    with writer.as_default():
        for i in range(1000):
            # tf.print('weights:', weights)
            # tf.print('bias:', bias)
            tf.summary.histogram('weights', weights, i)
            tf.summary.histogram('bias', bias, i)
            with tf.GradientTape() as tape:
                y_predict = tf.matmul(x, weights) + bias
                error = tf.reduce_mean(tf.square(y_predict - y_true))
                tf.summary.histogram('error', error, i)
            gradients = tape.gradient(error, [weights, bias])
            optimizer.apply_gradients(zip(gradients, [weights, bias]))
    print('weights:', weights)
    print('bias:', bias)

linear_regression_1()
When I put a @tf.function decorator before this function, it just reports errors.
I'm trying to create a contractive autoencoder in PyTorch. I found this thread and tried to follow it. This is the snippet I wrote based on that thread:
import datetime
import numpy as np
import torch
import torchvision
from torchvision import datasets, transforms
from torchvision.utils import save_image, make_grid
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline

dataset_train = datasets.MNIST(root='MNIST',
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)
dataset_test = datasets.MNIST(root='MNIST',
                              train=False,
                              transform=transforms.ToTensor(),
                              download=True)
batch_size = 128
num_workers = 2
dataloader_train = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers,
                                               pin_memory=True)
dataloader_test = torch.utils.data.DataLoader(dataset_test,
                                              batch_size=batch_size,
                                              num_workers=num_workers,
                                              pin_memory=True)

def view_images(imgs, labels, rows=4, cols=11):
    imgs = imgs.detach().cpu().numpy().transpose(0, 2, 3, 1)
    fig = plt.figure(figsize=(8, 4))
    for i in range(imgs.shape[0]):
        ax = fig.add_subplot(rows, cols, i+1, xticks=[], yticks=[])
        ax.imshow(imgs[i].squeeze(), cmap='Greys_r')
        ax.set_title(labels[i].item())

# now let's view some
imgs, labels = next(iter(dataloader_train))
view_images(imgs, labels, 13, 10)

class Contractive_AutoEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(784, 512)
        self.decoder = nn.Linear(512, 784)

    def forward(self, input):
        # flatten the input
        shape = input.shape
        input = input.view(input.size(0), -1)
        output_e = F.relu(self.encoder(input))
        output = F.sigmoid(self.decoder(output_e))
        output = output.view(*shape)
        return output_e, output

def loss_function(output_e, outputs, imgs, device):
    output_e.backward(torch.ones(output_e.size()).to(device), retain_graph=True)
    criterion = nn.MSELoss()
    assert outputs.shape == imgs.shape, f'outputs.shape : {outputs.shape} != imgs.shape : {imgs.shape}'
    imgs.grad.requires_grad = True
    loss1 = criterion(outputs, imgs)
    print(imgs.grad)
    loss2 = torch.mean(pow(imgs.grad, 2))
    loss = loss1 + loss2
    return loss

epochs = 50
interval = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Contractive_AutoEncoder().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for e in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader_train):
        imgs = imgs.to(device)
        labels = labels.to(device)

        outputs_e, outputs = model(imgs)
        loss = loss_function(outputs_e, outputs, imgs, device)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if i % interval:
        print('')

    print(f'epoch/epoechs: {e}/{epochs} loss : {loss.item():.4f} ')
For the sake of brevity I just used one layer for the encoder and the decoder. It should obviously work regardless of the number of layers in either of them!
But the catch here is that, aside from the fact that I don't know whether this is the correct way of doing this (calculating gradients with respect to the input), I get an error which makes the former solution wrong/not applicable.
That is:
imgs.grad.requires_grad = True
produces the error :
AttributeError : 'NoneType' object has no attribute 'requires_grad'
I also tried the second method suggested in that thread which is as follows:
class Contractive_Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(784, 512)

    def forward(self, input):
        # flatten the input
        input = input.view(input.size(0), -1)
        output_e = F.relu(self.encoder(input))
        return output_e

class Contractive_Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = nn.Linear(512, 784)

    def forward(self, input):
        # flatten the input
        output = F.sigmoid(self.decoder(input))
        output = output.view(-1, 1, 28, 28)
        return output

epochs = 50
interval = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_enc = Contractive_Encoder().to(device)
model_dec = Contractive_Decoder().to(device)

optimizer = optim.Adam([{"params": model_enc.parameters()},
                        {"params": model_dec.parameters()}], lr=0.001)

optimizer_cond = optim.Adam(model_enc.parameters(), lr=0.001)

criterion = nn.MSELoss()

for e in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader_train):
        imgs = imgs.to(device)
        labels = labels.to(device)

        outputs_e = model_enc(imgs)
        outputs = model_dec(outputs_e)
        loss_rec = criterion(outputs, imgs)
        optimizer.zero_grad()
        loss_rec.backward()
        optimizer.step()

        imgs.requires_grad_(True)
        y = model_enc(imgs)
        optimizer_cond.zero_grad()
        y.backward(torch.ones(imgs.view(-1, 28*28).size()))
        imgs.grad.requires_grad = True
        loss = torch.mean([pow(imgs.grad, 2)])
        optimizer_cond.zero_grad()
        loss.backward()
        optimizer_cond.step()

    if i % interval:
        print('')

    print(f'epoch/epoechs: {e}/{epochs} loss : {loss.item():.4f} ')
but I face the error :
RuntimeError: invalid gradient at index 0 - got [128, 784] but expected shape compatible with [128, 512]
How should I go about this in Pytorch?
Summary
The final implementation for contractive loss that I wrote is as follows:
def loss_function(output_e, outputs, imgs, lamda=1e-4, device=torch.device('cuda')):
    criterion = nn.MSELoss()
    assert outputs.shape == imgs.shape, f'outputs.shape : {outputs.shape} != imgs.shape : {imgs.shape}'

    loss1 = criterion(outputs, imgs)
    output_e.backward(torch.ones(output_e.size()).to(device), retain_graph=True)
    # Frobenius norm: the square root of the sum of all squared elements
    # of the Jacobian matrix
    loss2 = torch.sqrt(torch.sum(torch.pow(imgs.grad, 2)))
    imgs.grad.data.zero_()
    loss = loss1 + (lamda * loss2)
    return loss
and inside the training loop you need to do:
lam = 1e-4  # weight of the contractive term (matches the lamda default above)

for e in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader_train):
        imgs = imgs.to(device)
        labels = labels.to(device)

        imgs.retain_grad()
        imgs.requires_grad_(True)

        outputs_e, outputs = model(imgs)
        loss = loss_function(outputs_e, outputs, imgs, lam, device)

        imgs.requires_grad_(False)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'epoch/epochs: {e}/{epochs} loss: {loss.item():.4f}')
Full explanation
As it turns out, and as @akshayk07 rightfully pointed out in the comments, the implementation found on the PyTorch forum was wrong in multiple places. Notably, it wasn't implementing the actual contractive loss introduced in the paper Contractive Auto-Encoders: Explicit Invariance During Feature Extraction! Aside from that, the implementation wouldn't work at all, for reasons that will be explained in a moment.
The changes are obvious, so I will try to explain what's going on here. First of all, note that imgs is not a leaf node, so the gradients would not be retained in the image's .grad attribute.
In order to retain gradients for non-leaf nodes, you should use retain_grad(): .grad is only populated for leaf tensors by default. Also, imgs.retain_grad() should be called before doing the forward() pass, as it instructs autograd to store gradients for non-leaf nodes as well.
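As a minimal, standalone illustration of the leaf/non-leaf behaviour (independent of the model above): .grad is populated for leaf tensors that require gradients, while a non-leaf tensor only gets a .grad if retain_grad() is called before the backward pass.

import torch

x = torch.rand(4, 784, requires_grad=True)  # leaf tensor: .grad will be populated
h = x * 2.0                                 # non-leaf tensor: .grad stays None by default
h.retain_grad()                             # ask autograd to also store h.grad
h.sum().backward()
print(x.grad.shape)  # torch.Size([4, 784])
print(h.grad.shape)  # torch.Size([4, 784]) -- only because of retain_grad()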
Update
Thanks to @Michael for pointing out that the correct calculation of the Frobenius norm is actually (from ScienceDirect):
the square root of the sum of the squares of all the matrix entries
and not
the square root of the sum of the absolute values of all the matrix entries, as explained here.
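In PyTorch this corrected term reduces to the standard Frobenius norm, which can be sanity-checked against the built-in (a small sketch with a random stand-in matrix):

import torch

J = torch.rand(128, 512)  # stand-in for the accumulated gradients / Jacobian
frob_manual = torch.sqrt(torch.sum(torch.pow(J, 2)))  # square root of the sum of squares
frob_builtin = torch.norm(J, p='fro')                 # equivalent built-in Frobenius norm
assert torch.allclose(frob_manual, frob_builtin)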
In PyTorch 1.5.0, a high-level torch.autograd.functional.jacobian API was added. This should make the contractive objective easier to implement for an arbitrary encoder. For torch>=1.5.0, the contractive loss would look like this:
contractive_loss = torch.norm(torch.autograd.functional.jacobian(self.encoder, imgs, create_graph=True))
The create_graph argument makes the jacobian differentiable.
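Putting the two terms together, a loss built on this API might look roughly like the sketch below. The function name cae_loss_jacobian and the lamda weight are placeholders of my own, encoder is assumed to map a flattened batch of images to the code, and note that the full batched Jacobian can be expensive to compute.

import torch
import torch.nn.functional as F

def cae_loss_jacobian(encoder, imgs_flat, imgs_recon, lamda=1e-4):
    # reconstruction term
    recon_loss = F.mse_loss(imgs_recon, imgs_flat)
    # Jacobian of the code w.r.t. the input, kept differentiable via create_graph=True
    J = torch.autograd.functional.jacobian(encoder, imgs_flat, create_graph=True)
    # Frobenius norm of the Jacobian as the contractive term
    contractive = torch.norm(J)
    return recon_loss + lamda * contractive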
The main challenge in implementing the contractive autoencoder is calculating the Frobenius norm of the Jacobian, which is the gradient of the code or bottleneck layer (vector) with respect to the input layer (vector). This is the regularization term in the loss function. Fortunately, you have done the hard work in solving this for me. Thank you! You are using MSE loss for the first term; cross-entropy loss is sometimes used instead and is worth considering. I think you are almost there with the Frobenius norm, except that you need to take the square root of the sum of the squares of the Jacobian, whereas you are calculating the square root of the sum of the absolute values. Here's how I'd define the loss function (sorry, I changed the notation a little to keep myself straight):
def cae_loss_fcn(code, img_out, img_in, lamda=1e-4, device=torch.device('cuda')):
    # First term in the loss function, for ensuring representational fidelity
    criterion = nn.MSELoss()
    assert img_out.shape == img_in.shape, f'img_out.shape : {img_out.shape} != img_in.shape : {img_in.shape}'
    loss1 = criterion(img_out, img_in)

    # Second term in the loss function, for enforcing contraction of representation
    code.backward(torch.ones(code.size()).to(device), retain_graph=True)
    # Frobenius norm of Jacobian of code with respect to input image
    loss2 = torch.sqrt(torch.sum(torch.pow(img_in.grad, 2)))  # THE CORRECTION
    img_in.grad.data.zero_()

    # Total loss, the sum of the two loss terms, with weight applied to second term
    loss = loss1 + (lamda * loss2)
    return loss
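For completeness, a rough sketch of how this loss could be driven from a training loop (assuming the two-output model from the question, i.e. forward() returning (code, reconstruction)); the essential point is that the input must require gradients before the forward pass so that code.backward(...) inside the loss populates img_in.grad:

for imgs, _ in dataloader_train:
    imgs = imgs.to(device)
    imgs.requires_grad_(True)   # so imgs.grad is filled by the backward call in the loss
    code, recon = model(imgs)
    loss = cae_loss_fcn(code, recon, imgs, device=device)
    imgs.requires_grad_(False)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()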
I am currently experimenting with generative adversarial networks in Keras.
As proposed in this paper, I want to use the historical averaging loss function, meaning that I want to penalize changes of the network weights.
I am not sure how to implement it in a clever way.
I was implementing the custom loss function according to the answer to this post.
def historical_averaging_wrapper(current_weights, prev_weights):
    def historical_averaging(y_true, y_pred):
        diff = 0
        for i in range(len(current_weights)):
            diff += abs(np.sum(current_weights[i]) + np.sum(prev_weights[i]))
        return K.binary_crossentropy(y_true, y_pred) + diff
    return historical_averaging
The weights of the network are penalized, and the weights change after each batch of data.
My first idea was to update the loss function after each batch.
Roughly like this:
prev_weights = model.get_weights()
for i in range(len(data)/batch_len):
    current_weights = model.get_weights()
    model.compile(loss=historical_averaging_wrapper(current_weights, prev_weights), optimizer='adam')
    model.fit(training_data[i*batch_size:(i+1)*batch_size],
              training_labels[i*batch_size:(i+1)*batch_size],
              epochs=1, batch_size=batch_size)
    prev_weights = current_weights
Is this reasonable? That approach seems a bit "messy" to me.
Is there another possibility to do this in a "smarter" way?
Like maybe updating the loss function in a data generator and using fit_generator()?
Thanks in advance.
Loss functions are operations on the graph using tensors.
You can define additional tensors in the loss function to hold previous values. This is an example:
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

keras = tf.keras

class HistoricalAvgLoss(object):

    def __init__(self, model):
        # create tensors (initialized to zero) to hold the previous value of the
        # weights
        self.prev_weights = []
        for w in model.get_weights():
            self.prev_weights.append(K.variable(np.zeros(w.shape)))

    def loss(self, y_true, y_pred):
        err = keras.losses.mean_squared_error(y_true, y_pred)
        werr = [K.mean(K.abs(c - p)) for c, p in zip(model.get_weights(), self.prev_weights)]
        self.prev_weights = K.in_train_phase(
            [K.update(p, c) for c, p in zip(model.get_weights(), self.prev_weights)],
            self.prev_weights
        )
        return K.in_train_phase(err + K.sum(werr), err)
The variable prev_weights holds the previous values. Note that we added a K.update operation after the weight errors are calculated.
A sample model for testing:
model = keras.models.Sequential([
    keras.layers.Input(shape=(4,)),
    keras.layers.Dense(8),
    keras.layers.Dense(4),
    keras.layers.Dense(1),
])

loss_obj = HistoricalAvgLoss(model)

model.compile('adam', loss_obj.loss)
model.summary()
Some test data and objective function:
import numpy as np

def test_fn(x):
    return x[0]*x[1] + 2.0 * x[1]**2 + x[2]/x[3] + 3.0 * x[3]

X = np.random.rand(1000, 4)
y = np.apply_along_axis(test_fn, 1, X)

hist = model.fit(X, y, validation_split=0.25, epochs=10)
The model losses decrease over time, in my test.
I am trying to build a custom loss function that takes the previous output (the output from the previous iteration) of the network and uses it together with the current output.
Here is what I am trying to do, but I don't know how to complete it
def l_loss(prev_output):
    def loss(y_true, y_pred):
        pix_loss = K.mean(K.square(y_pred - y_true), axis=-1)
        pase = K.variable(100)
        diff = K.mean(K.abs(prev_output - y_pred))
        movement_loss = K.abs(pase - diff)
        total_loss = pix_loss + movement_loss
        return total_loss
    return loss

self.model.compile(optimizer=Adam(0.001, beta_1=0.5, beta_2=0.9),
                   loss=l_loss(?))
I hope you can help me.
This is what I tried:
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras import backend as K

class MovementLoss(object):

    def __init__(self):
        self.var = None

    def __call__(self, y_true, y_pred, sample_weight=None):
        mse = K.mean(K.square(y_true - y_pred), axis=-1)
        if self.var is None:
            z = np.zeros((32,))
            self.var = K.variable(z)
        delta = K.update(self.var, mse - self.var)
        return mse + delta

def make_model():
    model = Sequential()
    model.add(Dense(1, input_shape=(4,)))
    loss = MovementLoss()
    model.compile('adam', loss)
    return model

model = make_model()
model.summary()
Using some example test data:
import numpy as np

X = np.random.rand(32, 4)
POLY = [1.0, 2.0, 0.5, 3.0]

def test_fn(xi):
    return np.dot(xi, POLY)

Y = np.apply_along_axis(test_fn, 1, X)

history = model.fit(X, Y, epochs=4)
I do see the loss function oscillate in a way that appears to be influenced by the last batch's delta. Note that the loss function details are not tailored to your application.
The crucial step is that the K.update step must be part of the graph (as far as I understand it).
That is achieved by:
delta = K.update(var, delta)
return x + delta
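Adapting that pattern to the original question, you could keep a K.variable holding the previous batch's predictions and overwrite it inside the loss. This is only a sketch along the same lines (the class name PreviousOutputLoss is mine, it assumes a fixed batch size and output dimension, and the details are untested):

import numpy as np
from tensorflow.keras import backend as K

class PreviousOutputLoss(object):
    def __init__(self, batch_size, output_dim, pase=100.0):
        # holds the previous batch's predictions (starts at zero)
        self.prev_output = K.variable(np.zeros((batch_size, output_dim)))
        self.pase = K.constant(pase)

    def __call__(self, y_true, y_pred, sample_weight=None):
        pix_loss = K.mean(K.square(y_pred - y_true), axis=-1)
        diff = K.mean(K.abs(self.prev_output - y_pred))
        movement_loss = K.abs(self.pase - diff)
        # overwrite the stored predictions; using the K.update result in the
        # returned expression keeps the update inside the graph
        updated = K.update(self.prev_output, y_pred)
        return pix_loss + movement_loss + 0.0 * K.mean(updated)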
Hi, I have been trying to make a custom loss function in Keras for the dice error coefficient. It has implementations in TensorBoard, and I tried using the same function in Keras with TensorFlow, but it keeps returning a NoneType when I use model.train_on_batch or model.fit, whereas it gives proper values when used in metrics in the model. Can someone please help me out with what I should do? I have tried following libraries like Keras-FCN by ahundt, where he has used custom loss functions, but none of it seems to work. The target and output in the code are y_true and y_pred respectively, as used in the losses.py file in Keras.
import tensorflow as tf

def dice_hard_coe(target, output, threshold=0.5, axis=[1, 2], smooth=1e-5):
    """References
    -----------
    - `Wiki-Dice <https://en.wikipedia.org/wiki/Sørensen–Dice_coefficient>`_
    """
    output = tf.cast(output > threshold, dtype=tf.float32)
    target = tf.cast(target > threshold, dtype=tf.float32)
    inse = tf.reduce_sum(tf.multiply(output, target), axis=axis)
    l = tf.reduce_sum(output, axis=axis)
    r = tf.reduce_sum(target, axis=axis)
    hard_dice = (2. * inse + smooth) / (l + r + smooth)
    hard_dice = tf.reduce_mean(hard_dice)
    return hard_dice
There are two steps in implementing a parameterized custom loss function in Keras. First, write a method for the coefficient/metric. Second, write a wrapper function to format things the way Keras needs them to be.
It's actually quite a bit cleaner to use the Keras backend instead of TensorFlow directly for simple custom loss functions like DICE. Here's an example of the coefficient implemented that way:
import keras.backend as K

def dice_coef(y_true, y_pred, smooth, thresh):
    y_pred = y_pred > thresh
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
Now for the tricky part. Keras loss functions must only take (y_true, y_pred) as parameters. So we need a separate function that returns another function.
def dice_loss(smooth, thresh):
    def dice(y_true, y_pred):
        return -dice_coef(y_true, y_pred, smooth, thresh)
    return dice
Finally, you can use it as follows in Keras compile.
# build model
model = my_model()
# get the loss function
model_dice = dice_loss(smooth=1e-5, thresh=0.5)
# compile model
model.compile(loss=model_dice)
According to the documentation, you can use a custom loss function like this:
Any callable with the signature loss_fn(y_true, y_pred) that returns an array of losses (one per sample in the input batch) can be passed to compile() as a loss. Note that sample weighting is automatically supported for any such loss.
As a simple example:
def my_loss_fn(y_true, y_pred):
    squared_difference = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_difference, axis=-1)  # Note the `axis=-1`

model.compile(optimizer='adam', loss=my_loss_fn)
Complete example:
import tensorflow as tf
import numpy as np

def my_loss_fn(y_true, y_pred):
    squared_difference = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_difference, axis=-1)  # Note the `axis=-1`

model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)])

model.compile(optimizer='adam', loss=my_loss_fn)

x = np.random.rand(1000)
y = x**2

history = model.fit(x, y, epochs=10)
In addition, you can extend an existing loss function by inheriting from it, for example masking the BinaryCrossentropy:
class MaskedBinaryCrossentropy(tf.keras.losses.BinaryCrossentropy):
    def call(self, y_true, y_pred):
        mask = y_true != -1
        y_true = y_true[mask]
        y_pred = y_pred[mask]
        return super().call(y_true, y_pred)
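A quick usage sketch with made-up data, just to illustrate that samples labelled -1 are dropped from the loss (the toy model and data here are hypothetical):

import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss=MaskedBinaryCrossentropy())

x = np.random.rand(100, 4).astype('float32')
y = np.random.randint(0, 2, size=(100, 1)).astype('float32')
y[::10] = -1.0  # every tenth label is masked out of the loss
model.fit(x, y, epochs=1)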
A good starting point is the custom losses guide: https://www.tensorflow.org/guide/keras/train_and_evaluate#custom_losses