TensorFlow version: TensorFlow 2.1
I want to get the gradients with respect to the input instead of the gradients with respect to the trainable weights. I adjusted the example from https://www.tensorflow.org/guide/keras/train_and_evaluate to:
import tensorflow as tf
import numpy as np

physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, 'Not enough GPU hardware devices available'
tf.config.experimental.set_memory_growth(physical_devices[0], True)

def loss_fun(y_true, y_pred):
    loss = tf.reduce_mean(tf.square(y_true - y_pred), axis=-1)
    return loss

# Create a dataset
x = np.random.rand(10, 180, 320, 3).astype(np.float32)
y = np.random.rand(10, 1).astype(np.float32)
dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(1)

# Create a model
base_model = tf.keras.applications.MobileNet(input_shape=(180, 320, 3), weights=None, include_top=False)
x = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.models.Model(inputs=base_model.input, outputs=output)

for input, target in dataset:
    for iteration in range(400):
        with tf.GradientTape() as tape:
            # Run the forward pass of the layer.
            # The operations that the layer applies
            # to its inputs are going to be recorded
            # on the GradientTape.
            prediction = model(input, training=False)  # Logits for this minibatch

            # Compute the loss value for this minibatch.
            loss_value = loss_fun(target, prediction)

        # Use the gradient tape to automatically retrieve
        # the gradients of the trainable variables with respect to the loss.
        grads = tape.gradient(loss_value, model.inputs)
        print(grads)  # output: [None]

        # Run one step of gradient descent by updating
        # the value of the variables to minimize the loss.
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        optimizer.apply_gradients(zip(grads, model.inputs))
        print('Iteration {}'.format(iteration))
However, this does not work, because grads = tape.gradient(loss_value, model.inputs) returns [None]. Is this intended behaviour? If so, what is the recommended way to get the gradients with respect to the input?
To get it working, two things need to be added:
Converting the image to a tf.Variable
Using tape.watch to watch the desired variable so that gradients with respect to it are recorded
image = tf.Variable(input)
for iteration in range(400):
    with tf.GradientTape() as tape:
        tape.watch(image)
        # Run the forward pass of the layer.
        # The operations that the layer applies
        # to its inputs are going to be recorded
        # on the GradientTape.
        prediction = model(image, training=False)  # Logits for this minibatch

        # Compute the loss value for this minibatch.
        loss_value = loss_fun(target, prediction)

    # Use the gradient tape to automatically retrieve
    # the gradients of the loss with respect to the input image.
    grads = tape.gradient(loss_value, image)
    # print(grads)

    # Run one step of gradient descent by updating
    # the value of the image to minimize the loss.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    optimizer.apply_gradients(zip([grads], [image]))
    print('Iteration {}'.format(iteration))
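If you only need the gradient values themselves (for example, for saliency maps) rather than updating the input with an optimizer, converting to a tf.Variable is not strictly necessary; watching the input tensor on the tape is enough. A minimal sketch, reusing model, loss_fun, and dataset from above:
for input, target in dataset:
    with tf.GradientTape() as tape:
        tape.watch(input)  # input is a plain tensor, so the tape must be told to track it
        prediction = model(input, training=False)
        loss_value = loss_fun(target, prediction)
    input_grads = tape.gradient(loss_value, input)  # same shape as the input batch, no longer None
    print(input_grads.shape)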
Related
I have one doubt. I am also using Mask R-CNN, but with TensorFlow 2.0. When I run TensorBoard I only get a single loss curve instead of a graph for each individual loss, and I am not sure how to retrieve each loss.
This is the compile section of my model.py:
def compile(self, learning_rate, momentum):
    """Gets the model ready for training. Adds losses, regularization, and
    metrics. Then calls the Keras compile() function.
    """
    # Optimizer object
    optimizer = keras.optimizers.SGD(
        lr=learning_rate, momentum=momentum,
        clipnorm=self.config.GRADIENT_CLIP_NORM)
    loss_names = [
        "rpn_class_loss", "rpn_bbox_loss",
        "mrcnn_class_loss", "mrcnn_bbox_loss", "mrcnn_mask_loss"]
    added_loss_name = []
    for name in loss_names:
        layer = self.keras_model.get_layer(name)
        if layer.output.name in added_loss_name:
            # if layer.output in self.keras_model.losses:
            continue
        loss = (
            tf.math.reduce_mean(layer.output, keepdims=True)
            * self.config.LOSS_WEIGHTS.get(name, 1.))
        self.keras_model.add_loss(loss)
        added_loss_name.append(layer.output.name)

    # Add L2 Regularization
    # Skip gamma and beta weights of batch normalization layers.
    reg_losses = [
        keras.regularizers.l2(self.config.WEIGHT_DECAY)(w) / tf.cast(tf.size(w), tf.float32)
        for w in self.keras_model.trainable_weights
        if 'gamma' not in w.name and 'beta' not in w.name]
    self.keras_model.add_loss(tf.add_n(reg_losses))

    # Compile
    self.keras_model.compile(
        optimizer=optimizer,
        loss=[None] * len(self.keras_model.outputs))

    # Add metrics for losses
    for name in loss_names:
        if name in self.keras_model.metrics_names:
            continue
        layer = self.keras_model.get_layer(name)
        self.keras_model.metrics_names.append(name)
        loss = (
            tf.reduce_mean(layer.output, keepdims=True)
            * self.config.LOSS_WEIGHTS.get(name, 1.))
        self.keras_model.metrics.append(loss)
I want to see each loss separately instead of one single loss, like in the image below (from matterport/Mask_RCNN):
[image: separate losses]
I'm trying to create a contractive autoencoder in PyTorch. I found this thread and tried to follow it. This is the snippet I wrote based on that thread:
import datetime
import numpy as np
import torch
import torchvision
from torchvision import datasets, transforms
from torchvision.utils import save_image, make_grid
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline

dataset_train = datasets.MNIST(root='MNIST',
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)
dataset_test = datasets.MNIST(root='MNIST',
                              train=False,
                              transform=transforms.ToTensor(),
                              download=True)
batch_size = 128
num_workers = 2
dataloader_train = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers,
                                               pin_memory=True)
dataloader_test = torch.utils.data.DataLoader(dataset_test,
                                              batch_size=batch_size,
                                              num_workers=num_workers,
                                              pin_memory=True)

def view_images(imgs, labels, rows=4, cols=11):
    imgs = imgs.detach().cpu().numpy().transpose(0, 2, 3, 1)
    fig = plt.figure(figsize=(8, 4))
    for i in range(imgs.shape[0]):
        ax = fig.add_subplot(rows, cols, i + 1, xticks=[], yticks=[])
        ax.imshow(imgs[i].squeeze(), cmap='Greys_r')
        ax.set_title(labels[i].item())

# now let's view some
imgs, labels = next(iter(dataloader_train))
view_images(imgs, labels, 13, 10)

class Contractive_AutoEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(784, 512)
        self.decoder = nn.Linear(512, 784)

    def forward(self, input):
        # flatten the input
        shape = input.shape
        input = input.view(input.size(0), -1)
        output_e = F.relu(self.encoder(input))
        output = F.sigmoid(self.decoder(output_e))
        output = output.view(*shape)
        return output_e, output

def loss_function(output_e, outputs, imgs, device):
    output_e.backward(torch.ones(output_e.size()).to(device), retain_graph=True)
    criterion = nn.MSELoss()
    assert outputs.shape == imgs.shape, f'outputs.shape : {outputs.shape} != imgs.shape : {imgs.shape}'
    imgs.grad.requires_grad = True
    loss1 = criterion(outputs, imgs)
    print(imgs.grad)
    loss2 = torch.mean(pow(imgs.grad, 2))
    loss = loss1 + loss2
    return loss

epochs = 50
interval = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Contractive_AutoEncoder().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for e in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader_train):
        imgs = imgs.to(device)
        labels = labels.to(device)

        outputs_e, outputs = model(imgs)
        loss = loss_function(outputs_e, outputs, imgs, device)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % interval:
            print('')
            print(f'epoch/epochs: {e}/{epochs} loss : {loss.item():.4f} ')
For the sake of brevity I just used one layer each for the encoder and the decoder; it should obviously work regardless of the number of layers in either of them.
But the catch here is, aside from the fact that I don't know whether this is the correct way of doing this (calculating gradients with respect to the input), I get an error which makes the former solution wrong/not applicable.
That is, the line
imgs.grad.requires_grad = True
produces the error:
AttributeError: 'NoneType' object has no attribute 'requires_grad'
I also tried the second method suggested in that thread, which is as follows:
class Contractive_Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(784, 512)

    def forward(self, input):
        # flatten the input
        input = input.view(input.size(0), -1)
        output_e = F.relu(self.encoder(input))
        return output_e

class Contractive_Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = nn.Linear(512, 784)

    def forward(self, input):
        output = F.sigmoid(self.decoder(input))
        output = output.view(-1, 1, 28, 28)
        return output

epochs = 50
interval = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_enc = Contractive_Encoder().to(device)
model_dec = Contractive_Decoder().to(device)

optimizer = optim.Adam([{"params": model_enc.parameters()},
                        {"params": model_dec.parameters()}], lr=0.001)
optimizer_cond = optim.Adam(model_enc.parameters(), lr=0.001)

criterion = nn.MSELoss()

for e in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader_train):
        imgs = imgs.to(device)
        labels = labels.to(device)

        outputs_e = model_enc(imgs)
        outputs = model_dec(outputs_e)
        loss_rec = criterion(outputs, imgs)
        optimizer.zero_grad()
        loss_rec.backward()
        optimizer.step()

        imgs.requires_grad_(True)
        y = model_enc(imgs)
        optimizer_cond.zero_grad()
        y.backward(torch.ones(imgs.view(-1, 28 * 28).size()))
        imgs.grad.requires_grad = True
        loss = torch.mean([pow(imgs.grad, 2)])
        optimizer_cond.zero_grad()
        loss.backward()
        optimizer_cond.step()

        if i % interval:
            print('')
            print(f'epoch/epochs: {e}/{epochs} loss : {loss.item():.4f} ')
but I get the error:
RuntimeError: invalid gradient at index 0 - got [128, 784] but expected shape compatible with [128, 512]
How should I go about this in PyTorch?
Summary
The final implementation of the contractive loss that I wrote is as follows:
def loss_function(output_e, outputs, imgs, lamda=1e-4, device=torch.device('cuda')):
    criterion = nn.MSELoss()
    assert outputs.shape == imgs.shape, f'outputs.shape : {outputs.shape} != imgs.shape : {imgs.shape}'

    loss1 = criterion(outputs, imgs)
    output_e.backward(torch.ones(output_e.size()).to(device), retain_graph=True)
    # Frobenius norm: the square root of the sum of the squares of all
    # elements in the Jacobian matrix
    loss2 = torch.sqrt(torch.sum(torch.pow(imgs.grad, 2)))
    imgs.grad.data.zero_()
    loss = loss1 + (lamda * loss2)
    return loss
and inside the training loop you need to do:
lam = 1e-4  # weight of the contractive term (matches the lamda default above)
for e in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader_train):
        imgs = imgs.to(device)
        labels = labels.to(device)

        imgs.requires_grad_(True)  # must be set before retain_grad() and before the forward pass
        imgs.retain_grad()

        outputs_e, outputs = model(imgs)
        loss = loss_function(outputs_e, outputs, imgs, lam, device)

        imgs.requires_grad_(False)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'epoch/epochs: {e}/{epochs} loss: {loss.item():.4f}')
Full explanation
As it turns out, and as @akshayk07 rightfully pointed out in the comments, the implementation found on the PyTorch forum was wrong in multiple places. Most notably, it wasn't implementing the actual contractive loss introduced in the paper Contractive Auto-Encoders: Explicit Invariance During Feature Extraction, and aside from that, it wouldn't work at all, for reasons that will be explained in a moment.
The changes are obvious, so I'll try to explain what's going on here. First of all, note that imgs is not a leaf node, so the gradients would not be retained in the image's .grad attribute.
.grad is only populated for leaf tensors; in order to retain gradients for non-leaf nodes, you should use retain_grad(). Also, imgs.retain_grad() should be called before doing the forward() pass, as it instructs autograd to store gradients in non-leaf nodes.
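A small, self-contained illustration of this leaf vs. non-leaf behaviour (the tensors here are just toy examples):
import torch

x = torch.randn(3, requires_grad=True)  # leaf tensor
y = x * 2                               # non-leaf tensor (result of an operation)
y.retain_grad()                         # ask autograd to keep y.grad as well

z = (y ** 2).sum()
z.backward()

print(x.grad)  # populated: x is a leaf
print(y.grad)  # populated only because of retain_grad(); otherwise it would be None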
Update
Thanks to @Michael for pointing out that the correct calculation of the Frobenius norm is actually (from ScienceDirect):
the square root of the sum of the squares of all the matrix entries
and not
the square root of the sum of the absolute values of all the matrix entries, as explained here.
In PyTorch 1.5.0, a high-level torch.autograd.functional.jacobian API was added. This should make the contractive objective easier to implement for an arbitrary encoder. For torch >= 1.5.0, the contractive loss would look like this:
contractive_loss = torch.norm(torch.autograd.functional.jacobian(self.encoder, imgs, create_graph=True))
The create_graph argument makes the Jacobian differentiable.
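As a minimal sketch of how this could slot into a full loss function: the encoder, decoder, imgs, and lamda names below are assumptions carried over from the snippets above, where encoder and decoder are assumed to map a flattened (batch, 784) input to a (batch, 512) code and back to (batch, 784):
import torch
import torch.nn as nn

def jacobian_cae_loss(encoder, decoder, imgs, lamda=1e-4):
    # reconstruction term
    code = encoder(imgs)
    recon = decoder(code)
    recon_loss = nn.functional.mse_loss(recon, imgs)
    # contraction term: Frobenius norm of the Jacobian of the code w.r.t. the input;
    # create_graph=True keeps the Jacobian differentiable so the term can be trained through
    jac = torch.autograd.functional.jacobian(encoder, imgs, create_graph=True)
    contractive = torch.norm(jac)
    return recon_loss + lamda * contractive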
The main challenge in implementing the contractive autoencoder is calculating the Frobenius norm of the Jacobian, which is the gradient of the code or bottleneck layer (vector) with respect to the input layer (vector). This is the regularization term in the loss function. Fortunately, you have done the hard work of solving this for me. Thank you! You are using MSE loss for the first term; cross-entropy loss is sometimes used instead and is worth considering. I think you are almost there with the Frobenius norm, except that you need to take the square root of the sum of the squares of the Jacobian, whereas you are calculating the square root of the sum of the absolute values. Here's how I'd define the loss function (sorry, I changed the notation a little to keep myself straight):
def cae_loss_fcn(code, img_out, img_in, lamda=1e-4, device=torch.device('cuda')):
    # First term in the loss function, for ensuring representational fidelity
    criterion = nn.MSELoss()
    assert img_out.shape == img_in.shape, f'img_out.shape : {img_out.shape} != img_in.shape : {img_in.shape}'
    loss1 = criterion(img_out, img_in)

    # Second term in the loss function, for enforcing contraction of representation
    code.backward(torch.ones(code.size()).to(device), retain_graph=True)
    # Frobenius norm of Jacobian of code with respect to input image
    loss2 = torch.sqrt(torch.sum(torch.pow(img_in.grad, 2)))  # THE CORRECTION
    img_in.grad.data.zero_()

    # Total loss, the sum of the two loss terms, with weight applied to second term
    loss = loss1 + (lamda * loss2)
    return loss
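For completeness, a usage sketch of cae_loss_fcn with the model, dataloader, and optimizer defined earlier; the important part is that the input must require gradients before the forward pass so that img_in.grad gets populated by code.backward():
for imgs, _ in dataloader_train:
    imgs = imgs.to(device)
    imgs.requires_grad_(True)   # so that imgs.grad is filled by code.backward() inside the loss
    imgs.retain_grad()

    code, recon = model(imgs)   # Contractive_AutoEncoder returns (code, reconstruction)
    loss = cae_loss_fcn(code, recon, imgs, lamda=1e-4, device=device)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()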
I am trying to use tf.GradientTape, loosely based on the example in https://www.tensorflow.org/beta/tutorials/eager/custom_training_walkthrough, and I need to create a custom loss function where each prediction gets a weighted loss value depending on the outcome.
This is a three-class classification problem, where the loss function takes the features 'x' (130), the labels 'y' (0, 1 or 2), and the 'weights' (one weight for each label), which depend on whether the prediction matches the label or not. Here is my code:
def TF_learning(training_data, training_results, testing_data):

    odds = [i[-2:] for i in training_data]

    training_data = tf.keras.utils.normalize(training_data, axis=1)
    testing_data = tf.keras.utils.normalize(testing_data, axis=1)

    minutes = int((len(training_data[0]) - 10) / 2)

    dense_layers = 1
    neurons = 32
    epochs = 70

    NAME = "{}-nodes-{}-dense".format(neurons, dense_layers)
    tensorboard = TensorBoard(log_dir='logs/{}'.format(NAME))
    #print(NAME)

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Flatten())
    for i_layer in range(0, dense_layers):
        #model.add(tf.keras.layers.batch_normalization(training_data))
        model.add(tf.keras.layers.Dense(neurons, activation=tf.nn.relu))
        model.add(tf.keras.layers.Dropout(0.2))
        model.add(tf.keras.layers.Dense(neurons / 2., activation=tf.nn.relu))
        model.add(tf.keras.layers.Dropout(0.1))
    model.add(tf.keras.layers.Dense(3, activation=tf.nn.softmax))

    #tf.function
    def loss(model, x, y, weights):
        x = model(x)
        x_range = tf.range(x.shape.as_list()[-1], dtype=x.dtype)
        y_ = tf.reduce_sum(tf.nn.softmax(x * 1e10) * x_range, axis=-1)
        y_ = tf.cast(y_, dtype=tf.int32)
        y_ = tf.one_hot(y_, depth=3)
        y = tf.cast(y, tf.int64)
        y = tf.one_hot(y, depth=3)
        correct = tf.multiply(y_, y)
        wrong = tf.add(tf.multiply(y[:, 0], y_[:, 2]), tf.multiply(y[:, 2], y_[:, 0]))
        indices = tf.cast(tf.stack([tf.range(tf.shape(weights)[0], dtype=tf.int32), tf.ones(tf.shape(weights)[0], dtype=tf.int32)], axis=1), dtype=tf.int32)
        scatter = tf.tensor_scatter_nd_update(correct, indices, wrong)
        scatter = tf.cast(scatter, dtype=tf.float64)
        loss_array = tf.multiply(scatter, weights)
        loss = tf.reduce_sum(loss_array)
        return loss

    #tf.function
    def grad(model, inputs, targets, weights):
        with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
            loss_value = loss(model, training_data, training_results, weights)
            print(tape.gradient(loss_value, model.trainable_variables))
        return loss_value, tape.gradient(loss_value, model.trainable_variables)  # Doesn't work, model.variables is empty

    weights = - tf.Variable(np.insert(odds, 1, values=0, axis=1), dtype=tf.float64) + 1

    l = loss(model, training_data, training_results, weights)
    print("Loss test: {}".format(l))

    optimizer = tf.keras.optimizers.Adam(lr=0.1, decay=1e-5)

    loss_value, grads = grad(model, training_data, training_results, weights)
    print("Step: {}, Initial Loss: {}".format(optimizer.iterations.numpy(),
                                               loss_value.numpy()))

    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    print("Step: {}, Loss: {}".format(optimizer.iterations.numpy(),
                                      loss(model, training_data, training_results).numpy()))
How do I make something like this in TensorFlow? I just need a loss that is weighted depending on whether the prediction is correct or not.
I guess the gradient can't be calculated because, when it takes a small step, the number will still be converted to the same integer. I get the following error.
Loss test: 7.040000000000001
WARNING: Logging before flag parsing goes to stderr.
W0711 18:04:30.068719 9868 backprop.py:935] Calling GradientTape.gradient on a persistent tape inside it's context is significantly less efficient than calling it outside the context (it causes the gradient ops to be recorded on the tape, leading to increased CPU and memory usage). Only call GradientTape.gradient inside the context if you actually want to trace the gradient in order to compute higher order derrivatives.
[None, None, None, None, None, None]
Step: 0, Initial Loss: 7.040000000000001
Traceback (most recent call last):
  File "ML_test.py", line 322, in <module>
    predictions = TF_learning(training_data=X_train,training_results=Y_train,testing_data=X_test)
  File "C:\Code\ATP\Ad_hoc_opgaver\Test\ML_tests\machine_learning_tf2.py", line 157, in TF_learning
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
  File "C:\Code\lib\site-packages\tensorflow\python\keras\optimizer_v2\optimizer_v2.py", line 396, in apply_gradients
    grads_and_vars = _filter_grads(grads_and_vars)
  File "C:\Code\lib\site-packages\tensorflow\python\keras\optimizer_v2\optimizer_v2.py", line 924, in _filter_grads
    ([v.name for _, v in grads_and_vars],))
ValueError: No gradients provided for any variable: ['sequential/dense/kernel:0', 'sequential/dense/bias:0', 'sequential/dense_1/kernel:0', 'sequential/dense_1/bias:0', 'sequential/dense_2/kernel:0', 'sequential/dense_2/bias:0'].
Is there any way to make this work? Maybe with an optimizer that doesn't use gradient descent, but random sampling? Or one that takes a big enough step to get a gradient?
My main question is: is averaging the loss the same thing as averaging the gradient, and how do I accumulate my loss over mini-batches and then calculate my gradient?
I have been trying to implement policy gradient in TensorFlow and ran into the issue that I cannot feed all my game states into my network at once and then update. The problem is, if I lower my network size, train on all frames at once, and take the mean of the loss, it begins to converge nicely. But if I accumulate the gradients over mini-batches and then average them, my gradients explode and I overflow my weights.
Any help or insight will be very much appreciated.
Keep in mind also, this is my first time asking a question here.
What you can do is accumulate gradients after each mini-batch and then update the weights based on the gradient averages. Consider the following simple case of fitting 50 Gaussian blobs with a single-layer perceptron:
from sklearn.datasets import make_blobs
import tensorflow as tf
import numpy as np

x_train, y_train = make_blobs(n_samples=50,
                              n_features=2,
                              centers=[[1, 1], [-1, -1]],
                              cluster_std=0.5)

with tf.name_scope('x'):
    x = tf.placeholder(tf.float32, [None, 2])
    y = tf.placeholder(tf.int32, [None])

with tf.name_scope('layer'):
    logits = tf.layers.dense(x,
                             units=2,
                             kernel_initializer=tf.contrib.layers.xavier_initializer())

with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss_op = tf.reduce_mean(xentropy)
The minimize() method of the TensorFlow optimizers calls compute_gradients() and then apply_gradients(). Instead of calling minimize(), I'm going to call both methods directly. First, to get the gradients we call compute_gradients() (which returns a list of tuples grads_and_vars), and for apply_gradients(), instead of the gradients themselves, I'm going to feed placeholders for the future gradient averages:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
grads_and_vars = optimizer.compute_gradients(loss_op)
grads = [g for g, v in grads_and_vars]

# placeholders for gradient averages
placeholder_grads = [tf.placeholder(tf.float32, [None] + g.get_shape().as_list())
                     for g in grads]

new_grads_and_vars = [(tf.reduce_mean(p, axis=0), gv[1])
                      for p, gv in zip(placeholder_grads, grads_and_vars)]

apply_grads_op = optimizer.apply_gradients(new_grads_and_vars)
During the mini-batches we only compute the losses (you can accumulate the losses as well: append them to a list and then compute the average) and the gradients, without applying the gradients to the weights. At the end of each epoch we execute the apply_grads_op operation while feeding the accumulated gradients to its placeholders:
data = tf.data.Dataset.from_tensor_slices({'x': x_train, 'y': y_train}).batch(10)
iterator = data.make_initializable_iterator()

n_epochs = 2
with tf.Session() as sess:
    _ = sess.run([tf.global_variables_initializer(), iterator.initializer])
    next_batch = iterator.get_next()
    for epoch in range(n_epochs):
        epoch_grads = []
        while True:
            try:
                batch = sess.run(next_batch)
                evaled = sess.run([loss_op] + grads,
                                  feed_dict={x: batch['x'], y: batch['y']})
                epoch_grads.append(evaled[1:])
                print('batch loss:', evaled[0])
            except tf.errors.OutOfRangeError:
                _ = sess.run(iterator.initializer)
                feed_dict = {p: [g[i] for g in epoch_grads]
                             for i, p in enumerate(placeholder_grads)}
                _ = sess.run(apply_grads_op, feed_dict=feed_dict)
                break
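The same accumulate-then-average idea can also be written in TF 2.x eager mode. A minimal sketch, assuming model is a Keras model, loss_fn a Keras-style loss (y_true, y_pred), and dataset a tf.data.Dataset of (x, y) mini-batches:
import tensorflow as tf

optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

def train_one_epoch(model, loss_fn, dataset):
    # one accumulator per trainable variable
    accumulated = [tf.zeros_like(v) for v in model.trainable_variables]
    n_batches = 0
    for x_batch, y_batch in dataset:
        with tf.GradientTape() as tape:
            loss_value = loss_fn(y_batch, model(x_batch, training=True))
        grads = tape.gradient(loss_value, model.trainable_variables)
        # accumulate per-variable gradients across mini-batches without applying them
        accumulated = [acc + g for acc, g in zip(accumulated, grads)]
        n_batches += 1
    # apply the average of the accumulated gradients once per epoch
    averaged = [acc / tf.cast(n_batches, acc.dtype) for acc in accumulated]
    optimizer.apply_gradients(zip(averaged, model.trainable_variables))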
I'm using TensorBoard 1.5 and I would like to see how my gradients are doing.
Here is an example of a layer I am using:
net = tf.layers.dense(features, 40, activation=tf.nn.relu, kernel_regularizer=regularizer,
                      kernel_initializer=tf.contrib.layers.xavier_initializer())
And here is my optimizer:
train_op = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)
For my model parameters I create summaries this way:
for var in tf.trainable_variables():
    tf.summary.histogram(var.name, var)
Is there a similar way to get all the gradients in a for loop to create my summaries?
You should first get the gradients using the optimizer's compute_gradients and then pass them to the summaries:
opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Calculate the gradients for the batch of data
grads = opt.compute_gradients(loss)

# Add histograms for gradients.
for grad, var in grads:
    if grad is not None:
        summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))
And then, to perform the training, you can call apply_gradients on the optimizer:
# Apply the gradients to adjust the shared variables.
train_op = opt.apply_gradients(grads, global_step=global_step)
For more, you can look at the TensorFlow CIFAR-10 tutorial.
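To actually see these histograms in TensorBoard, the summaries still need to be merged and written to disk. A minimal sketch in the same TF 1.x style, where the log directory and the features / labels placeholders with their x_batch / y_batch feeds are assumptions for illustration:
merged = tf.summary.merge_all()  # picks up the gradient histograms created above
writer = tf.summary.FileWriter('logs/gradients', tf.get_default_graph())

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
        # features / labels and x_batch / y_batch stand in for whatever feeds your graph expects
        _, summary = sess.run([train_op, merged],
                              feed_dict={features: x_batch, labels: y_batch})
        writer.add_summary(summary, global_step=step)
    writer.close()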