I have a TensorFlow model f(x) and I sometimes need its gradients and sometimes not, depending on the result of the forward pass. In order to save computation time, I only want to compute the gradients when I need them. If I stop the gradient computation using stop_gradient() or don't record them on a GradientTape, it seems like I can never obtain the gradients without computing the forward pass again. A simplified example of what I'm trying to do looks like this (in pseudocode):
x = 5
y = f(x)
if y > 0:
    compute_gradients(f, x)
Is it possible to accomplish this in TensorFlow and if so, how would I do that?
Yes, you can skip gradient updates with a simple conditional.
import tensorflow as tf
from tensorflow.python.platform import test as test_lib

# network
x_in = tf.keras.Input([10])
x_out = tf.keras.layers.Dense(1)(x_in)

# optimizer
opt = tf.keras.optimizers.Adam(1e-1)

# forward pass
def train_step(model, X, y, threshold):
    with tf.GradientTape() as tape:
        y_hat = model(X)
        # threshold = tf.math.reduce_mean(y_hat)
        loss = tf.math.reduce_mean(tf.keras.losses.MSE(y, y_hat))
    if tf.math.greater(threshold, 1.0):
        m_vars = model.trainable_variables
        m_grads = tape.gradient(loss, m_vars)
        opt.apply_gradients(zip(m_grads, m_vars))
    return loss
# test cases
class SporaticGradientUpdateTest(test_lib.TestCase):
    def setUp(self):
        self.model = tf.keras.Model(x_in, x_out)
        self.X = tf.random.normal([100, 10])
        self.y = tf.random.normal([100])
        self.w_before = self.model.get_weights()

    def test_weights_dont_change(self):
        _ = train_step(self.model, self.X, self.y, 0.99)
        # get weights that shouldn't have updated
        w_after = self.model.get_weights()
        self.assertAllClose(self.w_before, w_after)

    def test_weights_change(self):
        _ = train_step(self.model, self.X, self.y, 1.01)
        # get weights that should have updated
        w_after = self.model.get_weights()
        self.assertNotAllClose(self.w_before, w_after)

if __name__ == "__main__":
    test_lib.main()
# [ RUN ] SporaticGradientUpdate.test_weights_change
# [ OK ] SporaticGradientUpdate.test_weights_change
# [ RUN ] SporaticGradientUpdate.test_weights_dont_change
# [ OK ] SporaticGradientUpdate.test_weights_dont_change
Per your comment, it looks like your use case is a little different from this example, but it should be adaptable to whatever you are trying to do.
In the example, I passed the threshold in as an argument so I could test both cases, but normally you would create it from the output of the network (like the commented-out portion); a sketch of that variant follows.
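For instance, a minimal sketch of that variant, assuming the condition is the mean prediction as in the commented-out line (the name train_step_dynamic is mine; it reuses the opt defined above):

def train_step_dynamic(model, X, y):
    with tf.GradientTape() as tape:
        y_hat = model(X)
        # derive the condition from the forward pass itself
        threshold = tf.math.reduce_mean(y_hat)
        loss = tf.math.reduce_mean(tf.keras.losses.MSE(y, y_hat))
    # gradients are only computed when the condition holds
    if tf.math.greater(threshold, 1.0):
        m_vars = model.trainable_variables
        opt.apply_gradients(zip(tape.gradient(loss, m_vars), m_vars))
    return loss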
I am trying to compute the cross-entropy loss manually in PyTorch for an encoder-decoder model.
I used the code posted here to compute it: Cross Entropy in PyTorch
I updated the code to discard padded tokens (-100). The final code is this:
import torch

class compute_crossentropyloss_manual:
    """
    y0 is the vector with shape (batch_size, C)
    x shape is the same (batch_size), whose entries are integers from 0 to C-1
    """
    def __init__(self, ignore_index=-100) -> None:
        self.ignore_index = ignore_index

    def __call__(self, y0, x):
        loss = 0.
        n_batch, n_class = y0.shape
        # print(n_class)
        for y1, x1 in zip(y0, x):
            class_index = int(x1.item())
            if class_index == self.ignore_index:  # <------ I added this if-statement
                continue
            loss = loss + torch.log(torch.exp(y1[class_index]) / (torch.exp(y1).sum()))
        loss = - loss / n_batch
        return loss
To verify that it works, I tested it on a text generation task, computing the loss both with the torch.nn implementation and with this code.
The loss values are not identical:
using nn.CrossEntropyLoss:
Using the code from the link above:
Am I missing something?
I tried to get the source code of nn.CrossEntropyLoss but I wasn't able to. In this link, nn/functional.py at line 2955, you will see that the function points to another loss called torch._C._nn.cross_entropy_loss; I can't find this function in the repo.
Edit:
I noticed that the differences appear only when I have -100 tokens in the gold.
Demo example:
criterion = nn.CrossEntropyLoss()               # built-in (assumes import torch.nn as nn)
criterion2 = compute_crossentropyloss_manual()  # the manual class above
y = torch.randint(1, 50, (100, 50), dtype=float)
x = torch.randint(1, 50, (100,))
x[40:] = -100
print(criterion(y, x).item())
print(criterion2(y, x).item())
> 25.55788695847976
> 10.223154783391905
and when we don't have -100:
x[40:] = 30 # any positive number
print(criterion(y, x).item())
print(criterion2(y, x).item())
> 24.684453267596453
> 24.684453267596453
I solved the problem by updating the code. I discarded the -100 tokens (the if-statement above), but I forgot to reduce the divisor accordingly (it is called n_batch in the code above). After doing that, the loss numbers are identical to the nn.CrossEntropyLoss values. The final code:
class CrossEntropyLossManual:
    """
    y0 is the vector with shape (batch_size, C)
    x shape is the same (batch_size), whose entries are integers from 0 to C-1
    """
    def __init__(self, ignore_index=-100) -> None:
        self.ignore_index = ignore_index

    def __call__(self, y0, x):
        loss = 0.
        n_batch, n_class = y0.shape
        # print(n_class)
        for y1, x1 in zip(y0, x):
            class_index = int(x1.item())
            if class_index == self.ignore_index:
                n_batch -= 1
                continue
            loss = loss + torch.log(torch.exp(y1[class_index]) / (torch.exp(y1).sum()))
        loss = - loss / n_batch
        return loss
I needed this too - thank you for the manual cross-entropy loss code. It matches the PyTorch results perfectly (with my data). I have one little fix to your fix above: in the end you need to divide by the final count of non-ignored rows (those without label -100), so you need a counter:
class compute_crossentropyloss_manual:
    """
    y0 is the vector with shape (batch_size, C)
    x shape is the same (batch_size), whose entries are integers from 0 to C-1
    """
    def __init__(self, ignore_index=-100) -> None:
        self.ignore_index = ignore_index

    def __call__(self, y0, x):
        loss = 0.
        n_batch, n_class = y0.shape
        # print(n_class)
        cnt = 0  # <----- I added this
        for y1, x1 in zip(y0, x):
            class_index = int(x1.item())
            if class_index == self.ignore_index:
                continue
            loss = loss + torch.log(torch.exp(y1[class_index]) / (torch.exp(y1).sum()))
            cnt += 1  # <----- I added this
        loss = - loss / cnt  # <---- I changed this from n_batch to 'cnt'
        return loss
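As an aside, the same computation can be vectorized to avoid the Python loop; a minimal sketch (not from the original answers, the function name is mine) using torch.nn.functional, which should match nn.CrossEntropyLoss with mean reduction:

import torch
import torch.nn.functional as F

def cross_entropy_manual_vectorized(y0, x, ignore_index=-100):
    # keep only the rows whose target is not ignored
    mask = x != ignore_index
    log_probs = F.log_softmax(y0[mask], dim=1)
    # pick each row's target log-probability and average over the kept rows
    return -log_probs[torch.arange(log_probs.size(0)), x[mask]].mean()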
I am trying to implement a paper I read, in which the loss has some kind of probabilistic dependence on a variable in the network. I therefore need access to hidden variables, which is why I use the low-level API for the loss.
Here is my model:
class probablistic_model(tf.keras.Model):
    def call(self, inputs):
        return self.auto_encoder(inputs), self.z

    # get gradients
    def get_grad(self, X, Y):
        return self.auto_encoder.get_grad(X, Y)
        with tf.GradientTape() as tape:
            L = self.get_loss(X, Y)
        g = tape.gradient(L, tf.ones(self.input.shape))
        # might be incorrect
        return g

    def get_loss(self, X, Y):
        with tf.GradientTape() as tape:
            z = self.z
            X = X[0]
            diff = (X - Y)
            diff = tf.norm(diff, ord=2, axis=1)
            diff *= 2
            diff *= z
            score = diff - λ * tf.norm(diff, ord=1)
            return score

    def __init__(self, dimension, sigma):
        super().__init__()
        self.z = probablistic_feature(dimension, sigma)
        self.auto_encoder = keras_auto_encoder(dimension[0], 30)
        self.λ = 2e-1
but when I try to run the model
import csv
import sys
import numpy as np          # needed for np.loadtxt below
import tensorflow as tf

tf.config.run_functions_eagerly(True)
dataset = np.loadtxt(sys.argv[1], delimiter=",")[1:-1, :]
model = probablistic_model(dataset.shape[::-1], 1)
model.compile()
model.fit(x=dataset, y=dataset)
I get:
ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0'].
Even though get_grad is defined (if not correctly), why does TensorFlow ignore it?
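For what it's worth, Model.fit never calls a method named get_grad; in TF2 the documented hook for custom gradient logic is overriding train_step on the tf.keras.Model subclass. A minimal sketch of that pattern (reusing get_loss from above is my assumption):

class probablistic_model(tf.keras.Model):
    # ... __init__, call, get_loss as above ...

    def train_step(self, data):
        X, Y = data
        with tf.GradientTape() as tape:
            loss = self.get_loss(X, Y)  # assumed to return the training loss
        grads = tape.gradient(loss, self.trainable_variables)
        # requires compile(optimizer=...) so self.optimizer exists
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {"loss": loss}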
I am trying to debug a relatively complex custom training method using custom loss functions, etc. In particular, I am trying to debug an issue in a custom training step that is compiled into a TensorFlow @tf.function and fit as a compiled Keras model. I want to be able to print out an intermediate value of a tensor in a function call that is crashing. The difficulty is that tensors inside a @tf.function are graph values and aren't evaluated immediately, and since the function crashes during evaluation, it seems like the values aren't actually calculated. Here is a simple example:
class debug_model(tf.keras.Model):
    def __init__(self, width, depth, insize, outsize, batch_size):
        super(debug_model, self).__init__()
        self.width = width
        self.depth = depth
        self.insize = insize
        self.outsize = outsize
        self.net = tf.keras.models.Sequential()
        self.net.add(tf.keras.Input(shape=(insize,)))
        for i in range(depth):
            self.net.add(tf.keras.layers.Dense(width, activation='swish'))
        self.net.add(tf.keras.layers.Dense(outsize))

    def call(self, ipts):
        return self.net(ipts)

    @tf.function
    def train_step(self, data):
        ipt, target = data
        with tf.GradientTape(persistent=True) as tape_1:
            tape_1.watch(ipt)
            y = self(ipt)
            tf.print('y:', y)
            assert False
            loss = tf.keras.losses.MAE(target, y)
        trainable_vars = self.trainable_variables
        loss_grad = tape_1.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(loss_grad, trainable_vars))
        self.compiled_metrics.update_state(target, y)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}
If you compile this model with some data of your choice and run it:
train_set = tf.data.Dataset.from_tensor_slices(data_tuple).batch(opt.batchSize)
train_set.shuffle(buffer_size=trainpoints)
model = debug_model(opt.width, opt.depth, in_size, out_size, batchSize)
optimizer = tf.keras.optimizers.Adam(learning_rate=opt.lr)
lr_sched = lambda epoch, lr: lr * 0.95**(1 / 8)
cb_scheduler = tf.keras.callbacks.LearningRateScheduler(schedule=lr_sched, verbose=1)
model.build((None, 1))
model.summary()
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.MeanAbsoluteError(),
              )
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(path, verbose=2),
    cb_scheduler,
    tf.keras.callbacks.CSVLogger(path + 'log.csv')
]
hist = model.fit(train_set, epochs=opt.nEpochs, callbacks=callbacks)
If you load this up and run it, you will see that it exits with the assertion error without printing anything. Is there a way I can force this tensor to evaluate so I can print it?
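One standard way to get immediate values while debugging (not from the original post) is to disable graph compilation so the train step runs eagerly; tf.config.run_functions_eagerly is the switch for this:

import tensorflow as tf

# With eager execution forced, code inside @tf.function runs line by line,
# so tf.print fires (and values exist) before the assert crashes.
tf.config.run_functions_eagerly(True)

# ... build and fit the model as above, then restore graph mode:
tf.config.run_functions_eagerly(False)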
I am forwarding and backpropagating tensor data X through two simple nn.Module PyTorch model instances, model1 and model2.
I can't get this process to work without using the deprecated Variable API.
So this works just fine:
y1 = model1(X)
v = Variable(y1.data, requires_grad=training)  # It's all about this line!
y2 = model2(v)
criterion = nn.NLLLoss()
loss = criterion(y2, y)
loss.backward()
y1.backward(v.grad)
self.step()
But this will throw an error:
y1 = model1(X)
y2 = model2(y1)
criterion = nn.NLLLoss()
loss = criterion(y2, y)
loss.backward()
y1.backward(y1.grad) # it breaks here
self.step()
>>> RuntimeError: grad can be implicitly created only for scalar outputs
I just can't seem to find a relevant difference between v in the first implementation and y1 in the second. In both cases requires_grad is set to True. The only thing I could find was that y1.grad_fn=<ThnnConv2DBackward> and v.grad_fn=<ThnnConv2DBackward>.
What am I missing here? What (tensor attributes?) do I not know about, and if Variable is deprecated, what other implementation would work?
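For reference, the modern replacement for the Variable(y1.data, requires_grad=training) line is detach() plus requires_grad_(); a drop-in sketch (not from the original post):

# detach() cuts the new tensor out of model1's graph, just like wrapping
# y1.data did; requires_grad_() makes it a leaf that accumulates .grad.
v = y1.detach().requires_grad_(True)
y2 = model2(v)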
[UPDATED]
You are not passing y1.grad into y1.backward correctly in the second example. After the first backward, all the intermediate gradients are destroyed; you need a special hook to extract them, and in your case you are passing a None value. Here is a small example to reproduce your case:
Code:
import torch
import torch.nn as nn

torch.manual_seed(42)

class Model1(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.pow(3)

class Model2(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x / 2

model1 = Model1()
model2 = Model2()
criterion = nn.MSELoss()

X = torch.randn(1, 5, requires_grad=True)
y = torch.randn(1, 5)

y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)

# We are going to backprop 2 times, so we need
# retain_graph=True on the first backward
loss.backward(retain_graph=True)

try:
    y1.backward(y1.grad)
except RuntimeError as err:
    print(err)
print('y1.grad: ', y1.grad)
Output:
grad can be implicitly created only for scalar outputs
y1.grad: None
So you need to extract them correctly:
Code:
def extract(V):
    """Gradient extractor.
    """
    def hook(grad):
        V.grad = grad
    return hook

model1 = Model1()
model2 = Model2()
criterion = nn.MSELoss()

X = torch.randn(1, 5, requires_grad=True)
y = torch.randn(1, 5)

y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)

y1.register_hook(extract(y1))
loss.backward(retain_graph=True)
print('y1.grad:', y1.grad)
y1.backward(y1.grad)
Output:
y1.grad: tensor([[-0.1763, -0.2114, -0.0266, -0.3293, 0.0534]])
After some investigation I came to the following two solutions.
The solution provided elsewhere in this thread retained the computation graph manually, with no option to free it, so it ran fine initially but caused OOM errors later on.
The first solution is to tie the models together using the built in torch.nn.Sequential as such:
model = torch.nn.Sequential(Model1(), Model2())
it's as easy as that. It looks clean and behaves exactly like an ordinary model would.
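A minimal usage sketch (criterion and optimizer are assumed to be defined as usual):

model = torch.nn.Sequential(Model1(), Model2())

y2 = model(X)            # forward through both submodules in one call
loss = criterion(y2, y)  # single loss at the end
loss.backward()          # gradients flow back through Model2 into Model1
optimizer.step()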
The alternative is to simply tie them together manually:
model1 = Model1()
model2 = Model2()
y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)
loss.backward()
My fear that this would only backpropagate through model2 turned out to be unfounded, since model1 is also stored in the computation graph that is backpropagated over.
Compared to the previous implementation, this one also makes the interface between the two models more transparent.
Right now I have a model configured to take its inputs with feed_dict. The code looks something like this:
# model.py
class MyModel(object):
    def __init__(self, hyperparams):
        self.build_model(hyperparams)

    def build_model(self, hps):
        self.input_data = tf.placeholder(dtype=tf.float32, shape=[hps.batch_size, hps.nfeats])
        self.labels = tf.placeholder(dtype=tf.float32, shape=[hps.batch_size])
        # Define hidden layers, loss, training step, etc.

# train.py
model = MyModel(hps)
for _ in range(100):
    x, y = some_python_function()  # Read a batch from disk, preprocess
    sess.run(model.train_step, feed_dict={model.input_data: x, model.labels: y})
For performance reasons, I'd like to switch to using queues for training. But I'd like to maintain the ability to use feed_dict, e.g. for inference or testing.
Is there an elegant way to do this? What I'd like to do is, when using queues, 'swap out' the placeholder variables for the tensors returned by my queue's dequeue op. I thought that tf.assign would be the way to do this, i.e.:
single_x, single_y = tf.parse_single_example(...)
x, y = tf.train.batch([single_x, single_y], batch_size)
model = MyModel(hps)
sess.run([tf.assign(model.input_data, x), tf.assign(model.labels, y)])
for _ in range(100):
    sess.run(model.train_step)
But this raises AttributeError: 'Tensor' object has no attribute 'assign'. The API docs for tf.assign describe the first argument as: "A mutable Tensor. Should be from a Variable node. May be uninitialized." Does this mean my placeholders aren't mutable? Can I make them so? Or am I approaching this the wrong way?
Minimal runnable example here.
You could separate the creation of the Variables and the Operations by:
- adding a build_variables method called at the instantiation of your Model class,
- changing the interface of the build_model method so that it accepts your x and y tensors as arguments and builds the model operations based on them.
This way you would reuse the variables and constants of your model. The downside is that the operations are duplicated for the placeholder version and any other version.
import tensorflow as tf
import numpy as np

BATCH_SIZE = 2

class Model(object):
    def __init__(self):
        self.build_variables()

    def build_variables(self):
        self.w = tf.Variable(tf.random_normal([3, 1]))

    def build_model(self, x, y):
        self.x = x
        self.y = y
        self.output = tf.matmul(self.x, self.w)
        self.loss = tf.losses.absolute_difference(self.y, self.output)

model = Model()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

def placeholder_run():
    x = tf.placeholder(dtype=tf.float32, shape=[BATCH_SIZE, 3])
    y = tf.placeholder(dtype=tf.float32, shape=[BATCH_SIZE, 1])
    model.build_model(x, y)
    for i in range(3):
        x = np.random.rand(BATCH_SIZE, 3)
        y = x.sum(axis=1, keepdims=True)
        loss = sess.run(model.loss, feed_dict={model.x: x, model.y: y})
        print(loss)

def nonph_run():
    x = tf.random_normal([BATCH_SIZE, 3])
    y = tf.reduce_sum(x, axis=1, keep_dims=True)
    model.build_model(x, y)
    for i in range(3):
        loss = sess.run(model.loss)
        print(loss)

if __name__ == '__main__':
    # Works
    placeholder_run()
    # Doesn't fail
    nonph_run()
If you have control of your graph and know what you want upfront, you could use a switch on your input. For example,
x_plh = tf.placeholder(tf.float32, myshape)
x_dsk = my_input_from_disk()
use_dsk = tf.placeholder(tf.bool, ())
x = tf.cond(use_dsk, lambda: x_dsk, lambda: x_plh)
If you want a more flexible solution and are willing to take the somewhat pioneering route, you could have a go at the Dataset API of TensorFlow. Take time to go through the doc; it is a nice read. A single Iterator can have several initializers using different Datasets, which could fit your case; a sketch follows.
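A minimal sketch of that pattern, using a reinitializable Iterator (the data here is made up):

import numpy as np
import tensorflow as tf

# Two Datasets with the same structure: one pre-built, one fed at run time.
x_plh = tf.placeholder(tf.float32, [None, 3])
ds_fixed = tf.data.Dataset.from_tensor_slices(
    np.random.rand(100, 3).astype(np.float32)).batch(2)
ds_fed = tf.data.Dataset.from_tensor_slices(x_plh).batch(2)

# One Iterator with an initializer per Dataset.
iterator = tf.data.Iterator.from_structure(ds_fixed.output_types,
                                           ds_fixed.output_shapes)
x = iterator.get_next()  # build your model on this tensor
init_fixed = iterator.make_initializer(ds_fixed)
init_fed = iterator.make_initializer(ds_fed)

with tf.Session() as sess:
    sess.run(init_fixed)  # use the pre-built pipeline
    print(sess.run(x))
    sess.run(init_fed, feed_dict={x_plh: np.ones((4, 3), np.float32)})
    print(sess.run(x))    # now reads from the fed data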