I am trying to implement a paper I read, in which the loss has a probabilistic dependence on a hidden variable in the network. I therefore need access to hidden variables, so I use the low-level API for the loss.
Here is my model:
class probablistic_model(tf.keras.Model):
    def call(self, inputs):
        return self.auto_encoder(inputs), self.z

    # get gradients
    def get_grad(self, X, Y):
        return self.auto_encoder.get_grad(X, Y)
        with tf.GradientTape() as tape:
            L = self.get_loss(X, Y)
            g = tape.gradient(L, tf.ones(self.input.shape))  # might be incorrect
        return g

    def get_loss(self, X, Y):
        with tf.GradientTape() as tape:
            z = self.z
            X = X[0]
            diff = (X - Y)
            diff = tf.norm(diff, ord=2, axis=1)
            diff *= 2
            diff *= z
            score = diff - λ * tf.norm(diff, ord=1)
            return score

    def __init__(self, dimension, sigma):
        super().__init__()
        self.z = probablistic_feature(dimension, sigma)
        self.auto_encoder = keras_auto_encoder(dimension[0], 30)
        self.λ = 2e-1
But when I try to run the model:

import csv
import sys
import numpy as np
import tensorflow as tf

tf.config.run_functions_eagerly(True)
dataset = np.loadtxt(sys.argv[1], delimiter=",")[1:-1, :]
model = probablistic_model(dataset.shape[::-1], 1)
model.compile()
model.fit(x=dataset, y=dataset)
I get:
ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0'].
Though get_grad is defined (though not correctly), why does TensorFlow ignore get_grad?
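For context: Keras's fit() never looks for a user-defined get_grad; the hook it actually calls on each batch is train_step, so a method with any other name is silently ignored. Below is a minimal sketch of overriding train_step on this model (the loss wiring is my adaptation of the question's get_loss, not the paper's method):

def train_step(self, data):
    X, Y = data
    with tf.GradientTape() as tape:
        reconstruction = self.auto_encoder(X)  # forward pass recorded on the tape
        loss = tf.reduce_mean(self.get_loss((reconstruction,), Y))
    # gradients only exist if the loss actually depends on the trainable layers
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    return {"loss": loss}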
I am new to TensorFlow and neural networks. I am trying to create an NN to estimate y = x^2.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
x_train = tf.constant(value=np.linspace(-10, 10, 50), dtype='float32')
x_train = tf.reshape(x_train, shape=[50, 1])
y_train = x_train**2
layers = [1,3,4,1]
I created a neural network class to hold my weights and biases and run forward propagation.
class NN(tf.Module):
    def __init__(self, layers, name=None):
        super().__init__(name=name)
        self.layers = layers
        self.weights, self.biases = self.initialze(layers)

    def initialze(self, layers):
        num_layers = len(layers)
        weights = []
        biases = []
        for i in range(num_layers - 1):
            in_dim = layers[i]
            out_dim = layers[i + 1]
            stddev = np.sqrt(2 / (in_dim + out_dim))
            b = tf.Variable(tf.zeros([1, out_dim], dtype='float32'), dtype='float32')
            W = tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=stddev), dtype='float32')
            weights.append(W)
            biases.append(b)
        return weights, biases

    def __call__(self, x):
        Z = x
        num_layers = len(self.layers)
        for i in range(num_layers - 1):
            Z = tf.math.add(tf.linalg.matmul(Z, self.weights[i]), self.biases[i])
        return Z
My_NN = NN(layers)
Next I created a class updat to do backward propagation.
class updat:
    def __init__(self, y_train, x_train):
        self.y_train = y_train
        self.x_train = x_train
        self.l_r = 0.1

    def get_grad(self, My_NN):
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(My_NN.weights)
            tape.watch(My_NN.biases)
            loss = tf.reduce_mean(tf.square(self.y_train - My_NN(self.x_train)))
        dw, db = tape.gradient(loss, [My_NN.weights, My_NN.biases])
        print(dw, 'weight')
        print(db, 'biases')
        My_NN.weights -= (self.l_r * dw)
        My_NN.biases -= (self.l_r * db)
        del tape
        return loss

    def report(self, loss):
        return f"W = {My_NN.weights.numpy():1.2f}, b = {My_NN.biases.numpy():1.2f}, loss={loss:2.5f}"

    def prop(self, epochs, My_NN):
        for epoch in epochs:
            loss = self.get_grad(My_NN)
            current_loss = loss
            print(f"Epoch {epoch:2d}:")
            print(" ", report(current_loss, My_NN))
But when I run the code
model = updat(y_train,x_train)
epochs = range(10)
model.prop(epochs,My_NN)
I get an error saying
My_NN.weights -= (self.l_r * dw)
My_NN.biases -=(self.l_r * db)
TypeError: can't multiply sequence by non-int of type 'float'
I tried substituting My_NN.weights -= (lr * dw)
with My_NN.weights.assign_sub(lr * dw)
but it still shows
'ListWrapper' object has no attribute 'assign_sub'
Is there any solution for this?
Turning

My_NN.weights -= (self.l_r * dw)
My_NN.biases -= (self.l_r * db)

into

for weight, d_weight in zip(My_NN.weights, dw):
    weight.assign_sub(self.l_r * d_weight)
for bias, d_bias in zip(My_NN.biases, db):
    bias.assign_sub(self.l_r * d_bias)

solves the problem.
This works because My_NN.weights is a Python list of tf.Variable references and dw is the corresponding list of gradient tensors. A list cannot be updated element-wise with -=; you have to iterate over it. Additionally, to update a tf.Variable you must use its assign/assign_sub/assign_add methods, which modify the contents the variable refers to, much like writing through a pointer in C.
More conveniently, we usually use a tf.keras.optimizers optimizer's apply_gradients(), or even minimize(), to update the variables directly.
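For example, the same update written with an optimizer might look like this (a sketch; the SGD choice and the flattened variable list are mine):

opt = tf.keras.optimizers.SGD(learning_rate=0.001)

def train_step(My_NN, x_train, y_train):
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(y_train - My_NN(x_train)))
    variables = [*My_NN.weights, *My_NN.biases]
    grads = tape.gradient(loss, variables)
    opt.apply_gradients(zip(grads, variables))  # each tf.Variable is updated in place
    return loss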
For this specific task and your more procedural coding style, here are some suggestions for stable training:
Add activations, so that the stacked layers do not collapse into a single linear map (a purely linear network cannot fit y = x^2):
def __call__(self, x):
    Z = x
    num_layers = len(self.layers)
    for i in range(num_layers - 2):
        y = tf.math.add(tf.linalg.matmul(Z, self.weights[i]), self.biases[i])
        Z = tf.nn.relu(y)
    # keep the last layer linear
    return tf.math.add(tf.linalg.matmul(Z, self.weights[-1]), self.biases[-1])
Use a lower learning rate:
self.l_r = 0.001 # self.l_r = 0.1
Run more epochs:
epochs = range(1000) # epochs = range(10)
Since the initial values of the trainable weights also influence training stability, you may need to re-train several times. In my tests, the above modifications work.
I am trying to compute the cross-entropy loss manually in PyTorch for an encoder-decoder model.
I used the code posted here to compute it: Cross Entropy in PyTorch
I updated the code to discard padded tokens (-100). The final code is this:
class compute_crossentropyloss_manual:
    """
    y0 is the tensor with shape (batch_size, C)
    x has shape (batch_size,), whose entries are integers from 0 to C-1
    """
    def __init__(self, ignore_index=-100) -> None:
        self.ignore_index = ignore_index

    def __call__(self, y0, x):
        loss = 0.
        n_batch, n_class = y0.shape
        for y1, x1 in zip(y0, x):
            class_index = int(x1.item())
            if class_index == self.ignore_index:  # <------ I added this if-statement
                continue
            loss = loss + torch.log(torch.exp(y1[class_index]) / (torch.exp(y1).sum()))
        loss = - loss / n_batch
        return loss
To verify that it works, I tested it on a text generation task and computed the loss twice: once with the PyTorch nn implementation and once with this code. The value from nn.CrossEntropyLoss and the value from the code above are not identical.
Am I missing something?
I tried to get the source code of nn.CrossEntropyLoss but I wasn't able to. In nn/functional.py at line 2955, you will see that the function delegates to another cross-entropy loss called torch._C._nn.cross_entropy_loss; I can't find that function in the repo.
Edit:
I noticed that the differences appear only when I have -100 tokens in the gold labels.
Demo example:
# criterion here is presumably nn.CrossEntropyLoss (whose default ignore_index is -100)
# and criterion2 is the manual implementation above
y = torch.randint(1, 50, (100, 50), dtype=float)
x = torch.randint(1, 50, (100,))
x[40:] = -100
print(criterion(y, x).item())
print(criterion2(y, x).item())
> 25.55788695847976
> 10.223154783391905
and when we don't have -100:
x[40:] = 30 # any positive number
print(criterion(y, x).item())
print(criterion2(y, x).item())
> 24.684453267596453
> 24.684453267596453
I solved the problem by updating the code. I had already discarded the -100 tokens (the if-statement above), but I forgot to also reduce the batch count (called n_batch in the code above). After doing that, the loss numbers are identical to the nn.CrossEntropyLoss values. The final code:
class CrossEntropyLossManual:
    """
    y0 is the tensor with shape (batch_size, C)
    x has shape (batch_size,), whose entries are integers from 0 to C-1
    """
    def __init__(self, ignore_index=-100) -> None:
        self.ignore_index = ignore_index

    def __call__(self, y0, x):
        loss = 0.
        n_batch, n_class = y0.shape
        for y1, x1 in zip(y0, x):
            class_index = int(x1.item())
            if class_index == self.ignore_index:
                n_batch -= 1
                continue
            loss = loss + torch.log(torch.exp(y1[class_index]) / (torch.exp(y1).sum()))
        loss = - loss / n_batch
        return loss
I needed this too - thank you for the manual cross-entropy loss code. It matches the PyTorch results perfectly (with my data). I have one little fix to your fix above: in the end you need to divide by the final count of non-ignored rows (those without label -100), so you need a counter:
class compute_crossentropyloss_manual:
    """
    y0 is the tensor with shape (batch_size, C)
    x has shape (batch_size,), whose entries are integers from 0 to C-1
    """
    def __init__(self, ignore_index=-100) -> None:
        self.ignore_index = ignore_index

    def __call__(self, y0, x):
        loss = 0.
        n_batch, n_class = y0.shape
        cnt = 0  # <----- I added this
        for y1, x1 in zip(y0, x):
            class_index = int(x1.item())
            if class_index == self.ignore_index:
                continue
            loss = loss + torch.log(torch.exp(y1[class_index]) / (torch.exp(y1).sum()))
            cnt += 1  # <----- I added this
        loss = - loss / cnt  # <---- I changed this from n_batch to cnt
        return loss
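For reference, the same masked average can be computed without the Python loop (a sketch using standard torch.nn.functional ops, not code from the thread):

import torch
import torch.nn.functional as F

def manual_cross_entropy(y0, x, ignore_index=-100):
    keep = x != ignore_index                    # rows whose label is not ignored
    log_probs = F.log_softmax(y0[keep], dim=1)  # log-softmax over the C classes
    rows = torch.arange(log_probs.size(0))
    return -log_probs[rows, x[keep]].mean()     # average over kept rows only

This should agree with nn.CrossEntropyLoss() up to floating-point error, since that loss also averages only over the non-ignored targets.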
I have a TensorFlow model f(x) and I sometimes need its gradients and sometimes not, depending on the result of the forward pass. In order to save computation time, I only want to compute the gradients when I need them. If I stop the gradient computation using stop_gradient() or don't record them on a GradientTape, it seems like I can never obtain the gradients without computing the forward pass again. A simplified example of what I'm trying to do looks like this (in pseudocode):
x = 5
y = f(x)
if y > 0:
compute_gradients(f, x)
Is it possible to accomplish this in TensorFlow and if so, how would I do that?
Yes, you can skip gradient updates with a simple conditional.
import tensorflow as tf
from tensorflow.python.platform import test as test_lib

# network
x_in = tf.keras.Input([10])
x_out = tf.keras.layers.Dense(1)(x_in)

# optimizer
opt = tf.keras.optimizers.Adam(1e-1)

# forward pass
def train_step(model, X, y, threshold):
    with tf.GradientTape() as tape:
        y_hat = model(X)
        # threshold = tf.math.reduce_mean(y_hat)
        loss = tf.math.reduce_mean(tf.keras.losses.MSE(y, y_hat))
    if tf.math.greater(threshold, 1.0):
        m_vars = model.trainable_variables
        m_grads = tape.gradient(loss, m_vars)
        opt.apply_gradients(zip(m_grads, m_vars))
    return loss

# test cases
class SporaticGradientUpdateTest(test_lib.TestCase):
    def setUp(self):
        self.model = tf.keras.Model(x_in, x_out)
        self.X = tf.random.normal([100, 10])
        self.y = tf.random.normal([100])
        self.w_before = self.model.get_weights()

    def test_weights_dont_change(self):
        _ = train_step(self.model, self.X, self.y, 0.99)
        # get weights that shouldn't have updated
        w_after = self.model.get_weights()
        self.assertAllClose(self.w_before, w_after)

    def test_weights_change(self):
        _ = train_step(self.model, self.X, self.y, 1.01)
        # get weights that should have updated
        w_after = self.model.get_weights()
        self.assertNotAllClose(self.w_before, w_after)

if __name__ == "__main__":
    test_lib.main()

# [ RUN      ] SporaticGradientUpdate.test_weights_change
# [       OK ] SporaticGradientUpdate.test_weights_change
# [ RUN      ] SporaticGradientUpdate.test_weights_dont_change
# [       OK ] SporaticGradientUpdate.test_weights_dont_change
Per your comment, it looks like your use-case is a little different from this example, but it should be adaptable to whatever you are trying to do.
In the example, I passed in the threshold as an arg so I could test both cases, but normally you would create it by doing something to the output of the network (like the commented out portion).
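For instance, deriving the condition from the forward pass itself, as the commented-out line suggests, might look like this (a sketch; the 0.0 cutoff is arbitrary and opt is the optimizer defined above):

def train_step(model, X, y):
    with tf.GradientTape() as tape:
        y_hat = model(X)
        loss = tf.math.reduce_mean(tf.keras.losses.MSE(y, y_hat))
    # the tape has already recorded the forward pass, so the backward pass
    # is only paid for when the condition holds
    if tf.math.reduce_mean(y_hat) > 0.0:
        m_vars = model.trainable_variables
        opt.apply_gradients(zip(tape.gradient(loss, m_vars), m_vars))
    return loss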
I want to have a custom gradient, as follows, with rounding, and then get the custom gradient in a simple operation. When I run the code below, it won't output any values with the tf.cast of mask >= 0.5 present. This should be pretty simple; what am I missing here? Note that I want the output of a call to the softhardthresh function to not affect the gradient calculation in any way.
import tensorflow as tf
from tensorflow.keras import layers

@tf.custom_gradient
def softhardthresh(x):
    mask = tf.nn.sigmoid(x)
    def grad(dy):
        return dy * (mask * (1 - mask))
    return tf.cast(mask >= 0.5, tf.float32), grad

inputs = tf.keras.Input(shape=(32,))
x = layers.Dense(32, activation=actfun)(inputs)  # actfun is defined elsewhere in my code
output = layers.Dense(32)(x)  # hard threshold
output = softhardthresh(output)
mask_ann = tf.keras.Model(inputs, output, name='mask_ann')

with tf.GradientTape(persistent=False) as tape:
    x = tf.random.uniform(shape=[10, 32])
    tape.watch(x)
    y = mask_ann(x)
grads = tape.gradient(y, mask_ann.trainable_variables)
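For comparison, the same hard-forward / soft-backward behavior can also be written with the straight-through trick (a sketch, not from the original post), which sidesteps tf.custom_gradient entirely:

def softhardthresh_st(x):
    soft = tf.nn.sigmoid(x)
    hard = tf.cast(soft >= 0.5, tf.float32)
    # forward pass returns the hard 0/1 mask; the gradient flows through the sigmoid
    return soft + tf.stop_gradient(hard - soft)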
I am forward- and back-propagating tensor data X through two simple nn.Module PyTorch model instances, model1 and model2.
I can't get this process to work without using the deprecated Variable API.
So this works just fine:
y1 = model1(X)
v = Variable(y1.data, requires_grad=training) # Its all about this line!
y2 = model2(v)
criterion = nn.NLLLoss()
loss = criterion(y2, y)
loss.backward()
y1.backward(v.grad)
self.step()
But this will throw an error:
y1 = model1(X)
y2 = model2(y1)
criterion = nn.NLLLoss()
loss = criterion(y2, y)
loss.backward()
y1.backward(y1.grad) # it breaks here
self.step()
>>> RuntimeError: grad can be implicitly created only for scalar outputs
I just can't seem to find a relevant difference between v in the first implementation and y1 in the second. In both cases requires_grad is set to True. The only thing I could find was that y1.grad_fn=<ThnnConv2DBackward> and v.grad_fn=<ThnnConv2DBackward>.
What am I missing here? What (tensor attributes?) do I not know about, and if Variable is deprecated, what other implementation would work?
[UPDATED]
You are not passing y1.grad into y1.backward correctly in the second example. After the first backward pass all the intermediate gradients are destroyed; you need a special hook to extract them, and in your case you are passing None. Here is a small example that reproduces your case:
Code:
import torch
import torch.nn as nn

torch.manual_seed(42)

class Model1(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x.pow(3)

class Model2(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x / 2

model1 = Model1()
model2 = Model2()
criterion = nn.MSELoss()

X = torch.randn(1, 5, requires_grad=True)
y = torch.randn(1, 5)

y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)

# We are going to backprop 2 times, so we need
# retain_graph=True on the first backward
loss.backward(retain_graph=True)

try:
    y1.backward(y1.grad)
except RuntimeError as err:
    print(err)
print('y1.grad: ', y1.grad)
Output:
grad can be implicitly created only for scalar outputs
y1.grad: None
So you need to extract them correctly:
Code:
def extract(V):
"""Gradient extractor.
"""
def hook(grad):
V.grad = grad
return hook
model1 = Model1()
model2 = Model2()
criterion = nn.MSELoss()
X = torch.randn(1, 5, requires_grad=True)
y = torch.randn(1, 5)
y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)
y1.register_hook(extract(y1))
loss.backward(retain_graph=True)
print('y1.grad', y1.grad)
y1.backward(y1.grad)
Output:
y1.grad: tensor([[-0.1763, -0.2114, -0.0266, -0.3293, 0.0534]])
After some investigation I came to the following two solutions.
The solution provided elsewhere in this thread retains the computation graph manually, without an option to free it, so it runs fine initially but causes OOM errors later on.
The first solution is to tie the models together using the built in torch.nn.Sequential as such:
model = torch.nn.Sequential(Model1(), Model2())
It's as easy as that. It looks clean and behaves exactly like an ordinary model would.
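A minimal usage sketch of this (the Linear layers are hypothetical stand-ins, since the toy Model1/Model2 from the earlier answer have no trainable parameters):

import torch
import torch.nn as nn

model = torch.nn.Sequential(nn.Linear(5, 8), nn.ReLU(), nn.Linear(8, 5))
opt = torch.optim.SGD(model.parameters(), lr=0.1)

X, y = torch.randn(1, 5), torch.randn(1, 5)
loss = nn.MSELoss()(model(X), y)
opt.zero_grad()
loss.backward()  # one backward pass reaches every layer in the Sequential
opt.step()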
The alternative is to simply tie them together manually:
model1 = Model1()
model2 = Model2()
y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)
loss.backward()
My fear that this would only backpropagate through model2 turned out to be unfounded, since model1 is also stored in the computation graph that is backpropagated over.
Compared to the previous implementation, this one makes the interface between the two models more transparent.