Debugging Tensorflow 2.0: Printing in a tf.function that crashes - python

I am trying to debug a relatively complex custom training method that uses custom loss functions, etc. In particular, I am trying to debug an issue in a custom training step, which is compiled into a TensorFlow @tf.function and fitted as part of a compiled Keras model. I want to be able to print an intermediate value of a tensor in a function call that is crashing. The difficulty is that tensors inside a @tf.function are graph values and aren't evaluated immediately, and since the function crashes during evaluation, it seems like the values are never actually calculated. Here is a simple example:
class debug_model(tf.keras.Model):
    def __init__(self, width, depth, insize, outsize, batch_size):
        super(debug_model, self).__init__()
        self.width = width
        self.depth = depth
        self.insize = insize
        self.outsize = outsize
        self.net = tf.keras.models.Sequential()
        self.net.add(tf.keras.Input(shape=(insize,)))
        for i in range(depth):
            self.net.add(tf.keras.layers.Dense(width, activation='swish'))
        self.net.add(tf.keras.layers.Dense(outsize))

    def call(self, ipts):
        return self.net(ipts)

    @tf.function
    def train_step(self, data):
        ipt, target = data
        with tf.GradientTape(persistent=True) as tape_1:
            tape_1.watch(ipt)
            y = self(ipt)
            tf.print('y:', y)
            assert False
            loss = tf.keras.losses.MAE(target, y)
        trainable_vars = self.trainable_variables
        loss_grad = tape_1.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(loss_grad, trainable_vars))
        self.compiled_metrics.update_state(target, y)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}
If you compile this model with some data of your choice and run it:
train_set = tf.data.Dataset.from_tensor_slices(data_tuple).batch(opt.batchSize)
train_set.shuffle(buffer_size=trainpoints)
model = debug_model(opt.width, opt.depth, in_size, out_size, batchSize)
optimizer = tf.keras.optimizers.Adam(learning_rate=opt.lr)
lr_sched = lambda epoch, lr: lr * 0.95**(1 / (8))
cb_scheduler = tf.keras.callbacks.LearningRateScheduler(schedule=lr_sched, verbose=1)
model.build((None, 1))
model.summary()
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.MeanAbsoluteError(),
              )
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(path,
                                       verbose=2
                                       ),
    cb_scheduler,
    tf.keras.callbacks.CSVLogger(path + 'log.csv')
]
hist = model.fit(train_set, epochs=opt.nEpochs, callbacks=callbacks)
If you load this up and run it, you will see that it exits due to the assertion error without printing anything. Is there a way I can force this tensor to evaluate so I can print it?
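A minimal sketch of one standard way to get those prints to show up, assuming a recent TF 2.x (the model and optimizer names are the ones from the snippet above): run the function eagerly so the graph is never compiled and tf.print executes line by line, right up to the failing statement.

import tensorflow as tf

# Globally disable tf.function compilation so train_step runs eagerly
# (on TF 2.0-2.2 the call is tf.config.experimental_run_functions_eagerly(True)).
tf.config.run_functions_eagerly(True)

# If the explicit @tf.function decorator is removed from train_step, the same
# effect can be requested per-model via compile:
# model.compile(optimizer=optimizer, loss=tf.keras.losses.MeanAbsoluteError(), run_eagerly=True)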

Related

Training a tensorflow model with an intermediate function call in training loop

I am trying to train a simple neural network where the input data is taken from a MATLAB Simulink simulation and the output is then fed back into a different MATLAB Simulink simulation. My code is as follows:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

def get_pid_values():
    # call simulink model that just produces PID values
    return random.random()

def get_plant(intermediate_val):
    # get plant output.
    return random.random()

class CustomDataGen(tf.keras.utils.Sequence):
    def __init__(self, df, X_col,
                 batch_size,
                 input_size=(1,),
                 shuffle=True):
        self.df = df.copy()
        self.X_col = X_col
        self.batch_size = batch_size
        self.input_size = input_size
        self.shuffle = shuffle
        self.n = len(self.df)

    def __get_input(self, index):
        # Need to adjust this to support retrieving ref voltage.
        return self.df[self.X_col].iloc[index]

    def on_epoch_end(self):
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __getitem__(self, index):
        X = self.__get_input(index)
        return X

    def __len__(self):
        return self.n // self.batch_size

def get_model(input_shape, hidden, output_shape):
    inputs = keras.layers.Input(shape=input_shape)
    x = layers.Dense(hidden, activation="relu")(inputs)
    x = layers.Dense(hidden, activation='relu')(x)
    outputs = layers.Dense(output_shape)(x)
    model = keras.Model(inputs=inputs, outputs=outputs, name="pid-modifier")
    return model

loss_object = tf.keras.losses.MeanSquaredError()

def loss(y_ref, y_plant):
    y_ = y_plant
    y = y_ref
    return loss_object(y_true=y, y_pred=y_)

if __name__ == "__main__":

    # Hyperparameters
    lr = 0.01
    num_epochs = 1
    hidden_size = 4
    net_input_size = 1
    net_output_size = 1
    batch_size = 1
    reference_fpath = "Run2_rThrottleTarget.csv"
    references = pd.read_csv(reference_fpath)
    data_generator = CustomDataGen(df=references, X_col='Throttle', batch_size=1)

    # Keep results for plotting
    train_loss_results = []

    # Initialize optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    # error initial condition
    err = 0

    # instantiate model
    model = get_model(input_shape=(2,), hidden=hidden_size, output_shape=net_output_size)

    for epoch in range(num_epochs):
        for ref in data_generator:
            with tf.GradientTape() as tape:
                tape.watch(model.trainable_variables)
                # Get pid values
                pid = get_pid_values()
                # Group ref with pid voltage for input
                input = tf.constant([[ref, pid]])
                # Get the adjusted voltage from the network
                intermediate_val = model(input)
                # Get the plant output based on the adjusted value.
                plant = get_plant(intermediate_val)
                plant = tf.constant([plant], dtype=tf.float64)
                ref = tf.constant([ref], dtype=tf.float64)
                # Calculate loss
                loss_value = loss(ref, plant)
            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            err = ref - plant
        if epoch % 50 == 0:
            print("Epoch {:03d}: Loss: {:.3f}".format(epoch, loss_value))

    fig, axes = plt.subplots(1, figsize=(12, 8))
    fig.suptitle('Training Metrics')
    axes.set_ylabel("Loss", fontsize=14)
    axes.plot(train_loss_results)
    plt.show()
For the moment I am just mocking the calls to Simulink by returning a random number. My problem is that when I take the model output, call the function that mocks a call to Simulink, and calculate my loss:
# Get the adjusted voltage from the network
intermediate_val = model(input)
# Get the plant output based on the adjusted value.
plant = get_plant(intermediate_val)
plant = tf.constant([plant], dtype=tf.float64)
ref = tf.constant([ref], dtype=tf.float64)
# Calculate loss
loss_value = loss(ref, plant)
I get the error ValueError: No gradients provided for any variable. I've figured out that if I pass the model's output directly to the loss function everything works fine. My question is how can I have the intermediate step of passing my model's output to another function and using the returned value to calculate loss?
A gradient exists between intermediate_val and model.trainable_variables, since it is calculated by backpropagation; the tape, however, cannot backpropagate through plant because it wasn't calculated by TensorFlow. To the tape it's just a constant and has no gradient.
Since the model knows nothing about the relation between the loss and how it is generated, this becomes a case of reinforcement learning, which can be done using the tensorflow-agents module.
There is a tutorial about it on YouTube, Everything You Need To Master Actor Critic Methods | Tensorflow 2 Tutorial; it's about a particular network architecture, but its gradient-calculation method is exactly the same as in your case, and the code is easily adaptable.
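As a toy illustration of that point (a hedged sketch, not from the original answer): once the network output leaves TensorFlow and is re-wrapped in a tf.constant, the tape has no path back to the weights, whereas the same computation expressed with TF ops keeps the gradient alive.

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(2,))])
x = tf.constant([[0.5, 0.2]])

with tf.GradientTape() as tape:
    y = model(x)
    # Leaving TensorFlow: the tape cannot see what happens inside this step.
    plant = tf.constant(y.numpy().item() * 2.0)
    loss = tf.reduce_mean(tf.square(plant - 1.0))
print(tape.gradient(loss, model.trainable_variables))  # [None, None]

with tf.GradientTape() as tape:
    y = model(x)
    # Same computation expressed with TF ops: gradients flow back to the weights.
    plant = y * 2.0
    loss = tf.reduce_mean(tf.square(plant - 1.0))
print(tape.gradient(loss, model.trainable_variables))  # actual gradients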

How to implement a multiple prediction custom loss function in TensorFlow?

I am trying to implement a Custom Loss function that uses multiple predictions/forward propagations of images for an image classification model.
The general concept of this loss function is to evaluate the model's consistency with non-augmented and augmented images. That is to say, the model is given 2 images; the original image and its augmented counterpart. Then, both images are forward propagated through the model. The more different the two outputs are from each other, the higher the loss.
What this meant was a fairly low-level change, and the most apparent way of solving it, to me, was model subclassing. I created a subclass of the keras.Model class and changed the train_step() method to include a small algorithm for locating the respective augmented counterpart of each original image (not relevant to the issue at all) and, more significantly, a line that gives a prediction on the augmented counterpart:
with tf.GradientTape() as tape:
    y_pred = self(x, training=True)
    y_aug = self(self.augmented_data[aug_index:aug_index+self.batch_size], training=True)
    loss = self.comparative_loss(y, y_pred, y_aug)
The whole self.augmented_data[aug_index:aug_index+self.batch_size] part isn't relevant at all; it can be thought of simply as the augmented data input. The intent was for the method comparative_loss to take the two predictions and then perform the aforementioned loss calculation on them.
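For reference, a hedged sketch of what such a consistency loss could look like written with vectorized TF ops instead of TensorArrays (assuming predictions of shape (batch_size, num_classes); this illustrates the idea described above, not the author's exact comparative_loss):

import tensorflow as tf

def consistency_loss_sketch(y_pred, y_aug):
    # Squared Euclidean distance between the two predictions, per sample...
    per_sample = tf.reduce_sum(tf.square(y_pred - y_aug), axis=-1)
    # ...averaged over the batch.
    return tf.reduce_mean(per_sample)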
The issue came when I tried to compile the model; there was a required loss parameter, but it refused to accept my custom loss method as it required 3 parameters. I couldn't go with the standard fix of putting the functions into a structure like this:
def new_loss(extra_parameter):
    def loss(y_true, y_pred):
        return loss_value
    return loss
since my "extra_parameter" was not just a standard output of the model; it was a completely separate forward propagation through it, one that relied on my custom train_step() method.
TL;DR:
What I'm most confused about is: why does model.compile() even require a loss function if my train_step method doesn't use it? The train_step method in my custom subclass has the loss built in, so is there a way to override compile()'s loss parameter and have it work without me having to give it a method? If not, what other solutions are there?
The full code is below, though I sincerely apologize to anyone that reads it, as it's not quite finished:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 18 11:37:08 2022
Custom Loss Function
Description:
For each element of y_true, compare the y_predict of
the original image and the complemented one, then return
a loss accordingly using the Euclidian distance
between the predictions for the original images and the complements.
y_predict are labels for the images, these labels can
come in any form: CIFAR labels, species labels, or labels of which
individual a given image is.
y_predict will be in the shape (batch_size, number_of_classes), using the
#author: hudso
"""
import tensorflow as tf
import keras
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, BatchNormalization
import ssl
import numpy as np
import cv2 as cv

class CustomModel(keras.Model):
    def __init__(self, classes):
        super().__init__()  # call parent constructor
        self.conv_1 = Conv2D(32, (3, 3), activation='relu', padding='same')
        self.batch_1 = BatchNormalization()
        self.conv_2 = Conv2D(32, (3, 3), activation='relu', padding='same')
        self.batch_2 = BatchNormalization()
        self.pool_1 = MaxPooling2D((2, 2))
        self.conv_3 = Conv2D(64, (3, 3), activation='relu', padding='same')
        self.batch_3 = BatchNormalization()
        self.conv_4 = Conv2D(64, (3, 3), activation='relu', padding='same')
        self.batch_4 = BatchNormalization()
        self.pool_2 = MaxPooling2D((2, 2))
        self.conv_5 = Conv2D(128, (3, 3), activation='relu', padding='same')
        self.batch_5 = BatchNormalization()
        self.conv_6 = Conv2D(128, (3, 3), activation='relu', padding='same')
        self.batch_6 = BatchNormalization()
        self.flatten = Flatten()
        self.layer_1 = keras.layers.Dropout(0.2)
        self.layer_2 = Dense(256, activation='relu')
        self.dropout = keras.layers.Dropout(0.2)
        self.outputs = Dense(classes, activation='softmax')  # no. of classes
        self.classes = classes  # Initializes the number of classes variable

    # essentially the Functional API forward-pass call-structure shenanigans
    # called each forward propagation (calculating loss, training, etc.)
    def call(self, inputs):
        #print("INPUTS: " + str(inputs))
        x = self.conv_1(inputs)
        x = self.batch_1(x)
        x = self.conv_2(x)
        x = self.batch_2(x)
        x = self.pool_1(x)
        x = self.conv_3(x)
        x = self.batch_3(x)
        x = self.conv_4(x)
        x = self.batch_4(x)
        x = self.pool_2(x)
        x = self.conv_5(x)
        x = self.batch_5(x)
        x = self.conv_6(x)
        x = self.batch_6(x)
        x = self.flatten(x)
        x = self.layer_1(x)
        x = self.layer_2(x)
        x = self.dropout(x)
        x = self.outputs(x)
        return x  # returns the model output

    # Imports necessary data (It's hard to gain access to the values handed to .fit())
    def data_import(self, augmented_data, x_all, batch_size):
        self.augmented_data = augmented_data
        self.x_all = np.asarray(x_all, dtype=np.float32)
        self.batch_size = batch_size

    # Very useful advice: https://stackoverflow.com/questions/65889381/going-from-a-tensorarray-to-a-tensor
    def comparative_loss(self, y_true, y_pred, y_aug):
        output_loss = tf.TensorArray(tf.float32, size=self.classes)
        batch_loss = tf.TensorArray(tf.float32, size=self.batch_size)
        for n in range(self.batch_size):
            for i in range(self.classes):
                # finds the Euclidean distance for each prediction, then averages the loss across all iterations in the batch
                output_loss = output_loss.write(i, tf.square(tf.abs(tf.subtract(y_pred[n][i], y_aug[n][i]))))
            indexes = tf.keras.backend.arange(0, self.classes, step=1, dtype='int32')
            output_loss_tensor = output_loss.gather(indexes)
            batch_loss = batch_loss.write(n, tf.math.reduce_sum(output_loss_tensor))
        indexes = tf.keras.backend.arange(0, self.batch_size, step=1, dtype='int32')
        batch_loss_tensor = batch_loss.gather(indexes)
        total_loss = tf.math.reduce_sum(batch_loss_tensor)
        total_loss = tf.math.divide(total_loss, self.batch_size)
        print("TOTAL LOSS: " + str(total_loss))
        return total_loss

    def train_step(self, data):
        x, y = data  # Current batch
        # Finds the range of indexes for the complements of the current batch of images
        # A lower-level implementation could make this significantly more efficient by avoiding searching each time
        aug_index = 0
        found = False
        x_arr = x.numpy()  # Turns the input data iterable Tensor into a numpy array, Eager Execution must be enabled for this to work
        for i in range(np.size(self.x_all, axis=0)):
            difference = cv.subtract(self.x_all[i], x_arr[0])
            if np.count_nonzero(difference) == 0:  # In the .fit() line for this CustomModel, shuffle = False for this to work
                aug_index = i  # Lower bound of the batch of images
                found = True
        if found == False:
            print("Yikes mate the x_arr wasn't found in x_all... probably a rounding error")
        print("\nCurrent Index: " + str(aug_index))

        # Forward pass/predictions + loss calculation
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            y_aug = self(self.augmented_data[aug_index:aug_index+self.batch_size], training=True)
            loss = self.comparative_loss(y, y_pred, y_aug)  # Computes the actual loss value

        # I didn't touch any of this code
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.compiled_metrics.update_state(y, y_pred)
        return {m.name: m.result() for m in self.metrics}

# Essentially emulates the environment that the model would normally be running in
# E.g. creates the dataset, does image augmentation, etc.
# In the actual implementation, only the "CustomModel" class will be used; this is purely for testing purposes
class shrek_is_love:
    def __init__(self):
        self.complements = []
        self.create_dataset()

    # automatically runs
    def create_dataset(self):
        ssl._create_default_https_context = ssl._create_unverified_context
        (images, labels), (_, _) = keras.datasets.cifar10.load_data()  # only uses the training sets and then splits it again later since that'll be what we'll be dealing with in the happywhale dataset anyways
        self.labels = labels
        self.images = images
        self.data_aug()

    # NOT MY CODE this is liam's image data generator (thx liam ur cool)
    # automatically runs
    def data_aug(self):
        imageGen = keras.preprocessing.image.ImageDataGenerator(width_shift_range=.3, height_shift_range=.3, horizontal_flip=True, zoom_range=.3)
        imagees = np.zeros(shape=(1, 32, 32, 3))
        for l in range(np.size(self.images, 0)):
            # adjust the tuple inside of cv.resize to adjust resolution
            temp = cv.resize(self.images[l], (32, 32))
            imagees[0] = (cv.cvtColor(temp, cv.COLOR_BGR2RGB))
            it = imageGen.flow(imagees)
            im = it.next()
            im = im[0].astype('float32')
            im = im / 255.0
            self.complements.append(im)
        self.complements = np.asarray(self.complements, dtype=np.float32)
        self.images = self.images.astype(np.float32)
        self.images = self.images / 255.0
        self.preprocessor()

    def preprocessor(self):
        from sklearn.preprocessing import OneHotEncoder
        onehot_encoder = OneHotEncoder(sparse=False)
        self.labels = onehot_encoder.fit_transform(np.reshape(self.labels, (-1, 1)))
        from sklearn.model_selection import train_test_split
        shared_seed = 5  # the indexes of complements_train and image_train have to line up, so that labels_train can apply to both
        self.complements_train, self.complements_test = train_test_split(self.complements, test_size=0.25, random_state=shared_seed)
        self.images_train, self.images_test, self.labels_train, self.labels_test = train_test_split(self.images, self.labels, test_size=0.25, random_state=shared_seed)

# The following code will be all that is necessary to run the CustomModel class
batch_size = 32
shrek_is_life = shrek_is_love()
model = CustomModel(10)  # 10 classes
model.data_import(shrek_is_life.complements_train, shrek_is_life.images_train, batch_size)  # the model will not be training on aug_data, essentially turning it into a secondary test set
model.compile(optimizer='adam', loss=None, metrics=['accuracy'], run_eagerly=True)  # loss=None brings up an error, but I have no idea what else to put in there
model.fit(x=shrek_is_life.images_train, y=shrek_is_life.labels_train, shuffle=False, batch_size=batch_size, epochs=1)
EDIT:
Running it without a .compile line yields this error:
Traceback (most recent call last):
  File "D:\Downloads\untitled0.py", line 191, in <module>
    model.fit(x = shrek_is_life.images_train, y = shrek_is_life.labels_train, shuffle = False, batch_size = batch_size, epochs = 1)
  File "C:\Users\hudso\anaconda3\envs\mlTens\lib\site-packages\keras\engine\training.py", line 1150, in fit
    x, y, sample_weights = self._standardize_user_data(
  File "C:\Users\hudso\anaconda3\envs\mlTens\lib\site-packages\keras\engine\training.py", line 508, in _standardize_user_data
    raise RuntimeError('You must compile a model before '
RuntimeError: You must compile a model before training/testing. Use `model.compile(optimizer, loss)`.
Running .compile without the loss argument or with loss=None yields:
  File "C:\Users\hudso\anaconda3\envs\mlTens\lib\site-packages\keras\engine\training.py", line 706, in _prepare_total_loss
    raise ValueError('The model cannot be compiled '
ValueError: The model cannot be compiled because it has no loss to optimize.

Model behaves differently after saving and loading

I want to use torch.save() to save a trained model for inference. However, with either load_state_dict() or torch.load(), I can't recover the saved model: the loss computed by the loaded model is different from the loss computed by the saved model.
The relevant Libraries:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
The model:
class nn_block(nn.Module):
    def __init__(self, feats_dim):
        super(nn_block, self).__init__()
        self.linear = nn.Linear(feats_dim, feats_dim)
        self.bn = nn.BatchNorm1d(feats_dim)
        self.softplus1 = nn.Softplus()
        self.softplus2 = nn.Softplus()

    def forward(self, rep_mat):
        transformed_mat = self.linear(rep_mat)
        transformed_mat = self.bn(transformed_mat)
        transformed_mat = self.softplus1(transformed_mat)
        transformed_mat = self.softplus2(transformed_mat + rep_mat)
        return transformed_mat

class test_nn(nn.Module):
    def __init__(self, in_feats, feats_dim, num_conv, num_classes):
        super(test_nn, self).__init__()
        self.linear1 = nn.Linear(in_feats, feats_dim)
        self.convs = [nn_block(feats_dim) for _ in range(num_conv)]
        self.linear2 = nn.Linear(feats_dim, num_classes)
        self.softmax = nn.Softmax()

    def forward(self, rep_mat):
        h = self.linear1(rep_mat)
        for conv_func in self.convs:
            h = conv_func(h)
        h = self.linear2(h)
        h = self.softmax(h)
        return h
Train, save, and reload a model:
# fake a classification task
num_classes = 2; input_dim = 8
one = np.random.multivariate_normal(np.zeros(input_dim), np.eye(input_dim), 20)
two = np.random.multivariate_normal(np.ones(input_dim), np.eye(input_dim), 20)
inputs = np.concatenate([one, two], axis=0)
labels = np.concatenate([np.zeros(20), np.ones(20)])
inputs = Variable(torch.Tensor(inputs))
labels = torch.LongTensor(labels)

# build a model
net = test_nn(input_dim, 5, 2, num_classes)
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)

net.train()
losses = []
best_score = 1e10
for epoch in range(25):
    preds = net(inputs)
    loss = F.cross_entropy(preds, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    state_dict = {'state_dict': net.state_dict()}
    if loss.item() - best_score < -1e-4:
        # save only parameters
        torch.save(state_dict, 'model_params.torch')
        # save the whole model
        torch.save(net, 'whole_model.torch')
    best_score = np.min([best_score, loss.item()])
    losses.append(loss.item())

net_params = test_nn(input_dim, 5, 2, num_classes)
net_params.load_state_dict(torch.load('model_params.torch')['state_dict'])
net_params.eval()
preds_params = net_params(inputs)
loss_params = F.cross_entropy(preds_params, labels)
print('reloaded params %.4f %.4f' % (loss_params.item(), np.min(losses)))

net_whole = torch.load('whole_model.torch')
net_whole.eval()
preds_whole = net_whole(inputs)
loss_whole = F.cross_entropy(preds_whole, labels)
print('reloaded whole %.4f %.4f' % (loss_whole.item(), np.min(losses)))
As you can see by running the code, the losses computed by the two loaded models are different, even though the two loaded models should be exactly the same. Not only are the two losses different, they also differ from the loss computed by the best model that was saved in the first place.
Why this can happen?
The state dict contains every parameter (nn.Parameter) and buffer (similar to a parameter, but which should not be trained/optimised) that has been registered on the module and all of its submodules. Everything else will not be included in that state dict.
Your test_nn module uses a list for convs, therefore it is not included in the state dict:
self.convs = [nn_block(feats_dim) for _ in range(num_conv)]
Not only are they not contained in the state dict, they are also not visible to net.parameters(), which means they are not trained/optimised at all.
To register the modules from the list you can wrap it in nn.ModuleList, which is a module that acts like a list, while correctly registering the modules it contains:
self.convs = nn.ModuleList([nn_block(feats_dim) for _ in range(num_conv)])
With that change both models produce the same result.
Since you are calling the convs modules sequentially in the for-loop (the output of one module is the input of the next), you may consider using nn.Sequential, which you can call directly instead of having to use the for-loop. Sequential containers are used a lot, and they just make things a little simpler; for example, if you want to replace the sequence of modules with a single module, you don't need to change anything in the forward method.
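A minimal sketch of what that could look like in test_nn, reusing the nn_block from the question:

# In __init__: register and chain the blocks in one container.
self.convs = nn.Sequential(*[nn_block(feats_dim) for _ in range(num_conv)])

# In forward: a single call replaces the for-loop.
h = self.convs(h)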
Not just the two losses are different, they are also different from the loss computed by the best model that was saved in the first place.
When you are training, you calculate the loss for the current input (batch) and then you optimise the parameters based on that input. This means your parameters differ from the ones used to calculate the loss. Because you are saving the model after that, it will also have a different loss (the one that would occur in the next iteration).
preds = net(inputs)
# Calculating the loss of the current model
loss = F.cross_entropy(preds, labels)
optimizer.zero_grad()
loss.backward()
# Updating the model's parameters based on the loss
optimizer.step()

# State of the model after it has been updated
state_dict = {'state_dict': net.state_dict()}

# Comparing the loss from BEFORE the update
# But saving the model from AFTER the update
if loss.item() - best_score < -1e-4:
    # save only parameters
    torch.save(state_dict, 'model_params.torch')
    # save the whole model
    torch.save(net, 'whole_model.torch')
It's important to evaluate the model after the updates have been made. For this reason a validation set should be used, which is run after each epoch to assess the model's accuracy.
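A hedged sketch of that idea applied to the training loop from the question: re-evaluate the loss after optimizer.step() and base the save decision on that post-update value (names follow the question's code; in a real setup this evaluation would use a separate validation set):

optimizer.step()

# Re-compute the loss with the UPDATED parameters before deciding to save.
net.eval()
with torch.no_grad():
    preds_after = net(inputs)
    loss_after = F.cross_entropy(preds_after, labels)
net.train()

if loss_after.item() - best_score < -1e-4:
    torch.save({'state_dict': net.state_dict()}, 'model_params.torch')
    torch.save(net, 'whole_model.torch')
    best_score = loss_after.item()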

How to get rid of Variable API in PyTorch.autograd?

I am forwarding and backpropagating tensor data X through two simple nn.Module PyTorch model instances, model1 and model2.
I can't get this process to work without using the deprecated Variable API.
So this works just fine:
y1 = model1(X)
v = Variable(y1.data, requires_grad=training) # Its all about this line!
y2 = model2(v)
criterion = nn.NLLLoss()
loss = criterion(y2, y)
loss.backward()
y1.backward(v.grad)
self.step()
But this will throw an error:
y1 = model1(X)
y2 = model2(y1)
criterion = nn.NLLLoss()
loss = criterion(y2, y)
loss.backward()
y1.backward(y1.grad) # it breaks here
self.step()
>>> RuntimeError: grad can be implicitly created only for scalar outputs
I just can't seem to find a relevant difference between v in the first implementation and y1 in the second. In both cases requires_grad is set to True. The only thing I could find was that y1.grad_fn=<ThnnConv2DBackward> and v.grad_fn=<ThnnConv2DBackward>.
What am I missing here? What (tensor attributes?) do I not know about, and if Variable is deprecated, what other implementation would work?
[UPDATED]
You are not correctly passing y1.grad into y1.backward in the second example. After the first backward pass all the intermediate gradients are destroyed; you need a special hook to extract those gradients, and in your case you are passing the value None. Here is a small example to reproduce your case:
Code:
import torch
import torch.nn as nn
torch.manual_seed(42)

class Model1(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.pow(3)

class Model2(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x / 2

model1 = Model1()
model2 = Model2()
criterion = nn.MSELoss()

X = torch.randn(1, 5, requires_grad=True)
y = torch.randn(1, 5)

y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)

# We are going to backprop 2 times, so we need
# retain_graph=True during the first backward
loss.backward(retain_graph=True)

try:
    y1.backward(y1.grad)
except RuntimeError as err:
    print(err)
print('y1.grad: ', y1.grad)
Output:
grad can be implicitly created only for scalar outputs
y1.grad: None
So you need to extract them correctly:
Code:
def extract(V):
    """Gradient extractor.
    """
    def hook(grad):
        V.grad = grad
    return hook

model1 = Model1()
model2 = Model2()
criterion = nn.MSELoss()

X = torch.randn(1, 5, requires_grad=True)
y = torch.randn(1, 5)

y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)

y1.register_hook(extract(y1))
loss.backward(retain_graph=True)
print('y1.grad', y1.grad)
y1.backward(y1.grad)
Output:
y1.grad: tensor([[-0.1763, -0.2114, -0.0266, -0.3293, 0.0534]])
After some investigation I came to the following two solutions.
The solution provided elsewhere in this thread retained the computation graph manually, without an option to free it, so it ran fine initially but caused OOM errors later on.
The first solution is to tie the models together using the built in torch.nn.Sequential as such:
model = torch.nn.Sequential(Model1(), Model2())
it's as easy as that. It looks clean and behaves exactly like an ordinary model would.
The alternative is to simply tie them together manually:
model1 = Model1()
model2 = Model2()
y1 = model1(X)
y2 = model2(y1)
loss = criterion(y2, y)
loss.backward()
My fear that this would only backpropagate through model2 turned out to be unfounded, since model1 is also stored in the computation graph that is backpropagated over.
Compared to the previous implementation, this one provides increased transparency at the interface between the two models.

Pytorch parameters won't update with custom loss function (Pytorch)

I am trying to use the optimizer to tune a set of parameters for a cost function that includes, among other things, a forward pass across a neural network. The parameters specify the means and variances of the weights of this neural network. However, when updating the parameters at every iteration of the optimization process, all the terms of the cost function except the one belonging to the forward pass contribute to the parameter update. That is, if all other terms are commented out, no parameters will update. Are there any ways of fixing this issue?
EDIT: I added a contrived example below.
import torch

class TestNN(torch.nn.Module):
    def __init__(self):
        super(TestNN, self).__init__()
        self.fc1 = torch.nn.Linear(10, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x

    def getParameters(self):
        return [self.fc1.weight.transpose(0, 1), self.fc1.bias]

    def setParameters(self, parameters):
        # Can anything be done here to keep parameters in the graph?
        weight, bias = parameters
        self.fc1.weight = torch.nn.Parameter(weight.transpose(0, 1))
        self.fc1.bias = torch.nn.Parameter(bias)

def computeCost(parameters, input):
    testNN = TestNN()
    testNN.setParameters(parameters)
    cost = testNN(input) ** 2
    print(cost)  # Cost stays the same :(
    return cost

def minimizeLoss(maxIter, optimizer, lossFunc, lossFuncArgs):
    for i in range(maxIter):
        optimizer.zero_grad()
        loss = lossFunc(*lossFuncArgs)
        loss.backward(retain_graph=True)
        optimizer.step()
        if i % 100 == 0:
            print(loss)

input = torch.randn(1, 10)
weight = torch.ones(10, 1)
bias = torch.ones(1, 1)
parameters = (weight, bias)
lossArgs = (parameters, input)

optimizer = torch.optim.Adam(parameters, lr=0.01)
minimizeLoss(10, optimizer, computeCost, lossArgs)
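As a hedged sketch of one possible direction (my assumption, not from the original post): wrapping the externally optimized tensors in nn.Parameter inside setParameters creates brand-new leaf tensors, so the computation graph never reaches the tensors held by the optimizer. Computing the forward pass functionally with those same tensors keeps them in the graph, and they then receive gradients and update:

import torch
import torch.nn.functional as F

def computeCostFunctional(parameters, input):
    # 'weight' is (in_features, out_features) as in the question, so transpose for F.linear.
    weight, bias = parameters
    out = F.linear(input, weight.transpose(0, 1), bias.view(-1))
    return (out ** 2).sum()

input = torch.randn(1, 10)
weight = torch.ones(10, 1, requires_grad=True)
bias = torch.ones(1, 1, requires_grad=True)
optimizer = torch.optim.Adam([weight, bias], lr=0.01)

for i in range(10):
    optimizer.zero_grad()
    loss = computeCostFunctional((weight, bias), input)
    loss.backward()
    optimizer.step()  # weight and bias now actually change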
