Moving member tensors with module.to() in PyTorch

I am building a Variational Autoencoder (VAE) in PyTorch and have a problem writing device-agnostic code. The autoencoder is a child of nn.Module, with encoder and decoder networks that are, too. All weights of the network can be moved from one device to another by calling net.to(device).
The problem I have is with the reparametrization trick:
encoding = mu + noise * sigma
The noise is a tensor of the same size as mu and sigma and is stored as a member variable of the autoencoder module. It is initialized in the constructor and resampled in-place at each training step. I do it that way to avoid constructing a new noise tensor each step and pushing it to the desired device. Additionally, I want to keep the noise fixed during evaluation. Here is the code:
class VariationalGenerator(nn.Module):
    def __init__(self, input_nc, output_nc):
        super(VariationalGenerator, self).__init__()
        self.input_nc = input_nc
        self.output_nc = output_nc
        embedding_size = 128
        self._train_noise = torch.randn(batch_size, embedding_size)
        self._eval_noise = torch.randn(1, embedding_size)
        self.noise = self._train_noise
        # Create encoder
        self.encoder = Encoder(input_nc, embedding_size)
        # Create decoder
        self.decoder = Decoder(output_nc, embedding_size)

    def train(self, mode=True):
        super(VariationalGenerator, self).train(mode)
        self.noise = self._train_noise

    def eval(self):
        super(VariationalGenerator, self).eval()
        self.noise = self._eval_noise

    def forward(self, inputs):
        # Calculate parameters of embedding space
        mu, log_sigma = self.encoder.forward(inputs)
        # Resample noise if training
        if self.training:
            self.noise.normal_()
        # Reparametrize noise to embedding space
        inputs = mu + self.noise * torch.exp(0.5 * log_sigma)
        # Decode to image
        inputs = self.decoder(inputs)
        return inputs, mu, log_sigma
When I now move the autoencoder to the GPU with net.to('cuda:0'), I get an error in the forward pass because the noise tensor is not moved.
I don't want to add a device parameter to the constructor, because then it is still not possible to move the model to another device later. I also tried to wrap the noise in nn.Parameter so that it is affected by net.to(), but that causes an error from the optimizer, since the noise is flagged with requires_grad=False.
Does anyone have a solution to move all of the module's tensors with net.to()?

A better version of tilman151's second approach is probably to override _apply rather than to. That way net.cuda(), net.float(), etc. will all work as well, since those all call _apply rather than to (as can be seen in the source, which is simpler than you might think):
def _apply(self, fn):
    super(VariationalGenerator, self)._apply(fn)
    self._train_noise = fn(self._train_noise)
    self._eval_noise = fn(self._eval_noise)
    return self
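With that override in place, the usual conversion calls should carry the noise tensors along. A quick hedged check, assuming the VariationalGenerator from the question plus the _apply above (and an available GPU for the cuda() call):
net = VariationalGenerator(3, 3)
net = net.cuda()   # _apply is called internally, so both noise tensors move too
net = net.float()  # dtype conversions take the same path
print(net._train_noise.device, net._train_noise.dtype)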

After some more trial and error I found two methods:
Use buffers: by replacing self._train_noise = torch.randn(batch_size, embedding_size) with self.register_buffer('_train_noise', torch.randn(batch_size, embedding_size)), the noise tensor is added to the module as a buffer. This lets net.to(device) affect it, too. Additionally, the tensor is now part of the state_dict.
Override net.to(device): with this, the noise stays out of the state_dict.
def to(self, device):
    new_self = super(VariationalGenerator, self).to(device)
    new_self._train_noise = new_self._train_noise.to(device)
    new_self._eval_noise = new_self._eval_noise.to(device)
    return new_self

With the following variant you can forward the same arguments to both the module and your tensors:
def to(self, *args, **kwargs):
    module = super(VariationalGenerator, self).to(*args, **kwargs)
    module._train_noise = self._train_noise.to(*args, **kwargs)
    module._eval_noise = self._eval_noise.to(*args, **kwargs)
    return module
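A hypothetical usage sketch for this variant, assuming the VariationalGenerator from the question:
net = VariationalGenerator(3, 3)
net = net.to('cuda:0')              # the device move reaches parameters and both noise tensors
net = net.to(dtype=torch.float64)   # keyword arguments are forwarded the same way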

Use this:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Then, for both the model and every tensor you use:
net.to(device)
input = input.to(device)

You can use nn.Module buffers and parameters - both are considered when calling .to(device) and moved to the device.
Parameters are updated by the optimizer (so they need requires_grad=True); buffers are not.
So in your case, I'd write the constructor as:
def __init__(self, input_nc, output_nc):
    super(VariationalGenerator, self).__init__()
    self.input_nc = input_nc
    self.output_nc = output_nc
    embedding_size = 128
    # --- CHANGED LINES ---
    self.register_buffer('_train_noise', torch.randn(batch_size, embedding_size))
    self.register_buffer('_eval_noise', torch.randn(1, embedding_size))
    # --- CHANGED LINES ---
    self.noise = self._train_noise
    # Create encoder
    self.encoder = Encoder(input_nc, embedding_size)
    # Create decoder
    self.decoder = Decoder(output_nc, embedding_size)
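A quick hedged sanity check, assuming the rest of the class stays unchanged (and a GPU is available):
net = VariationalGenerator(3, 3)
print('_train_noise' in net.state_dict())  # True: buffers are part of the state_dict
net.to('cuda:0')
print(net._train_noise.device)             # cuda:0 -- buffers follow net.to()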

Related

Does adding a forward hook to a layer of a model ensure that the gradient of a loss calculated from that layer's output will be computed automatically?

I have a model
class NewModel(nn.Module):
    def __init__(self, output_layer, *args):
        super().__init__(*args)
        self.output_layer = output_layer
        self.selected_out = None
        # PRETRAINED MODEL
        self.pretrained = models.resnet18(pretrained=True)
        # TAKING OUTPUT FROM AN INTERMEDIATE LAYER
        # self._layers = []
        for l in list(self.pretrained._modules.keys()):
            # self._layers.append(l)
            if l == self.output_layer:
                handle = getattr(self.pretrained, l).register_forward_hook(self.hook)

    def hook(self, module, input, output):
        self.selected_out = output

    def forward(self, x):
        return self.pretrained(x)
I have two targets: one is the usual image label, and the other has the same dimensions as the output obtained from self.output_layer, called target_feature.
out = model(img)
layerout = model.selected_out
Now, if I want to calculate the loss of layerout with the target feature map, can it be done like the line written below?
loss = criterion(y_true, out) + feature_criterion(layerout, target_feature)
Or do I need to add backward_hooks?
In this Kaggle notebook
https://www.kaggle.com/sironghuang/understanding-pytorch-hooks
it is written that loss.backward() cannot be used when using backward_hooks.
Quoting the author
# backprop once to get the backward hook results
out.backward(torch.tensor([1,1],dtype=torch.float),retain_graph=True)
#! loss.backward(retain_graph=True) # doesn't work with backward hooks,
#! since it's not a network layer but an aggregated result from the outputs of last layer vs target
Then how can the gradient be calculated based on the loss function?
If I understand you correctly, you want to get two outputs from your model, calculate two losses, then combine them and backpropagate. I imagine you come from TensorFlow & Keras, judging by the way you tried implementing it. In PyTorch it's actually fairly straightforward; you can do this very easily thanks to its purely functional nature.
This is just an example:
class NewModel(nn.Module):
    def __init__(self, output_layer, *args):
        super(NewModel, self).__init__()
        self.pretrained = models.resnet18(pretrained=True)
        self.output_layer = output_layer

    def forward(self, x):
        out = self.pretrained(x)
        features = self.output_layer(out)
        return out, features
On inference, you will get two results per call:
>>> m = NewModel(nn.Linear(1000, 10))
>>> x = torch.rand(16, 3, 224, 224)
>>> y_pred, y_feature = m(x)
Call your loss functions:
>>> loss = criterion(y_pred, y_true) + feature_criterion(y_feature, target_feature)
Then, backpropagate with loss.backward().
So no need for hooks, nor complicated gradient on your .backward call!
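A minimal hedged training step, assuming criterion, feature_criterion, an optimizer, and the targets y_true / target_feature are already defined:
optimizer.zero_grad()
y_pred, y_feature = m(x)
loss = criterion(y_pred, y_true) + feature_criterion(y_feature, target_feature)
loss.backward()   # gradients flow through both loss terms automatically
optimizer.step()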
Edit - if you wish to extract an intermediate layer's output, keep the hook; that's fine. Just modify the forward definition:
def forward(self, x):
    out = self.pretrained(x)
    return out, self.selected_out
For example:
>>> m = NewModel(output_layer='layer1')
>>> x = torch.rand(16, 3, 224, 224)
>>> y_pred, y_feature = m(x)
>>> y_pred.shape, y_feature.shape
(torch.Size([16, 1000]), torch.Size([16, 64, 56, 56]))
Also, what I said above about the loss still stands. Compute your loss, then call loss.backward().
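For completeness, a self-contained hedged sketch of the hook-based variant; the class name FeatureTapModel is made up, and the layer name and shapes follow the resnet18 example above:
import torch
import torch.nn as nn
from torchvision import models

class FeatureTapModel(nn.Module):
    def __init__(self, output_layer='layer1'):
        super().__init__()
        self.selected_out = None
        self.pretrained = models.resnet18(pretrained=True)
        # Register the forward hook on the named intermediate layer.
        getattr(self.pretrained, output_layer).register_forward_hook(self.hook)

    def hook(self, module, input, output):
        self.selected_out = output

    def forward(self, x):
        out = self.pretrained(x)
        return out, self.selected_out

m = FeatureTapModel('layer1')
y_pred, y_feature = m(torch.rand(16, 3, 224, 224))
print(y_pred.shape, y_feature.shape)  # torch.Size([16, 1000]) torch.Size([16, 64, 56, 56])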

How to create a custom PreprocessingLayer in TF 2.2

I would like to create a custom preprocessing layer using the tf.keras.layers.experimental.preprocessing.PreprocessingLayer layer.
In this custom layer, placed after the input layer, I would like to normalize my image using tf.cast(img, tf.float32) / 255.
I tried to find some code or example showing how to create this preprocessing layer, but I couldn't find any.
Please, can someone provide a full example of creating and using the PreprocessingLayer layer?
If you want a custom preprocessing layer, you actually don't need to use PreprocessingLayer. You can simply subclass Layer.
Take the simplest preprocessing layer, Rescaling, as an example: it lives under the tf.keras.layers.experimental.preprocessing.Rescaling namespace. However, if you check the actual implementation, it is just a subclass of Layer (see the source code link here), decorated with @keras_export('keras.layers.experimental.preprocessing.Rescaling'):
@keras_export('keras.layers.experimental.preprocessing.Rescaling')
class Rescaling(Layer):
    """Multiply inputs by `scale` and adds `offset`.

    For instance:
    1. To rescale an input in the `[0, 255]` range
       to be in the `[0, 1]` range, you would pass `scale=1./255`.
    2. To rescale an input in the `[0, 255]` range to be in the `[-1, 1]` range,
       you would pass `scale=1./127.5, offset=-1`.

    The rescaling is applied both during training and inference.

    Input shape:
      Arbitrary.
    Output shape:
      Same as input.

    Arguments:
      scale: Float, the scale to apply to the inputs.
      offset: Float, the offset to apply to the inputs.
      name: A string, the name of the layer.
    """

    def __init__(self, scale, offset=0., name=None, **kwargs):
        self.scale = scale
        self.offset = offset
        super(Rescaling, self).__init__(name=name, **kwargs)

    def call(self, inputs):
        dtype = self._compute_dtype
        scale = math_ops.cast(self.scale, dtype)
        offset = math_ops.cast(self.offset, dtype)
        return math_ops.cast(inputs, dtype) * scale + offset

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = {
            'scale': self.scale,
            'offset': self.offset,
        }
        base_config = super(Rescaling, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
So the Rescaling preprocessing layer is just another normal layer.
The main part is the def call(self, inputs) method. You can implement whatever preprocessing logic you need for your inputs and then return the result.
Easier documentation about custom layers can be found here.
In a nutshell, you can do preprocessing in a layer, either with Lambda for simple operations or by subclassing Layer to achieve your goal (see the sketch below).
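A minimal hedged sketch of such a subclassed layer for the normalization asked about in the question (the class name ImageNormalization is made up):
import tensorflow as tf

class ImageNormalization(tf.keras.layers.Layer):
    # Casts images to float32 and scales them to [0, 1].
    def call(self, inputs):
        return tf.cast(inputs, tf.float32) / 255.

# Placed right after the input layer, as in the question.
inputs = tf.keras.Input((32, 32, 3))
x = ImageNormalization()(inputs)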
I think the best and cleanest solution is to use a simple Lambda layer to wrap your preprocessing function.
This is a dummy working example:
import numpy as np
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
X = np.random.randint(0,256, (200,32,32,3))
y = np.random.randint(0,3, 200)
inp = Input((32,32,3))
x = Lambda(lambda x: x/255)(inp)
x = Conv2D(8, 3, activation='relu')(x)
x = Flatten()(x)
out = Dense(3, activation='softmax')(x)
m = Model(inp, out)
m.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history = m.fit(X, y, epochs=10)

Short circuit computation in mixture of experts model using tensorflow keras functional api

I am trying to swap between multiple different "expert" layers based on the output of a "gating" layer (as a mixture of experts).
I created a custom layer that takes in the outputs of the expert and gating layers, but this ends up throwing away some outputs rather than not computing them in the first place.
How can I make the model "short circuit" to only evaluate the gating layer and the selected expert layer(s) to save computation time?
I am using TensorFlow 2.0 GPU and the Keras functional API.
Keras models can be implemented fully dynamically, to support the efficient routing that you mentioned. The following example shows one way in which this can be done. The example is written with the following premises:
It assumes there are two experts (LayerA and LayerB)
It assumes that a mix-of-experts model (MixOfExpertsModel) switches dynamically between the two expert layer classes depending on the per-example output of a Keras Dense layer
It satisfies the need to run training on the model in a batch fashion.
Pay attention to the comments in the code to see how the switching is done.
import numpy as np
import tensorflow as tf

# This is your Expert A class.
class LayerA(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.weight = self.add_weight("weight_a", shape=input_shape[1:])

    @tf.function
    def call(self, x):
        return x + self.weight

# This is your Expert B class.
class LayerB(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.weight = self.add_weight("weight_b", shape=input_shape[1:])

    @tf.function
    def call(self, x):
        return x * self.weight

class MixOfExpertsModel(tf.keras.models.Model):
    def __init__(self):
        super(MixOfExpertsModel, self).__init__()
        self._expert_a = LayerA()
        self._expert_b = LayerB()
        self._gating_layer = tf.keras.layers.Dense(1, activation="sigmoid")

    @tf.function
    def call(self, x):
        z = self._gating_layer(x)
        # The switching logic:
        # - examples with gating output <= 0.5 are routed to expert A
        # - examples with gating output > 0.5 are routed to expert B.
        mask_a = tf.squeeze(tf.less_equal(z, 0.5), axis=-1)
        mask_b = tf.squeeze(tf.greater(z, 0.5), axis=-1)
        # `input_a` is a subset of slices of the original input (`x`).
        # So is `input_b`. As such, no compute is wasted.
        input_a = tf.boolean_mask(x, mask_a, axis=0)
        input_b = tf.boolean_mask(x, mask_b, axis=0)
        if tf.size(input_a) > 0:
            output_a = self._expert_a(input_a)
        else:
            output_a = tf.zeros_like(input_a)
        if tf.size(input_b) > 0:
            output_b = self._expert_b(input_b)
        else:
            output_b = tf.zeros_like(input_b)
        # Return `mask_a` and `mask_b`, so that the caller can know
        # which example is routed to which expert and whether its output
        # appears in `output_a` or `output_b`. This is necessary
        # for writing a (custom) loss function for this class.
        return output_a, output_b, mask_a, mask_b

# Create an instance of the mix-of-experts model.
mix_of_experts_model = MixOfExpertsModel()

# Generate some dummy data.
num_examples = 32
xs = np.random.random([num_examples, 8]).astype(np.float32)

# Call the model.
print(mix_of_experts_model(xs))
I didn't write a custom loss function that would support the training of this class, but that's doable by using the return values of MixOfExpertsModel.call(), namely the outputs and masks; a hedged sketch of one possibility follows below.
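A hedged sketch of such a loss plus one manual training step, assuming the model and dummy data from the snippet above; mean squared error is just a stand-in target criterion:
def mix_of_experts_loss(y_true, output_a, output_b, mask_a, mask_b):
    # Split the targets the same way the model split the inputs, so each
    # expert's output is compared only against the examples routed to it.
    y_a = tf.boolean_mask(y_true, mask_a, axis=0)
    y_b = tf.boolean_mask(y_true, mask_b, axis=0)
    per_expert = (tf.reduce_sum(tf.math.squared_difference(output_a, y_a)) +
                  tf.reduce_sum(tf.math.squared_difference(output_b, y_b)))
    return per_expert / tf.cast(tf.shape(y_true)[0], tf.float32)

ys = np.random.random([num_examples, 8]).astype(np.float32)
optimizer = tf.keras.optimizers.Adam()
with tf.GradientTape() as tape:
    out_a, out_b, m_a, m_b = mix_of_experts_model(xs)
    loss = mix_of_experts_loss(ys, out_a, out_b, m_a, m_b)
grads = tape.gradient(loss, mix_of_experts_model.trainable_variables)
optimizer.apply_gradients(zip(grads, mix_of_experts_model.trainable_variables))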

save and load custom attention model lstm in keras

I want to run a seq2seq model using LSTM for a customer journey analysis. I am able to run the model but am unable to load the saved model in a different notebook.
The code for the attention model is here:
# RNN "Cell" classes in Keras perform the actual data transformations at each timestep. Therefore, in order to add attention to LSTM, we need to make a custom subclass of LSTMCell.
class AttentionLSTMCell(LSTMCell):
def __init__(self, **kwargs):
self.attentionMode = False
super(AttentionLSTMCell, self).__init__(**kwargs)
# Build is called to initialize the variables that our cell will use. We will let other Keras
# classes (e.g. "Dense") actually initialize these variables.
#tf_utils.shape_type_conversion
def build(self, input_shape):
# Converts the input sequence into a sequence which can be matched up to the internal
# hidden state.
self.dense_constant = TimeDistributed(Dense(self.units, name="AttLstmInternal_DenseConstant"))
# Transforms the internal hidden state into something that can be used by the attention
# mechanism.
self.dense_state = Dense(self.units, name="AttLstmInternal_DenseState")
# Transforms the combined hidden state and converted input sequence into a vector of
# probabilities for attention.
self.dense_transform = Dense(1, name="AttLstmInternal_DenseTransform")
# We will augment the input into LSTMCell by concatenating the context vector. Modify
# input_shape to reflect this.
batch, input_dim = input_shape[0]
batch, timesteps, context_size = input_shape[-1]
lstm_input = (batch, input_dim + context_size)
# The LSTMCell superclass expects no constant input, so strip that out.
return super(AttentionLSTMCell, self).build(lstm_input)
# This must be called before call(). The "input sequence" is the output from the
# encoder. This function will do some pre-processing on that sequence which will
# then be used in subsequent calls.
def setInputSequence(self, input_seq):
self.input_seq = input_seq
self.input_seq_shaped = self.dense_constant(input_seq)
self.timesteps = tf.shape(self.input_seq)[-2]
# This is a utility method to adjust the output of this cell. When attention mode is
# turned on, the cell outputs attention probability vectors across the input sequence.
def setAttentionMode(self, mode_on=False):
self.attentionMode = mode_on
# This method sets up the computational graph for the cell. It implements the actual logic
# that the model follows.
def call(self, inputs, states, constants):
# Separate the state list into the two discrete state vectors.
# ytm is the "memory state", stm is the "carry state".
ytm, stm = states
# We will use the "carry state" to guide the attention mechanism. Repeat it across all
# input timesteps to perform some calculations on it.
stm_repeated = K.repeat(self.dense_state(stm), self.timesteps)
# Now apply our "dense_transform" operation on the sum of our transformed "carry state"
# and all encoder states. This will squash the resultant sum down to a vector of size
# [batch,timesteps,1]
# Note: Most sources I encounter use tanh for the activation here. I have found with this dataset
# and this model, relu seems to perform better. It makes the attention mechanism far more crisp
# and produces better translation performance, especially with respect to proper sentence termination.
combined_stm_input = self.dense_transform(
keras.activations.relu(stm_repeated + self.input_seq_shaped))
# Performing a softmax generates a log probability for each encoder output to receive attention.
score_vector = keras.activations.softmax(combined_stm_input, 1)
# In this implementation, we grant "partial attention" to each encoder output based on
# it's log probability accumulated above. Other options would be to only give attention
# to the highest probability encoder output or some similar set.
context_vector = K.sum(score_vector * self.input_seq, 1)
# Finally, mutate the input vector. It will now contain the traditional inputs (like the seq2seq
# we trained above) in addition to the attention context vector we calculated earlier in this method.
inputs = K.concatenate([inputs, context_vector])
# Call into the super-class to invoke the LSTM math.
res = super(AttentionLSTMCell, self).call(inputs=inputs, states=states)
# This if statement switches the return value of this method if "attentionMode" is turned on.
if(self.attentionMode):
return (K.reshape(score_vector, (-1, self.timesteps)), res[1])
else:
return res
# Custom implementation of the Keras LSTM that adds an attention mechanism.
# This is implemented by taking an additional input (using the "constants" of the RNN class into the LSTM: The encoder output vectors across the entire input sequence.
class LSTMWithAttention(RNN):
def __init__(self, units, **kwargs):
cell = AttentionLSTMCell(units=units)
self.units = units
super(LSTMWithAttention, self).__init__(cell, **kwargs)
#tf_utils.shape_type_conversion
def build(self, input_shape):
self.input_dim = input_shape[0][-1]
self.timesteps = input_shape[0][-2]
return super(LSTMWithAttention, self).build(input_shape)
# This call is invoked with the entire time sequence. The RNN sub-class is responsible
# for breaking this up into calls into the cell for each step.
# The "constants" variable is the key to our implementation. It was specifically added
# to Keras to accomodate the "attention" mechanism we are implementing.
def call(self, x, constants, **kwargs):
if isinstance(x, list):
self.x_initial = x[0]
else:
self.x_initial = x
# The only difference in the LSTM computational graph really comes from the custom
# LSTM Cell that we utilize.
self.cell._dropout_mask = None
self.cell._recurrent_dropout_mask = None
self.cell.setInputSequence(constants[0])
return super(LSTMWithAttention, self).call(inputs=x, constants=constants, **kwargs)
Code defining encoder and decoder model:
# Encoder layers
encoder_inputs = Input(shape=(None, len_input), name="attenc_inputs")
encoder = LSTM(units=units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Define the inference encoder
encoder_model = Model(encoder_inputs, encoder_states)
# encoder_model.save('atten_enc_model.h5')

# Define the training decoder
decoder_inputs = Input(shape=(None, n_output))
Attention_dec_lstm = LSTMWithAttention(units=units, return_sequences=True, return_state=True)
# Note that the only real difference here is that we are feeding attenc_outputs to the decoder now.
attdec_lstm_out, _, _ = Attention_dec_lstm(inputs=decoder_inputs,
                                           constants=encoder_outputs,
                                           initial_state=encoder_states)
decoder_dense1 = Dense(units, activation="relu")
decoder_dense2 = Dense(n_output, activation='softmax')
decoder_outputs = decoder_dense2(Dropout(rate=.10)(decoder_dense1(Dropout(rate=.10)(attdec_lstm_out))))

atten_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
atten_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
atten_model.summary()

# Define the inference decoder
state_input_h = Input(shape=(units,), name="state_input_h")
state_input_c = Input(shape=(units,), name="state_input_c")
decoder_states_inputs = [state_input_h, state_input_c]
attenc_seq_out = Input(shape=encoder_outputs.get_shape()[1:], name="attenc_seq_out")
inf_attdec_inputs = Input(shape=(None, n_output), name="inf_attdec_inputs")
attdec_res, attdec_h, attdec_c = Attention_dec_lstm(inputs=inf_attdec_inputs,
                                                    constants=attenc_seq_out,
                                                    initial_state=decoder_states_inputs)
decoder_states = [attdec_h, attdec_c]
decoder_model = Model(inputs=[inf_attdec_inputs, state_input_h, state_input_c, attenc_seq_out],
                      outputs=[attdec_res, attdec_h, attdec_c])
Code for model fit and save:
history = atten_model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                          batch_size=batch_size_num,
                          epochs=epochs,
                          validation_split=0.2, verbose=1)
atten_model.save('atten_model_lstm.h5')
Code to load the encoder decoder model with custom Attention layer:
with open('atten_model_lstm.json') as mdl:
    json_string = mdl.read()
model = model_from_json(json_string, custom_objects={'AttentionLSTMCell': AttentionLSTMCell, 'LSTMWithAttention': LSTMWithAttention})
This loading code gives the error:
TypeError: int() argument must be a string, a bytes-like object or a number, not 'AttentionLSTMCell'
Here's a solution inspired by the link in my comment:
# serialize model to JSON
atten_model_json = atten_model.to_json()
with open("atten_model.json", "w") as json_file:
    json_file.write(atten_model_json)
# serialize weights to HDF5
atten_model.save_weights("atten_model.h5")
print("Saved model to disk")

# Different part of your code or different file
# load json and create model
json_file = open('atten_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("atten_model.h5")
print("Loaded model from disk")

Extracting the dropout mask from a keras dropout layer?

I would like to extract and store the dropout mask [array of 1s/0s] from a dropout layer in a Sequential Keras model at each batch while training. I was wondering if there was a straightforward way to do this within Keras or if I would need to switch over to TensorFlow (How to get the dropout mask in Tensorflow).
Would appreciate any help! I'm quite new to TensorFlow and Keras.
There are a couple of functions (dropout_layer.get_output_mask(), dropout_layer.get_input_mask()) for the dropout layer that I tried using but got None after calling on the previous layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(name="flat", input_shape=(28, 28, 1)))
model.add(tf.keras.layers.Dense(
    512,
    activation='relu',
    name='dense_1',
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=123),
    bias_initializer='zeros'))
dropout = tf.keras.layers.Dropout(0.2, name='dropout')  # want this layer's mask
model.add(dropout)
x = dropout.output_mask
y = dropout.input_mask
model.add(tf.keras.layers.Dense(
    10,
    activation='softmax',
    name='dense_2',
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=123),
    bias_initializer='zeros'))
model.compile(...)
model.fit(...)
It's not easily exposed in Keras; it goes deep down until it calls the TensorFlow dropout op.
So, although you're using Keras, the mask will also be a tensor in the graph that can be fetched by name (to find its name, see: In Tensorflow, get the names of all the Tensors in a graph).
This option, of course, will lack some Keras information; you would probably have to do it inside a Lambda layer so Keras adds certain information to the tensor. And you must take extra care, because the tensor will exist even when not training (where the mask is skipped).
Now, you can also use a less hacky way, which may consume a little processing:
def getMask(x):
    boolMask = tf.not_equal(x, 0)
    floatMask = tf.cast(boolMask, tf.float32)  # or tf.float64
    return floatMask
Use Lambda(getMask)(output_of_dropout_layer).
But instead of using a Sequential model, you will need a functional API Model.
inputs = tf.keras.layers.Input((28, 28, 1))
outputs = tf.keras.layers.Flatten(name="flat")(inputs)
outputs = tf.keras.layers.Dense(
    512,
    # activation='relu',  # relu will be a problem here
    name='dense_1',
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=123),
    bias_initializer='zeros')(outputs)
outputs = tf.keras.layers.Dropout(0.2, name='dropout')(outputs)
mask = Lambda(getMask)(outputs)
# there isn't an "input_mask"

# add the missing relu:
outputs = tf.keras.layers.Activation('relu')(outputs)
outputs = tf.keras.layers.Dense(
    10,
    activation='softmax',
    name='dense_2',
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=123),
    bias_initializer='zeros')(outputs)
model = Model(inputs, outputs)
model.compile(...)
model.fit(...)
Training and predicting
Since you can't train the masks (it doesn't make any sense), it should not be an output of the model for training.
Now, we could try this:
trainingModel = Model(inputs, outputs)
predictingModel = Model(inputs, [output, mask])
But masks don't exist in prediction, because dropout is only applied in training. So this doesn't bring us anything good in the end.
The only way for training is then using a dummy loss and dummy targets:
def dummyLoss(y_true, y_pred):
    return y_true  # this might evoke a "None" gradient problem, since the mask is not trainable and has no connection to any weights

model.compile(loss=[loss_for_main_output, dummyLoss], ...)
model.fit(x_train, [y_train, np.zeros((len(y_train),) + mask_shape)], ...)
It's not guaranteed that these will work.
I found a very hacky way to do this by trivially extending the provided dropout layer. (Almost all code from TF.)
# TF-internal helpers used below; the original snippet assumes these are imported.
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops, math_ops

class MyDR(tf.keras.layers.Layer):
    def __init__(self, rate, **kwargs):
        super(MyDR, self).__init__(**kwargs)
        self.noise_shape = None
        self.rate = rate

    def _get_noise_shape(self, x, noise_shape=None):
        # If noise_shape is none return immediately.
        if noise_shape is None:
            return array_ops.shape(x)
        try:
            # Best effort to figure out the intended shape.
            # If not possible, let the op handle it.
            # In eager mode the exception will show up.
            noise_shape_ = tensor_shape.as_shape(noise_shape)
        except (TypeError, ValueError):
            return noise_shape
        if x.shape.dims is not None and len(x.shape.dims) == len(noise_shape_.dims):
            new_dims = []
            for i, dim in enumerate(x.shape.dims):
                if noise_shape_.dims[i].value is None and dim.value is not None:
                    new_dims.append(dim.value)
                else:
                    new_dims.append(noise_shape_.dims[i].value)
            return tensor_shape.TensorShape(new_dims)
        return noise_shape

    def build(self, input_shape):
        self.noise_shape = input_shape
        print(self.noise_shape)
        super(MyDR, self).build(input_shape)

    @tf.function
    def call(self, input):
        self.noise_shape = self._get_noise_shape(input)
        random_tensor = tf.random.uniform(self.noise_shape, seed=1235, dtype=input.dtype)
        keep_prob = 1 - self.rate
        scale = 1 / keep_prob
        # NOTE: if (1.0 + rate) - 1 is equal to rate, then we want to consider that
        # float to be selected, hence we use a >= comparison.
        self.keep_mask = random_tensor >= self.rate
        # NOTE: here is where I save the binary masks.
        # The file grows quite big!
        tf.print(self.keep_mask, output_stream="file://temp/droput_mask.txt")
        ret = input * scale * math_ops.cast(self.keep_mask, input.dtype)
        return ret
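A minimal hedged usage sketch: drop the custom layer into a model in place of tf.keras.layers.Dropout; every forward pass then appends the sampled binary mask to the temp/droput_mask.txt path used inside MyDR.call above:
inputs = tf.keras.layers.Input((28, 28, 1))
x = tf.keras.layers.Flatten()(inputs)
x = tf.keras.layers.Dense(512, activation='relu')(x)
x = MyDR(0.2, name='my_dropout')(x)
outputs = tf.keras.layers.Dense(10, activation='softmax')(x)
model = tf.keras.Model(inputs, outputs)
Note that, as written, MyDR drops units at inference time too, since its call does not check a training flag.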
