I'm working on a seq2seq RNN generating an output sequence of labels given a seed label. During the inference step I'd like to generate sequences containing only unique labels (i.e. skip labels that have already been added to the output sequence). To do this I created a sampler object that tries to remember the labels already added to the output and set their logit values to -np.inf.
Here is the sampler code:
class InferenceSampler(object):
def __init__(self, out_weights, out_biases):
self._out_weights = tf.transpose(out_weights)
self._out_biases = out_biases
self._n_tracks = out_weights.shape[0]
self.ids_mask = tf.zeros([self._n_tracks], name="playlist_mask")
def __call__(self, decoder_outputs):
_logits = tf.matmul(decoder_outputs, self._out_weights)
_logits = tf.nn.bias_add(_logits, self._out_biases)
# apply mask
_logits = _logits + self.ids_mask
_sample_ids = tf.cast(tf.argmax(_logits, axis=-1), tf.int32)
# update mask
step_ids_mask = tf.sparse_to_dense(_sample_ids, [self._n_tracks], -np.inf)
self.ids_mask = self.ids_mask + step_ids_mask
return _sample_ids
The code of the inference graph looks like this:
self._max_playlist_len = tf.placeholder(tf.int32, ())
self._start_tokens = tf.placeholder(tf.int32, [None])
sample_fn = InferenceSampler(out_weights, out_biases)
with tf.name_scope("inf_decoder"):
def _end_fn(sample_ids):
return tf.equal(sample_ids, PAD_ITEM_ID)
def _next_inputs_fn(sample_ids):
return tf.nn.embedding_lookup(
track_embs,
sample_ids
)
_start_inputs = tf.nn.embedding_lookup(
track_embs,
self._start_tokens
)
helper = tf.contrib.seq2seq.InferenceHelper(
sample_fn=sample_fn,
sample_shape=[],
sample_dtype=tf.int32,
start_inputs=_start_inputs,
end_fn=_end_fn,
next_inputs_fn=_next_inputs_fn
)
decoder = tf.contrib.seq2seq.BasicDecoder(
rnn_cell,
helper,
rnn_cell.zero_state(tf.shape(self._start_tokens)[0], tf.float32),
output_layer=projection_layer
)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder,
maximum_iterations=self._max_playlist_len
)
self.playlists = outputs.sample_id
Unfortunately, the results still contain duplicated labels. Moreover, when I try to access sample_fn.ids_mask I receive an error: ValueError: Operation 'inf_decoder/decoder/while/BasicDecoderStep/add_1' has been marked as not fetchable.
What am I doing wrong? And how legitimate is it to create such a sample_fn?
Trying to overcome the problem, I updated the inference so that at each RNN step the model outputs an embedding vector instead of an item_id. After inference finishes, I convert the embeddings to item_ids.
First, this solution reduces the number of operations. Second, since I use LSTM/GRU cells, it is extremely unlikely that two different steps of the inference produce exactly identical outputs.
The new code looks like this:
with tf.name_scope("inf_decoder"):
def _sample_fn(decoder_outputs):
return decoder_outputs
def _end_fn(sample_ids):
# infinite
return tf.tile([False], [n_seeds])
_start_inputs = tf.nn.embedding_lookup(
track_embs,
self._seed_items
)
helper = tf.contrib.seq2seq.InferenceHelper(
sample_fn=_sample_fn,
sample_shape=[self.emb_size],
sample_dtype=tf.float32,
start_inputs=_start_inputs,
end_fn=_end_fn,
)
decoder = tf.contrib.seq2seq.BasicDecoder(
rnn_cell,
helper,
rnn_cell.zero_state(n_seeds, tf.float32),
output_layer=projection_layer
)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder,
maximum_iterations=self._max_playlist_len
)
flat_rnn_output = tf.reshape(outputs.rnn_output, [-1, self.emb_size])
flat_logits = tf.matmul(flat_rnn_output, out_weights, transpose_b=True)
flat_logits = tf.nn.bias_add(flat_logits, out_biases)
item_ids = tf.cast(tf.argmax(flat_logits, axis=-1), tf.int32)
playlists = tf.reshape(item_ids, [n_seeds, -1])
self.playlists = playlists
So, after some investigation I found answers to all my questions related to this thread. The main question was: why does self.ids_mask in InferenceSampler not update? The reason lies in the internals of dynamic_decode. According to this answer in TensorFlow's issue tracker:
... only tensors defined inside the loop will be evaluated
every loop iteration. All tensors defined outside a loop will be
evaluated exactly once.
In my case, self.ids_mask is specified outside the loop. That means that I need to re-write dynamic_decode to get what I want.
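To see the principle in isolation, here is a minimal standalone sketch (TF 1.x, purely illustrative): state that must change across iterations has to be threaded through loop_vars, while a tensor created outside the loop is evaluated exactly once.
import tensorflow as tf

mask_outside = tf.zeros([3])  # created outside the loop: evaluated once, never updated

def body(i, mask):
    # `mask` is a loop variable, so this addition is visible on the next iteration
    return i + 1, mask + tf.one_hot(i, 3)

_, mask_inside = tf.while_loop(
    lambda i, mask: i < 3,
    body,
    loop_vars=[tf.constant(0), tf.zeros([3])])

with tf.Session() as sess:
    print(sess.run(mask_inside))   # [1. 1. 1.] -- accumulated across iterations
    print(sess.run(mask_outside))  # [0. 0. 0.] -- never touched by the loop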
The code below is a slightly modified version of the initial task, but it does almost the same thing. Let's start with a new dynamic_decode, which should create and update a mask collecting the sample_ids that have already been predicted. I removed the code I didn't modify; follow the initial_mask and mask variables.
New dynamic_decode:
def dynamic_decode(decoder,
output_time_major=False,
impute_finished=False,
maximum_iterations=None,
parallel_iterations=32,
swap_memory=False,
scope=None):
...
initial_finished, initial_inputs, initial_mask, initial_state = decoder.initialize()
...
def body(time, outputs_ta, state, inputs, finished, sequence_lengths, mask):
"""Internal while_loop body.
Args:
time: scalar int32 tensor.
outputs_ta: structure of TensorArray.
state: (structure of) state tensors and TensorArrays.
inputs: (structure of) input tensors.
finished: bool tensor (keeping track of what's finished).
sequence_lengths: int32 tensor (keeping track of time of finish).
mask: SparseTensor to remove already predicted items
Returns:
`(time + 1, outputs_ta, next_state, next_inputs, next_finished,
next_sequence_lengths, next_mask)`.
"""
(next_outputs, decoder_state, next_inputs, next_mask,
decoder_finished) = decoder.step(time, inputs, state, mask)
...
nest.assert_same_structure(state, decoder_state)
nest.assert_same_structure(outputs_ta, next_outputs)
nest.assert_same_structure(inputs, next_inputs)
nest.assert_same_structure(mask, next_mask)
...
return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
next_sequence_lengths, next_mask)
res = control_flow_ops.while_loop(
condition,
body,
loop_vars=[
initial_time, initial_outputs_ta, initial_state, initial_inputs,
initial_finished, initial_sequence_lengths, initial_mask,
],
parallel_iterations=parallel_iterations,
swap_memory=swap_memory)
...
return final_outputs, final_state, final_sequence_lengths
At the next step mask should be passed to Decoder and Helper. Here are the updated versions of BasicDecoder and InferenceHelper:
MaskedDecoder:
class MaskedDecoder(BasicDecoder):
def step(self, time, inputs, state, mask, name=None):
with ops.name_scope(name, "MaskedDecoderStep", (time, inputs, state, mask)):
cell_outputs, cell_state = self._cell(inputs, state)
if self._output_layer is not None:
cell_outputs = self._output_layer(cell_outputs)
sample_ids = self._helper.sample(
time=time,
outputs=cell_outputs,
state=cell_state,
mask=mask)
(finished, next_inputs, next_state, next_mask) = self._helper.next_inputs(
time=time,
outputs=cell_outputs,
state=cell_state,
mask=mask,
sample_ids=sample_ids)
outputs = BasicDecoderOutput(cell_outputs, sample_ids)
return (outputs, next_state, next_inputs, next_mask, finished)
MaskedInferenceHelper:
class MaskedInferenceHelper(Helper):
"""A helper to use during inference with a custom sampling function."""
def __init__(self, norm_track_embs, features, start_sample_ids):
self._norm_track_embs = norm_track_embs
self._batch_size = tf.shape(start_sample_ids)[0]
self._n_tracks = tf.shape(norm_track_embs)[0]
self._start_sample_ids = start_sample_ids
self._sample_shape = tf.TensorShape([])
self._sample_dtype = tf.int32
self._features = features
def _get_sparse_mask(self, sample_ids):
_mask_shape = tf.convert_to_tensor([
tf.cast(self._batch_size, dtype=tf.int64),
tf.cast(self._n_tracks, dtype=tf.int64)
])
_st_rows = tf.range(0, self._batch_size)
_st_cols = sample_ids
_st_indices = tf.cast(tf.stack([_st_rows, _st_cols], axis=1), dtype=tf.int64)
_st_values = tf.fill([self._batch_size], np.inf)
return tf.SparseTensor(_st_indices, _st_values, _mask_shape)
...
def initialize(self, name=None):
finished = tf.tile([False], [self._batch_size])
start_embs = tf.nn.embedding_lookup(self._norm_track_embs, self._start_sample_ids)
start_inputs = tf.concat([start_embs, self._features], axis=1)
mask = self._get_sparse_mask(self._start_sample_ids)
return finished, start_inputs, mask
def sample(self, time, outputs, state, mask, name=None):
del time, state # unused by sample
outputs = tf.nn.l2_normalize(outputs, axis=-1)
cos_sims = tf.matmul(outputs, self._norm_track_embs, transpose_b=True)
cos_sims = cos_sims - tf.sparse_tensor_to_dense(mask)
sample_ids = tf.cast(tf.argmax(cos_sims, axis=-1), tf.int32)
return sample_ids
def next_inputs(self, time, outputs, state, sample_ids, mask, name=None):
del time, outputs # unused by next_inputs
finished = tf.tile([False], [self._batch_size])
next_embs = tf.nn.embedding_lookup(self._norm_track_embs, sample_ids)
next_inputs = tf.concat([next_embs, self._features], axis=1)
next_mask = tf.sparse_add(mask, self._get_sparse_mask(sample_ids))
return finished, next_inputs, state, next_mask
So, now I can generate inference sequences without repeating already predicted items.
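For completeness, a hedged sketch of how these pieces wire together (track_embs, features, rnn_cell, and the seed/length placeholders are assumed from the earlier snippets):
helper = MaskedInferenceHelper(
    norm_track_embs=tf.nn.l2_normalize(track_embs, axis=-1),
    features=features,
    start_sample_ids=self._seed_items)
decoder = MaskedDecoder(
    rnn_cell,
    helper,
    rnn_cell.zero_state(tf.shape(self._seed_items)[0], tf.float32))
outputs, _, _ = dynamic_decode(decoder, maximum_iterations=self._max_playlist_len)
self.playlists = outputs.sample_id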
Related
I’m currently trying to adapt a PyTorch Wave-U-Net implementation (https://github.com/f90/Wave-U-Net-Pytorch) so that it’ll work for audio mixing rather than source separation. I’m still quite new to PyTorch so I’m not exactly sure how one would go about structuring a forward pass for multiple inputs (8 audio tracks, each corresponding to a separate stem or instrument) and a single output (a mixture track, produced from the inputs).
The network is below; I don't think the functions in the middle are relevant or need changing. The output is a dictionary with a key for each instrument matching the estimated audio source, which is obviously not necessary for my task. I think the only changes needed are in __init__ and forward, but I'm unclear on what changes are necessary and how to implement them.
class Waveunet(nn.Module):
def __init__(self, num_inputs, num_channels, num_outputs, instruments, kernel_size, target_output_size, conv_type, res, separate=False, depth=1, strides=2):
super(Waveunet, self).__init__()
self.num_levels = len(num_channels)
self.strides = strides
self.kernel_size = kernel_size
self.num_inputs = num_inputs
self.num_outputs = num_outputs
self.depth = depth
self.instruments = instruments
self.separate = separate
# Only odd filter kernels allowed
assert(kernel_size % 2 == 1)
self.waveunets = nn.ModuleDict()
model_list = instruments if separate else ["ALL"]
# Create a model for each source if we separate sources separately, otherwise only one (model_list=["ALL"])
for instrument in model_list:
module = nn.Module()
module.downsampling_blocks = nn.ModuleList()
module.upsampling_blocks = nn.ModuleList()
for i in range(self.num_levels - 1):
in_ch = num_inputs if i == 0 else num_channels[i]
module.downsampling_blocks.append(
DownsamplingBlock(in_ch, num_channels[i], num_channels[i+1], kernel_size, strides, depth, conv_type, res))
for i in range(0, self.num_levels - 1):
module.upsampling_blocks.append(
UpsamplingBlock(num_channels[-1-i], num_channels[-2-i], num_channels[-2-i], kernel_size, strides, depth, conv_type, res))
module.bottlenecks = nn.ModuleList(
[ConvLayer(num_channels[-1], num_channels[-1], kernel_size, 1, conv_type) for _ in range(depth)])
# Output conv
outputs = num_outputs if separate else num_outputs * len(instruments)
module.output_conv = nn.Conv1d(num_channels[0], outputs, 1)
self.waveunets[instrument] = module
self.set_output_size(target_output_size)
def set_output_size(self, target_output_size):
self.target_output_size = target_output_size
self.input_size, self.output_size = self.check_padding(target_output_size)
print("Using valid convolutions with " + str(self.input_size) + " inputs and " + str(self.output_size) + " outputs")
assert((self.input_size - self.output_size) % 2 == 0)
self.shapes = {"output_start_frame" : (self.input_size - self.output_size) // 2,
"output_end_frame" : (self.input_size - self.output_size) // 2 + self.output_size,
"output_frames" : self.output_size,
"input_frames" : self.input_size}
def check_padding(self, target_output_size):
# Ensure number of outputs covers a whole number of cycles so each output in the cycle is weighted equally during training
bottleneck = 1
while True:
out = self.check_padding_for_bottleneck(bottleneck, target_output_size)
if out is not False:
return out
bottleneck += 1
def check_padding_for_bottleneck(self, bottleneck, target_output_size):
module = self.waveunets[[k for k in self.waveunets.keys()][0]]
try:
curr_size = bottleneck
for idx, block in enumerate(module.upsampling_blocks):
curr_size = block.get_output_size(curr_size)
output_size = curr_size
# Bottleneck-Conv
curr_size = bottleneck
for block in reversed(module.bottlenecks):
curr_size = block.get_input_size(curr_size)
for idx, block in enumerate(reversed(module.downsampling_blocks)):
curr_size = block.get_input_size(curr_size)
assert(output_size >= target_output_size)
return curr_size, output_size
except AssertionError as e:
return False
def forward_module(self, x, module):
'''
A forward pass through a single Wave-U-Net (multiple Wave-U-Nets might be used, one for each source)
:param x: Input mix
:param module: Network module to be used for prediction
:return: Source estimates
'''
shortcuts = []
out = x
# DOWNSAMPLING BLOCKS
for block in module.downsampling_blocks:
out, short = block(out)
shortcuts.append(short)
# BOTTLENECK CONVOLUTION
for conv in module.bottlenecks:
out = conv(out)
# UPSAMPLING BLOCKS
for idx, block in enumerate(module.upsampling_blocks):
out = block(out, shortcuts[-1 - idx])
# OUTPUT CONV
out = module.output_conv(out)
if not self.training: # At test time clip predictions to valid amplitude range
out = out.clamp(min=-1.0, max=1.0)
return out
def forward(self, x, inst=None):
curr_input_size = x.shape[-1]
assert(curr_input_size == self.input_size) # User promises to feed the proper input himself, to get the pre-calculated (NOT the originally desired) output size
if self.separate:
return {inst : self.forward_module(x, self.waveunets[inst])}
else:
assert(len(self.waveunets) == 1)
out = self.forward_module(x, self.waveunets["ALL"])
out_dict = {}
for idx, inst in enumerate(self.instruments):
out_dict[inst] = out[:, idx * self.num_outputs:(idx + 1) * self.num_outputs]
return out_dict
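One hedged way to restructure this for mixing (an assumption on my part, not something from the original repository) is to treat the 8 stems as input channels of a single network and emit one mixture, i.e. num_inputs=8 and num_outputs=1 for mono audio:
import torch

# illustrative shapes: 8 mono stems, batch of 4, model.input_size frames each
stems = [torch.randn(4, 1, 89769) for _ in range(8)]
x = torch.cat(stems, dim=1)  # -> (4, 8, 89769): stems stacked as channels

# model = Waveunet(num_inputs=8, num_channels=[32, 64, 128, 256], num_outputs=1,
#                  instruments=["mix"], kernel_size=5, target_output_size=88200,
#                  conv_type="gn", res="fixed")
# mix = model(x)["mix"]      # -> (4, 1, output_frames)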
I have an issue when calculating one of the losses of my GAN model in TensorFlow using MeanAbsoluteError(). I am well aware that the shapes need to match to produce a loss, but the confusing part is that after calling model.fit(), training proceeds fine and doesn't throw an error until the very end of the epoch.
I've been doing logging and verified that the shapes indeed do match:
07:40:43,483 root DEBUG Mean loss for base XY: (None, 64, 64, 64) and (None, 64, 64, 64)
07:40:43,500 root DEBUG Mean loss for base YX: (None, 64, 64, 64) and (None, 64, 64, 64)
07:40:43,516 root DEBUG Base MAE success
07:40:43,516 root DEBUG Actual value sample: Tensor("mean_absolute_error_1/weighted_loss/value:0", shape=(), dtype=float32)
This tells me that it indeed succeeded and produced a single float32 value. After that, however, it appears to crash somewhere, and I'm guessing it's the part where all the values are added together into a single gen_XY loss for the $X \rightarrow Y$ generator. But since we confirmed the loss is indeed a scalar, this shouldn't be the case.
I've tried commenting out this specific loss (out of the several I'm using), and the issue goes away, which confirms to me that it's related to this specific loss.
The loss is computed between the latent feature maps taken from the middle of the Generator network, between its encoder and decoder parts.
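As a sanity check, tf.keras.losses.MeanAbsoluteError does reduce matching shapes to a single scalar, consistent with the logs above (a minimal standalone snippet):
import tensorflow as tf

mae = tf.keras.losses.MeanAbsoluteError()
a = tf.random.normal([2, 64, 64, 64])
b = tf.random.normal([2, 64, 64, 64])
print(mae(a, b).shape)  # () -- a single scalar, as logged above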
Here is my model definition, excluding individual definitions of Up, Down and ResNet blocks:
class LGan(Model):
def __init__(self, gen_XY, gen_YX, disc_X, disc_Y, lambda_cycle=10.0, lambda_identity=2.0, lambda_base=5.0, lambda_sim=2.0, lambda_adv=2.0):
super(LGan, self).__init__(name="BokorGan")
self.gen_XY = gen_XY
self.gen_YX = gen_YX
self.disc_X = disc_X
self.disc_Y = disc_Y
self.lambda_cycle = lambda_cycle
self.lambda_identity = lambda_identity
self.lambda_base = lambda_base
self.lambda_sim = lambda_sim
self.lambda_adv = lambda_adv
self.XY_loss = []
self.YX_loss = []
self.X_loss = []
self.Y_loss = []
self.epoch = 0
self.cc_loss = CycleLoss()
self.id_loss = IdentityLoss()
self.base_loss = MeanAbsoluteError()
self.sim_loss = SimilarityLoss()
self.gen_loss = None
self.disc_loss = None
def compile(self, gen_XY_optim, gen_YX_optim, disc_X_optim, disc_Y_optim, gen_loss, disc_loss):
super(LGan, self).compile()
self.gen_XY_optim = gen_XY_optim
self.gen_YX_optim = gen_YX_optim
self.disc_X_optim = disc_X_optim
self.disc_Y_optim = disc_Y_optim
self.gen_loss = gen_loss
self.disc_loss = disc_loss
def train_step(self, input_pair):
input_x, input_y = input_pair
with tf.GradientTape(persistent=True) as tape:
# Generator outputs
gen_x, gen_latent_y = self.gen_YX(input_y)
gen_y, gen_latent_x = self.gen_XY(input_x)
cycle_x, cycled_latent_y = self.gen_YX(gen_y)
cycle_y, cycled_latent_x = self.gen_XY(gen_x)
id_x, _ = self.gen_YX(input_x)
id_y, _ = self.gen_XY(input_y)
# Discriminator outputs
disc_true_x = self.disc_X(input_x)
disc_fake_x = self.disc_X(gen_x)
disc_true_y = self.disc_Y(input_y)
disc_fake_y = self.disc_Y(gen_y)
# Adversarial loss
adv_XY_loss = self.gen_loss(disc_fake_y)
adv_YX_loss = self.gen_loss(disc_fake_x)
# Cycle loss
cycle_XY_loss = self.cc_loss(input_y, cycle_y)
cycle_YX_loss = self.cc_loss(input_x, cycle_x)
# Identity loss
id_XY_loss = self.id_loss(input_y, id_y)
id_YX_loss = self.id_loss(input_x, id_x)
# Similarity loss
sim_XY_loss = self.sim_loss(input_y, cycle_y)
logger.debug(f"Sample inputs to similarity loss: {input_y.shape} and {cycle_y.shape}")
sim_YX_loss = self.sim_loss(input_x, cycle_x)
logger.debug(f"Actual value sample: {sim_XY_loss}")
# Base loss
logger.debug(f"Mean loss for base XY: {gen_latent_y.shape} and {cycled_latent_y.shape}")
base_XY_loss = self.base_loss(gen_latent_y, cycled_latent_y)
logger.debug(f"Mean loss for base YX: {gen_latent_x.shape} and {cycled_latent_x.shape}")
base_YX_loss = self.base_loss(gen_latent_x, cycled_latent_x)
logger.debug("Base success")
logger.debug(f"Actual value sample: {base_YX_loss}")
# Total XY loss
total_XY_loss = (
adv_XY_loss * self.lambda_adv
+ cycle_XY_loss * self.lambda_cycle
+ id_XY_loss * self.lambda_identity
+ sim_XY_loss * self.lambda_sim
+ base_XY_loss * self.lambda_base
)
# Total YX loss
total_YX_loss = (
adv_YX_loss * self.lambda_adv
+ cycle_YX_loss * self.lambda_cycle
+ id_YX_loss * self.lambda_identity
+ sim_YX_loss * self.lambda_sim
+ base_YX_loss * self.lambda_base
)
# Discriminator (X) loss
disc_X_loss = self.disc_loss(disc_true_x, disc_fake_x)
# Discriminator (Y) loss
disc_Y_loss = self.disc_loss(disc_true_y, disc_fake_y)
grads_XY = tape.gradient(total_XY_loss, self.gen_XY.trainable_variables)
grads_YX = tape.gradient(total_YX_loss, self.gen_YX.trainable_variables)
# Get the gradients for the discriminators
disc_X_grads = tape.gradient(disc_X_loss, self.disc_X.trainable_variables)
disc_Y_grads = tape.gradient(disc_Y_loss, self.disc_Y.trainable_variables)
# Update the weights of the generators
self.gen_XY_optim.apply_gradients(
zip(grads_XY, self.gen_XY.trainable_variables)
)
self.gen_YX_optim.apply_gradients(
zip(grads_YX, self.gen_YX.trainable_variables)
)
# Update the weights of the discriminators
self.disc_X_optim.apply_gradients(
zip(disc_X_grads, self.disc_X.trainable_variables)
)
self.disc_Y_optim.apply_gradients(
zip(disc_Y_grads, self.disc_Y.trainable_variables)
)
return {
"XY_loss": total_XY_loss,
"YX_loss": total_YX_loss,
"D_X_loss": disc_X_loss,
"D_Y_loss": disc_Y_loss,
}
I'm not sure why this error happens, as it only occurs at the very end of the epoch. According to the error, the part where the losses are calculated runs only once, at the end of the epoch. I'm new to neural networks, so I'm not sure why this happens, since these losses are part of the same training step and even of the same gradient tape context.
The convolutional model presented below has two branches, and each branch (for example) has two stages (convolutional layers).
My aim is to combine the weighted feature maps (channels) of the first convolutional layer from the second branch with the channels of the first convolutional layer from the first branch.
I want to extract the channels from the first convolutional layer in the second branch, multiply them by a weight (weight is a class in the code that makes the output a weighted version of its input), and stack them with the channels of the counterpart convolutional layer from the first branch. Afterwards, a 1x1 conv2d restores the stacked feature maps to their initial size, and this combination is used by the first branch: the next convolutional layers are computed from the combined channels. After that, I want the same kind of combination between the second convolutional layers of the branches. (In other words, I want to combine features channel-by-channel between branches.)
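Here is a minimal, self-contained sketch of that combination step (the channel counts are illustrative, chosen so that 380 + 340 = 720 matches the 1x1 combined1_2 layer in the code below):
import torch
import torch.nn as nn

branch1 = torch.randn(2, 380, 32, 32)             # channels from the first branch
weighted2 = torch.randn(2, 340, 32, 32)           # weighted channels from the second branch
stacked = torch.cat((branch1, weighted2), dim=1)  # -> (2, 720, 32, 32)
combine = nn.Conv2d(720, 380, kernel_size=1)      # 1x1 conv restores the channel count
out = combine(stacked)                            # -> (2, 380, 32, 32)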
Please find the main_class (the whole model that consists of two branches) and the first_branch and second_branch below:
class main_class(nn.Module):
def __init__(self, pretrained=False):
super(main_class, self).__init__()
        self.input=input_data() # input_data is a class that provides the input data for each branch
self.conv_t2 = BasicConv3d(...........)
self.second_branch=second_branch(512, out_sigmoid=True)
self.conv_t1 = BasicConv3d(..............)
self.first_branch=first_branch(512, out_sigmoid=True)
self.last = nn.Conv2d(4, 1, kernel_size=1, stride=1)
self.sigmoid = nn.Sigmoid()
def forward(self, x, par = False):
x1, x2 = self.input(x)
#second branch
y2 = self.conv_t2(x2)
out2 = self.second_branch(y2)
#first branch
y1 = self.conv_t1(x1)
out1 = self.first_branch(y1)
x = torch.cat((out2, out1), 1)
x = self.last(x)
out = self.sigmoid(x)
if par:
return out1, out2, out
return out
The first_branch:
class first_branch(nn.Module):
def __init__(self, in_channel=512, out_channel=[380, 200], out_sigmoid=False):
super(first_branch, self).__init__()
self.out_sigmoid=out_sigmoid
self.deconvlayer1_2 = self._make_deconv(in_channel, out_channel[0], num_conv=3)
self.upsample1_2=Upsample(scale_factor=2, mode='bilinear')
        self.combined1_2 = nn.Conv2d(720, 380, kernel_size=1, stride=1, padding=0)
self.deconvlayer1_1 = self._make_deconv(out_channel[0], out_channel[1], num_conv=3)
self.upsample1_1=Upsample(scale_factor=2, mode='bilinear')
        self.combined1_1 = nn.Conv2d(400, 200, kernel_size=1, stride=1, padding=0)
def forward(self, x):
x=self.deconvlayer1_2(x)
x = self.upsample1_2(x)
x=self.deconvlayer1_1(x)
x = self.upsample1_1(x)
if self.out_sigmoid:
x=self.sigmoid(x)
return x
The second_branch:
class second_branch(nn.Module):
def __init__(self, in_channel=512, out_channel=[380,200], out_sigmoid=False):
super(second_branch, self).__init__()
self.out_sigmoid=out_sigmoid
self.weight = weight() # weight is a class that weighted its input
self.deconvlayer2_2 = self._make_deconv(in_channel, out_channel[0], num_conv=3)
self.upsample2_2=Upsample(scale_factor=2, mode='bilinear')
        self.deconvlayer2_1 = self._make_deconv(out_channel[0], out_channel[1], num_conv=3)
self.upsample2_1=Upsample(scale_factor=2, mode='bilinear')
def forward(self, x):
x=self.deconvlayer2_2(x)
x = self.upsample2_2(x)
weighted2_2 = self.weight(x)
x=self.deconvlayer2_1(x)
x = self.upsample2_1(x)
weighted2_1 = self.weight(x)
if self.out_sigmoid:
x=self.sigmoid(x)
return x, weighted2_1, weighted2_2
For implementing the mentioned idea in the main_class, I modified it as follows (instead of calling the first_branch class in the forward function of main_class, I inlined the lines of first_branch.forward in the forward function of main_class):
class main_class(nn.Module):
def __init__(self, pretrained=False):
super(main_class, self).__init__()
        self.input=input_data() # input_data is a class that provides the input data for each branch
self.conv_t2 = BasicConv3d(....................)
self.second_branch=second_branch(512, out_sigmoid=True)
self.conv_t1 = BasicConv3d(............)
self.first_branch=first_branch(512, out_sigmoid=True)
self.last = nn.Conv2d(4, 1, kernel_size=1, stride=1)
self.sigmoid = nn.Sigmoid()
def forward(self, x, par = False):
x1, x2 = self.input(x)
#second branch
y2 = self.conv_t2(x2)
out2, weighted2_1, weighted2_2 = self.second_branch(y2)
#first branch
y1 = self.conv_t1(x1)
        # instead of calling first_branch, the lines of first_branch.forward() are written out below:
x=self.deconvlayer1_2(y1)
x = self.upsample1_2(x)
stacking_2 = torch.stack(x, weighted2_2)
        x = self.first_branch.combined1_2(stacking_2)
x=self.deconvlayer1_1(x)
x = self.upsample1_1(x)
stacking_1 = torch.stack(x, weighted2_1)
        x = self.first_branch.combined1_1(stacking_1)
out1=self.sigmoid(x)
x = torch.cat((out2, out1), 1)
x = self.last(x)
out = self.sigmoid(x)
if par:
return out1, out2, out
return out
I face with the following error:
TypeError: Cannot create a consistent method resolution order (MRO) for bases Module, second_branch
How can I fix this problem? And how can I make the code support interactions between new branches that may be added to the model later (for example, with three branches, how can I have this kind of data combination between the third branch and the second one, and between the output of that combination and the first branch)?
In your main_class the second branch does not receive additional arguments; it's only the first one (which needs to be executed second, in order) that does. You could just add a parameter to the forward method of that branch, like so:
class first_branch(nn.Module):
    ...
    def forward(self, x, weighted_x=None):
        # weighted_x: optional list of weighted maps coming from the other branch
        if weighted_x is None:
            weighted_x = []
        x = self.deconvlayer1_2(x)
        x = self.upsample1_2(x)
        if len(weighted_x) > 0:
            # concatenate along the channel axis, then let the 1x1 conv
            # restore the original channel count
            x = torch.cat((x, weighted_x[0]), dim=1)
            x = self.combined1_2(x)
        x = self.deconvlayer1_1(x)
        x = self.upsample1_1(x)
        if len(weighted_x) > 1:
            x = torch.cat((x, weighted_x[1]), dim=1)
            x = self.combined1_1(x)
        if self.out_sigmoid:
            x = self.sigmoid(x)
        return x
As you can see, there's a lot of boilerplate code, which you can avoid by creating a small submodule that does this part of the forward pass. You could then store multiple such modules inside a ModuleList in your first_branch and iterate over them, as sketched below.
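A hedged sketch of that refactoring (all names are illustrative, not from the original code):
import torch
import torch.nn as nn

class DeconvStage(nn.Module):
    """One deconv + upsample stage, optionally combining with a weighted map."""
    def __init__(self, deconv, upsample, combine):
        super().__init__()
        self.deconv = deconv
        self.upsample = upsample
        self.combine = combine  # 1x1 conv applied after concatenation

    def forward(self, x, weighted=None):
        x = self.upsample(self.deconv(x))
        if weighted is not None:
            x = self.combine(torch.cat((x, weighted), dim=1))
        return x

# In first_branch.__init__: self.stages = nn.ModuleList([stage1, stage2, ...])
# In first_branch.forward:
#     for stage, w in zip(self.stages, weighted_x):
#         x = stage(x, w)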
Short Description of my model
I am trying to write my own DQN algorithm in Python, using TensorFlow, following the paper (Mnih et al., 2015). The train_DQN function defines the training procedure, and DQN_CartPole defines the function approximation (a simple 3-layer neural network). For the loss function, either Huber loss or MSE is used, followed by gradient clipping (between -1 and 1). I have also implemented a soft-update method, sketched below, instead of a hard update of the target network that copies over the weights of the main network.
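For reference, a minimal sketch of such a soft update (assumed, not copied from the linked repository): each target variable moves a fraction tau toward its main-network counterpart.
import tensorflow as tf

def soft_target_update_ops(main_vars, target_vars, tau=1e-2):
    # target <- tau * main + (1 - tau) * target, variable by variable
    return [tf.assign(t, tau * m + (1.0 - tau) * t)
            for m, t in zip(main_vars, target_vars)]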
Question
I am trying it on the CartPole environment (OpenAI Gym), but the rewards do not improve as they do in other people's implementations, such as keras-rl. Any help will be appreciated.
(Plot: reward over timestep)
If possible, could you have a look at the source code?
DQN model: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_model.py
Training Script: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_train.py
Reddit post: https://www.reddit.com/r/reinforcementlearning/comments/ba7o55/question_dqn_algorithm_does_not_work_well_on/?utm_source=share&utm_medium=web2x
class Parameters:
def __init__(self, mode=None):
assert mode != None
print("Loading Params for {} Environment".format(mode))
if mode == "Atari":
self.state_reshape = (1, 84, 84, 1)
self.num_frames = 1000000
self.memory_size = 10000
self.learning_start = 10000
self.sync_freq = 1000
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 1000
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
elif mode == "CartPole":
self.state_reshape = (1, 4)
self.num_frames = 10000
self.memory_size = 20000
self.learning_start = 100
self.sync_freq = 100
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 500
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
class _DQN:
"""
Boilerplate for DQN Agent
"""
def __init__(self):
"""
define the deep learning model here!
"""
pass
def predict(self, sess, state):
"""
predict q-values given a state
:param sess:
:param state:
:return:
"""
return sess.run(self.pred, feed_dict={self.state: state})
def update(self, sess, state, action, Y):
feed_dict = {self.state: state, self.action: action, self.Y: Y}
_, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
# print(action, Y, sess.run(self.idx_flattened, feed_dict=feed_dict))
return loss
class DQN_CartPole(_DQN):
"""
DQN Agent for CartPole game
"""
def __init__(self, scope, env, loss_fn ="MSE"):
self.scope = scope
self.num_action = env.action_space.n
with tf.variable_scope(scope):
self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="X")
self.Y = tf.placeholder(shape=[None], dtype=tf.float32, name="Y")
self.action = tf.placeholder(shape=[None], dtype=tf.int32, name="action")
fc1 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(self.state)
fc2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc1)
fc3 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc2)
self.pred = tf.keras.layers.Dense(self.num_action, activation=tf.nn.relu)(fc3)
# indices of the executed actions
self.idx_flattened = tf.range(0, tf.shape(self.pred)[0]) * tf.shape(self.pred)[1] + self.action
# passing [-1] to tf.reshape means flatten the array
# using tf.gather, associate Q-values with the executed actions
self.action_probs = tf.gather(tf.reshape(self.pred, [-1]), self.idx_flattened)
if loss_fn == "huber_loss":
# use huber loss
self.losses = tf.subtract(self.Y, self.action_probs)
self.loss = huber_loss(self.losses)
elif loss_fn == "MSE":
# use MSE
self.losses = tf.squared_difference(self.Y, self.action_probs)
self.loss = tf.reduce_mean(self.losses)
else:
assert False
# you can choose whatever you want for the optimiser
# self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
self.optimizer = tf.train.AdamOptimizer()
# to apply Gradient Clipping, we have to directly operate on the optimiser
# check this: https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#processing_gradients_before_applying_them
self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
self.clipped_grads_and_vars = [(ClipIfNotNone(grad, -1., 1.), var) for grad, var in self.grads_and_vars]
self.train_op = self.optimizer.apply_gradients(self.clipped_grads_and_vars)
def train_DQN(main_model, target_model, env, replay_buffer, policy, params):
"""
Train DQN agent which defined above
:param main_model:
:param target_model:
:param env:
:param params:
:return:
"""
# log purpose
losses, all_rewards, cnt_action = [], [], []
episode_reward, index_episode = 0, 0
with tf.Session() as sess:
# initialise all variables used in the model
sess.run(tf.global_variables_initializer())
state = env.reset()
start = time.time()
for frame_idx in range(1, params.num_frames + 1):
action = policy.select_action(sess, target_model, state.reshape(params.state_reshape))
cnt_action.append(action)
next_state, reward, done, _ = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
state = next_state
episode_reward += reward
if done:
index_episode += 1
state = env.reset()
all_rewards.append(episode_reward)
if frame_idx > params.learning_start and len(replay_buffer) > params.batch_size:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = target_model.predict(sess, next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = main_model.update(sess, states, actions, Y)
# Logging and refreshing log purpose values
losses.append(np.mean(loss))
logging(frame_idx, params.num_frames, index_episode, time.time()-start, episode_reward, np.mean(loss), cnt_action)
episode_reward = 0
cnt_action = []
start = time.time()
if frame_idx > params.learning_start and frame_idx % params.sync_freq == 0:
# soft update means we partially add the original weights of target model instead of completely
# sharing the weights among main and target models
if params.update_hard_or_soft == "hard":
sync_main_target(sess, main_model, target_model)
elif params.update_hard_or_soft == "soft":
soft_target_model_update(sess, main_model, target_model, tau=params.soft_update_tau)
return all_rewards, losses
Modification
dones -> np.logical_not(dones)
np.argmax -> np.max
separating MSE from huber_loss
Briefly looking over, it seems that the dones variable is a binary vector where 1 denotes done, and 0 denotes not-done.
You then use dones here:
Y = rewards + params.gamma * np.argmax(next_Q, axis=1) * dones
So for all terminating transitions, you add the expected cumulative reward from following the policy for the rest of the episode (which is zero). For all non-terminating transitions, you do not add the expected cumulative reward.
I think you mean to do this the other way around, perhaps swap dones in the above line of code with np.logical_not(dones)?
Also, now that I look at it, there is another major problem with this line. np.argmax(next_Q, axis=1) returns the indices of the maximum values along each row of next_Q, not the actual maximum values. You need np.max(next_Q, axis=1) to get the maximum expected reward for each next state's actions.
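A quick numeric check of the difference:
import numpy as np

next_Q = np.array([[0.1, 0.9],
                   [0.4, 0.2]])
print(np.argmax(next_Q, axis=1))  # [1 0]      -- indices of the greedy actions
print(np.max(next_Q, axis=1))     # [0.9 0.4]  -- the values the target Y needs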
EDIT: The loss function is also very strangely defined. You are mixing Huber loss with mean squared error. If you want to use either huber_loss or MSE, you just compute it on the difference between the expected and predicted values. You appear to be doing both, which is certainly not a commonly defined loss function. For example, your model loss using Huber loss should just be:
self.loss = tf.reduce_mean(huber_loss(abs(self.Y - self.action_probs)))
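For comparison, TF 1.x also ships a built-in Huber loss that takes labels and predictions directly (a hedged alternative, since the question's own huber_loss helper isn't shown):
self.loss = tf.losses.huber_loss(self.Y, self.action_probs)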
This is part of my current Python code for NN training using the CNTK module:
batch_axis = C.Axis.default_batch_axis()
input_seq_axis = C.Axis.default_dynamic_axis()
input_dynamic_axes = [batch_axis, input_seq_axis]
input_dynamic_axes2 = [batch_axis, input_seq_axis]
input = C.input_variable(n_ins, dynamic_axes=input_dynamic_axes, dtype=numpy.float32)
output = C.input_variable(n_outs, dynamic_axes=input_dynamic_axes2, dtype=numpy.float32)
dnn_model = cntk_model.create_model(input, hidden_layer_type, hidden_layer_size, n_outs)
loss = C.squared_error(dnn_model, output)
error = C.squared_error(dnn_model, output)
lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch)
momentum_schedule = C.momentum_schedule(current_momentum)
learner = C.adam(dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain = False, l1_regularization_weight=l1_reg, l2_regularization_weight= l2_reg)
trainer = C.Trainer(dnn_model, (loss, error), [learner])
And here is the code for creating the NN model:
def create_model(features, hidden_layer_type, hidden_layer_size, n_out):
logger.debug('Creating cntk model')
assert len(hidden_layer_size) == len(hidden_layer_type)
n_layers = len(hidden_layer_size)
my_layers = list()
for i in xrange(n_layers):
if(hidden_layer_type[i] == 'TANH'):
my_layers.append(C.layers.Dense(hidden_layer_size[i], activation=C.tanh, init=C.layers.glorot_uniform()))
elif (hidden_layer_type[i] == 'LSTM'):
my_layers.append(C.layers.Recurrence(C.layers.LSTM(hidden_layer_size[i])))
else:
raise Exception('Unknown hidden layer type')
my_layers.append(C.layers.Dense(n_out, activation=None))
my_model = C.layers.Sequential([my_layers])
my_model = my_model(features)
return my_model
Now I would like to change the backpropagation, so that when the error is calculated, not the direct network output is used, but the output after some additional computation. I tried to define something like this:
def create_error_function(self, prediction, target):
prediction_denorm = C.element_times(prediction, self.std_vector)
prediction_denorm = C.plus(prediction_denorm, self.mean_vector)
prediction_denorm_rounded = C.round(C.element_times(prediction_denorm[0:5], C.round(prediction_denorm[5])))
prediction_denorm_rounded = C.element_divide(prediction_denorm_rounded, C.round(prediction_denorm[5]))
prediction_norm = C.minus(prediction_denorm_rounded, self.mean_vector[0:5])
prediction_norm = C.element_divide(prediction_norm, self.std_vector[0:5])
first = C.squared_error(prediction_norm, target[0:5])
second = C.minus(C.round(prediction_denorm[5]), self.mean_vector[5])
second = C.element_divide(second, self.std_vector[5])
return C.plus(first, C.squared_error(second, target[5]))
and use it instead of the standard squared_error.
And the part for the NN training:
dnn_model = cntk_model.create_model(input, hidden_layer_type, hidden_layer_size, n_outs)
error_function = cntk_model.ErrorFunction(cmp_mean_vector, cmp_std_vector)
loss = error_function.create_error_function(dnn_model, output)
error = error_function.create_error_function(dnn_model, output)
lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch)
momentum_schedule = C.momentum_schedule(current_momentum)
learner = C.adam(dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain = False, l1_regularization_weight=l1_reg,
l2_regularization_weight= l2_reg)
trainer = C.Trainer(dnn_model, (loss, error), [learner])
trainer.train_minibatch({input: temp_train_x, output: temp_train_y})
But after two epochs I always start getting the same average loss, and my network is not learning.
Every time you want to change how backprop works, you need to use stop_gradient. This is the only function whose gradient is different from the gradient of the operation in the forward pass: in the forward pass stop_gradient acts as the identity, while in the backward pass it blocks the gradient from propagating.
To apply an operation f(x) to some x in the forward pass and pretend it never happened in the backward pass, you need something like C.stop_gradient(f(x) - x) + x. In your case that would be
norm_features = C.stop_gradient(features/normalization - features) + features
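Applied to the rounding in your error function, a minimal sketch of the same pattern (illustrative):
import cntk as C

def round_straight_through(x):
    # forward pass: C.round(x); backward pass: gradient of the identity
    return C.stop_gradient(C.round(x) - x) + x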