I’m currently trying to adapt a PyTorch Wave-U-Net implementation (https://github.com/f90/Wave-U-Net-Pytorch) so that it’ll work for audio mixing rather than source separation. I’m still quite new to PyTorch so I’m not exactly sure how one would go about structuring a forward pass for multiple inputs (8 audio tracks, each corresponding to a separate stem or instrument) and a single output (a mixture track, produced from the inputs).
The network is below, I don’t think the functions in the middle are relevant or need changing. The output is a dictionary with a key for each instrument matching the estimated audio source, however this is obviously not necessary for my task, I think the only changes needed are in __init__ and forward, but I’m unclear what changes are necessary and how to implement them.
class Waveunet(nn.Module):
    """Wave-U-Net assembled from downsampling, bottleneck and upsampling blocks.

    With ``separate=True`` one network is built per entry of ``instruments``;
    otherwise a single network stored under the key ``"ALL"`` is built whose
    output convolution emits ``num_outputs * len(instruments)`` channels, which
    ``forward`` slices into one tensor per instrument.

    The first downsampling block is constructed with ``num_inputs`` as its
    first argument (presumably its input channel count -- confirm against
    ``DownsamplingBlock``), so several input tracks can be fed as channels and
    a single target is obtained with a one-element ``instruments`` list.
    """

    def __init__(self, num_inputs, num_channels, num_outputs, instruments, kernel_size,
                 target_output_size, conv_type, res, separate=False, depth=1, strides=2):
        """
        :param num_inputs: Value handed to the first downsampling block as its first argument
        :param num_channels: Per-level channel counts; its length is the number of levels
        :param num_outputs: Output channels produced per instrument
        :param instruments: Names used as keys of the dict returned by ``forward``
        :param kernel_size: Filter size passed to every block; must be odd
        :param target_output_size: Minimum number of output frames requested
        :param conv_type: Passed through to the blocks
        :param res: Passed through to the down-/upsampling blocks
        :param separate: Build one network per instrument instead of one shared network
        :param depth: Number of bottleneck conv layers; also passed to the blocks
        :param strides: Passed through to the blocks
        """
        super(Waveunet, self).__init__()

        self.num_levels = len(num_channels)
        self.strides = strides
        self.kernel_size = kernel_size
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.depth = depth
        self.instruments = instruments
        self.separate = separate

        # Only odd filter kernels allowed
        assert(kernel_size % 2 == 1)

        self.waveunets = nn.ModuleDict()

        # One model per source when separating individually, otherwise a single shared model
        model_list = instruments if separate else ["ALL"]
        for instrument in model_list:
            module = nn.Module()

            module.downsampling_blocks = nn.ModuleList()
            module.upsampling_blocks = nn.ModuleList()

            for i in range(self.num_levels - 1):
                in_ch = num_inputs if i == 0 else num_channels[i]
                module.downsampling_blocks.append(
                    DownsamplingBlock(in_ch, num_channels[i], num_channels[i+1], kernel_size, strides, depth, conv_type, res))

            for i in range(0, self.num_levels - 1):
                module.upsampling_blocks.append(
                    UpsamplingBlock(num_channels[-1-i], num_channels[-2-i], num_channels[-2-i], kernel_size, strides, depth, conv_type, res))

            module.bottlenecks = nn.ModuleList(
                [ConvLayer(num_channels[-1], num_channels[-1], kernel_size, 1, conv_type) for _ in range(depth)])

            # Output conv
            outputs = num_outputs if separate else num_outputs * len(instruments)
            module.output_conv = nn.Conv1d(num_channels[0], outputs, 1)

            self.waveunets[instrument] = module

        self.set_output_size(target_output_size)

    def set_output_size(self, target_output_size):
        """
        Compute and store the input/output frame counts for a requested output size.
        :param target_output_size: Minimum number of output frames requested
        """
        self.target_output_size = target_output_size

        self.input_size, self.output_size = self.check_padding(target_output_size)
        print("Using valid convolutions with " + str(self.input_size) + " inputs and " + str(self.output_size) + " outputs")

        # The output is cut from the centre of the input, so the margin must split evenly
        assert((self.input_size - self.output_size) % 2 == 0)
        margin = (self.input_size - self.output_size) // 2
        self.shapes = {"output_start_frame" : margin,
                       "output_end_frame" : margin + self.output_size,
                       "output_frames" : self.output_size,
                       "input_frames" : self.input_size}

    def check_padding(self, target_output_size):
        """
        Search bottleneck sizes 1, 2, 3, ... for the first one that is accepted.
        :param target_output_size: Minimum number of output frames requested
        :return: Tuple (input_size, output_size) for the first accepted bottleneck size
        """
        bottleneck = 1

        while True:
            out = self.check_padding_for_bottleneck(bottleneck, target_output_size)
            if out is not False:
                return out
            bottleneck += 1

    def check_padding_for_bottleneck(self, bottleneck, target_output_size):
        """
        Derive the input and output sizes implied by a given bottleneck size.
        :param bottleneck: Candidate number of frames at the bottleneck
        :param target_output_size: Minimum number of output frames requested
        :return: Tuple (input_size, output_size), or False if the candidate is rejected
        """
        # All sub-networks are built identically, so the first one is representative
        module = next(iter(self.waveunets.values()))
        try:
            # Walk forward from the bottleneck to the output ...
            curr_size = bottleneck
            for block in module.upsampling_blocks:
                curr_size = block.get_output_size(curr_size)
            output_size = curr_size

            # ... and backward from the bottleneck to the input
            curr_size = bottleneck
            for block in reversed(module.bottlenecks):
                curr_size = block.get_input_size(curr_size)
            for block in reversed(module.downsampling_blocks):
                curr_size = block.get_input_size(curr_size)
        except AssertionError:
            # Presumably the blocks' size helpers assert on invalid sizes -- treated as rejection
            return False

        # Explicit comparison instead of an assert so the check survives ``python -O``
        if output_size < target_output_size:
            return False
        return curr_size, output_size

    def forward_module(self, x, module):
        '''
        A forward pass through a single Wave-U-Net (multiple Wave-U-Nets might be used, one for each source)
        :param x: Input mix
        :param module: Network module to be used for prediction
        :return: Source estimates
        '''
        shortcuts = []
        out = x

        # DOWNSAMPLING BLOCKS
        for block in module.downsampling_blocks:
            out, short = block(out)
            shortcuts.append(short)

        # BOTTLENECK CONVOLUTION
        for conv in module.bottlenecks:
            out = conv(out)

        # UPSAMPLING BLOCKS (shortcuts are consumed in reverse order of creation)
        for idx, block in enumerate(module.upsampling_blocks):
            out = block(out, shortcuts[-1 - idx])

        # OUTPUT CONV
        out = module.output_conv(out)
        if not self.training:  # At test time clip predictions to valid amplitude range
            out = out.clamp(min=-1.0, max=1.0)
        return out

    def forward(self, x, inst=None):
        '''
        Run the network(s) and return the estimates keyed by instrument name.
        :param x: Input whose last dimension must equal ``self.input_size``
        :param inst: Instrument whose network is used when ``separate`` is set
        :return: Dict mapping instrument name to its output tensor
        '''
        curr_input_size = x.shape[-1]
        # User promises to feed the proper input himself, to get the pre-calculated (NOT the originally desired) output size
        assert(curr_input_size == self.input_size)

        if self.separate:
            return {inst : self.forward_module(x, self.waveunets[inst])}

        assert(len(self.waveunets) == 1)
        out = self.forward_module(x, self.waveunets["ALL"])

        # Slice the shared output into consecutive groups of num_outputs channels
        out_dict = {}
        for idx, inst in enumerate(self.instruments):
            out_dict[inst] = out[:, idx * self.num_outputs:(idx + 1) * self.num_outputs]
        return out_dict
Related
I am developing a policy gradient NN with pytorch(version: 1.10.1) and I am having the run time error message as:
The error message as:
RuntimeError: one of the variables needed for gradient computation has been modified by an in-place operation: [torch.FloatTensor [1, 15]] is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I have read some similar discussions in which people suggest avoiding in-place operations such as a += 1. Other discussions suggest downgrading PyTorch, or using clone() instead of modifying the tensor directly. I have tried all of these, but I still get this error.
The error does not occur every time update() is called; sometimes it works fine. Why does this happen?
My related code is as follows. There are some odd variable names such as pre_R; I introduced them to avoid in-place updates such as a += 1.
NN class:
class NN(nn.Module):
    """
    Policy network: three linear layers mapping a state to a categorical
    distribution over actions.

    Feel free to change the architecture for different tasks!
    """

    def __init__(self, env, state_size=15, action_size=31):
        """
        :param env: Environment handle; accepted for API compatibility, not read here
        :param state_size: Length of the state vector (15 in this case)
        :param action_size: Number of actions; 31 = 1 and -1 for each task-server
            and void (m*n*2 + 1)
        """
        super(NN, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear1 = nn.Linear(self.state_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, self.action_size)

    def forward(self, state):
        """
        :param state: Tensor whose last dimension is ``state_size``
        :return: ``Categorical`` distribution over the ``action_size`` actions
        """
        hidden = F.relu(self.linear1(state))
        hidden = F.relu(self.linear2(hidden))
        logits = self.linear3(hidden)
        # Passing logits lets Categorical normalise in log-space; it is the same
        # distribution as softmax(logits) without an explicit softmax/log round trip.
        return Categorical(logits=logits)
Reinforcement Learning part related code:
class Agent():
    """Policy-gradient agent wrapping the ``NN`` policy and an Adam optimiser."""

    def __init__(self, env, lr, gamma):
        """
        :param env: Environment handle, stored and forwarded to ``NN``
        :param lr: Learning rate for Adam
        :param gamma: Discount factor used by ``compute_returns``
        """
        self.env = env
        self.NN = NN(env)
        self.lr = lr
        self.optim_NN = optim.Adam(self.NN.parameters(), lr=self.lr)
        self.gamma = gamma

    def update(self, log_probs, returns, detect_anomaly=True):
        """
        Run one optimisation step on the policy-gradient loss.

        :param log_probs: Log-probabilities of the taken actions, one per step
        :param returns: Discounted returns, one per step
        :param detect_anomaly: Keep autograd anomaly detection on (the original
            behaviour); pass False to skip its overhead
        """
        with torch.autograd.set_detect_anomaly(detect_anomaly):
            print("updating")
            baselines = self.compute_baselines(returns)
            loss = self.compute_loss(log_probs, returns, baselines)
            self.optim_NN.zero_grad()
            loss.backward()
            self.optim_NN.step()

    def compute_returns(self, rewards):
        """
        :param rewards: Sequence of per-step rewards in time order
        :return: Tensor of discounted returns, ``R_t = r_t + gamma * R_{t+1}``
        """
        running = 0
        discounted = []
        # Accumulate back-to-front, then flip once (avoids repeated insert(0, ...))
        for reward in reversed(rewards):
            running = reward + self.gamma * running
            discounted.append(running)
        discounted.reverse()
        return torch.tensor(discounted)

    def compute_baselines(self, returns):
        """
        Build running-average baselines from the returns.

        The list starts with ``returns[0]`` and then gains one entry per return,
        so it has ``len(returns) + 1`` elements.

        :param returns: Non-empty sequence of returns
        :return: List of baseline values
        """
        baselines = [returns[0]]
        for value in returns:
            count = len(baselines)
            baselines.append((baselines[-1] * count + value) / (count + 1))
        return baselines

    def compute_loss(self, log_probs, returns, baselines):
        """
        :param log_probs: Per-step log-probabilities
        :param returns: Per-step returns
        :param baselines: Per-step baselines (extra trailing entries are ignored)
        :return: Sum over steps of ``-log_prob * (return - baseline)``
        """
        loss = 0
        for log_prob, ret, baseline in zip(log_probs, returns, baselines):
            # Out-of-place accumulation keeps every intermediate intact for backward()
            loss = loss + (-log_prob * (ret - baseline))
        return loss
The convolutional model presented below, has two branches and each branch (for example) has two stages (convolutional layers).
My aim is to combine the weighted feature maps (channels) of the first convolutional layer from the second branch with the channels of the first convolutional layer from the first branch.
I want to extract the channels from the first convolutional layer in the second branch, multiply it by a weight (weight is a class in the code that makes the output a weighted version of its input) and stack it with the channels of its counterpart convolutional layer from the first branch. Afterwards, by utilizing a 1x1 conv2d, the size of the stacked feature maps will be changed to its initial size and this combined channels should be used by the first branch and the next convolutional layers will be computed based on these combined channels. After that, I want to have this kind of combination between the second convolutional layers of the branches. (In other words, I want to combine features channel-by-channels between branches.)
Please find the main_class (the whole model that consists of two branches) and the first_branch and second_branch below:
class main_class(nn.Module):
def __init__(self, pretrained=False):
super(main_class, self).__init__()
self.input=input_data() # input_data is a class the provides the input data for the each branch
self.conv_t2 = BasicConv3d(...........)
self.second_branch=second_branch(512, out_sigmoid=True)
self.conv_t1 = BasicConv3d(..............)
self.first_branch=first_branch(512, out_sigmoid=True)
self.last = nn.Conv2d(4, 1, kernel_size=1, stride=1)
self.sigmoid = nn.Sigmoid()
def forward(self, x, par = False):
x1, x2 = self.input(x)
#second branch
y2 = self.conv_t2(x2)
out2 = self.second_branch(y2)
#first branch
y1 = self.conv_t1(x1)
out1 = self.first_branch(y1)
x = torch.cat((out2, out1), 1)
x = self.last(x)
out = self.sigmoid(x)
if par:
return out1, out2, out
return out
The first_branch:
class first_branch(nn.Module):
    """First branch: two deconvolution stages, each followed by 2x bilinear
    upsampling.

    ``combined1_2`` and ``combined1_1`` are 1x1 convolutions created here but
    not applied in ``forward``.
    """

    def __init__(self, in_channel=512, out_channel=(380, 200), out_sigmoid=False):
        """
        :param in_channel: Passed to the first ``_make_deconv`` call
        :param out_channel: Pair of sizes passed to the two ``_make_deconv`` calls
        :param out_sigmoid: Apply a sigmoid to the result of ``forward``
        """
        super(first_branch, self).__init__()
        self.out_sigmoid = out_sigmoid

        # NOTE(review): _make_deconv is not defined in the visible part of this class
        self.deconvlayer1_2 = self._make_deconv(in_channel, out_channel[0], num_conv=3)
        self.upsample1_2 = Upsample(scale_factor=2, mode='bilinear')
        # nn.Conv2d (capital C) is the layer class; torch.nn has no ``conv2d`` attribute.
        # NOTE(review): 720 input channels; two 380-channel maps concatenated
        # would give 760 -- confirm the intended value.
        self.combined1_2 = nn.Conv2d(720, 380, kernel_size=1, stride=1, padding=0)

        self.deconvlayer1_1 = self._make_deconv(out_channel[0], out_channel[1], num_conv=3)
        self.upsample1_1 = Upsample(scale_factor=2, mode='bilinear')
        self.combined1_1 = nn.Conv2d(400, 200, kernel_size=1, stride=1, padding=0)

        # forward() calls self.sigmoid when out_sigmoid is set, so it has to exist
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run both deconvolution/upsampling stages and the optional sigmoid."""
        x = self.deconvlayer1_2(x)
        x = self.upsample1_2(x)

        x = self.deconvlayer1_1(x)
        x = self.upsample1_1(x)

        if self.out_sigmoid:
            x = self.sigmoid(x)

        return x
The second_branch:
class second_branch(nn.Module):
    """Second branch: two deconvolution stages with 2x bilinear upsampling;
    also returns a weighted copy of the feature map after each stage."""

    def __init__(self, in_channel=512, out_channel=(380, 200), out_sigmoid=False):
        """
        :param in_channel: Passed to the first ``_make_deconv`` call
        :param out_channel: Pair of sizes passed to the two ``_make_deconv`` calls
        :param out_sigmoid: Apply a sigmoid to the main output of ``forward``
        """
        super(second_branch, self).__init__()
        self.out_sigmoid = out_sigmoid
        self.weight = weight()  # weight is a class that weights its input

        # NOTE(review): _make_deconv is not defined in the visible part of this class
        self.deconvlayer2_2 = self._make_deconv(in_channel, out_channel[0], num_conv=3)
        self.upsample2_2 = Upsample(scale_factor=2, mode='bilinear')

        # Was spelled ``deconvlayer2_!`` (a syntax error); forward() uses deconvlayer2_1
        self.deconvlayer2_1 = self._make_deconv(out_channel[0], out_channel[1], num_conv=3)
        self.upsample2_1 = Upsample(scale_factor=2, mode='bilinear')

        # forward() calls self.sigmoid when out_sigmoid is set, so it has to exist
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        :return: Tuple ``(x, weighted2_1, weighted2_2)``: the branch output and the
            weighted feature maps taken after the second and first stage
        """
        x = self.deconvlayer2_2(x)
        x = self.upsample2_2(x)
        weighted2_2 = self.weight(x)

        x = self.deconvlayer2_1(x)
        x = self.upsample2_1(x)
        weighted2_1 = self.weight(x)

        if self.out_sigmoid:
            x = self.sigmoid(x)

        return x, weighted2_1, weighted2_2
For implementing the mentioned idea in the main_class, I modified it as follows (instead of using the first_branch class in the forward function of the main_class, I wrote the script lines of the forward function of the first_branch in the forward function of the main_class):
class main_class(nn.Module):
def __init__(self, pretrained=False):
super(main_class, self).__init__()
self.input=input_data() # input_data is a class the provides the input data for the each branch
self.conv_t2 = BasicConv3d(....................)
self.second_branch=second_branch(512, out_sigmoid=True)
self.conv_t1 = BasicConv3d(............)
self.first_branch=first_branch(512, out_sigmoid=True)
self.last = nn.Conv2d(4, 1, kernel_size=1, stride=1)
self.sigmoid = nn.Sigmoid()
def forward(self, x, par = False):
x1, x2 = self.input(x)
#second branch
y2 = self.conv_t2(x2)
out2, weighted2_1, weighted2_2 = self.second_branch(y2)
#first branch
y1 = self.conv_t1(x1)
# instead of using from class first_branch, again I write the script lines of first_branch.forward() in below:
x=self.deconvlayer1_2(y1)
x = self.upsample1_2(x)
stacking_2 = torch.stack(x, weighted2_2)
x = self.frist_branch.combined1_2(stacking_2)
x=self.deconvlayer1_1(x)
x = self.upsample1_1(x)
stacking_1 = torch.stack(x, weighted2_1)
x = self.frist_branch.combined1_1(stacking_1)
out1=self.sigmoid(x)
x = torch.cat((out2, out1), 1)
x = self.last(x)
out = self.sigmoid(x)
if par:
return out1, out2, out
return out
I get the following error:
TypeError: Cannot create a consistent method resolution order (MRO) for bases Module, second_branch
How can I fix this problem and how can I make the code able to have the interactions between new branches that may be added later to the model (for example if I have three branches, how can I have this kind of data combination between the third branch and the second one, and between the output of the previous combination and the first branch)?
In your main_class the second branch is not receiving additional arguments, it's only the first one that needs to be executed as second (in order). You could just add a parameter to the forward method of that branch like so:
class first_branch(nn.Module):
    ...

    def forward(self, x, weighted_x: "list | None" = None):
        """Run the branch, optionally mixing in weighted maps from another branch.

        :param x: Input feature map
        :param weighted_x: Up to two tensors; ``weighted_x[0]`` is concatenated
            after the first stage and ``weighted_x[1]`` after the second.
            ``None`` (the default) or an empty list skips the mixing.
        :return: Tuple ``(x, out1, out2)``; ``out1`` and ``out2`` are always ``None``
        """
        # A fresh list per call; a ``[]`` default would be shared between calls
        if weighted_x is None:
            weighted_x = []

        x = self.deconvlayer1_2(x)
        x = self.upsample1_2(x)

        out1 = None
        if len(weighted_x) > 0:
            # Concatenate on the channel dim so the 1x1 conv can reduce it again;
            # torch.stack expects (sequence, dim) and would add a new dimension.
            x = torch.cat((x, weighted_x[0]), dim=1)
            x = self.combined1_2(x)

        x = self.deconvlayer1_1(x)
        x = self.upsample1_1(x)

        out2 = None
        if len(weighted_x) > 1:
            x = torch.cat((x, weighted_x[1]), dim=1)
            x = self.combined1_1(x)

        if self.out_sigmoid:
            x = self.sigmoid(x)

        return x, out1, out2
As you can see, there's a lot of boilerplate code, which you can avoid by creating a small submodule that does this part of the forward pass. You could then store several such submodules inside your first_branch in an nn.ModuleList and iterate over them.
I'm new to python and tensorflow. I'm now testing Improved WGAN code from https://github.com/igul222/improved_wgan_training
After adjusting the code to python 3.6, it still gives "NameError: name 'train_gen' is not defined" when I ran it, although there wasn't warning from pylint.
Can anyone help me with it?
The version of python I'm using is 3.6. There were many syntax differences from 2.7. I've already changed a lot to make it work. And I am running Tensorflow in a virtual environment. Still couldn't figure out this one.
import os, sys
sys.path.append(os.getcwd())
import time
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets
import tensorflow as tf
import tflib as lib
import tflib.ops.linear
import tflib.ops.conv2d
import tflib.ops.batchnorm
import tflib.ops.deconv2d
import tflib.save_images
import tflib.mnist
import tflib.plot
# Experiment configuration: module-level constants for the model and training run.
MODE = 'wgan-gp' # dcgan, wgan, or wgan-gp
DIM = 64 # Model dimensionality
BATCH_SIZE = 50 # Batch size
CRITIC_ITERS = 5 # For WGAN and WGAN-GP, number of critic iters per gen iter
LAMBDA = 10 # Gradient penalty lambda hyperparameter
ITERS = 200000 # How many generator iterations to train for
OUTPUT_DIM = 784 # Number of pixels in MNIST (28*28)
# Hand a copy of the current namespace (which holds the constants above) to tflib.
lib.print_model_settings(locals().copy())
def LeakyReLU(x, alpha=0.2):
    """Return the element-wise maximum of ``alpha * x`` and ``x``."""
    scaled = alpha*x
    return tf.maximum(scaled, x)
def ReLULayer(name, n_in, n_out, inputs):
    """Apply the tflib linear op named ``name + '.Linear'`` and then a ReLU."""
    linear_out = lib.ops.linear.Linear(
        name+'.Linear', n_in, n_out, inputs, initialization='he'
    )
    return tf.nn.relu(linear_out)
def LeakyReLULayer(name, n_in, n_out, inputs):
    """Apply the tflib linear op named ``name + '.Linear'`` and then ``LeakyReLU``."""
    linear_out = lib.ops.linear.Linear(
        name+'.Linear', n_in, n_out, inputs, initialization='he'
    )
    return LeakyReLU(linear_out)
def Generator(n_samples, noise=None):
    """Build the generator graph and return samples flattened to ``OUTPUT_DIM``.

    :param n_samples: Number of noise rows drawn when ``noise`` is not given
    :param noise: Optional noise tensor with 128 columns
    """
    # Batch norm layers are only inserted in plain 'wgan' mode
    use_batchnorm = MODE == 'wgan'

    latent = tf.random_normal([n_samples, 128]) if noise is None else noise

    net = lib.ops.linear.Linear('Generator.Input', 128, 4*4*4*DIM, latent)
    if use_batchnorm:
        net = lib.ops.batchnorm.Batchnorm('Generator.BN1', [0], net)
    net = tf.nn.relu(net)
    net = tf.reshape(net, [-1, 4*DIM, 4, 4])

    net = lib.ops.deconv2d.Deconv2D('Generator.2', 4*DIM, 2*DIM, 5, net)
    if use_batchnorm:
        net = lib.ops.batchnorm.Batchnorm('Generator.BN2', [0,2,3], net)
    net = tf.nn.relu(net)

    # Crop the last two dimensions to 7x7 before the next deconvolution
    net = net[:,:,:7,:7]

    net = lib.ops.deconv2d.Deconv2D('Generator.3', 2*DIM, DIM, 5, net)
    if use_batchnorm:
        net = lib.ops.batchnorm.Batchnorm('Generator.BN3', [0,2,3], net)
    net = tf.nn.relu(net)

    net = lib.ops.deconv2d.Deconv2D('Generator.5', DIM, 1, 5, net)
    net = tf.nn.sigmoid(net)

    return tf.reshape(net, [-1, OUTPUT_DIM])
def Discriminator(inputs):
    """Build the discriminator/critic graph and return one score per input row.

    :param inputs: Tensor that is reshaped to ``[-1, 1, 28, 28]``
    """
    # Batch norm layers are only inserted in plain 'wgan' mode
    use_batchnorm = MODE == 'wgan'

    net = tf.reshape(inputs, [-1, 1, 28, 28])

    net = lib.ops.conv2d.Conv2D('Discriminator.1',1,DIM,5,net,stride=2)
    net = LeakyReLU(net)

    net = lib.ops.conv2d.Conv2D('Discriminator.2', DIM, 2*DIM, 5, net, stride=2)
    if use_batchnorm:
        net = lib.ops.batchnorm.Batchnorm('Discriminator.BN2', [0,2,3], net)
    net = LeakyReLU(net)

    net = lib.ops.conv2d.Conv2D('Discriminator.3', 2*DIM, 4*DIM, 5, net, stride=2)
    if use_batchnorm:
        net = lib.ops.batchnorm.Batchnorm('Discriminator.BN3', [0,2,3], net)
    net = LeakyReLU(net)

    net = tf.reshape(net, [-1, 4*4*4*DIM])
    net = lib.ops.linear.Linear('Discriminator.Output', 4*4*4*DIM, 1, net)

    return tf.reshape(net, [-1])
# Graph inputs and the generator/discriminator outputs shared by every mode.
real_data = tf.placeholder(tf.float32, shape=[BATCH_SIZE, OUTPUT_DIM])
fake_data = Generator(BATCH_SIZE)

disc_real = Discriminator(real_data)
disc_fake = Discriminator(fake_data)

gen_params = lib.params_with_name('Generator')
disc_params = lib.params_with_name('Discriminator')

# Each mode defines gen_cost, disc_cost, both train ops and clip_disc_weights
# (an op in 'wgan' mode, None otherwise).
if MODE == 'wgan':
    gen_cost = -tf.reduce_mean(disc_fake)
    disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)

    gen_train_op = tf.train.RMSPropOptimizer(
        learning_rate=5e-5
    ).minimize(gen_cost, var_list=gen_params)
    disc_train_op = tf.train.RMSPropOptimizer(
        learning_rate=5e-5
    ).minimize(disc_cost, var_list=disc_params)

    # Weight clipping: one assign op per discriminator variable, grouped together
    clip_bounds = [-.01, .01]
    clip_ops = [
        tf.assign(
            var,
            tf.clip_by_value(var, clip_bounds[0], clip_bounds[1])
        )
        for var in lib.params_with_name('Discriminator')
    ]
    clip_disc_weights = tf.group(*clip_ops)

elif MODE == 'wgan-gp':
    gen_cost = -tf.reduce_mean(disc_fake)
    disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)

    # Gradient penalty evaluated at random points between real and fake samples
    alpha = tf.random_uniform(
        shape=[BATCH_SIZE,1],
        minval=0.,
        maxval=1.
    )
    differences = fake_data - real_data
    interpolates = real_data + (alpha*differences)
    gradients = tf.gradients(Discriminator(interpolates), [interpolates])[0]
    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
    gradient_penalty = tf.reduce_mean((slopes-1.)**2)
    disc_cost += LAMBDA*gradient_penalty

    gen_train_op = tf.train.AdamOptimizer(
        learning_rate=1e-4,
        beta1=0.5,
        beta2=0.9
    ).minimize(gen_cost, var_list=gen_params)
    disc_train_op = tf.train.AdamOptimizer(
        learning_rate=1e-4,
        beta1=0.5,
        beta2=0.9
    ).minimize(disc_cost, var_list=disc_params)

    clip_disc_weights = None

elif MODE == 'dcgan':
    # TF >= 1.0 only accepts named arguments here; the old positional order
    # was (logits, targets), hence logits first and the constant as labels.
    gen_cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=disc_fake,
        labels=tf.ones_like(disc_fake)
    ))

    disc_cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=disc_fake,
        labels=tf.zeros_like(disc_fake)
    ))
    disc_cost += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=disc_real,
        labels=tf.ones_like(disc_real)
    ))
    disc_cost /= 2.

    gen_train_op = tf.train.AdamOptimizer(
        learning_rate=2e-4,
        beta1=0.5
    ).minimize(gen_cost, var_list=gen_params)
    disc_train_op = tf.train.AdamOptimizer(
        learning_rate=2e-4,
        beta1=0.5
    ).minimize(disc_cost, var_list=disc_params)

    clip_disc_weights = None

# For saving samples
fixed_noise = tf.constant(np.random.normal(size=(128, 128)).astype('float32'))
fixed_noise_samples = Generator(128, noise=fixed_noise)
def generate_image(frame, true_dist):
    """Evaluate the fixed-noise samples and save them as ``samples_<frame>.png``.

    :param frame: Value substituted into the output file name
    :param true_dist: Unused
    """
    images = session.run(fixed_noise_samples).reshape((128, 28, 28))
    lib.save_images.save_images(images, 'samples_{}.png'.format(frame))
# Dataset iterator
train_gen, dev_gen, test_gen = lib.mnist.load(BATCH_SIZE, BATCH_SIZE)


def inf_train_gen():
    """Yield training image batches forever, restarting ``train_gen`` after each pass."""
    while True:
        yield from (images for images, _targets in train_gen())
# Train loop
with tf.Session() as session:

    # tf.global_variables_initializer() replaces the deprecated
    # tf.initialize_all_variables().
    session.run(tf.global_variables_initializer())

    gen = inf_train_gen()

    for iteration in range(ITERS):
        start_time = time.time()

        # The generator step is skipped on the very first iteration
        if iteration > 0:
            _ = session.run(gen_train_op)

        if MODE == 'dcgan':
            disc_iters = 1
        else:
            disc_iters = CRITIC_ITERS
        for _ in range(disc_iters):
            _data = next(gen)
            _disc_cost, _ = session.run(
                [disc_cost, disc_train_op],
                feed_dict={real_data: _data}
            )
            if clip_disc_weights is not None:
                _ = session.run(clip_disc_weights)

        lib.plot.plot('train disc cost', _disc_cost)
        lib.plot.plot('time', time.time() - start_time)

        # Calculate dev loss and generate samples every 100 iters
        if iteration % 100 == 99:
            dev_disc_costs = []
            for images, _ in dev_gen():
                _dev_disc_cost = session.run(
                    disc_cost,
                    feed_dict={real_data: images}
                )
                dev_disc_costs.append(_dev_disc_cost)
            lib.plot.plot('dev disc cost', np.mean(dev_disc_costs))

            generate_image(iteration, _data)

        # Write logs every 100 iters
        if (iteration < 5) or (iteration % 100 == 99):
            lib.plot.flush()

        lib.plot.tick()
This is the section containing the error name.
# Dataset iterator
train_gen, dev_gen, test_gen = lib.mnist.load(BATCH_SIZE, BATCH_SIZE)


def inf_train_gen():
    """Yield training image batches forever, restarting ``train_gen`` after each pass."""
    while True:
        yield from (images for images, _targets in train_gen())
And here is the error.
Traceback (most recent call last):
File "<stdin>", line 13, in <module>
File "<stdin>", line 3, in inf_train_gen
NameError: name 'train_gen' is not defined
Attempt 1:
I believe it's just because you are saying for images, targets in train_gen(): when you should be saying for images, targets in train_gen:
In a nutshell, the brackets suggest you are calling a function, which leads Python to raise the exception NameError: name 'train_gen' is not defined because there is no function with the name train_gen defined.
In the future, your code should be minimal, because you have pasted an enormous amount of code which makes it very hard to debug/see what you're doing.
Attempt 2:
Upon a second review of the code (this is a good reason to make your examples as small as possible), I have realised that you may be importing this code from elsewhere.
When you are making the first assignment to train_gen this is outside the function scope. It is possible then that when you go to call the function train_gen is no longer defined, which is why you get your error. This can occur for a number of reasons. After having reviewed the code a little there are various issues I can see (bad practice mostly).
It is generally not a good idea to use global variables within a function as you have in inf_train_gen, if a function needs an argument to run properly, then it should be passed as an argument. This is because if we have a problem with a variable (as we do now) we can normally see where this variable comes from and what uses it, but if all function rely on the globally scoped variable, any number of functions could delete it, change it, etc.
Right now I have no idea what has happened to the variable train_gen, I would suggest printing out the variable at different intervals and seeing if you can see which function call is causing issues and in the future stay away from globally scoped variables unless absolutely necessary, it makes it near-impossible to debug.
I'm working on a seq2seq RNN generating an output sequence of labels given a seed label. During the inference step I'd like to generate sequences containing only unique labels (i.e. skip labels that have already been added to the output sequence). To do this I created a sampler object that tries to remember the labels that have been added to the output and reduce their logit value to -np.inf.
Here is the sampler code:
class InferenceSampler(object):
    """Callable sampler: projects decoder outputs to logits, adds a mask and
    returns the arg-max ids.

    NOTE(review): ``ids_mask`` is rebound on the Python object in every call.
    If the call is traced only once (e.g. inside a ``tf.while_loop`` body), the
    update presumably does not carry over between loop iterations -- confirm.
    """

    def __init__(self, out_weights, out_biases):
        self._out_weights = tf.transpose(out_weights)
        self._out_biases = out_biases
        self._n_tracks = out_weights.shape[0]
        # Additive mask over all tracks, initially all zeros
        self.ids_mask = tf.zeros([self._n_tracks], name="playlist_mask")

    def __call__(self, decoder_outputs):
        projected = tf.matmul(decoder_outputs, self._out_weights)
        biased = tf.nn.bias_add(projected, self._out_biases)

        # apply mask
        masked = biased + self.ids_mask
        picked_ids = tf.cast(tf.argmax(masked, axis=-1), tf.int32)

        # update mask: -inf at the positions that were just picked
        picked_mask = tf.sparse_to_dense(picked_ids, [self._n_tracks], -np.inf)
        self.ids_mask = self.ids_mask + picked_mask
        return picked_ids
The code of the inference graph looks like this:
# Inference graph fragment (indentation was lost in this excerpt).
# Placeholders: scalar cap on the number of decode iterations, and a 1-D
# tensor of start token ids.
self._max_playlist_len = tf.placeholder(tf.int32, ())
self._start_tokens = tf.placeholder(tf.int32, [None])
# Sampler object handed to the helper as its sample_fn.
sample_fn = InferenceSampler(out_weights, out_biases)
with tf.name_scope("inf_decoder"):
# A sequence is finished once its sampled id equals PAD_ITEM_ID.
def _end_fn(sample_ids):
return tf.equal(sample_ids, PAD_ITEM_ID)
# The next decoder input is the embedding of the id that was just sampled.
def _next_inputs_fn(sample_ids):
return tf.nn.embedding_lookup(
track_embs,
sample_ids
)
# The first decoder input is the embedding of the start tokens.
_start_inputs = tf.nn.embedding_lookup(
track_embs,
self._start_tokens
)
# sample_shape=[] with sample_dtype=tf.int32 declares one int32 id per sequence.
helper = tf.contrib.seq2seq.InferenceHelper(
sample_fn=sample_fn,
sample_shape=[],
sample_dtype=tf.int32,
start_inputs=_start_inputs,
end_fn=_end_fn,
next_inputs_fn=_next_inputs_fn
)
# The zero state is sized by the number of start tokens.
decoder = tf.contrib.seq2seq.BasicDecoder(
rnn_cell,
helper,
rnn_cell.zero_state(tf.shape(self._start_tokens)[0], tf.float32),
output_layer=projection_layer
)
# Decode for at most self._max_playlist_len iterations.
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder,
maximum_iterations=self._max_playlist_len
)
# The sampled ids of every step form the generated playlists.
self.playlists = outputs.sample_id
Unfortunately, the results still have duplicated labels. Moreover, when I try to get access to the sample_fn.ids_mask I receive an error message: ValueError: Operation 'inf_decoder/decoder/while/BasicDecoderStep/add_1' has been marked as not fetchable.
What am I doing wrong? And how legal is to create such sample_fn?
Trying to overcome the problem, I updated the inference in such a way that at each RNN step I output the vector of embeddings instead of item_id. After the inference is finished, I convert embeddings to item_ids.
First of all, this solution minimizes the number of operations. Secondly, since I use LSTM/GRU cells, they minimize the probability to observe two absolutely similar outputs on different steps of RNN's inference.
The new code looks like this:
# Revised inference fragment (indentation was lost in this excerpt).
with tf.name_scope("inf_decoder"):
# Identity sampler: each step's "sample" is the decoder output itself.
def _sample_fn(decoder_outputs):
return decoder_outputs
# Never reports a sequence as finished.
def _end_fn(sample_ids):
# infinite
return tf.tile([False], [n_seeds])
# The first decoder input is the embedding of the seed items.
_start_inputs = tf.nn.embedding_lookup(
track_embs,
self._seed_items
)
# Samples are float32 vectors of length self.emb_size; no next_inputs_fn is passed.
helper = tf.contrib.seq2seq.InferenceHelper(
sample_fn=_sample_fn,
sample_shape=[self.emb_size],
sample_dtype=tf.float32,
start_inputs=_start_inputs,
end_fn=_end_fn,
)
decoder = tf.contrib.seq2seq.BasicDecoder(
rnn_cell,
helper,
rnn_cell.zero_state(n_seeds, tf.float32),
output_layer=projection_layer
)
# With _end_fn always False, maximum_iterations is what bounds the decode.
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder,
maximum_iterations=self._max_playlist_len
)
# After decoding: flatten all step outputs, project them to logits in one
# matmul, take the arg-max id and reshape back to one row per seed.
flat_rnn_output = tf.reshape(outputs.rnn_output, [-1, self.emb_size])
flat_logits = tf.matmul(flat_rnn_output, out_weights, transpose_b=True)
flat_logits = tf.nn.bias_add(flat_logits, out_biases)
item_ids = tf.cast(tf.argmax(flat_logits, axis=-1), tf.int32)
playlists = tf.reshape(item_ids, [n_seeds, -1])
self.playlists = playlists
So, after some investigation I found answers to all my questions related to this thread. The main question was: why self.ids_mask in InferenceSampler does not update? The reason is in the internals of dynamic_decode. According to this answer in Tensorflow's issue tracker:
... only tensors defined inside the loop will be evaluated
every loop iteration. All tensors defined outside a loop will be
evaluated exactly once.
In my case, self.ids_mask is specified outside the loop. That means that I need to re-write dynamic_decode to get what I want. The code below is a bit modified version of the initial task, but it does almost the same.
Let's start with a new dynamic_decode, which creates and updates the mask that collects the sample_ids that have already been predicted. I have removed the code that I didn't modify; follow the initial_mask and mask variables.
New dynamic_decode:
# Abridged variant of dynamic_decode that threads a mask through the loop;
# "..." marks elided, unmodified code (indentation was lost in this excerpt).
def dynamic_decode(decoder,
output_time_major=False,
impute_finished=False,
maximum_iterations=None,
parallel_iterations=32,
swap_memory=False,
scope=None):
...
# decoder.initialize() must return four values here, the mask in third position.
initial_finished, initial_inputs, initial_mask, initial_state = decoder.initialize()
...
def body(time, outputs_ta, state, inputs, finished, sequence_lengths, mask):
"""Internal while_loop body.
Args:
time: scalar int32 tensor.
outputs_ta: structure of TensorArray.
state: (structure of) state tensors and TensorArrays.
inputs: (structure of) input tensors.
finished: bool tensor (keeping track of what's finished).
sequence_lengths: int32 tensor (keeping track of time of finish).
mask: SparseTensor to remove already predicted items
Returns:
`(time + 1, outputs_ta, next_state, next_inputs, next_finished,
next_sequence_lengths, next_mask)`.
```
"""
# decoder.step() receives the current mask and returns the updated one.
(next_outputs, decoder_state, next_inputs, next_mask,
decoder_finished) = decoder.step(time, inputs, state, mask)
...
nest.assert_same_structure(state, decoder_state)
nest.assert_same_structure(outputs_ta, next_outputs)
nest.assert_same_structure(inputs, next_inputs)
# The mask is a loop variable, so its structure must not change between iterations.
nest.assert_same_structure(mask, next_mask)
...
return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
next_sequence_lengths, next_mask)
# initial_mask is passed as the last loop variable, matching body's last parameter.
res = control_flow_ops.while_loop(
condition,
body,
loop_vars=[
initial_time, initial_outputs_ta, initial_state, initial_inputs,
initial_finished, initial_sequence_lengths, initial_mask,
],
parallel_iterations=parallel_iterations,
swap_memory=swap_memory)
...
# The final mask is not part of the returned tuple.
return final_outputs, final_state, final_sequence_lengths
At the next step mask should be passed to Decoder and Helper. Here are the updated versions of BasicDecoder and InferenceHelper:
MaskedDecoder:
class MaskedDecoder(BasicDecoder):
    """``BasicDecoder`` whose ``step`` passes a mask to the helper and returns
    the helper's updated mask."""

    def step(self, time, inputs, state, mask, name=None):
        """Run one decoding step.

        :return: ``(outputs, next_state, next_inputs, next_mask, finished)``
        """
        with ops.name_scope(name, "MaskedDecoderStep", (time, inputs, state, mask)):
            cell_outputs, cell_state = self._cell(inputs, state)
            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)

            # Arguments shared by both helper calls
            helper_args = dict(time=time, outputs=cell_outputs, state=cell_state, mask=mask)
            sample_ids = self._helper.sample(**helper_args)
            step_result = self._helper.next_inputs(sample_ids=sample_ids, **helper_args)
            finished, next_inputs, next_state, next_mask = step_result

        outputs = BasicDecoderOutput(cell_outputs, sample_ids)
        return (outputs, next_state, next_inputs, next_mask, finished)
MaskedInferenceHelper:
class MaskedInferenceHelper(Helper):
    """A helper to use during inference with a custom sampling function."""

    def __init__(self, norm_track_embs, features, start_sample_ids):
        self._norm_track_embs = norm_track_embs
        self._batch_size = tf.shape(start_sample_ids)[0]
        self._n_tracks = tf.shape(norm_track_embs)[0]
        self._start_sample_ids = start_sample_ids
        self._sample_shape = tf.TensorShape([])
        self._sample_dtype = tf.int32
        self._features = features

    def _get_sparse_mask(self, sample_ids):
        """Return a ``[batch_size, n_tracks]`` SparseTensor holding ``inf`` at
        ``(row, sample_ids[row])`` for every row."""
        dense_shape = tf.convert_to_tensor([
            tf.cast(self._batch_size, dtype=tf.int64),
            tf.cast(self._n_tracks, dtype=tf.int64)
        ])
        rows = tf.range(0, self._batch_size)
        indices = tf.cast(tf.stack([rows, sample_ids], axis=1), dtype=tf.int64)
        values = tf.fill([self._batch_size], np.inf)
        return tf.SparseTensor(indices, values, dense_shape)

    ...

    def initialize(self, name=None):
        """Return ``(finished, start_inputs, mask)``; the mask already covers
        the start ids."""
        finished = tf.tile([False], [self._batch_size])
        seed_embs = tf.nn.embedding_lookup(self._norm_track_embs, self._start_sample_ids)
        start_inputs = tf.concat([seed_embs, self._features], axis=1)
        return finished, start_inputs, self._get_sparse_mask(self._start_sample_ids)

    def sample(self, time, outputs, state, mask, name=None):
        """Pick, per row, the track with the highest masked cosine similarity."""
        del time, state  # unused by sample
        normed = tf.nn.l2_normalize(outputs, axis=-1)
        similarities = tf.matmul(normed, self._norm_track_embs, transpose_b=True)
        # Subtracting inf at the masked positions pushes them to -inf
        similarities = similarities - tf.sparse_tensor_to_dense(mask)
        return tf.cast(tf.argmax(similarities, axis=-1), tf.int32)

    def next_inputs(self, time, outputs, state, sample_ids, mask, name=None):
        """Return ``(finished, next_inputs, state, next_mask)``; ``finished`` is
        always all-False and the mask gains the ids just sampled."""
        del time, outputs  # unused by next_inputs
        finished = tf.tile([False], [self._batch_size])
        picked_embs = tf.nn.embedding_lookup(self._norm_track_embs, sample_ids)
        next_inputs = tf.concat([picked_embs, self._features], axis=1)
        next_mask = tf.sparse_add(mask, self._get_sparse_mask(sample_ids))
        return finished, next_inputs, state, next_mask
So, now I can generate inferences without repetition of already predicted items.
This is part of my current Python code for training a neural network with the CNTK module:
# Input and target variables both use the default batch axis and the default
# dynamic axis.
batch_axis = C.Axis.default_batch_axis()
input_seq_axis = C.Axis.default_dynamic_axis()
input_dynamic_axes = [batch_axis, input_seq_axis]
input_dynamic_axes2 = [batch_axis, input_seq_axis]
# Variables for the network input (shape n_ins) and the training target (shape n_outs).
input = C.input_variable(n_ins, dynamic_axes=input_dynamic_axes, dtype=numpy.float32)
output = C.input_variable(n_outs, dynamic_axes=input_dynamic_axes2, dtype=numpy.float32)
dnn_model = cntk_model.create_model(input, hidden_layer_type, hidden_layer_size, n_outs)
# The same squared-error expression is built twice: once as the training loss
# and once as the second criterion handed to the Trainer.
loss = C.squared_error(dnn_model, output)
error = C.squared_error(dnn_model, output)
lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch)
momentum_schedule = C.momentum_schedule(current_momentum)
# Adam learner over all model parameters, with L1/L2 regularisation weights.
learner = C.adam(dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain = False, l1_regularization_weight=l1_reg, l2_regularization_weight= l2_reg)
trainer = C.Trainer(dnn_model, (loss, error), [learner])
And here is the code for creating the NN model:
def create_model(features, hidden_layer_type, hidden_layer_size, n_out):
    """Build a CNTK model from per-layer specs and apply it to ``features``.

    Args:
        features: Input the assembled layer stack is applied to.
        hidden_layer_type: Sequence of layer kinds, each ``'TANH'`` (dense
            layer with tanh activation) or ``'LSTM'`` (recurrent LSTM layer).
        hidden_layer_size: Sequence of layer sizes, parallel to
            ``hidden_layer_type``.
        n_out: Size of the final linear (no activation) dense output layer.

    Returns:
        The sequential model applied to ``features``.

    Raises:
        AssertionError: If the two hidden-layer sequences differ in length.
        ValueError: If a layer kind is neither ``'TANH'`` nor ``'LSTM'``.
    """
    logger.debug('Creating cntk model')

    assert len(hidden_layer_size) == len(hidden_layer_type)

    my_layers = []
    # Walk kinds and sizes in lockstep instead of indexing by position.
    for layer_type, layer_size in zip(hidden_layer_type, hidden_layer_size):
        if layer_type == 'TANH':
            my_layers.append(C.layers.Dense(layer_size, activation=C.tanh, init=C.layers.glorot_uniform()))
        elif layer_type == 'LSTM':
            my_layers.append(C.layers.Recurrence(C.layers.LSTM(layer_size)))
        else:
            raise ValueError('Unknown hidden layer type')

    # Linear output layer on top of the hidden stack.
    my_layers.append(C.layers.Dense(n_out, activation=None))

    my_model = C.layers.Sequential([my_layers])
    my_model = my_model(features)
    return my_model
Now, I would like to change the backpropagation so that the error is calculated not from the direct network output, but from the output after some additional calculation. I tried to define something like this:
def create_error_function(self, prediction, target):
    """Build a squared-error expression on a rounded, re-normalised prediction.

    The prediction is de-normalised with ``self.std_vector`` and
    ``self.mean_vector``. Components ``0:5`` are multiplied by the rounded
    component ``5``, rounded, and divided by that rounded component again;
    the result is normalised back and compared with ``target[0:5]``.
    Component ``5`` is rounded, normalised back and compared with
    ``target[5]``. The two squared errors are summed.

    Args:
        prediction: Network output in normalised space.
        target: Reference values in normalised space.

    Returns:
        The sum of the two squared-error expressions.
    """
    # NOTE(review): C.round is presumably piecewise constant for
    # differentiation, so little or no gradient may reach the network
    # through the rounded terms -- confirm against the CNTK docs.

    # Undo the normalisation: x * std + mean.
    prediction_denorm = C.element_times(prediction, self.std_vector)
    prediction_denorm = C.plus(prediction_denorm, self.mean_vector)

    # Quantise components 0:5 to multiples of 1 / round(component 5).
    prediction_denorm_rounded = C.round(C.element_times(prediction_denorm[0:5], C.round(prediction_denorm[5])))
    prediction_denorm_rounded = C.element_divide(prediction_denorm_rounded, C.round(prediction_denorm[5]))

    # Back to normalised space for the first five components.
    prediction_norm = C.minus(prediction_denorm_rounded, self.mean_vector[0:5])
    prediction_norm = C.element_divide(prediction_norm, self.std_vector[0:5])
    first = C.squared_error(prediction_norm, target[0:5])

    # Rounded component 5, normalised back the same way.
    second = C.minus(C.round(prediction_denorm[5]), self.mean_vector[5])
    second = C.element_divide(second, self.std_vector[5])

    return C.plus(first, C.squared_error(second, target[5]))
and use it instead of the standard squared_error.
And the part for NN training
dnn_model = cntk_model.create_model(input, hidden_layer_type, hidden_layer_size, n_outs)
# Custom criterion built from the mean/std vectors; it replaces C.squared_error
# for both the loss and the second criterion handed to the Trainer.
error_function = cntk_model.ErrorFunction(cmp_mean_vector, cmp_std_vector)
loss = error_function.create_error_function(dnn_model, output)
error = error_function.create_error_function(dnn_model, output)
lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch)
momentum_schedule = C.momentum_schedule(current_momentum)
# Adam learner over all model parameters, with L1/L2 regularisation weights.
learner = C.adam(dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain = False, l1_regularization_weight=l1_reg,
l2_regularization_weight= l2_reg)
trainer = C.Trainer(dnn_model, (loss, error), [learner])
# One update step on a single minibatch of features and targets.
trainer.train_minibatch({input: temp_train_x, output: temp_train_y})
But after two epochs I always get the same average loss, as if my network were not learning.
Every time you want to change how backprop works, you need to use stop_gradient. This is the only function whose gradient is different from the gradient of the operation of the forward pass. In the forward pass stop_gradient acts as identity. In the backward pass it blocks the gradient from propagating.
To do an operation f(x) on some x in the forward pass and pretend as if it never happened in the backward pass you need to do something like:
C.stop_gradient(f(x) - x) + x. In your case that would be
# Forward pass: evaluates to features/normalization (the two `features` terms cancel).
# Backward pass: stop_gradient blocks its argument, so only the `+ features` term
# propagates a gradient.
norm_features = C.stop_gradient(features/normalization - features) + features