I have a neural network in PyTorch and build each layer automatically via the following structure:
class FCN(nn.Module):
    ## Neural Network
    def __init__(self, layers):
        super().__init__()  # call __init__ from parent class
        self.activation = nn.Tanh()
        self.loss_function = nn.MSELoss(reduction='mean')
        # Initialise the neural network as a list using nn.ModuleList
        self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i+1]) for i in range(len(layers)-1)])
        self.iter = 0
        # Xavier normal initialization
        for i in range(len(layers)-1):
            nn.init.xavier_normal_(self.linears[i].weight.data, gain=1.0)
            nn.init.zeros_(self.linears[i].bias.data)

    # Forward pass
    def forward(self, x):
        if not torch.is_tensor(x):
            x = torch.from_numpy(x)
        a = x.float()
        # activation on every layer except the last
        for i in range(len(self.linears) - 1):
            z = self.linears[i](a)
            a = self.activation(z)
        a = self.linears[-1](a)
        return a
The following code then builds the network for me:
layers = np.array([2, 50, 50, 1])
model = FCN(layers)
Now, I am wondering how I can automatically add dropout layers to the network. I tried the following change in the network structure but it only gives me one dropout layer at the end:
self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i+1]) for i in range(len(layers)-1) + nn.Dropout(p=0.5)]
I very much appreciate any help in this regard.
If you can add a dropout layer by "adding it" with + as you do (I haven't seen that, but if it works, that is dope!), you should just move the + nn.Dropout(...) before the range, I assume, i.e.
self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i+1]) + nn.Dropout(p=0.5) for i in range(len(layers)-1)])
EDIT
As expected you can't add it like that.
What you can do instead is add a list of dropout layers in the same way you add the linear layers, and then use both in your forward pass.
Below is an example; it might need to be tweaked to match your inputs, etc.
class FCN(nn.Module):
    ## Neural Network
    def __init__(self, layers):
        super().__init__()
        self.activation = nn.Tanh()
        self.loss_function = nn.MSELoss(reduction='mean')
        # Initialise the neural network as a list using nn.ModuleList
        self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i+1]) for i in range(len(layers)-1)])
        # Use nn.ModuleList here as well, so the dropout layers are registered
        # as submodules and respond to model.train()/model.eval()
        self.dropout_layers = nn.ModuleList([nn.Dropout(p=0.5) for _ in range(len(layers)-1)])
        self.iter = 0
        # Xavier normal initialization
        for i in range(len(layers)-1):
            nn.init.xavier_normal_(self.linears[i].weight.data, gain=1.0)
            nn.init.zeros_(self.linears[i].bias.data)

    def forward(self, x):
        for layer, dropout in zip(self.linears, self.dropout_layers):
            x = layer(x)
            x = dropout(x)
        return x
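As a quick sanity check (a minimal sketch; the layer sizes are just the ones from the question), registering the dropout layers in an nn.ModuleList means they respond to model.train()/model.eval():

import numpy as np
import torch

layers = np.array([2, 50, 50, 1])
model = FCN(layers)

x = torch.rand(4, 2)
model.eval()  # dropout layers registered in the nn.ModuleList are switched off
with torch.no_grad():
    out1 = model(x)
    out2 = model(x)
print(torch.allclose(out1, out2))  # True: outputs are deterministic in eval mode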
I think you should rewrite your code; nn.Sequential() might be the best tool to use here.
class FCN(nn.Module):
    """Neural Network"""
    def __init__(self, layers, drop_p=0.5):
        super().__init__()  # call __init__ from parent class
        self.loss_function = nn.MSELoss(reduction='mean')
        # Use nn.Sequential to create the neural network.
        # Here you need a flat list of the Modules you want to use.
        module_list = [[nn.Linear(layers[i], layers[i + 1]), nn.Dropout(p=drop_p), nn.Tanh()] for i in range(len(layers) - 2)]
        self.linears = nn.Sequential(*[item for sublist in module_list for item in sublist])
        self.last_layer = nn.Linear(layers[-2], layers[-1])
        self.iter = 0
        # Xavier Normal Initialization
        self.linears.apply(self.weights_init)
        self.last_layer.apply(self.weights_init)

    @staticmethod
    def weights_init(m):
        # Xavier Normal Initialization (the network is built from nn.Linear layers)
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_normal_(m.weight)
            torch.nn.init.zeros_(m.bias)

    # Forward pass
    def forward(self, x):
        if not torch.is_tensor(x):
            x = torch.from_numpy(x)
        x = x.float()
        x = self.linears(x)
        x = self.last_layer(x)
        return x
Something like this works fine; you can test it out with a dummy tensor:
if __name__ == "__main__":
layers_size = np.array([2, 50, 50, 1])
model = FCN(layers_size)
t = torch.rand((2,))
output = model(t)
print(output)
>>> tensor([-0.0045], grad_fn=<AddBackward0>)
For the flattening operation of the module_list variable you can check here.
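As an aside, and as a minimal sketch of that flattening step (not the linked answer itself), itertools.chain.from_iterable does the same job as the nested comprehension:

from itertools import chain
import torch.nn as nn

layers = [2, 50, 50, 1]
module_list = [[nn.Linear(layers[i], layers[i + 1]), nn.Dropout(p=0.5), nn.Tanh()]
               for i in range(len(layers) - 2)]
# chain.from_iterable flattens the list of [Linear, Dropout, Tanh] triples
hidden = nn.Sequential(*chain.from_iterable(module_list))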
I was wondering how I can keep my batch norm layers active when using an untrained feature extraction network.
Would this be considered feature extraction with an "untrained" network?:
class DenseNetConv(torch.nn.Module):
    def __init__(self):
        super(DenseNetConv, self).__init__()
        original_model = models.densenet161(pretrained=False)
        self.features = torch.nn.Sequential(*list(original_model.children())[:-1])
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, x):
        x = self.features(x)
        x = F.relu(x, inplace=True)
        x = F.avg_pool2d(x, kernel_size=7).view(x.size(0), -1)
        return x
The above should return a tensor of shape [batch_size, 2208]; however, I want to make sure that by stating pretrained=False I am basically extracting features from an untrained network.
I then use the following to define the classifier layers:
class MyDenseNetDens(torch.nn.Module):
    def __init__(self, nb_out=2):
        super().__init__()
        self.dens1 = torch.nn.Linear(in_features=2208, out_features=512)
        self.dens2 = torch.nn.Linear(in_features=512, out_features=128)
        self.dens3 = torch.nn.Linear(in_features=128, out_features=nb_out)

    def forward(self, x):
        x = self.dens1(x)
        x = torch.nn.functional.selu(x)
        x = F.dropout(x, p=0.25, training=self.training)
        x = self.dens2(x)
        x = torch.nn.functional.selu(x)
        x = F.dropout(x, p=0.25, training=self.training)
        x = self.dens3(x)
        return x
and finally join them together here:
class MyDenseNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.mrnc = MyDenseNetConv()
        self.mrnd = MyDenseNetDens()

    def forward(self, x):
        x = self.mrnc(x)
        x = self.mrnd(x)
        return x

densenet = MyDenseNet()
densenet.to(device)
densenet.train()
If I allow this to train, for example by applying densenet.train(), will this be sufficient to let batch normalisation statistics be generated for each mini-batch, and to let the running means and standard deviations be learnt and applied during inference, while keeping the convolutional layers untrained?
Unfortunately, it will update the running stats anyway, since the default batch norm layers in your self.mrnc are still initialized with a running mean and a running variance. All you need to do is shut them down:
class MyDenseNetConv(torch.nn.Module):  # I renamed it to match your code
    def __init__(self):
        super(MyDenseNetConv, self).__init__()
        original_model = models.densenet161(pretrained=False)
        for n, v in original_model.named_modules():  # this part removes the tracking-stat option
            if 'norm' in n:
                v.track_running_stats = False
                v.running_mean = None
                v.running_var = None
                v.num_batches_tracked = None
        self.features = torch.nn.Sequential(*list(original_model.children())[:-1])
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, x):
        x = self.features(x)
        x = F.relu(x, inplace=True)
        x = F.avg_pool2d(x, kernel_size=7).view(x.size(0), -1)
        return x
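A quick way to verify this (a minimal sketch; it only assumes the MyDenseNetConv class above and its torchvision imports are in scope) is to confirm the norm layers no longer carry running buffers, which means they normalize with per-batch statistics in both train and eval mode:

import torch

model = MyDenseNetConv()
norm_layers = [m for m in model.modules() if isinstance(m, torch.nn.BatchNorm2d)]
print(all(m.running_mean is None and m.running_var is None for m in norm_layers))
# True: with the buffers removed, batch statistics are used in train() and eval() alike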
I want to convert https://web.casadi.org/blog/tensorflow/, which was written in TensorFlow 1 with CasADi, to TensorFlow 2. I have changed the code, yet tf.disable_v2_behavior() had to be used to get it working.
import casadi as ca
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

class TensorFlowEvaluator(ca.Callback):
    def __init__(self, t_in, t_out, session, opts={}):
        """
        t_in: list of inputs (tensorflow placeholders)
        t_out: list of outputs (tensors dependent on those placeholders)
        session: a tensorflow session
        """
        ca.Callback.__init__(self)
        assert isinstance(t_in, list)
        self.t_in = t_in
        assert isinstance(t_out, list)
        self.t_out = t_out
        self.construct("TensorFlowEvaluator", opts)
        self.session = session
        self.refs = []

    def get_n_in(self): return len(self.t_in)
    def get_n_out(self): return len(self.t_out)

    def get_sparsity_in(self, i):
        return ca.Sparsity.dense(*self.t_in[i].get_shape().as_list())

    def get_sparsity_out(self, i):
        return ca.Sparsity.dense(*self.t_out[i].get_shape().as_list())

    def eval(self, arg):
        # Associate each tensorflow input with the numerical argument passed by CasADi
        d = dict((v, arg[i].toarray()) for i, v in enumerate(self.t_in))
        # Evaluate the tensorflow expressions
        ret = self.session.run(self.t_out, feed_dict=d)
        return ret

    # Vanilla tensorflow offers just the reverse mode AD
    def has_reverse(self, nadj): return nadj == 1

    def get_reverse(self, nadj, name, inames, onames, opts):
        # Construct tensorflow placeholders for the reverse seeds
        adj_seed = [tf.placeholder(shape=self.sparsity_out(i).shape, dtype=tf.float64) for i in range(self.n_out())]
        # Construct the reverse tensorflow graph through 'gradients'
        grad = tf.gradients(self.t_out, self.t_in, grad_ys=adj_seed)
        # Create another TensorFlowEvaluator object
        callback = TensorFlowEvaluator(self.t_in + adj_seed, grad, self.session)
        # Make sure you keep a reference to it
        self.refs.append(callback)
        # Package it in the nominal_in+nominal_out+adj_seed form that CasADi expects
        nominal_in = self.mx_in()
        nominal_out = self.mx_out()
        adj_seed = self.mx_out()
        return ca.Function(name, nominal_in + nominal_out + adj_seed, callback.call(nominal_in + adj_seed), inames, onames)

if __name__ == "__main__":
    a = tf.placeholder(shape=(2, 2), dtype=tf.float64)
    b = tf.placeholder(shape=(2, 1), dtype=tf.float64)
    y = tf.matmul(tf.sin(a), b)

    with tf.Session() as session:
        f_tf = TensorFlowEvaluator([a, b], [y], session)
        a = ca.MX.sym("a", 2, 2)
        b = ca.MX.sym("a", 2, 1)
        y = f_tf(a, b)
        yref = ca.mtimes(ca.sin(a), b)
        f = ca.Function('f', [a, b], [ca.jacobian(y, a)])
        fref = ca.Function('f', [a, b], [ca.jacobian(yref, a)])
        print(f(ca.DM([[1, 2], [3, 4]]), ca.DM([[1], [3]])))
        print(fref(ca.DM([[1, 2], [3, 4]]), ca.DM([[1], [3]])))
Now I want to write this purely using TensorFlow 2.x. Since eager execution is enabled by default, I was thinking of using @tf.function to calculate the gradient:
@tf.function
def f_k(input_dat):
    y = tf.matmul(tf.sin(input_dat[0]), input_dat[1])
    grads = tf.gradients([y], input_dat)
    # grads = tape.gradient([y], input_dat)
    tf.print('tf >>', grads)
    print('print >>', grads)
    return y, grads
Here is the updated code at the moment:
import casadi as ca
import tensorflow as tf
from casadi import Sparsity

class TensorFlowEvaluator(ca.Callback):
    def __init__(self, t_in, t_out, model, opts={}):
        """
        t_in: list of inputs (tensorflow placeholders)
        t_out: list of outputs (tensors dependent on those placeholders)
        """
        ca.Callback.__init__(self)
        assert isinstance(t_in, list)
        self.t_in = t_in
        assert isinstance(t_out, list)
        self.t_out = t_out
        self.construct("TensorFlowEvaluator", opts)
        self.refs = []
        self.model = model

    def get_n_in(self): return len(self.t_in)
    def get_n_out(self): return len(self.t_out)

    def get_sparsity_in(self, i):
        tensor_shape = self.t_in[i].get_shape().as_list()
        return Sparsity.dense(tensor_shape[0], tensor_shape[1])
        # return Sparsity.dense(4, 1)

    def get_sparsity_out(self, i):
        return Sparsity.dense(2, 1)

    def eval(self, arg):
        # Associate each tensorflow input with the numerical argument passed by CasADi
        print(arg)
        # d = dict((v, arg[i].toarray()) for i, v in enumerate(self.t_in))
        updated_t = []
        for i, v in enumerate(self.t_in):
            updated_t.append(tf.Variable(arg[i].toarray()))
        # Evaluate the tensorflow expressions
        if not tf.is_tensor(self.t_out[0]):
            ret = self.t_out[0](updated_t)[0].numpy()
        else:
            ret = self.t_out[0](updated_t).numpy()
        return [ca.DM(ret)]

    # Vanilla tensorflow offers just the reverse mode AD
    def has_reverse(self, nadj): return nadj == 1

    def get_reverse(self, nadj, name, inames, onames, opts):
        initializer = tf.random_normal_initializer(mean=1., stddev=2.)
        adj_seed = [tf.Variable(initializer(shape=self.sparsity_out(i).shape, dtype=tf.float64)) for i in range(self.n_out())]
        tf.config.run_functions_eagerly(False)
        print("=============== self.t_in========", self.t_out)
        print("=============== self.t_out========", self.t_in)
        # grad = tape.gradient(mean, self.t_in, output_gradients=adj_seed)
        out_, grad = self.t_out[0](self.t_in)
        print("============== grad========", grad)
        # Create another TensorFlowEvaluator object
        callback = TensorFlowEvaluator(self.t_in + adj_seed, grad, self.model)
        # Make sure you keep a reference to it
        self.refs.append(callback)
        # Package it in the nominal_in+nominal_out+adj_seed form that CasADi expects
        nominal_in = self.mx_in()
        nominal_out = self.mx_out()
        adj_seed = self.mx_out()
        return ca.Function(name, nominal_in + nominal_out + adj_seed, callback.call(nominal_in + adj_seed), inames, onames)

if __name__ == "__main__":
    initializer = tf.random_normal_initializer(mean=1., stddev=2.)
    a = tf.Variable(initializer(shape=(2, 2), dtype=tf.float64))
    b = tf.Variable(initializer(shape=(2, 1), dtype=tf.float64))

    @tf.function
    def f_k(input_dat):
        y = tf.matmul(tf.sin(input_dat[0]), input_dat[1])
        grads = tf.gradients([y], input_dat)
        # grads = tape.gradient([y], input_dat)
        tf.print('tf >>', grads)
        print('print >>', grads)
        return y, grads

    f_tf = TensorFlowEvaluator([a, b], [f_k], None)
    a = ca.MX.sym("a", 2, 2)
    b = ca.MX.sym("a", 2, 1)
    y = f_tf(a, b)
    yref = ca.mtimes(ca.sin(a), b)
    f = ca.Function('f', [a, b], [ca.jacobian(y, a)])
    fref = ca.Function('f', [a, b], [ca.jacobian(yref, a)])
    print(fref(ca.DM([[1, 2], [3, 4]]), ca.DM([[1], [3]])))
    print(f(ca.DM([[1, 2], [3, 4]]), ca.DM([[1], [3]])))
Problem:
In the get_reverse method, when calculating the gradient, i.e. grad = tf.gradients(self.t_out, self.t_in, grad_ys=adj_seed), I get the symbolic form, i.e. [<tf.Tensor 'gradients/Sin_grad/mul:0' shape=(2, 2) dtype=float32>, <tf.Tensor 'gradients/MatMul_grad/MatMul_1:0' shape=(2, 1) dtype=float32>], in TensorFlow 1.
However, in TensorFlow 2, I always get numerical results. I can access the graph, but it is not callable: self.t_out[0].get_concrete_function(self.t_in).graph, similar to here.
What would be a better way to get the symbolic gradient, as in TensorFlow 1?
Expected Behaviour:
out_, grad = self.t_out[0](self.t_in)
grad should return the symbolic form of the gradient rather than a numerical evaluation.
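One possible direction (a sketch, not a confirmed solution for the CasADi callback): in TF2, the usual replacement for tf.gradients with grad_ys is tf.GradientTape with its output_gradients argument, wrapped in a tf.function so the reverse pass is built as a traceable graph instead of being evaluated eagerly:

import tensorflow as tf

@tf.function
def reverse_pass(a, b, seed):
    # Trace the forward computation under a tape, then seed the reverse
    # pass with `seed` (the analogue of grad_ys in tf.gradients)
    with tf.GradientTape() as tape:
        tape.watch([a, b])
        y = tf.matmul(tf.sin(a), b)
    return tape.gradient(y, [a, b], output_gradients=seed)

a = tf.constant([[1., 2.], [3., 4.]], dtype=tf.float64)
b = tf.constant([[1.], [3.]], dtype=tf.float64)
seed = tf.ones((2, 1), dtype=tf.float64)
grads = reverse_pass(a, b, seed)  # inside the trace, the gradients are symbolic tensors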
I'm working on a seq2seq RNN generating an output sequence of labels given a seed label. During the inference step I'd like to generate sequences containing only unique labels (i.e. skip labels that have already been added to the output sequence). To do this I created a sampler object that tries to remember the labels that have been added to the output and reduce their logit value to -np.inf.
Here is the sampler code:
class InferenceSampler(object):
    def __init__(self, out_weights, out_biases):
        self._out_weights = tf.transpose(out_weights)
        self._out_biases = out_biases
        self._n_tracks = out_weights.shape[0]
        self.ids_mask = tf.zeros([self._n_tracks], name="playlist_mask")

    def __call__(self, decoder_outputs):
        _logits = tf.matmul(decoder_outputs, self._out_weights)
        _logits = tf.nn.bias_add(_logits, self._out_biases)
        # apply mask
        _logits = _logits + self.ids_mask
        _sample_ids = tf.cast(tf.argmax(_logits, axis=-1), tf.int32)
        # update mask
        step_ids_mask = tf.sparse_to_dense(_sample_ids, [self._n_tracks], -np.inf)
        self.ids_mask = self.ids_mask + step_ids_mask
        return _sample_ids
The code of the inference graph looks like this:
self._max_playlist_len = tf.placeholder(tf.int32, ())
self._start_tokens = tf.placeholder(tf.int32, [None])

sample_fn = InferenceSampler(out_weights, out_biases)

with tf.name_scope("inf_decoder"):
    def _end_fn(sample_ids):
        return tf.equal(sample_ids, PAD_ITEM_ID)

    def _next_inputs_fn(sample_ids):
        return tf.nn.embedding_lookup(
            track_embs,
            sample_ids
        )

    _start_inputs = tf.nn.embedding_lookup(
        track_embs,
        self._start_tokens
    )

    helper = tf.contrib.seq2seq.InferenceHelper(
        sample_fn=sample_fn,
        sample_shape=[],
        sample_dtype=tf.int32,
        start_inputs=_start_inputs,
        end_fn=_end_fn,
        next_inputs_fn=_next_inputs_fn
    )

    decoder = tf.contrib.seq2seq.BasicDecoder(
        rnn_cell,
        helper,
        rnn_cell.zero_state(tf.shape(self._start_tokens)[0], tf.float32),
        output_layer=projection_layer
    )

    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder,
        maximum_iterations=self._max_playlist_len
    )

self.playlists = outputs.sample_id
Unfortunately, the results still contain duplicated labels. Moreover, when I try to access sample_fn.ids_mask, I receive an error message: ValueError: Operation 'inf_decoder/decoder/while/BasicDecoderStep/add_1' has been marked as not fetchable.
What am I doing wrong? And how legal is it to create such a sample_fn?
Trying to overcome the problem, I updated the inference in such a way that at each RNN step I output the vector of embeddings instead of an item_id. After the inference is finished, I convert the embeddings to item_ids.
First of all, this solution minimizes the number of operations. Secondly, since I use LSTM/GRU cells, they minimize the probability of observing two identical outputs on different steps of the RNN's inference.
The new code looks like this:
with tf.name_scope("inf_decoder"):
def _sample_fn(decoder_outputs):
return decoder_outputs
def _end_fn(sample_ids):
# infinite
return tf.tile([False], [n_seeds])
_start_inputs = tf.nn.embedding_lookup(
track_embs,
self._seed_items
)
helper = tf.contrib.seq2seq.InferenceHelper(
sample_fn=_sample_fn,
sample_shape=[self.emb_size],
sample_dtype=tf.float32,
start_inputs=_start_inputs,
end_fn=_end_fn,
)
decoder = tf.contrib.seq2seq.BasicDecoder(
rnn_cell,
helper,
rnn_cell.zero_state(n_seeds, tf.float32),
output_layer=projection_layer
)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder,
maximum_iterations=self._max_playlist_len
)
flat_rnn_output = tf.reshape(outputs.rnn_output, [-1, self.emb_size])
flat_logits = tf.matmul(flat_rnn_output, out_weights, transpose_b=True)
flat_logits = tf.nn.bias_add(flat_logits, out_biases)
item_ids = tf.cast(tf.argmax(flat_logits, axis=-1), tf.int32)
playlists = tf.reshape(item_ids, [n_seeds, -1])
self.playlists = playlists
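Note that this makes duplicates unlikely rather than impossible; if hard uniqueness is required, the decoded playlists can still be deduplicated afterwards (a minimal NumPy sketch, assuming playlists has been fetched as an integer array):

import numpy as np

def dedup_playlist(item_ids):
    # keep the first occurrence of each item, preserving order
    _, first_idx = np.unique(item_ids, return_index=True)
    return item_ids[np.sort(first_idx)]

playlists = np.array([[3, 7, 3, 1], [5, 5, 2, 9]])
print([dedup_playlist(p) for p in playlists])  # [array([3, 7, 1]), array([5, 2, 9])]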
So, after some investigation, I found answers to all my questions related to this thread. The main question was: why does self.ids_mask in InferenceSampler not update? The reason lies in the internals of dynamic_decode. According to this answer in TensorFlow's issue tracker:
... only tensors defined inside the loop will be evaluated
every loop iteration. All tensors defined outside a loop will be
evaluated exactly once.
In my case, self.ids_mask is specified outside the loop, which means that I need to rewrite dynamic_decode to get what I want. The code below is a slightly modified version of the initial task, but it does almost the same thing.
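To illustrate the quoted behaviour with a tiny standalone sketch (TF1-style, for illustration only): state is only updated across iterations when it is threaded through loop_vars; a tensor assigned to a Python attribute outside the loop, like self.ids_mask, is evaluated once and never re-fed:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

mask = tf.zeros([3])  # defined outside the while_loop

def body(i, loop_mask):
    # loop_mask is threaded through loop_vars, so this update persists;
    # re-assigning the outer `mask` here would not
    return i + 1, loop_mask + tf.one_hot(i, 3)

_, final_mask = tf.while_loop(lambda i, m: i < 3, body, [tf.constant(0), mask])

with tf.Session() as sess:
    print(sess.run(final_mask))  # [1. 1. 1.]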
Let's start with a new dynamic_decode, which should create and update the mask collecting the sample_ids that have already been predicted. I removed the code I didn't modify; follow the initial_mask and mask variables.
New dynamic_decode:
def dynamic_decode(decoder,
                   output_time_major=False,
                   impute_finished=False,
                   maximum_iterations=None,
                   parallel_iterations=32,
                   swap_memory=False,
                   scope=None):
    ...
    initial_finished, initial_inputs, initial_mask, initial_state = decoder.initialize()
    ...
    def body(time, outputs_ta, state, inputs, finished, sequence_lengths, mask):
        """Internal while_loop body.

        Args:
          time: scalar int32 tensor.
          outputs_ta: structure of TensorArray.
          state: (structure of) state tensors and TensorArrays.
          inputs: (structure of) input tensors.
          finished: bool tensor (keeping track of what's finished).
          sequence_lengths: int32 tensor (keeping track of time of finish).
          mask: SparseTensor to remove already predicted items.

        Returns:
          `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
            next_sequence_lengths, next_mask)`.
        """
        (next_outputs, decoder_state, next_inputs, next_mask,
         decoder_finished) = decoder.step(time, inputs, state, mask)
        ...
        nest.assert_same_structure(state, decoder_state)
        nest.assert_same_structure(outputs_ta, next_outputs)
        nest.assert_same_structure(inputs, next_inputs)
        nest.assert_same_structure(mask, next_mask)
        ...
        return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
                next_sequence_lengths, next_mask)

    res = control_flow_ops.while_loop(
        condition,
        body,
        loop_vars=[
            initial_time, initial_outputs_ta, initial_state, initial_inputs,
            initial_finished, initial_sequence_lengths, initial_mask,
        ],
        parallel_iterations=parallel_iterations,
        swap_memory=swap_memory)
    ...
    return final_outputs, final_state, final_sequence_lengths
In the next step, the mask should be passed to the Decoder and the Helper. Here are the updated versions of BasicDecoder and InferenceHelper:
MaskedDecoder:
class MaskedDecoder(BasicDecoder):
    def step(self, time, inputs, state, mask, name=None):
        with ops.name_scope(name, "MaskedDecoderStep", (time, inputs, state, mask)):
            cell_outputs, cell_state = self._cell(inputs, state)
            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)
            sample_ids = self._helper.sample(
                time=time,
                outputs=cell_outputs,
                state=cell_state,
                mask=mask)
            (finished, next_inputs, next_state, next_mask) = self._helper.next_inputs(
                time=time,
                outputs=cell_outputs,
                state=cell_state,
                mask=mask,
                sample_ids=sample_ids)
        outputs = BasicDecoderOutput(cell_outputs, sample_ids)
        return (outputs, next_state, next_inputs, next_mask, finished)
MaskedInferenceHelper:
class MaskedInferenceHelper(Helper):
    """A helper to use during inference with a custom sampling function."""

    def __init__(self, norm_track_embs, features, start_sample_ids):
        self._norm_track_embs = norm_track_embs
        self._batch_size = tf.shape(start_sample_ids)[0]
        self._n_tracks = tf.shape(norm_track_embs)[0]
        self._start_sample_ids = start_sample_ids
        self._sample_shape = tf.TensorShape([])
        self._sample_dtype = tf.int32
        self._features = features

    def _get_sparse_mask(self, sample_ids):
        _mask_shape = tf.convert_to_tensor([
            tf.cast(self._batch_size, dtype=tf.int64),
            tf.cast(self._n_tracks, dtype=tf.int64)
        ])
        _st_rows = tf.range(0, self._batch_size)
        _st_cols = sample_ids
        _st_indices = tf.cast(tf.stack([_st_rows, _st_cols], axis=1), dtype=tf.int64)
        _st_values = tf.fill([self._batch_size], np.inf)
        return tf.SparseTensor(_st_indices, _st_values, _mask_shape)

    ...

    def initialize(self, name=None):
        finished = tf.tile([False], [self._batch_size])
        start_embs = tf.nn.embedding_lookup(self._norm_track_embs, self._start_sample_ids)
        start_inputs = tf.concat([start_embs, self._features], axis=1)
        mask = self._get_sparse_mask(self._start_sample_ids)
        return finished, start_inputs, mask

    def sample(self, time, outputs, state, mask, name=None):
        del time, state  # unused by sample
        outputs = tf.nn.l2_normalize(outputs, axis=-1)
        cos_sims = tf.matmul(outputs, self._norm_track_embs, transpose_b=True)
        cos_sims = cos_sims - tf.sparse_tensor_to_dense(mask)
        sample_ids = tf.cast(tf.argmax(cos_sims, axis=-1), tf.int32)
        return sample_ids

    def next_inputs(self, time, outputs, state, sample_ids, mask, name=None):
        del time, outputs  # unused by next_inputs
        finished = tf.tile([False], [self._batch_size])
        next_embs = tf.nn.embedding_lookup(self._norm_track_embs, sample_ids)
        next_inputs = tf.concat([next_embs, self._features], axis=1)
        next_mask = tf.sparse_add(mask, self._get_sparse_mask(sample_ids))
        return finished, next_inputs, state, next_mask
So now I can generate inferences without repeating already predicted items.
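For completeness, the pieces would be wired together roughly like this (a sketch only; it assumes the modified dynamic_decode, MaskedDecoder and MaskedInferenceHelper above are in scope, and that track_embs, features, rnn_cell, projection_layer and the placeholders are defined as in the question):

norm_track_embs = tf.nn.l2_normalize(track_embs, axis=-1)

helper = MaskedInferenceHelper(
    norm_track_embs=norm_track_embs,
    features=features,
    start_sample_ids=self._start_tokens
)
decoder = MaskedDecoder(
    rnn_cell,
    helper,
    rnn_cell.zero_state(tf.shape(self._start_tokens)[0], tf.float32),
    output_layer=projection_layer
)
# the custom dynamic_decode threads the mask through the while_loop,
# so already-sampled ids stay masked at every step
outputs, _, _ = dynamic_decode(decoder, maximum_iterations=self._max_playlist_len)
self.playlists = outputs.sample_id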
I've set up a print statement and noticed that for the first batch when feeding an RNN, the embeddings exist, but after the second batch they don't, and I get the following error:
ValueError: Variable RNNLM/RNNLM/Embedding/Adam_2/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
Here is my code for generating the embeddings:
def add_embedding(self):
    with tf.device('/gpu:0'):
        embedding = tf.get_variable("Embedding", [len(self.vocab), self.config.embed_size])
        e_x = tf.nn.embedding_lookup(embedding, self.input_placeholder)
        inputs = [tf.squeeze(s, [1]) for s in tf.split(1, self.config.num_steps, e_x)]
        return inputs
Here is how the model is set up; this is where I suspect the problem lies:
def model(self, inputs):
    with tf.variable_scope("input_drop"):
        inputs_drop = [tf.nn.dropout(i, self.dropout_placeholder) for i in inputs]

    with tf.variable_scope("RNN") as scope:
        self.initial_state = tf.zeros([self.config.batch_size, self.config.hidden_size], tf.float32)
        state = self.initial_state
        states = []
        for t, e in enumerate(inputs_drop):
            print "t is {0}".format(t)
            if t > 0:
                scope.reuse_variables()
            H = tf.get_variable("Hidden", [self.config.hidden_size, self.config.hidden_size])
            I = tf.get_variable("I", [self.config.embed_size, self.config.hidden_size])
            b_1 = tf.get_variable("b_1", (self.config.hidden_size,))
            state = tf.sigmoid(tf.matmul(state, H) + tf.matmul(e, I) + b_1)
            states.append(state)

    with tf.variable_scope("output_dropout"):
        rnn_outputs = [tf.nn.dropout(o, self.dropout_placeholder) for o in states]
    return rnn_outputs
The issue arises when I get to the training op, defined as follows:
def add_training_op(self, loss):
    opt = tf.train.AdamOptimizer(self.config.lr)
    train_op = opt.minimize(loss)
    return train_op
EDIT: Here is some updated code to help everyone out
def __init__(self, config):
    self.config = config
    self.load_data(debug=False)
    self.add_placeholders()
    self.inputs = self.add_embedding()
    self.rnn_outputs = self.add_model(self.inputs)
    self.outputs = self.add_projection(self.rnn_outputs)
    self.predictions = [tf.nn.softmax(tf.cast(o, 'float64')) for o in self.outputs]
    output = tf.reshape(tf.concat(1, self.outputs), [-1, len(self.vocab)])
    self.calculate_loss = self.add_loss_op(output)
    self.train_step = self.add_training_op(self.calculate_loss)
Here are the other methods, pertaining to add_projection and add_loss_op, so we can rule them out:
def add_loss_op(self, output):
    weights = tf.ones([self.config.batch_size * self.config.num_steps], tf.int32)
    seq_loss = tf.python.seq2seq.sequence_loss(
        [output],
        tf.reshape(self.labels_placeholder, [-1]),
        weights
    )
    tf.add_to_collection('total_loss', seq_loss)
    loss = tf.add_n(tf.get_collection('total_loss'))
    return loss

def add_projection(self, rnn_outputs):
    with tf.variable_scope("Projection", initializer=tf.contrib.layers.xavier_initializer()) as scope:
        U = tf.get_variable("U", [self.config.hidden_size, len(self.vocab)])
        b_2 = tf.get_variable("b_2", [len(self.vocab)])
        outputs = [tf.matmul(x, U) + b_2 for x in rnn_outputs]
        return outputs
def train_RNNLM():
    config = Config()
    gen_config = deepcopy(config)
    gen_config.batch_size = gen_config.num_steps = 1

    with tf.variable_scope('RNNLM') as scope:
        model = RNNLM_Model(config)
        # This instructs gen_model to reuse the same variables as the model above
        scope.reuse_variables()
        gen_model = RNNLM_Model(gen_config)

    init = tf.initialize_all_variables()
    saver = tf.train.Saver()

    with tf.Session() as session:
        best_val_pp = float('inf')
        best_val_epoch = 0
        session.run(init)
        for epoch in xrange(config.max_epochs):
            print 'Epoch {}'.format(epoch)
            start = time.time()
            ###
            train_pp = model.run_epoch(
                session, model.encoded_train,
                train_op=model.train_step)
            valid_pp = model.run_epoch(session, model.encoded_valid)
            print 'Training perplexity: {}'.format(train_pp)
            print 'Validation perplexity: {}'.format(valid_pp)
            if valid_pp < best_val_pp:
                best_val_pp = valid_pp
                best_val_epoch = epoch
                saver.save(session, './ptb_rnnlm.weights')
            if epoch - best_val_epoch > config.early_stopping:
                break
            print 'Total time: {}'.format(time.time() - start)
It seems that the code is trying to create a new Adam variable in each batch. Is it possible that add_training_op is called twice?
The problem turned out to be the following lines of code:

model = RNNLM_Model(config)
# This instructs gen_model to reuse the same variables as the model above
scope.reuse_variables()
gen_model = RNNLM_Model(gen_config)
It turns out that creating the second model under reuse_variables() was the issue: its add_training_op call asked Adam to create new slot variables inside a scope already marked for reuse, where new variables cannot be created. By removing this line, my issues went away.
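For context, the error pattern ("Variable .../Adam_2/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?") is the classic symptom of calling opt.minimize() inside a reusing scope: Adam requests its slot variables through tf.get_variable, which refuses to create new variables there. A minimal TF1-style reproduction (an illustrative sketch, not the poster's code):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.variable_scope('demo') as scope:
    w = tf.get_variable('w', [2])  # created fresh
    loss = tf.reduce_sum(w * w)
    scope.reuse_variables()
    # minimize() tries to create Adam's slot variables ('demo/w/Adam',
    # 'demo/w/Adam_1') via tf.get_variable; with reuse=True they don't
    # exist yet, so this raises the "does not exist" ValueError
    train_op = tf.train.AdamOptimizer(0.01).minimize(loss)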