I am trying to experiment with different implementations of a VAE in TensorFlow Keras. With the following model I get an error that no gradients are provided for any variable in any layer.
import tensorflow as tf
from tensorflow import keras

tfk = tf.keras
tfkl = tf.keras.layers


class sampling2(tfk.layers.Layer):
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch_size = tf.shape(z_mean)[0]
        dim_z = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch_size, dim_z))
        z_sample = z_mean + tf.exp(0.5 * z_log_var) * epsilon
        return z_sample


class encoder2(tfk.layers.Layer):
    def __init__(self, latent_dim=30, intermediate_dim=200, name='encoder2', **kwargs):
        super(encoder2, self).__init__(name=name, **kwargs)
        self.dense_1 = tfkl.Dense(intermediate_dim, activation="relu")
        self.dense_mean = tfkl.Dense(latent_dim)
        self.dense_log_var = tfkl.Dense(latent_dim)
        self.sampling = sampling2()

    def call(self, inputs):
        x = self.dense_1(inputs)
        z_mean = self.dense_mean(x)
        z_log_var = self.dense_log_var(x)
        z = self.sampling((z_mean, z_log_var))
        return z_mean, z_log_var, z


class decoder2(tfk.layers.Layer):
    def __init__(self, original_dim, intermediate_dim=200, name='decoder2', **kwargs):
        super(decoder2, self).__init__(name=name, **kwargs)
        self.dense_1 = tfkl.Dense(intermediate_dim, activation='relu')
        self.dense_output = tfkl.Dense(original_dim, activation='sigmoid')

    def call(self, inputs):
        x = self.dense_1(inputs)
        logits = self.dense_output(x)
        return logits


class VAE2(tfk.Model):
    def __init__(self, original_dim, intermediate_dim=800, latent_dim=50,
                 name='VAE2', **kwargs):
        super(VAE2, self).__init__(name=name, **kwargs)
        self.original_dim = original_dim
        self.encoder = encoder2(latent_dim=latent_dim, intermediate_dim=intermediate_dim)
        self.decoder = decoder2(original_dim, intermediate_dim=intermediate_dim)

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        return reconstructed

    def training_step(self, inputs):
        dense_train_batch = tf.sparse.to_dense(inputs)
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(dense_train_batch)
            reconstructed = self.decoder(z)
            kl_loss = -0.5 * tf.reduce_mean(z_log_var - tf.square(z_mean) -
                                            tf.exp(z_log_var) + 1)
            self.add_loss(kl_loss)
        grads = tape.gradient(loss, self.trainable_weights)
        optimizer.apply_gradients(zip(grads, self.trainable_weights))


loss_fn = keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

vae2 = VAE2(df_track_names_reduced.shape[0])
vae2.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001, amsgrad=True), loss=loss_fn)
vae2.fit(train_dataset, epochs=20)
Below is the error message:
ValueError: No gradients provided for any variable: ['VAE2/encoder2/dense_8/kernel:0', 'VAE2/encoder2/dense_8/bias:0', 'VAE2/encoder2/dense_9/kernel:0', 'VAE2/encoder2/dense_9/bias:0', 'VAE2/encoder2/dense_10/kernel:0', 'VAE2/encoder2/dense_10/bias:0', 'VAE2/decoder2/dense_11/kernel:0', 'VAE2/decoder2/dense_11/bias:0', 'VAE2/decoder2/dense_12/kernel:0', 'VAE2/decoder2/dense_12/bias:0'].
You have to pass a loss tensor to tape.gradient, not a function. Calculate the binary cross-entropy loss, add it to kl_loss (loss = binary_loss + kl_loss), and then pass that to tape.gradient().
If you apply gradients manually, you should not call model.compile() and model.fit(); build a custom training loop instead. See here: https://keras.io/guides/writing_a_training_loop_from_scratch/.
But I don't think you really need to apply gradients manually. I would just add kl_loss within the call function. See here: https://keras.io/api/losses/
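To make the second suggestion concrete, here is a minimal sketch (assuming the encoder2/decoder2 layers and train_dataset from the question, with the dataset yielding (input, input) pairs): the KL term is registered via add_loss() inside call(), the reconstruction loss comes from compile(), and fit() computes gradients for both.

# Sketch only: reuses encoder2/decoder2 and train_dataset from the question.
class VAE2(tfk.Model):
    def __init__(self, original_dim, intermediate_dim=800, latent_dim=50, name='VAE2', **kwargs):
        super(VAE2, self).__init__(name=name, **kwargs)
        self.encoder = encoder2(latent_dim=latent_dim, intermediate_dim=intermediate_dim)
        self.decoder = decoder2(original_dim, intermediate_dim=intermediate_dim)

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Register the KL divergence here; fit() adds it to the compiled loss.
        kl_loss = -0.5 * tf.reduce_mean(z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
        self.add_loss(kl_loss)
        return reconstructed

vae2 = VAE2(original_dim)
vae2.compile(optimizer=tf.keras.optimizers.Adam(1e-3, amsgrad=True),
             loss=tf.keras.losses.BinaryCrossentropy())
# For the reconstruction loss the targets must be the inputs themselves.
vae2.fit(train_dataset, epochs=20)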
I am trying to fight overfitting, which is why I decided to look through the documentation (https://pytorch-lightning.readthedocs.io/en/stable/common/evaluation_basic.html#train-with-the-validation-loop), where I found that you can pass a training and a validation dataloader to Trainer.fit. The question is: should I use this method, or can I simply pass the DataModule to Trainer.fit to prevent overfitting?
DataModule code:
class ClassifierDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset: pd.DataFrame, val_dataset: pd.DataFrame, batch_size: int):
        super().__init__()
        self.prepare_data_per_node = False
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=os.cpu_count())

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=True, num_workers=os.cpu_count())


data_module_classifier = ClassifierDataModule(train_dataset, val_dataset, BATCH_SIZE)
And here is my Trainer.fit():
model = MulticlassClassificationLIGHT(class_weights)
# trainer.fit(model, data_module_classifier)  # SHOULD I USE THIS METHOD TO PREVENT OVERFITTING
trainer.fit(model, data_module_classifier.train_dataloader(), data_module_classifier.val_dataloader())  # OR THIS ONE?
My LightningModule just in case:
class MulticlassClassificationLIGHT(pl.LightningModule):
    def __init__(self, class_weights):
        super(MulticlassClassificationLIGHT, self).__init__()
        self.num_feature = 35
        self.num_class = 36
        self.layer_1 = nn.Linear(self.num_feature, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, self.num_class)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)
        self.loss = nn.CrossEntropyLoss(weight=class_weights.to(device))

    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.loss(logits, y)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.loss(logits, y)
        self.log("val_loss", loss, prog_bar=True, logger=True)  # I ask Trainer to "ModelCheckpoint" this loss
        return loss
Passing a validation dataloader during training does not fix overfitting. It lets you measure the overfitting/underfitting of the model: for a well-fit model we want the performance on validation data to be close to the performance on training data.
Regarding the syntax, this should work:
trainer.fit(model=model, train_dataloaders=data_module_classifier.train_dataloader(), val_dataloaders=data_module_classifier.val_dataloader())
The documentation for fit is here: https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api
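Passing the LightningDataModule directly is equivalent, since Trainer.fit calls its train_dataloader()/val_dataloader() hooks for you. A minimal sketch of how the logged val_loss is then typically acted on; the EarlyStopping and ModelCheckpoint callbacks are my addition here, not part of the original post:

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

# Stop training and keep the best weights based on the "val_loss" logged in validation_step.
callbacks = [
    EarlyStopping(monitor="val_loss", mode="min", patience=5),
    ModelCheckpoint(monitor="val_loss", mode="min"),
]

trainer = pl.Trainer(max_epochs=100, callbacks=callbacks)
trainer.fit(model, datamodule=data_module_classifier)  # same as passing both dataloaders explicitly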
I'm trying to use an LSTM and a transformer for binary classification, but this does not improve performance over a plain LSTM model; sometimes it even gets worse. The training data has shape (3014, 48, 178) and is time-series medical data. The following code is the transformer part.
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.001):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class PositionEmbeddingFixedWeights(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)
        position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)
        self.position_embedding_layer = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d / 2)):
                denominator = np.power(n, 2 * i / d)
                P[k, 2 * i] = np.sin(k / denominator)
                P[k, 2 * i + 1] = np.cos(k / denominator)
        return P

    def call(self, inputs):
        position_indices = tf.range(tf.shape(inputs)[-2])
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_indices
The model code is:
model = Sequential([
    tf.keras.Input(shape=(48, 178)),
    BatchNormalization(),
    tf.keras.layers.GRU(units=128, recurrent_dropout=0.5, activation='tanh', dropout=0.5,
                        return_sequences=True, activity_regularizer=regularizers.L2(0.01)),
    TransformerBlock(128, 48, 178),
    tf.keras.layers.GlobalAveragePooling1D(),
    Dense(60, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
This has troubled me for a long time. I'm trying to use the LSTM to deal with the time-series features and the transformer to learn the importance between features, but it does not seem to work.
I am running AlexNet on the CIFAR10 dataset using PyTorch Lightning; here is my model:
class SelfSupervisedModel(pl.LightningModule):
    def __init__(self, hparams=None, num_classes=10, batch_size=128):
        super(SelfSupervisedModel, self).__init__()
        self.batch_size = batch_size
        self.loss_fn = nn.CrossEntropyLoss()
        self.hparams["lr"] = ModelHelper.Hyperparam.Learning_rate
        self.model = torchvision.models.alexnet(pretrained=False)

    def forward(self, x):
        return self.model(x)

    def training_step(self, train_batch, batch_idx):
        inputs, targets = train_batch
        predictions = self(inputs)
        loss = self.loss_fn(predictions, targets)
        return {'loss': loss}

    def validation_step(self, test_batch, batch_idx):
        inputs, targets = test_batch
        predictions = self(inputs)
        val_loss = self.loss_fn(predictions, targets)
        _, preds = torch.max(predictions, 1)
        acc = torch.sum(preds == targets.data) / (targets.shape[0] * 1.0)
        return {'val_loss': val_loss, 'val_acc': acc, 'target': targets, 'preds': predictions}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['val_acc'].float() for x in outputs]).mean()
        logs = {'val_loss': avg_loss, 'val_acc': avg_acc}
        print(f'validation_epoch_end logs => {logs}')
        OutputMatrix.predictions = torch.cat([tmp['preds'] for tmp in outputs])
        OutputMatrix.targets = torch.cat([tmp['target'] for tmp in outputs])
        return {'progress_bar': logs}

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.hparams["lr"], momentum=0.9)
I am storing the predicted and true values in OutputMatrix.predictions and OutputMatrix.targets, which are used to generate the confusion matrix.
I'm pretty sure that this should not be the output, though, and I cannot find where the mistake is. Any help would be appreciated.
I would suggest using TorchMetrics and the internal log method, so the code could look like:
class MyModule(LightningModule):
    def __init__(self):
        ...
        self.train_acc = torchmetrics.Accuracy()
        self.valid_acc = torchmetrics.Accuracy()

    def training_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        ...
        self.train_acc(preds, y)
        self.log('train_acc', self.train_acc, on_step=True, on_epoch=False)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        ...
        self.valid_acc(logits, y)
        self.log('valid_acc', self.valid_acc, on_step=True, on_epoch=True)
as you can also find in the docs related to PL integration.
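For reference, a minimal sketch of how this could be folded into the model from the question (the names follow that code; newer torchmetrics versions require the task/num_classes arguments shown below, while older versions take no arguments):

import torch
import torchvision
import torchmetrics
import pytorch_lightning as pl

class SelfSupervisedModel(pl.LightningModule):
    def __init__(self, num_classes=10):
        super().__init__()
        self.model = torchvision.models.alexnet(pretrained=False)
        self.loss_fn = torch.nn.CrossEntropyLoss()
        # On older torchmetrics versions this is simply torchmetrics.Accuracy().
        self.valid_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        logits = self.model(inputs)
        val_loss = self.loss_fn(logits, targets)
        self.valid_acc(logits, targets)  # the metric handles argmax internally
        self.log("val_loss", val_loss, prog_bar=True)
        self.log("val_acc", self.valid_acc, on_step=False, on_epoch=True)
        return val_loss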
I have been trying to implement the paper: SeER: An Explainable Deep Learning MIDI-based Hybrid Song Recommender System.
So, what I have been doing is this:
Model Code:
class HybridFactorization(tf.keras.layers.Layer):
    # embedding_size is also the number of lstm units
    # num_users, num_movies = input_shape
    # required_users: (batch_size, embedding_size)
    # songs_output: (batch_size, embedding_size)
    def __init__(self, embedding_size, num_users, num_tracks):
        super(HybridFactorization, self).__init__()
        self.embedding_size = embedding_size
        self.num_users = num_users
        self.num_tracks = num_tracks
        self.required_users = None
        self.U = self.add_weight("U",
                                 shape=[self.num_users, self.embedding_size],
                                 dtype=tf.float32,
                                 initializer=tf.initializers.GlorotUniform)
        self.lstm = tf.keras.layers.LSTM(self.embedding_size)

    def call(self, user_index, songs_batch):
        output_lstm = self.lstm(songs_batch)
        self.required_users = self.U.numpy()
        self.required_users = tf.convert_to_tensor(self.required_users[np.array(user_index)],
                                                   dtype=tf.float32)
        return tf.matmul(self.required_users, output_lstm, transpose_b=True)


class HybridRecommender(tf.keras.Model):
    def __init__(self, embedding_size, num_users, num_tracks):
        super(HybridRecommender, self).__init__()
        self.HybridFactorization = HybridFactorization(embedding_size,
                                                       num_users, num_tracks)

    def call(self, user_index, songs_batch):
        output = self.HybridFactorization(user_index, songs_batch)
        return output
Utility functions and running the model:
def loss_fn(source, target):
    mse = tf.keras.losses.MeanSquaredError()
    return mse(source, target)


model = HybridRecommender(EMBEDDING_SIZE, num_users, num_tracks)
Xhat = model(user_index, songs_batch)

tf.keras.backend.clear_session()
optimizer = tf.keras.optimizers.Adam()
EPOCHS = 1

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0
    for (batch, (input_batch, target_batch)) in enumerate(train_dataset):
        songs_batch = create_songs_batch(input_batch)
        user_index = input_batch[:, 0].numpy()
        X = create_pivot_batch(input_batch, target_batch)
        with tf.GradientTape() as tape:
            Xhat = model(user_index, songs_batch)
            batch_loss = loss_fn(X, Xhat)
        variables = model.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        total_loss += batch_loss
Now, various functions like create_songs_batch(input_batch) and create_pivot_batch(input_batch, target_batch) just provide data in the required format.
My model runs but I get the warning:
WARNING:tensorflow:Gradients do not exist for variables ['U:0'] when minimizing the loss.
Now, I can see why variable U is not being updated as there is no direct path to it.
I want to update some specific rows of U which are mentioned in user_index in every batch call.
Is there a way to do it?
So, I was able to solve the problem by not copying rows of U at all. Instead, I build a temporary matrix that is the one-hot encoding of user_index and multiply it with U to get the desired rows; this also removed the warning.
The part of the code that needs to be modified:
def call(self, user_index, songs_batch):
    # output_lstm: (batch_size, emb_sz)
    # batch_encoding: (batch_size, num_users)
    # required_users: (batch_size, emb_sz)
    output_lstm = self.lstm(songs_batch)
    user_idx = np.array(user_index)
    batch_encoding = np.zeros((user_idx.size, self.num_users))
    batch_encoding[np.arange(user_idx.size), user_idx] = 1
    batch_encoding = tf.convert_to_tensor(batch_encoding, dtype=tf.float32)
    self.required_users = tf.matmul(batch_encoding, self.U)
    return tf.matmul(self.required_users, output_lstm, transpose_b=True)
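A minimal variation of the same idea (my sketch, not the author's exact code) builds the one-hot selector with tf.one_hot inside the graph, so no NumPy round-trip is needed and gradients still reach the selected rows of U:

def call(self, user_index, songs_batch):
    output_lstm = self.lstm(songs_batch)                        # (batch_size, emb_sz)
    user_idx = tf.convert_to_tensor(user_index, dtype=tf.int32)
    batch_encoding = tf.one_hot(user_idx, depth=self.num_users, dtype=tf.float32)
    required_users = tf.matmul(batch_encoding, self.U)          # (batch_size, emb_sz)
    return tf.matmul(required_users, output_lstm, transpose_b=True)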
I have been trying to run some experiments using the DeepFix tool (https://bitbucket.org/iiscseal/deepfix), which is a seq2seq model for correcting common programming errors.
I made changes to the code so that it is compatible with TF 1.12, as the original code contains tensorflow.contrib.seq2seq functions which are not supported in TF 1.12 (only in TF 1.0.x).
The main changes were in the seq2seq_model defined in neural_net/train.py.
Below is the changed code. I'm new to TensorFlow RNNs and coded the decoder part with help from online examples.
class seq2seq_model():
    PAD = 0
    EOS = 1

    def __init__(self, vocab_size, embedding_size, max_output_seq_len,
                 cell_type='LSTM', memory_dim=300, num_layers=4, dropout=0.2,
                 attention=True,
                 scope=None,
                 verbose=False):
        assert 0 <= dropout and dropout <= 1, '0 <= dropout <= 1, you passed dropout={}'.format(
            dropout)
        tf.set_random_seed(1189)
        self.attention = attention
        self.max_output_seq_len = max_output_seq_len
        self.memory_dim = memory_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.scope = scope
        if dropout != 0:
            self.keep_prob = tf.placeholder(tf.float32)
        else:
            self.keep_prob = None
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.encoder_cell = _new_RNN_cell(
            memory_dim, num_layers, cell_type, dropout, self.keep_prob)
        self.decoder_cell = _new_RNN_cell(
            memory_dim, num_layers, cell_type, dropout, self.keep_prob)
        self._make_graph()
        if self.scope is not None:
            saver_vars = [var for var in tf.global_variables(
            ) if var.name.startswith(self.scope)]
        else:
            saver_vars = tf.global_variables()
        if verbose:
            print 'root-scope:', self.scope
            print "\n\nDiscovered %d saver variables." % len(saver_vars)
            for each in saver_vars:
                print each.name
        self.saver = tf.train.Saver(saver_vars, max_to_keep=5)

    @property
    def decoder_hidden_units(self):
        return self.memory_dim
    def _make_graph(self):
        self._init_placeholders()
        self._init_decoder_train_connectors()
        self._init_embeddings()
        self._init_simple_encoder()
        self._init_decoder()
        self._init_optimizer()

    def _init_placeholders(self):
        """ Everything is time-major """
        self.encoder_inputs = tf.placeholder(
            shape=(None, None),
            dtype=tf.int32,
            name='encoder_inputs',
        )
        self.encoder_inputs_length = tf.placeholder(
            shape=(None,),
            dtype=tf.int32,
            name='encoder_inputs_length',
        )
        self.decoder_targets = tf.placeholder(
            shape=(None, None),
            dtype=tf.int32,
            name='decoder_targets'
        )
        self.decoder_targets_length = tf.placeholder(
            shape=(None,),
            dtype=tf.int32,
            name='decoder_targets_length',
        )
    def _init_decoder_train_connectors(self):
        with tf.name_scope('decoderTrainFeeds'):
            sequence_size, batch_size = tf.unstack(
                tf.shape(self.decoder_targets), name='decoder_targets_shape')
            EOS_SLICE = tf.ones([1, batch_size], dtype=tf.int32) * self.EOS
            PAD_SLICE = tf.ones([1, batch_size], dtype=tf.int32) * self.PAD
            self.decoder_train_inputs = tf.concat(
                [EOS_SLICE, self.decoder_targets], axis=0, name="decoder_train_inputs")
            self.decoder_train_length = self.decoder_targets_length + 1
            decoder_train_targets = tf.concat(
                [self.decoder_targets, PAD_SLICE], axis=0)
            decoder_train_targets_seq_len, _ = tf.unstack(
                tf.shape(decoder_train_targets))
            decoder_train_targets_eos_mask = tf.one_hot(self.decoder_train_length - 1,
                                                        decoder_train_targets_seq_len,
                                                        on_value=self.EOS, off_value=self.PAD,
                                                        dtype=tf.int32)
            decoder_train_targets_eos_mask = tf.transpose(
                decoder_train_targets_eos_mask, [1, 0])
            decoder_train_targets = tf.add(decoder_train_targets,
                                           decoder_train_targets_eos_mask, name="decoder_train_targets")
            self.decoder_train_targets = decoder_train_targets
            self.loss_weights = tf.ones([
                batch_size,
                tf.reduce_max(self.decoder_train_length)
            ], dtype=tf.float32, name="loss_weights")
    def _init_embeddings(self):
        with tf.variable_scope("embedding") as scope:
            sqrt3 = math.sqrt(3)
            initializer = tf.random_uniform_initializer(-sqrt3, sqrt3)
            self.embedding_matrix = tf.get_variable(
                name="embedding_matrix",
                shape=[self.vocab_size, self.embedding_size],
                initializer=initializer,
                dtype=tf.float32)
            self.encoder_inputs_embedded = tf.nn.embedding_lookup(
                self.embedding_matrix, self.encoder_inputs,
                name="encoder_inputs_embedded")
            self.decoder_train_inputs_embedded = tf.nn.embedding_lookup(
                self.embedding_matrix, self.decoder_train_inputs,
                name="decoder_train_inputs_embedded")

    def _init_simple_encoder(self):
        with tf.variable_scope("Encoder") as scope:
            (self.encoder_outputs, self.encoder_state) = (
                tf.nn.dynamic_rnn(cell=self.encoder_cell,
                                  inputs=self.encoder_inputs_embedded,
                                  sequence_length=self.encoder_inputs_length,
                                  time_major=True,
                                  dtype=tf.float32)
            )
    def _init_decoder(self):
        with tf.variable_scope("decoder") as scope:
            # def output_fn(outputs):
            #     return tf.contrib.layers.fully_connected(outputs, self.vocab_size, scope=scope,
            #                                              name="output_fn")
            sequence_size, batch_size = tf.unstack(
                tf.shape(self.decoder_targets), name='decoder_targets_shape')
            train_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_train_inputs_embedded,
                sequence_length=self.decoder_train_length,
                time_major=True,
                name="train_helper")
            pred_helper = seq2seq.SampleEmbeddingHelper(
                embedding=self.embedding_matrix,
                start_tokens=tf.ones([batch_size], dtype=tf.int32) * self.EOS,
                end_token=self.EOS)
                # name="pred_helper")

            def _decode(helper, scope, reuse=None):
                with tf.variable_scope(scope, reuse=reuse):
                    attention_states = tf.transpose(
                        self.encoder_outputs, [1, 0, 2])
                    attention_mechanism = seq2seq.BahdanauAttention(
                        num_units=self.decoder_hidden_units, memory=attention_states,
                        name="attention_mechanism")
                    attention_cell = seq2seq.AttentionWrapper(
                        self.decoder_cell, attention_mechanism,
                        name="atttention_wrapper")
                    out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                        attention_cell, self.vocab_size, reuse=reuse)
                        # name="output_cell")
                    decoder = seq2seq.BasicDecoder(
                        cell=out_cell, helper=helper,
                        initial_state=out_cell.zero_state(
                            dtype=tf.float32, batch_size=batch_size))
                        # name="decoder")
                    outputs = seq2seq.dynamic_decode(
                        decoder=decoder, output_time_major=True,
                        impute_finished=True)
                        # name="outputs")
                    return outputs

            (self.decoder_logits_train, self.decoder_state_train, _) = _decode(train_helper, "decoder")
            (self.decoder_logits_inference, self.decoder_state_inference, _) = _decode(pred_helper, "decoder", reuse=True)
            self.decoder_logits_train = self.decoder_logits_train.rnn_output
            self.decoder_logits_inference = self.decoder_logits_inference.rnn_output
            # self.decoder_logits_train = output_fn(self.decoder_outputs_train)
            self.decoder_prediction_train = tf.argmax(
                self.decoder_logits_train, axis=-1, name='decoder_prediction_train')
            scope.reuse_variables()
            self.decoder_prediction_inference = tf.argmax(self.decoder_logits_inference, axis=-1,
                                                          name='decoder_prediction_inference')
    def _init_optimizer(self):
        logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
        targets = tf.transpose(self.decoder_train_targets, [1, 0])
        self.loss = seq2seq.sequence_loss(logits=logits, targets=targets,
                                          weights=self.loss_weights)
        self.optimizer = tf.train.AdamOptimizer()
        gvs = self.optimizer.compute_gradients(self.loss)

        def ClipIfNotNone(grad):
            if grad is None:
                return grad
            return tf.clip_by_value(grad, -1., 1.)

        # capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
        capped_gvs = [(ClipIfNotNone(grad), var) for grad, var in gvs]
        self.train_op = self.optimizer.apply_gradients(capped_gvs)

    def make_feed_dict(self, x, x_len, y, y_len):
        feed_dict = {
            self.encoder_inputs: x,
            self.encoder_inputs_length: x_len,
            self.decoder_targets: y,
            self.decoder_targets_length: y_len,
        }
        if self.dropout != 0:
            feed_dict.update({self.keep_prob: 1.0 - self.dropout})
        return feed_dict
    def load_parameters(self, sess, filename):
        self.saver.restore(sess, filename)

    def save_parameters(self, sess, filename, global_step=None):
        self.saver.save(sess, filename, global_step=global_step)

    def train_step(self, session, x, x_len, y, y_len):
        feed_dict = self.make_feed_dict(x, x_len, y, y_len)
        _, loss = session.run([self.train_op, self.loss], feed_dict)
        return loss

    def validate_step(self, session, x, x_len, y, y_len):
        feed_dict = self.make_feed_dict(x, x_len, y, y_len)
        loss, decoder_prediction, decoder_train_targets = session.run([self.loss,
                                                                       self.decoder_prediction_inference,
                                                                       self.decoder_train_targets], feed_dict)
        return loss, np.array(decoder_prediction).T, np.array(decoder_train_targets).T

    def sample(self, session, X, X_len):
        feed_dict = {self.encoder_inputs: X,
                     self.encoder_inputs_length: X_len}
        if self.dropout != 0:
            feed_dict.update({self.keep_prob: 1.0})
        decoder_prediction = session.run(
            self.decoder_prediction_inference, feed_dict)
        return np.array(decoder_prediction).T
I am having some problems with this code:
Main problem: the seq2seq.train_step() and seq2seq.validate_step() functions work, but when I use seq2seq.sample() to actually make inferences, I get an error asking me to feed a value for decoder_targets. This is unexpected behaviour, as SampleEmbeddingHelper is used for inference and should not require decoder_targets. The error:
InvalidArgumentError (see above for traceback): You must feed a value
for placeholder tensor 'ids/decoder_targets' with dtype int32 and
shape [?,?] [[node ids/decoder_targets (defined at
.../code/neural_net/train.py:241) = Placeholder[dtype=DT_INT32,
shape=[?,?],
_device="/job:localhost/replica:0/task:0/device:CPU:0"]]]
When I try to use GreedyEmbeddingHelper instead of SampleEmbeddingHelper and then run the decoder_logits_inference op, the machine hangs and runs out of memory after some time, although SampleEmbeddingHelper works fine.
Well, SampleEmbeddingHelper does need decoder targets, since it mixes part of GreedyEmbeddingHelper (infer mode) and tf.contrib.seq2seq.TrainingHelper (teacher forcing). I think you just need to use GreedyEmbeddingHelper.
In the beginning the parameters are totally random (if not pre-trained), and you may have seen that the results of the first few loops of a seq2seq model are totally messed up.
So if you use GreedyEmbeddingHelper, which produces each output based on the previous one, and of course no one teaches it "where to stop", it usually goes on indefinitely until your memory runs out. To solve this, you need to set an upper limit on the sentence length in tf.contrib.seq2seq.dynamic_decode.
The argument is maximum_iterations, as shown in the tf.contrib.seq2seq.dynamic_decode documentation.
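A minimal sketch of that change, applied to the dynamic_decode call inside the question's _decode() helper; using max_output_seq_len as the cap is my choice of limit:

# Inside _decode(): cap how long greedy decoding may run at inference time.
outputs = seq2seq.dynamic_decode(
    decoder=decoder,
    output_time_major=True,
    impute_finished=True,
    maximum_iterations=self.max_output_seq_len)  # upper bound on decoded sequence length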