I have been struggling to create an automatic speech recognition neural network using TensorFlow, trained on the Hugging Face Mozilla Common Voice 11 dataset. The model seems to train well for around 100 batches before the loss suddenly goes to infinity.
Here is the code for the data preprocessing:
dataset = datasets.load_dataset("mozilla-foundation/common_voice_11_0", "en")
dataset = dataset.remove_columns(['client_id', 'audio', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'])
def prepare_dataset(batch):
wav_file = batch['path']
# Remove file name
split = wav_file.split("\\")
joined = "\\".join(split[:-1]) + "\\"
# Get the train number
complete_path = glob.glob(joined + "*")
# Combine all the parts
file = complete_path[0] + "\\" + split[-1]
batch['path'] = file
return batch
train_dataset = dataset['train'].map(prepare_dataset).shuffle(len(dataset['train']))
val_dataset = dataset['validation'].map(prepare_dataset).shuffle(len(dataset['validation']))
frame_length = 256
frame_step = 160
fft_length = 384
def load_mp3(wav_file):
audio = tfio.audio.AudioIOTensor(wav_file, dtype=tf.float32)
sample_rate = tf.cast(audio.rate, dtype=tf.int64)
audio = tf.squeeze(audio.to_tensor())
audio = tfio.audio.resample(audio, rate_in=sample_rate, rate_out=8000)
audio = tfio.audio.fade(audio, fade_in=1000, fade_out=2000, mode="logarithmic")
return audio
def convert_to_spect(audio):
spectrogram = tf.signal.stft(
audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
)
spectrogram = tf.abs(spectrogram)
spectrogram = tf.math.pow(spectrogram, 0.5)
spectrogram = tfio.audio.freq_mask(spectrogram, param=25)
spectrogram = tfio.audio.time_mask(spectrogram, param=25)
spectrogram = tfio.audio.freq_mask(spectrogram, param=25)
spectrogram = tfio.audio.time_mask(spectrogram, param=25)
means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
spectrogram = (spectrogram - means) / (stddevs + 1e-10)
return spectrogram
def process_text(label):
label = tf.strings.lower(label)
label = tf.strings.unicode_split(label, input_encoding="UTF-8")
label = char_to_num(label)
return label
def encode_mozilla_sample(wav_file, label):
audio = load_mp3(wav_file)
spectrogram = convert_to_spect(audio)
label = process_text(label)
return spectrogram, label
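The question does not show how encode_mozilla_sample is wired into batches; presumably it feeds a tf.data pipeline in the style of the Keras CTC ASR example. A minimal sketch of that missing step (the batch size and the use of the remaining "sentence" column are assumptions):
batch_size = 32  # placeholder value

train_ds = tf.data.Dataset.from_tensor_slices(
    (list(train_dataset["path"]), list(train_dataset["sentence"]))
)
train_ds = (
    train_ds.map(encode_mozilla_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)  # pads spectrograms and labels to the longest in the batch
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
# val_ds is built the same way from val_dataset, and training would then be something
# like: model.fit(train_ds, validation_data=val_ds, epochs=...)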
And here is the code for the model:
def CTCLoss(y_true, y_pred):
# Compute the training-time loss value
batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
return loss
def build_model(input_dim, output_dim, rnn_layers=5, conv_units=128, rnn_units=128, dropout=0.5):
input_spectrogram = tf.keras.layers.Input((None, input_dim), name="input")
x = tf.keras.layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
# Conv layers
x = tf.keras.layers.Conv2D(
filters=conv_units,
kernel_size=[11, 41],
strides=[2, 2],
padding="same",
use_bias=False,
name="conv_1",
)(x)
x = tf.keras.layers.BatchNormalization(name="conv_1_bn")(x)
x = tf.keras.layers.ReLU(name="conv_1_relu")(x)
x = tf.keras.layers.Conv2D(
filters=conv_units,
kernel_size=[11, 21],
strides=[1, 2],
padding="same",
use_bias=False,
name="conv_2",
)(x)
x = tf.keras.layers.BatchNormalization(name="conv_2_bn")(x)
x = tf.keras.layers.ReLU(name="conv_2_relu")(x)
x = tf.keras.layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
# RNN layers
for i in range(1, rnn_layers + 1):
recurrent = tf.keras.layers.GRU(
units=rnn_units,
activation="tanh",
recurrent_activation="sigmoid",
use_bias=True,
return_sequences=True,
reset_after=True,
name=f"gru_{i}",
)
x = tf.keras.layers.Bidirectional(
recurrent, name=f"bidirectional_{i}", merge_mode="concat"
)(x)
x = tf.keras.layers.BatchNormalization(name=f"rnn_{i}_bn")(x)
if i < rnn_layers:
x = tf.keras.layers.Dropout(rate=dropout)(x)
# Dense layer
x = tf.keras.layers.Dense(units=rnn_units * 2, activation="gelu", name="dense_1")(x)
x = tf.keras.layers.Dropout(rate=dropout)(x)
# Classification layer
output = tf.keras.layers.Dense(units=output_dim + 1, activation="softmax", name="output_layer")(x)
# Model
model = tf.keras.Model(input_spectrogram, output, name="DeepSpeech_2")
# Optimizer
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
# Compile the model and return
model.compile(optimizer=opt, loss=CTCLoss)
return model
# Get the model
model = build_model(
input_dim=fft_length // 2 + 1,
output_dim=char_to_num.vocabulary_size(),
rnn_units=32,
conv_units=32,
rnn_layers=5,
dropout=0.5
)
Versions:
tensorflow: 2.10.1
python: 3.9.12
gpu: Nvidia GeForce RTX 3080
OS: Windows 11
cuDNN: 8.1
CUDA: 11.2
I have tried increasing the batch size, expecting the model to generalize better, but any batch size of 256 or higher causes the GPU to run out of memory, and the infinite loss occurs with any batch size of 128 or less. I have also tried increasing the batch size while using less data, but the result is the same. I thought that reducing the size of the neural network would help solve the problem, but no matter what, the loss seems to go to infinity after reaching a value of around 200. A few other changes I have tried are the activation functions (ReLU, LeakyReLU, GELU), the optimizers (SGD, Adam, AdamW), and the number of RNN/conv layers.
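Since the loss blows up only after a while, two things that are often worth ruling out with CTC are transcripts longer than the model's output time axis (which makes ctc_batch_cost return inf for those samples) and exploding gradients. A sketch of both checks, assuming a tf.data pipeline like the one sketched above; the factor of 2 reflects conv_1's time stride, and the learning rate and clipnorm values are placeholders:
def ctc_length_ok(spectrogram, label):
    output_steps = tf.shape(spectrogram)[0] // 2  # conv_1 halves the time dimension
    return output_steps >= tf.shape(label)[0]

train_ds = train_ds.filter(ctc_length_ok)  # apply before padded_batch

opt = tf.keras.optimizers.Adam(learning_rate=1e-4, clipnorm=1.0)
model.compile(optimizer=opt, loss=CTCLoss)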
Note: I have considered using a pretrained model, but I have always wanted to successfully create ASR from scratch using TensorFlow. Will it even be possible to get moderately acceptable results with my GPU and data, or will I have to resort to using wav2vec?
Another note: I was first inspired to create this project after watching the video https://www.youtube.com/watch?v=YereI6Gn3bM
made by "The A.I. Hacker - Michael Phi", who first convinced me that this was possible. Before that I had thought my computer would not be able to handle the task, but after seeing him do it with PyTorch, similar computer specs, and the same data, I thought I would be able to as well.
Update:
I have recently tried replacing the 2D Conv layers with a single 1D Conv layer, making the GRU layer non-bidirectional, and going back to the AdamW optimizer, but nothing has changed.
Thanks for the solution. I just changed the number of neurons in the second-to-last dense layer to 512 and the model is currently running without error. Now I just have to figure out how to improve the model so I can finally wrap up this project.
How can I convert my older TensorFlow code to the newer version? The CNN, RNN, and CTC parts no longer work: after I updated TensorFlow, many of the functions stopped working properly and throw errors, and some of them are no longer in the package at all.
I don't know how to port this code to the new version of TensorFlow.
from __future__ import absolute_import, division, print_function, unicode_literals
import codecs
import sys
import numpy as np
import tensorflow as tf
from DataLoader import FilePaths
import matplotlib.pyplot as plt
class DecoderType:
BestPath = 0
WordBeamSearch = 1
BeamSearch = 2
class Model:
# Model Constants
batchSize = 10 # 50
imgSize = (800, 64)
maxTextLen = 100
def __init__(self, charList, decoderType=DecoderType.BestPath, mustRestore=False):
self.charList = charList
self.decoderType = decoderType
self.mustRestore = mustRestore
self.snapID = 0
# input image batch
self.inputImgs =tf.compat.v1.placeholder(tf.float32, shape=(None, Model.imgSize[0], Model.imgSize[1]))
# setup CNN, RNN and CTC
self.setupCNN()
self.setupRNN()
self.setupCTC()
# setup optimizer to train NN
self.batchesTrained = 0
self.learningRate = tf.placeholder(tf.float32, shape=[])
self.optimizer = tf.train.RMSPropOptimizer(self.learningRate).minimize(self.loss)
# Initialize TensorFlow
(self.sess, self.saver) = self.setupTF()
self.training_loss_summary = tf.summary.scalar('loss', self.loss)
self.writer = tf.summary.FileWriter(
'./logs', self.sess.graph) # Tensorboard: Create writer
self.merge = tf.summary.merge([self.training_loss_summary]) # Tensorboard: Merge
def setupCNN(self):
""" Create CNN layers and return output of these layers """
cnnIn4d = tf.expand_dims(input=self.inputImgs, axis=3)
# First Layer: Conv (5x5) + Pool (2x2) - Output size: 400 x 32 x 64
with tf.name_scope('Conv_Pool_1'):
kernel = tf.Variable(
tf.random.truncated_normal([5, 5, 1, 64], stddev=0.1))
conv = tf.nn.conv2d(
cnnIn4d, kernel, padding='SAME', strides=(1, 1, 1, 1))
learelu = tf.nn.leaky_relu(conv, alpha=0.01)
pool = tf.nn.max_pool2d(learelu, (1, 2, 2, 1), (1, 2, 2, 1), 'VALID')
# Second Layer: Conv (5x5) + Pool (1x2) - Output size: 400 x 16 x 128
with tf.name_scope('Conv_Pool_2'):
kernel = tf.Variable(tf.truncated_normal(
[5, 5, 64, 128], stddev=0.1))
conv = tf.nn.conv2d(
pool, kernel, padding='SAME', strides=(1, 1, 1, 1))
learelu = tf.nn.leaky_relu(conv, alpha=0.01)
pool = tf.nn.max_pool(learelu, (1, 1, 2, 1), (1, 1, 2, 1), 'VALID')
# Third Layer: Conv (3x3) + Pool (2x2) + Simple Batch Norm - Output size: 200 x 8 x 128
with tf.name_scope('Conv_Pool_BN_3'):
kernel = tf.Variable(tf.truncated_normal(
[3, 3, 128, 128], stddev=0.1))
conv = tf.nn.conv2d(
pool, kernel, padding='SAME', strides=(1, 1, 1, 1))
mean, variance = tf.nn.moments(conv, axes=[0])
batch_norm = tf.nn.batch_normalization(
conv, mean, variance, offset=None, scale=None, variance_epsilon=0.001)
learelu = tf.nn.leaky_relu(batch_norm, alpha=0.01)
pool = tf.nn.max_pool(learelu, (1, 2, 2, 1), (1, 2, 2, 1), 'VALID')
# Fourth Layer: Conv (3x3) - Output size: 200 x 8 x 256
with tf.name_scope('Conv_4'):
kernel = tf.Variable(tf.truncated_normal(
[3, 3, 128, 256], stddev=0.1))
conv = tf.nn.conv2d(
pool, kernel, padding='SAME', strides=(1, 1, 1, 1))
learelu = tf.nn.leaky_relu(conv, alpha=0.01)
# Fifth Layer: Conv (3x3) + Pool(2x2) - Output size: 100 x 4 x 256
with tf.name_scope('Conv_Pool_5'):
kernel = tf.Variable(tf.truncated_normal(
[3, 3, 256, 256], stddev=0.1))
conv = tf.nn.conv2d(
learelu, kernel, padding='SAME', strides=(1, 1, 1, 1))
learelu = tf.nn.leaky_relu(conv, alpha=0.01)
pool = tf.nn.max_pool(learelu, (1, 2, 2, 1), (1, 2, 2, 1), 'VALID')
# Sixth Layer: Conv (3x3) + Pool(1x2) + Simple Batch Norm - Output size: 100 x 2 x 512
with tf.name_scope('Conv_Pool_BN_6'):
kernel = tf.Variable(tf.truncated_normal(
[3, 3, 256, 512], stddev=0.1))
conv = tf.nn.conv2d(
pool, kernel, padding='SAME', strides=(1, 1, 1, 1))
mean, variance = tf.nn.moments(conv, axes=[0])
batch_norm = tf.nn.batch_normalization(
conv, mean, variance, offset=None, scale=None, variance_epsilon=0.001)
learelu = tf.nn.leaky_relu(batch_norm, alpha=0.01)
pool = tf.nn.max_pool(learelu, (1, 1, 2, 1), (1, 1, 2, 1), 'VALID')
# Seventh Layer: Conv (3x3) + Pool (1x2) - Output size: 100 x 1 x 512
with tf.name_scope('Conv_Pool_7'):
kernel = tf.Variable(tf.truncated_normal(
[3, 3, 512, 512], stddev=0.1))
conv = tf.nn.conv2d(
pool, kernel, padding='SAME', strides=(1, 1, 1, 1))
learelu = tf.nn.leaky_relu(conv, alpha=0.01)
pool = tf.nn.max_pool(learelu, (1, 1, 2, 1), (1, 1, 2, 1), 'VALID')
self.cnnOut4d = pool
def setupRNN(self):
""" Create RNN layers and return output of these layers """
# Collapse layer to remove dimension 100 x 1 x 512 --> 100 x 512 on axis=2
rnnIn3d = tf.squeeze(self.cnnOut4d, axis=[2])
# 2 layers of LSTM cell used to build RNN
numHidden = 512
cells = [tf.contrib.rnn.LSTMCell(
num_units=numHidden, state_is_tuple=True, name='basic_lstm_cell') for _ in range(2)]
stacked = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
# Bi-directional RNN
# BxTxF -> BxTx2H
((forward, backward), _) = tf.nn.bidirectional_dynamic_rnn(
cell_fw=stacked, cell_bw=stacked, inputs=rnnIn3d, dtype=rnnIn3d.dtype)
# BxTxH + BxTxH -> BxTx2H -> BxTx1X2H
concat = tf.expand_dims(tf.concat([forward, backward], 2), 2)
# Project output to chars (including blank): BxTx1x2H -> BxTx1xC -> BxTxC
kernel = tf.Variable(tf.truncated_normal(
[1, 1, numHidden * 2, len(self.charList) + 1], stddev=0.1))
self.rnnOut3d = tf.squeeze(tf.nn.atrous_conv2d(value=concat, filters=kernel, rate=1, padding='SAME'), axis=[2])
def setupCTC(self):
""" Create CTC loss and decoder and return them """
# BxTxC -> TxBxC
self.ctcIn3dTBC = tf.transpose(self.rnnOut3d, [1, 0, 2])
# Ground truth text as sparse tensor
with tf.name_scope('CTC_Loss'):
self.gtTexts = tf.SparseTensor(tf.placeholder(tf.int64, shape=[
None, 2]), tf.placeholder(tf.int32, [None]), tf.placeholder(tf.int64, [2]))
# Calculate loss for batch
self.seqLen = tf.placeholder(tf.int32, [None])
self.loss = tf.reduce_mean(tf.nn.ctc_loss(labels=self.gtTexts, inputs=self.ctcIn3dTBC, sequence_length=self.seqLen,
ctc_merge_repeated=True, ignore_longer_outputs_than_inputs=True))
with tf.name_scope('CTC_Decoder'):
# Decoder: Best path decoding or Word beam search decoding
if self.decoderType == DecoderType.BestPath:
self.decoder = tf.nn.ctc_greedy_decoder(
inputs=self.ctcIn3dTBC, sequence_length=self.seqLen)
elif self.decoderType == DecoderType.BeamSearch:
self.decoder = tf.nn.ctc_beam_search_decoder(inputs=self.ctcIn3dTBC, sequence_length=self.seqLen, beam_width=50, merge_repeated=True)
elif self.decoderType == DecoderType.WordBeamSearch:
# Import compiled word beam search operation (see https://github.com/githubharald/CTCWordBeamSearch)
word_beam_search_module = tf.load_op_library(
'./TFWordBeamSearch.so')
# Prepare: dictionary, characters in dataset, characters forming words
chars = codecs.open(FilePaths.wordCharList.txt, 'r').read()
wordChars = codecs.open(
FilePaths.fnWordCharList, 'r').read()
corpus = codecs.open(FilePaths.corpus.txt, 'r').read()
# # Decoder using the "NGramsForecastAndSample": restrict number of (possible) next words to at most 20 words: O(W) mode of word beam search
# decoder = word_beam_search_module.word_beam_search(tf.nn.softmax(ctcIn3dTBC, dim=2), 25, 'NGramsForecastAndSample', 0.0, corpus.encode('utf8'), chars.encode('utf8'), wordChars.encode('utf8'))
# Decoder using the "Words": only use dictionary, no scoring: O(1) mode of word beam search
self.decoder = word_beam_search_module.word_beam_search(tf.nn.softmax(
self.ctcIn3dTBC, dim=2), 25, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), wordChars.encode('utf8'))
# Return a CTC operation to compute the loss and CTC operation to decode the RNN output
return self.loss, self.decoder
def setupTF(self):
""" Initialize TensorFlow """
print('Python: ' + sys.version)
print('Tensorflow: ' + tf.__version__)
sess = tf.Session() # Tensorflow session
saver = tf.train.Saver(max_to_keep=3) # Saver saves model to file
modelDir = '../model/'
latestSnapshot = tf.train.latest_checkpoint(modelDir) # Is there a saved model?
# If model must be restored (for inference), there must be a snapshot
if self.mustRestore and not latestSnapshot:
raise Exception('No saved model found in: ' + modelDir)
# Load saved model if available
if latestSnapshot:
print('Init with stored values from ' + latestSnapshot)
saver.restore(sess, latestSnapshot)
else:
print('Init with new values')
sess.run(tf.global_variables_initializer())
return (sess, saver)
def toSpare(self, texts):
""" Convert ground truth texts into sparse tensor for ctc_loss """
indices = []
values = []
shape = [len(texts), 0] # Last entry must be max(labelList[i])
# Go over all texts
for (batchElement, texts) in enumerate(texts):
# Convert to string of label (i.e. class-ids)
# print(texts)
# labelStr = []
# for c in texts:
# print(c, '|', end='')
# labelStr.append(self.charList.index(c))
# print(' ')
labelStr = [self.charList.index(c) for c in texts]
# Sparse tensor must have size of max. label-string
if len(labelStr) > shape[1]:
shape[1] = len(labelStr)
# Put each label into sparse tensor
for (i, label) in enumerate(labelStr):
indices.append([batchElement, i])
values.append(label)
return (indices, values, shape)
def decoderOutputToText(self, ctcOutput):
""" Extract texts from output of CTC decoder """
# Contains string of labels for each batch element
encodedLabelStrs = [[] for i in range(Model.batchSize)]
# Word beam search: label strings terminated by blank
if self.decoderType == DecoderType.WordBeamSearch:
blank = len(self.charList)
for b in range(Model.batchSize):
for label in ctcOutput[b]:
if label == blank:
break
encodedLabelStrs[b].append(label)
# TF decoders: label strings are contained in sparse tensor
else:
# Ctc returns tuple, first element is SparseTensor
decoded = ctcOutput[0][0]
# Go over all indices and save mapping: batch -> values
idxDict = {b : [] for b in range(Model.batchSize)}
for (idx, idx2d) in enumerate(decoded.indices):
label = decoded.values[idx]
batchElement = idx2d[0] # index according to [b,t]
encodedLabelStrs[batchElement].append(label)
# Map labels to chars for all batch elements
return [str().join([self.charList[c] for c in labelStr]) for labelStr in encodedLabelStrs]
def trainBatch(self, batch, batchNum):
""" Feed a batch into the NN to train it """
sparse = self.toSpare(batch.gtTexts)
rate = 0.01 if self.batchesTrained < 10 else (
0.001 if self.batchesTrained < 2750 else 0.001)
evalList = [self.merge, self.optimizer, self.loss]
feedDict = {self.inputImgs: batch.imgs, self.gtTexts: sparse, self.seqLen: [Model.maxTextLen] * Model.batchSize, self.learningRate: rate}
(loss_summary, _, lossVal) = self.sess.run(evalList, feedDict)
# Tensorboard: Add loss_summary to writer
self.writer.add_summary(loss_summary, batchNum)
self.batchesTrained += 1
return lossVal
def return_rnn_out(self, batch, write_on_csv=False):
"""Only return rnn_out prediction value without decoded"""
numBatchElements = len(batch.imgs)
decoded, rnnOutput = self.sess.run([self.decoder, self.ctcIn3dTBC],
{self.inputImgs: batch.imgs, self.seqLen: [Model.maxTextLen] * numBatchElements})
decoded = rnnOutput
print(decoded.shape)
if write_on_csv:
s = rnnOutput.shape
b = 0
csv = ''
for t in range(s[0]):
for c in range(s[2]):
csv += str(rnnOutput[t, b, c]) + ';'
csv += '\n'
open('mat_0.csv', 'w').write(csv)
return decoded[:,0,:].reshape(100,80)
def inferBatch(self, batch):
""" Feed a batch into the NN to recognize texts """
numBatchElements = len(batch.imgs)
feedDict = {self.inputImgs: batch.imgs, self.seqLen: [Model.maxTextLen] * numBatchElements}
evalRes = self.sess.run([self.decoder, self.ctcIn3dTBC], feedDict)
decoded = evalRes[0]
# # Dump RNN output to .csv file
# decoded, rnnOutput = self.sess.run([self.decoder, self.rnnOutput], {
# self.inputImgs: batch.imgs, self.seqLen: [Model.maxTextLen] * Model.batchSize})
# s = rnnOutput.shape
# b = 0
# csv = ''
# for t in range(s[0]):
# for c in range(s[2]):
# csv += str(rnnOutput[t, b, c]) + ';'
# csv += '\n'
# open('mat_0.csv', 'w').write(csv)
texts = self.decoderOutputToText(decoded)
return texts
def save(self):
""" Save model to file """
self.snapID += 1
self.saver.save(self.sess, r'C:\Users\PycharmProjects\hand\model\snapshot',
global_step=self.snapID)
You can run TF1 code in TF2 by importing TensorFlow a bit differently:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
For details on how to migrate your code you should look here: https://www.tensorflow.org/guide/migrate
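Applied to the code above, most of it keeps working unchanged under the compat import (tf.placeholder, tf.Session, tf.train.Saver, tf.summary.*, tf.train.RMSPropOptimizer). The part with no compat shim is tf.contrib, which was removed in TF 2.x, so its RNN cells have to be swapped by hand. A sketch of that replacement in setupRNN:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# tf.contrib.rnn.LSTMCell     -> tf.nn.rnn_cell.LSTMCell
# tf.contrib.rnn.MultiRNNCell -> tf.nn.rnn_cell.MultiRNNCell
numHidden = 512
cells = [tf.nn.rnn_cell.LSTMCell(num_units=numHidden, state_is_tuple=True)
         for _ in range(2)]
stacked = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)

# If you stay on the plain `import tensorflow as tf` namespace instead, note that
# tf.truncated_normal no longer exists; tf.random.truncated_normal (already used in
# the first conv layer above) works in both versions.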
[TF 1.8]
I'm trying to build a seq2seq model for a toy chatbot to learn about TensorFlow and deep learning. I was able to train and run the model with sampled softmax and beam search, but when I try to apply tf.contrib.seq2seq.LuongAttention with tf.contrib.seq2seq.AttentionWrapper, I get the following error while building the graph:
ValueError: Dimensions must be equal, but are 384 and 256 for 'rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/MatMul_2' (op: 'MatMul') with input shapes: [64,384], [256,512].
This is my model:
class ChatBotModel:
def __init__(self, inferring=False, batch_size=1, use_sample_sofmax=True):
"""forward_only: if set, we do not construct the backward pass in the model.
"""
print('Initialize new model')
self.inferring = inferring
self.batch_size = batch_size
self.use_sample_sofmax = use_sample_sofmax
def build_graph(self):
# INPUTS
self.X = tf.placeholder(tf.int32, [None, None])
self.Y = tf.placeholder(tf.int32, [None, None])
self.X_seq_len = tf.placeholder(tf.int32, [None])
self.Y_seq_len = tf.placeholder(tf.int32, [None])
self.gl_step = tf.Variable(
0, dtype=tf.int32, trainable=False, name='global_step')
single_cell = tf.nn.rnn_cell.BasicLSTMCell(128)
keep_prob = tf.cond(tf.convert_to_tensor(self.inferring, tf.bool), lambda: tf.constant(
1.0), lambda: tf.constant(0.8))
single_cell = tf.contrib.rnn.DropoutWrapper(
single_cell, output_keep_prob=keep_prob)
encoder_cell = tf.contrib.rnn.MultiRNNCell([single_cell for _ in range(2)])
# ENCODER
encoder_out, encoder_state = tf.nn.dynamic_rnn(
cell = encoder_cell,
inputs = tf.contrib.layers.embed_sequence(self.X, 10000, 128),
sequence_length = self.X_seq_len,
dtype = tf.float32)
# encoder_state is ((cell0_c, cell0_h), (cell1_c, cell1_h))
# DECODER INPUTS
after_slice = tf.strided_slice(self.Y, [0, 0], [self.batch_size, -1], [1, 1])
decoder_inputs = tf.concat( [tf.fill([self.batch_size, 1], 2), after_slice], 1)
# ATTENTION
attention_mechanism = tf.contrib.seq2seq.LuongAttention(
num_units = 128,
memory = encoder_out,
memory_sequence_length = self.X_seq_len)
# DECODER COMPONENTS
Y_vocab_size = 10000
decoder_cell = tf.contrib.rnn.MultiRNNCell([single_cell for _ in range(2)])
decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
cell = decoder_cell,
attention_mechanism = attention_mechanism,
attention_layer_size=128)
decoder_embedding = tf.Variable(tf.random_uniform([Y_vocab_size, 128], -1.0, 1.0))
projection_layer = CustomDense(Y_vocab_size)
if self.use_sample_sofmax:
softmax_weight = projection_layer.kernel
softmax_biases = projection_layer.bias
if not self.inferring:
# TRAINING DECODER
training_helper = tf.contrib.seq2seq.TrainingHelper(
inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_inputs),
sequence_length = self.Y_seq_len,
time_major = False)
decoder_initial_state = decoder_cell.zero_state(self.batch_size, dtype=tf.float32).clone(
cell_state=encoder_state)
training_decoder = tf.contrib.seq2seq.BasicDecoder(
cell = decoder_cell,
helper = training_helper,
initial_state = decoder_initial_state,
output_layer = projection_layer
)
training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder = training_decoder,
impute_finished = True,
maximum_iterations = tf.reduce_max(self.Y_seq_len))
training_logits = training_decoder_output.rnn_output
# LOSS
softmax_loss_function = None
if self.use_sample_sofmax:
def sampled_loss(labels, logits):
labels = tf.reshape(labels, [-1, 1])
return tf.nn.sampled_softmax_loss(weights=softmax_weight,
biases=softmax_biases,
labels=labels,
inputs=logits,
num_sampled=64,
num_classes=10000)
softmax_loss_function = sampled_loss
masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
self.loss = tf.contrib.seq2seq.sequence_loss(logits = training_logits, targets = self.Y, weights = masks, softmax_loss_function=softmax_loss_function)
# BACKWARD
params = tf.trainable_variables()
gradients = tf.gradients(self.loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
self.train_op = tf.train.AdamOptimizer().apply_gradients(zip(clipped_gradients, params), global_step=self.gl_step)
else:
encoder_states = []
for i in range(2):
if isinstance(encoder_state[i],tf.contrib.rnn.LSTMStateTuple):
encoder_state_c = tf.contrib.seq2seq.tile_batch(encoder_state[i].c, multiplier=2)
encoder_state_h = tf.contrib.seq2seq.tile_batch(encoder_state[i].h, multiplier=2)
encoder_state = tf.contrib.rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
encoder_states.append(encoder_state)
encoder_states = tuple(encoder_states)
predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
cell = decoder_cell,
embedding = decoder_embedding,
start_tokens = tf.tile(tf.constant([2], dtype=tf.int32), [self.batch_size]),
end_token = 3,
initial_state = decoder_initial_state,
beam_width = 2,
output_layer = projection_layer,
length_penalty_weight = 0.0)
predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder = predicting_decoder,
impute_finished = False,
maximum_iterations = 4 * tf.reduce_max(self.Y_seq_len))
self.predicting_logits = predicting_decoder_output.predicted_ids
Tracing back a few lines of the log, I saw that the error occurs here:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/rnn_cell_impl.py in call(self, inputs, state)
636
637 gate_inputs = math_ops.matmul(
--> 638 array_ops.concat([inputs, h], 1), self._kernel)
639 gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
I have checked the 'h' tensor of the LSTM cell and it has a shape of [batch_size, 128], so my guess is that the attention output from the previous decoding step is concatenated with the current step's input, making 'inputs' have the shape [batch_size, 256]; this is then concatenated with the 'h' tensor to form a [batch_size, 384] tensor, causing the error.
My question is: isn't the attention output supposed to be concatenated with the next decoder input, or am I misunderstanding something? And how do I fix this error?
You probably already found the answer, but for people (like me) who also encounter this error: focus on the second shape. It specifies [256, 512]. Now open up "rnn_cell_impl.py" and go to the line where the concat op takes place. You will notice that the kernel shape is the one being reported as out of sync with your decoder input (which has num_units + attention_layer_size as its 1st dimension, the 0th being your batch_size).
Basically, you are using the same cell you created for the encoder in the decoder as well (it's a 2-layer LSTM with 128 units, right?), hence the kernel size shows up as [256, 512]. To fix this, replace the decoder cell definition (the line between the attention mechanism and the AttentionWrapper) with:
Y_vocab_size = 10000
## create a new base rnn cell for the decoder (do not reuse the encoder's single_cell);
## build one fresh cell per layer so the two decoder layers don't end up sharing weights
decoder_cell = tf.contrib.rnn.MultiRNNCell(
    [tf.nn.rnn_cell.BasicLSTMCell(128) for _ in range(2)])
Now if you visualize the code at the same line that gave you the error, you will see [64, 384] and [384, 512] (which is a legitimate matmul and should fix your error). Of course, feel free to add whatever dropout etc. you want to this decoder cell as well.
I am testing a two-step architecture that is composed of a conventional first section, which can be implemented with any standard deep learning architecture, and a second section that must be coded manually outside the declaration of the PyTorch graph (while still using numpy-like torch functions).
My problem can be simplified to coding a feed-forward neural network with two hidden layers, where the first is implemented within the PyTorch graph and the second is implemented manually outside of it.
Architecture:
Input
-> Linear(28 * 28, 120) w/ Pytorch
-> ReLU w/ Pytorch
-> Linear(120, 84) w/ Pytorch
-> ReLU w/ Pytorch
-> Linear(84, 10) w/o Pytorch
-> Output
Problem: My implementation below achieves an accuracy of only ~74%, while a standard, fully PyTorch implementation achieves ~95%. What is causing this disparity?
I believe my problem lies in manually passing back the deltas; the math looks right to me, so I am stuck on finding a solution to this.
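For reference, before the code, here is a minimal sketch of the textbook gradients for that manual layer and of how such a gradient can be handed back to autograd via backward(gradient=...). The shapes and the 0.01 step size mirror the code below, but this is an illustration of the mechanism, not a claim about where the bug is:
import torch

h = torch.randn(1, 84, requires_grad=True)   # stands in for net(inputs)
W = torch.randn(84, 10) * 0.1                # manually managed last-layer weights
target = torch.zeros(1, 10)
target[0, 3] = 1.0

out = h.detach().mm(W)            # forward through the manual layer, outside autograd
d_out = out - target              # dE/d(out) for a squared-error objective
dEdW3 = h.detach().t().mm(d_out)  # dE/dW
d_h = d_out.mm(W.t())             # dE/dh, computed with W *before* the update
W = W - 0.01 * dEdW3              # gradient descent on the manual weights

# The gradient w.r.t. h can then be handed straight back to autograd, which applies
# the ReLU derivative and the fc1/fc2 gradients itself:
h.backward(gradient=d_h)          # in the real model: pytorch_outputs.backward(d_h)
print(h.grad.shape)               # torch.Size([1, 84])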
Implementation of architecture and training on MNIST:
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(28 * 28, 120)
self.fc2 = nn.Linear(120, 84)
def forward(self, x):
x = x.view(-1, 28 * 28)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return x
net = Net()
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)
# Initialize weights just as Pytorch does by default:
m = torch.distributions.uniform.Uniform(torch.tensor([-np.sqrt(1.0/84)]),
torch.tensor([np.sqrt(1.0/84)]))
W = m.sample((84, 10)).reshape((84, 10))
# based on https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
for epoch in range(2): # loop over the dataset multiple times
for i, data in enumerate(trainloader, 0):
# get the inputs
inputs, labels = data
# make one-hot encoding of labels
targets = oneHot(labels)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
pytorch_outputs = net(inputs)
pytorch_outputs = torch.autograd.Variable(pytorch_outputs,
requires_grad=True)
manual_outputs = torch.mm(pytorch_outputs, W)
delta_out = manual_outputs - targets.view(-1,10) # = error_out
dEdW3 = torch.mm(torch.t(pytorch_outputs), delta_out)
W -= 0.01 * dEdW3 # gradient descent
delta_h = torch.autograd.Variable(
torch.t(torch.mm(W, torch.t(delta_out))))
loss = criterion(pytorch_outputs, delta_h)
loss.backward()
optimizer.step()
Full code:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.0, 0.0, 0.0), (1.0, 1.0, 1.0))])
trainset = torchvision.datasets.MNIST(root='./data', train=True,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=1,
shuffle=True, num_workers=1)
testset = torchvision.datasets.MNIST(root='./data', train=False,
download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=1,
shuffle=False, num_workers=1)
classes = ('0', '1', '2', '3',
'4', '5', '6', '7', '8', '9')
def oneHot(a):
b = torch.zeros(10)
b[a] = 1
return b
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(28 * 28, 120)
self.fc2 = nn.Linear(120, 84)
def forward(self, x):
x = x.view(-1, 28 * 28)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return x
net = Net()
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)
# Initialize weights just as Pytorch does by default:
m = torch.distributions.uniform.Uniform(torch.tensor([-np.sqrt(1.0/84)]),
torch.tensor([np.sqrt(1.0/84)]))
W = m.sample((84, 10)).reshape((84, 10))
# based on https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
for epoch in range(2): # loop over the dataset multiple times
for i, data in enumerate(trainloader, 0):
# get the inputs
inputs, labels = data
# make one-hot encoding of labels
targets = oneHot(labels)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
pytorch_outputs = net(inputs)
pytorch_outputs = torch.autograd.Variable(pytorch_outputs,
requires_grad=True)
manual_outputs = torch.mm(pytorch_outputs, W)
delta_out = manual_outputs - targets.view(-1,10) # = error_out
dEdW3 = torch.mm(torch.t(pytorch_outputs), delta_out)
W -= 0.001*dEdW3 # gradient descent
delta_h = torch.autograd.Variable(
torch.t(torch.mm(W, torch.t(delta_out))))
loss = criterion(pytorch_outputs, delta_h)
loss.backward()
optimizer.step()
correct = 0
total = 0
with torch.no_grad():
for data in testloader:
inputs, labels = data
pytorch_outputs = torch.autograd.Variable(net(inputs),
requires_grad=True)
manual_outputs = torch.mm(pytorch_outputs, W)
_, predicted = torch.max(manual_outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('Accuracy of the network on the 10000 test images: %d %%' % (
100 * correct / total))
I am building a sequence-to-label model in PyTorch. I have two sentences and I am classifying whether they are entailed or not (SNLI dataset). I concatenate the two 50-word sentences (sometimes padded) into a vector of length 100, then send minibatches through word embeddings -> LSTM -> Linear layer. I am using cross entropy loss, but I need a [mini_batch, C] tensor to go into the CrossEntropyLoss function. Instead I still have the 100 words in my tensor, as [mini_batch, 100, C].
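To make the shape mismatch concrete, here is a minimal, self-contained illustration (the sizes mirror the printouts further down; the 3-class output layer is hypothetical). Applying the Linear layer to the full LSTM output produces a score vector per time step, whereas CrossEntropyLoss wants one score vector per sequence:
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=300, hidden_size=128, bidirectional=True, batch_first=True)
fc = nn.Linear(2 * 128, 3)            # 3 entailment classes (hypothetical)

x = torch.randn(3, 100, 300)          # [mini_batch, seq_len, embed_d]
out, (h_n, c_n) = lstm(x)             # out: [3, 100, 256]
per_step_scores = fc(out)             # [3, 100, 3] -> a score vector per word
seq_scores = fc(out[:, -1, :])        # [3, 3]      -> one score vector per sequence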
Here is my model:
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256):
super(myLSTM, self).__init__()
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(mlp_d, 1024)
# Set static embedding vectors
self.embedding.weight.requires_grad = False
#self.sm = nn.CrossEntropyLoss()
def display(self):
for param in self.parameters():
print(param.data.size())
def filter_params(self):
# Might not be compatible with python 3
#self.parameters = filter(lambda p: p.requires_grad, self.parameters())
pass
def init_hidden(self):
# Need to init hidden weights in LSTM
pass
def forward(self, sentence):
print(sentence.size())
embeds = self.embedding(sentence)
print(embeds.size())
out, _ = self.lstm(embeds)
print(out.size())
out = self.mlp(out)
return out
My training loop, with output:
batch_size = 3
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, weight_decay=1e-4)
ADM_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
criterion = nn.CrossEntropyLoss()
num_epochs = 50
from torch.autograd import Variable
from torch import optim
for epoch in range(num_epochs):
print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, float(epoch)/num_epochs))
for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
# Convert minibatch to numpy
s1, s2, y = convert_to_numpy(n_data[start:end])
# Convert numpy to Tensor
res = np.concatenate((s1,s2), axis=1) # Attach two sentences into 1 input vector
input_tensor = torch.from_numpy(res).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.FloatTensor)
data, target = Variable(input_tensor), Variable(target_tensor)
# Zero gradients
SGD_optimizer.zero_grad()
# Forward Pass
output = model.forward(data)
print("Output size: ")
print(output.size())
print("Target size: ")
print(target.size())
# Calculate loss with respect to training labels
loss = criterion(output, target)
# Backprogogate and update optimizer
loss.backward()
SGD_optimizer.step()
#ADAM_optimizer.step()
output:
Epoch 0/50: 0.0%
torch.Size([3, 100])
torch.Size([3, 100, 300])
torch.Size([3, 100, 256])
Output size:
torch.Size([3, 100, 1024])
Target size:
torch.Size([3])
error:
ValueError: Expected 2 or 4 dimensions (got 3)
EDITED -------------------------------------------------------------------
I now have my model training, but I am getting low accuracy. Is there an issue with the way my LSTM outputs are concatenated and then condensed to a smaller tensor before going through my linear layer?
New Model:
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256, num_classes=3, lstm_layers=1):
super(myLSTM, self).__init__()
self.num_layers = lstm_layers
self.hidden_size = h_size
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=lstm_layers, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(2 * h_size * 2, num_classes)
# Set static embedding vectors
self.embedding.weight.requires_grad = False
def forward(self, s1, s2):
# Set initial states
#h0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda() # 2 for bidirection
#c0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda()
batch_size = s1.size()[0]
embeds_1 = self.embedding(s1)
embeds_2 = self.embedding(s2)
_, (h_1_last, _) = self.lstm(embeds_1)#, (h0, c0)) #note the change here. Last hidden state is taken
_, (h_2_last, _) = self.lstm(embeds_2)#, (h0, c0))
concat = torch.cat( (h_1_last, h_2_last), dim=2) #double check the dimension
concat = concat.view(batch_size, -1)
scores = self.mlp(concat)
return scores
New Training
batch_size = 64
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
num_epochs = 10
model.train()
if cuda:
model = model.cuda()
criterion = criterion.cuda()
from torch.autograd import Variable
from torch import optim
epoch_losses = []
for epoch in range(num_epochs):
print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, 100*float(epoch)/num_epochs))
# Batch loss aggregator
losses = []
for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
# Convert minibatch to numpy
s1, s2, y = convert_to_numpy(n_data[start:end])
# Convert numpy to Tensor
s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.LongTensor)
s1 = Variable(s1_tensor)
s2 = Variable(s2_tensor)
target = Variable(target_tensor)
if cuda:
s1 = s1.cuda()
s2 = s2.cuda()
target = target.cuda()
# Zero gradients
SGD_optimizer.zero_grad()
# Forward Pass
output = model.forward(s1,s2)
# Calculate loss with respect to training labels
loss = criterion(output, target)
losses.append(loss.data[0])
# Backprogogate and update optimizer
loss.backward()
SGD_optimizer.step()
# concat losses to epoch losses
epoch_losses += losses
training with tensor sizes printed:
Epoch 0/10: 0.0%
Batch size: 64
Sentences
torch.Size([64, 50])
torch.Size([64, 50])
torch.Size([64, 50, 300])
torch.Size([64, 50, 300])
Hidden states
torch.Size([2, 64, 128])
torch.Size([2, 64, 128])
Concatenated hidden states
torch.Size([2, 64, 256])
Reshaped tensors for linear layer
torch.Size([64, 512])
Linear propogation
torch.Size([64, 3])
Evaluation
def eval_model(model, mode='dev'):
file_name = 'snli_1.0/snli_1.0_dev.jsonl' if mode == 'dev' else 'snli_1.0/snli_1.0_test.jsonl'
dev_data, _ = obtain_data(file_name)
dev_n_data = vocab.process_data(dev_data)
print("Length of data: {}".format(len(dev_n_data)))
eval_batch_size = 1024
model.eval()
total = len(dev_n_data)
hit = 0
correct = 0
# Batch dev eval
for start, end in batch_index_gen(eval_batch_size, len(dev_n_data)):
s1, s2, y = convert_to_numpy(dev_n_data[start:end])
s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.LongTensor)
s1 = Variable(s1_tensor, volatile=True)
s2 = Variable(s2_tensor, volatile=True)
target = Variable(target_tensor, volatile=True)
if cuda:
s1 = s1.cuda()
s2 = s2.cuda()
target = target.cuda()
output = model.forward(s1,s2)
loss = criterion(output, target)
#print("output size: {}".format(output.size()))
#print("target size: {}".format(target.size()))
pred = output.data.max(1)[1] # get the index of the max log-probability
#print(pred[:5])
#print(output[:])
correct += pred.eq(target.data).cpu().sum()
return correct / float(total)
eval_model(model)
I think there is an issue in the way you are trying to solve the entailment problem.
Maybe you can do it this way:
design your module to accept two sentences as input
embed both of them with your embeddings
encode them using the LSTM module
now you have two fixed-length vector representations of the two sentences; the simplest thing to do is to just concatenate them together
add a linear layer on top to evaluate scores for each entailment class (3, I suppose)
apply softmax to get a proper probability distribution
So your model can look like this (double check the dimensions):
class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, num_classes=3):
        super(myLSTM, self).__init__()
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(2 * h_size * 2, num_classes)  # <- change here
    def forward(self, sentence1, sentence2):
        embeds_1 = self.embedding(sentence1)
        embeds_2 = self.embedding(sentence2)
        _, (h_1_last, _) = self.lstm(embeds_1)  # note the change here: the last hidden state is taken
        _, (h_2_last, _) = self.lstm(embeds_2)
        # h_*_last: [num_directions, batch, h_size] -> [batch, 2 * h_size]
        h_1_last = h_1_last.transpose(0, 1).contiguous().view(sentence1.size(0), -1)
        h_2_last = h_2_last.transpose(0, 1).contiguous().view(sentence2.size(0), -1)
        concat = torch.cat([h_1_last, h_2_last], dim=1)  # double check the dimension
        scores = self.mlp(concat)
        # CrossEntropyLoss applies log-softmax itself, so return raw scores for training;
        # F.softmax(scores, dim=1) is only needed to get probabilities at inference time
        return scores
Then you can play around with adding more hidden layers, or with thinking about how the two sentences could be combined in a more intelligent way (attention, etc.).
Double check what CrossEntropyLoss accepts as input and target and adjust accordingly (does it expect unnormalized class scores or a probability distribution?). Check http://pytorch.org/docs/master/nn.html#lstm for the LSTM module documentation to clarify what LSTM returns (do you need the hidden states for every word or just the representation after the last one).
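To make that last point concrete, nn.CrossEntropyLoss expects unnormalized class scores of shape [batch, C] and integer class indices of shape [batch]; it applies log-softmax internally, so no softmax should be applied beforehand. A tiny sanity check:
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
scores = torch.randn(64, 3)            # unnormalized class scores, [batch, C]
targets = torch.randint(0, 3, (64,))   # integer class indices, [batch] (not one-hot)
loss = criterion(scores, targets)      # log-softmax is applied internally
print(loss.item())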