Implementing attention mechanism for neural machine translation

Implementing attention mechanism for neural machine translation - python

class Attention(nn.Module):
def __init__(self, hidden_size):
super(Attention, self).__init__()
self.hidden_size = hidden_size
# Create a two layer fully-connected network. Hint: Use nn.Sequential
# hidden_size*2 --> hidden_size, ReLU, hidden_size --> 1
self.attention_network = nn.Sequential(
nn.Linear(hidden_size*2, hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, 1))
self.softmax = nn.Softmax(dim=1)
def forward(self, hidden, annotations):
"""The forward pass of the attention mechanism.
Arguments:
hidden: The current decoder hidden state. (batch_size x hidden_size)
annotations: The encoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)
Returns:
output: Normalized attention weights for each encoder hidden state. (batch_size x seq_len x 1)
The output must be a softmax weighting over the seq_len annotations.
"""
batch_size, seq_len, hid_size = annotations.size()
expanded_hidden = hidden.unsqueeze(1).expand_as(annotations)
# concat = ...
# reshaped_for_attention_net = ...
# attention_net_output = ...
# unnormalized_attention = ... # Reshape attention net output to have dimension batch_size x seq_len x 1
return self.softmax(unnormalized_attention)
In the forward function this is what I've tried:
concat = torch.cat((expanded_hidden, annotations), 2)
unnormalized_attention = self.attention_network(concat)
I'm trying to figure out
concat = ...
reshaped_for_attention_net = ...
attention_net_output = ...
unnormalized_attention = ...

Related

My RNN with attention model always predict the same class even if my data is not imbalanced

This is my attention layer code :
implementation of attention layer
**class Attention(nn.Module):
def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
super(Attention, self).__init__(**kwargs)
self.supports_masking = True
self.bias = bias
self.feature_dim = feature_dim
self.step_dim = step_dim
self.features_dim = 0
weight = torch.zeros(feature_dim, 1)
nn.init.kaiming_uniform_(weight)
self.weight = nn.Parameter(weight)
if bias:
self.b = nn.Parameter(torch.zeros(step_dim))
def forward(self, x, mask=None):
feature_dim = self.feature_dim
step_dim = self.step_dim
eij = torch.mm(
x.contiguous().view(-1, feature_dim),
self.weight
).view(-1, step_dim)
if self.bias:
eij = eij + self.b
eij = torch.tanh(eij)
a = torch.exp(eij)
if mask is not None:
a = a * mask
a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)
weighted_input = x * torch.unsqueeze(a, -1)
return torch.sum(weighted_input, 1)**
This is RNN codes :
**# Instantiate the model w/ hyperparams
weights_matrix = weights_matrix
output_size = 13 # number of classes to predict
hidden_dim = 64
drop_prob = 0.5
# The RNN model that will be used to perform classification
class AttentionLSTM(nn.Module):
def __init__(self, weights_matrix, output_size, hidden_dim, drop_prob):
super(AttentionLSTM, self).__init__()
# embedding layers
self.embedding, self.num_embeddings, self.embeddings_size = create_emb_layer(weights_matrix, True)
# embedding dropout
self.dropout = nn.Dropout2d(drop_prob)
# First lstm and GRU layers
self.lstm1 = nn.LSTM(self.embeddings_size, hidden_dim, batch_first=True, bidirectional=True)
self.gru1 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
# attention layer
self.attention = Attention(hidden_dim*2, seq_length)
# Second lstm and GRU layers
self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
self.gru2 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
# linear
self.fc = nn.Linear(hidden_dim * 2, hidden_dim * 2)
self.out = nn.Linear(hidden_dim * 2, output_size)
# activation functions
self.sigmoid = nn.Sigmoid() # for hidden layers
self.softmax = nn.Softmax(dim=1) # for output layer
def forward(self, x):
batch_size = x.size(0)
# embedding output
x = x.long()
embeds = self.embedding(x)
embeds = torch.squeeze(torch.unsqueeze(embeds, 0))
# lstm, and gru outputs
lstm_out1, _ = self.lstm1(embeds)
gru_out1, _ = self.gru1(lstm_out1)
gru_out1 = gru_out1.view(batch_size, -1, hidden_dim * 2)
attention_out = self.attention(gru_out1, seq_length)
attention_out = attention_out.view(batch_size, -1, hidden_dim * 2)
attention_out = self.sigmoid(attention_out)
lstm_out2, _ = self.lstm2(attention_out)
# slice lstm_out to just get output of last element of the input sequence
lstm_out2 = lstm_out2[:, -1]
gru_out2, _ = self.gru2(lstm_out2)
# linear outputs
fc_out = self.softmax(self.fc(gru_out2))
final_out = self.out(fc_out)
return final_out**
I am sure that my dataset is balanced after pre-processing step but my model always predict the same output. Precision and fscore are changing for each input, however, this problem makes my recall score 1.0 since output is always same whatever input is.
If anybody help me, i will be appreciated

It required some time to build networks from your requirements but I provided a few samples to create a customer layer or model, you start from an embedded layer and suddenly random leaves of data create different input every time GRU and LSTM learning layers may provide good results when they had :
Matching input and target layer and parameters.
Learning scopes when they can differentiate input, repeating of gated current, and LSTM is specifically used when patterns of data are
significant such as pictures or continue data.
Linear, and Sigmoid provide contrast differentiate and softmax sometime we required when compared based on distribution values. This
is supposed to create contrast output excepted softmax applied on
weights of values.
Loss Fn is based on a similar output dimension/expectation
[ Sample ]:
class create_emb_layer( tf.keras.layers.Embedding ):
def __init__( self, weights_matrix, bidirectional=True ):
self.num_embeddings = weights_matrix[0]
self.embeddings_size = weights_matrix[1]
self.bidirectional = bidirectional
super(create_emb_layer, self).__init__( self.embeddings_size, self.num_embeddings )
def build(self, input_shape):
self.kernel = self.add_weight("kernel",
shape=[int(input_shape[-1]),
self.input_dim])
def call(self, inputs):
return tf.matmul(inputs, self.kernel)
[ My model ]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Initialize
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model = tf.keras.models.Sequential([
tf.keras.layers.InputLayer(input_shape=( 32, 32, 4 )),
tf.keras.layers.Normalization(mean=3., variance=2.),
tf.keras.layers.Normalization(mean=4., variance=6.),
tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
tf.keras.layers.MaxPooling2D((2, 2)),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Reshape((128, 225)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96, return_sequences=True, return_state=False)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96)),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(192, activation='relu'),
tf.keras.layers.Dense(10),
])
[ Output ]:

When to calculate Attention in encoder-decoder network?

I have been learning about Attention, and after reading some articles I am confused "when is attention calculated?". Some calculate attention before giving it to GRU (or LSTM) and some calculate attention after GRU. Here are two types of Decoder layer and both works, take a look at it:
Calculating Attention Before GRU:
Here, hidden is of shape (batch_size, 1, embedding_dim) aka Query and enc_output is (batch_size, max_len, embedding_dim) aka value.
class Decoder(tf.keras.layers.Layer):
def __init__(self, vocab_size, embedding_dim, units):
super().__init__()
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True)
self.attention = tf.keras.layers.AdditiveAttention()
self.fc = tf.keras.layers.Dense(vocab_size)
def call(self, x, hidden, encoder_output):
x = self.embedding(x)
hidden = tf.expand_dims(hidden, axis=1)
context_vector, attention_score = self.attention(
[hidden, encoder_output],
return_attention_scores=True
)
x = tf.concat([context_vector, x], -1)
output, state = self.gru(x)
output = tf.reshape(output, (-1, output.shape[2]))
x = self.fc(output)
return x, state, attention_score
After GRU/LSTM:
class Decoder(tf.keras.Model):
def __init__(self, vocab_size, embedding_dim, hidden_dim, attention_func):
super(Decoder, self).__init__()
self.attention = LuongAttention(hidden_dim, attention_func)
self.hidden_dim = hidden_dim
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=True, return_state=True)
self.wc = tf.keras.layers.Dense(hidden_dim, activation='tanh')
self.ws = tf.keras.layers.Dense(vocab_size)
def call(self, input_sequence, state, encoder_output):
# Remember that the input to the decoder
# is now a batch of one-word sequences,
# which means that its shape is (batch_size, 1)
embed = self.embedding(input_sequence)
# Therefore, the lstm_out has shape (batch_size, 1, hidden_dim)
lstm_out, state_h, state_c = self.lstm(embed, initial_state=state)
# Use self.attention to compute the context and alignment vectors
# context vector's shape: (batch_size, 1, hidden_dim)
# alignment vector's shape: (batch_size, 1, source_length)
context, alignment = self.attention(lstm_out, encoder_output)
# Combine the context vector and the LSTM output
# Before combined, both have shape of (batch_size, 1, hidden_dim),
# so let's squeeze the axis 1 first
# After combined, it will have shape of (batch_size, 2 * hidden_dim)
lstm_out = tf.concat([tf.squeeze(context, 1), tf.squeeze(lstm_out, 1)], 1)
# lstm_out now has shape (batch_size, hidden_dim)
lstm_out = self.wc(lstm_out)
# Finally, it is converted back to vocabulary space: (batch_size, vocab_size)
logits = self.ws(lstm_out)
return logits, state_h, state_c, alignment
# Reference: https://github.com/edumunozsala/NMT-encoder-decoder-Attention/blob/main/Intro-seq2seq-Encoder-Decoder-ENG-SPA-translator-tf2.ipynb
In both the cases, Attention is calculated at different steps. Which is True? When I start the training with both these Decoder layer it works the training starts. But question is which is the more reliable way to add Attention to Decoder layer?

Question about a time-series prediction LSTM with attention mechanism

I am working with time-series prediction with a simple LSTM model, I want to improve performance of my model, so I wonder how to add attention mechanism to my model. Here are codes of my model,
class RNN_LSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN_LSTM, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
self.dropout = nn.Dropout(p = drop_rate)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM, out = [batch_size, seq_len, hidden_size]
out, _ = self.lstm(
x, (h0, c0)
)
# out: tensor of shape (batch_size, seq_length, hidden_size)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
# out = out[:,-1,:].reshape(-1,1,144)
return out
I would be greatly thankful if you can provide any useful advise.

ValueError: Shapes (None, 5) and (None, 15, 5) are incompatible

I want to implement a Hierarchical attention mechanism for document classification presented by Yang. But I want to replace LSTM with Transformer.
I used Apoorv Nandan's text classification with Transformer:
https://keras.io/examples/nlp/text_classification_with_transformer/
I have implemented Transformer hierarchically to classification. One for sentence representation and another one for document representation. The code is as follow:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
class MultiHeadSelfAttention(layers.Layer):
def __init__(self, embed_dim, num_heads=8):
super(MultiHeadSelfAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
if embed_dim % num_heads != 0:
raise ValueError(
f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
)
self.projection_dim = embed_dim // num_heads
self.query_dense = layers.Dense(embed_dim)
self.key_dense = layers.Dense(embed_dim)
self.value_dense = layers.Dense(embed_dim)
self.combine_heads = layers.Dense(embed_dim)
def attention(self, query, key, value):
score = tf.matmul(query, key, transpose_b=True)
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
scaled_score = score / tf.math.sqrt(dim_key)
weights = tf.nn.softmax(scaled_score, axis=-1)
output = tf.matmul(weights, value)
return output, weights
def separate_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, inputs):
# x.shape = [batch_size, seq_len, embedding_dim]
batch_size = tf.shape(inputs)[0]
query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim)
key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim)
value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim)
query = self.separate_heads(
query, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
key = self.separate_heads(
key, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
value = self.separate_heads(
value, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
attention, weights = self.attention(query, key, value)
attention = tf.transpose(
attention, perm=[0, 2, 1, 3]
) # (batch_size, seq_len, num_heads, projection_dim)
concat_attention = tf.reshape(
attention, (batch_size, -1, self.embed_dim)
) # (batch_size, seq_len, embed_dim)
output = self.combine_heads(
concat_attention
) # (batch_size, seq_len, embed_dim)
return output
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
super(TransformerBlock, self).__init__(name=name)
self.att = MultiHeadSelfAttention(embed_dim, num_heads)
self.ffn = keras.Sequential(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim), ]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(dropout_rate)
self.dropout2 = layers.Dropout(dropout_rate)
def call(self, inputs, training):
attn_output = self.att(inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim, name=None):
super(TokenAndPositionEmbedding, self).__init__(name=name)
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
def compute_output_shape(self, input_shape):
# it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
return input_shape + (self.pos_emb.output_dim,)
# Lower level (produce a representation of each sentence):
embed_dim = 100 # Embedding size for each token
num_heads = 2 # Number of attention heads
ff_dim = 64 # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100 # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size = 1000
class_number = 5
max_docs = 10000
max_sentences = 15
max_words = 60
word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate, name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu", name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
word_encoder.summary()
# =========================================================================
# Upper level (produce a representation of each document):
L2_dense_units = 100
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
# This is the line producing "NotImplementedError":
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_dense = layers.TimeDistributed(layers.Dense(int(L2_dense_units)),name='sentence_dense')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)
preds = layers.Dense(class_number , activation='softmax', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
model.summary()
#==========================================================================
Everything is OK(for testing you can copy and paste it in googlecolab). But when I compile and fit the model by following codes, it throws an error:
X = tf.random.uniform(shape=(max_docs, max_sentences, max_words), minval=1, maxval=1000, dtype=tf.dtypes.int32, seed=1)
y = tf.random.uniform(shape=(max_docs, ), minval=0, maxval=class_number , dtype=tf.dtypes.int32, seed=1)
y = to_categorical(y)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
X, y, batch_size=32, epochs=25,
)
The error is:
ValueError: Shapes (None, 5) and (None, 15, 5) are incompatible

When I had a similar error, I found that a Flatten() layer helped, I had incompatible shapes of (None, x, y) and (None, y).
If you try to provide a flatten layer for the part that gives you the (None, 15, 5), then it should output something like (None, 75).
The flatten layer merely removes dimensions, when I was doing this I got the output as (None, xy) and due to the way Tensorflow works, it was able to match both shapes as xy is obviously a factor of just y.

pytorch, Using nn.DataParallel in LSTM

/pytorch/aten/src/ATen/native/cudnn/RNN.cpp:1266: UserWarning: RNN module weights are not part of single contiguous chunk of memory.
This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters().
Hello. I am using pytorch.
I am trying to use DataParallel function in pytorch,
but the model is LSTM. I'm warned to flatten the model again,
but I don't know when and where to flatten.
Can you let me know?
This is my model
import torch.nn as nn
from torchvision import models
class ConvLstm(nn.Module):
def __init__(self, latent_dim, model, hidden_size, lstm_layers, bidirectional, n_class):
super(ConvLstm, self).__init__()
self.conv_model = Pretrained_conv(latent_dim, model)
self.Lstm = Lstm(latent_dim, hidden_size, lstm_layers, bidirectional)
self.output_layer = nn.Sequential(
nn.Linear(2 * hidden_size if bidirectional ==
True else hidden_size, n_class),
nn.Softmax(dim=-1)
)
def forward(self, x):
batch_size, timesteps, channel_x, h_x, w_x = x.shape
conv_input = x.view(batch_size * timesteps, channel_x, h_x, w_x)
conv_output = self.conv_model(conv_input)
lstm_input = conv_output.view(batch_size, timesteps, -1)
lstm_output = self.Lstm(lstm_input)
lstm_output = lstm_output[:, -1, :]
output = self.output_layer(lstm_output)
return output
class Pretrained_conv(nn.Module):
def __init__(self, latent_dim, model):
if model == 'resnet152':
super(Pretrained_conv, self).__init__()
self.conv_model = models.resnet152(pretrained=True)
# ====== freezing all of the layers ======
for param in self.conv_model.parameters():
param.requires_grad = False
# ====== changing the last FC layer to an output with the size we need. this layer is un freezed ======
self.conv_model.fc = nn.Linear(
self.conv_model.fc.in_features, latent_dim)
def forward(self, x):
return self.conv_model(x)
class Lstm(nn.Module):
def __init__(self, latent_dim, hidden_size, lstm_layers, bidirectional):
super(Lstm, self).__init__()
self.Lstm = nn.LSTM(latent_dim, hidden_size=hidden_size,
num_layers=lstm_layers, batch_first=True, bidirectional=bidirectional)
self.hidden_state = None
def reset_hidden_state(self):
self.hidden_state = None
def forward(self, x):
output, self.hidden_state = self.Lstm(x, self.hidden_state)
return output
Enter LSTM and execute the following code.
def foward_step(model, images, labels, criterion, mode=''):
model.module.Lstm.reset_hidden_state()
if mode == 'test':
with torch.no_grad():
output = model(images)
else:
output = model(images)
loss = criterion(output, labels)
# Accuracy calculation
predicted_labels = output.detach().argmax(dim=1)
acc = (predicted_labels == labels).cpu().numpy().sum()
return loss, acc, predicted_labels.cpu()
This is main
model = nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Implementing attention mechanism for neural machine translation - python

Related

My RNN with attention model always predict the same class even if my data is not imbalanced

When to calculate Attention in encoder-decoder network?

Question about a time-series prediction LSTM with attention mechanism

ValueError: Shapes (None, 5) and (None, 15, 5) are incompatible

pytorch, Using nn.DataParallel in LSTM

Categories

Resources