I am working on time-series prediction with a simple LSTM model and I want to improve its performance, so I am wondering how to add an attention mechanism to the model. Here is the code for my model:
class RNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
        self.dropout = nn.Dropout(p=drop_rate)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        # Forward propagate LSTM, out = [batch_size, seq_len, hidden_size]
        out, _ = self.lstm(x, (h0, c0))

        # out: tensor of shape (batch_size, seq_length, hidden_size)
        out = out.reshape(out.shape[0], -1)

        # Decode the hidden state of the last time step
        out = self.fc(out)
        # out = out[:,-1,:].reshape(-1,1,144)
        return out
I would be grateful for any useful advice.
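One common pattern, offered here only as a starting point, is to score each LSTM time step with a small learned layer, softmax the scores over the sequence, and feed the attention-weighted sum of the hidden states to the final linear layer. The sketch below is illustrative rather than a drop-in replacement: the class name AttnLSTM and the attn layer are invented names, and input_size, hidden_size, num_layers, num_classes are assumed to match your existing hyperparameters.

import torch
import torch.nn as nn

class AttnLSTM(nn.Module):
    # Illustrative sketch: LSTM followed by additive attention pooling over time
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.attn = nn.Linear(hidden_size, 1)           # scores each time step's hidden state
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out, _ = self.lstm(x)                           # (batch, seq_len, hidden_size)
        scores = self.attn(out)                         # (batch, seq_len, 1)
        weights = torch.softmax(scores, dim=1)          # normalize over the time dimension
        context = (weights * out).sum(dim=1)            # (batch, hidden_size)
        return self.fc(context)                         # (batch, num_classes)

Because the weighted sum collapses the sequence dimension, the final linear layer no longer needs the hidden_size * sequence_length input of your current model.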
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        # Create a two-layer fully-connected network. Hint: Use nn.Sequential
        # hidden_size*2 --> hidden_size, ReLU, hidden_size --> 1
        self.attention_network = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1))
        self.softmax = nn.Softmax(dim=1)

    def forward(self, hidden, annotations):
        """The forward pass of the attention mechanism.

        Arguments:
            hidden: The current decoder hidden state. (batch_size x hidden_size)
            annotations: The encoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)

        Returns:
            output: Normalized attention weights for each encoder hidden state. (batch_size x seq_len x 1)
                The output must be a softmax weighting over the seq_len annotations.
        """
        batch_size, seq_len, hid_size = annotations.size()
        expanded_hidden = hidden.unsqueeze(1).expand_as(annotations)
        # concat = ...
        # reshaped_for_attention_net = ...
        # attention_net_output = ...
        # unnormalized_attention = ...  # Reshape attention net output to have dimension batch_size x seq_len x 1
        return self.softmax(unnormalized_attention)
In the forward function this is what I've tried:
concat = torch.cat((expanded_hidden, annotations), 2)
unnormalized_attention = self.attention_network(concat)
I'm trying to figure out
concat = ...
reshaped_for_attention_net = ...
attention_net_output = ...
unnormalized_attention = ...
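For reference, one way to fill those in that follows the docstring hints (reshape the concatenation so the two-layer network can be applied, then reshape the scores back to batch_size x seq_len x 1) is the following sketch, using the variables already defined in forward:

concat = torch.cat((expanded_hidden, annotations), dim=2)                    # (batch_size, seq_len, hidden_size * 2)
reshaped_for_attention_net = concat.reshape(-1, hid_size * 2)                # (batch_size * seq_len, hidden_size * 2)
attention_net_output = self.attention_network(reshaped_for_attention_net)    # (batch_size * seq_len, 1)
unnormalized_attention = attention_net_output.view(batch_size, seq_len, 1)   # (batch_size, seq_len, 1)

Applying self.attention_network directly to the 3-D concat, as in your attempt, also works because nn.Linear operates on the last dimension; the explicit reshape simply mirrors the hint's intermediate names.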
This is my attention layer code (the implementation of the attention layer):
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        weight = torch.zeros(feature_dim, 1)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim),
            self.weight
        ).view(-1, step_dim)
        if self.bias:
            eij = eij + self.b
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        if mask is not None:
            a = a * mask
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)
        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)
This is the RNN code:
# Instantiate the model w/ hyperparams
weights_matrix = weights_matrix
output_size = 13  # number of classes to predict
hidden_dim = 64
drop_prob = 0.5

# The RNN model that will be used to perform classification
class AttentionLSTM(nn.Module):
    def __init__(self, weights_matrix, output_size, hidden_dim, drop_prob):
        super(AttentionLSTM, self).__init__()
        # embedding layers
        self.embedding, self.num_embeddings, self.embeddings_size = create_emb_layer(weights_matrix, True)
        # embedding dropout
        self.dropout = nn.Dropout2d(drop_prob)
        # First LSTM and GRU layers
        self.lstm1 = nn.LSTM(self.embeddings_size, hidden_dim, batch_first=True, bidirectional=True)
        self.gru1 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
        # attention layer
        self.attention = Attention(hidden_dim * 2, seq_length)
        # Second LSTM and GRU layers
        self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.gru2 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
        # linear
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim * 2)
        self.out = nn.Linear(hidden_dim * 2, output_size)
        # activation functions
        self.sigmoid = nn.Sigmoid()          # for hidden layers
        self.softmax = nn.Softmax(dim=1)     # for output layer

    def forward(self, x):
        batch_size = x.size(0)
        # embedding output
        x = x.long()
        embeds = self.embedding(x)
        embeds = torch.squeeze(torch.unsqueeze(embeds, 0))
        # LSTM and GRU outputs
        lstm_out1, _ = self.lstm1(embeds)
        gru_out1, _ = self.gru1(lstm_out1)
        gru_out1 = gru_out1.view(batch_size, -1, hidden_dim * 2)
        attention_out = self.attention(gru_out1, seq_length)
        attention_out = attention_out.view(batch_size, -1, hidden_dim * 2)
        attention_out = self.sigmoid(attention_out)
        lstm_out2, _ = self.lstm2(attention_out)
        # slice lstm_out to just get the output of the last element of the input sequence
        lstm_out2 = lstm_out2[:, -1]
        gru_out2, _ = self.gru2(lstm_out2)
        # linear outputs
        fc_out = self.softmax(self.fc(gru_out2))
        final_out = self.out(fc_out)
        return final_out
I am sure that my dataset is balanced after the pre-processing step, but my model always predicts the same output. Precision and F-score change for each input; however, this problem makes my recall score 1.0, since the output is always the same regardless of the input.
I would appreciate any help.
It took some time to build a network from your requirements, but here are a few samples for creating a custom layer or model. You start from an embedding layer, and random batches of data create a different input every time; GRU and LSTM learning layers can provide good results when they have:
Matching input and target layers and parameters.
Enough learning scope to differentiate the input; the gated recurrence is repeated, and LSTM in particular is used when the patterns in the data are significant, such as pictures or continuous data.
Linear and Sigmoid layers to provide contrast for differentiation; softmax is sometimes required when values are compared as a distribution, and it is expected to create contrasting output when applied to the output weights.
A loss function that matches the output dimension/expectation.
[ Sample ]:
class create_emb_layer(tf.keras.layers.Embedding):
    def __init__(self, weights_matrix, bidirectional=True):
        self.num_embeddings = weights_matrix[0]
        self.embeddings_size = weights_matrix[1]
        self.bidirectional = bidirectional
        super(create_emb_layer, self).__init__(self.embeddings_size, self.num_embeddings)

    def build(self, input_shape):
        self.kernel = self.add_weight("kernel",
                                      shape=[int(input_shape[-1]), self.input_dim])

    def call(self, inputs):
        return tf.matmul(inputs, self.kernel)
[ My model ]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Initialize
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=(32, 32, 4)),
    tf.keras.layers.Normalization(mean=3., variance=2.),
    tf.keras.layers.Normalization(mean=4., variance=6.),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Reshape((128, 225)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96, return_sequences=True, return_state=False)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(192, activation='relu'),
    tf.keras.layers.Dense(10),
])
I have been learning about attention, and after reading some articles I am confused about when attention is calculated. Some implementations calculate attention before feeding it to the GRU (or LSTM), and some calculate attention after the GRU. Here are two types of decoder layer, and both work; take a look at them:
Calculating Attention Before GRU:
Here, hidden is of shape (batch_size, 1, embedding_dim) aka Query and enc_output is (batch_size, max_len, embedding_dim) aka value.
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True)
        self.attention = tf.keras.layers.AdditiveAttention()
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden, encoder_output):
        x = self.embedding(x)
        hidden = tf.expand_dims(hidden, axis=1)
        context_vector, attention_score = self.attention(
            [hidden, encoder_output],
            return_attention_scores=True
        )
        x = tf.concat([context_vector, x], -1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_score
Calculating Attention After GRU/LSTM:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, attention_func):
        super(Decoder, self).__init__()
        self.attention = LuongAttention(hidden_dim, attention_func)
        self.hidden_dim = hidden_dim
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=True, return_state=True)
        self.wc = tf.keras.layers.Dense(hidden_dim, activation='tanh')
        self.ws = tf.keras.layers.Dense(vocab_size)

    def call(self, input_sequence, state, encoder_output):
        # Remember that the input to the decoder
        # is now a batch of one-word sequences,
        # which means that its shape is (batch_size, 1)
        embed = self.embedding(input_sequence)
        # Therefore, lstm_out has shape (batch_size, 1, hidden_dim)
        lstm_out, state_h, state_c = self.lstm(embed, initial_state=state)
        # Use self.attention to compute the context and alignment vectors
        # context vector's shape: (batch_size, 1, hidden_dim)
        # alignment vector's shape: (batch_size, 1, source_length)
        context, alignment = self.attention(lstm_out, encoder_output)
        # Combine the context vector and the LSTM output
        # Before being combined, both have shape (batch_size, 1, hidden_dim),
        # so squeeze axis 1 first
        # After being combined, it will have shape (batch_size, 2 * hidden_dim)
        lstm_out = tf.concat([tf.squeeze(context, 1), tf.squeeze(lstm_out, 1)], 1)
        # lstm_out now has shape (batch_size, hidden_dim)
        lstm_out = self.wc(lstm_out)
        # Finally, it is converted back to vocabulary space: (batch_size, vocab_size)
        logits = self.ws(lstm_out)
        return logits, state_h, state_c, alignment

# Reference: https://github.com/edumunozsala/NMT-encoder-decoder-Attention/blob/main/Intro-seq2seq-Encoder-Decoder-ENG-SPA-translator-tf2.ipynb
In the two cases, attention is calculated at a different step. Which one is correct? When I start training with either of these decoder layers, it works and training starts. But my question is: which is the more reliable way to add attention to a decoder layer?
I'm trying to implement a program that compares LSTM performance vs GRU performance for word prediction. I am using the same parameters for both of them; however, while I am getting good perplexity values for the LSTM, the GRU values I'm getting are absolutely terrible.
I recently attempted to debug the training function, since it originally only ran for the LSTM model and not for the GRU model. As I already said, both models should get similar values, but for now the LSTM model starts at around ~150 perplexity and converges to a normal value, while the GRU model starts at some random value in the 1000s and does not converge at all.
I am quite new to all the RNN, LSTM, and GRU stuff, so forgive me if there's something obvious that I am missing.
Any help would be appreciated!
I use the following two models:
class LSTM_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(LSTM_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        x = self.embed(x)
        out, (hidden_state, cell_state) = self.lstm(x, hidden_state)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out)
        return out, (hidden_state, cell_state)
class GRU_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(GRU_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        x = self.embed(x)
        out, hidden_state = self.gru(x, hidden_state)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out)
        return out, hidden_state
Training function:
def run_model(model, epochs=epochs, learning_rate=learning_rate, clip=clip, momentum=momentum, LSTM=True, GRU=False, Dropout=False):
    # Define loss criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=decay_rate)

    train_perplexity, test_perplexity, valid_perplexity = [], [], []

    # Train the model
    for e in range(epochs):
        # Set all initial hidden and cell states to zeroes
        train_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        test_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        valid_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)

        # RUN TRAINING SET #
        model.train()
        for i in range(0, ids.size(1) - seq_length, seq_length):
            # Set train_inputs and train_targets
            train_inputs = ids[:, i:i+seq_length].to(device)
            train_targets = ids[:, (i+1):(i+1)+seq_length].to(device)

            # Forward pass
            model.zero_grad()
            if LSTM == True:
                train_states = [state.detach() for state in train_states]  # Detach the hidden state from how it was previously produced
            if GRU == True:
                train_states = train_states.data  # detach?
            train_outputs, train_states = model(train_inputs, train_states)
            train_loss = criterion(train_outputs, train_targets.reshape(-1))

            # Backward and optimize
            train_loss.backward()
            clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            lr_scheduler.step()

        model.eval()
        with torch.no_grad():
            pass  # test and validation, removed to reduce length
        model.train()  # reset to train mode after iterating through validation data

        train_perplexity.append(math.exp(train_loss.item()))
        test_perplexity.append(np.exp(np.mean(test_losses)))
        valid_perplexity.append(np.exp(np.mean(valid_losses)))

        print('Epoch ' + str(e+1) + '/' + str(epochs) + ': ')
        print('Train Perplexity - ' + str(train_perplexity[e]))
        print('Test Perplexity - ' + str(test_perplexity[e]))
        print('Validation Perplexity - ' + str(valid_perplexity[e]))
        print("----------------------------------------------------")

    return train_perplexity, test_perplexity, valid_perplexity
Hidden state initialization:
def init_states(LSTM, GRU, num_layers=num_layers, batch_size=batch_size, hidden_size=hidden_size):
    if LSTM == True:
        return (torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device),
                torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device))
    if GRU == True:
        return torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device)
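Regarding the "detach?" comment in the training loop: the LSTM state is a tuple while the GRU state is a single tensor, so one way to handle both consistently (using .detach() rather than .data so autograd bookkeeping stays intact) is a small helper like the sketch below. The name detach_states is invented for illustration and is not part of the code above.

def detach_states(states):
    # Detach hidden state(s) from the previous graph so backprop does not reach into earlier batches
    if isinstance(states, tuple):
        return tuple(s.detach() for s in states)   # LSTM: (h, c)
    return states.detach()                         # GRU: h

Inside the batch loop, train_states = detach_states(train_states) could then replace the two if branches.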
/pytorch/aten/src/ATen/native/cudnn/RNN.cpp:1266: UserWarning: RNN module weights are not part of single contiguous chunk of memory.
This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters().
Hello. I am using PyTorch.
I am trying to use the DataParallel function in PyTorch, but the model is an LSTM. I'm warned to flatten the weights again, but I don't know when and where to do that.
Can you let me know?
This is my model
import torch.nn as nn
from torchvision import models

class ConvLstm(nn.Module):
    def __init__(self, latent_dim, model, hidden_size, lstm_layers, bidirectional, n_class):
        super(ConvLstm, self).__init__()
        self.conv_model = Pretrained_conv(latent_dim, model)
        self.Lstm = Lstm(latent_dim, hidden_size, lstm_layers, bidirectional)
        self.output_layer = nn.Sequential(
            nn.Linear(2 * hidden_size if bidirectional == True else hidden_size, n_class),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        batch_size, timesteps, channel_x, h_x, w_x = x.shape
        conv_input = x.view(batch_size * timesteps, channel_x, h_x, w_x)
        conv_output = self.conv_model(conv_input)
        lstm_input = conv_output.view(batch_size, timesteps, -1)
        lstm_output = self.Lstm(lstm_input)
        lstm_output = lstm_output[:, -1, :]
        output = self.output_layer(lstm_output)
        return output

class Pretrained_conv(nn.Module):
    def __init__(self, latent_dim, model):
        if model == 'resnet152':
            super(Pretrained_conv, self).__init__()
            self.conv_model = models.resnet152(pretrained=True)
            # ====== freezing all of the layers ======
            for param in self.conv_model.parameters():
                param.requires_grad = False
            # ====== changing the last FC layer to an output with the size we need. this layer is un freezed ======
            self.conv_model.fc = nn.Linear(self.conv_model.fc.in_features, latent_dim)

    def forward(self, x):
        return self.conv_model(x)

class Lstm(nn.Module):
    def __init__(self, latent_dim, hidden_size, lstm_layers, bidirectional):
        super(Lstm, self).__init__()
        self.Lstm = nn.LSTM(latent_dim, hidden_size=hidden_size,
                            num_layers=lstm_layers, batch_first=True, bidirectional=bidirectional)
        self.hidden_state = None

    def reset_hidden_state(self):
        self.hidden_state = None

    def forward(self, x):
        output, self.hidden_state = self.Lstm(x, self.hidden_state)
        return output
The data enters the LSTM, and the following code is executed:
def foward_step(model, images, labels, criterion, mode=''):
    model.module.Lstm.reset_hidden_state()
    if mode == 'test':
        with torch.no_grad():
            output = model(images)
    else:
        output = model(images)
    loss = criterion(output, labels)
    # Accuracy calculation
    predicted_labels = output.detach().argmax(dim=1)
    acc = (predicted_labels == labels).cpu().numpy().sum()
    return loss, acc, predicted_labels.cpu()
This is the main code:
model = nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()
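For the warning itself, the usual place to call flatten_parameters() when using DataParallel is at the top of the wrapper's forward, so each replica re-compacts its weight memory before the cuDNN call. A minimal sketch of the Lstm class above with that one line added (nothing else changed):

class Lstm(nn.Module):
    def __init__(self, latent_dim, hidden_size, lstm_layers, bidirectional):
        super(Lstm, self).__init__()
        self.Lstm = nn.LSTM(latent_dim, hidden_size=hidden_size,
                            num_layers=lstm_layers, batch_first=True, bidirectional=bidirectional)
        self.hidden_state = None

    def reset_hidden_state(self):
        self.hidden_state = None

    def forward(self, x):
        # Re-compact the LSTM weights on this replica before the cuDNN call
        self.Lstm.flatten_parameters()
        output, self.hidden_state = self.Lstm(x, self.hidden_state)
        return output

This does not change what the model computes; it only addresses the memory-layout warning.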