Pytorch variable size input for GRUCell - python

I'm new in PyTorch and I'm trying to train something with variable size inputs. And I want to use nn.GRUCell not nn.GRU. I firstly post padded the inputs, such that it could be of the same length. I want to whether my Model class is correct or not. What I'm doing is, I'm storing the output after every time_step into output_sequence and then only keeping the output only for the trajectory_length point.
class Model(nn.Module):
def __init__(self, cell, length, input_size, hidden_size, num_layers, output_size):
super(Model,self).__init__()
self.cell = cell
self.length = length
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.output_size = output_size
self.rnn = cell(self.input_size, self.hidden_size)
self.linear1 = nn.Linear(self.hidden_size+1, self.hidden_size)
self.linear2 = nn.Linear(self.hidden_size, self.output_size)
def forward(self, state, batch_size, action_masks, trajectory_length):
hidden_layer = torch.zeros(batch_size, self.hidden_size)
state = torch.transpose(state, 0, 1).float()
output_sequence = []
for i in range(self.length):
hidden_layer = self.rnn(state[i], hidden_layer)
output_sequence.append(hidden_layer)
output = torch.stack([output_sequence[l-1][i] for i, l in enumerate(trajectory_length)])
output = self.linear2(output)
for i in range(batch_size):
output[i][~action_masks[i]] = float('-inf')
return output

Related

NLP with PyTorch - RuntimeError: shape '[32, 128, 1]' is invalid for input of size 61440

I'm trying to run this code for the attention model in NLP.
class DecoderAttn(nn.Module):
def __init__(self, output_dim, emb_dim, hid_dim, n_layers, attn_dim):
super().__init__()
self.hid_dim = hid_dim
self.n_layers = n_layers
self.output_dim = output_dim
self.embedding = nn.Embedding(output_dim, emb_dim)
self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True)
# Implement this
# BEGIN
# self.fc_out =
# add attention layer and linear transform layers
# attention layer's elements defined (key, query, value).
self.key = nn.Linear(hid_dim, attn_dim)
self.query = nn.Linear(hid_dim, attn_dim)
self.value = nn.Linear(hid_dim, attn_dim)
self.attention = nn.MultiheadAttention(attn_dim, 128)
self.fc_out = nn.Linear(hid_dim, output_dim)
# END
def forward(self, input, hidden, encoder_outputs):
#input: [batch size]
#hidden: [batch size, hid_dim]
#encoder_outputs: [batch size, src_len, hid_dim]
input = input.unsqueeze(1)
#input: [batch size, 1]
embedded = self.embedding(input)
#embedded: [batch size, 1, emb dim]
output, hidden = self.rnn(embedded, hidden)
# implement this
# BEGIN
# compute v* (attention output)
# compute prediction, using a fully connected layer that takes as input
# both attention output and output from GRU
attention_out, attention_out_w = self.attention(self.query(output), self.key(encoder_outputs), self.value(encoder_outputs))
concat_out = torch.cat((output, attention_out), 2)
prediction = self.fc_out(output.squeeze(1))
# END
#prediction : [batch size, output dim]
return prediction, hidden
and after this part trying to run this:
INPUT_DIM = len(CHARS.vocab)
OUTPUT_DIM = len(PHONEMES.vocab)
ENC_EMB_DIM = 500
DEC_EMB_DIM = 50
HID_DIM = 256
ATTN_DIM = 128
N_LAYERS = 4
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS)
dec = DecoderAttn(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, ATTN_DIM)
model_attn = Seq2Seq(enc, dec, device).to(device)
N_EPOCHS = 10
CLIP = 1
train(model_attn, N_EPOCHS, CLIP)
but it gives the following error:
RuntimeError: shape '[32, 128, 1]' is invalid for input of size
61440
I am also adding the google colab link. It could be easier to investigate this way:
https://colab.research.google.com/drive/1MEemocW8nvebjq17CNnRzvUHdd_wQumP?usp=sharing

Question about a time-series prediction LSTM with attention mechanism

I am working with time-series prediction with a simple LSTM model, I want to improve performance of my model, so I wonder how to add attention mechanism to my model. Here are codes of my model,
class RNN_LSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN_LSTM, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
self.dropout = nn.Dropout(p = drop_rate)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM, out = [batch_size, seq_len, hidden_size]
out, _ = self.lstm(
x, (h0, c0)
)
# out: tensor of shape (batch_size, seq_length, hidden_size)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
# out = out[:,-1,:].reshape(-1,1,144)
return out
I would be greatly thankful if you can provide any useful advise.

Trying to compute the loss of an encoder/decoder model

I am attempting to create an encoder/decoder model with mini-batch. I continue to encounter an errors stating:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [32, 6]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
The traceback reveals something is wrong with the y=self.linear(out) but I am unsure what exactly. Any help would be greatly appreciated. Below is the model. Thank you.
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from sliding_window import sliding_window
from training_datasets import get_training_datasets_batch
torch.autograd.set_detect_anomaly(True)
class Encoder(nn.Module):
def __init__(self, input_size, hidden_size, num_layers=1):
super(Encoder, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
def forward(self, x):
flat = x.view(x.shape[0], x.shape[1], self.input_size)
out,h = self.gru(flat)
return out, h
class Decoder(nn.Module):
def __init__(self, input_size, hidden_size, output_size=6, num_layers=1):
super(Decoder, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.output_size = output_size
self.gru = nn.GRU(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
self.linear = nn.Linear(hidden_size, output_size)
self.ReLU = nn.ReLU()
def forward(self, x, h):
x = x.unsqueeze(1)
out, h = self.gru(x, h)
out = out.squeeze(1)
print(out.shape)
y = self.linear(out)
print(y.shape)
y = self.ReLU(y)
return y,h
class EncoderDecoder(nn.Module):
def __init__(self, hidden_size, input_size=6, output_size=6):
super(EncoderDecoder, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
self.decoder = Decoder(input_size=input_size, hidden_size=hidden_size, output_size=output_size)
def train_model(self, ts, epochs, target_len, features, batch_size=64, test_len=288, method = 'teacher_forcing', tfr = 0.5, lr = 0.01, dynamic_tf=False):
X,Y= sliding_window(ts, features=288, target_len=target_len)
x_train, x_val, x_test, y_train, y_val, y_test = get_training_datasets_batch(X,Y, features, test_len=test_len, batch_size=batch_size)
losses = np.full(epochs,np.nan)
optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, self.parameters()),
lr=lr)
criterion = nn.MSELoss()
for e in range(epochs):
print('Starting epoch {}'.format(e))
x_train_data = iter(x_train)
y_train_data = iter(y_train)
x_val_data = iter(x_val)
y_val_data = iter(y_val)
x_train_shape = list(x_train)[0].shape
# predicted = torch.zeros(target_len,batch_size,x_train_shape[2])
# print(predicted.shape)
loss=0
for x_train_in in x_train_data:
optimizer.zero_grad()
x_train_in = Variable(x_train_in)
y_train_in = Variable(next(y_train_data).transpose(0,1))
_, enc_h = self.encoder(x_train_in)
dec_in = x_train_in[:,-1,:]
dec_h = enc_h
if method == 'recursive':
for t in range(target_len):
dec_out, dec_h = self.decoder(dec_in, dec_h)
predicted = dec_out
dec_in = dec_out
loss += criterion(predicted,y_train_in[t])
loss.backward(retain_graph=True)
optimizer.step()
The problem in this case was the loss.backward(retain_graph=True). The code started working after adding the line loss=0. The loss value continues to increase and needs to be reset.
loss.backward()
optimizer.step()
loss=0

PyTorch LSTM categorical model - output to target mapping

I have a network which outputs a vector of length two. My targets are in the form of 1 or zeros, referring to two possible categories. What is the best way to get the loss - i.e. should I transform the targets, for example into a dimension 2 vector, or should I transform the output of the network, e.g. take the location of the max number as the output?
My network looks like:
class LSTMClassifier(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
super().__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
self.fc1 = nn.Linear(hidden_dim, 32)
self.fc2 = nn.Linear(32, 1)
self.dropout = nn.Dropout(p=0.2)
self.batch_normalisation1 = nn.BatchNorm1d(layer_dim)
self.batch_normalisation2 = nn.BatchNorm1d(2)
self.activation = nn.Softmax(dim=2)
def forward(self, x):
h0, c0 = self.init_hidden(x)
out, (hn1, cn1) = self.lstm1(x, (h0, c0))
out = self.dropout(out,)
out = self.batch_normalisation1(out)
h1, c1 = self.init_hidden(out)
out, (hn2, cn2) = self.lstm2(out, (h1, c1))
out = self.dropout(out)
out = self.batch_normalisation1(out)
h2, c2 = self.init_hidden(out)
out, (hn3, cn3) = self.lstm2(out, (h2, c2))
out = self.dropout(out)
out = self.batch_normalisation1(out)
out = self.fc1(out[:, -1, :])
out = self.dropout(out)
out = self.fc2(out)
return out
def init_hidden(self, x):
h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
return [t for t in (h0, c0)]
def pred(self, x):
out = self(x)
return out > 0
An example of input to this network is:
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
[2.3597e-04, 1.1507e-02, 8.7719e-02, 6.1093e-02, 9.5556e-01],
[2.1474e-03, 5.3805e-03, 9.6491e-02, 2.2508e-01, 8.2222e-01]]])
which has shape torch.Size([1, 3, 5]). The target is currently 1 or 0. However, the network outputs a vector such as:
tensor([[0.5293, 0.4707]], grad_fn=<SoftmaxBackward>)
What would be the best way to set up the loss between these target and the network output?
Update:
I can now train the model as suggested in the answers as:
model = LSTMClassifier(5, 128, 3, 1)
Epochs = 10
batch_size = 32
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-6)
for epoch in range(Epochs):
if epoch == 0:
accurate = 0
for X_instance, y_instance in zip(val_x, val_y):
if int(y_instance) == 1 and model.pred(X_instance.view(-1, 3, 5)).item():
accurate += 1
print(f"Untrained accuracy test set: {accurate/len(val_x)}")
print(f"Epoch {epoch + 1}")
for n, (X, y) in enumerate(train_batches):
model.train()
optimizer.zero_grad()
y_pred = model(X)
loss = criterion(y_pred, y)
loss.backward()
optimizer.step()
model.eval()
accurate = 0
for X_instance, y_instance in zip(val_x, val_y):
if int(y_instance) == 1 and model.pred(X_instance.view(-1, 3, 5)).item():
accurate += 1
print(f"Accuracy test set: {accurate/len(val_x)}")
You shouldn't use any activation at the end of your network and output only a single neuron instead of two (trained with BCEWithLogitsLoss).
Below is your neural network code with commentary and removal of unnecessary parts:
class LSTMClassifier(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
super().__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
self.fc1 = nn.Linear(hidden_dim, 32)
# Output 1 neuron instead of two
self.fc2 = nn.Linear(32, 1)
# Model should not depend on batch size
# self.batch_size = None
# You are not using this variable
# self.hidden = None
self.dropout = nn.Dropout(p=0.2)
self.batch_normalisation1 = nn.BatchNorm1d(layer_dim)
self.batch_normalisation2 = nn.BatchNorm1d(2)
def forward(self, x):
# Hidden are initialized with 0 explicitly
# h0, c0 = self.init_hidden(x)
out, _ = self.lstm1(x)
# No need for initial values
# out, (hn1, cn1) = self.lstm1(x, (h0, c0))
out = self.dropout(out)
out = self.batch_normalisation1(out)
# Same for all other cells you re-init with zeros, it's implicit
out, _ = self.lstm2(out)
out = self.dropout(out)
out = self.batch_normalisation1(out)
out, _ = self.lstm2(out)
out = self.dropout(out)
out = self.batch_normalisation1(out)
out = self.fc1(out[:, -1, :])
out = self.dropout(out)
# No need for activation
# out = F.softmax(self.fc2(out))
out = self.fc2(out)
return out
# Return True (1) or False (0)
def pred(self, x):
return self(x) > 0
I have also added pred method which transforms logits into targets (e.g. to use with some metrics).
Basically, if your logit is lower than 0 it is False, otherwise it is True. No need for activation in this case.

pytorch, Using nn.DataParallel in LSTM

/pytorch/aten/src/ATen/native/cudnn/RNN.cpp:1266: UserWarning: RNN module weights are not part of single contiguous chunk of memory.
This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters().
Hello. I am using pytorch.
I am trying to use DataParallel function in pytorch,
but the model is LSTM. I'm warned to flatten the model again,
but I don't know when and where to flatten.
Can you let me know?
This is my model
import torch.nn as nn
from torchvision import models
class ConvLstm(nn.Module):
def __init__(self, latent_dim, model, hidden_size, lstm_layers, bidirectional, n_class):
super(ConvLstm, self).__init__()
self.conv_model = Pretrained_conv(latent_dim, model)
self.Lstm = Lstm(latent_dim, hidden_size, lstm_layers, bidirectional)
self.output_layer = nn.Sequential(
nn.Linear(2 * hidden_size if bidirectional ==
True else hidden_size, n_class),
nn.Softmax(dim=-1)
)
def forward(self, x):
batch_size, timesteps, channel_x, h_x, w_x = x.shape
conv_input = x.view(batch_size * timesteps, channel_x, h_x, w_x)
conv_output = self.conv_model(conv_input)
lstm_input = conv_output.view(batch_size, timesteps, -1)
lstm_output = self.Lstm(lstm_input)
lstm_output = lstm_output[:, -1, :]
output = self.output_layer(lstm_output)
return output
class Pretrained_conv(nn.Module):
def __init__(self, latent_dim, model):
if model == 'resnet152':
super(Pretrained_conv, self).__init__()
self.conv_model = models.resnet152(pretrained=True)
# ====== freezing all of the layers ======
for param in self.conv_model.parameters():
param.requires_grad = False
# ====== changing the last FC layer to an output with the size we need. this layer is un freezed ======
self.conv_model.fc = nn.Linear(
self.conv_model.fc.in_features, latent_dim)
def forward(self, x):
return self.conv_model(x)
class Lstm(nn.Module):
def __init__(self, latent_dim, hidden_size, lstm_layers, bidirectional):
super(Lstm, self).__init__()
self.Lstm = nn.LSTM(latent_dim, hidden_size=hidden_size,
num_layers=lstm_layers, batch_first=True, bidirectional=bidirectional)
self.hidden_state = None
def reset_hidden_state(self):
self.hidden_state = None
def forward(self, x):
output, self.hidden_state = self.Lstm(x, self.hidden_state)
return output
Enter LSTM and execute the following code.
def foward_step(model, images, labels, criterion, mode=''):
model.module.Lstm.reset_hidden_state()
if mode == 'test':
with torch.no_grad():
output = model(images)
else:
output = model(images)
loss = criterion(output, labels)
# Accuracy calculation
predicted_labels = output.detach().argmax(dim=1)
acc = (predicted_labels == labels).cpu().numpy().sum()
return loss, acc, predicted_labels.cpu()
This is main
model = nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()

Categories

Resources