/pytorch/aten/src/ATen/native/cudnn/RNN.cpp:1266: UserWarning: RNN module weights are not part of single contiguous chunk of memory.
This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters().
Hello. I am using PyTorch.
I am trying to use nn.DataParallel in PyTorch, but the model is an LSTM.
I'm warned to flatten the weights again, but I don't know when and where to call flatten_parameters().
Can you let me know?
This is my model:
import torch.nn as nn
from torchvision import models

class ConvLstm(nn.Module):
    def __init__(self, latent_dim, model, hidden_size, lstm_layers, bidirectional, n_class):
        super(ConvLstm, self).__init__()
        self.conv_model = Pretrained_conv(latent_dim, model)
        self.Lstm = Lstm(latent_dim, hidden_size, lstm_layers, bidirectional)
        self.output_layer = nn.Sequential(
            nn.Linear(2 * hidden_size if bidirectional else hidden_size, n_class),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        batch_size, timesteps, channel_x, h_x, w_x = x.shape
        conv_input = x.view(batch_size * timesteps, channel_x, h_x, w_x)
        conv_output = self.conv_model(conv_input)
        lstm_input = conv_output.view(batch_size, timesteps, -1)
        lstm_output = self.Lstm(lstm_input)
        lstm_output = lstm_output[:, -1, :]
        output = self.output_layer(lstm_output)
        return output

class Pretrained_conv(nn.Module):
    def __init__(self, latent_dim, model):
        super(Pretrained_conv, self).__init__()
        if model == 'resnet152':
            self.conv_model = models.resnet152(pretrained=True)
            # ====== freezing all of the layers ======
            for param in self.conv_model.parameters():
                param.requires_grad = False
            # ====== changing the last FC layer to an output with the size we need; this layer is unfrozen ======
            self.conv_model.fc = nn.Linear(self.conv_model.fc.in_features, latent_dim)

    def forward(self, x):
        return self.conv_model(x)

class Lstm(nn.Module):
    def __init__(self, latent_dim, hidden_size, lstm_layers, bidirectional):
        super(Lstm, self).__init__()
        self.Lstm = nn.LSTM(latent_dim, hidden_size=hidden_size,
                            num_layers=lstm_layers, batch_first=True, bidirectional=bidirectional)
        self.hidden_state = None

    def reset_hidden_state(self):
        self.hidden_state = None

    def forward(self, x):
        output, self.hidden_state = self.Lstm(x, self.hidden_state)
        return output
The model is then run with the following forward step:
def forward_step(model, images, labels, criterion, mode=''):
    model.module.Lstm.reset_hidden_state()
    if mode == 'test':
        with torch.no_grad():
            output = model(images)
    else:
        output = model(images)
    loss = criterion(output, labels)
    # Accuracy calculation
    predicted_labels = output.detach().argmax(dim=1)
    acc = (predicted_labels == labels).cpu().numpy().sum()
    return loss, acc, predicted_labels.cpu()
This is the relevant line in main:
model = nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()
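A hedged sketch of one common way to address the warning (not taken from the original post): call flatten_parameters() on the nn.LSTM at the start of its forward, so the weights are re-compacted on every replica after nn.DataParallel scatters the module. The names follow the Lstm class above.

def forward(self, x):
    # Re-compact the LSTM weights into one contiguous chunk before the cuDNN call;
    # this is what the warning suggests when the module is used under nn.DataParallel.
    self.Lstm.flatten_parameters()
    output, self.hidden_state = self.Lstm(x, self.hidden_state)
    return output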
Related
I am working on time-series prediction with a simple LSTM model and want to improve its performance, so I am wondering how to add an attention mechanism to my model. Here is the code of my model:
class RNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # sequence_length, drop_rate, and device are defined elsewhere in the script
        self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
        self.dropout = nn.Dropout(p=drop_rate)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        # Forward propagate LSTM, out = [batch_size, seq_len, hidden_size]
        out, _ = self.lstm(x, (h0, c0))
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        out = out.reshape(out.shape[0], -1)
        # Decode the hidden state of the last time step
        out = self.fc(out)
        # out = out[:,-1,:].reshape(-1,1,144)
        return out
I would be very thankful for any useful advice.
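Not an answer from the original thread, but a hedged sketch of one common approach: additive attention over the LSTM outputs, pooling the sequence into a single context vector before the final linear layer. The class and parameter names below are illustrative assumptions, not from the post.

import torch
import torch.nn as nn

class LSTMWithAttention(nn.Module):
    # Minimal sketch: LSTM followed by additive attention pooling over time.
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.attn_score = nn.Linear(hidden_size, 1)  # one score per time step
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out, _ = self.lstm(x)                      # (batch, seq_len, hidden_size)
        scores = self.attn_score(torch.tanh(out))  # (batch, seq_len, 1)
        weights = torch.softmax(scores, dim=1)     # attention weights over time
        context = (weights * out).sum(dim=1)       # (batch, hidden_size)
        return self.fc(context)                    # (batch, num_classes)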
I am attempting to create an encoder/decoder model trained with mini-batches. I keep encountering an error stating:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [32, 6]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
The traceback points to y = self.linear(out), but I am unsure what exactly is wrong. Any help would be greatly appreciated. Below is the model. Thank you.
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from sliding_window import sliding_window
from training_datasets import get_training_datasets_batch

torch.autograd.set_detect_anomaly(True)

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

    def forward(self, x):
        flat = x.view(x.shape[0], x.shape[1], self.input_size)
        out, h = self.gru(flat)
        return out, h

class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=6, num_layers=1):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.ReLU = nn.ReLU()

    def forward(self, x, h):
        x = x.unsqueeze(1)
        out, h = self.gru(x, h)
        out = out.squeeze(1)
        print(out.shape)
        y = self.linear(out)
        print(y.shape)
        y = self.ReLU(y)
        return y, h

class EncoderDecoder(nn.Module):
    def __init__(self, hidden_size, input_size=6, output_size=6):
        super(EncoderDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
        self.decoder = Decoder(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

    def train_model(self, ts, epochs, target_len, features, batch_size=64, test_len=288,
                    method='teacher_forcing', tfr=0.5, lr=0.01, dynamic_tf=False):
        X, Y = sliding_window(ts, features=288, target_len=target_len)
        x_train, x_val, x_test, y_train, y_val, y_test = get_training_datasets_batch(
            X, Y, features, test_len=test_len, batch_size=batch_size)
        losses = np.full(epochs, np.nan)
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, self.parameters()), lr=lr)
        criterion = nn.MSELoss()
        for e in range(epochs):
            print('Starting epoch {}'.format(e))
            x_train_data = iter(x_train)
            y_train_data = iter(y_train)
            x_val_data = iter(x_val)
            y_val_data = iter(y_val)
            x_train_shape = list(x_train)[0].shape
            # predicted = torch.zeros(target_len, batch_size, x_train_shape[2])
            # print(predicted.shape)
            loss = 0
            for x_train_in in x_train_data:
                optimizer.zero_grad()
                x_train_in = Variable(x_train_in)
                y_train_in = Variable(next(y_train_data).transpose(0, 1))
                _, enc_h = self.encoder(x_train_in)
                dec_in = x_train_in[:, -1, :]
                dec_h = enc_h
                if method == 'recursive':
                    for t in range(target_len):
                        dec_out, dec_h = self.decoder(dec_in, dec_h)
                        predicted = dec_out
                        dec_in = dec_out
                        loss += criterion(predicted, y_train_in[t])
                loss.backward(retain_graph=True)
                optimizer.step()
The problem in this case was loss.backward(retain_graph=True) together with the loss accumulating across batches. The code started working after resetting the loss after each optimizer step; otherwise the loss value keeps growing and needs to be reset:
loss.backward()
optimizer.step()
loss=0
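For clarity, a hedged sketch of how the inner loop looks with that change, reusing the names from the code above (not a verbatim fix from the thread):

for x_train_in in x_train_data:
    optimizer.zero_grad()
    loss = 0                                   # fresh accumulator for each mini-batch
    x_train_in = Variable(x_train_in)
    y_train_in = Variable(next(y_train_data).transpose(0, 1))
    _, enc_h = self.encoder(x_train_in)
    dec_in = x_train_in[:, -1, :]
    dec_h = enc_h
    if method == 'recursive':
        for t in range(target_len):
            dec_out, dec_h = self.decoder(dec_in, dec_h)
            dec_in = dec_out
            loss = loss + criterion(dec_out, y_train_in[t])
    loss.backward()                            # no retain_graph needed once the loss is per-batch
    optimizer.step()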
I am using TensorFlow 2.0 and trying to compute gradients for backpropagation through a simple feedforward neural network. Here is what my model looks like:
def __init__(self, input_size, output_size):
    inputs = tf.keras.Input(shape=(input_size,))
    hidden_layer1 = tf.keras.layers.Dense(30, activation='relu')(inputs)
    outputs = tf.keras.layers.Dense(output_size)(hidden_layer1)
    self.model = tf.keras.Model(inputs=inputs, outputs=outputs)
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    self.loss_function = tf.keras.losses.Huber()
The forward pass through this network is fine, but when I use a gradient tape to train the model, it is at least 10x slower than PyTorch.
Training function:
def learn_modified_x(self, inputs, targets, actions):
    with tf.GradientTape() as tape:
        predictions = self.model(inputs)
        predictions_for_action = gather_single_along_axis(predictions, actions)
        loss = self.loss_function(targets, predictions_for_action)
    grads = tape.gradient(loss, self.model.trainable_weights)
    self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
I tried commenting out lines to find what is actually causing the slowdown and discovered that tape.gradient is the main contributor.
Any ideas?
PyTorch implementation
def __init__(self, input_size, nb_action):
    super(Network, self).__init__()
    self.input_size = input_size
    self.nb_action = nb_action
    self.fc1 = nn.Linear(input_size, 30)
    self.fc2 = nn.Linear(30, nb_action)

def forward(self, state):
    x = F.relu(self.fc1(state))
    q_values = self.fc2(x)
    return q_values

def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
    outputs = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
    next_outputs = self.model(batch_next_state).detach().max(1)[0]
    target = self.gamma * next_outputs + batch_reward
    td_loss = F.smooth_l1_loss(outputs, target)
    self.optimizer.zero_grad()
    td_loss.backward(retain_variables=True)
    self.optimizer.step()
You need to use tf.function to wrap your model's call function:

def __init__(self,...):
    ...
    self.model.call = tf.function(self.model.call)
    ...
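As a hedged extension of that answer (an assumption, not from the thread): the whole training step can also be decorated with tf.function, so that the tape, the gradient computation, and apply_gradients are traced into one graph. The class name, the tf.gather call, and the hyperparameters below are illustrative stand-ins for the question's code (its gather_single_along_axis helper is not shown in the post).

import tensorflow as tf

class Agent:
    def __init__(self, input_size, output_size):
        inputs = tf.keras.Input(shape=(input_size,))
        hidden = tf.keras.layers.Dense(30, activation='relu')(inputs)
        outputs = tf.keras.layers.Dense(output_size)(hidden)
        self.model = tf.keras.Model(inputs=inputs, outputs=outputs)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.loss_function = tf.keras.losses.Huber()

    @tf.function  # traces the step into a graph, avoiding per-call eager overhead
    def train_step(self, inputs, targets, actions):
        with tf.GradientTape() as tape:
            predictions = self.model(inputs, training=True)
            # pick the prediction for the action taken in each row (integer indices)
            predictions_for_action = tf.gather(predictions, actions, batch_dims=1)
            loss = self.loss_function(targets, predictions_for_action)
        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return loss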
I'm trying to implement a program that compares LSTM performance vs. GRU performance for word prediction. I am using the same parameters for both of them; however, while I am getting good perplexity values for the LSTM, the GRU values I'm getting are absolutely terrible.
I recently attempted to debug the training function, since it originally only ran for the LSTM model and not for the GRU model. As I already said, both models should get similar values; however, for now the LSTM model starts at around ~150 perplexity and converges to a normal value, while the GRU model starts at some random value in the 1000s and does not converge at all.
I am quite new to all the RNN, LSTM, and GRU stuff, so forgive me if there's something obvious that I am missing.
Any help would be appreciated!
I use the following two models:
class LSTM_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(LSTM_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        x = self.embed(x)
        out, (hidden_state, cell_state) = self.lstm(x, hidden_state)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out)
        return out, (hidden_state, cell_state)

class GRU_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0):
        super(GRU_Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        x = self.embed(x)
        out, hidden_state = self.gru(x, hidden_state)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # Reshape output to (batch_size*sequence_length, hidden_size)
        out = self.fc(out)
        return out, hidden_state
Training function:
def run_model(model, epochs=epochs, learning_rate=learning_rate, clip=clip, momentum=momentum, LSTM=True, GRU=False, Dropout=False):
    # Define loss criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=decay_rate)
    train_perplexity, test_perplexity, valid_perplexity = [], [], []
    # Train the model
    for e in range(epochs):
        # Set all initial hidden and cell states to zeroes
        train_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        test_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        valid_states = init_states(LSTM, GRU, num_layers, batch_size, hidden_size)
        # RUN TRAINING SET #
        model.train()
        for i in range(0, ids.size(1) - seq_length, seq_length):
            # Set train_inputs and train_targets
            train_inputs = ids[:, i:i+seq_length].to(device)
            train_targets = ids[:, (i+1):(i+1)+seq_length].to(device)
            # Forward pass
            model.zero_grad()
            if LSTM == True:
                train_states = [state.detach() for state in train_states]  # Detach the hidden state from how it was previously produced
            if GRU == True:
                train_states = train_states.data  # detach?
            train_outputs, train_states = model(train_inputs, train_states)
            train_loss = criterion(train_outputs, train_targets.reshape(-1))
            # Backward and optimize
            train_loss.backward()
            clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
        lr_scheduler.step()
        model.eval()
        with torch.no_grad():
            pass  # test and validation loops removed to reduce length
        model.train()  # reset to train mode after iterating through validation data
        train_perplexity.append(math.exp(train_loss.item()))
        test_perplexity.append(np.exp(np.mean(test_losses)))
        valid_perplexity.append(np.exp(np.mean(valid_losses)))
        print('Epoch ' + str(e+1) + '/' + str(epochs) + ': ')
        print('Train Perplexity - ' + str(train_perplexity[e]))
        print('Test Perplexity - ' + str(test_perplexity[e]))
        print('Validation Perplexity - ' + str(valid_perplexity[e]))
        print("----------------------------------------------------")
    return train_perplexity, test_perplexity, valid_perplexity
Hidden state initialization:
def init_states(LSTM, GRU, num_layers=num_layers, batch_size=batch_size, hidden_size=hidden_size):
    # r1 and r2 are defined elsewhere in the script
    if LSTM == True:
        return (torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device),
                torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device))
    if GRU == True:
        return torch.FloatTensor(num_layers, batch_size, hidden_size).uniform_(r1, r2).to(device)
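Not from the thread, just a hedged side note on the "# detach?" comment in the training loop above: .data and .detach() both cut the graph, and one symmetric way to truncate backprop-through-time for both models is a small helper like the sketch below (an assumption, not a claimed fix for the perplexity gap).

def detach_states(states):
    # LSTM states come as a (hidden_state, cell_state) tuple, GRU as a single tensor.
    if isinstance(states, tuple):
        return tuple(s.detach() for s in states)
    return states.detach()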
I'm new to PyTorch.
Below are my two models, one with a plain LSTM and one with a bidirectional LSTM. Can anybody tell me what the PyTorch equivalents of these two different models are?
I tried some torch code but it did not work. Since this code reaches a suitable accuracy in Keras, I want the exact same model in torch, and unfortunately I can't find it :(
First one:
def lstm_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')
    embedding = layers.Embedding(
        mask_zero=True,
        input_dim=vocab_size,
        output_dim=embedding_size,
        weights=[w2v_weights],
        trainable=True
    )
    lstm_1 = layers.LSTM(units=80, return_sequences=True)
    lstm_2 = layers.LSTM(units=80, return_sequences=False)
    emb_title = embedding(title)
    print("question embedding shape", emb_title.shape)
    sum_a = lstm_2(lstm_1(emb_title))
    print("q_output shape", sum_a.shape)
    emb_body = embedding(body)
    print("answer embedding shape", emb_body.shape)
    sum_b = lstm_2(lstm_1(emb_body))
    print("a_output shape", sum_a.shape)
    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
    print("qa_similarity shape", sim.shape)
    # sim = layers.Activation(activation='sigmoid')(sim)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='mean_squared_error', optimizer='nadam', metrics=['accuracy'])
    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model
Second one:
def bilstm_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')
    embedding = layers.Embedding(
        mask_zero=True,
        input_dim=vocab_size,
        output_dim=embedding_size,
        weights=[w2v_weights],
        trainable=True
    )
    lstm_1 = layers.Bidirectional(LSTM(activation='tanh', dropout=0.2, units=100, return_sequences=True))
    lstm_2 = layers.Bidirectional(LSTM(activation='tanh', dropout=0.2, units=100, return_sequences=False))
    sum_a = lstm_2(lstm_1(embedding(title)))
    sum_b = lstm_2(lstm_1(embedding(body)))
    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
    # sim = layers.Activation(activation='sigmoid')(sim)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model
I've been looking for the right answer for weeks :(
Here is a possible conversion of your first lstm_model to PyTorch.
Usually, you create a class for your networks in PyTorch,
so I'll implement the LSTM as a class.
import torch
from torch import nn
import torch.nn.functional as F

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, dropout, embedding_size):
        super(LSTMModel, self).__init__()
        # nn.Embedding takes the vocabulary size first and the embedding size second
        self.encoder = nn.Embedding(num_embeddings=vocab_size,
                                    embedding_dim=embedding_size)
        self.rnn = nn.LSTM(embedding_size,
                           hidden_size,
                           num_layers,
                           dropout=dropout)
        self.decoder = nn.Linear(in_features=hidden_size,
                                 out_features=vocab_size)
        self.init_weights()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def init_weights(self):
        init_range = 0.1
        nn.init.uniform_(self.encoder.weight, -init_range, init_range)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -init_range, init_range)

    def forward(self, input_, hidden_):
        embedded = self.encoder(input_)
        output, hidden_ = self.rnn(embedded, hidden_)
        # decode the full sequence of LSTM outputs, not the (h, c) tuple
        decoded = self.decoder(output)
        return F.log_softmax(input=decoded, dim=-1), hidden_

    def init_hidden(self, batch_size):
        weight = next(self.parameters())
        state_size = (self.num_layers, batch_size, self.hidden_size)
        return (weight.new_zeros(state_size), weight.new_zeros(state_size))
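A hedged usage sketch of the class above; the hyperparameter values and tensor shapes are placeholders for illustration, not taken from the answer.

import torch

vocab_size, hidden_size, num_layers, dropout, embedding_size = 1000, 80, 2, 0.2, 128
model = LSTMModel(vocab_size, hidden_size, num_layers, dropout, embedding_size)
hidden = model.init_hidden(batch_size=4)
tokens = torch.randint(0, vocab_size, (35, 4))  # (seq_len, batch), since batch_first is not set
log_probs, hidden = model(tokens, hidden)
print(log_probs.shape)  # torch.Size([35, 4, 1000])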
Now, if you use the network above directly, you may still encounter some problems; in that case, you need to adjust the values (layer sizes, dropout, and so on) to fit your data.
class Model(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=kwargs["vocab_size"],
                                       embedding_dim=kwargs["embedding_dim"],
                                       padding_idx=kwargs["pad_idx"])
        self.embeddings.weight.requires_grad = True  # fine-tune the embeddings (trainable=True in Keras)
        if kwargs["model"] == "lstm":
            self.lstm = nn.LSTM(input_size=kwargs["embedding_dim"],  # input
                                hidden_size=kwargs["lstm_units"],    # output
                                num_layers=kwargs["lstm_layers"],
                                bidirectional=False,
                                batch_first=True)
        if kwargs["model"] == "BiLSTM":
            self.lstm = nn.LSTM(input_size=kwargs["embedding_dim"],  # input
                                hidden_size=kwargs["bilstm_units"],  # output
                                num_layers=kwargs["bilstm_layers"],
                                bidirectional=True,
                                batch_first=True)
        self.dropout = nn.Dropout(kwargs["dropout"])
        self.tanh = torch.tanh

    def forward(self):
        pass

class LSTM_Model(Model):
    """
    a class to define multiple models
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, question, answer):
        question_embedding = self.embeddings(question)
        # print("question embedding shape:", question_embedding.shape)
        answer_embedding = self.embeddings(answer)
        # print("answer embedding shape:", answer_embedding.shape)
        q_output, (qhidden, qcell) = self.lstm(question_embedding)
        print("q_output shape:", q_output.shape)
        # print("qhidden shape:", qhidden.shape)
        # print("qcell shape:", qcell.shape)
        a_output, (ahidden, acell) = self.lstm(answer_embedding)
        print("a_output shape:", a_output.shape)
        # print("ahidden shape:", ahidden.shape)
        # print("acell shape:", acell.shape)
        # qa_similary = torch.mm(qhidden[-1], ahidden[-1])
        # qa_similary = torch.matmul(qhidden[-1], torch.t(ahidden[-1]))
        # with batch_first=True, the last time step is indexed on dim 1
        q_output = q_output[:, -1, :]
        a_output = a_output[:, -1, :]
        mm = torch.mul(q_output, a_output)
        mm = mm - mm.min(1, keepdim=True)[0]
        mm = mm / mm.max(1, keepdim=True)[0]
        qa_similary = torch.mean(mm, dim=1)
        # print("qa_similary shape:", qa_similary.shape)
        return qa_similary, qhidden
print("**************************MODEL DEFINE & CREATED!****************************")
Is this a true and completely exact implementation of that Keras code for the two-layer LSTM?