I am kind of a beginner in RNNs, so I coded a LSTM architecture using Pytorch, but I always run out of RAM whenever I am in the 3rd epoch. I am already using a DataLoader and I tried to detach the gradient from the input tensor but it doesn't solve the problem out.
This is my training loop
writer = SummaryWriter()
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index = 0)
optimizer = optim.Adam(lstm.parameters(), lr = 1e-5)
gradient_clip = clip_grad_norm_(lstm.parameters(), max_norm = 5)
num_epochs = 20
epoch_loss = -1.0
loss = - 1
t = trange(num_epochs, desc= "Epoch loss", leave=True)
for epoch in t:
trainLoader = iter(DataLoader(dataset, batch_size = batch_size))
tt = trange(len(trainLoader)-1, desc= "Batch loss", leave=True)
for i in tt:
text, embedding = next(trainLoader)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
y = lstm.forward(embedding.transpose(1,0))
labels = text.transpose(0,1)[1:].transpose(0,1).flatten()
loss = criterion(y.reshape(-1, y.shape[-1]), labels)
tt.set_description("Batch loss : %.4f" % loss)
tt.refresh()
loss.backward(retain_graph=True)
optimizer.step()
epoch_loss += loss
epoch_loss = epoch_loss / (len(trainLoader) - 1)
# Saving model
save_date = datetime.now().strftime("%d%m%Y-%H:%M:%S")
PATH = './save/lstm_model_'+save_date
torch.save(lstm, PATH)
# Updating progression bar
t.set_description("Epoch loss : %.4f" % epoch_loss)
t.refresh()
# Plotting gradients histograms in Tensorboard
writer.add_scalar('Text_generation_Loss/train', epoch_loss, epoch)
for tag, parm in lstm.named_parameters():
with torch.no_grad():
writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)
writer.flush()
print('Finished Training')
writer.close()
And this is the LSTM class that I built:
class LSTM(nn.Module):
def __init__(self, in_size : int, hidden_size : int):
super().__init__()
self.in_size = in_size
self.hidden_size = hidden_size
self.W_fi = nn.Linear(in_size,hidden_size)
self.W_fh = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_ii = nn.Linear(in_size,hidden_size)
self.W_ih = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_Ci = nn.Linear(in_size,hidden_size)
self.W_Ch = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_oi = nn.Linear(in_size,hidden_size)
self.W_oh = nn.Linear(hidden_size,hidden_size, bias=False)
self.sigmoid = nn.Sigmoid()
self.tanh = nn.Tanh()
def one_step(self, x, h, C):
f_t = self.sigmoid(self.W_fi(x) + self.W_fh(h))
i_t = self.sigmoid(self.W_ii(x) + self.W_ih(h))
g_t = self.tanh(self.W_Ci(x) + self.W_Ch(h))
C_t = torch.mul(f_t, C) + torch.mul(i_t, g_t)
o_t = self.sigmoid(self.W_oi(x) + self.W_oh(h))
h_t = torch.mul(o_t, self.tanh(C_t))
return h_t, C_t
def forward(self, X):
h_out = []
h = - torch.ones(X.shape[1], self.hidden_size)
C = - torch.ones(X.shape[1], self.hidden_size)
h_t, C_t = self.one_step(X[0], h, C)
h_out.append(h_t)
for i in range(1, X.shape[0] - 1):
h_t, C_t = self.one_step(X[i], h_t, C_t)
h_out.append(h_t)
h_out = torch.cat(h_out)
return h_out #h_out.reshape(-1,batch_size,num_embeddings)
I already searched for a similar case but I wasn't able to find a solution
I don't know if it may help somebody, but I solved the problem. I wasn't perhaps clear about the task, but the goal was to make text generation. The first thing I was doing is embed the sentences using torch.nn.embedding that was defined outside my LSTM. The solution was to include it as a layer of my network, since the embedding is not a pretrained one and should be learned too.
Related
I am currently building an LSTM model in Pytorch to predict the next word of a given input.
My model:
class LSTM(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights):
super().__init__()
self.num_layers = num_layers
self.hidden_dim = hidden_dim
self.embedding_dim = embedding_dim
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
dropout=dropout_rate, batch_first=True)
self.dropout = nn.Dropout(dropout_rate)
self.linear = nn.Linear(hidden_dim, vocab_size)
if tie_weights:
#Embedding and hidden layer need to be same size for weight tieing
assert embedding_dim == hidden_dim, 'cannot tie, check dims'
self.linear.weight = self.embedding.weight
self.init_weights()
def forward(self, x):
# x is a batch of input sequences
x = self.embedding(x)
x, _ = self.lstm(x)
x = self.linear(x)
return x
def init_weights(self):
init_range_emb = 0.1
init_range_other = 1/math.sqrt(self.hidden_dim)
self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
self.linear.weight.data.uniform_(-init_range_other, init_range_other)
self.linear.bias.data.zero_()
for i in range(self.num_layers):
self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
self.hidden_dim).uniform_(-init_range_other, init_range_other)
self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim,
self.hidden_dim).uniform_(-init_range_other, init_range_other)
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 100
num_layers = 2
dropout_rate = 0.4
tie_weights = True
model = LSTM(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights)
model.to(device)
Training and evaluation funciton:
import copy
import time
criterion = nn.CrossEntropyLoss()
lr = 20.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
def train(model: nn.Module) -> None:
model.train() # turn on train mode
total_loss = 0.
log_interval = 200
start_time = time.time()
num_batches = len(train_data) // bptt
for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
data, targets = get_batch(train_data, i)
seq_len = data.size(0)
output = model(data)
loss = criterion(output.view(-1, vocab_size), targets)
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
optimizer.step()
total_loss += loss.item()
if batch % log_interval == 0 and batch > 0:
lr = scheduler.get_last_lr()[0]
ms_per_batch = (time.time() - start_time) * 1000 / log_interval
cur_loss = total_loss / log_interval
ppl = math.exp(cur_loss)
print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
total_loss = 0
start_time = time.time()
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
model.eval() # turn on evaluation mode
total_loss = 0.
with torch.no_grad():
for i in range(0, eval_data.size(0) - 1, bptt):
data, targets = get_batch(eval_data, i)
seq_len = data.size(0)
output = model(data)
output_flat = output.view(-1, vocab_size)
total_loss += seq_len * criterion(output_flat, targets).item()
return total_loss / (len(eval_data) - 1)
Training loop
best_val_loss = float('inf')
epochs = 50
best_model = None
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train(model)
val_loss = evaluate(model, val_data)
val_ppl = math.exp(val_loss)
elapsed = time.time() - epoch_start_time
print('-' * 89)
print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
print('-' * 89)
if val_loss < best_val_loss:
best_val_loss = val_loss
best_model = copy.deepcopy(model)
scheduler.step()
My problem is I have no idea how to go about this. I've seen some implementations of character based LSTM text generators but I'm looking for it to be word based. For example I want to pass an input like "How are you" and the output will included the next predicted word, like for example "How are you today"
Any help appreciated.
I would suggest to try the example in the attached link(https://www.kaggle.com/code/ysthehurricane/next-word-prediction-bi-lstm-tutorial-easy-way).
You can download the dataset from the attached link below.
(https://www.kaggle.com/datasets/dorianlazar/medium-articles-dataset)
It tries to predict the next word using Bi-directional LSTM architecture. I think that this example mostly suits to your needs, which will give you an idea to proceed further.
You can follow the instruction provided in the first link.
I'm trying to reconstruct this paper about hierarchical autoencoder for paragraphs.
The idea is: Break a paragraph into sentences, then encode each sentence using an LSTM, and then using these encoding as an input for another LSTM that encode the entire paragraph.
Then, using a mirror decoder, decode the encoded paragraph using an LSTM into multiple sentences, and then use another LSTM to decode each word, with a linear layer on top and predicts the word.
The objective is to try to predict the original paragraph.
I've done some preprocessing, and right now I save each paragraph as a tensor of (maxSentence,maxWordsPerSentence,VocabSize), using one hot encoding.
My problem is, there model is not learning. The loss stays exactly the same and it doesn't seem as anything is happening.. I wasn't sure on how to calculate the loss (I've ran a batch all together and decoded it into multiple paragraphs, and then calculated the loss against the entire batch predictions, my train function is added below. I don't know if that is the problem (maybe I should calculate loss sentence by sentence instead the entire paragraph?) or maybe I have a problem in my model.
Encoder code:
class Encoder(nn.Module):
def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
super().__init__()
#self.embedding = nn.Embedding(input_dim, emb_dim)
self.rnn_sent = nn.GRU(input_dim, enc_hid_dim, bidirectional = True)
self.rnn_par = nn.GRU(enc_hid_dim*2, dec_hid_dim, bidirectional = True)
def forward(self, src):
outputs, hidden = self.rnn_sent(src[:,0,0])
total_out = outputs.unsqueeze(0).permute(1,0,2)
for i in range(1,src.shape[1]):
for j in range(src.shape[2]):
outputs, hidden = self.rnn_sent(src[:,i,j],hidden)
total_out = torch.cat((total_out,outputs.unsqueeze(0).permute(1,0,2)),dim=1)
outputs_par, hidden_par = self.rnn_par(total_out[:,0])
for i in range(total_out.shape[1]):
outputs_par, hidden_par = self.rnn_par(total_out[:,i],hidden_par)
return outputs_par, hidden_par
Decoder code:
class Decoder(nn.Module):
def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
super().__init__()
self.output_dim = output_dim
self.attention = attention
#self.embedding = nn.Embedding(output_dim, emb_dim)
self.rnn_par = nn.GRU((enc_hid_dim * 2), dec_hid_dim*2)
self.rnn_sen = nn.GRU(output_dim, dec_hid_dim*2)
self.fc_out = nn.Linear(dec_hid_dim*2, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, input, hidden, encoder_outputs):
output, hidden = self.rnn_par(encoder_outputs)
all_par = output.unsqueeze(0).permute(1,0,2)
for i in range(1,max_par_len):
output,hidden = self.rnn_par(output,hidden)
all_par = torch.cat((all_par,output.unsqueeze(0).permute(1,0,2)),dim=1)
for i in range(max_par_len):
output_arg = self.fc_out(all_par[:,i])
#output_argmax = F.one_hot(output_arg.argmax(dim = 1), self.output_dim).to(torch.float)
output_argmax = torch.softmax(output_arg,dim=1)
output_sen, hidden_sen = self.rnn_sen(output_argmax)
all_par_sen = output_argmax.unsqueeze(0).permute(1,0,2)
for j in range(max_sen_len - 1):
output_sen,hidden_sen = self.rnn_sen(output_argmax,hidden_sen)
output_arg = self.fc_out(output_sen)
output_argmax = torch.softmax(output_arg,dim=1)
all_par_sen = torch.cat((all_par_sen,output_argmax.unsqueeze(0).permute(1,0,2)),dim=1)
if i == 0:
all_doc = all_par_sen.unsqueeze(0).permute(1,0,2,3)
else:
all_doc = torch.cat((all_doc,all_par_sen.unsqueeze(0).permute(1,0,2,3)),dim=1)
i+=1
return all_doc ,hidden_sen
And my train function:
def train(model, iterator, optimizer, criterion, clip, epoch):
model.train()
epoch_loss = 0
data = tqdm(iterator)
for i, batch in enumerate(data):
src = batch[0].to(device)#.to(torch.long)#.reshape(batch[0].shape[0],-1)
trg = batch[0].to(device)#.to(torch.long)#.reshape(batch[0].shape[0],-1)
target = torch.argmax(trg,dim=3).view(-1)
print(target)
optimizer.zero_grad()
output = model(src, trg).view(-1,OUTPUT_DIM)
loss = criterion(output, target)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
optimizer.step()
epoch_loss += loss.item()
N_EPOCHS = 20
CLIP = 1
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(ignore_index = vocabulary['<pad>'])
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
start_time = time.time()
train_loader, valid_loader = data_loaders['train_loader'], data_loaders['test_loader']
train_loss = train(model, train_loader, optimizer, criterion, CLIP,f'{epoch+1}/{N_EPOCHS}')
#valid_loss = evaluate(model, valid_loader, criterion)
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
I was following a youtube video and learning to make a chat bot, the teacher explained this step to make the training model, the code compiled perfectly for the teacher but im getting an error. What am i doing wrong?
for epoch in range(num_epochs):
for (words, labels) in train_loader:
words = words.to(device)
labels = labels.to(device, dtype=torch.int64)
outputs= model(words)
loss = criterion(outputs,labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if(epoch +1) % 100 == 0:
print(f'epoch {epoch+1}/{epoch}, loss = {loss.item():.4f}')
print(f'epoch {epoch+1}/{epoch}, loss = {loss.item():.4f}')
NeuralNet:
class NeuralNet(nn.Module):
def __init__(self,input_size, hidden_size,num_classes):
super(NeuralNet,self).__init__()
self.l1 = nn.Linear(input_size,hidden_size)
self.l2 = nn.Linear(hidden_size,hidden_size)
self.l3 = nn.Linear(hidden_size,num_classes)
self.relu = nn.ReLU()
def forward(self,x):
out = self.l1(x)
out = self.relu(out)
out = self.l2(out)
out = self.relu(out)
out = self.l3
return out
The issue is with the NeuralNet code specifically in the line:
out = self.l3
You are setting out to be the Linear layer instead of calling the linear layer on the data. Change it to
out = self.l3(out)
and it will work
I'm just learn pytorch recently.
And I try to write a same model like the paper that I have read for practice.
This is the PDF of the paper I refer.
https://dl.acm.org/doi/pdf/10.1145/3178876.3186066?download=true
Here is the code what I wrote.
class Tem(torch.nn.Module):
def __init__(self, embedding_size, hidden_size):
super(Tem, self).__init()
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.leaf_size = 0
self.xgb_model = None
self.vec_embedding = None
self.multi_hot_Q = None
self.user_embedding = torch.nn.Linear(1, embedding_size)
self.item_embedding = torch.nn.Linear(1, embedding_size)
def pretrain(self, ui_attributes, labels):
print("Start XGBoost Training...")
self.xgb_model = XGBoost(ui_attributes, labels)
self.leaf_size = self.xgb_model.leaf_size
self.vec_embedding = Variable(torch.rand(self.embedding_size, self.leaf_size, requires_grad=True))
self.h = Variable(torch.rand(self.hidden_size, 1, requires_grad=True))
self.att_w = Variable(torch.rand(2 * self.embedding_size, self.hidden_size, requires_grad=True))
self.att_b = Variable(torch.rand(self.leaf_size, self.hidden_size, requires_grad=True))
self.r_1 = Variable(torch.rand(self.embedding_size, 1, requires_grad=True))
self.r_2 = Variable(torch.rand(self.embedding_size, 1, requires_grad=True))
self.bias = Variable(torch.rand(1, 1, requires_grad=True))
def forward(self, ui_ids, ui_attributes):
if self.xgb_model == None:
raise Exception("Please run Tem.pretrain() to pre-train XGBoost model first.")
n_data = len(ui_ids)
att_input = torch.FloatTensor(ui_attributes)
self.multi_hot_Q = torch.FloatTensor(self.xgb_model.multi_hot(att_input)).permute(0,2,1)
vq = self.vec_embedding * self.multi_hot_Q
id_input = torch.FloatTensor(ui_ids)
user_embedded = self.user_embedding(id_input[:,0].reshape(n_data, 1))
item_embedded = self.item_embedding(id_input[:,1].reshape(n_data, 1))
ui = (user_embedded * item_embedded).reshape(n_data, self.embedding_size, 1)
ui_repeat = ui.repeat(1, 1, self.leaf_size)
cross = torch.cat([ui_repeat, vq], dim=1).permute(0,2,1)
re_cross = corss.reshape(cross.shape[0] * cross.shape[1], cross.shape[2])
attention = torch.mm(re_cross, self.att_w)
attention = F.leaky_relu(attention + self.att_b.repeat(n_data, 1))
attention = torch.mm(attention, self.h).reshape(n_data, self.leaf_size)
attention = F.softmax(attention).reshape(n_data, self.leaf_size, 1)
attention = self.vec_embedding.permute(1,0) * attention.repeat(1,1,20)
pool = torch.max(attention, 1).values
y_hat = self.bias.repeat(n_data, 1) + torch.mm(ui.reshape(n_data, self.embedding_size), self.r_1) + torch.mm(pool, self.r_2)
y_hat = F.softmax(torch.nn.Linear(1, 2)(y_hat))
return y_hat
My question is...It seems torch didn't know what tensor should be calculate gradient in backward propagation.
print(tem)
Tem(
(user_embedding): Linear(in_features=1, out_features=20, bias=True)
(item_embedding): Linear(in_features=1, out_features=20, bias=True)
)
I googled this problem, someone says those tensors should use torch.autograd.Variable(), but it didn't solve my problem. And someone says autograd directly supports tensors now. torch.autograd.Variable() is not necessary.
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.Adagrad(tem.parameters(), lr=0.02)
for t in range(20):
prediction = tem(ids_train, att_train)
loss = loss_func(prediction, y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if t % 5 == 0:
print("loss: ", loss)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
Your problem is not related to Variable. As you said, it's not necessary anymore. To compute the gradients of a tensor declared in a model (that extends nn.Module) you need to include them into the model's parameters using the method nn.Parameter(). For example, to include self.h, you can do:
self.h = nn.Parameter(torch.zeros(10,10)
Now, when you call loss.backward() it'll collect the gradient for this variable (of course, loss must be dependent on self.h).
As opposed to native generative models, the input for this vae is a RGB image. Here if I compile the self.combined using add_loss method, the loss goes around 15000 to -22000. Compiling using mse works fine.
def __init__(self,type = 'landmark'):
self.latent_dim = 128
self.input_shape = (128,128,3)
self.batch_size = 1
self.original_dim = self.latent_dim*self.latent_dim
patch = int(self.input_shape[0] / 2**4)
self.disc_patch = (patch, patch, 1)
optimizer = tf.keras.optimizers.Adam(0.0002, 0.5)
pd = patch_discriminator(type)
self.discriminator = pd.discriminator()
self.discriminator.compile(loss = 'binary_crossentropy',optimizer = optimizer)
self.discriminator.trainable = False
vae = VAE(self.latent_dim,type = type)
encoder = vae.inference_net()
decoder = vae.generative_net()
if type == 'image':
self.orig_out = tf.random.normal(shape = (self.batch_size,128,128,3))
else:
self.orig_out = tf.random.normal(shape = (self.batch_size,128,128,1))
vae_input = tf.keras.layers.Input(shape = self.input_shape)
self.encoder_out = encoder(vae_input)
self.decoder_out = decoder(self.encoder_out[2])
self.generator = tf.keras.Model(vae_input,self.decoder_out)
vae_loss = self.compute_loss()
self.generator.add_loss(vae_loss)
self.generator.compile(optimizer = optimizer)
valid = self.discriminator([self.decoder_out,self.decoder_out])
self.combined = tf.keras.Model(vae_input,valid)
self.combined.add_loss(vae_loss)
self.combined.compile(optimizer = optimizer)
# self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)
self.dl = DataLoader()
compute loss computes kl loss for VAE. Initially self.orig_out is set as normal tensor and is updated in training loop below.
def compute_loss(self):
bce = tf.keras.losses.BinaryCrossentropy()
reconstruction_loss = bce(self.decoder_out,self.orig_out)
reconstruction_loss = self.original_dim*reconstruction_loss
z_mean = self.encoder_out[0]
z_log_var = self.encoder_out[1]
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean(reconstruction_loss + kl_loss)
return vae_loss
Training loop:
def train(self,batch_size = 1,epochs = 10):
start_time = datetime.datetime.now()
valid = np.ones((batch_size,) + self.disc_patch)
fake = np.zeros((batch_size,) + self.disc_patch)
threshold = epochs//10
for epoch in range(epochs):
for batch_i,(imA,imB,n_batches) in enumerate(self.dl.load_batch(target='landmark',batch_size=batch_size)):
self.orig_out = tf.convert_to_tensor(imB, dtype=tf.float32)
fakeA = self.generator.predict(imA)
d_real_loss = self.discriminator.train_on_batch([imB,imB],valid)
d_fake_loss = self.discriminator.train_on_batch([imB,fakeA],fake)
d_loss = 0.5*np.add(d_real_loss,d_fake_loss)
combined_loss = self.combined.train_on_batch(imA)
#combined_loss = self.combined.train_on_batch(imA,valid)
elapsed_time = datetime.datetime.now() - start_time
print (f"[Epoch {epoch}/{epochs}] [Batch {batch_i}/{n_batches}] [D loss: {d_loss}] [G loss: {combined_loss}] time: {elapsed_time}")
If I compile self.combined with kl loss using add_loss() method, I am not able to pass outputs during train_on_batch as shown above. Thus the generator won't learn and produces random outputs. How do I compile vae with discriminator using kl loss ?
I don't know if this will be the right answer, but VAE can be modeled using Tensorflow more easily since it deals with custom training loops.
You can follow this link which may contain some relevant information for your problem.