CTC: blank must be in label range - python

summary
I'm adding alphabets to captcha recognition, but pytorch's CTC seems to not working properly when alphabets are added.
What I've tried
At first, I modified BLANK_LABEL to 62 since there are 62 labels(0-9, a-z, A-Z), but it gives me runtime error blank must be in label range. I also tried BLANK_LABEL=0 and then assigning 1~63 as nonblank labels but it outputs NaN as loss.
The code
This is the colab link for the current version of my code: here
below are just core parts of the code.
Constants:
DATASET_PATH = "/home/ik1ne/Downloads/numbers"
MODEL_PATH = "/home/ik1ne/Downloads"
BATCH_SIZE = 50
TRAIN_BATCHES = 180
TEST_BATCHES = 20
TOTAL_BATCHES = TRAIN_BATCHES+TEST_BATCHES
TOTAL_DATASET = BATCH_SIZE*TOTAL_BATCHES
BLANK_LABEL = 63
dataset generation:
!pip install captcha
from captcha.image import ImageCaptcha
import itertools
import os
import random
import string
if not os.path.exists(DATASET_PATH):
os.makedirs(DATASET_PATH)
characters = "0123456789"+string.ascii_lowercase + string.ascii_uppercase
while(len(list(Path(DATASET_PATH).glob('*'))) < TOTAL_BATCHES):
captcha_str = "".join(random.choice(characters) for x in range(6))
if captcha_str in list(Path(DATASET_PATH).glob('*')):
continue
ImageCaptcha().write(captcha_str, f"{DATASET_PATH}/{captcha_str}.png")
dataset:
def convert_strseq_to_numseq(s):
for c in s:
if c >= '0' and c <= '9':
return int(c)
elif c>='a' and c <='z':
return ord(c)-ord('a')+10
else:
return ord(c)-ord('A')+36
class CaptchaDataset(Dataset):
"""CAPTCHA dataset."""
def __init__(self, root_dir, transform=None):
self.root_dir = root_dir
self.image_paths = list(Path(root_dir).glob('*'))
self.transform = transform
def __getitem__(self, index):
image = Image.open(self.image_paths[index])
if self.transform:
image = self.transform(image)
label_sequence = [convert_strseq_to_numseq(c) for c in self.image_paths[index].stem]
return (image, torch.tensor(label_sequence))
def __len__(self):
return len(self.image_paths)
model:
class StackedLSTM(nn.Module):
def __init__(self, input_size=60, output_size=11, hidden_size=512, num_layers=2):
super(StackedLSTM, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = nn.Dropout()
self.fc = nn.Linear(hidden_size, output_size)
self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
def forward(self, inputs, hidden):
batch_size, seq_len, input_size = inputs.shape
outputs, hidden = self.lstm(inputs, hidden)
outputs = self.dropout(outputs)
outputs = torch.stack([self.fc(outputs[i]) for i in range(width)])
outputs = F.log_softmax(outputs, dim=2)
return outputs, hidden
def init_hidden(self, batch_size):
weight = next(self.parameters()).data
return (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
net = StackedLSTM().to(device)
training:
net.train() # set network to training phase
epochs = 30
# for each pass of the training dataset
for epoch in range(epochs):
train_loss, train_correct, train_total = 0, 0, 0
h = net.init_hidden(BATCH_SIZE)
# for each batch of training examples
for batch_index, (inputs, targets) in enumerate(train_dataloader):
inputs, targets = inputs.to(device), targets.to(device)
h = tuple([each.data for each in h])
BATCH_SIZE, channels, height, width = inputs.shape
# reshape inputs: NxCxHxW -> WxNx(HxC)
inputs = (inputs
.permute(3, 0, 2, 1)
.contiguous()
.view((width, BATCH_SIZE, -1)))
optimizer.zero_grad() # zero the parameter gradients
outputs, h = net(inputs, h) # forward pass
# compare output with ground truth
input_lengths = torch.IntTensor(BATCH_SIZE).fill_(width)
target_lengths = torch.IntTensor([len(t) for t in targets])
loss = criterion(outputs, targets, input_lengths, target_lengths)
loss.backward() # backpropagation
nn.utils.clip_grad_norm_(net.parameters(), 10) # clip gradients
optimizer.step() # update network weights
# record statistics
prob, max_index = torch.max(outputs, dim=2)
train_loss += loss.item()
train_total += len(targets)
for i in range(BATCH_SIZE):
raw_pred = list(max_index[:, i].cpu().numpy())
pred = [c for c, _ in groupby(raw_pred) if c != BLANK_LABEL]
target = list(targets[i].cpu().numpy())
if pred == target:
train_correct += 1
# print statistics every 10 batches
if (batch_index + 1) % 10 == 0:
print(f'Epoch {epoch + 1}/{epochs}, ' +
f'Batch {batch_index + 1}/{len(train_dataloader)}, ' +
f'Train Loss: {(train_loss/1):.5f}, ' +
f'Train Accuracy: {(train_correct/train_total):.5f}')
train_loss, train_correct, train_total = 0, 0, 0

This error will occur when the index of blank is larger than the total number of classes, which equals number of chars + blank. What's more, the index starts from 0, instead of 1, so if you have 62 characters in total, their index should be 0-61 and the index of blank should be 62 instead of 63. (Or you can set blank as 0, other characters from 1-62)
You should also check the shape of the output tensor, it should has shape [T, B, C], where T is the time step length, B is the batch size, C is the class num, remember to add blank in to the class num or you will meet the problem

Most probably there is some problem with net shape when it's sent to CTC loss, but you should have provided the dataset to us to see the net's shape. It should be (T,N,C) , where T=input length, N=batch size, C=number of classes. And as I understand blank symbol id should in the 0..C range. Also, you should add blank symbol, for example '-' to the alphabet.

Related

How predict next word using LSTM model?

I am currently building an LSTM model in Pytorch to predict the next word of a given input.
My model:
class LSTM(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights):
super().__init__()
self.num_layers = num_layers
self.hidden_dim = hidden_dim
self.embedding_dim = embedding_dim
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
dropout=dropout_rate, batch_first=True)
self.dropout = nn.Dropout(dropout_rate)
self.linear = nn.Linear(hidden_dim, vocab_size)
if tie_weights:
#Embedding and hidden layer need to be same size for weight tieing
assert embedding_dim == hidden_dim, 'cannot tie, check dims'
self.linear.weight = self.embedding.weight
self.init_weights()
def forward(self, x):
# x is a batch of input sequences
x = self.embedding(x)
x, _ = self.lstm(x)
x = self.linear(x)
return x
def init_weights(self):
init_range_emb = 0.1
init_range_other = 1/math.sqrt(self.hidden_dim)
self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
self.linear.weight.data.uniform_(-init_range_other, init_range_other)
self.linear.bias.data.zero_()
for i in range(self.num_layers):
self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
self.hidden_dim).uniform_(-init_range_other, init_range_other)
self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim,
self.hidden_dim).uniform_(-init_range_other, init_range_other)
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 100
num_layers = 2
dropout_rate = 0.4
tie_weights = True
model = LSTM(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights)
model.to(device)
Training and evaluation funciton:
import copy
import time
criterion = nn.CrossEntropyLoss()
lr = 20.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
def train(model: nn.Module) -> None:
model.train() # turn on train mode
total_loss = 0.
log_interval = 200
start_time = time.time()
num_batches = len(train_data) // bptt
for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
data, targets = get_batch(train_data, i)
seq_len = data.size(0)
output = model(data)
loss = criterion(output.view(-1, vocab_size), targets)
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
optimizer.step()
total_loss += loss.item()
if batch % log_interval == 0 and batch > 0:
lr = scheduler.get_last_lr()[0]
ms_per_batch = (time.time() - start_time) * 1000 / log_interval
cur_loss = total_loss / log_interval
ppl = math.exp(cur_loss)
print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
total_loss = 0
start_time = time.time()
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
model.eval() # turn on evaluation mode
total_loss = 0.
with torch.no_grad():
for i in range(0, eval_data.size(0) - 1, bptt):
data, targets = get_batch(eval_data, i)
seq_len = data.size(0)
output = model(data)
output_flat = output.view(-1, vocab_size)
total_loss += seq_len * criterion(output_flat, targets).item()
return total_loss / (len(eval_data) - 1)
Training loop
best_val_loss = float('inf')
epochs = 50
best_model = None
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train(model)
val_loss = evaluate(model, val_data)
val_ppl = math.exp(val_loss)
elapsed = time.time() - epoch_start_time
print('-' * 89)
print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
print('-' * 89)
if val_loss < best_val_loss:
best_val_loss = val_loss
best_model = copy.deepcopy(model)
scheduler.step()
My problem is I have no idea how to go about this. I've seen some implementations of character based LSTM text generators but I'm looking for it to be word based. For example I want to pass an input like "How are you" and the output will included the next predicted word, like for example "How are you today"
Any help appreciated.
I would suggest to try the example in the attached link(https://www.kaggle.com/code/ysthehurricane/next-word-prediction-bi-lstm-tutorial-easy-way).
You can download the dataset from the attached link below.
(https://www.kaggle.com/datasets/dorianlazar/medium-articles-dataset)
It tries to predict the next word using Bi-directional LSTM architecture. I think that this example mostly suits to your needs, which will give you an idea to proceed further.
You can follow the instruction provided in the first link.

How to implement Laplace Posteriori Approximation on BERT in PyTorch?

I'm trying to implement the Laplace Posteriori Approximation on the last layer for the classification results obtained by BERT model. I get an error regarding input size, and after I fix it by extracting just embeddings and class labels from BERT to feed them into Laplace, I get another bunch of errors regarding input dimensions that I don't know how to debug.
As this is something I didn't find on the internet, and includes relatively new libraries, I will post here just the first error I got, code that might help in debugging and useful links.
I will update post if needed.
Of course, if someone knows how to implement Laplace Posteriori Approximation with BERT in some other library like Scikit or Trax, it would be helpful. Also, some other Transformer classification model with some other confidence approximation will be useful for me. Any help is appreciated!
Code:
# Import
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn
from transformers import BertTokenizer
from transformers import BertModel
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
import time
import os
#Toy Data
data_a_b_c = ["""category a. This is category a. In category a we talk about animals.
This category includes lions, fish, tigers, birds, elephants, mouses, dogs, cats, and all other animals."""] * 60 \
+ ["""category b. This is category b. In category b we talk about people. This category members are
Abraham Maslow, John Lennon, Drazen Petrovic, Nikola Tesla, Slavoljub Penkala, Nenad Bakic and Larry Page."""] * 60 \
+ ["""category c. This is category c. Category c is dedicated to car brands like Lamborgini, Rimac-Buggati, BMW, Mercedes,
Honda, Opel, Wolkswagen, and etc."""] * 60
label_0_1_2 = [0] * 60 + [1] * 60 + [2] * 60
d = {'text': data_a_b_c, 'labels': label_0_1_2}
df = pd.DataFrame(data=d)
print(df.head(3))
print(df.tail(3))
print(df.info())
# Parameters
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
batch_size = 2
learning_rate = 3e-4
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
labels = pd.Series(df.labels.values).to_dict()
num_classes = 3
print(f'Tokenizer: {tokenizer}, Batch size:{batch_size}, Learning rate:{learning_rate}, Epochs:{epochs}')
print('Device: ', device)
print('Number of possible classes: ', num_classes)
# Model Architecture
class TransformerModel(nn.Module):
def __init__(self, num_classes, dropout=0.5):
super(TransformerModel, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(768, num_classes)
self.relu = nn.ReLU()
def forward(self, input_id, mask):
_, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
dropout_output = self.dropout(pooled_output)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
return final_layer
# Prepare Data Function
def prepare_data(data, labels):
texts = tokenizer(data, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
input_ids = texts['input_ids']
attention_mask = texts['attention_mask']
train_dataset = TensorDataset(input_ids, attention_mask, torch.LongTensor(labels))
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
return dataloader
#Run Training Function
def run_training(train_dataloader, val_dataloader, epochs=epochs, lr=learning_rate):
def train(dataloader):
model.train()
total_acc, total_count = 0, 0
log_interval = 128
start_time = time.time()
for idx, (input_id, mask, label) in enumerate(train_dataloader):
# print(idx)
mask = mask.to(device)
input_id = input_id.to(device)
label = label.type(torch.LongTensor).to(device)
output = model(input_id, mask)
optimizer.zero_grad()
loss = criterion(output, label)
loss.backward()
# torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
optimizer.step()
total_acc += (output.argmax(1) == label).sum().item()
total_count += label.size(0)
if idx % log_interval == 0 and idx > 0:
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches '
'| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
total_acc / total_count))
total_acc, total_count = 0, 0
start_time = time.time()
def evaluate(dataloader):
model.eval()
total_acc, total_count = 0, 0
with torch.no_grad():
for idx, (input_id, mask, label) in enumerate(dataloader):
mask = mask.to(device)
input_id = input_id.to(device)
label = label.to(device)
output = model(input_id, mask)
total_acc += (output.argmax(1) == label).sum().item()
total_count += label.size(0)
return total_acc / total_count
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
device = 'cuda'
model.to(device)
total_accu = None
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train(train_dataloader)
accu_val = evaluate(val_dataloader)
if total_accu is not None and total_accu > accu_val:
scheduler.step()
else:
total_accu = accu_val
print('-' * 59)
print('| end of epoch {:3d} | time: {:5.2f}s | '
'valid accuracy {:8.3f} '.format(epoch,
time.time() - epoch_start_time,
accu_val))
print('-' * 59)
# Data Split And Preparation
X_train, X_test, y_train, y_test = train_test_split(df.text.values.tolist(), df.labels.values.tolist(), test_size=0.2, random_state=2)
train_dataloader = prepare_data(X_train, y_train)
val_dataloader = prepare_data(X_test, y_test)
# Run The Model
model = TransformerModel(num_classes)
run_training(train_dataloader, val_dataloader)
print('finished')
# Save And Load The Model (if needed)
PATH = ".../Torch_BERT_model"
torch.save(model, os.path.join(PATH, "Toy_Data_BERT.pth"))
model = torch.load(os.path.join(PATH, "Toy_Data_BERT.pth"))
print(model)
# Laplace
from laplace import Laplace
la = Laplace(model, 'classification', subset_of_weights='last_layer', hessian_structure='full')
la.fit(train_dataloader)
Error I get:
--------------------------------------------------------------------------- ValueError Traceback (most recent call
last) ~\AppData\Local\Temp\ipykernel_7144\3779742208.py in <cell line:
2>()
1 la = Laplace(model, 'classification', subset_of_weights='last_layer', hessian_structure='full')
----> 2 la.fit(train_dataloader)
~\anaconda3\lib\site-packages\laplace\lllaplace.py in fit(self,
train_loader, override)
98
99 if self.model.last_layer is None:
--> 100 X, _ = next(iter(train_loader))
101 with torch.no_grad():
102 try:
ValueError: too many values to unpack (expected 2)
Useful link for Laplace implementation with examples:
https://aleximmer.github.io/Laplace/#full-example-optimization-of-the-marginal-likelihood-and-prediction
Code that might help in debugging:
for x in train_dataloader:
print("The length of batch is:", len(x))
print()
print("The batch looks like:", x)
print()
print("The length of the first element in the batch is:") #embedding
print(len(x[0]))
print("The length of the second element in the batch is:") #1 if place is filled with word, 0 if it's empty?
print(len(x[1]))
print("The length of the third element in the batch is:") #category
print(len(x[2]))
print()
print("The lengths of the first tensor and second tensor in the first element in the batch is:")
print(len(x[0][0]), len(x[0][1])) # = max_length (512)
print("The lengths of the first tensor and second tensor in the second element in the batch is:")
print(len(x[1][0]), len(x[1][1])) # = max_length (512)
print()
print()
The laplace library expects that the dataloader returns two parameters (X,y) and that the model requires exactly one argument to make its prediction (code). But your model forward pass requires two arguments, namely input_id and mask, and your dataloader returns three arguments input_id, mask, and labels.
There are several ways to work around this limitation (e.g. return a dict with input_ids and attention_mask). The way that requires the least understanding of the internals of the laplace library is to generate the attention mask at runtime in the forward pass (not great for the performance):
class TransformerModel(nn.Module):
def __init__(self, num_classes, pad_id, dropout=0.5):
super(TransformerModel, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(768, num_classes)
self.relu = nn.ReLU()
self.pad_id = pad_id
def forward(self, input_id):
mask = (input_ids!=self.pad_id).type(input_ids.dtype)
_, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
dropout_output = self.dropout(pooled_output)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
return final_layer
model = TransformerModel(num_classes, tokenizer.pad_token_id)

Hierarchical LSTM autoencoder - model not training

I'm trying to reconstruct this paper about hierarchical autoencoder for paragraphs.
The idea is: Break a paragraph into sentences, then encode each sentence using an LSTM, and then using these encoding as an input for another LSTM that encode the entire paragraph.
Then, using a mirror decoder, decode the encoded paragraph using an LSTM into multiple sentences, and then use another LSTM to decode each word, with a linear layer on top and predicts the word.
The objective is to try to predict the original paragraph.
I've done some preprocessing, and right now I save each paragraph as a tensor of (maxSentence,maxWordsPerSentence,VocabSize), using one hot encoding.
My problem is, there model is not learning. The loss stays exactly the same and it doesn't seem as anything is happening.. I wasn't sure on how to calculate the loss (I've ran a batch all together and decoded it into multiple paragraphs, and then calculated the loss against the entire batch predictions, my train function is added below. I don't know if that is the problem (maybe I should calculate loss sentence by sentence instead the entire paragraph?) or maybe I have a problem in my model.
Encoder code:
class Encoder(nn.Module):
def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
super().__init__()
#self.embedding = nn.Embedding(input_dim, emb_dim)
self.rnn_sent = nn.GRU(input_dim, enc_hid_dim, bidirectional = True)
self.rnn_par = nn.GRU(enc_hid_dim*2, dec_hid_dim, bidirectional = True)
def forward(self, src):
outputs, hidden = self.rnn_sent(src[:,0,0])
total_out = outputs.unsqueeze(0).permute(1,0,2)
for i in range(1,src.shape[1]):
for j in range(src.shape[2]):
outputs, hidden = self.rnn_sent(src[:,i,j],hidden)
total_out = torch.cat((total_out,outputs.unsqueeze(0).permute(1,0,2)),dim=1)
outputs_par, hidden_par = self.rnn_par(total_out[:,0])
for i in range(total_out.shape[1]):
outputs_par, hidden_par = self.rnn_par(total_out[:,i],hidden_par)
return outputs_par, hidden_par
Decoder code:
class Decoder(nn.Module):
def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
super().__init__()
self.output_dim = output_dim
self.attention = attention
#self.embedding = nn.Embedding(output_dim, emb_dim)
self.rnn_par = nn.GRU((enc_hid_dim * 2), dec_hid_dim*2)
self.rnn_sen = nn.GRU(output_dim, dec_hid_dim*2)
self.fc_out = nn.Linear(dec_hid_dim*2, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, input, hidden, encoder_outputs):
output, hidden = self.rnn_par(encoder_outputs)
all_par = output.unsqueeze(0).permute(1,0,2)
for i in range(1,max_par_len):
output,hidden = self.rnn_par(output,hidden)
all_par = torch.cat((all_par,output.unsqueeze(0).permute(1,0,2)),dim=1)
for i in range(max_par_len):
output_arg = self.fc_out(all_par[:,i])
#output_argmax = F.one_hot(output_arg.argmax(dim = 1), self.output_dim).to(torch.float)
output_argmax = torch.softmax(output_arg,dim=1)
output_sen, hidden_sen = self.rnn_sen(output_argmax)
all_par_sen = output_argmax.unsqueeze(0).permute(1,0,2)
for j in range(max_sen_len - 1):
output_sen,hidden_sen = self.rnn_sen(output_argmax,hidden_sen)
output_arg = self.fc_out(output_sen)
output_argmax = torch.softmax(output_arg,dim=1)
all_par_sen = torch.cat((all_par_sen,output_argmax.unsqueeze(0).permute(1,0,2)),dim=1)
if i == 0:
all_doc = all_par_sen.unsqueeze(0).permute(1,0,2,3)
else:
all_doc = torch.cat((all_doc,all_par_sen.unsqueeze(0).permute(1,0,2,3)),dim=1)
i+=1
return all_doc ,hidden_sen
And my train function:
def train(model, iterator, optimizer, criterion, clip, epoch):
model.train()
epoch_loss = 0
data = tqdm(iterator)
for i, batch in enumerate(data):
src = batch[0].to(device)#.to(torch.long)#.reshape(batch[0].shape[0],-1)
trg = batch[0].to(device)#.to(torch.long)#.reshape(batch[0].shape[0],-1)
target = torch.argmax(trg,dim=3).view(-1)
print(target)
optimizer.zero_grad()
output = model(src, trg).view(-1,OUTPUT_DIM)
loss = criterion(output, target)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
optimizer.step()
epoch_loss += loss.item()
N_EPOCHS = 20
CLIP = 1
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(ignore_index = vocabulary['<pad>'])
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
start_time = time.time()
train_loader, valid_loader = data_loaders['train_loader'], data_loaders['test_loader']
train_loss = train(model, train_loader, optimizer, criterion, CLIP,f'{epoch+1}/{N_EPOCHS}')
#valid_loss = evaluate(model, valid_loader, criterion)
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

Set torch.cuda.IntTensor to Long for embedding layer

Right now I am having trouble telling my network to understand my data because I have created an error with the embedding layer. It is expecting a Long but I am passing it an torch.cuda.IntTensor
My Model
class LSTMClassification(torch.nn.Module):
def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
super().__init__()
# params: "n_" means dimension
self.n_vocab = n_vocab # number of unique words in vocabulary
self.n_layers = n_layers # number of LSTM layers
self.n_hidden = n_hidden # number of hidden nodes in LSTM
self.embedding = torch.nn.Embedding(n_vocab, n_embed)
self.lstm = torch.nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
self.dropout = torch.nn.Dropout(drop_p)
self.fc = torch.nn.Linear(n_hidden, n_output)
self.sigmoid = torch.nn.Sigmoid()
def forward (self, input_words):
# INPUT : (batch_size, seq_length)
embedded_words = self.embedding(input_words) # (batch_size, seq_length, n_embed)
lstm_out, h = self.lstm(embedded_words) # (batch_size, seq_length, n_hidden)
lstm_out = self.dropout(lstm_out)
lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
fc_out = self.fc(lstm_out) # (batch_size*seq_length, n_output)
sigmoid_out = self.sigmoid(fc_out) # (batch_size*seq_length, n_output)
sigmoid_out = sigmoid_out.view(batch_size, -1) # (batch_size, seq_length*n_output)
# extract the output of ONLY the LAST output of the LAST element of the sequence
sigmoid_last = sigmoid_out[:, -1] # (batch_size, 1)
return sigmoid_last, h
def init_hidden (self, batch_size): # initialize hidden weights (h,c) to 0
device = "cuda" if torch.cuda.is_available() else "cpu"
weights = next(self.parameters()).data
h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
return h
**Model Initialization & Training **
num_epochs_first = 1
for epoch in range(n_epochs):
h = model.init_hidden(batch_size)
for i, data in enumerate(train_loader, 0):
step += 1
inputs, labels = data[0].to(device), data[1].to(device)
# making requires_grad = False for the latest set of h
h = tuple([each.data for each in h])
model.zero_grad()
output, h = model(inputs)
#loss = criterion(output, labels)
#loss.backward()
#optimizer.step()
My Error
RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.IntTensor instead (while checking arguments for embedding)
**
The Error occurs on line:
output, h = model(inputs)
**

Ran out of Ram while training LSTM

I am kind of a beginner in RNNs, so I coded a LSTM architecture using Pytorch, but I always run out of RAM whenever I am in the 3rd epoch. I am already using a DataLoader and I tried to detach the gradient from the input tensor but it doesn't solve the problem out.
This is my training loop
writer = SummaryWriter()
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index = 0)
optimizer = optim.Adam(lstm.parameters(), lr = 1e-5)
gradient_clip = clip_grad_norm_(lstm.parameters(), max_norm = 5)
num_epochs = 20
epoch_loss = -1.0
loss = - 1
t = trange(num_epochs, desc= "Epoch loss", leave=True)
for epoch in t:
trainLoader = iter(DataLoader(dataset, batch_size = batch_size))
tt = trange(len(trainLoader)-1, desc= "Batch loss", leave=True)
for i in tt:
text, embedding = next(trainLoader)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
y = lstm.forward(embedding.transpose(1,0))
labels = text.transpose(0,1)[1:].transpose(0,1).flatten()
loss = criterion(y.reshape(-1, y.shape[-1]), labels)
tt.set_description("Batch loss : %.4f" % loss)
tt.refresh()
loss.backward(retain_graph=True)
optimizer.step()
epoch_loss += loss
epoch_loss = epoch_loss / (len(trainLoader) - 1)
# Saving model
save_date = datetime.now().strftime("%d%m%Y-%H:%M:%S")
PATH = './save/lstm_model_'+save_date
torch.save(lstm, PATH)
# Updating progression bar
t.set_description("Epoch loss : %.4f" % epoch_loss)
t.refresh()
# Plotting gradients histograms in Tensorboard
writer.add_scalar('Text_generation_Loss/train', epoch_loss, epoch)
for tag, parm in lstm.named_parameters():
with torch.no_grad():
writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)
writer.flush()
print('Finished Training')
writer.close()
And this is the LSTM class that I built:
class LSTM(nn.Module):
def __init__(self, in_size : int, hidden_size : int):
super().__init__()
self.in_size = in_size
self.hidden_size = hidden_size
self.W_fi = nn.Linear(in_size,hidden_size)
self.W_fh = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_ii = nn.Linear(in_size,hidden_size)
self.W_ih = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_Ci = nn.Linear(in_size,hidden_size)
self.W_Ch = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_oi = nn.Linear(in_size,hidden_size)
self.W_oh = nn.Linear(hidden_size,hidden_size, bias=False)
self.sigmoid = nn.Sigmoid()
self.tanh = nn.Tanh()
def one_step(self, x, h, C):
f_t = self.sigmoid(self.W_fi(x) + self.W_fh(h))
i_t = self.sigmoid(self.W_ii(x) + self.W_ih(h))
g_t = self.tanh(self.W_Ci(x) + self.W_Ch(h))
C_t = torch.mul(f_t, C) + torch.mul(i_t, g_t)
o_t = self.sigmoid(self.W_oi(x) + self.W_oh(h))
h_t = torch.mul(o_t, self.tanh(C_t))
return h_t, C_t
def forward(self, X):
h_out = []
h = - torch.ones(X.shape[1], self.hidden_size)
C = - torch.ones(X.shape[1], self.hidden_size)
h_t, C_t = self.one_step(X[0], h, C)
h_out.append(h_t)
for i in range(1, X.shape[0] - 1):
h_t, C_t = self.one_step(X[i], h_t, C_t)
h_out.append(h_t)
h_out = torch.cat(h_out)
return h_out #h_out.reshape(-1,batch_size,num_embeddings)
I already searched for a similar case but I wasn't able to find a solution
I don't know if it may help somebody, but I solved the problem. I wasn't perhaps clear about the task, but the goal was to make text generation. The first thing I was doing is embed the sentences using torch.nn.embedding that was defined outside my LSTM. The solution was to include it as a layer of my network, since the embedding is not a pretrained one and should be learned too.

Categories

Resources