Hierarchical LSTM autoencoder - model not training - python

I'm trying to reconstruct this paper about hierarchical autoencoder for paragraphs.
The idea is: Break a paragraph into sentences, then encode each sentence using an LSTM, and then using these encoding as an input for another LSTM that encode the entire paragraph.
Then, using a mirror decoder, decode the encoded paragraph using an LSTM into multiple sentences, and then use another LSTM to decode each word, with a linear layer on top and predicts the word.
The objective is to try to predict the original paragraph.
I've done some preprocessing, and right now I save each paragraph as a tensor of (maxSentence,maxWordsPerSentence,VocabSize), using one hot encoding.
My problem is, there model is not learning. The loss stays exactly the same and it doesn't seem as anything is happening.. I wasn't sure on how to calculate the loss (I've ran a batch all together and decoded it into multiple paragraphs, and then calculated the loss against the entire batch predictions, my train function is added below. I don't know if that is the problem (maybe I should calculate loss sentence by sentence instead the entire paragraph?) or maybe I have a problem in my model.
Encoder code:
class Encoder(nn.Module):
def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
super().__init__()
#self.embedding = nn.Embedding(input_dim, emb_dim)
self.rnn_sent = nn.GRU(input_dim, enc_hid_dim, bidirectional = True)
self.rnn_par = nn.GRU(enc_hid_dim*2, dec_hid_dim, bidirectional = True)
def forward(self, src):
outputs, hidden = self.rnn_sent(src[:,0,0])
total_out = outputs.unsqueeze(0).permute(1,0,2)
for i in range(1,src.shape[1]):
for j in range(src.shape[2]):
outputs, hidden = self.rnn_sent(src[:,i,j],hidden)
total_out = torch.cat((total_out,outputs.unsqueeze(0).permute(1,0,2)),dim=1)
outputs_par, hidden_par = self.rnn_par(total_out[:,0])
for i in range(total_out.shape[1]):
outputs_par, hidden_par = self.rnn_par(total_out[:,i],hidden_par)
return outputs_par, hidden_par
Decoder code:
class Decoder(nn.Module):
def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
super().__init__()
self.output_dim = output_dim
self.attention = attention
#self.embedding = nn.Embedding(output_dim, emb_dim)
self.rnn_par = nn.GRU((enc_hid_dim * 2), dec_hid_dim*2)
self.rnn_sen = nn.GRU(output_dim, dec_hid_dim*2)
self.fc_out = nn.Linear(dec_hid_dim*2, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, input, hidden, encoder_outputs):
output, hidden = self.rnn_par(encoder_outputs)
all_par = output.unsqueeze(0).permute(1,0,2)
for i in range(1,max_par_len):
output,hidden = self.rnn_par(output,hidden)
all_par = torch.cat((all_par,output.unsqueeze(0).permute(1,0,2)),dim=1)
for i in range(max_par_len):
output_arg = self.fc_out(all_par[:,i])
#output_argmax = F.one_hot(output_arg.argmax(dim = 1), self.output_dim).to(torch.float)
output_argmax = torch.softmax(output_arg,dim=1)
output_sen, hidden_sen = self.rnn_sen(output_argmax)
all_par_sen = output_argmax.unsqueeze(0).permute(1,0,2)
for j in range(max_sen_len - 1):
output_sen,hidden_sen = self.rnn_sen(output_argmax,hidden_sen)
output_arg = self.fc_out(output_sen)
output_argmax = torch.softmax(output_arg,dim=1)
all_par_sen = torch.cat((all_par_sen,output_argmax.unsqueeze(0).permute(1,0,2)),dim=1)
if i == 0:
all_doc = all_par_sen.unsqueeze(0).permute(1,0,2,3)
else:
all_doc = torch.cat((all_doc,all_par_sen.unsqueeze(0).permute(1,0,2,3)),dim=1)
i+=1
return all_doc ,hidden_sen
And my train function:
def train(model, iterator, optimizer, criterion, clip, epoch):
model.train()
epoch_loss = 0
data = tqdm(iterator)
for i, batch in enumerate(data):
src = batch[0].to(device)#.to(torch.long)#.reshape(batch[0].shape[0],-1)
trg = batch[0].to(device)#.to(torch.long)#.reshape(batch[0].shape[0],-1)
target = torch.argmax(trg,dim=3).view(-1)
print(target)
optimizer.zero_grad()
output = model(src, trg).view(-1,OUTPUT_DIM)
loss = criterion(output, target)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
optimizer.step()
epoch_loss += loss.item()
N_EPOCHS = 20
CLIP = 1
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(ignore_index = vocabulary['<pad>'])
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
start_time = time.time()
train_loader, valid_loader = data_loaders['train_loader'], data_loaders['test_loader']
train_loss = train(model, train_loader, optimizer, criterion, CLIP,f'{epoch+1}/{N_EPOCHS}')
#valid_loss = evaluate(model, valid_loader, criterion)
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

Related

How predict next word using LSTM model?

I am currently building an LSTM model in Pytorch to predict the next word of a given input.
My model:
class LSTM(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights):
super().__init__()
self.num_layers = num_layers
self.hidden_dim = hidden_dim
self.embedding_dim = embedding_dim
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
dropout=dropout_rate, batch_first=True)
self.dropout = nn.Dropout(dropout_rate)
self.linear = nn.Linear(hidden_dim, vocab_size)
if tie_weights:
#Embedding and hidden layer need to be same size for weight tieing
assert embedding_dim == hidden_dim, 'cannot tie, check dims'
self.linear.weight = self.embedding.weight
self.init_weights()
def forward(self, x):
# x is a batch of input sequences
x = self.embedding(x)
x, _ = self.lstm(x)
x = self.linear(x)
return x
def init_weights(self):
init_range_emb = 0.1
init_range_other = 1/math.sqrt(self.hidden_dim)
self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
self.linear.weight.data.uniform_(-init_range_other, init_range_other)
self.linear.bias.data.zero_()
for i in range(self.num_layers):
self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
self.hidden_dim).uniform_(-init_range_other, init_range_other)
self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim,
self.hidden_dim).uniform_(-init_range_other, init_range_other)
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 100
num_layers = 2
dropout_rate = 0.4
tie_weights = True
model = LSTM(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights)
model.to(device)
Training and evaluation funciton:
import copy
import time
criterion = nn.CrossEntropyLoss()
lr = 20.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
def train(model: nn.Module) -> None:
model.train() # turn on train mode
total_loss = 0.
log_interval = 200
start_time = time.time()
num_batches = len(train_data) // bptt
for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
data, targets = get_batch(train_data, i)
seq_len = data.size(0)
output = model(data)
loss = criterion(output.view(-1, vocab_size), targets)
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
optimizer.step()
total_loss += loss.item()
if batch % log_interval == 0 and batch > 0:
lr = scheduler.get_last_lr()[0]
ms_per_batch = (time.time() - start_time) * 1000 / log_interval
cur_loss = total_loss / log_interval
ppl = math.exp(cur_loss)
print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
total_loss = 0
start_time = time.time()
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
model.eval() # turn on evaluation mode
total_loss = 0.
with torch.no_grad():
for i in range(0, eval_data.size(0) - 1, bptt):
data, targets = get_batch(eval_data, i)
seq_len = data.size(0)
output = model(data)
output_flat = output.view(-1, vocab_size)
total_loss += seq_len * criterion(output_flat, targets).item()
return total_loss / (len(eval_data) - 1)
Training loop
best_val_loss = float('inf')
epochs = 50
best_model = None
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train(model)
val_loss = evaluate(model, val_data)
val_ppl = math.exp(val_loss)
elapsed = time.time() - epoch_start_time
print('-' * 89)
print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
print('-' * 89)
if val_loss < best_val_loss:
best_val_loss = val_loss
best_model = copy.deepcopy(model)
scheduler.step()
My problem is I have no idea how to go about this. I've seen some implementations of character based LSTM text generators but I'm looking for it to be word based. For example I want to pass an input like "How are you" and the output will included the next predicted word, like for example "How are you today"
Any help appreciated.
I would suggest to try the example in the attached link(https://www.kaggle.com/code/ysthehurricane/next-word-prediction-bi-lstm-tutorial-easy-way).
You can download the dataset from the attached link below.
(https://www.kaggle.com/datasets/dorianlazar/medium-articles-dataset)
It tries to predict the next word using Bi-directional LSTM architecture. I think that this example mostly suits to your needs, which will give you an idea to proceed further.
You can follow the instruction provided in the first link.

TypeError: cross_entropy_loss(): argument ‘input’ (position 1) must be Tensor, not Linear

I was following a youtube video and learning to make a chat bot, the teacher explained this step to make the training model, the code compiled perfectly for the teacher but im getting an error. What am i doing wrong?
for epoch in range(num_epochs):
for (words, labels) in train_loader:
words = words.to(device)
labels = labels.to(device, dtype=torch.int64)
outputs= model(words)
loss = criterion(outputs,labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if(epoch +1) % 100 == 0:
print(f'epoch {epoch+1}/{epoch}, loss = {loss.item():.4f}')
print(f'epoch {epoch+1}/{epoch}, loss = {loss.item():.4f}')
NeuralNet:
class NeuralNet(nn.Module):
def __init__(self,input_size, hidden_size,num_classes):
super(NeuralNet,self).__init__()
self.l1 = nn.Linear(input_size,hidden_size)
self.l2 = nn.Linear(hidden_size,hidden_size)
self.l3 = nn.Linear(hidden_size,num_classes)
self.relu = nn.ReLU()
def forward(self,x):
out = self.l1(x)
out = self.relu(out)
out = self.l2(out)
out = self.relu(out)
out = self.l3
return out
The issue is with the NeuralNet code specifically in the line:
out = self.l3
You are setting out to be the Linear layer instead of calling the linear layer on the data. Change it to
out = self.l3(out)
and it will work

Neural network keep predicting the same number

I have a ROS application where a camera node sends an image via service to a neutral network node. My training and validation dataset I use is the MNIST database. It should be very easy to predict a number, but the neural network returns the same number for every single service request.
ai_service.py
class AiService():
def __init__(self, save_path):
self.batch_size = 2800
self.epochs = 25
self.learning_rate = 0.01
self.training_data = torch.utils.data.DataLoader(datasets.MNIST(root='./data', train=True, download=True,
transform=transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))])), 200, shuffle=True)
self.validation_data = torch.utils.data.DataLoader(datasets.MNIST(root='./data', train=False, download=True,
transform=transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))])), 200, shuffle=True)
...
# Function to train the mnist dataset.
def training(self):
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(self.model.parameters(), self.learning_rate)
start_time = time()
for epoch in range(self.epochs):
running_loss = 0
# trainig phase
for images, labels in self.training_data:
optimizer.zero_grad()
image, label = images.to(self.device), labels.to(self.device)
output = self.model(image)
loss = criterion(output, label)
loss.backward()
optimizer.step() #optimizing weights
running_loss += loss.item()
else:
print("Epoch {} - Training loss: {:.10f}".format(epoch, running_loss / len(self.training_data)))
print("\nTraining Time (in minutes): {:.2f} =".format((time() - start_time) / 60))
def validating(self, request_image):
self.model.eval()
tensor_image = self.image_to_tensor(request_image)
with torch.no_grad():
output = self.model(tensor_image)
return output.cpu().data.numpy().argmax()
def image_to_tensor(self, request_image):
return transforms.ToTensor()(self.cv_bridge.imgmsg_to_cv2(request_image, 'mono8'))
neural_network.py
class NeuralNetwork(nn.Module):
# Initializes the Neural Network by setting up the layers.
def __init__(self):
super().__init__()
self.flatten = nn.Flatten()
self.input_layer = nn.Sequential(nn.Linear(28*28, 512))
self.hidden_layer1 = nn.Linear(512, 254)
self.hidden_layer2 = nn.Linear(254, 128)
self.output_layer = nn.Linear(128, 10)
def forward(self, x):
x = self.flatten(x)
x = F.relu(self.input_layer(x))
x = F.relu(self.hidden_layer1(x))
x = F.relu(self.hidden_layer2(x))
x = self.output_layer(x)
return F.log_softmax(x, 1)
I get get a training accuracy of:
My output:
My camera image:
Could it be because of the resizing and grayscaling that the picture is not recognized? I just added imshow to the def image_to_tensor(self, request_image): function and the image is barely recognisable.

pytorch multi-class lstm predicting all one class on testing

I'm working on a project (my first AI project) and I've hit a bit of a wall. When performing testing on my trained classifier, it's predicting that everything is of class 1. Now the data set is heavily biased to class 1; however, I've implemented weights to compensate for this. Just concerned that I've coded this wrong or missed something. Please let me know if you see anything.
This is the setup and training
batchSize = 50
trainingLoad = DataLoader(trainingData, shuffle = True, batch_size = batchSize, drop_last=True)
validationLoad = DataLoader(validationData, shuffle = True, batch_size = batchSize, drop_last=True)
testingLoad = DataLoader(testingData, shuffle = True, batch_size = batchSize, drop_last=True)
vocabularySize = len(wordToNoDict)
output = 3
embedding = 400
hiddenDimension = 524
layers = 4
classifierModel = Classifier.HateSpeechDetector(device, vocabularySize, output, embedding, hiddenDimension, layers)
classifierModel.to(device)
path = 'Program\data\state_dict2.pt'
weights = torch.tensor([1203/1203, 1203/15389, 1203/3407])
criterion = nn.CrossEntropyLoss(weight = weights)
trainClassifier(classifierModel, trainingLoad, validationLoad, device, batchSize, criterion, path)
test(classifierModel, path, testingLoad, batchSize, device, criterion)
def trainClassifier(model, trainingData, validationData, device, batchSize, criterion, path):
epochs = 5
counter = 0
testWithValiEvery = 10
clip = 5
valid_loss_min = np.Inf
lr=0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
model.train()
for i in range(epochs):
h = model.init_hidden(batchSize, device)
for inputs, labels in trainingData:
h = tuple([e.data for e in h])
inputs, labels = inputs.to(device), labels.to(device)
model.zero_grad()
output, h = model(inputs, h)
loss = criterion(output.squeeze(), labels.long())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), clip)
optimizer.step()
counter += 1
print(counter)
if counter%testWithValiEvery == 0:
print("validating")
val_h = model.init_hidden(batchSize, device)
val_losses = []
model.eval()
for inp, lab in validationData:
val_h = tuple([each.data for each in val_h])
inp, lab = inp.to(device), lab.to(device)
out, val_h = model(inp, val_h)#
val_loss = criterion(out.squeeze(), lab.long())
val_losses.append(val_loss.item())
model.train()
print("Epoch: {}/{}...".format(i+1, epochs),
"Step: {}...".format(counter),
"Loss: {:.6f}...".format(loss.item()),
"Val Loss: {:.6f}".format(np.mean(val_losses)))
if np.mean(val_losses) <= valid_loss_min:
torch.save(model.state_dict(), path)
print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
print('model saved')
valid_loss_min = np.mean(val_losses)
This is the classifier - Fair amount of random commenting here where i've meddled with bits
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as op
import torchvision
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms, datasets
class HateSpeechDetector(nn.Module):
def __init__(self, device, vocabularySize, output, embedding, hidden, layers, dropProb=0.5):
super(HateSpeechDetector, self).__init__()
#Number of outputs (Classes/Categories)
self.output = output
#Number of layers in the LSTM
self.numLayers = layers
#Number of hidden neurons in each LSTM layer
self.hiddenDimensions = hidden
#Device being used for by model (CPU or GPU)
self.device = device
#Embedding layer finds correlations in words by converting word integers into vectors
self.embedding = nn.Embedding(vocabularySize, embedding)
#LSTM stores important data in memory, using it to help with future predictions
self.lstm = nn.LSTM(embedding,hidden,layers,dropout=dropProb,batch_first=True)
#Dropout is used to randomly drop nodes. This helps to prevent overfitting of the model during training
self.dropout = nn.Dropout(dropProb)
#Establishing 4 simple layers and a sigmoid output
self.fc = nn.Linear(hidden, hidden)
self.fc2 = nn.Linear(hidden, hidden)
self.fc3 = nn.Linear(hidden, hidden)
self.fc4 = nn.Linear(hidden, hidden)
self.fc5 = nn.Linear(hidden, hidden)
self.fc6 = nn.Linear(hidden, output)
self.softmax = nn.Softmax(dim=2)
def forward(self, x, hidden):
batchSize = x.size(0)
x = x.long()
embeds = self.embedding(x)
lstm_out, hidden = self.lstm(embeds, hidden)
#Tensor changes here from 250,33,524 to 8250,524
# lstm_out = lstm_out.contiguous().view(-1,self.hiddenDimensions)
out = self.dropout(lstm_out)
out = self.fc(out)
out = self.fc2(out)
out = self.fc3(out)
out = self.fc4(out)
out = self.fc5(out)
out = self.fc6(out)
out = self.softmax(out)
out = out[:,-1,:]
# myTensor = torch.Tensor([0,0,0])
# newOut = torch.zeros(batchSize, self.output)
# count = 0
# row = 0
# for tensor in out:
# if(count == 33):
# newOut[row] = myTensor/33
# myTensor = torch.Tensor([0,0,0])
# row += 1
# count = 0
# myTensor += tensor
# count += 1
return out, hidden
def init_hidden(self, batchSize, device):
weight = next(self.parameters()).data
hidden = (weight.new(self.numLayers, batchSize, self.hiddenDimensions).zero_().to(device), weight.new(self.numLayers, batchSize, self.hiddenDimensions).zero_().to(device))
return hidden
You've added weights to the cross-entropy loss, and the weights bias towards the first class already ([1.0, 0.08, 0.35]).
Having a higher weight for a certain class means that the model will be more heavily penalized for getting that class wrong, and it's possible for the model to learn to just predict everything as the class with highest weight. Usually you don't need to manually assign weights.
Also, check your data to see if there's label imbalance, i.e., whether you have more training examples that are of the first class. An imbalanced training set has similar effects as setting different weights on the loss.

Ran out of Ram while training LSTM

I am kind of a beginner in RNNs, so I coded a LSTM architecture using Pytorch, but I always run out of RAM whenever I am in the 3rd epoch. I am already using a DataLoader and I tried to detach the gradient from the input tensor but it doesn't solve the problem out.
This is my training loop
writer = SummaryWriter()
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index = 0)
optimizer = optim.Adam(lstm.parameters(), lr = 1e-5)
gradient_clip = clip_grad_norm_(lstm.parameters(), max_norm = 5)
num_epochs = 20
epoch_loss = -1.0
loss = - 1
t = trange(num_epochs, desc= "Epoch loss", leave=True)
for epoch in t:
trainLoader = iter(DataLoader(dataset, batch_size = batch_size))
tt = trange(len(trainLoader)-1, desc= "Batch loss", leave=True)
for i in tt:
text, embedding = next(trainLoader)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
y = lstm.forward(embedding.transpose(1,0))
labels = text.transpose(0,1)[1:].transpose(0,1).flatten()
loss = criterion(y.reshape(-1, y.shape[-1]), labels)
tt.set_description("Batch loss : %.4f" % loss)
tt.refresh()
loss.backward(retain_graph=True)
optimizer.step()
epoch_loss += loss
epoch_loss = epoch_loss / (len(trainLoader) - 1)
# Saving model
save_date = datetime.now().strftime("%d%m%Y-%H:%M:%S")
PATH = './save/lstm_model_'+save_date
torch.save(lstm, PATH)
# Updating progression bar
t.set_description("Epoch loss : %.4f" % epoch_loss)
t.refresh()
# Plotting gradients histograms in Tensorboard
writer.add_scalar('Text_generation_Loss/train', epoch_loss, epoch)
for tag, parm in lstm.named_parameters():
with torch.no_grad():
writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)
writer.flush()
print('Finished Training')
writer.close()
And this is the LSTM class that I built:
class LSTM(nn.Module):
def __init__(self, in_size : int, hidden_size : int):
super().__init__()
self.in_size = in_size
self.hidden_size = hidden_size
self.W_fi = nn.Linear(in_size,hidden_size)
self.W_fh = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_ii = nn.Linear(in_size,hidden_size)
self.W_ih = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_Ci = nn.Linear(in_size,hidden_size)
self.W_Ch = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_oi = nn.Linear(in_size,hidden_size)
self.W_oh = nn.Linear(hidden_size,hidden_size, bias=False)
self.sigmoid = nn.Sigmoid()
self.tanh = nn.Tanh()
def one_step(self, x, h, C):
f_t = self.sigmoid(self.W_fi(x) + self.W_fh(h))
i_t = self.sigmoid(self.W_ii(x) + self.W_ih(h))
g_t = self.tanh(self.W_Ci(x) + self.W_Ch(h))
C_t = torch.mul(f_t, C) + torch.mul(i_t, g_t)
o_t = self.sigmoid(self.W_oi(x) + self.W_oh(h))
h_t = torch.mul(o_t, self.tanh(C_t))
return h_t, C_t
def forward(self, X):
h_out = []
h = - torch.ones(X.shape[1], self.hidden_size)
C = - torch.ones(X.shape[1], self.hidden_size)
h_t, C_t = self.one_step(X[0], h, C)
h_out.append(h_t)
for i in range(1, X.shape[0] - 1):
h_t, C_t = self.one_step(X[i], h_t, C_t)
h_out.append(h_t)
h_out = torch.cat(h_out)
return h_out #h_out.reshape(-1,batch_size,num_embeddings)
I already searched for a similar case but I wasn't able to find a solution
I don't know if it may help somebody, but I solved the problem. I wasn't perhaps clear about the task, but the goal was to make text generation. The first thing I was doing is embed the sentences using torch.nn.embedding that was defined outside my LSTM. The solution was to include it as a layer of my network, since the embedding is not a pretrained one and should be learned too.

Categories

Resources