pytorch multi-class lstm predicting all one class on testing

pytorch multi-class lstm predicting all one class on testing - python

I'm working on a project (my first AI project) and I've hit a bit of a wall. When performing testing on my trained classifier, it's predicting that everything is of class 1. Now the data set is heavily biased to class 1; however, I've implemented weights to compensate for this. Just concerned that I've coded this wrong or missed something. Please let me know if you see anything.
This is the setup and training
# --- Data loading -----------------------------------------------------------
batchSize = 50
# drop_last=True keeps every batch at exactly batchSize rows; the training
# loop builds its LSTM hidden state once per epoch for that fixed batch size.
trainingLoad = DataLoader(trainingData, shuffle=True, batch_size=batchSize, drop_last=True)
validationLoad = DataLoader(validationData, shuffle=True, batch_size=batchSize, drop_last=True)
testingLoad = DataLoader(testingData, shuffle=True, batch_size=batchSize, drop_last=True)

# --- Model ------------------------------------------------------------------
vocabularySize = len(wordToNoDict)
output = 3              # number of classes
embedding = 400         # embedding vector size
hiddenDimension = 524   # LSTM hidden size
layers = 4              # stacked LSTM layers
classifierModel = Classifier.HateSpeechDetector(device, vocabularySize, output, embedding, hiddenDimension, layers)
classifierModel.to(device)

# Raw string: identical value, but no longer relies on "\d" and "\s" being
# unrecognised (and therefore preserved) escape sequences.
path = r'Program\data\state_dict2.pt'

# Per-class loss weights (1203 divided by 1203 / 15389 / 3407 — presumably the
# per-class sample counts, TODO confirm). The tensor is created on `device`
# because CrossEntropyLoss requires its weight on the same device as the
# model output.
weights = torch.tensor([1203/1203, 1203/15389, 1203/3407], device=device)
criterion = nn.CrossEntropyLoss(weight=weights)

trainClassifier(classifierModel, trainingLoad, validationLoad, device, batchSize, criterion, path)
test(classifierModel, path, testingLoad, batchSize, device, criterion)
def trainClassifier(model, trainingData, validationData, device, batchSize, criterion, path):
    """Train ``model`` and checkpoint it whenever the validation loss improves.

    Args:
        model: module exposing ``init_hidden(batchSize, device)`` and a
            ``forward(inputs, hidden)`` that returns ``(output, hidden)``.
        trainingData: iterable of ``(inputs, labels)`` training batches.
        validationData: iterable of ``(inputs, labels)`` validation batches.
        device: device the batches are moved to.
        batchSize: batch size used to build the initial hidden state.
        criterion: loss called as ``criterion(output, labels)``.
        path: file the best ``state_dict`` is saved to.
    """
    epochs = 5
    counter = 0
    testWithValiEvery = 10
    clip = 5
    # float("inf") instead of np.Inf: that alias was removed in NumPy 2.0.
    valid_loss_min = float("inf")
    lr = 0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()
    for i in range(epochs):
        h = model.init_hidden(batchSize, device)
        for inputs, labels in trainingData:
            # Detach the carried hidden state so backprop stops at the batch boundary.
            h = tuple(e.detach() for e in h)
            inputs, labels = inputs.to(device), labels.to(device)
            model.zero_grad()
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.long())
            loss.backward()
            # Clip the gradient norm to guard against exploding gradients.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            counter += 1
            print(counter)
            if counter % testWithValiEvery == 0:
                print("validating")
                val_h = model.init_hidden(batchSize, device)
                val_losses = []
                model.eval()
                # No graph is needed for validation; skipping it saves memory and time.
                with torch.no_grad():
                    for inp, lab in validationData:
                        val_h = tuple(each.detach() for each in val_h)
                        inp, lab = inp.to(device), lab.to(device)
                        out, val_h = model(inp, val_h)
                        val_loss = criterion(out.squeeze(), lab.long())
                        val_losses.append(val_loss.item())
                model.train()
                mean_val_loss = np.mean(val_losses)
                print("Epoch: {}/{}...".format(i+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(mean_val_loss))
                if mean_val_loss <= valid_loss_min:
                    torch.save(model.state_dict(), path)
                    print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, mean_val_loss))
                    print('model saved')
                    valid_loss_min = mean_val_loss
This is the classifier - Fair amount of random commenting here where i've meddled with bits
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as op
import torchvision
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms, datasets
class HateSpeechDetector(nn.Module):
    """Embedding -> stacked LSTM -> dense head, returning one score vector per sequence.

    ``forward`` returns raw, unnormalised class scores (logits).
    ``nn.CrossEntropyLoss`` applies log-softmax internally, so applying a
    softmax here as well (as the previous version did) squashes the scores
    twice and flattens the gradients. Call ``torch.softmax(out, dim=1)`` on
    the result when probabilities are needed.
    """

    def __init__(self, device, vocabularySize, output, embedding, hidden, layers, dropProb=0.5):
        super(HateSpeechDetector, self).__init__()
        # Number of outputs (classes/categories)
        self.output = output
        # Number of layers in the LSTM
        self.numLayers = layers
        # Number of hidden units in each LSTM layer
        self.hiddenDimensions = hidden
        # Device handed in by the caller (stored only; not read by this class)
        self.device = device
        # Maps word indices to dense vectors
        self.embedding = nn.Embedding(vocabularySize, embedding)
        self.lstm = nn.LSTM(embedding, hidden, layers, dropout=dropProb, batch_first=True)
        # Dropout on the LSTM output to reduce overfitting during training
        self.dropout = nn.Dropout(dropProb)
        # NOTE(review): there is no activation between these layers, so
        # fc..fc6 compose into a single affine map.
        self.fc = nn.Linear(hidden, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
        self.fc4 = nn.Linear(hidden, hidden)
        self.fc5 = nn.Linear(hidden, hidden)
        self.fc6 = nn.Linear(hidden, output)
        # Kept so existing references to ``model.softmax`` keep working; it is
        # no longer applied in forward().
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, hidden):
        """Score a batch of token-index sequences.

        Args:
            x: tensor of token indices, batch first.
            hidden: ``(h, c)`` LSTM state, e.g. from ``init_hidden``.

        Returns:
            ``(logits, hidden)``: logits has shape (batch, output); hidden is
            the updated LSTM state.
        """
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Only the last time step feeds the classifier, so slice before the
        # dense head instead of running it over every step and discarding
        # all but one.
        out = self.dropout(lstm_out[:, -1, :])
        out = self.fc(out)
        out = self.fc2(out)
        out = self.fc3(out)
        out = self.fc4(out)
        out = self.fc5(out)
        out = self.fc6(out)
        return out, hidden

    def init_hidden(self, batchSize, device):
        """Return a zeroed ``(h, c)`` pair of shape (numLayers, batchSize, hiddenDimensions) on ``device``."""
        weight = next(self.parameters())
        shape = (self.numLayers, batchSize, self.hiddenDimensions)
        # new_zeros inherits the parameter dtype.
        return (weight.new_zeros(shape, device=device),
                weight.new_zeros(shape, device=device))

You've added weights to the cross-entropy loss, and the weights bias towards the first class already ([1.0, 0.08, 0.35]).
Having a higher weight for a certain class means that the model will be more heavily penalized for getting that class wrong, and it's possible for the model to learn to just predict everything as the class with highest weight. Usually you don't need to manually assign weights.
Also, check your data to see if there's label imbalance, i.e., whether you have more training examples that are of the first class. An imbalanced training set has similar effects as setting different weights on the loss.

Related

How to implement Laplace Posteriori Approximation on BERT in PyTorch?

I'm trying to implement the Laplace Posteriori Approximation on the last layer for the classification results obtained by BERT model. I get an error regarding input size, and after I fix it by extracting just embeddings and class labels from BERT to feed them into Laplace, I get another bunch of errors regarding input dimensions that I don't know how to debug.
As this is something I didn't find on the internet, and includes relatively new libraries, I will post here just the first error I got, code that might help in debugging and useful links.
I will update post if needed.
Of course, if someone knows how to implement Laplace Posteriori Approximation with BERT in some other library like Scikit or Trax, it would be helpful. Also, some other Transformer classification model with some other confidence approximation will be useful for me. Any help is appreciated!
Code:
# Import
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn
from transformers import BertTokenizer
from transformers import BertModel
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
import time
import os
#Toy Data
# Three synthetic documents (one per category), each repeated 60 times,
# giving 180 rows with labels 0, 1 and 2.
data_a_b_c = ["""category a. This is category a. In category a we talk about animals.
This category includes lions, fish, tigers, birds, elephants, mouses, dogs, cats, and all other animals."""] * 60 \
+ ["""category b. This is category b. In category b we talk about people. This category members are
Abraham Maslow, John Lennon, Drazen Petrovic, Nikola Tesla, Slavoljub Penkala, Nenad Bakic and Larry Page."""] * 60 \
+ ["""category c. This is category c. Category c is dedicated to car brands like Lamborgini, Rimac-Buggati, BMW, Mercedes,
Honda, Opel, Wolkswagen, and etc."""] * 60
label_0_1_2 = [0] * 60 + [1] * 60 + [2] * 60
d = {'text': data_a_b_c, 'labels': label_0_1_2}
df = pd.DataFrame(data=d)
print(df.head(3))
print(df.tail(3))
print(df.info())
# Parameters
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
batch_size = 2
learning_rate = 3e-4
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Row index -> label mapping; not read again by the code shown in this listing.
labels = pd.Series(df.labels.values).to_dict()
num_classes = 3
print(f'Tokenizer: {tokenizer}, Batch size:{batch_size}, Learning rate:{learning_rate}, Epochs:{epochs}')
print('Device: ', device)
print('Number of possible classes: ', num_classes)
# Model Architecture
class TransformerModel(nn.Module):
    """Multilingual BERT encoder followed by dropout, a linear head and a ReLU.

    NOTE(review): the ReLU is applied to the final class scores, so the
    scores handed to the loss are never negative — confirm this is intended.
    """

    def __init__(self, num_classes, dropout=0.5):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the ReLU-ed class scores for a batch of token ids and attention masks."""
        # With return_dict=False the encoder returns a tuple; only the pooled
        # output (second element) is used.
        _sequence_output, pooled = self.bert(
            input_ids=input_id,
            attention_mask=mask,
            return_dict=False,
        )
        scores = self.linear(self.dropout(pooled))
        return self.relu(scores)
# Prepare Data Function
def prepare_data(data, labels):
    """Tokenize ``data`` and return a shuffled DataLoader of (input_ids, attention_mask, label).

    Uses the module-level ``tokenizer`` and ``batch_size``. Every text is
    padded or truncated to 512 tokens.
    """
    encoded = tokenizer(
        data,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    dataset = TensorDataset(
        encoded['input_ids'],
        encoded['attention_mask'],
        torch.LongTensor(labels),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
#Run Training Function
def run_training(train_dataloader, val_dataloader, epochs=epochs, lr=learning_rate):
    """Train the module-level ``model`` with SGD and report validation accuracy per epoch.

    Args:
        train_dataloader: yields ``(input_id, mask, label)`` training batches.
        val_dataloader: yields ``(input_id, mask, label)`` validation batches.
        epochs: number of passes over ``train_dataloader``.
        lr: initial SGD learning rate.

    The learning rate is multiplied by 0.1 whenever validation accuracy falls
    below the last recorded value.
    """
    # Use the GPU only when one is present. The previous version overwrote
    # this with the literal 'cuda', which fails on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
    model.to(device)

    def train(dataloader):
        """Run one optimisation pass over ``dataloader``, logging accuracy every 128 batches."""
        model.train()
        total_acc, total_count = 0, 0
        log_interval = 128
        # Iterate the argument (the previous version read the enclosing
        # train_dataloader and ignored its own parameter).
        for idx, (input_id, mask, label) in enumerate(dataloader):
            mask = mask.to(device)
            input_id = input_id.to(device)
            label = label.long().to(device)
            optimizer.zero_grad()
            output = model(input_id, mask)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            total_acc += (output.argmax(1) == label).sum().item()
            total_count += label.size(0)
            if idx % log_interval == 0 and idx > 0:
                # `epoch` is the loop variable of the enclosing function.
                print('| epoch {:3d} | {:5d}/{:5d} batches '
                      '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                                  total_acc / total_count))
                total_acc, total_count = 0, 0

    def evaluate(dataloader):
        """Return the classification accuracy of ``model`` over ``dataloader``."""
        model.eval()
        total_acc, total_count = 0, 0
        with torch.no_grad():
            for input_id, mask, label in dataloader:
                mask = mask.to(device)
                input_id = input_id.to(device)
                label = label.to(device)
                output = model(input_id, mask)
                total_acc += (output.argmax(1) == label).sum().item()
                total_count += label.size(0)
        return total_acc / total_count

    total_accu = None
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(train_dataloader)
        accu_val = evaluate(val_dataloader)
        if total_accu is not None and total_accu > accu_val:
            # Validation accuracy dropped: decay the learning rate.
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                               accu_val))
        print('-' * 59)
# Data Split And Preparation
X_train, X_test, y_train, y_test = train_test_split(
    df.text.values.tolist(),
    df.labels.values.tolist(),
    test_size=0.2,
    random_state=2,
)
train_dataloader = prepare_data(X_train, y_train)
val_dataloader = prepare_data(X_test, y_test)

# Run The Model
model = TransformerModel(num_classes)
run_training(train_dataloader, val_dataloader)
print('finished')

# Save And Load The Model (if needed)
PATH = ".../Torch_BERT_model"
checkpoint_file = os.path.join(PATH, "Toy_Data_BERT.pth")
torch.save(model, checkpoint_file)
model = torch.load(checkpoint_file)
print(model)

# Laplace
from laplace import Laplace
# Last-layer Laplace approximation with a full Hessian, fitted on the training batches.
la = Laplace(
    model,
    'classification',
    subset_of_weights='last_layer',
    hessian_structure='full',
)
la.fit(train_dataloader)
Error I get:
--------------------------------------------------------------------------- ValueError Traceback (most recent call
last) ~\AppData\Local\Temp\ipykernel_7144\3779742208.py in <cell line:
2>()
1 la = Laplace(model, 'classification', subset_of_weights='last_layer', hessian_structure='full')
----> 2 la.fit(train_dataloader)
~\anaconda3\lib\site-packages\laplace\lllaplace.py in fit(self,
train_loader, override)
98
99 if self.model.last_layer is None:
--> 100 X, _ = next(iter(train_loader))
101 with torch.no_grad():
102 try:
ValueError: too many values to unpack (expected 2)
Useful link for Laplace implementation with examples:
https://aleximmer.github.io/Laplace/#full-example-optimization-of-the-marginal-likelihood-and-prediction
Code that might help in debugging:
# Print the structure of every batch the training loader yields.
for batch in train_dataloader:
    first_elem, second_elem, third_elem = batch[0], batch[1], batch[2]
    print("The length of batch is:", len(batch))
    print()
    print("The batch looks like:", batch)
    print()
    # first element: embedding
    print("The length of the first element in the batch is:")
    print(len(first_elem))
    # second element: 1 if place is filled with word, 0 if it's empty?
    print("The length of the second element in the batch is:")
    print(len(second_elem))
    # third element: category
    print("The length of the third element in the batch is:")
    print(len(third_elem))
    print()
    print("The lengths of the first tensor and second tensor in the first element in the batch is:")
    # = max_length (512)
    print(len(first_elem[0]), len(first_elem[1]))
    print("The lengths of the first tensor and second tensor in the second element in the batch is:")
    # = max_length (512)
    print(len(second_elem[0]), len(second_elem[1]))
    print()
    print()

The laplace library expects that the dataloader returns two parameters (X,y) and that the model requires exactly one argument to make its prediction (code). But your model forward pass requires two arguments, namely input_id and mask, and your dataloader returns three arguments input_id, mask, and labels.
There are several ways to work around this limitation (e.g. return a dict with input_ids and attention_mask). The way that requires the least understanding of the internals of the laplace library is to generate the attention mask at runtime in the forward pass (not great for the performance):
class TransformerModel(nn.Module):
    """BERT classifier whose ``forward`` takes token ids only.

    The attention mask is rebuilt from the ids on every call (every position
    that is not ``pad_id`` is attended to), so the model can be called with a
    single tensor argument.
    """

    def __init__(self, num_classes, pad_id, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_classes)
        self.relu = nn.ReLU()
        # Token id used for padding; needed to derive the attention mask.
        self.pad_id = pad_id

    def forward(self, input_id):
        """Return class scores for a batch of token ids."""
        # Fixed: the previous version referenced an undefined name
        # `input_ids` here and raised NameError on the first call.
        mask = (input_id != self.pad_id).type(input_id.dtype)
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer
# The tokenizer's pad token id is passed in so that forward() can derive the
# attention mask from the token ids alone.
model = TransformerModel(num_classes, tokenizer.pad_token_id)

Hierarchical LSTM autoencoder - model not training

I'm trying to reconstruct this paper about hierarchical autoencoder for paragraphs.
The idea is: Break a paragraph into sentences, then encode each sentence using an LSTM, and then using these encoding as an input for another LSTM that encode the entire paragraph.
Then, using a mirror decoder, decode the encoded paragraph using an LSTM into multiple sentences, and then use another LSTM to decode each word, with a linear layer on top and predicts the word.
The objective is to try to predict the original paragraph.
I've done some preprocessing, and right now I save each paragraph as a tensor of (maxSentence,maxWordsPerSentence,VocabSize), using one hot encoding.
My problem is that the model is not learning. The loss stays exactly the same and nothing seems to be happening. I wasn't sure how to calculate the loss: I ran a whole batch together, decoded it into multiple paragraphs, and then calculated the loss against the predictions for the entire batch (my train function is added below). I don't know whether that is the problem (maybe I should calculate the loss sentence by sentence instead of over the entire paragraph?) or whether I have a problem in my model.
Encoder code:
# Two-level GRU encoder: rnn_sent reads the input vectors and rnn_par reads the
# bidirectional outputs of rnn_sent (hence its input size of enc_hid_dim*2).
# emb_dim and dropout are accepted but not used by the code shown.
class Encoder(nn.Module):
def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
super().__init__()
#self.embedding = nn.Embedding(input_dim, emb_dim)
self.rnn_sent = nn.GRU(input_dim, enc_hid_dim, bidirectional = True)
self.rnn_par = nn.GRU(enc_hid_dim*2, dec_hid_dim, bidirectional = True)
# Returns the (outputs_par, hidden_par) pair produced by the last rnn_par call.
# NOTE(review): src is indexed as src[:, i, j], so it presumably has the layout
# (batch, sentence, word, vocab) — confirm against the data loader.
# NOTE(review): both GRUs are created without batch_first, so a 2-D slice such
# as src[:, i, j] may be interpreted as ONE unbatched sequence rather than a
# batch of single time steps — verify against the nn.GRU input-shape docs.
def forward(self, src):
# Start the accumulated outputs from sentence 0, word 0.
outputs, hidden = self.rnn_sent(src[:,0,0])
total_out = outputs.unsqueeze(0).permute(1,0,2)
# The outer loop starts at 1, so the remaining words of sentence 0 are never
# passed to rnn_sent.
for i in range(1,src.shape[1]):
for j in range(src.shape[2]):
outputs, hidden = self.rnn_sent(src[:,i,j],hidden)
# Append the newest rnn_sent output along dim 1.
total_out = torch.cat((total_out,outputs.unsqueeze(0).permute(1,0,2)),dim=1)
# total_out[:,0] is consumed here and again at i == 0 in the loop below.
outputs_par, hidden_par = self.rnn_par(total_out[:,0])
for i in range(total_out.shape[1]):
outputs_par, hidden_par = self.rnn_par(total_out[:,i],hidden_par)
return outputs_par, hidden_par
Decoder code:
# Two-level GRU decoder: rnn_par unrolls paragraph-level states and rnn_sen
# unrolls word-level states that fc_out projects to output_dim scores.
# emb_dim is unused; attention and dropout are stored but not used by the
# forward() shown.
class Decoder(nn.Module):
def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
super().__init__()
self.output_dim = output_dim
self.attention = attention
#self.embedding = nn.Embedding(output_dim, emb_dim)
self.rnn_par = nn.GRU((enc_hid_dim * 2), dec_hid_dim*2)
self.rnn_sen = nn.GRU(output_dim, dec_hid_dim*2)
self.fc_out = nn.Linear(dec_hid_dim*2, output_dim)
self.dropout = nn.Dropout(dropout)
# Returns (all_doc, hidden_sen).
# `input` is never read, and the `hidden` argument is overwritten by the first
# statement, so only encoder_outputs influences the result.
# max_par_len and max_sen_len are not defined in this class — presumably
# module-level globals; verify.
def forward(self, input, hidden, encoder_outputs):
output, hidden = self.rnn_par(encoder_outputs)
all_par = output.unsqueeze(0).permute(1,0,2)
# Unroll the paragraph-level GRU, feeding each output back in as the next input.
for i in range(1,max_par_len):
output,hidden = self.rnn_par(output,hidden)
all_par = torch.cat((all_par,output.unsqueeze(0).permute(1,0,2)),dim=1)
for i in range(max_par_len):
output_arg = self.fc_out(all_par[:,i])
#output_argmax = F.one_hot(output_arg.argmax(dim = 1), self.output_dim).to(torch.float)
# NOTE(review): everything collected into all_doc is a softmax output
# (probabilities), not raw scores. The training script in this listing feeds
# the model output to nn.CrossEntropyLoss, which expects unnormalised logits —
# confirm whether logits were intended here.
output_argmax = torch.softmax(output_arg,dim=1)
output_sen, hidden_sen = self.rnn_sen(output_argmax)
all_par_sen = output_argmax.unsqueeze(0).permute(1,0,2)
# Unroll the word-level GRU, feeding the previous softmax output back in.
for j in range(max_sen_len - 1):
output_sen,hidden_sen = self.rnn_sen(output_argmax,hidden_sen)
output_arg = self.fc_out(output_sen)
output_argmax = torch.softmax(output_arg,dim=1)
all_par_sen = torch.cat((all_par_sen,output_argmax.unsqueeze(0).permute(1,0,2)),dim=1)
if i == 0:
all_doc = all_par_sen.unsqueeze(0).permute(1,0,2,3)
else:
all_doc = torch.cat((all_doc,all_par_sen.unsqueeze(0).permute(1,0,2,3)),dim=1)
# NOTE(review): if this increment sits inside the `for i` loop it has no
# effect, because the loop rebinds i on every iteration.
i+=1
return all_doc ,hidden_sen
And my train function:
def train(model, iterator, optimizer, criterion, clip, epoch):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(src, trg)``.
        iterator: yields batches whose first element is the input tensor; the
            same tensor is used as source and target.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(output, target)``.
        clip: maximum gradient norm.
        epoch: epoch label supplied by the caller (not used in the body).

    Returns:
        Mean of the per-batch losses, or 0.0 when ``iterator`` yields no
        batches. The previous version returned nothing, so the caller's
        ``f'{train_loss:.3f}'`` tried to format ``None``.
    """
    model.train()
    epoch_loss = 0
    num_batches = 0
    data = tqdm(iterator)
    for i, batch in enumerate(data):
        src = batch[0].to(device)
        trg = batch[0].to(device)
        # Collapse dim 3 to class indices, flattened to one index per position.
        target = torch.argmax(trg, dim=3).view(-1)
        print(target)
        optimizer.zero_grad()
        output = model(src, trg).view(-1, OUTPUT_DIM)
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1
    return epoch_loss / num_batches if num_batches else 0.0
# Training configuration.
N_EPOCHS = 20
CLIP = 1
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = vocabulary['<pad>'])
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loader = data_loaders['train_loader']
    valid_loader = data_loaders['test_loader']
    epoch_label = f'{epoch+1}/{N_EPOCHS}'
    train_loss = train(model, train_loader, optimizer, criterion, CLIP, epoch_label)
    #valid_loss = evaluate(model, valid_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

Overfitting when fine-tuning BERT sentiment analysis

I am newbie to Machine Learning in general. I am currently trying to follow a tutorial on sentiment analysis using BERT and Transformers https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
However when I train the model it has appeared that the model is overfitting
I do not know how to fix this. I have tried lowering amount of epochs, increasing batch size , shuffling my data (which is ordered) and increasing the validation split. So far nothing has worked. I have even tried changing different learning rate but the one I am using now is the smallest.
Below is my code:
# Name of the pretrained checkpoint; passed to from_pretrained for both the
# tokenizer and the model.
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Handed to the dataset as max_len, i.e. the max_length used when encoding each text.
MAX_LEN = 40
#Make a PyTorch dataset
class FIDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its target label."""

    def __init__(self, texts, targets, tokenizer, max_len):
        """Store the raw texts, their targets, the tokenizer and the fixed sequence length."""
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return a dict with the raw text, 1-D input ids, 1-D attention mask and the target."""
        text = str(self.texts[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Explicit padding/truncation replace the deprecated
            # pad_to_max_length flag and the implicit truncation fallback, so
            # every item has exactly max_len tokens and batches collate.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # The tokenizer returns a leading batch dimension of 1; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }
#split test and train
# 90 % of the rows go to training; the held-out 10 % is then halved into
# validation and test sets.
df_train, df_holdout = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
    """Build a DataLoader over the ``content`` and ``sentiment`` columns of ``df``.

    The rows are wrapped in an FIDataset and served in batches of
    ``batch_size`` by four worker processes, in dataset order (no shuffling).
    """
    dataset = FIDataset(
        texts=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
    return loader
BATCH_SIZE = 32
#Load data into train, test, val
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for frame in (df_train, df_val, df_test)
)
#Bert model loading
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder with dropout and a linear layer producing ``n_classes`` scores."""

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.1)
        # The head's input width follows the encoder's configured hidden size.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded["pooler_output"]
        return self.out(self.drop(pooled))
#Create a Classifier instance and move to GPU
# Three output classes; `device` is defined outside the code shown here.
model = SentimentClassifier(3)
model = model.to(device)
#Optimize with AdamW
EPOCHS = 6
# NOTE(review): correct_bias is not a torch.optim.AdamW argument, so this is
# presumably the AdamW class shipped with transformers — confirm against the
# imports.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# train_epoch steps the scheduler once per batch, so the schedule length is
# (batches per epoch) * (number of epochs).
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
#Train each Epoch function
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is a mapping with "input_ids", "attention_mask" and "targets".
    After every batch the gradients are clipped to norm 1.0 and both the
    optimizer and the scheduler are stepped.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is the number of correct
        predictions divided by ``n_examples`` (a float64 tensor); mean_loss is
        the mean of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0
    for batch in data_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        gold = batch["targets"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        predicted = torch.max(logits, dim=1)[1]
        loss = loss_fn(logits, gold)

        n_correct += torch.sum(predicted == gold)
        batch_losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)
import torch
# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
best_accuracy = 0
if __name__ == '__main__':
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        train_acc, train_loss = train_epoch(
            model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
        )
        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(
            model, val_data_loader, loss_fn, device, len(df_val)
        )
        print(f'Val loss {val_loss} accuracy {val_acc}')
        print()

        epoch_metrics = (
            ('train_acc', train_acc),
            ('train_loss', train_loss),
            ('val_acc', val_acc),
            ('val_loss', val_loss),
        )
        for metric_name, metric_value in epoch_metrics:
            history[metric_name].append(metric_value)

        # Keep only the weights of the best-scoring epoch on the validation set.
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc

Broadly speaking, to reduce overfitting, you can:
1. increase regularization
2. reduce model complexity
3. perform early stopping
4. increase training data
From what you've written, you've already tried 3 and 4. In the case of neural networks, you can increase regularization by increasing dropout. You already have the code for it.
# NOTE: You don't need bert_model here since you're creating one inside
# of SentimentClassifier.
#bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Sentiment Classifier based on Bert model just loaded
# Excerpt: only the constructor is shown here.
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Dropout layer; p is the probability the surrounding text recommends raising.
self.drop = nn.Dropout(p=0.1) # <-- INCREASE THIS VALUE
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
I'd recommend trying higher values of the Dropout probability, as I noted in your code above ("INCREASE THIS VALUE"). Keep track of the Dropout probability and the resulting observed overfitting. Try probability values of 0.1, 0.2, 0.3, 0.4, 0.5.
Usually, I've found that dropout over 0.5 doesn't do much good.

Neural network keep predicting the same number

I have a ROS application where a camera node sends an image via a service to a neural network node. The training and validation dataset I use is the MNIST database. It should be very easy to predict a number, but the neural network returns the same number for every single service request.
ai_service.py
class AiService():
    """Trains a digit classifier on MNIST and classifies images received as ROS image messages.

    NOTE: part of ``__init__`` is elided in the original listing (the ``...``
    below). ``self.model``, ``self.device`` and ``self.cv_bridge`` are used by
    the methods and are presumably created in that elided part.
    """

    # Normalisation constants applied to the MNIST training data. The same
    # constants must be applied at inference time (see validating()).
    MNIST_MEAN = (0.1307,)
    MNIST_STD = (0.3081,)

    def __init__(self, save_path):
        # NOTE: batch_size is stored, but the loaders below use a fixed batch
        # size of 200; save_path is not used by the code shown.
        self.batch_size = 2800
        self.epochs = 25
        self.learning_rate = 0.01
        mnist_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(self.MNIST_MEAN, self.MNIST_STD),
        ])
        self.training_data = torch.utils.data.DataLoader(
            datasets.MNIST(root='./data', train=True, download=True, transform=mnist_transform),
            200, shuffle=True)
        self.validation_data = torch.utils.data.DataLoader(
            datasets.MNIST(root='./data', train=False, download=True, transform=mnist_transform),
            200, shuffle=True)
        ...

    # Function to train the mnist dataset.
    def training(self):
        """Train ``self.model`` on MNIST with SGD, printing the mean loss after every epoch."""
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(self.model.parameters(), self.learning_rate)
        start_time = time()
        # validating() switches the model to eval mode; make sure training
        # always runs in train mode.
        self.model.train()
        for epoch in range(self.epochs):
            running_loss = 0
            # training phase
            for images, labels in self.training_data:
                optimizer.zero_grad()
                image, label = images.to(self.device), labels.to(self.device)
                output = self.model(image)
                loss = criterion(output, label)
                loss.backward()
                optimizer.step()  # optimizing weights
                running_loss += loss.item()
            # Plain statement instead of `for ... else`: the loop never
            # breaks, so the else branch always ran anyway.
            print("Epoch {} - Training loss: {:.10f}".format(epoch, running_loss / len(self.training_data)))
        print("\nTraining Time (in minutes): {:.2f} =".format((time() - start_time) / 60))

    def validating(self, request_image):
        """Classify a single image message and return the predicted class index.

        The image is normalised with the same constants as the training data,
        given a batch dimension and moved to the model's device. Skipping the
        normalisation gives the network inputs from a different distribution
        than it was trained on.

        NOTE(review): the network flattens its input to 28*28 features, so the
        incoming image must already be 28x28. MNIST digits are light on a dark
        background; a camera image of dark ink on paper may also need to be
        inverted — verify with a sample frame.
        """
        self.model.eval()
        tensor_image = self.image_to_tensor(request_image)
        normalize = transforms.Normalize(self.MNIST_MEAN, self.MNIST_STD)
        batch = normalize(tensor_image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            output = self.model(batch)
        return output.cpu().numpy().argmax()

    def image_to_tensor(self, request_image):
        """Convert an image message to a single-channel tensor via cv_bridge and ToTensor."""
        return transforms.ToTensor()(self.cv_bridge.imgmsg_to_cv2(request_image, 'mono8'))
neural_network.py
class NeuralNetwork(nn.Module):
    """Fully connected classifier (784 -> 512 -> 254 -> 128 -> 10) that returns log-probabilities."""

    # Initializes the Neural Network by setting up the layers.
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.input_layer = nn.Sequential(nn.Linear(28*28, 512))
        self.hidden_layer1 = nn.Linear(512, 254)
        self.hidden_layer2 = nn.Linear(254, 128)
        self.output_layer = nn.Linear(128, 10)

    def forward(self, x):
        """Flatten ``x``, apply the three ReLU layers, and return log-softmax scores over dim 1."""
        activations = self.flatten(x)
        for layer in (self.input_layer, self.hidden_layer1, self.hidden_layer2):
            activations = F.relu(layer(activations))
        scores = self.output_layer(activations)
        return F.log_softmax(scores, 1)
I get a training accuracy of:
My output:
My camera image:
Could it be because of the resizing and grayscaling that the picture is not recognized? I just added imshow to the def image_to_tensor(self, request_image): function and the image is barely recognisable.

Test Loss looks weird when plotted

I am using an LSTM to perform binary classification. When I plot the test loss, it does not decrease over time; instead it fluctuates a lot and looks extremely weird. The training loss, on the other hand, looks normal and decreases over time.
Here's a Picture of how it looks
This is my code of the model definition and configuration.
# Create LSTM Model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear readout and a sigmoid.

    The output lies in (0, 1), matching what ``nn.BCELoss`` expects.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim
        # LSTM
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=0.2)
        # Readout layer
        self.f1 = nn.Linear(hidden_dim, output_dim)
        # Despite its name this attribute is a sigmoid; the name is kept so
        # existing references to ``model.softmax`` keep working.
        self.softmax = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the readout of the last layer's final hidden state.

        Args:
            x: float tensor, batch first.
        """
        # Zero initial states created on the input's device with its dtype.
        # The previous version hard-coded .cuda() (failing on CPU-only
        # machines) and wrapped the tensors in the deprecated Variable.
        h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)
        c0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)
        out, (hn, cn) = self.lstm(x, (h0, c0))
        # hn[-1] is the final hidden state of the top LSTM layer.
        out = self.f1(hn[-1])
        out = self.softmax(out)
        return out
#LSTM Configuration
batch_size = 10000
num_epochs = 200
learning_rate = 0.001#Try lowering the rate
# Create LSTM
input_dim = 1 # input dimension
hidden_dim = 50 # hidden layer dimension
layer_dim =2 # number of hidden layers
# A single output unit: the model ends in a Sigmoid and is trained with BCELoss below.
output_dim = 1 # output dimension
model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
# Move the model's parameters to the GPU.
model.cuda()
error = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
This is my code for training and testing
from tensorboardcolab import TensorBoardColab
# Step counters used as the x-axis of the two TensorBoard curves.
globaliter = 0
globaliter2 = 0
tb = TensorBoardColab()
for epoch in tqdm(range(num_epochs)):
    # Train
    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        # Plain tensors: Variable is deprecated and no longer needed for autograd.
        batch_inputs = inputs.float().cuda()
        targets = targets.float().cuda()
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = error(outputs, targets)
        loss_list_train.append(loss.item())
        loss.backward()
        optimizer.step()
        tb.save_value('Train Loss', 'train_loss', globaliter, loss.item())
        globaliter += 1
        tb.flush_line('train_loss')
    # Test
    model.eval()
    # Evaluation needs no gradients; disabling them avoids building a graph
    # for every test batch.
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.float().cuda()
            targets = targets.float().cuda()
            outputs = model(inputs)
            loss_test = error(outputs, targets)
            loss_list_test.append(loss_test.item())
            # NOTE(review): one point is logged per test BATCH, so the curve
            # shows batch-to-batch variation; averaging over the whole test
            # set once per epoch would give a smoother, comparable curve.
            tb.save_value('Test Loss', 'test_loss', globaliter2, loss_test.item())
            globaliter2 += 1
            tb.flush_line('test_loss')
I'd really be grateful if someone helped me figure this out, or offered suggestions or advice

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

pytorch multi-class lstm predicting all one class on testing - python

Related

How to implement Laplace Posteriori Approximation on BERT in PyTorch?

Hierarchical LSTM autoencoder - model not training

Overfitting when fine-tuning BERT sentiment analysis

Neural network keep predicting the same number

Test Loss looks weird when plotted

Categories

Resources