Here is my code.I dont know why my train and validation accuracy increase too slow.Is that normal? I’m new at deep learning.This is my homework.Train and validation values dont change nearly till loop 500.Is that normal? I changed learning rate and add weight_decay etc. but i didnt see difference
# -*- coding: utf-8 -*-
import torch
import torch.nn.functional as F
from torch import autograd, nn
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, datasets
from torch.utils import data
Olivetti face dataset
from sklearn.datasets import fetch_olivetti_faces
# Olivetti dataset download
olivetti = fetch_olivetti_faces()
train = olivetti.images
label =
X = train
Y = label
print("\nDownload Ok")
Set for train
train_rate = 0.8
X_train = np.zeros([int(train_rate * X.shape[0]),64,64], dtype=float)
Y_train = np.zeros([int(train_rate * X.shape[0])], dtype=int)
X_val = np.zeros([int((1-train_rate) * X.shape[0]+1),64,64], dtype=float)
Y_val = np.zeros([int((1-train_rate) * X.shape[0]+1)], dtype=int)
#Split data for train and validation
for i in range(X.shape[0]):
if (i%10)/9 <= train_rate:
X_train[ie] = X[i]
Y_train[ie] = Y[i]
ie += 1
X_val[iv] = X[i]
Y_val[iv] = Y[i]
iv += 1
X_train = X_train.reshape(320,-1,64,64)
X_val = X_val.reshape(80,-1,64,64)
X_train = torch.Tensor(X_train)
Y_train = torch.Tensor(Y_train)
X_val = torch.Tensor(X_val)
Y_val = torch.Tensor(Y_val)
batch_size = 16
train_loader =,
val_loader =,
class CNNModule(nn.Module):
def __init__(self):
super(CNNModule, self).__init__()
self.conv1 = nn.Conv2d(1, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 13 * 13, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 40)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 16 * 13 * 13)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def make_train(model,dataset,n_iters,gpu):
# Organize data
X_train,Y_train,X_val,Y_val = dataset
kriter = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.03)
#Arrays to save loss and accuracy
tl=np.zeros(n_iters) #For train loss
ta=np.zeros(n_iters) #For train accuracy
vl=np.zeros(n_iters) #For validation loss
va=np.zeros(n_iters) #For validation accuracy
# Convert labels to long
Y_train = Y_train.long()
Y_val = Y_val.long()
# GPU control
if gpu:
X_train,Y_train = X_train.cuda(),Y_train.cuda()
X_val,Y_val = X_val.cuda(),Y_val.cuda()
model = model.cuda() # Parameters to GPU!
print("Using GPU")
print("Using CPU")
# print(X_train.shape)
# print(Y_train.shape)
for i in range(n_iters):
# train forward
train_out = model.forward(X_train)
train_loss = kriter(train_out,Y_train)
# Backward and optimization
# Compute train accuracy
train_predict = train_out.cpu().detach().argmax(dim=1)
train_accuracy = (train_predict.cpu().numpy()==Y_train.cpu().numpy()).mean()
# For validation
val_out = model.forward(X_val)
val_loss = kriter(val_out,Y_val)
# Compute validation accuracy
val_predict = val_out.cpu().detach().argmax(dim=1)
val_accuracy = (val_predict.cpu().numpy()==Y_val.cpu().numpy()).mean()
tl[i] = train_loss.cpu().detach().numpy()
ta[i] = train_accuracy
vl[i] = val_loss.cpu().detach().numpy()
va[i] = val_accuracy
# Show result each 5 loop
if i%5==0:
print("Loop --> ",i)
print("Train Loss :",train_loss.cpu().detach().numpy())
print("Train Accuracy :",train_accuracy)
print("Validation Loss :",val_loss.cpu().detach().numpy())
print("Validation Accuracy :",val_accuracy)
model = model.cpu()
#Print result
plt.plot(np.arange(n_iters), tl, 'r-')
plt.plot(np.arange(n_iters), ta, 'b--')
plt.plot(np.arange(n_iters), vl, 'r-')
plt.plot(np.arange(n_iters), va, 'b--')
dataset = X_train,Y_train,X_val,Y_val
gpu = True
gpu = gpu and torch.cuda.is_available()
model = CNNModule()
Loop --> 0
Train Loss : 3.6910985
Train Accuracy : 0.025
Validation Loss : 3.6908844
Validation Accuracy : 0.025
Loop --> 5
Loop --> 215
Train Loss : 3.6849258
Train Accuracy : 0.025
Validation Loss : 3.6850574
Validation Accuracy : 0.025
Loop --> 500
Train Loss : 3.4057992
Train Accuracy : 0.103125
Validation Loss : 3.5042462
Validation Accuracy : 0.0875
Loop --> 995
Train Loss : 0.007807272
Train Accuracy : 1.0
Validation Loss : 0.64222467
Validation Accuracy : 0.8375
I don't know if this is the only problem - but please note that you zero the gradient, then do forward pass over the validation data. which means that new gradients of the validation data are stored in the model before the next iteration. The common practice should be to create some evaluation method, and use it to make prediction over the validation set without saving the gradients. something like:
def eval_model(data, X_val, Y_val):
model.eval(); # this sets the model to be in inferrence mode (for example if you have batchNorm or droput layers)
with torch.no_grad(): # tells the model to not compute gradients.
val_out = model.forward(X_val)
val_loss = criterion(val_out,Y_val)
# here put some prints or whatever you want to do
model.train() # this returns the model to be in training mode
Being new to deep learning, I plan to open this post with a reproducible code example using Mnist, to understand fully on how to improve the training speed.
I'm using Ubuntu 20.04 LTS and have a RTX 3080, when I don't use the batch training and just train the whole 60,000 like below, it takes about 6-7 seconds to finish the training and GPU usage at 99-100%.
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision.transforms import ToTensor
from import DataLoader, TensorDataset
import numpy as np
import random
from matplotlib import pyplot as plt
from import tqdm
import timeit
# Set Device function (to GPU)
def set_device():
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
print("GPU is not enabled")
print("GPU is enabled")
return device
DEVICE = set_device()
# set seed function
def set_seed(seed=None, seed_torch=True):
if seed is None:
seed = np.random.choice(2 ** 32)
if seed_torch:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
print(f'Random seed {seed} has been set.')
SEED = 2021
# for DataLoader
def seed_worker(worker_id):
worker_seed = torch.initial_seed() % 2**32
# Download Mnist datasets
train_data = datasets.MNIST(
test_data = datasets.MNIST(
X =, -1).float()
y = train_data.train_labels
X_test =, -1).float()
y_test = test_data.train_labels
# Simple Neural Net
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# define layers
self.layers = nn.Sequential(
nn.Linear(784, 600),
nn.Linear(600, 300),
nn.Linear(300, 100),
nn.Linear(100, 10)
def forward(self, x):
return self.layers(x)
def predict(self, x):
return torch.argmax(self.forward(x), 1)
# simple train
X =
y =
X_test =
y_test =
SEED = 2021
model = Net().to(DEVICE)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9)
loss_list = []
logits = model.forward(X)
loss = loss_function(logits, y)
start1 = timeit.default_timer()
for epoch in range(500):
logits = model.forward(X)
loss = loss_function(logits, y)
if epoch % 20 == 0:
print(f"epoch {epoch + 1}: loss: {loss:.5f},"
f"train_accuracy: {torch.sum(model.predict(X) == y) / 60000:.3f},"
f"test_accuracy:{torch.sum(model.predict(X_test) == y_test) / 10000:.3f}")
end1 = timeit.default_timer()
print(f"Time: {end1 - start1:.2f} seconds")
But when I use batch training like below, the speed drops significantly, and when num_workers=0, it takes 176 seconds to finish the training, and when num_workers=4, it takes 216 seconds to finish the training. And in both scenarios, the GPU usage hover around 20-30% and sometimes even lower. So my question is: is it normal to expect this time increase when using batch training, and if so, why should we use batch training? Is it to improve the test accuracy?
Secondly, why does increasing the num_workers take longer to train? Is there anything fundamentally wrong in the code? And is it normal to have GPU usage low when doing the batch training?
X =, -1).float()
y = train_data.train_labels
X_test =, -1).float()
y_test = test_data.train_labels
# Dataloader
g_seed = torch.Generator()
batch_size = 300
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size,
shuffle=False, num_workers=8,
train_data = TensorDataset(X, y)
train_loader = DataLoader(train_data, batch_size=batch_size, drop_last=True,
shuffle=True, num_workers=8,
def train_test_classification(net, criterion, optimizer, train_loader,
test_loader, num_epochs=1, verbose=True,
training_plot=True, device='cuda'):
training_losses = []
for epoch in tqdm(range(num_epochs)): # loop over the dataset multiple times
running_loss = 0.0
for (i, data) in enumerate(train_loader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
inputs =
labels =
# zero the parameter gradients
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
# print statistics
if verbose:
training_losses += [loss.item()]
def test(data_loader):
correct = 0
total = 0
for data in data_loader:
inputs, labels = data
inputs =
labels =
outputs = net(inputs)
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
acc = 100 * correct / total
return total, acc
train_total, train_acc = test(train_loader)
test_total, test_acc = test(test_loader)
if verbose:
print(f"Accuracy on the {train_total} training samples: {train_acc:0.2f}")
print(f"Accuracy on the {test_total} testing samples: {test_acc:0.2f}")
if training_plot:
plt.ylabel('Training loss')
return train_acc, test_acc
net = Net().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.005, momentum=0.9)
num_epochs = 500
start = timeit.default_timer()
_, _ = train_test_classification(net, criterion, optimizer, train_loader,
test_loader, num_epochs=num_epochs,
training_plot=True, device=DEVICE)
end = timeit.default_timer()
print(f"Time: {end-start:.2f}")
Low GPU usage can sometimes be due to slow data transfer. Having a large number of workers does not always help though.
Consider using pin_memory=True in the DataLoader definition. This should speed up the data transfer between CPU and GPU. Here is a thread on the Pytorch forum if you want more details.
Another solution may be to add the argument non_blocking=True inside the to() method.
I have set up custom training and testing functions in my project so I can minutely customise the training process. I use k-fold cross-validation to evaluate my model. For whatever reason, the model trains correctly for the first fold, and then on the second in throws this error.
tensorflow.python.framework.errors_impl.FailedPreconditionError: Could not find variable _AnonymousVar13. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status=Not found: Resource localhost/_AnonymousVar13/N10tensorflow3VarE does not exist. [[node test_model/dense_2/Tensordot/ReadVariableOp (defined at ]] [Op:__inference__train_step_1082]
I have no idea what's happening. I assumed the error arose because of poor initialisation, so I with the input shape. I have tried initialising the graph's weights with blank tensor, too, but that didn't work. I have also reset the backend on the last line in case there was a conflict with names, but that doesn't do the trick.
import numpy as np
import sklearn.model_selection
import tensorflow as tf
from tensorflow.python.keras.metrics import Mean, Precision, Recall
from tensorflow.python.keras.optimizer_v2.adam import Adam
n_splits = 5
batch_size = 16
n_epochs = 2
loss_function = tf.keras.losses.BinaryCrossentropy()
optimiser_fn = Adam
metrics = [
learning_rate = 1e-2
dense_outputs = [10,10]
activation = 'relu'
class TestModel(tf.keras.Model):
def __init__(self):
self._dense_ops = [tf.keras.layers.Dense(o) for o in dense_outputs]
self._output = tf.keras.layers.Dense(1)
def call(self, inputs):
hidden = inputs
for l in self._dense_ops:
hidden = l(hidden)
return self._output(hidden)
def _load_fold_sets_for_training(fold, fold_idcs, features, labels, batch_size):
# Get the indices for the sets.
train_idcs, validation_idcs, _ = fold_idcs[fold]
# Get the training data and labels.
training_data = features[train_idcs]
training_labels = labels[train_idcs]
# Load the training, validation and testing sets.
training_set =
(training_data, training_labels)
training_set = training_set.batch(batch_size, drop_remainder=False)
validation_set =
(features[validation_idcs], labels[validation_idcs])
validation_set = validation_set.batch(batch_size, drop_remainder=False)
return training_set, validation_set
def _train_step(batch_samples, batch_labels):
batch_predictions = model(batch_samples, training=True)
loss = loss_function(batch_predictions, batch_labels)
gradients = tf.gradients(loss, model.trainable_variables)
zip(gradients, model.trainable_variables)
batch_predictions = tf.sigmoid(batch_predictions)
[m.update_state(batch_labels, batch_predictions) for m in metrics[1:]]
def _inference_step(batch_samples, batch_labels):
batch_predictions = model(batch_samples, training=False)
loss = loss_function(batch_predictions, batch_labels)
batch_predictions = tf.sigmoid(batch_predictions)
[m.update_state(batch_labels, batch_predictions) for m in metrics[1:]]
# Generate dataset.
features = np.random.rand(15,1440,1)
labels = np.random.rand(15,1440)
# Set up splits.
kfold = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True)
splits = []
for train_idcs, test_idcs in kfold.split(features):
train_idcs, val_idcs = sklearn.model_selection.train_test_split(train_idcs)
splits += [[train_idcs, val_idcs, test_idcs]]
fold = 0
while fold < n_splits:
# Load datasets for fold.
training_set, validation_set = _load_fold_sets_for_training(fold, splits, features, labels, batch_size)
# Load model.
model = TestModel()
# Build model., 1))
# Initialise Adam optimiser.
optimiser = optimiser_fn(learning_rate)
epoch = 0
while epoch < n_epochs:
epoch += 1
# Training.
for batch_features, batch_labels in training_set: _train_step(batch_features, batch_labels)
print(f'fold {fold}: epoch {epoch}:', ' '.join(f'train_{}: {m.result():0.05f}' for m in metrics))
# Validation.
for batch_features, batch_labels in validation_set: _inference_step(batch_features, batch_labels)
print(f'fold {fold}: epoch {epoch}:', ' '.join(f'val_{}: {m.result():0.05f}' for m in metrics))
fold += 1
Any ideas?
The issue was the placement of the _train_step and _inference_step. If the two functions are redefined on every iteration of the fold, the error disappears and the model trains. I don't know why they must be redefined every step.
import numpy as np
import sklearn.model_selection
import tensorflow as tf
from tensorflow.python.keras.metrics import Mean, Precision, Recall
from tensorflow.python.keras.optimizer_v2.adam import Adam
n_splits = 5
batch_size = 2
n_epochs = 2
loss_function = tf.keras.losses.BinaryCrossentropy()
optimiser_fn = Adam
metrics = [
learning_rate = 1e-2
dense_outputs = [10, 10]
activation = 'relu'
class TestModel(tf.keras.Model):
def __init__(self):
self._dense_ops = [tf.keras.layers.Dense(o) for o in dense_outputs]
self._output = tf.keras.layers.Dense(1)
def call(self, inputs):
hidden = inputs
for l in self._dense_ops:
hidden = l(hidden)
return self._output(hidden)
def _load_fold_sets_for_training(fold, fold_idcs, features, labels, batch_size):
# Get the indices for the sets.
train_idcs, validation_idcs, _ = fold_idcs[fold]
# Get the training data and labels.
training_data = features[train_idcs]
training_labels = labels[train_idcs]
# Load the training, validation and testing sets.
training_set =
(training_data, training_labels)
training_set = training_set.batch(batch_size, drop_remainder=False)
validation_set =
(features[validation_idcs], labels[validation_idcs])
validation_set = validation_set.batch(batch_size, drop_remainder=False)
return training_set, validation_set
# Generate dataset.
features = np.random.rand(15, 1440, 1)
labels = np.random.rand(15, 1440)
# Set up splits.
kfold = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True)
splits = []
for train_idcs, test_idcs in kfold.split(features):
train_idcs, val_idcs = sklearn.model_selection.train_test_split(train_idcs)
splits += [[train_idcs, val_idcs, test_idcs]]
fold = 0
while fold < n_splits:
def _train_step(batch_samples, batch_labels):
batch_predictions = model(batch_samples, training=True)
loss = loss_function(batch_predictions, batch_labels)
gradients = tf.gradients(loss, model.trainable_variables)
zip(gradients, model.trainable_variables)
batch_predictions = tf.sigmoid(batch_predictions)
[m.update_state(batch_labels, batch_predictions) for m in metrics[1:]]
def _inference_step(batch_samples, batch_labels):
batch_predictions = model(batch_samples, training=False)
loss = loss_function(batch_predictions, batch_labels)
batch_predictions = tf.sigmoid(batch_predictions)
[m.update_state(batch_labels, batch_predictions) for m in metrics[1:]]
# Load datasets for fold.
training_set, validation_set = _load_fold_sets_for_training(fold, splits, features, labels,
# Load model.
model = TestModel()
# Build model., 1))
# Initialise Adam optimiser.
optimiser = optimiser_fn(learning_rate)
epoch = 0
while epoch < n_epochs:
epoch += 1
# Training.
for batch_features, batch_labels in training_set: _train_step(batch_features,
print(f'fold {fold}: epoch {epoch}:', ' '.join(f'train_{}: {m.result():0.05f}' for
m in metrics))
# Validation.
for batch_features, batch_labels in validation_set: _inference_step(batch_features,
print(f'fold {fold}: epoch {epoch}:', ' '.join(f'val_{}: {m.result():0.05f}' for m
in metrics))
fold += 1
I have the below code for a binary classification and it works fine but i would like to modify the nn.Sequential parameters and add an BiLSTM layer. I have the below code:
class BertClassifier(nn.Module):
def __init__(self, freeze_bert=False):
super(BertClassifier, self).__init__()
# Specify hidden size of BERT, hidden size of our classifier, and number of labels
D_in, H, D_out = 768, 50, 2
# Instantiate BERT model
self.bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
# Instantiate an one-layer feed-forward classifier
self.classifier = nn.Sequential(nn.Linear(D_in, H),nn.ReLU(),nn.Linear(H, D_out))
# Freeze the BERT model
if freeze_bert:
for param in self.bert.parameters():
param.requires_grad = False
def forward(self, input_ids, attention_mask):
# Feed input to BERT
outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
# Extract the last hidden state of the token `[CLS]` for classification task
last_hidden_state_cls = outputs[0][:, 0, :]
# Feed input to classifier to compute logits
logits = self.classifier(last_hidden_state_cls)
return logits
I have tried to modify the sequential like this self.classifier = nn.Sequential(nn.LSTM(D_in, H, batch_first=True, bidirectional=True),nn.ReLU(),nn.Linear(H, D_out)) but then it throws the error RuntimeError: input must have 3 dimensions, got 2 on line logits = self.classifier(last_hidden_state_cls). I found that I can use nn.ModuleDict instead of nn.Sequential and i made the below :
self.classifier = nn.ModuleDict({
'lstm': nn.LSTM(input_size=D_in, hidden_size=H,batch_first=True, bidirectional=True ),
'linear': nn.Linear(in_features=H,out_features=D_out)})
But now I'm having issues computing the forward function with this. Can someone advice how i can properly modify the forward function?
Update: I also installed CUDA and now when I run the code it returns the error CUDA out of memory. Tried to allocate 16.00 MiB and I tried to lower the batch size but that doesn't fix the problem. I also tried the below but didn't resolved either. Any advice, please?
import torch, gc
Update with the code:
MAX_LEN = 64
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32
file1 = open('MH.txt', 'r')
list_com = []
list_label = []
for line in file1:
possible_labels = 'positive|negative'
label = re.findall(possible_labels, line)
line = re.sub(possible_labels, ' ', line)
line = re.sub('\n', ' ', line)
list_tuples = list(zip(list_com, list_label))
labels = ['positive', 'negative']
df = pd.DataFrame(list_tuples, columns=['text', 'label'])
df['label'] = df['label'].map({'positive': 1, 'negative': 0})
for i in range(0,len(df['label'])):
list_label[i] = df['label'][i]
X = df.text.values
y = df.label.values
X_train, X_val, y_train, y_val =\
train_test_split(X, y, test_size=0.1, random_state=2020)
def text_preprocessing(text):
# Remove '#name'
text = re.sub(r'(#.*?)[\s]', ' ', text)
# Replace '&' with '&'
text = re.sub(r'&', '&', text)
# Remove trailing whitespace
text = re.sub(r'\s+', ' ', text).strip()
return text
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
input_ids = []
attention_masks = []
for sent in data:
encoded_sent = tokenizer.encode_plus(
text=text_preprocessing(sent), # Preprocess sentence
add_special_tokens=True, # Add `[CLS]` and `[SEP]`
max_length=MAX_LEN, # Max length to truncate/pad
pad_to_max_length=True, # Pad sentence to max length
# return_tensors='pt', # Return PyTorch tensor
return_attention_mask=True # Return attention mask
# Add the outputs to the lists
# Convert lists to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
return input_ids, attention_masks
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)
# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
# Create the BertClassfier class
class BertClassifier(nn.Module):
"""Bert Model for Classification Tasks."""
def __init__(self, freeze_bert=False):
#param bert: a BertModel object
#param classifier: a torch.nn.Module classifier
#param freeze_bert (bool): Set `False` to fine-tune the BERT model
super(BertClassifier, self).__init__()
# Specify hidden size of BERT, hidden size of our classifier, and number of labels
D_in, H, D_out = 768, 50, 2
# Instantiate BERT model
self.bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
# Instantiate an one-layer feed-forward classifier
self.classifier = nn.ModuleDict({
'lstm': nn.LSTM(input_size=D_in, hidden_size=H, batch_first=True, bidirectional=True),
'linear': nn.Linear(in_features=H, out_features=D_out)})
# Freeze the BERT model
if freeze_bert:
for param in self.bert.parameters():
param.requires_grad = False
def forward(self, input_ids, attention_mask):
outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
sequence_output = outputs[0]
sequence_output, _ = self.lstm(sequence_output)
linear_output = self.linear(sequence_output[:, -1])
return linear_output
def initialize_model(epochs=4):
# Instantiate Bert Classifier
bert_classifier = BertClassifier(freeze_bert=False)
# Tell PyTorch to run the model on GPU
# Create the optimizer
optimizer = AdamW(bert_classifier.parameters(), lr=5e-5)
# Total number of training steps
total_steps = len(train_dataloader) * epochs
# Set up the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
return bert_classifier, optimizer, scheduler
# Specify loss function
loss_fn = nn.CrossEntropyLoss()
def set_seed(seed_value=42):
"""Set seed for reproducibility."""
def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
"""Train the BertClassifier model."""
# Start training loop
print("Start training...\n")
for epoch_i in range(epochs):
# Print the header of the result table
print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
print("-" * 70)
# Measure the elapsed time of each epoch
t0_epoch, t0_batch = time.time(), time.time()
# Reset tracking variables at the beginning of each epoch
total_loss, batch_loss, batch_counts = 0, 0, 0
# Put the model into the training mode
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
batch_counts += 1
# Load batch to GPU
b_input_ids, b_attn_mask, b_labels = tuple( for t in batch)
# Zero out any previously calculated gradients
# Perform a forward pass. This will return logits.
logits = model(b_input_ids, b_attn_mask)
# Compute loss and accumulate the loss values
loss = loss_fn(logits, b_labels)
batch_loss += loss.item()
total_loss += loss.item()
# Perform a backward pass to calculate gradients
# Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and the learning rate
# Print the loss values and time elapsed for every 20 batches
if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
# Calculate time elapsed for 20 batches
time_elapsed = time.time() - t0_batch
# Print training results
f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
# Reset batch tracking variables
batch_loss, batch_counts = 0, 0
t0_batch = time.time()
# Calculate the average loss over the entire training data
avg_train_loss = total_loss / len(train_dataloader)
print("-" * 70)
if evaluation == True:
# After the completion of each training epoch, measure the model's performance
# on our validation set.
val_loss, val_accuracy = evaluate(model, val_dataloader)
# Print performance over the entire training data
time_elapsed = time.time() - t0_epoch
f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
print("-" * 70)
print("Training complete!")
def evaluate(model, val_dataloader):
"""After the completion of each training epoch, measure the model's performance
on our validation set.
# Put the model into the evaluation mode. The dropout layers are disabled during
# the test time.
# Tracking variables
val_accuracy = []
val_loss = []
# For each batch in our validation set...
for batch in val_dataloader:
# Load batch to GPU
b_input_ids, b_attn_mask, b_labels = tuple( for t in batch)
# Compute logits
with torch.no_grad():
logits = model(b_input_ids, b_attn_mask)
# Compute loss
loss = loss_fn(logits, b_labels)
# Get the predictions
preds = torch.argmax(logits, dim=1).flatten()
# Calculate the accuracy rate
accuracy = (preds == b_labels).cpu().numpy().mean() * 100
# Compute the average accuracy and loss over the validation set.
val_loss = np.mean(val_loss)
val_accuracy = np.mean(val_accuracy)
return val_loss, val_accuracy
def accuracy(probs, y_true):
- Print AUC and accuracy on the test set
#params probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
#params y_true (np.array): an array of the true values with shape (len(y_true),)
fpr, tpr, threshold = roc_curve(y_true, preds)
roc_auc = auc(fpr, tpr)
print(f'AUC: {roc_auc:.4f}')
preds = probs[:, 1]
# Get accuracy over the test set
y_pred = np.where(preds >= 0.5, 1, 0)
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
def bert_predict(model, test_dataloader):
"""Perform a forward pass on the trained BERT model to predict probabilities on the test set."""
# Put the model into the evaluation mode. The dropout layers are disabled during the test time.
all_logits = []
# For each batch in our test set...
for batch in test_dataloader:
# Load batch to GPU
b_input_ids, b_attn_mask = tuple( for t in batch)[:2]
# Compute logits
with torch.no_grad():
logits = model(b_input_ids, b_attn_mask)
# Concatenate logits from each batch
all_logits =, dim=0)
# Apply softmax to calculate probabilities
probs = F.softmax(all_logits, dim=1).cpu().numpy()
return probs
set_seed(42) # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=3)
# start training
train(bert_classifier, train_dataloader, val_dataloader, epochs=3, evaluation=True)
# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, val_dataloader)
# Evaluate the Bert classifier
accuracy(probs, y_val)
I succeeded to build a linear regression neural network with 1 inputs and 1 outputs.
I am building a linear regression neural network with 5 inputs and 1 outputs now.
Here is the formula:
y = 3e + d^2 + 9c + 11b^6 + a + 19
However, no matter how many neurons, epochs and hidden layers I use, I cannot predict the a good result.
The predicted outputs are always within a small range. However, there are large variance among the expected outputs.
Predicted output vs Expected output
I guess it may be because of the choice of activation function, loss function and optimizer.
If not, multiple input neural network may need alternative method to build.
Here is my code:
import torch
import torch.nn as nn #neural network model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
from pickle import dump
#Load datasets
dataset = pd.read_csv('testB_200.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1:].values
X_scaler = MinMaxScaler()
Y_scaler = MinMaxScaler()
X = X_scaler.transform(X)
Y = Y_scaler.transform(Y)
#save the scaler
dump(X_scaler, open('X_scaler.pkl', 'wb'))
dump(Y_scaler, open('Y_scaler.pkl', 'wb'))
train = int((len(dataset)+1)*0.8)
test = train + 1
x_temp_train = X[:train]
y_temp_train = Y[:train]
x_temp_test = X[test:]
y_temp_test = Y[test:]
X_train = torch.FloatTensor(x_temp_train)
Y_train = torch.FloatTensor(y_temp_train)
X_test = torch.FloatTensor(x_temp_test)
Y_test = torch.FloatTensor(y_temp_test)
D_in = 5 # D_in is input features
H = 12 # H is hidden dimension
H2 =8
H3 =4
D_out = 1 # D_out is output features.
#Define a Artifical Neural Network model
class Net(nn.Module):
#------------------3 hidden Layers------------------------------
def __init__(self, D_in, H, H2, H3, D_out):
super(Net, self).__init__()
self.linear1 = nn.Linear(D_in, H)
self.linear2 = nn.Linear(H, H2)
self.linear3 = nn.Linear(H2, H3)
self.linear4 = nn.Linear(H3, D_out)
def forward(self, x):
#activation function should be used here e.g: hidden = F.relu(...)
h_relu = self.linear1(x).clamp(min=0) #min=0 is like ReLU
middle = self.linear2(h_relu).clamp(min=0)
middle2 = self.linear3(middle).clamp(min=0)
prediction = self.linear4(middle2)
return prediction
model = Net(D_in, H, H2, H3, D_out)
#Define a Loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.2) #2e-7, lr=learning rate=0.2
#Training model
inputs = Variable(X_train)
outputs = Variable(Y_train)
inputs_val = Variable(X_test)
outputs_val = Variable(Y_test)
loss_values = []
val_values = []
epoch = []
for i in range(epoch_value):
for phase in ['train', 'val']:
if phase == 'train':
#print('train loss')
model.train() # Set model to training mode
prediction = model(inputs)
loss = criterion(prediction, outputs)
optimizer.zero_grad() #zero the parameter gradients
loss.backward() #compute gradients(dloss/dx)
optimizer.step() #updates the parameters
elif phase == 'val':
#print('validation loss')
model.eval() # Set model to evaluate mode
prediction_val = model(inputs_val)
loss_val = criterion(prediction_val, outputs_val)
optimizer.zero_grad() #zero the parameter gradients, 'formula2.pth') #save model
#Plot train_loss vs validation loss
plt.plot(epoch, val_values)
plt.title('model loss')
plt.legend(['train','validation'], loc='upper left')
#plot prediciton vs expected value
prediction_val = prediction_val.detach().numpy()
prediction_val = Y_scaler.inverse_transform(prediction_val)
Y_test = Y_scaler.inverse_transform(Y_test)
plt.legend(['expected','predict'], loc='upper left')
Model Loss vs Validation Loss
Validation vs Expected outputs
Thanks for your time.
Trying to get similar results on same dataset with Keras and PyTorch.
from numpy import array
from numpy import hstack
from sklearn.model_selection import train_test_split
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
X, y = list(), list()
for i in range(len(sequences)):
# find the end of this pattern
end_ix = i + n_steps
# check if we are beyond the dataset
if end_ix > len(sequences):
# gather input and output parts of the pattern
seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
return array(X), array(y)
def get_data():
# define input sequence
in_seq1 = array([x for x in range(0,500,10)])/1
in_seq2 = array([x for x in range(5,505,10)])/1
out_seq = array([in_seq1[i]+in_seq2[i] for i in range(len(in_seq1))])
# convert to [rows, columns] structure
in_seq1 = in_seq1.reshape((len(in_seq1), 1))
in_seq2 = in_seq2.reshape((len(in_seq2), 1))
out_seq = out_seq.reshape((len(out_seq), 1))
# horizontally stack columns
dataset = hstack((in_seq1, in_seq2, out_seq))
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
# convert into input/output
X, y = split_sequences(dataset, n_timesteps)
print(X.shape, y.shape)
X_train,x_test,Y_train, y_test = train_test_split(X,y,test_size = 0.2,shuffle=False)
return X_train,x_test,Y_train, y_test
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc # !!!!change this!!!!
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
# define model
model = Sequential()
model.add(LSTM(1024, activation='relu',
input_shape=(n_timesteps, n_features),
model.add(Dense(512, activation='relu'))
opt = keras.optimizers.Adam(lr=0.001,
model.compile(optimizer=opt, loss='mse')
# fit model, Y_train, epochs=200, verbose=1,validation_data=(x_test,y_test))
yhat = model.predict(x_test, verbose=0)
mean_squared_error(y_test, yhat)
PyTorch - module class
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc # !!!! change this !!!!
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
class MV_LSTM(torch.nn.Module):
def __init__(self,n_features,seq_length):
super(MV_LSTM, self).__init__()
self.n_features = n_features # number of parallel inputs
self.seq_len = seq_length # number of timesteps
self.n_hidden = 1024 # number of hidden states
self.n_layers = 1 # number of LSTM layers (stacked)
self.l_lstm = torch.nn.LSTM(input_size = n_features,
hidden_size = self.n_hidden,
num_layers = self.n_layers,
batch_first = True)
# according to pytorch docs LSTM output is
# (batch_size,seq_len, num_directions * hidden_size)
# when considering batch_first = True
self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 512)
# self.l_linear1 = torch.nn.Linear(512, 512)
self.l_linear2 = torch.nn.Linear(512, 1)
def init_hidden(self, batch_size):
# even with batch_first = True this remains same as docs
hidden_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
cell_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
self.hidden = (hidden_state, cell_state)
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
# lstm_out(with batch_first = True) is
# (batch_size,seq_len,num_directions * hidden_size)
# for following linear layer we want to keep batch_size dimension and merge rest
# .contiguous() -> solves tensor compatibility error
x = lstm_out.contiguous().view(batch_size,-1)
x = F.relu(x)
x = F.relu(self.l_linear(x))
# x = F.relu(self.l_linear1(x))
x = self.l_linear2(x)
return x
PyTorch - init and train
# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss()
import keras # for epsilon constant
optimizer = torch.optim.Adam(mv_net.parameters(),
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_episodes = 200
batch_size = 32
eval_batch_size = 32
for t in range(train_episodes):
for b in range(0,len(X_train),batch_size):
inpt = X_train[b:b+batch_size,:,:]
target = Y_train[b:b+batch_size]
x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
y_batch = torch.tensor(target,dtype=torch.float32).to(device)
output = mv_net(x_batch)
loss = criterion(output.view(-1), y_batch)
acc = 0
for b in range(0,len(x_test),eval_batch_size):
inpt = x_test[b:b+eval_batch_size,:,:]
target = y_test[b:b+eval_batch_size]
x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
y_batch = torch.tensor(target,dtype=torch.float32).to(device)
output = mv_net(x_batch)
acc += mean_squared_error(y_batch.cpu().detach().numpy(), output.view(-1).cpu().detach().numpy())
print('step:' , t , 'train loss:' , round(loss.item(),3),'eval acc:',round(acc/len(x_test),3))
val = torch.tensor(x_test,dtype=torch.float32).to(device)
otp = mv_net(val)
print(mean_squared_error(y_test, otp.view(-1).cpu().detach().numpy()))
Keras produces test MSE almost 0, but PyTorch about 6000, which is way too different
I have tried couple tweaks in PyTorch code, but none got me anywhere close to similar keras, even with identical optim params
I cant see what is wrong with (kinda tutorialic) PyTorch code
I know it is almost one year too late. But I came across the same problem and I think the problem is the following. From the keras documentation it says:
return_sequences: Boolean. Whether to return the last output in the
output sequence, or the full sequence.
this basically means that the input shape of your self.l_linear needs to be torch.nn.Linear(1024, 512) instead of self.n_hidden*self.seq_len, 512.
Now you also need to do the same as keras does and only use the last output in your forward pass:
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
x = lstm_out[:,-1]
x = torch.nn.functional.relu(x)
x = torch.nn.functional.relu(self.l_linear(x))
x = self.l_linear2(x)
return x
when I run your example (which I needed to tweak a bit to get it run) I get very similar training losses.
38/38 [==============================] - 0s 6ms/step - loss: 67.6081 - val_loss: 325.9259
step: 199 train loss: 41.043 eval acc: 1142.688
I hope this helps others having a similar problem.
PS also note that keras is resetting the hidden state (stateful=False) by default.