Pytorch vs. Keras: Pytorch model overfits heavily - python

For several days now, I have been trying to replicate my Keras training results with PyTorch. Whatever I do, the PyTorch model overfits far earlier and more strongly (as measured on the validation set) than the Keras one. For PyTorch I use the same Xception code from https://github.com/Cadene/pretrained-models.pytorch.
The dataloading, the augmentation, the validation, the training schedule etc. are equivalent. Am I missing something obvious? There must be a general problem somewhere. I tried thousands of different module constellations, but nothing seems to come even close to the keras training. Can somebody help?
Keras model: val accuracy > 90%
# Keras reference model: ImageNet-pretrained Xception without its top,
# followed by global max pooling, Dense(512, relu), Dropout(0.5) and a
# 4-way softmax layer.
# base model
base_model = applications.Xception(weights='imagenet', include_top=False, input_shape=(img_width, img_height, 3))
# top model
x = base_model.output
x = GlobalMaxPooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(4, activation='softmax')(x)
# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)
# Compile model
from keras import optimizers
# Adam with learning rate 1e-4; the loss expects one-hot targets
# because the output layer is a softmax over 4 classes.
adam = optimizers.Adam(lr=0.0001)
model.compile(loss='categorical_crossentropy',
optimizer=adam, metrics=['accuracy'])
# LROnPlateau etc. with equivalent settings as pytorch
Pytorch model: val accuracy ~81%
from xception import xception
import torch.nn.functional as F
# modified from https://github.com/Cadene/pretrained-models.pytorch
class XCeption(nn.Module):
    """Pretrained Xception backbone with a custom classification head.

    The backbone is every child module of the pretrained network except the
    last one, chained in an ``nn.Sequential``. The head is
    Linear(in_features, 512) -> ReLU -> Dropout(0.5) -> Linear(512, num_classes)
    and returns raw logits (no softmax).

    Args:
        num_classes: Number of output classes of the final linear layer.
    """

    def __init__(self, num_classes):
        super(XCeption, self).__init__()
        original_model = xception(pretrained="imagenet")
        # NOTE(review): nn.Sequential only chains the child modules in
        # registration order. Anything the original model's forward does
        # outside of registered children (e.g. functional activations) is
        # not reproduced here -- confirm against the xception implementation.
        self.features = nn.Sequential(*list(original_model.children())[:-1])
        self.last_linear = nn.Sequential(
            nn.Linear(original_model.last_linear.in_features, 512),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(512, num_classes),
        )

    def logits(self, features):
        """Map a backbone feature map to class logits.

        Applies ReLU, global max pooling to 1x1, flattens to
        (batch, channels) and runs the classification head.
        """
        x = F.relu(features)
        x = F.adaptive_max_pool2d(x, (1, 1))
        x = x.view(x.size(0), -1)
        x = self.last_linear(x)
        return x

    def forward(self, input):
        """Run the backbone and then the classification head."""
        x = self.features(input)
        x = self.logits(x)
        return x
# Build the model, optionally wrap it for multi-GPU use, and create the
# loss / optimizer / LR scheduler used for training.
device = torch.device("cuda")
model = XCeption(len(class_names))
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = nn.DataParallel(model)
model.to(device)
# size_average=False sums the per-sample losses instead of averaging them.
# NOTE(review): newer PyTorch releases deprecate this argument in favour of
# reduction='sum' -- confirm against the installed version.
criterion = nn.CrossEntropyLoss(size_average=False)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# Multiply the learning rate by 0.2 when the monitored value has not
# decreased for 5 epochs, then wait 5 epochs before monitoring again.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
Thank you very much!
Update:
Settings:
# Updated settings: default CrossEntropyLoss, Adam with learning rate `lr`,
# and a plateau scheduler in 'min' mode.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
# Each epoch iterates the training loader 8 times and the validation
# loader 10 times (trainmult / valmult), for 200 epochs.
model = train_model(model, train_loader, val_loader,
criterion, optimizer, scheduler,
batch_size, trainmult=8, valmult=10,
num_epochs=200, epochs_top=0)
Cleaned training function:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, batch_size, trainmult=1, valmult=1, num_epochs=None, epochs_top=0):
    """Train ``model`` and step a plateau scheduler on the validation loss.

    Every epoch runs a training phase (``trainmult`` passes over
    ``train_loader``) followed by a validation phase (``valmult`` passes over
    ``val_loader``, without gradients). After both phases the scheduler is
    stepped with the sample-weighted mean validation loss.

    Args:
        model: Network to train; batches are moved to the device of its
            first parameter.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        val_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating ``model``.
        scheduler: Object whose ``step(val_loss)`` is called once per epoch.
        batch_size: Unused; kept for interface compatibility.
        trainmult: Passes over ``train_loader`` per epoch.
        valmult: Passes over ``val_loader`` per epoch.
        num_epochs: Number of epochs; must be an int (``range(None)`` raises
            ``TypeError``).
        epochs_top: Unused; kept for interface compatibility.

    Returns:
        The same ``model`` object, left in evaluation mode after the last
        validation phase.
    """
    # Follow the model's own device rather than hard-coding CUDA, so the
    # loop also runs on CPU-only machines. A parameter-free model falls back
    # to the previous behaviour.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            running_loss = 0.0
            running_acc = 0
            total = 0
            if phase == "train":
                model.train(True)  # Set model to training mode
                for _ in range(trainmult):
                    for inputs, labels in train_loader:
                        inputs, labels = inputs.to(device), labels.to(device)
                        # zero the parameter gradients
                        optimizer.zero_grad()
                        # forward
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)
                        # backward + optimize
                        loss.backward()
                        optimizer.step()
                        # statistics; weighting by batch size assumes the
                        # criterion returns a per-batch mean -- TODO confirm
                        total += labels.size(0)
                        running_loss += loss.item() * labels.size(0)
                        running_acc += torch.sum(preds == labels).item()
                train_loss = running_loss / total
                train_acc = running_acc / total
            else:
                model.train(False)  # Set model to evaluate mode
                with torch.no_grad():
                    for _ in range(valmult):
                        for inputs, labels in val_loader:
                            inputs, labels = inputs.to(device), labels.to(device)
                            # No optimizer.zero_grad() here: no gradients are
                            # produced under no_grad.
                            outputs = model(inputs)
                            _, preds = torch.max(outputs, 1)
                            loss = criterion(outputs, labels)
                            # statistics
                            total += labels.size(0)
                            running_loss += loss.item() * labels.size(0)
                            running_acc += torch.sum(preds == labels).item()
                val_loss = running_loss / total
                val_acc = running_acc / total
        scheduler.step(val_loss)
    return model

It may be caused by the type of weight initialization you are using;
otherwise this should not happen.
Try using the same initializer in both models.

self.features=nn.Sequential(*list(original_model.children())[:-1])
Are you sure that this line re-instantiates your model in exactly the same way? You're using an nn.Sequential instead of the original Xception model's forward function. If there's anything in that forward function that isn't exactly the same as using an nn.Sequential, it will not reproduce the same performance.
Instead of wrapping it in a Sequential, you could just change this
# Keep the original model (and therefore its own forward function) and only
# swap out the classification head.
my_model = Xception()
# load weights before you change the architecture
# NOTE(review): `load_weights` is a placeholder; presumably it returns the
# model with pretrained weights loaded -- adapt to your loading code.
my_model = load_weights(path_to_weights)
# overwrite the original's last_linear with your own; the input width is
# read from the layer being replaced (the posted snippet referenced an
# undefined `original_model` here)
my_model.last_linear = nn.Sequential(
    nn.Linear(my_model.last_linear.in_features, 512),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(512, num_classes)
)

Related

Very high validation loss/small train loss in Pytorch, while finetuning resnet 50

I am training a model to classify 2 types of images. I have decided to take a transfer-learning approach: freeze every part of resnet50, add a new layer and start the fine-tuning process. My dataset is not perfectly balanced, but I used class weights for that purpose. Please take a look at the validation loss vs. training loss graph. It seems to be extremely inconsistent. Could you please take a look at my code? I am new to PyTorch; maybe there is something wrong with my method and code. Final accuracy on the test set is 86%. Thank you!
# Hyper-parameters for fine-tuning.
learning_rate = 1e-1
num_epochs = 100
patience = 10
batch_size = 100
weights = [4, 1]  # per-class loss weights for the two classes
model = models.resnet50(pretrained=True)
# Replace last layer
num_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_features, 512),
    nn.ReLU(inplace=True),
    nn.Linear(512, 64),
    nn.Dropout(0.5, inplace=True),
    nn.Linear(64, 2))
class_weights = torch.FloatTensor(weights).cuda()
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
running_loss = 0
losses = []
# To freeze the residual layers: turn gradients off everywhere, then
# re-enable them only for the new head.
for param in model.parameters():
    param.requires_grad = False
for param in model.fc.parameters():
    param.requires_grad = True
# Find total parameters and trainable parameters
total_params = sum(p.numel() for p in model.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')
24,590,082 total parameters.
1,082,050 training parameters.
# initialize the early_stopping object
early_stopping = pytorchtools.EarlyStopping(patience=patience, verbose=True)
# Per-batch losses of the current epoch and per-epoch averages. They must
# exist before the first .append() below.
train_losses, valid_losses = [], []
avg_train_losses, avg_valid_losses = [], []
for epoch in range(num_epochs):
    ##########################
    #######TRAIN MODEL########
    ##########################
    epochs_loss = 0
    ##Switch to train mode
    model.train()
    for i, (images, labels) in enumerate(train_dl):
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)
        # Forward pass, backpropagation and optimization
        optimizer.zero_grad()
        outputs = model(images).to(device)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # calculate train_loss
        train_losses.append(loss.item())
    ##########################
    #####VALIDATE MODEL#######
    ##########################
    model.eval()
    # No gradients are needed for validation; skipping them avoids building
    # the autograd graph for every validation batch.
    with torch.no_grad():
        for images, labels in val_dl:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images).to(device)
            loss = criterion(outputs, labels)
            valid_losses.append(loss.item())
    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = np.average(train_losses)
    valid_loss = np.average(valid_losses)
    # print(train_loss)
    avg_train_losses.append(train_loss)
    avg_valid_losses.append(valid_loss)
    print_msg = (f'train_loss: {train_loss:.5f} ' + f'valid_loss: {valid_loss:.5f}')
    print(print_msg)
    # clear lists to track next epoch
    train_losses = []
    valid_losses = []
    early_stopping(valid_loss, model)
    print(epoch)
    if early_stopping.early_stop:
        print("Early stopping")
        break

Apply gradient descent only if TensorFlow model improves on training and validation data

I want to customize the fit function of the model in order to apply the gradient descent on the weights only if the model improved its predictions on the validation data. The reason for this is that I want to prevent overfitting.
According to this guide it should be possible to customize the fit function of the model. However, the following code runs into errors:
class CustomModel(tf.keras.Model):
    """Keras model whose train step applies gradients only on improvement.

    The step computes the gradients for the current batch, evaluates the
    model on validation data and applies the gradients only when the
    validation accuracy exceeds ``last_acc_val``. ``X_val``, ``Y_val``,
    ``last_acc_val`` and ``calculate_accuracy`` are taken from the enclosing
    scope and are not defined in this snippet.
    """

    def train_step(self, data):
        """Run one training step and return the metric dict."""
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        ### check and apply gradient
        Y_pred_val = self.predict(X_val)  # this does not work
        acc_val = calculate_accuracy(Y_val, Y_pred_val)
        if acc_val > last_acc_val:
            self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        ###
        self.compiled_metrics.update_state(y, y_pred)
        return_obj = {m.name: m.result() for m in self.metrics}
        return_obj["acc_val"] = acc_val
        return return_obj
How could it be possible to evaluate the model inside the fit function?
You don't have to subclass fit() for this. You can just make a custom training loop. Look how I did that:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from tensorflow.keras import Model
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Concatenate
import tensorflow_datasets as tfds
from tensorflow.keras.regularizers import l1, l2, l1_l2
from collections import deque
# Load the MNIST training split as feature dicts (as_supervised=False),
# together with the dataset metadata in `info`.
dataset, info = tfds.load('mnist',
with_info=True,
split='train',
as_supervised=False)
# Number of examples kept for this demo.
TAKE = 1_000
# Convert each example to (float32 image, label), shuffle with a buffer of
# TAKE elements and keep TAKE examples.
data = dataset.map(lambda x: (tf.cast(x['image'],
tf.float32), x['label'])).shuffle(TAKE).take(TAKE)
# The first 80% of the kept examples are used for training, the rest for
# testing, both in batches of 8.
# NOTE(review): if the shuffle is re-drawn on every iteration, the
# train/test membership may differ between passes -- confirm.
len_train = int(8e-1*TAKE)
train = data.take(len_train).batch(8)
test = data.skip(len_train).take(info.splits['train'].num_examples - len_train).batch(8)
class CNN(Model):
    """Two-branch demo network that outputs class logits.

    Branch ``a`` is convolutional: Conv2D(16) -> MaxPool -> Conv2D(32) ->
    MaxPool -> Flatten -> Dense(64). Branch ``b`` is dense: Dense(32) on the
    raw input -> Dense(64) -> Flatten. The branches are concatenated and fed
    to a final Dense layer with one unit per class and no activation.
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.layer1 = Dense(32, activation=tf.nn.relu,
                            kernel_regularizer=l1(1e-2),
                            input_shape=info.features['image'].shape)
        self.layer2 = Conv2D(filters=16,
                             kernel_size=(3, 3),
                             strides=(1, 1),
                             activation='relu',
                             input_shape=info.features['image'].shape)
        self.layer3 = MaxPooling2D(pool_size=(2, 2))
        self.layer4 = Conv2D(filters=32,
                             kernel_size=(3, 3),
                             strides=(1, 1),
                             activation=tf.nn.elu,
                             kernel_initializer=tf.keras.initializers.glorot_normal)
        self.layer5 = MaxPooling2D(pool_size=(2, 2))
        self.layer6 = Flatten()
        self.layer7 = Dense(units=64,
                            activation=tf.nn.relu,
                            kernel_regularizer=l2(1e-2))
        self.layer8 = Dense(units=64,
                            activation=tf.nn.relu,
                            kernel_regularizer=l1_l2(l1=1e-2, l2=1e-2))
        self.layer9 = Concatenate()
        self.layer10 = Dense(units=info.features['label'].num_classes)

    def call(self, inputs, training=None, **kwargs):
        """Compute logits for a batch of images."""
        # Dense branch starts directly from the input.
        b = self.layer1(inputs)
        # Convolutional branch.
        a = self.layer2(inputs)
        a = self.layer3(a)
        a = self.layer4(a)
        a = self.layer5(a)
        a = self.layer6(a)
        a = self.layer8(a)
        # Finish the dense branch; the shared Flatten layer is reused.
        b = self.layer7(b)
        b = self.layer6(b)
        x = self.layer9([a, b])
        x = self.layer10(x)
        return x
# Custom training loop: gradients are applied only for batches whose
# accuracy beats the best batch accuracy seen so far.
cnn = CNN()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_loss = tf.keras.metrics.Mean()
test_loss = tf.keras.metrics.Mean()
train_acc = tf.keras.metrics.SparseCategoricalAccuracy()
test_acc = tf.keras.metrics.SparseCategoricalAccuracy()
optimizer = tf.keras.optimizers.Nadam()
template = 'Epoch {:3} Train Loss {:7.4f} Test Loss {:7.4f} ' \
           'Train Acc {:6.2%} Test Acc {:6.2%} '
epochs = 5
early_stop = epochs//50
loss_hist = deque()
# Holds a single value: the accuracy of the last batch that was accepted.
acc_hist = deque(maxlen=1)
acc_hist.append(0)
for epoch in range(1, epochs + 1):
    train_loss.reset_states()
    test_loss.reset_states()
    train_acc.reset_states()
    test_acc.reset_states()
    for images, labels in train:
        with tf.GradientTape() as tape:
            logits = cnn(images, training=True)
            loss = loss_object(labels, logits)
        train_loss(loss)
        train_acc(labels, logits)
        # A fresh metric object gives the accuracy of this batch alone.
        current_acc = tf.metrics.SparseCategoricalAccuracy()(labels, logits)
        if tf.greater(current_acc, acc_hist[-1]):
            print('IMPROVEMENT.')
            gradients = tape.gradient(loss, cnn.trainable_variables)
            optimizer.apply_gradients(zip(gradients, cnn.trainable_variables))
            # The posted nesting was lost; recording the accuracy only on
            # improvement matches the increasingly rare 'IMPROVEMENT.' lines
            # in the sample output.
            acc_hist.append(current_acc)
    for images, labels in test:
        logits = cnn(images, training=False)
        loss = loss_object(labels, logits)
        test_loss(loss)
        test_acc(labels, logits)
    print(template.format(epoch,
                          train_loss.result(),
                          test_loss.result(),
                          train_acc.result(),
                          test_acc.result()))
    # NOTE(review): nothing in this snippet appends to loss_hist, so this
    # early-stopping branch cannot trigger as written.
    if len(loss_hist) > early_stop and loss_hist.popleft() < min(loss_hist):
        print('Early stopping. No validation loss decrease in %i epochs.' % early_stop)
        break
Output:
IMPROVEMENT.
IMPROVEMENT.
IMPROVEMENT.
IMPROVEMENT.
Epoch 1 Train Loss 21.1698 Test Loss 21.3391 Train Acc 37.13% Test Acc 38.50%
IMPROVEMENT.
IMPROVEMENT.
IMPROVEMENT.
Epoch 2 Train Loss 13.8314 Test Loss 12.2496 Train Acc 50.88% Test Acc 52.50%
Epoch 3 Train Loss 13.7594 Test Loss 12.5884 Train Acc 51.75% Test Acc 53.00%
Epoch 4 Train Loss 13.1418 Test Loss 13.2374 Train Acc 52.75% Test Acc 51.50%
Epoch 5 Train Loss 13.6471 Test Loss 13.3157 Train Acc 49.63% Test Acc 51.50%
Here's the part that did the job. It's a deque and it skips the application of gradients if the last element of the deque is smaller.
# Excerpt of the training loop: the gradients are computed and applied only
# when the accuracy of the current batch exceeds the value stored in the
# one-element deque.
for images, labels in train:
    with tf.GradientTape() as tape:
        logits = cnn(images, training=True)
        loss = loss_object(labels, logits)
    train_loss(loss)
    train_acc(labels, logits)
    current_acc = tf.metrics.SparseCategoricalAccuracy()(labels, logits)
    if tf.greater(current_acc, acc_hist[-1]):
        print('IMPROVEMENT.')
        gradients = tape.gradient(loss, cnn.trainable_variables)
        optimizer.apply_gradients(zip(gradients, cnn.trainable_variables))
        acc_hist.append(current_acc)
Rather than create a custom fit I think it would be easier to use the callback ModelCheckpoint.
What you are trying to do is get the model that has the lowest validation error. Set it up to monitor validation loss. That way it will save the best model even if the network starts to over fit. Documentation is here.
If you do not get a model with a satisfactory validation accuracy then you will have to take other measures.
First look at your training accuracy.
My experience is that you should achieve at least 95%.
If the training accuracy is good but the validation accuracy is poor and degrades as you run more epochs that is a sign of over fitting.
You did not show the model but if you are doing classification you will probably have dense layers with the final layer using softmax activation.
Start out with model with only one dense layer and see if it trains well.
If not, you may have to add additional dense hidden layers. If you do, include a dropout layer to help prevent overfitting. You might also consider using regularizers
(see the Keras regularizers documentation).
I also find you can get improved performance if you dynamically adjust the learning rate. The callback ReduceLROnPlateau enables that capability.
Set it up to monitor validation loss and to reduce the learning rate by a factor if the loss fails to decrease. Documentation is here.

Loss Function Not Decreasing in CNN

I am new to Pytorch and I'm training a model for binary classification of images. The images are currently stored as .npy files and I am loading them and training my model in batches. When I implement this, the loss function does not decrease. When I test the model on the training and test set again, the accuracy is constant at 50%. The data set is balanced.
I tried making the dataset smaller (around 125 for each class) and I still have the same problem. I expect the model to overfit the training set but this does not occur.
Please see my code below
class Network(nn.Module):
    """Small CNN for two-class classification of 2-channel images.

    Three 3x3 convolutions (32, 64, 128 channels), each followed by ReLU and
    2x2 max pooling, then three fully connected layers (1000, 100, 2). The
    first linear layer expects 128*6*6 features, i.e. a 6x6 feature map
    after the last pooling step. The output is raw logits.
    """

    def __init__(self):
        super(Network, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=2, out_channels=32, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3)
        self.fc1 = nn.Linear(in_features=128*6*6, out_features=1000)
        self.fc2 = nn.Linear(in_features=1000, out_features=100)
        self.out = nn.Linear(in_features=100, out_features=2)

    def forward(self, t):
        """Return logits of shape (batch, 2) for a batch of images."""
        POOL_stride = 2
        # Conv1
        t = F.relu(self.conv1(t))
        t = F.max_pool2d(t, kernel_size=2, stride=POOL_stride)
        # Conv2
        t = F.relu(self.conv2(t))
        t = F.max_pool2d(t, kernel_size=2, stride=POOL_stride)
        # Conv3
        t = F.relu(self.conv3(t))
        t = F.max_pool2d(t, kernel_size=2, stride=POOL_stride)
        # dense 1
        # Flatten per sample. Reshaping with (-1, 128*6*6) would silently
        # change the batch size whenever the feature map is not 6x6; this
        # way a wrong input size fails loudly in fc1 instead.
        t = t.reshape(t.size(0), -1)
        t = self.fc1(t)
        t = F.relu(t)
        # dense 2
        t = self.fc2(t)
        t = F.relu(t)
        t = self.out(t)
        return t
def npy_loader(path):
    """Load the ``.npy`` file at ``path`` and return it as a torch tensor."""
    return torch.from_numpy(np.load(path))
# Training script as posted in the question (block structure restored).
criterion = nn.CrossEntropyLoss()
# NOTE(review): the optimizer is built from `self.model` before `model` is
# created on the next line, so it does not hold the parameters of the
# network that is actually trained below.
optimizer = optim.Adam(self.model.parameters(), lr=0.003)
model = Network()
# Dataset of .npy files laid out in one sub-folder per class.
trainset = datasets.DatasetFolder(
    root=train_dir,
    loader=npy_loader,
    extensions=['.npy']
)
train_loader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
)
for epoch in range(epochs):
    running_loss = 0
    batches = 0
    for inputs, labels in train_loader:
        batches = batches+1
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output.squeeze(), labels.squeeze())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean batch loss of the epoch.
    print('Loss :{:.4f} Epoch[{}/{}]'.format(running_loss/batches, epoch, epochs))
'''
You are providing parameters of some other self.model to optimizer while the model used for calculating the loss is different.
# Order as posted: the optimizer receives the parameters of `self.model`,
# and only afterwards is `model` created.
optimizer = optim.Adam(self.model.parameters(), lr=0.003)
model = Network()
Above is your sequence of defining optimizer and model. Notice that you are passing parameters of a different self.model to optimizer. Hence, optimizer.step() fails to update weights of desired model on which loss is being calculated. Instead it should be something like this:
# Create the model first, then hand its parameters to the optimizer.
model = Network()
optimizer = optim.Adam(model.parameters(), lr=0.003)
On another note, might I suggest that instead of returning 2 dimensional output from model, returning a 1-d output and using binary cross-entropy loss can also be explored, as your task is only a binary classification problem.

How to deactivate a dropout layer called with training=True in a Keras model?

I wish to view the final output of training a tf.keras model. In this case it would be an array of predictions from the softmax function, e.g. [0,0,0,1,0,1].
Other threads on here have suggested using model.predict(training_data), but this won't work for my situation since I am using dropout at training and validation, so neurons are randomly dropped and predicting again with the same data will give a different result.
def get_model():
    """Build and compile a dense classifier with always-on dropout.

    Both Dropout layers are called with ``training=True``, so they stay
    active whenever the model is run. The network is
    Dropout -> Dense(29, relu) -> Dropout -> Dense(15, relu) ->
    Dense(2, softmax), compiled with Adam and sparse categorical
    cross-entropy. ``input_dims`` and ``dropout_rate`` come from the
    enclosing scope.
    """
    inputs = tf.keras.layers.Input(shape=(input_dims,))
    x = tf.keras.layers.Dropout(rate=dropout_rate)(inputs, training=True)
    x = tf.keras.layers.Dense(units=29, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=dropout_rate)(x, training=True)
    x = tf.keras.layers.Dense(units=15, activation='relu')(x)
    outputs = tf.keras.layers.Dense(2, activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])
    return model
# Build the model, print its layer summary and train it, evaluating on
# (X_val, y_val) during training.
myModel = get_model()
myModel.summary()
myModel.fit(X_train, y_train,
batch_size = batch_size,
epochs= epochs,
verbose = 1,
validation_data = (X_val, y_val))
In tensorflow, you can grab the output of a model after training quite easily. Here is an example from a Github repo:
# TensorFlow 1.x graph/session example quoted from a GitHub repo (block
# structure restored). `normalized`, `make_nn_layer` and the data arrays are
# defined elsewhere in that repo.
input = tf.placeholder(tf.float32, shape=[None, INPUT_DIMS])
labels = tf.placeholder(tf.float32, shape=[None])
hidden = tf.nn.tanh(make_nn_layer(normalized, NUM_HIDDEN))
logits = make_nn_layer(hidden, NUM_CLASSES)
# Predicted class index per row.
outputs = tf.argmax(logits, 1)
int_labels = tf.to_int64(labels)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, int_labels, name='xentropy')
train_step = tf.train.AdamOptimizer().minimize(cross_entropy)
correct_prediction = tf.equal(outputs, int_labels)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    # Columns 0-6 are the features, column 7 is the label.
    validation_dict = {
        input: validation_data[:,0:7],
        labels: validation_data[:,7],}
    for i in range(NUM_BATCHES):
        # Sample a batch of rows without replacement.
        batch = training_data[numpy.random.choice(training_size, BATCH_SIZE, False),:]
        train_step.run({input: batch[:,0:7], labels: batch[:,7]})
        if i % 100 == 0 or i == NUM_BATCHES - 1:
            print('Accuracy %.2f%% at step %d' % (accuracy.eval(validation_dict) * 100, i))
    # Evaluate the trained graph on new data while the session is open.
    output_data = outputs.eval({input: data_vector[:,0:7]})
The only output I can get from the trained model appears to be a history object. There is also a myModel.output object, but it is a tensor that I can't evaluate without putting data into it. Any ideas?
As far as I know, you can't turn off the dropout after passing training=True when calling the layers (unless you transfer the weights to a new model with the same architecture). However, instead you can build and train your model in normal case (i.e. without using training argument in the calls) and then selectively turn on and off the dropout layer in test phase by defining a backend function (i.e. keras.backend.function()) and setting the learning phase (i.e. keras.backend.learning_phase()):
# build your model normally (i.e. without using `training=True` argument)
# train your model...
from keras import backend as K
# Backend function that takes the model inputs plus the learning-phase flag
# and returns the model outputs.
func = K.function(model.inputs + [K.learning_phase()], model.outputs)
# run the model with dropout layers being active, i.e. learning_phase == 1
preds = func(list_of_input_arrays + [1])
# run the model with dropout layers being inactive, i.e. learning_phase == 0
preds = func(list_of_input_arrays + [0])
Update: As I suggested above, another approach is to define a new model with the same architecture but without setting training=True, and then transfer the weights from the trained model to this new model. To achieve this, I just add a training argument to your get_model() function:
def get_model(training=None):
    """Build and compile the dense classifier with configurable dropout.

    Args:
        training: Passed as the ``training`` argument when calling both
            Dropout layers. ``True`` keeps dropout active in every phase;
            the default ``None`` leaves the decision to Keras.

    Returns:
        The compiled ``tf.keras.Model``. ``input_dims`` and ``dropout_rate``
        come from the enclosing scope.
    """
    inputs = tf.keras.layers.Input(shape=(input_dims,))
    x = tf.keras.layers.Dropout(rate=dropout_rate)(inputs, training=training)
    x = tf.keras.layers.Dense(units=29, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=dropout_rate)(x, training=training)
    x = tf.keras.layers.Dense(units=15, activation='relu')(x)
    outputs = tf.keras.layers.Dense(2, activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])
    return model
# Train with always-on dropout, then copy the weights into an identically
# shaped model built without the forced `training=True`.
# build a model with dropout layers active in both training and test phases
myModel = get_model(training=True)
# train the model
myModel.fit(...)
# build a clone of the model with dropouts deactivated in test phase
myTestModel = get_model() # note: the `training` is `None` by default
# transfer the weights from the trained model to this model
myTestModel.set_weights(myModel.get_weights())
# use the new model in test phase; the dropouts would not be active
myTestModel.predict(...)

Same working model in Keras not being improved in Pytorch

I'm converting a basic LSTM many-to-one architecture to predict the next single element in a sequence, written in Keras to Pytorch. NN architecture is the following (whole code can be found here):
# Keras reference model: three stacked 512-unit LSTMs (the first two return
# full sequences, the third only its last output), Dropout(0.3) after the
# first two LSTMs and after Dense(256), then a softmax over n_vocab classes.
model = Sequential()
model.add(LSTM(
512,
input_shape=(network_input.shape[1], network_input.shape[2]),
return_sequences=True
))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(n_vocab))
model.add(Activation('softmax'))
# Trained with categorical cross-entropy and the RMSprop optimizer.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
Running both models with the same data (yes, I've explicitly checked that), both start with a loss value ~ 4, but after 100 epochs or so, Keras already reached a loss ~ 0.02, which gives the desired results.
However, Pytorch model is stuck around ~ 3.4 after 20 epochs. I've tried many things:
Play with LR: It explodes when LR is too high, so this means that at least parameters are being updated.
Different optimizers, SGD, Adam, RMSprop, but same results with all.
Swap between .view[], .squeeze_ and indexing when accessing last sequence element.
Add, remove and modify non-linear activation functions and dropout.
Remove manual initialization for x_0 and h_0.
Here is the code for my model:
class NNP_RNN(nn.Module):
    """Three stacked LSTMs followed by two linear layers (many-to-one).

    The input is batch-first with one feature per time step. The final
    hidden state of the third LSTM is passed through Linear(512, 256) and
    Linear(256, 58); the output is raw logits of shape (batch, 58).
    """

    def __init__(self):
        super(NNP_RNN, self).__init__()
        self.lstm_1 = nn.LSTM(input_size=1, hidden_size=512, batch_first=True)
        self.lstm_2 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True)
        self.lstm_3 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True)
        self.dense_1 = nn.Linear(in_features=512, out_features=256)
        self.dense_2 = nn.Linear(in_features=256, out_features=58)

    def forward(self, x):
        """Return logits for the element following each input sequence."""
        batch_size = x.size(0)
        # Zero initial states created on the same device and with the same
        # dtype as the input, so no global `device` is needed here.
        h_0 = x.new_zeros((1, batch_size, 512))
        c_0 = x.new_zeros((1, batch_size, 512))
        x, _ = self.lstm_1(x, (h_0, c_0))
        # F.dropout defaults to training=True; tie it to the module's mode
        # so dropout is switched off by model.eval().
        x = F.dropout(x, 0.3, training=self.training)
        x, _ = self.lstm_2(x, (h_0, c_0))
        x = F.dropout(x, 0.2, training=self.training)
        # Keep only the final hidden state of the last LSTM.
        _, (x, _) = self.lstm_3(x, (h_0, c_0))
        x = x.squeeze(0)
        x = self.dense_1(x)
        x = F.dropout(x, 0.1, training=self.training)
        x = self.dense_2(x)
        return x

    @staticmethod
    def init_hidden(dims):
        """Return a zero tensor of shape ``dims`` on the global ``device``."""
        return torch.zeros(dims, device=device)
And the training process:
# Training loop for the LSTM model: Adam, plateau scheduler stepped on the
# mean epoch loss, cross-entropy on raw logits.
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, verbose=True, patience=5)
criterion = nn.CrossEntropyLoss()
for epoch in range(1, epochs + 1):
    epoch_loss = 0
    epoch_corrects = 0
    for features, labels in tqdm(data, ncols=800):
        features = features.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        batch_size = features.size(0)
        output = model(features)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        corrects = torch.argmax(output, dim=1)
        corrects = torch.eq(corrects, labels).sum().item()
        epoch_corrects += corrects
        # Accumulate a plain Python float. Summing `loss.clone()` keeps every
        # batch's autograd history alive for the whole epoch.
        epoch_loss += loss.item() * batch_size
    # Sample-weighted mean loss and accuracy over the dataset.
    epoch_loss /= len(data.dataset)
    epoch_corrects /= len(data.dataset)
    print(f'Loss epoch #{epoch} = {epoch_loss:.10f}, Accuracy = {epoch_corrects}')
    scheduler.step(epoch_loss)

Categories

Resources