Pytorch tutorial loss is not decreasing as expected - python

I am reimplementing the pytorch tutorial of the Pytorch cifar10 tutorial
But I want to use a different model.
I don't want to use fully connected (in pytorch linear) layers and I want to add Batch Normalization.
My model looks like this:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.pool = nn.MaxPool2d(2,2)
self.conv1 = nn.Conv2d(in_channels=3,out_channels=16,kernel_size=3, padding=1, padding_mode='zeros')
self.conv1_bn = nn.BatchNorm2d(16)
self.conv2 = nn.Conv2d(in_channels=16,out_channels=32,kernel_size=3, padding=1, padding_mode='zeros')
self.conv2_bn = nn.BatchNorm2d(32)
self.conv3 = nn.Conv2d(in_channels=32,out_channels=64,kernel_size=3, padding=1, padding_mode='zeros')
self.conv3_bn = nn.BatchNorm2d(64)
self.conv4 = nn.Conv2d(64,64,3, padding=1, padding_mode='zeros')
self.conv4_bn = nn.BatchNorm2d(64)
self.conv5 = nn.Conv2d(64,10,2,padding=0)
def forward(self, x): # x has shape (4,32,32,3)
x = self.pool(F.relu(self.conv1_bn(self.conv1(x)))) # feature map resolution is now 16*16
x = self.pool(F.relu(self.conv2_bn(self.conv2(x)))) # resolution now 8*8
x = self.pool(F.relu(self.conv3_bn(self.conv3(x)))) #resolution now 4*4
x = self.pool(F.relu(self.conv4_bn(self.conv4(x)))) # now 2*2
x = F.relu(self.conv5(x)) # The output shape is (batchsize, 1,1,10)
return x
Batchsize is 4 and image resolution is 32*32 so inputsize is 4,32,32,3
The convolution layers don't reduce the resolution size of the feature maps because of the padding. The resolution is halved with the maxpool layers. Conv5 gets an input with shape 4,2,2,64.
Now I use filtersize 2 and no padding to get a resolution of 1*1.
I have 10 classes so I use 10 filters. Each of the last filters should predict it's corresponding class.
The shape of the output is now (4,1,1,10).
But when I try to train this model the loss doesn't decrease. The amount of parameters of the tutorial model and my net are about the same at ~62k.
Here is the rest of the code. This is identical to the code in the tutorial but I have to reshape the output so it fits. (output in the tutorial was (4,10) and mine is 4,1,1,10)
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
shuffle=False, num_workers=2)
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
for epoch in range(2): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs) # I get the values as 4,1,1,10
outputs_reshaped = outputs.reshape(4,10)
loss = criterion(outputs_reshaped, labels)
loss.backward()
optimizer.step()
running_loss +=loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 2000))
running_loss = 0.0
My loss looks like this.
[1, 2000] loss: 2.348
[1, 2000] loss: 2.477
[1, 4000] loss: 2.482
[1, 6000] loss: 2.468
[1, 8000] loss: 2.471
[1, 10000] loss: 2.482
[1, 12000] loss: 2.485
[2, 2000] loss: 2.486
[2, 4000] loss: 2.470
[2, 6000] loss: 2.479
[2, 8000] loss: 2.481
[2, 10000] loss: 2.474
[2, 12000] loss: 2.470
My model doesn't seem to learn anything. Anyone an idea why this might happen?

Your learning rate and momentum combination is too large for such a small batch size, try something like these:
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.0)
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
Update: I just realized another problem is you are using a relu activation at the end of the network. If you look at the documentation of CrossEntropyLoss, there is an advice:
The input is expected to contain raw, unnormalized scores for each
class.
Try training your network by removing last relu from conv5 and keeping lr=0.01 and momentum=0.9. Relu before cross entropy loss throws away information about class scores.

So if you have a similar problem
I changed the optimizer to
optimizer = optim.Adam(net.parameters(),0.001)
my last line in forward()
was
x = F.relu(self.conv5(x))
I removed the relu, it's now
x= self.conv5(x)
and now the loss is decreasing as expected (way faster than the tutorial with the same amount of parameters)

Related

RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[100, 1, 28, 28] to have 3 channels, but got 1 channels instead

I am trying to use a pre-trained (resnet) model on the MNIST dataset, but this error always appears to me
RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[100, 1, 28, 28] to have 3 channels, but got 1 channels instead.
This is my code:
MNIST dataset
from torchvision import datasets
import torchvision.transforms as transforms
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 100
# convert data to torch.FloatTensor
transform = transforms.Compose
([
transforms.ToTensor(),
transforms.RandomErasing(p=0.2)
])
# choose the training and test datasets
train_data = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='data', train=False, download=True, transform=transform)
# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, num_workers=num_workers)
MLP network definition
from torch.nn.modules.activation import ReLU
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# building the model and the type to model → Sequential
self.model = nn.Sequential
(
# building the layers and the type to layers → Linear
nn.Linear(28 * 28 , 200), # input layer = 100
# to avoid problem to overfitting → using the Dropout (As we can see, dropouts are used to randomly remove neurons while training of the neural network.)
nn.Dropout(0.2), # to use Dropout to avoid problem → overfitting
nn.ReLU(True), # activation function
nn.BatchNorm1d(num_features = 200), # Batch normalization (also known as batch norm) is a method used
# to make training of artificial neural networks faster and more stable through normalization of the layers' inputs by re-centering and re-scaling.
nn.Linear(200 , 10), # output layer = 10
)
def forward(self, x):
x = x.view(-1, 1, 28*28)
return self.model(x)
# initialize the N
model = Net()
print(model)
model = resnet50(weights = ResNet50_Weights.IMAGENET1K_V2)
model.fc = nn.Linear(512,10)
define an optimizer to update the model parameters
## Specify loss and optimization functions
# specify loss function
criterion = nn.CrossEntropyLoss()
# specify optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
Training Data
#number of epochs to train the model
n_epochs = 1 # suggest training between 20-50 epochs
model.train() # prep model for training
for epoch in range(n_epochs):
# monitor training loss
train_loss = 0.0
###################
# train the model #
###################
for data, target in train_loader:
# clear the gradients of all optimized variables
data = data.repeat(1,3,1,1)
optimizer.zero_grad()
# forward pass: compute predicted outputs by passing inputs to the model
output = model(data)
# calculate the loss
loss = criterion(output, target)
# backward pass: compute gradient of the loss with respect to model parameters
loss.backward()
# perform a single optimization step (parameter update)
optimizer.step()
# update running training loss
train_loss += loss.item()*data.size(0)
# print training statistics
# calculate average loss over an epoch
train_loss = train_loss/len(train_loader.dataset)
print('Epoch: {} \tTraining Loss: {:.6f}'.format(
epoch+1,
train_loss
))
Initialize lists to monitor test loss and accuracy
# initialize lists to monitor test loss and accuracy
test_loss = 0.0
class_correct = list(0. for i in range(10))
class_total = list(0. for i in range(10))
model.eval() # prep model for *evaluation*
for data, target in test_loader:
# forward pass: compute predicted outputs by passing inputs to the model
output = model(data)
# calculate the loss
loss = criterion(output, target)
# update test loss
test_loss += loss.item()*data.size(0)
# convert output probabilities to predicted class
_, pred = torch.max(output, 1)
# compare predictions to true label
correct = np.squeeze(pred.eq(target.data.view_as(pred)))
# calculate test accuracy for each object class
for i in range(16):
label = target.data[i]
class_correct[label] += correct[i].item()
class_total[label] += 1
# calculate and print avg test loss
test_loss = test_loss/len(test_loader.dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))
for i in range(10):
if class_total[i] > 0:
print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
str(i), 100 * class_correct[i] / class_total[i],
np.sum(class_correct[i]), np.sum(class_total[i])))
else:
print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))
print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
100. * np.sum(class_correct) / np.sum(class_total),
np.sum(class_correct), np.sum(class_total)))

BERT Debugging (not enough values to unpack (expected 2, got 1))

I'm new to BERT and trying to test it on my dataset. The code is as the followings:
# Import BERT model and Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
bert = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")
class BERT_Arch(nn.Module):
def __init__(self, bert):
super(BERT_Arch, self).__init__()
self.bert = bert
# dropout layer
self.dropout = nn.Dropout(0.1)
# relu activation function
self.relu = nn.ReLU()
# dense layer 1
self.fc1 = nn.Linear(768,512)
# dense layer 2 (Output layer)
self.fc2 = nn.Linear(512,2)
#softmax activation function
self.softmax = nn.LogSoftmax(dim=1)
#define the forward pass
def forward(self, sent_id, mask):
#pass the inputs to the model
_, cls_hs = self.bert(sent_id, attention_mask=mask)
x = self.fc1(cls_hs)
x = self.relu(x)
x = self.dropout(x)
# output layer
x = self.fc2(x)
# apply softmax activation
x = self.softmax(x)
return x
# function to train the model
def train():
model.train()
total_loss, total_accuracy = 0, 0
# empty list to save model predictions
total_preds=[]
# iterate over batches
for step,batch in enumerate(train_dataloader):
# progress update after every 50 batches.
if step % 50 == 0 and not step == 0:
print('Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))
# push the batch to gpu
batch = [r.to(device) for r in batch]
sent_id, mask, labels = batch
# clear previously calculated gradients
model.zero_grad()
# get model predictions for the current batch
preds = model(sent_id, mask)
# compute the loss between actual and predicted values
loss = cross_entropy(preds, labels)
# add on to the total loss
total_loss = total_loss + loss.item()
# backward pass to calculate the gradients
loss.backward()
# clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# update parameters
optimizer.step()
# model predictions are stored on GPU. So, push it to CPU
preds=preds.detach().cpu().numpy()
# append the model predictions
total_preds.append(preds)
# compute the training loss of the epoch
avg_loss = total_loss / len(train_dataloader)
# predictions are in the form of (no. of batches, size of batch, no. of classes).
# reshape the predictions in form of (number of samples, no. of classes)
total_preds = np.concatenate(total_preds, axis=0)
#returns the loss and predictions
return avg_loss, total_preds
# set initial loss to infinite
best_valid_loss = float('inf')
# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]
#for each epoch
for epoch in range(epochs):
print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
#train model
train_loss, _ = train()
#evaluate model
valid_loss, _ = evaluate()
#save the best model
if valid_loss < best_valid_loss:
best_valid_loss = valid_loss
torch.save(model.state_dict(), 'saved_weights.pt')
# append training and validation loss
train_losses.append(train_loss)
valid_losses.append(valid_loss)
print(f'\nTraining Loss: {train_loss:.3f}')
print(f'Validation Loss: {valid_loss:.3f}')
The error that I get is not enough values to unpack (expected 2, got 1). I have checked the tensor of input_ids and mask, and they looks like the followings:
tensor([[101, 102],
[101, 102],
[101, 102],
...,
[101, 102],
[101, 102],
[101, 102]])
tensor([[1, 1],
[1, 1],
[1, 1],
...,
[1, 1],
[1, 1],
[1, 1]])
tensor([0, 0, 0, ..., 0, 0, 0])
I think that the dimension of tensors is not wrong, so don't need to unsqueeze them as other answers show. Can someone check this for me? thanks ahead!
The complete errors prompt:
10 print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
11 #train model
---> 12 train_loss, _ = train()
13 #evaluate model
14 valid_loss, _ = evaluate()
16 model.zero_grad()
17 # get model predictions for the current batch
---> 18 preds = model(sent_id, mask)
19 # compute the loss between actual and predicted values
20 loss = cross_entropy(preds, labels)
19 def forward(self, sent_id, mask):
20 #pass the inputs to the model
---> 21 _, cls_hs = self.bert(sent_id, attention_mask=mask)
22 x = self.fc1(cls_hs)
23 x = self.relu(x)
ValueError: not enough values to unpack (expected 2, got 1)

Apply gradient descent only if TensorFlow model improves on training and validation data

I want to customize the fit function of the model in order to apply the gradient descent on the weights only if the model improved its predictions on the validation data. The reason for this is that I want to prevent overfitting.
According to this guide it should be possible to customize the fit function of the model. However, the following code runs into errors:
class CustomModel(tf.keras.Model):
def train_step(self, data):
x, y = data
with tf.GradientTape() as tape:
y_pred = self(x, training=True)
loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
trainable_vars = self.trainable_variables
gradients = tape.gradient(loss, trainable_vars)
### check and apply gradient
Y_pred_val = self.predict(X_val) # this does not work
acc_val = calculate_accuracy(Y_val, Y_pred_val)
if acc_val > last_acc_val:
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
###
self.compiled_metrics.update_state(y, y_pred)
return_obj = {m.name: m.result() for m in self.metrics}
return_obj["acc_val"] = acc_val
return return_obj
How could it be possible to evaluate the model inside the fit function?
You don't have to subclass fit() for this. You can just make a custom training loop. Look how I did that:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from tensorflow.keras import Model
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Concatenate
import tensorflow_datasets as tfds
from tensorflow.keras.regularizers import l1, l2, l1_l2
from collections import deque
dataset, info = tfds.load('mnist',
with_info=True,
split='train',
as_supervised=False)
TAKE = 1_000
data = dataset.map(lambda x: (tf.cast(x['image'],
tf.float32), x['label'])).shuffle(TAKE).take(TAKE)
len_train = int(8e-1*TAKE)
train = data.take(len_train).batch(8)
test = data.skip(len_train).take(info.splits['train'].num_examples - len_train).batch(8)
class CNN(Model):
def __init__(self):
super(CNN, self).__init__()
self.layer1 = Dense(32, activation=tf.nn.relu,
kernel_regularizer=l1(1e-2),
input_shape=info.features['image'].shape)
self.layer2 = Conv2D(filters=16,
kernel_size=(3, 3),
strides=(1, 1),
activation='relu',
input_shape=info.features['image'].shape)
self.layer3 = MaxPooling2D(pool_size=(2, 2))
self.layer4 = Conv2D(filters=32,
kernel_size=(3, 3),
strides=(1, 1),
activation=tf.nn.elu,
kernel_initializer=tf.keras.initializers.glorot_normal)
self.layer5 = MaxPooling2D(pool_size=(2, 2))
self.layer6 = Flatten()
self.layer7 = Dense(units=64,
activation=tf.nn.relu,
kernel_regularizer=l2(1e-2))
self.layer8 = Dense(units=64,
activation=tf.nn.relu,
kernel_regularizer=l1_l2(l1=1e-2, l2=1e-2))
self.layer9 = Concatenate()
self.layer10 = Dense(units=info.features['label'].num_classes)
def call(self, inputs, training=None, **kwargs):
b = self.layer1(inputs)
a = self.layer2(inputs)
a = self.layer3(a)
a = self.layer4(a)
a = self.layer5(a)
a = self.layer6(a)
a = self.layer8(a)
b = self.layer7(b)
b = self.layer6(b)
x = self.layer9([a, b])
x = self.layer10(x)
return x
cnn = CNN()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_loss = tf.keras.metrics.Mean()
test_loss = tf.keras.metrics.Mean()
train_acc = tf.keras.metrics.SparseCategoricalAccuracy()
test_acc = tf.keras.metrics.SparseCategoricalAccuracy()
optimizer = tf.keras.optimizers.Nadam()
template = 'Epoch {:3} Train Loss {:7.4f} Test Loss {:7.4f} ' \
'Train Acc {:6.2%} Test Acc {:6.2%} '
epochs = 5
early_stop = epochs//50
loss_hist = deque()
acc_hist = deque(maxlen=1)
acc_hist.append(0)
for epoch in range(1, epochs + 1):
train_loss.reset_states()
test_loss.reset_states()
train_acc.reset_states()
test_acc.reset_states()
for images, labels in train:
with tf.GradientTape() as tape:
logits = cnn(images, training=True)
loss = loss_object(labels, logits)
train_loss(loss)
train_acc(labels, logits)
current_acc = tf.metrics.SparseCategoricalAccuracy()(labels, logits)
if tf.greater(current_acc, acc_hist[-1]):
print('IMPROVEMENT.')
gradients = tape.gradient(loss, cnn.trainable_variables)
optimizer.apply_gradients(zip(gradients, cnn.trainable_variables))
acc_hist.append(current_acc)
for images, labels in test:
logits = cnn(images, training=False)
loss = loss_object(labels, logits)
test_loss(loss)
test_acc(labels, logits)
print(template.format(epoch,
train_loss.result(),
test_loss.result(),
train_acc.result(),
test_acc.result()))
if len(loss_hist) > early_stop and loss_hist.popleft() < min(loss_hist):
print('Early stopping. No validation loss decrease in %i epochs.' % early_stop)
break
Output:
IMPROVEMENT.
IMPROVEMENT.
IMPROVEMENT.
IMPROVEMENT.
Epoch 1 Train Loss 21.1698 Test Loss 21.3391 Train Acc 37.13% Test Acc 38.50%
IMPROVEMENT.
IMPROVEMENT.
IMPROVEMENT.
Epoch 2 Train Loss 13.8314 Test Loss 12.2496 Train Acc 50.88% Test Acc 52.50%
Epoch 3 Train Loss 13.7594 Test Loss 12.5884 Train Acc 51.75% Test Acc 53.00%
Epoch 4 Train Loss 13.1418 Test Loss 13.2374 Train Acc 52.75% Test Acc 51.50%
Epoch 5 Train Loss 13.6471 Test Loss 13.3157 Train Acc 49.63% Test Acc 51.50%
Here's the part that did the job. It's a deque and it skips the application of gradients if the last element of the deque is smaller.
for images, labels in train:
with tf.GradientTape() as tape:
logits = cnn(images, training=True)
loss = loss_object(labels, logits)
train_loss(loss)
train_acc(labels, logits)
current_acc = tf.metrics.SparseCategoricalAccuracy()(labels, logits)
if tf.greater(current_acc, acc_hist[-1]):
print('IMPROVEMENT.')
gradients = tape.gradient(loss, cnn.trainable_variables)
optimizer.apply_gradients(zip(gradients, cnn.trainable_variables))
acc_hist.append(current_acc)
Rather than create a custom fit I think it would be easier to use the callback ModelCheckpoint.
What you are trying to do is get the model that has the lowest validation error. Set it up to monitor validation loss. That way it will save the best model even if the network starts to over fit. Documentation is here.
If you do not get a model with a satisfactory validation accuracy then you will have to take other measures.
First look at your training accuracy.
My experience is that you should achieve at least 95%.
If the training accuracy is good but the validation accuracy is poor and degrades as you run more epochs that is a sign of over fitting.
You did not show the model but if you are doing classification you will probably have dense layers with the final layer using softmax activation.
Start out with model with only one dense layer and see if it trains well.
If not you may have to add additional dense hidden layers. If you do include a drop out layer to help prevent over fitting. You might also consider using regularizers. Documentation is
here..
I also find you can get improved performance if you dynamically adjust the learning rate. The callback ReduceLROnPlateau enables that capability.
Set it up to monitor validation loss and to reduce the learning rate by a factor if the loss fails to decrease. Documentation is here.

Pytorch loss of convolution is 0.0 from start

I am building a conv net that classifies dog and cat. Architecture is pretty simple. 2 Conv(with batch norm, leakyReLU, Maxpooling) to 1 fc. Input image size is resized to 64. The size is good. The problem is loss is 0.0 from the start. I have no clue what the cause is. I couldn't find any answer. I have wrote every detail that might be important. If you need anything else please tell me, I will edit.
main.py
import torch
import torch.nn as nn
from torchvision import transforms, datasets
import PIL
import matplotlib.pyplot as plt
from Dataset import Dataset
from Network import Network
# Added to avoid torch._C._cuda_init() \n RuntimeError: CUDA error: unknown error
torch.cuda.current_device()
# Hyper Parameters
batch_size = 1
img_size = 64
learning_rate = 0.001
num_epoch = 1
# Directories
trainDir = "D:/Programming/python/Deep learning/datasets/dogs-vs-cats/train"
testDir = "D:/Programming/python/Deep learning/datasets/dogs-vs-cats/test1"
print("Initializing...")
# Device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Augmentation
transforms = transforms.Compose([
transforms.Resize((img_size, img_size)),
transforms.ColorJitter(hue=.05, saturation=.05),
transforms.RandomHorizontalFlip(),
transforms.RandomRotation(20, resample=PIL.Image.BILINEAR) ,
transforms.ToTensor()
])
trainset = datasets.ImageFolder(root=trainDir, transform=transforms)
testset = datasets.ImageFolder(root=testDir, transform=transforms)
train_loader = torch.utils.data.DataLoader(
trainset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
testset, batch_size=batch_size, shuffle=False) # test set will not be shuffled
model = Network(img_size,2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
total_step = len(train_loader)
print("Tranining started")
for epoch in range(num_epoch):
for i, (images, labels) in enumerate(train_loader):
images = images.to(device)
labels = labels.to(device)
# forward propagate
outputs = model(images)
loss = criterion(outputs, labels)
# backpropagte and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i+1) % 100 == 0:
print(
"Epoch [{}/{}], Step[{}/{}], Loss: {}".format(
epoch+1, num_epoch, i+1, total_step, loss.item()
)
)
print("Tranining complete, validation started")
with torch.no_grad():
correct = 0
total = 0
for images, labels in test_loader:
images = images.to(device)
labels = labels.to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('Test Accuracy: {} %'.format(100 * correct / total))
#
torch.save(model.state_dict(), "model.ckpy")
Network.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
"""
Input size for conv
l = number of input feature maps
k = number of output feature maps
n, m = width and height of kernel
total parameter = (n*m*l+1)*k
"""
class Network(nn.Module):
def __init__(self, input_size, num_class):
super(Network, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(3, 16, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(16),
nn.LeakyReLU(),
nn.MaxPool2d(kernel_size=2, stride=2)
) # output size = (128, 128, 16)
self.conv2 = nn.Sequential(
nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(32),
nn.LeakyReLU(),
nn.MaxPool2d(kernel_size=2, stride=2)
) # output size = (64, 64, 32)
self.fc1 = nn.Linear(
int((input_size/4)**2*32), num_class
)
def forward(self, x):
out = self.conv1(x)
out = self.conv2(out)
out = out.view(out.size(0), -1)
out = self.fc1(out)
return out
Output
Epoch [1/1], Step[5800/25000], Loss: 0.0
Epoch [1/1], Step[5900/25000], Loss: 0.0
Epoch [1/1], Step[6000/25000], Loss: 0.0
Epoch [1/1], Step[6100/25000], Loss: 0.0
Epoch [1/1], Step[6200/25000], Loss: 0.0
Epoch [1/1], Step[6300/25000], Loss: 0.0
Epoch [1/1], Step[6400/25000], Loss: 0.0
Epoch [1/1], Step[6500/25000], Loss: 0.0
Result after each layer
outputs of conv1,2
[[ 3.0135e-01, 3.5849e-01, 4.7758e-01, ..., 3.9759e-01,
3.7988e-01, 9.7870e-01],
[ 4.3010e-01, 6.0753e-03, 4.5642e-01, ..., -8.5486e-04,
4.4537e-02, 2.9074e-01],
[ 3.8567e-01, 7.8431e-02, 2.3859e-01, ..., -3.0013e-03,
-5.5821e-03, 1.2284e-01],
...,
[ 3.9181e-01, 3.9093e-01, 1.2053e-01, ..., -4.7156e-03,
5.6266e-01, 7.7017e-01],
outputs of fc1
[[-0.0772, 0.2166]]
loss = criterion(output, target.view(-1)) # Flatten target
try this.
could you remove these two line?
images = images.to(device)
labels = labels.to(device)
self.conv1 and 2 must be sent to cuda : self.conv1(2).cuda()

Same working model in Keras not being improved in Pytorch

I'm converting a basic LSTM many-to-one architecture to predict the next single element in a sequence, written in Keras to Pytorch. NN architecture is the following (whole code can be found here):
model = Sequential()
model.add(LSTM(
512,
input_shape=(network_input.shape[1], network_input.shape[2]),
return_sequences=True
))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(n_vocab))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
Running both models with the same data (yes, I've explicitly checked that), both start with a loss value ~ 4, but after 100 epochs or so, Keras already reached a loss ~ 0.02, which gives the desired results.
However, Pytorch model is stuck around ~ 3.4 after 20 epochs. I've tried many things:
Play with LR: It explodes when LR is too high, so this means that at least parameters are being updated.
Different optimizers, SGD, Adam, RMSprop, but same results with all.
Swap between .view[], .squeeze_ and indexing when accessing last sequence element.
Add, remove and modify non-linear activation functions and dropout.
Remove manual initialization for x_0 and h_0.
Here is the code for my model:
class NNP_RNN(nn.Module):
def __init__(self):
super(NNP_RNN, self).__init__()
self.lstm_1 = nn.LSTM(input_size=1, hidden_size=512, batch_first=True)
self.lstm_2 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True)
self.lstm_3 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True)
self.dense_1 = nn.Linear(in_features=512, out_features=256)
self.dense_2 = nn.Linear(in_features=256, out_features=58)
def forward(self, x):
batch_size = x.size(0)
h_0 = NNP_RNN.init_hidden((1, batch_size, 512))
c_0 = NNP_RNN.init_hidden((1, batch_size, 512))
x, _ = self.lstm_1(x, (h_0, c_0))
x = F.dropout(x, 0.3)
x, _ = self.lstm_2(x, (h_0, c_0))
x = F.dropout(x, 0.2)
_, (x, _) = self.lstm_3(x, (h_0, c_0))
x = x.squeeze_(0)
x = self.dense_1(x)
x = F.dropout(x, 0.1)
x = self.dense_2(x)
return x
#staticmethod
def init_hidden(dims):
return torch.zeros(dims, device=device)
And the training process:
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, verbose=True, patience=5)
criterion = nn.CrossEntropyLoss()
for epoch in range(1, epochs + 1):
epoch_loss = 0
epoch_corrects = 0
for features, labels in tqdm(data, ncols=800):
features = features.to(device)
labels = labels.to(device)
optimizer.zero_grad()
batch_size = features.size(0)
output = model(features)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
corrects = torch.argmax(output, dim=1)
corrects = torch.eq(corrects, labels).sum().item()
epoch_corrects += corrects
epoch_loss += loss.clone() * batch_size
epoch_loss /= len(data.dataset)
epoch_corrects /= len(data.dataset)
print(f'Loss epoch #{epoch} = {epoch_loss:.10f}, Accuracy = {epoch_corrects}')
scheduler.step(epoch_loss)

Categories

Resources