Pytorch CNN - no improvement in accuracy during training - python

I've built a CNN using Pytorch and am attempting to train it to classify dog and cat images from this Kaggle dataset.
The training loss starts at ~9 after the first epoch and then gets stuck at ~0.69 from the second epoch onwards. The testing loss and the accuracy are stack at ~0.69 and ~50% throughout the training.
At the moment my parameters are as follows:
batch_size = 128
num_epochs = 10
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adamax(model.parameters(), lr=learning_rate)
I've tried changing the batch size, the optimizer and the learning rate. I've attached my code for creating the dataset and the CNN, and the training loop below.
Definition of dataset class and transforms for augmentations
class CatDogDataset(Dataset):
def __init__(self, images_list, mode="train", transform=None):
self.images_list = images_list
self.mode = mode
self.transform = transform
# dataset length
def __len__(self):
self.dataset_len = len(self.images_list)
return self.dataset_len
# load an image
def __getitem__(self, idx):
image_name = self.images_list[idx]
image = Image.open(image_name)
image = image.resize((224,224)) # this is important when feeding into a pretrained model
transformed_image = self.transform(image)
image_category = image_name.split("/")[-1].split(".")[0]
if self.mode == "train" or self.mode == "val":
if image_category == "cat":
label = 0
else:
label = 1
return transformed_image, label
else:
return transformed_image
train_transforms = transforms.Compose([
transforms.RandomHorizontalFlip(p=0.5),
transforms.RandomRotation(15),
transforms.RandomResizedCrop(224, scale=(0.8,1.0),ratio=(1.0,1.0)),
transforms.ToTensor(),
transforms.Normalize((0, 0, 0),(1, 1, 1))
])
val_transforms = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0, 0, 0),(1, 1, 1))
])
CNN class definition
class CNN(nn.Module):
def __init__(self):
super().__init__()
self.cnn_layers = nn.Sequential(
# convolutional layer 1
nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, padding=0, stride=1),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2),
# convolutional layer 2
nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, padding=0, stride=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2),
# convolutional layer 3
nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, padding=0, stride=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2)
)
self.linear_layers = nn.Sequential(
nn.Linear(in_features=64 * 24 * 24, out_features=10),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(in_features=10, out_features=2)
)
def forward(self, x):
out = self.cnn_layers(x)
#print(out.shape)
out = out.view(-1, 64 * 24 * 24) # flatten
out = self.linear_layers(out)
return out
Model training and validation
from tqdm import tqdm
train_losses = []
val_losses = []
accuracy_list = []
for epoch in range(num_epochs):
# perform training on train set
model.train()
running_loss = 0
for images, labels in tqdm(train_dataloader):
# load to gpu
images = images.to(device)
labels = labels.to(device)
# forward pass
outputs = model(images)
loss = criterion(outputs, labels)
running_loss += loss.item()
# backprop and update model params
optimizer.zero_grad()
loss.backward()
optimizer.step()
# calculate training loss for the epoch
train_losses.append(running_loss / len(train_dataloader))
# calculate loss accuracy on validation set
model.eval()
running_loss = 0
num_correct = 0
num_predictions = 0
with torch.no_grad():
for images, labels in tqdm(val_dataloader):
# load to gpu
images = images.to(device)
labels = labels.to(device)
# forward pass
outputs = model(images)
loss = criterion(outputs, labels)
running_loss += loss.item()
# calculate accuracy for batch
_, predicted = torch.max(outputs.data, 1)
num_correct += (predicted == labels).sum().item()
num_predictions += labels.size(0)
# calculate val loss for epoch
val_losses.append(running_loss / len(val_dataloader))
# calculate accuracy for epoch
accuracy = num_correct / num_predictions * 100
accuracy_list.append(accuracy)
print("[Epoch: %d / %d], [Train loss: %.4f], [Test loss: %.4f], [Acc: %.2f]" \
%(epoch+1, num_epochs, train_losses[-1], val_losses[-1], accuracy))

Related

Neural network keep predicting the same number

I have a ROS application where a camera node sends an image via service to a neutral network node. My training and validation dataset I use is the MNIST database. It should be very easy to predict a number, but the neural network returns the same number for every single service request.
ai_service.py
class AiService():
def __init__(self, save_path):
self.batch_size = 2800
self.epochs = 25
self.learning_rate = 0.01
self.training_data = torch.utils.data.DataLoader(datasets.MNIST(root='./data', train=True, download=True,
transform=transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))])), 200, shuffle=True)
self.validation_data = torch.utils.data.DataLoader(datasets.MNIST(root='./data', train=False, download=True,
transform=transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))])), 200, shuffle=True)
...
# Function to train the mnist dataset.
def training(self):
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(self.model.parameters(), self.learning_rate)
start_time = time()
for epoch in range(self.epochs):
running_loss = 0
# trainig phase
for images, labels in self.training_data:
optimizer.zero_grad()
image, label = images.to(self.device), labels.to(self.device)
output = self.model(image)
loss = criterion(output, label)
loss.backward()
optimizer.step() #optimizing weights
running_loss += loss.item()
else:
print("Epoch {} - Training loss: {:.10f}".format(epoch, running_loss / len(self.training_data)))
print("\nTraining Time (in minutes): {:.2f} =".format((time() - start_time) / 60))
def validating(self, request_image):
self.model.eval()
tensor_image = self.image_to_tensor(request_image)
with torch.no_grad():
output = self.model(tensor_image)
return output.cpu().data.numpy().argmax()
def image_to_tensor(self, request_image):
return transforms.ToTensor()(self.cv_bridge.imgmsg_to_cv2(request_image, 'mono8'))
neural_network.py
class NeuralNetwork(nn.Module):
# Initializes the Neural Network by setting up the layers.
def __init__(self):
super().__init__()
self.flatten = nn.Flatten()
self.input_layer = nn.Sequential(nn.Linear(28*28, 512))
self.hidden_layer1 = nn.Linear(512, 254)
self.hidden_layer2 = nn.Linear(254, 128)
self.output_layer = nn.Linear(128, 10)
def forward(self, x):
x = self.flatten(x)
x = F.relu(self.input_layer(x))
x = F.relu(self.hidden_layer1(x))
x = F.relu(self.hidden_layer2(x))
x = self.output_layer(x)
return F.log_softmax(x, 1)
I get get a training accuracy of:
My output:
My camera image:
Could it be because of the resizing and grayscaling that the picture is not recognized? I just added imshow to the def image_to_tensor(self, request_image): function and the image is barely recognisable.

How can I solve the shpe and reconstract CNN for this project?

when I train this network on medical images data
-train
-benign
-normal
-cancer
-test
-benign
-normal
-cancer
-valid
-benign
-normal
-cancer
I get an error when I do training
this is data loading.
import os
import torch
from torchvision import datasets, transforms
### TODO: Write data loaders for training, validation, and test sets
## Specify appropriate transforms, and batch_sizes
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 32
data_transform_train = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
data_transform_test = transforms.Compose([
transforms.Resize(234),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
data_dir = '/content/drive/MyDrive/COVID-19 Database/COVID'
train_dir = os.path.join(data_dir, 'train')
valid_dir = os.path.join(data_dir, 'valid')
test_dir = os.path.join(data_dir, 'test')
train_data = datasets.ImageFolder(train_dir, transform=data_transform_train)
valid_data = datasets.ImageFolder(valid_dir, transform=data_transform_test)
test_data = datasets.ImageFolder(test_dir, transform=data_transform_test)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, num_workers=num_workers, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, num_workers=num_workers, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, num_workers=num_workers, shuffle=True)
loaders_scratch = {
'train' : train_loader,
'valid' : valid_loader,
'test' : test_loader
}
make a model here from scratch
import torch.nn as nn
import torch.nn.functional as F
# define the CNN architecture
class Net(nn.Module):
### TODO: choose an architecture, and complete the class
def __init__(self):
super(Net, self).__init__()
## Define layers of a CNN
self.conv1 = nn.Conv2d(1, 128, 3) #(224-3)/1+1= 222
self.conv2 = nn.Conv2d(128, 64, 3) #110 after pooling with (2,2) ==>(110-3)/1+1=108
self.conv3 = nn.Conv2d(64, 64, 3) # 54 after pooling with (2,2) ==> 110/2=54 ==>(54-3)/1+1=52
self.conv4 = nn.Conv2d(64, 32, 3) # 26 after pooling with (2,2) ==> 52/2=26 ==>(26-3)/1+1=24
self.conv5 = nn.Conv2d(32, 16, 3) # 12 after pooling with (2,2) ==> 24/2=12 ==> (12-3)/1+1=10
self.conv6 = nn.Conv2d(16, 8, 3) # 5 after pooling with (2,2) ==> 10/2=2
self.pool = nn.MaxPool2d(2, 2)
self.fc1 = nn.Linear(8 * 5 * 5, 160) #8 is a out_channel(number of filter) of last conv layer and 5 is the output of last conv layer after pooling(200 input to fc1)
self.fc2 = nn.Linear(160, 3) #166 is the output of the fc1 as input to fc2 and 133 output classes
self.dropout25 = nn.Dropout(p=0.5) # 50% dropout of nodes
self.softmax = nn.Softmax(dim = 1)
def forward(self, x):
## Define forward behavior
x = F.relu(self.conv1(x))
x = self.pool(F.relu(self.conv2(x)))
x = self.pool(F.relu(self.conv3(x)))
x = self.pool(F.relu(self.conv4(x)))
x = self.pool(F.relu(self.conv5(x)))
x = self.pool(F.relu(self.conv6(x)))
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = self.dropout25(x)
x = self.fc2(x)
x = self.softmax(x)
return x
#-#-# You so NOT have to modify the code below this line. #-#-#
# instantiate the CNN
model_scratch = Net()
use_cuda = torch.cuda.is_available()
# move tensors to GPU if CUDA is available
if use_cuda:
model_scratch.cuda()
print(model_scratch)
here I define loss and optimizer
import torch.optim as optim
### TODO: select loss function
criterion_scratch = nn.CrossEntropyLoss()
### TODO: select optimizer
optimizer_scratch = optim.Adam(model_scratch.parameters(), lr = 0.001)
make a training and the error i appear here
import numpy as np
def train(n_epochs, loaders, model, optimizer, criterion,use_cuda,save_path):
"""returns trained model"""
# initialize tracker for maxi validation loss
valid_loss_min = np.Inf
for epoch in range(1, n_epochs+1):
# initialize variables to monitor training and validation loss
train_loss = 0.0
valid_loss = 0.0
###################
# train the model #
###################
model.train()
for batch_idx, (data, target) in enumerate(loaders['train']):
# move to GPU
if use_cuda:
data, target = data.cuda(), target.cuda()
## find the loss and update the model parameters accordingly
## record the average training loss, using something like
## train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))
optimizer.zero_grad()
output = model(data)
loss = criterion(output,target)
loss.backward()
optimizer.step()
train_loss += loss.item()*data.size(0)
######################
# validate the model #
######################
model.eval()
for batch_idx, (data, target) in enumerate(loaders['valid']):
# move to GPU
if use_cuda:
data, target = data.cuda(), target.cuda()
## update the average validation loss
output = model(data)
loss = criterion(output,target)
valid_loss += loss.item()*data.size(0)
train_loss = train_loss/len(loaders['train'].dataset)
valid_loss = valid_loss/len(loaders['valid'].dataset)
# print training/validation statistics
print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
epoch,
train_loss,
valid_loss
))
## TODO: save the model if validation loss has decreased
if valid_loss <= valid_loss_min:
print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
valid_loss_min,
valid_loss))
torch.save(model.state_dict(),save_path)
valid_loss_min = valid_loss
# return trained model
return model
# train the model
model_scratch = train(15, loaders_scratch, model_scratch, optimizer_scratch,
criterion_scratch, use_cuda, 'model_scratch.pt')
# load the model that got the best validation accuracy
model_scratch.load_state_dict(torch.load('model_scratch.pt'))
and this is an error
RuntimeError Traceback (most recent call last)
<ipython-input-4-63f181ccccc5> in <module>()
66 # train the model
67 model_scratch = train(15, loaders_scratch, model_scratch, optimizer_scratch,
---> 68 criterion_scratch, use_cuda, 'model_scratch.pt')
69
70 # load the model that got the best validation accuracy
5 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/conv.py in _conv_forward(self, input, weight, bias)
394 _pair(0), self.dilation, self.groups)
395 return F.conv2d(input, weight, bias, self.stride,
--> 396 self.padding, self.dilation, self.groups)
397
398 def forward(self, input: Tensor) -> Tensor:
RuntimeError: Given groups=1, weight of size [128, 1, 3, 3], expected input[32, 3, 224, 224] to have 1 channels, but got 3 channels instead
its because you have a model definition which have 1 channel...and your datasets class have images of 3 channels
So in your model should be written as
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
## Define layers of a CNN
self.conv1 = nn.Conv2d(3, 128, 3) #(224-3)/1+1= 222
self.conv2 = nn.Conv2d(128, 64, 3) #110 after pooling with (2,2) ==>(110-3)/1+1=108
self.conv3 = nn.Conv2d(64, 64, 3)
.
.
.
in short make self.conv1 = nn.Conv2d(1, 128, 3) to this self.conv1 = nn.Conv2d(3, 128, 3) #(224-3)/1+1= 222
EDIT : Until you do this (below code) ,Your images will still be in 3 channel
data_transform = transforms.Compose([transforms.Grayscale(num_output_channels=1),
transforms.ToTensor()])
dataset = ImageFolder(root, transform=data_transform)
Hence the above code is necessary to make single channel input

Why pytorch gradients are array but not vector?

I am trying to calculate the dot of gradients of the same layer of two different epochs but when I am using print(model.layer1[0].weight.grad) it returns
tensor([[[[-1.1855e-03, -3.7884e-03, -2.8973e-03, -2.8847e-03, -9.6510e-04],
[-2.0213e-03, -4.4927e-03, -5.4852e-03, -6.6060e-03, -3.5726e-03],
[ 7.4499e-04, -1.8440e-03, -5.0472e-03, -5.6322e-03, -1.9532e-03],
[-4.5696e-04, 9.6445e-04, -1.4923e-03, -2.9467e-03, -1.4610e-03],
[ 2.4987e-04, 2.2086e-03, -7.6576e-04, -2.7009e-03, -2.8571e-03]]],
[[[ 2.1447e-03, 3.1090e-03, 6.8175e-03, 6.4778e-03, 3.0501e-03],
[ 2.0214e-03, 3.9936e-03, 7.9528e-03, 6.0224e-03, 1.7545e-03],
[ 3.8781e-03, 5.6659e-03, 6.6901e-03, 5.4041e-03, 7.8014e-04],
[ 4.4273e-03, 3.4548e-03, 5.7185e-03, 4.1650e-03, 9.9067e-04],
[ 4.6075e-03, 4.1176e-03, 6.8392e-03, 3.4005e-03, 1.0009e-03]]],
[[[-3.8654e-04, -2.9567e-03, -6.1341e-03, -8.3991e-03, -8.2343e-03],
[-2.9113e-03, -5.4605e-03, -6.3008e-03, -8.2075e-03, -9.6702e-03],
[-1.5218e-03, -4.4105e-03, -5.5651e-03, -6.8926e-03, -6.6076e-03],
[-6.0357e-04, -3.1118e-03, -4.4441e-03, -4.0519e-03, -3.9733e-03],
[-2.8683e-04, -1.6281e-03, -4.2213e-03, -5.5304e-03, -5.0142e-03]]],
[[[-3.7607e-04, -1.7234e-04, -1.4569e-03, -3.5825e-04, 1.4530e-03],
[ 2.6226e-04, 8.5076e-04, 1.2195e-03, 2.7885e-03, 2.5953e-03],
[-7.7404e-04, 1.0984e-03, 7.8208e-04, 5.1286e-03, 4.6842e-03],
[-1.8183e-03, 8.9730e-04, 1.0955e-03, 4.9259e-03, 6.4677e-03],
[ 1.1674e-03, 4.0651e-03, 4.5886e-03, 8.3678e-03, 8.9893e-03]]],
Are that the gradients? If yes, why they are not vector? Below there is my neural network
class ConvNet(nn.Module):
def __init__(self):
super(ConvNet, self).__init__()
self.layer1 = nn.Sequential(
nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=2),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.layer2 = nn.Sequential(
nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.drop_out = nn.Dropout()
self.fc1 = nn.Linear(7 * 7 * 64, 1000)
self.fc2 = nn.Linear(1000, 10)
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = out.reshape(out.size(0), -1)
out = self.drop_out(out)
out = self.fc1(out)
out = self.fc2(out)
return out
Below is the code of how I train and compute the gradients
model = ConvNet()
klisi=[]
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
total_step = len(train_loader)
loss_list = []
acc_list = []
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(train_loader):
# Run the forward pass
outputs = model(images)
loss = criterion(outputs, labels)
loss_list.append(loss.item())
# Backprop and perform Adam optimisation
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Track the accuracy
total = labels.size(0)
_, predicted = torch.max(outputs.data, 1)
correct = (predicted == labels).sum().item()
acc_list.append(correct / total)
if (i + 1) % 100 == 0:
print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
.format(epoch + 1, num_epochs, i + 1, total_step, loss.item(),
(correct / total) * 100))
print(model.layer1[0].weight.grad)
klisi.append(model.layer1[0].weight.grad)
print(optimizer.param_groups[0]['lr'])
optimizer.param_groups[0]['lr'] *= 0.9999

Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

I'm trying to implement ResNet18 on pyTorch but I'm having some troubles with it. My code is this:
device = torch.device("cuda:0")
class ResnetBlock(nn.Module):
def __init__(self, strides, nf, nf0, reps, bn):
super(ResnetBlock, self).__init__()
self.adapt = strides == 2
self.layers = []
self.relus = []
self.adapt_layer = nn.Conv2d(nf0, nf, kernel_size=1, stride=strides, padding=0) if self.adapt else None
for i in range(reps):
self.layers.append(nn.Sequential(
nn.Conv2d(nf0, nf, kernel_size=3, stride=strides, padding=1),
nn.BatchNorm2d(nf, eps=0.001, momentum=0.99),
nn.ReLU(),
nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(nf, eps=0.001, momentum=0.99)))
self.relus.append(nn.ReLU())
strides = 1
nf0 = nf
def forward(self, x):
for i, (layer, relu) in enumerate(zip(self.layers, self.relus)):
rama = layer(x)
if self.adapt and i == 0:
x = self.adapt_layer(x)
x = x + rama
x = relu(x)
return x
class ConvNet(nn.Module):
def __init__(self):
super(ConvNet, self).__init__()
self.layer1 = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
nn.MaxPool2d(kernel_size=2, stride=2))
self.blocks = nn.Sequential(
ResnetBlock(1, 64, 64, 2, bn),
ResnetBlock(2, 128, 64, 2, bn),
ResnetBlock(2, 256, 128, 2, bn),
ResnetBlock(2, 512, 256, 2, bn))
self.fcout = nn.Linear(512, 10)
def forward(self, x):
out = self.layer1(x)
out = self.blocks(out)
out = out.reshape(out.size(0), -1)
out = self.fcout(out)
return out
num_epochs = 50
num_classes = 10
batch_size = 50
learning_rate = 0.00001
trans = transforms.ToTensor()
train_dataset = torchvision.datasets.CIFAR10(root="./dataset_pytorch", train=True, download=True, transform=trans)
test_dataset = torchvision.datasets.CIFAR10(root="./dataset_pytorch", train=False, download=True, transform=trans)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
def weights_init(m):
if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight.data)
nn.init.zeros_(m.bias.data)
model = ConvNet()
model.apply(weights_init)
model.to(device)
summary(model, (3,32,32))
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=1e-6)
# Train the model
total_step = len(train_loader)
loss_list = []
acc_list = []
acc_list_test = []
for epoch in range(num_epochs):
total = 0
correct = 0
for i, (images, labels) in enumerate(train_loader):
images = images.to(device)
labels = labels.to(device)
optimizer.zero_grad()
# Run the forward pass
outputs = model(images)
loss = criterion(outputs, labels)
loss_list.append(loss.item())
# Backprop and perform Adam optimisation
loss.backward()
optimizer.step()
# Track the accuracy
total += labels.size(0)
_, predicted = torch.max(outputs.data, 1)
correct += (predicted == labels).sum().item()
acc_list.append(correct / total)
print("Train")
print('Epoch [{}/{}], Accuracy: {:.2f}%'
.format(epoch + 1, num_epochs, (correct / total) * 100))
total_test = 0
correct_test = 0
for i, (images, labels) in enumerate(test_loader):
images = images.to(device)
labels = labels.to(device)
# Run the forward pass
outputs = model(images)
# Track the accuracy
total_test += labels.size(0)
_, predicted = torch.max(outputs.data, 1)
correct_test += (predicted == labels).sum().item()
acc_list_test.append(correct_test / total_test)
print("Test")
print('Epoch [{}/{}], Accuracy: {:.2f}%'
.format(epoch + 1, num_epochs, (correct_test / total_test) * 100))
It's weird because it's throwing me that error Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same even though I've moved both the model and the data to cuda.
I guess it's related with how I defined or used "ResnetBlock", because if I remove from ConvNet those blocks (removing the line out = self.blocks(out)), the code works. But I don't know what I'm doing wrong.
The problem is in this line:
model.to(device)
to is not in-place. It returns the converted model. You need to change it to:
model = model.to(device)
EDIT: Another problem: vanilla list cannot be tracked by PyTorch. You need to use nn.ModuleList.
From
self.layers = []
self.relus = []
To
self.layers = nn.ModuleList()
self.relus = nn.ModuleList()

How can I save my training progress in PyTorch for a certain batch no.?

I'm simply trying to train a ResNet18 model using PyTorch library. The training dataset consists of 25,000 images. Therefore, it is taking a lot of time for even the first epoch to complete. Therefore, I want to save the progress after a certain no. of batch iteration is completed. But I can't figure out how to modify my code and how to use the torch.save() and torch.load() functions in my code to save the periodic progress.
My code is given below:
# BUILD THE NETWORK
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
import torchvision
import torchvision.models as models
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
# DOWNLOAD PRETRAINED MODELS ON ImageNet
model_resnet18 = torch.hub.load('pytorch/vision', 'resnet18', pretrained = True)
model_resnet34 = torch.hub.load('pytorch/vision', 'resnet34', pretrained = True)
for name, param in model_resnet18.named_parameters():
if('bn' not in name):
param.requires_grad = False
for name, param in model_resnet34.named_parameters():
if('bn' not in name):
param.requires_grad = False
num_classes = 2
model_resnet18.fc = nn.Sequential(nn.Linear(model_resnet18.fc.in_features, 512),
nn.ReLU(),
nn.Dropout(),
nn.Linear(512, num_classes))
model_resnet34.fc = nn.Sequential(nn.Linear(model_resnet34.fc.in_features, 512),
nn.ReLU(),
nn.Dropout(),
nn.Linear(512, num_classes))
# FUNCTIONS FOR TRAINING AND LOADING DATA
def train(model, optimizer, loss_fn, train_loader, val_loader, epochs = 5, device = "cuda"):
print("Inside Train Function\n")
for epoch in range(epochs):
print("Epoch : {} running".format(epoch))
training_loss = 0.0
valid_loss = 0.0
model.train()
k = 0
for batch in train_loader:
optimizer.zero_grad()
inputs, targets = batch
inputs = inputs.to(device)
output = model(inputs)
loss = loss_fn(output, targets)
loss.backward()
optimizer.step()
training_loss += loss.data.item() * inputs.size(0)
print("End of batch loop iteration {} \n".format(k))
k = k + 1
training_loss /= len(train_loader.dataset)
model.eval()
num_correct = 0
num_examples = 0
for batch in val_loader:
inputs, targets = batch
inputs.to(device)
output = model(inputs)
targets = targets.to(device)
loss = loss_fn(output, targets)
valid_loss += loss.data.item() * inputs.size(0)
correct = torch.eq(torch.max(F.softmax(output, dim = 1), dim = 1)[1], targets).view(-1)
num_correct += torch.sum(correct).item()
num_examples += correct.shape[0]
valid_loss /= len(val_loader.dataset)
print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}, accuracy = {:.4f}'.format(epoch, training_loss, valid_loss, num_correct / num_examples))
batch_size = 32
img_dimensions = 224
img_transforms = transforms.Compose([ transforms.Resize((img_dimensions, img_dimensions)),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])
img_test_transforms = transforms.Compose([ transforms.Resize((img_dimensions, img_dimensions)),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])
def check_image(path):
try:
im = Image.open(path)
return True
except:
return False
train_data_path = "E:\Image Recognition\dogsandcats\\train\\"
train_data = torchvision.datasets.ImageFolder(root=train_data_path,transform=img_transforms, is_valid_file=check_image)
validation_data_path = "E:\\Image Recognition\\dogsandcats\\validation\\"
validation_data = torchvision.datasets.ImageFolder(root=validation_data_path,transform=img_test_transforms, is_valid_file=check_image)
test_data_path = "E:\\Image Recognition\\dogsandcats\\test\\"
test_data = torchvision.datasets.ImageFolder(root=test_data_path,transform=img_test_transforms, is_valid_file=check_image)
num_workers = 6
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
validation_data_loader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_data_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
print(torch.cuda.is_available(), "\n")
if torch.cuda.is_available():
device = torch.device("cuda")
else:
device = torch.device("cpu")
print(f'Num training images: {len(train_data_loader.dataset)}')
print(f'Num validation images: {len(validation_data_loader.dataset)}')
print(f'Num test images: {len(test_data_loader.dataset)}')
def test_model(model):
print("Inside Test Model Function\n")
correct = 0
total = 0
with torch.no_grad():
for data in test_data_loader:
images, labels = data[0].to(device), data[1].to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('correct: {:d} total: {:d}'.format(correct, total))
print('accuracy = {:f}'.format(correct / total))
model_resnet18.to(device)
optimizer = optim.Adam(model_resnet18.parameters(), lr=0.001)
if __name__ == "__main__":
train(model_resnet18, optimizer, torch.nn.CrossEntropyLoss(), train_data_loader, validation_data_loader, epochs=2, device=device)
test_model(model_resnet18)
model_resnet34.to(device)
optimizer = optim.Adam(model_resnet34.parameters(), lr=0.001)
if __name__ == "__main__":
train(model_resnet34, optimizer, torch.nn.CrossEntropyLoss(), train_data_loader, validation_data_loader, epochs=2, device=device)
test_model(model_resnet34)
import os
def find_classes(dir):
classes = os.listdir(dir)
classes.sort()
class_to_idx = {classes[i]: i for i in range(len(classes))}
return classes, class_to_idx
def make_prediction(model, filename):
labels, _ = find_classes('E:\\Image Recognition\\dogsandcats\\test\\test')
img = Image.open(filename)
img = img_test_transforms(img)
img = img.unsqueeze(0)
prediction = model(img.to(device))
prediction = prediction.argmax()
print(labels[prediction])
make_prediction(model_resnet34, 'E:\\Image Recognition\\dogsandcats\\test\\test\\3.jpg') #dog
make_prediction(model_resnet34, 'E:\\Image Recognition\\dogsandcats\\test\\test\\5.jpg') #cat
torch.save(model_resnet18.state_dict(), "./model_resnet18.pth")
torch.save(model_resnet34.state_dict(), "./model_resnet34.pth")
# Remember that you must call model.eval() to set dropout and batch normalization layers to
# evaluation mode before running inference. Failing to do this will yield inconsistent inference results.
resnet18 = torch.hub.load('pytorch/vision', 'resnet18')
resnet18.fc = nn.Sequential(nn.Linear(resnet18.fc.in_features,512),nn.ReLU(), nn.Dropout(), nn.Linear(512, num_classes))
resnet18.load_state_dict(torch.load('./model_resnet18.pth'))
resnet18.eval()
resnet34 = torch.hub.load('pytorch/vision', 'resnet34')
resnet34.fc = nn.Sequential(nn.Linear(resnet34.fc.in_features,512),nn.ReLU(), nn.Dropout(), nn.Linear(512, num_classes))
resnet34.load_state_dict(torch.load('./model_resnet34.pth'))
resnet34.eval()
# Test against the average of each prediction from the two models
models_ensemble = [resnet18.to(device), resnet34.to(device)]
correct = 0
total = 0
if __name__ == '__main__':
with torch.no_grad():
for data in test_data_loader:
images, labels = data[0].to(device), data[1].to(device)
predictions = [i(images).data for i in models_ensemble]
avg_predictions = torch.mean(torch.stack(predictions), dim=0)
_, predicted = torch.max(avg_predictions, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
if total != 0:
print('accuracy = {:f}'.format(correct / total))
print('correct: {:d} total: {:d}'.format(correct, total))
To be very precise, I want to save my progress at the end of for batch in train_loader: loop, for say k = 1500.
If anyone can guide me about modifying my code so that I can save my progress and resume it later, then it will be a great and highly appreciated.
Whenever you want to save your training progress, you need to save two things:
Your model's state dict
Your optimizer's state dict
This can be done in the following way:
def save_checkpoint(model, optimizer, save_path, epoch):
torch.save({
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'epoch': epoch
}, save_path)
To resume training, you can restore your model and optimizer's state dict.
def load_checkpoint(model, optimizer, load_path):
checkpoint = torch.load(load_path)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
return model, optimizer, epoch
You can save your model at any point in training, wherever you need to. However, it should be ideal to save after finishing an epoch.

Categories

Resources