VGG parameters not updating - python

I am trying to train the VGG network from PyTorch to build a predictive model for the FashionMNIST dataset. But when I print the gradients out, it seems that the parameters are not updating and the gradient is always zero. Here is my implementation
## Specify Batch Size
train_batch_size = 32
test_batch_size = 32
## Specify Image Transforms
img_transform = transforms.Compose([
transforms.Resize((64,64)),
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,))
])
## Download Datasets
train_data = FashionMNIST('./data', transform=img_transform, download=True, train=True)
test_data = FashionMNIST('./data', transform=img_transform, download=True, train=False)
## Initialize Dataloaders
training_dataloader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=test_batch_size, shuffle=True)
vgg16 = models.vgg16()
model_a = vgg16
model_a.classifier[6] = nn.Linear(4096, 10) # to match the output dimension FashionMNIST
model_a.cuda()
# Hyperparameters and weights init
num_epochs = 50
batch_size = 196 #64
learning_rate = 1e-3
def init_weights(m):
if isinstance(m, nn.Linear):
torch.nn.init.xavier_uniform(m.weight)
m.bias.data.fill_(0.01)
print("Weights initialized using xavier_uniform")
model_a.apply(init_weights)
optimizer = torch.optim.Adam(model_a.parameters(), lr=learning_rate, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
criterion = nn.CrossEntropyLoss()
# Training
for epoch in tqdm(range(num_epochs)):
for i, (images, labels) in enumerate(training_dataloader):
images = torch.cat((images, images, images),1)
optimizer.zero_grad()
outputs = model_a(images.cuda())
loss = criterion(outputs, labels.cuda())
loss.backward()
optimizer.step()

Changing the training loop to this will resolve the issue:
for i, (images, labels) in enumerate(training_dataloader):
images = Variable(images.to(device))
labels = Variable(labels.to(device))
images = torch.cat((images, images, images),1)
optimizer.zero_grad()
outputs = model_b(images)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()

Related

Why is my output predicting the same label using pretrained Alexnet in pytorch?

I'm attempting to use a pretrained alexnet model for CIFAR10 dataset however it always predicts everything as the same class. I use the exact same code except using alexnet untrained and it works as intended. Why is it doing this?
Here is my code:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
net = models.alexnet(pretrained=True).to(device)
transform = transforms.Compose(
[transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
shuffle=True, num_workers=2, )
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
shuffle=True, num_workers=2)
classes = ('plane', 'car', 'bird', 'cat',
'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.005, momentum=0.9)
for epoch in range(3): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
inputs, labels = inputs.to(device),labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
running_loss = 0.0
print('Finished Training')
After the code I print the accuracy of each class. And it predicts every image as a plane.
Pytorch AlexNet was trained on ImageNet so the classifier is with 1000 classes.
CIFAR10 is a 10 classes dataset.
You should create a new classifier before training with CIFAR10.
I found this post By Dr. Vaibhav Kumar that should explain how to do so.

Very high validation loss/small train loss in Pytorch, while finetuning resnet 50

I am training model to classify 2 types of images. I have decided to take a transfer-learning approach, freeze every part of resnet50 and new layer and start finetuning process. My dataset is not perfectly balanced but i used weights for that purpose.Please take a look at validation loss vs training loss graph. It seems to be extremely inconsitent. Could you please take a look at my code? I am new to Pytorch, maybe there is something wrong with my method and code. Final accuracy tested on test set is 86%. Thank you!
learning_rate = 1e-1
num_epochs = 100
patience = 10
batch_size = 100
weights = [4, 1]
model = models.resnet50(pretrained=True)
# Replace last layer
num_features = model.fc.in_features
model.fc = nn.Sequential(
nn.Linear(num_features, 512),
nn.ReLU(inplace=True),
nn.Linear(512, 64),
nn.Dropout(0.5, inplace=True),
nn.Linear(64, 2))
class_weights = torch.FloatTensor(weights).cuda()
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
running_loss = 0
losses = []
# To freeze the residual layers
for param in model.parameters():
param.requires_grad = False
for param in model.fc.parameters():
param.requires_grad = True
# Find total parameters and trainable parameters
total_params = sum(p.numel() for p in model.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(
p.numel() for p in model.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')
24,590,082 total parameters.
1,082,050 training parameters.
# initialize the early_stopping object
early_stopping = pytorchtools.EarlyStopping(patience=patience, verbose=True)
for epoch in range(num_epochs):
##########################
#######TRAIN MODEL########
##########################
epochs_loss=0
##Switch to train mode
model.train()
for i, (images, labels) in enumerate(train_dl):
# Move tensors to the configured device
images = images.to(device)
labels = labels.to(device)
# Forward pass
# Backprpagation and optimization
optimizer.zero_grad()
outputs = model(images).to(device)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
#calculate train_loss
train_losses.append(loss.item())
##########################
#####VALIDATE MODEL#######
##########################
model.eval()
for images, labels in val_dl:
images = images.to(device)
labels = labels.to(device)
outputs = model(images).to(device)
loss = criterion(outputs,labels)
valid_losses.append(loss.item())
# print training/validation statistics
# calculate average loss over an epoch
train_loss = np.average(train_losses)
valid_loss = np.average(valid_losses)
# print(train_loss)
avg_train_losses.append(train_loss)
avg_valid_losses.append(valid_loss)
print_msg = (f'train_loss: {train_loss:.5f} ' + f'valid_loss: {valid_loss:.5f}')
print(print_msg)
# clear lists to track next epoch
train_losses = []
valid_losses = []
early_stopping(valid_loss, model)
print(epoch)
if early_stopping.early_stop:
print("Early stopping")
break

Extract features from last hidden layer Pytorch Resnet18

I am implementing an image classifier using the Oxford Pet dataset with the pre-trained Resnet18 CNN.
The dataset consists of 37 categories with ~200 images in each of them.
Rather than using the final fc layer of the CNN as output to make predictions I want to use the CNN as a feature extractor to classify the pets.
For each image i'd like to grab features from the last hidden layer (which should be before the 1000-dimensional output layer). My model is using Relu activation so I should grab the output just after the ReLU (so all values will be non-negative)
Here is code (following the transfer learning tutorial on Pytorch):
loading data
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
image_datasets = {"train": datasets.ImageFolder('images_new/train', transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize
])), "test": datasets.ImageFolder('images_new/test', transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize
]))
}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
shuffle=True, num_workers=4, pin_memory=True)
for x in ['train', 'test']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'test']}
train_class_names = image_datasets['train'].classes
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train function
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'test']:
if phase == 'train':
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'test' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
Compute SGD cross-entropy loss
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
print("number of features: ", num_ftrs)
model_ft.fc = nn.Linear(num_ftrs, len(train_class_names))
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
num_epochs=24)
Now how do I get a feature vector from the last hidden layer for each of my images? I know I have to freeze the previous layer so that gradient isn't computed on them but I'm having trouble extracting the feature vectors.
My ultimate goal is to use those feature vectors to train a linear classifier such as Ridge or something like that.
Thanks!
You can try the approach below. This will work for any layer with only a change of offset.
model_ft = models.resnet18(pretrained=True)
### strip the last layer
feature_extractor = torch.nn.Sequential(*list(model_ft.children())[:-1])
### check this works
x = torch.randn([1,3,224,224])
output = feature_extractor(x) # output now has the features corresponding to input x
print(output.shape)
torch.Size([1, 512, 1, 1])
This is probably not the best idea, but you can do something like this:
#assuming model_ft is trained now
model_ft.fc_backup = model_ft.fc
model_ft.fc = nn.Sequential() #empty sequential layer does nothing (pass-through)
# or model_ft.fc = nn.Identity()
# now you use your network as a feature extractor
I also checked fc is the right attribute to change, look at forward
If you know the name of your layer (eg layer4 in resnet), you can use hooks:
def get_hidden_features(x, layer):
activation = {}
def get_activation(name):
def hook(m, i, o):
activation[name] = o.detach()
return hook
model.register_forward_hook(get_activation(layer))
_ = model(x)
return activation[layer]
get_features(inputs, "layer4")
Example: https://discuss.pytorch.org/t/how-can-i-extract-intermediate-layer-output-from-loaded-cnn-model/77301/3

Size mismatch for fc.bias and fc.weight in PyTorch

I used the transfer learning approach to train a model and saved the best-detected weights. In another script, I tried to use the saved weights for prediction. But I am getting errors as follows. I have used ResNet for finetuning the network and have 4 classes.
RuntimeError: Error(s) in loading state_dict for ResNet:
size mismatch for fc.bias: copying a param of torch.Size([1000]) from
checkpoint, where the shape is torch.Size([4]) in current model.
size mismatch for fc.weight: copying a param of torch.Size([1000,
512]) from checkpoint, where the shape is torch.Size([4, 512]) in
current model.
I am using the following code for prediction of output:
checkpoint = torch.load("./models/custom_model13.model")
model = resnet18(pretrained=True)
model.load_state_dict(checkpoint)
model.eval()
def predict_image(image_path):
transformation = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
image_tensor = transformation(image).float()
image_tensor = image_tensor.unsqueeze_(0)
if torch.cuda.is_available():
image_tensor.cuda()
input = Variable(image_tensor)
output = model(input)
index = output.data.numpy().argmax()
return index
if __name__ == "main":
imagefile = "image.png"
imagepath = os.path.join(os.getcwd(),imagefile)
prediction = predict_image(imagepath)
print("Predicted Class: ",prediction)
And the following code to train and save the model:
Data_dir = 'Dataset'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
data_transforms[x])
for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
shuffle=True, num_workers=4)
for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print (device)
def save_models(epochs, model):
torch.save(model.state_dict(), "custom_model{}.model".format(epochs))
print("Checkpoint Saved")
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'train' and epoch_acc > best_acc:
save_models(epoch,model)
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
def visualize_model(model, num_images=6):
was_training = model.training
model.eval()
images_so_far = 0
fig = plt.figure()
with torch.no_grad():
for i, (inputs, labels) in enumerate(dataloaders['val']):
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
for j in range(inputs.size()[0]):
images_so_far += 1
ax = plt.subplot(num_images//2, 2, images_so_far)
ax.axis('off')
ax.set_title('predicted: {}'.format(class_names[preds[j]]))
imshow(inputs.cpu().data[j])
if images_so_far == num_images:
model.train(mode=was_training)
return
model.train(mode=was_training)
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 4)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
num_epochs=25)
Cause:
You trained a model derived from resnet18 in this way:
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 4)
That is, you changed the last nn.Linear layer to output 4 dim prediction instead of the default 1000.
When you try and load the model for prediction, your code is:
model = resnet18(pretrained=True)
model.load_state_dict(checkpoint)
You did not apply the same change of the last nn.Linear layer to model therefore the checkpoint you are trying to load does not fit.
Fix:
(1) Apply the same change before loading the checkpoint:
model = resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 4) # make the change
model.load_state_dict(checkpoint) # load
(2) Even better, use num_classes argument to construct resnet with the desired number of outputs to begin with:
model = resnet18(pretrained=True, num_classes=4)
model.load_state_dict(checkpoint) # load

Pytorch vs. Keras: Pytorch model overfits heavily

For several days now, I'm trying to replicate my keras training results with pytorch. Whatever I do, the pytorch model will overfit far earlier and stronger to the validation set then in keras. For pytorch I use the same XCeption Code from https://github.com/Cadene/pretrained-models.pytorch.
The dataloading, the augmentation, the validation, the training schedule etc. are equivalent. Am I missing something obvious? There must be a general problem somewhere. I tried thousands of different module constellations, but nothing seems to come even close to the keras training. Can somebody help?
Keras model: val accuracy > 90%
# base model
base_model = applications.Xception(weights='imagenet', include_top=False, input_shape=(img_width, img_height, 3))
# top model
x = base_model.output
x = GlobalMaxPooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(4, activation='softmax')(x)
# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)
# Compile model
from keras import optimizers
adam = optimizers.Adam(lr=0.0001)
model.compile(loss='categorical_crossentropy',
optimizer=adam, metrics=['accuracy'])
# LROnPlateau etc. with equivalent settings as pytorch
Pytorch model: val accuracy ~81%
from xception import xception
import torch.nn.functional as F
# modified from https://github.com/Cadene/pretrained-models.pytorch
class XCeption(nn.Module):
def __init__(self, num_classes):
super(XCeption, self).__init__()
original_model = xception(pretrained="imagenet")
self.features=nn.Sequential(*list(original_model.children())[:-1])
self.last_linear = nn.Sequential(
nn.Linear(original_model.last_linear.in_features, 512),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(512, num_classes)
)
def logits(self, features):
x = F.relu(features)
x = F.adaptive_max_pool2d(x, (1, 1))
x = x.view(x.size(0), -1)
x = self.last_linear(x)
return x
def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x
device = torch.device("cuda")
model=XCeption(len(class_names))
if torch.cuda.device_count() > 1:
print("Let's use", torch.cuda.device_count(), "GPUs!")
# dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
model = nn.DataParallel(model)
model.to(device)
criterion = nn.CrossEntropyLoss(size_average=False)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
Thank you very much!
Update:
Settings:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
model = train_model(model, train_loader, val_loader,
criterion, optimizer, scheduler,
batch_size, trainmult=8, valmult=10,
num_epochs=200, epochs_top=0)
Cleaned training function:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, batch_size, trainmult=1, valmult=1, num_epochs=None, epochs_top=0):
for epoch in range(num_epochs):
for phase in ['train', 'val']:
running_loss = 0.0
running_acc = 0
total = 0
# Iterate over data.
if phase=="train":
model.train(True) # Set model to training mode
for i in range(trainmult):
for data in train_loader:
# get the inputs
inputs, labels = data
inputs, labels = inputs.to(torch.device("cuda")), labels.to(torch.device("cuda"))
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs) # notinception
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
loss.backward()
optimizer.step()
# statistics
total += labels.size(0)
running_loss += loss.item()*labels.size(0)
running_acc += torch.sum(preds == labels)
train_loss=(running_loss/total)
train_acc=(running_acc.double()/total)
else:
model.train(False) # Set model to evaluate mode
with torch.no_grad():
for i in range(valmult):
for data in val_loader:
# get the inputs
inputs, labels = data
inputs, labels = inputs.to(torch.device("cuda")), labels.to(torch.device("cuda"))
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels.data)
# statistics
total += labels.size(0)
running_loss += loss.item()*labels.size(0)
running_acc += torch.sum(preds == labels)
val_loss=(running_loss/total)
val_acc=(running_acc.double()/total)
scheduler.step(val_loss)
return model
it may be because type of weight initialization you are using
otherwise this should not happen
try with same initializer in both the models
self.features=nn.Sequential(*list(original_model.children())[:-1])
Are you sure that this line re-instantiates your model in exactly the same way? You're using a NN.Sequential instead of the original XCeption model's forward function. If there's anything in that forward function that isn't the exact same as using a nn.Sequential, it will not reproduce the same performance.
Instead of wrapping it in a Sequential, you could just change this
my_model = Xception()
# load weights before you change the architecture
my_model = load_weights(path_to_weights)
# overwrite the original's last_linear with your own
my_model.last_linear = nn.Sequential(
nn.Linear(original_model.last_linear.in_features, 512),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(512, num_classes)
)

Categories

Resources