I am trying to save images that I configure during training to the output bucket in sagemaker. I've read that all the information that needs to be saved during training goes into the model.tar.gz file. I've tried saving plots using the model_dir and the output_data_dir to no avail. The model itself is saved properly, but the additional information is not being stored with it. I want to reload this additional information (the saved images) during inference but have heard that storing all the information in the model.tar.gz can cause slow inference. I would love some help.
Here is my estimator
from sagemaker.pytorch import PyTorch
estimator = PyTorch(entry_point='XXXXXXXX/AWS/mnist.py',
role=role,
py_version='py3',
framework_version='1.8.0',
instance_count=1,
instance_type='ml.c5.xlarge',
output_path='s3://XXXXX-bucket/',
)
and the code in mnist.py:
import os
import pandas as pd
import torch
import matplotlib.pyplot as plt
import argparse
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torchvision.io import read_image
from torch import nn
import matplotlib.pyplot as plt
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10),
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
def train_loop(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
# Compute prediction and loss
pred = model(X.to(device))
loss = loss_fn(pred, y.to(device))
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn):
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
pred = model(X.to(device))
test_loss += loss_fn(pred, y.to(device)).item()
correct += (pred.argmax(1) == y.to(device)).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
# Initialize the loss function
if __name__=='__main__':
# default to the value in environment variable `SM_MODEL_DIR`. Using args makes the script more portable.
parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
args, _ = parser.parse_known_args()
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor()
)
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor()
)
labels_map = {
0: "T-Shirt",
1: "Trouser",
2: "Pullover",
3: "Dress",
4: "Coat",
5: "Sandal",
6: "Shirt",
7: "Sneaker",
8: "Bag",
9: "Ankle Boot",
}
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
sample_idx = torch.randint(len(training_data), size=(1,)).item()
img, label = training_data[sample_idx]
figure.add_subplot(rows, cols, i)
plt.title(labels_map[label])
plt.axis("off")
plt.imsave(args.output_data_dir+'plot'+str(i)+'.jpg', img.squeeze(), cmap="gray")
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
# Display image and label.
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
img = train_features[0].squeeze()
label = train_labels[0]
plt.imsave(args.output_data_dir+'sample.jpg', img, cmap="gray")
print("Saved img.")
print(f"Label: {label}")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
model = NeuralNetwork().to(device)
print(model)
learning_rate = 1e-3
batch_size = 64
epochs = 5
# ... train `model`, then save it to `model_dir`
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
epochs = 1
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train_loop(train_dataloader, model, loss_fn, optimizer)
test_loop(test_dataloader, model, loss_fn)
print("Done!")
with open(os.path.join(args.model_dir, 'model.pth'), 'wb') as f:
torch.save(model.state_dict(), f)
plt.plot([1,2,3,4])
plt.ylabel('some numbers')
plt.show()
plt.savefig('test.jpeg')
I suspect there is an issue with string concatenation in plt.imsave because the environment variable SM_OUTPUT_DATA_DIR by default points to /opt/ml/output/data (that's the actual value of args.output_data_dir, since you don't pass this parameter) so the outcome is something like /opt/ml/output/dataplot1.jpg. The same happen if you use the model_dir in the same way. I'd rather use something like os.path.join like you're already doing for the model. here a nice exaplaination about these folders and environment variables in sagemaker.
Related
I want to write a train function in a class for training a model; The following code reported an error; can anyone give me a hint for solving this issue?
import numpy as np
import os
import sys
sys.executable
sys.version
##define a neuralnet class
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
os.chdir("/Users/zhangzhongheng/Downloads/")
os.getcwd()
#Download training data from open datasets.
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor(),
)
Download test data from open datasets.
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor(),
)
batch_size = 64
Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
for X, y in test_dataloader:
print(f"Shape of X [N, C, H, W]: {X.shape}")
print(f"Shape of y: {y.shape} {y.dtype}")
break
Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
Define model
class NeuralNetwork(nn.Module):
def __init__(self):
super().__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10)
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
def model_train(self,dataloader):
self.train()
size = len(dataloader.dataset)
optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)
for batch, (X, y) in enumerate(dataloader):
X, y = X.to(device), y.to(device)
# Compute prediction error
pred = self.forward(X)
loss = nn.CrossEntropyLoss(pred, y)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
Model = NeuralNetwork()
epochs = 5
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
Model.model_train(train_dataloader)
print("Done!")
The above code reported the following error:
Epoch 1
-------------------------------
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
Done!
Changing nn.CrossEntropyLoss to nn.CrossEntropyLoss() should solve this problem.
Refer to the official documentation here.
It will look something like this loss = nn.CrossEntropyLoss()(pred, y)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
import torchvision
from torchvision import datasets
from torchvision import transforms as T # for simplifying the transforms
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, sampler, random_split
from torchvision import models
!pip install timm # kaggle doesnt have it installed by default
import timm
from timm.loss import LabelSmoothingCrossEntropy
import sys
from tqdm import tqdm
import time
import copy
def get_classes(data_dir):
all_data = datasets.ImageFolder(data_dir)
return all_data.classes
def get_data_loaders(data_dir, batch_size, train = False):
if train:
#train
transform = T.Compose([
T.RandomHorizontalFlip(),
T.RandomVerticalFlip(),
T.RandomApply(torch.nn.ModuleList([T.ColorJitter()]), p=0.25),
T.Resize(256),
T.CenterCrop(224),
T.ToTensor(),
T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), # imagenet means
T.RandomErasing(p=0.2, value='random')
])
train_data = datasets.ImageFolder(os.path.join(data_dir, "train/"), transform = transform)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
return train_loader, len(train_data)
else:
# val/test
transform = T.Compose([ # We dont need augmentation for test transforms
T.Resize(256),
T.CenterCrop(224),
T.ToTensor(),
T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), # imagenet means
])
val_data = datasets.ImageFolder(os.path.join(data_dir, "validation/"), transform=transform)
test_data = datasets.ImageFolder(os.path.join(data_dir, "train/"), transform=transform)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=4)
return val_loader, test_loader, len(val_data), len(test_data)
dataset_path = "/kaggle/input/dfdc-faces-of-the-train-sample"
(train_loader, train_data_len) = get_data_loaders(dataset_path, 128, train=True)
(val_loader, test_loader, valid_data_len, test_data_len) = get_data_loaders(dataset_path, 32, train=False)
classes = get_classes("/kaggle/input/dfdc-faces-of-the-train-sample/train")
print(classes, len(classes))
dataloaders = {
"train": train_loader,
"validation": val_loader
}
dataset_sizes = {
"train": train_data_len,
"validation": valid_data_len
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.hub.load('facebookresearch/deit:main', 'deit_tiny_patch16_224', pretrained=True)
for param in model.parameters(): #freeze model
param.requires_grad = False
n_inputs = model.head.in_features
model.head = nn.Sequential(
nn.Linear(n_inputs, 512),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(512, len(classes))
)
model = model.to(device)
print(model.head)
criterion = LabelSmoothingCrossEntropy()
criterion = criterion.to(device)
optimizer = optim.Adam(model.head.parameters(), lr=0.001)
# lr scheduler
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.97)
def train_model(model, criterion, optimizer, scheduler, num_epochs=1):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print(f'Epoch {epoch}/{num_epochs - 1}')
print("-"*10)
for phase in ['train', 'validation']: # We do training and validation phase per epoch
if phase == 'train':
model.train() # model to training mode
else:
model.eval() # model to evaluate
running_loss = 0.0
running_corrects = 0.0
for inputs, labels in tqdm(dataloaders[phase]):
inputs = inputs.to(device)
labels = labels.to(device)
optimizer.zero_grad()
with torch.set_grad_enabled(phase == 'train'): # no autograd makes validation go faster
outputs = model(inputs)
_, preds = torch.max(outputs, 1) # used for accuracy
loss = criterion(outputs, labels)
if phase == 'train':
loss.backward()
optimizer.step()
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
if phase == 'train':
scheduler.step() # step at end of epoch
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print("{} Loss: {:.4f} Acc: {:.4f}".format(phase, epoch_loss, epoch_acc))
if phase == 'validation' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict()) # keep the best validation accuracy model
print()
time_elapsed = time.time() - since # slight error
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print("Best Val Acc: {:.4f}".format(best_acc))
model.load_state_dict(best_model_wts)
return model
model_ft = train_model(model, criterion, optimizer, exp_lr_scheduler)# now it is a lot faster
test_loss = 0.0
class_correct = list(0 for i in range(len(classes)))
class_total = list(0 for i in range(len(classes)))
model.eval()
for data, target in tqdm(test_loader):
data, target = data.to(device), target.to(device)
with torch.no_grad(): # turn off autograd for faster testing
output = model(data)
loss = criterion(output, target)
test_loss = loss.item() * data.size(0)
_, pred = torch.max(output, 1)
correct_tensor = pred.eq(target.data.view_as(pred))
correct = np.squeeze(correct_tensor.cpu().numpy())
if len(target) == 32:
for i in range(32):
label = target.data[i]
class_correct[label] += correct[i].item()
class_total[label] += 1
test_loss = test_loss / test_data_len
print('Test Loss: {:.4f}'.format(test_loss))
for i in range(len(classes)):
if class_total[i] > 0:
print("Test Accuracy of %5s: %2d%% (%2d/%2d)" % (
classes[i], 100*class_correct[i]/class_total[i], np.sum(class_correct[i]), np.sum(class_total[i])
))
else:
print("Test accuracy of %5s: NA" % (classes[i]))
print("Test Accuracy of %2d%% (%2d/%2d)" % (
100*np.sum(class_correct)/np.sum(class_total), np.sum(class_correct), np.sum(class_total)
))
torch.save(model.state_dict(),'checkpoint.pt')
I have copied this code from Kaggle and similarly the data. The dataset is also present on kaggle and it's name is same as used in the code dataset path above. The code just works fine and also makes the model, but I don't know how to make a prediction.
from PIL import Image
import cv2
path_to_model = 'checkpoint.pt'
imgpath='../input/dfdc-faces-of-the-train-sample/train/fake/aapnvogymq_0_0.png'
img=cv2.imread(imgpath)
transform = T.Compose([ # We dont need augmentation for test transforms
T.Resize(256),
T.CenterCrop(224),
T.ToTensor(),
T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), # imagenet means
])
PIL_image = Image.fromarray(np.uint8(img)).convert('RGB')
imge=transform(PIL_image)
imge=imge.view(-1,3,224,224)
print(imge.shape)
model.load_state_dict(torch.load(path_to_model))
#print(summary(model,(3,224,224)))
model.eval()
labels = labels.to(device)
logits = model(imge.to(device))
params = list(model.parameters())
sm = nn.Softmax()
#weight_softmax = model.linear1.weight.detach().cpu().numpy()
logits = sm(logits)
_,prediction = torch.max(logits,1)
confidence = logits[:,int(prediction.item())].item()*100
print('confidence of prediction:',logits[:,int(prediction.item())].item()*100)
print(prediction.item())
I did it myself and I can predict now
I'm new to Neural Networks and I'm trying to train a CNN model on a custom dataset (cats and dogs images in a single directory). So I guess I do the very usual stuff here which is in the most tutorials, but just in case I will give here my full code.
First I generate .csv file to be processed:
import os
import torch
device = ("cuda" if torch.cuda.is_available() else "cpu")
train_df = pd.DataFrame(columns=["img_name","label"])
train_df["img_name"] = os.listdir("train/")
for idx, i in enumerate(os.listdir("train/")):
if "cat" in i:
train_df["label"][idx] = 0
if "dog" in i:
train_df["label"][idx] = 1
train_df.to_csv (r'train_csv.csv', index = False, header=True)
Then I prepare the dataset:
from torch.utils.data import Dataset
import pandas as pd
import os
from PIL import Image
import torch
class CatsAndDogsDataset(Dataset):
def __init__(self, root_dir, annotation_file, transform=None):
self.root_dir = root_dir
self.annotations = pd.read_csv(annotation_file)
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, index):
img_id = self.annotations.iloc[index, 0]
img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
y_label = torch.tensor(float(self.annotations.iloc[index, 1]))
if self.transform is not None:
img = self.transform(img)
return (img, y_label)
This is my model:
import torch.nn as nn
import torchvision.models as models
class CNN(nn.Module):
def __init__(self, train_CNN=False, num_classes=1):
super(CNN, self).__init__()
self.train_CNN = train_CNN
self.inception = models.inception_v3(pretrained=True, aux_logits=False)
self.inception.fc = nn.Linear(self.inception.fc.in_features, num_classes)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(0.5)
self.sigmoid = nn.Sigmoid()
def forward(self, images):
features = self.inception(images)
return self.sigmoid(self.dropout(self.relu(features))).squeeze(1)
This is my hyper-params, transformations and dataloaders:
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
num_epochs = 10
learning_rate = 0.00001
train_CNN = False
batch_size = 32
shuffle = True
pin_memory = True
num_workers = 0
transform = transforms.Compose(
[
transforms.Resize((356, 356)),
transforms.RandomCrop((299, 299)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
dataset = CatsAndDogsDataset("train","train_csv.csv",transform=transform)
print(len(dataset))
train_set, validation_set = torch.utils.data.random_split(dataset,[162,40])
train_loader = DataLoader(dataset=train_set, shuffle=shuffle, batch_size=batch_size,num_workers=num_workers,pin_memory=pin_memory)
validation_loader = DataLoader(dataset=validation_set, shuffle=shuffle, batch_size=batch_size,num_workers=num_workers, pin_memory=pin_memory)
model = CNN().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for name, param in model.inception.named_parameters():
if "fc.weight" in name or "fc.bias" in name:
param.requires_grad = True
else:
param.requires_grad = train_CNN
and accuracy check:
def check_accuracy(loader, model):
if loader == train_loader:
print("Checking accuracy on training data")
else:
print("Checking accuracy on validation data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
predictions = torch.tensor([1.0 if i >= 0.5 else 0.0 for i in scores]).to(device)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
return f"{float(num_correct)/float(num_samples)*100:.2f}"
And this is my training function:
from tqdm import tqdm
def train():
model.train()
for epoch in range(num_epochs):
loop = tqdm(train_loader, total = len(train_loader), leave = True)
if epoch % 2 == 0:
loop.set_postfix(val_acc = check_accuracy(validation_loader, model))
for imgs, labels in loop:
imgs = imgs.to(device)
labels = labels.to(device)
outputs = model(imgs)
loss = criterion(outputs, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
loop.set_description(f"Epoch [{epoch}/{num_epochs}]")
loop.set_postfix(loss = loss.item())
if __name__ == "__main__":
train()
0%| | 0/6 [00:00<?, ?it/s]Checking accuracy on validation data
0%| | 0/6 [01:13<?, ?it/s, val_acc=60.00]Got 24 / 40 with accuracy 60.00
Epoch [0/10]: 100%|██████████| 6/6 [06:02<00:00, 60.39s/it, loss=0.693]
Epoch [1/10]: 100%|██████████| 6/6 [04:49<00:00, 48.23s/it, loss=0.693]
...
Epoch [8/10]: 100%|██████████| 6/6 [06:07<00:00, 61.29s/it, loss=0.693]
Epoch [9/10]: 100%|██████████| 6/6 [04:55<00:00, 49.19s/it, loss=0.781]
The model gets trained fine but when I try to use it for prediction I get different results each time I run this last piece in my Jupyter Notebooks:
model.eval()
img = Image.open('train/cat.22.png').convert("RGB")
img_t = transform(img)
batch_t = torch.unsqueeze(img_t, 0)
out = model(batch_t)
print(out)
tensor([0.5276], grad_fn=)
tensor([0.5000], grad_fn=)
tensor([0.5064], grad_fn=)
etc. Each time different result for the same image. Is this normal? Why this is happening?
I don't see you loading your trained model. This means every time you initialize the CNN module, the inception.fc layer will get initialized with random weights, this is most probably the reason why you are getting different results on each inference.
Edit: You have a random transform in your transformation pipeline, namely RandomCrop.
According to this answer on the use of model.eval(), I believe you might want to ensure that you have the lower half of the code cell wrapped in a with torch.no_grad(): context. I think it may still be learning/updating parameters unless inside that context.
I'm simply trying to train a ResNet18 model using PyTorch library. The training dataset consists of 25,000 images. Therefore, it is taking a lot of time for even the first epoch to complete. Therefore, I want to save the progress after a certain no. of batch iteration is completed. But I can't figure out how to modify my code and how to use the torch.save() and torch.load() functions in my code to save the periodic progress.
My code is given below:
# BUILD THE NETWORK
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
import torchvision
import torchvision.models as models
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
# DOWNLOAD PRETRAINED MODELS ON ImageNet
model_resnet18 = torch.hub.load('pytorch/vision', 'resnet18', pretrained = True)
model_resnet34 = torch.hub.load('pytorch/vision', 'resnet34', pretrained = True)
for name, param in model_resnet18.named_parameters():
if('bn' not in name):
param.requires_grad = False
for name, param in model_resnet34.named_parameters():
if('bn' not in name):
param.requires_grad = False
num_classes = 2
model_resnet18.fc = nn.Sequential(nn.Linear(model_resnet18.fc.in_features, 512),
nn.ReLU(),
nn.Dropout(),
nn.Linear(512, num_classes))
model_resnet34.fc = nn.Sequential(nn.Linear(model_resnet34.fc.in_features, 512),
nn.ReLU(),
nn.Dropout(),
nn.Linear(512, num_classes))
# FUNCTIONS FOR TRAINING AND LOADING DATA
def train(model, optimizer, loss_fn, train_loader, val_loader, epochs = 5, device = "cuda"):
print("Inside Train Function\n")
for epoch in range(epochs):
print("Epoch : {} running".format(epoch))
training_loss = 0.0
valid_loss = 0.0
model.train()
k = 0
for batch in train_loader:
optimizer.zero_grad()
inputs, targets = batch
inputs = inputs.to(device)
output = model(inputs)
loss = loss_fn(output, targets)
loss.backward()
optimizer.step()
training_loss += loss.data.item() * inputs.size(0)
print("End of batch loop iteration {} \n".format(k))
k = k + 1
training_loss /= len(train_loader.dataset)
model.eval()
num_correct = 0
num_examples = 0
for batch in val_loader:
inputs, targets = batch
inputs.to(device)
output = model(inputs)
targets = targets.to(device)
loss = loss_fn(output, targets)
valid_loss += loss.data.item() * inputs.size(0)
correct = torch.eq(torch.max(F.softmax(output, dim = 1), dim = 1)[1], targets).view(-1)
num_correct += torch.sum(correct).item()
num_examples += correct.shape[0]
valid_loss /= len(val_loader.dataset)
print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}, accuracy = {:.4f}'.format(epoch, training_loss, valid_loss, num_correct / num_examples))
batch_size = 32
img_dimensions = 224
img_transforms = transforms.Compose([ transforms.Resize((img_dimensions, img_dimensions)),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])
img_test_transforms = transforms.Compose([ transforms.Resize((img_dimensions, img_dimensions)),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])
def check_image(path):
try:
im = Image.open(path)
return True
except:
return False
train_data_path = "E:\Image Recognition\dogsandcats\\train\\"
train_data = torchvision.datasets.ImageFolder(root=train_data_path,transform=img_transforms, is_valid_file=check_image)
validation_data_path = "E:\\Image Recognition\\dogsandcats\\validation\\"
validation_data = torchvision.datasets.ImageFolder(root=validation_data_path,transform=img_test_transforms, is_valid_file=check_image)
test_data_path = "E:\\Image Recognition\\dogsandcats\\test\\"
test_data = torchvision.datasets.ImageFolder(root=test_data_path,transform=img_test_transforms, is_valid_file=check_image)
num_workers = 6
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
validation_data_loader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_data_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
print(torch.cuda.is_available(), "\n")
if torch.cuda.is_available():
device = torch.device("cuda")
else:
device = torch.device("cpu")
print(f'Num training images: {len(train_data_loader.dataset)}')
print(f'Num validation images: {len(validation_data_loader.dataset)}')
print(f'Num test images: {len(test_data_loader.dataset)}')
def test_model(model):
print("Inside Test Model Function\n")
correct = 0
total = 0
with torch.no_grad():
for data in test_data_loader:
images, labels = data[0].to(device), data[1].to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('correct: {:d} total: {:d}'.format(correct, total))
print('accuracy = {:f}'.format(correct / total))
model_resnet18.to(device)
optimizer = optim.Adam(model_resnet18.parameters(), lr=0.001)
if __name__ == "__main__":
train(model_resnet18, optimizer, torch.nn.CrossEntropyLoss(), train_data_loader, validation_data_loader, epochs=2, device=device)
test_model(model_resnet18)
model_resnet34.to(device)
optimizer = optim.Adam(model_resnet34.parameters(), lr=0.001)
if __name__ == "__main__":
train(model_resnet34, optimizer, torch.nn.CrossEntropyLoss(), train_data_loader, validation_data_loader, epochs=2, device=device)
test_model(model_resnet34)
import os
def find_classes(dir):
classes = os.listdir(dir)
classes.sort()
class_to_idx = {classes[i]: i for i in range(len(classes))}
return classes, class_to_idx
def make_prediction(model, filename):
labels, _ = find_classes('E:\\Image Recognition\\dogsandcats\\test\\test')
img = Image.open(filename)
img = img_test_transforms(img)
img = img.unsqueeze(0)
prediction = model(img.to(device))
prediction = prediction.argmax()
print(labels[prediction])
make_prediction(model_resnet34, 'E:\\Image Recognition\\dogsandcats\\test\\test\\3.jpg') #dog
make_prediction(model_resnet34, 'E:\\Image Recognition\\dogsandcats\\test\\test\\5.jpg') #cat
torch.save(model_resnet18.state_dict(), "./model_resnet18.pth")
torch.save(model_resnet34.state_dict(), "./model_resnet34.pth")
# Remember that you must call model.eval() to set dropout and batch normalization layers to
# evaluation mode before running inference. Failing to do this will yield inconsistent inference results.
resnet18 = torch.hub.load('pytorch/vision', 'resnet18')
resnet18.fc = nn.Sequential(nn.Linear(resnet18.fc.in_features,512),nn.ReLU(), nn.Dropout(), nn.Linear(512, num_classes))
resnet18.load_state_dict(torch.load('./model_resnet18.pth'))
resnet18.eval()
resnet34 = torch.hub.load('pytorch/vision', 'resnet34')
resnet34.fc = nn.Sequential(nn.Linear(resnet34.fc.in_features,512),nn.ReLU(), nn.Dropout(), nn.Linear(512, num_classes))
resnet34.load_state_dict(torch.load('./model_resnet34.pth'))
resnet34.eval()
# Test against the average of each prediction from the two models
models_ensemble = [resnet18.to(device), resnet34.to(device)]
correct = 0
total = 0
if __name__ == '__main__':
with torch.no_grad():
for data in test_data_loader:
images, labels = data[0].to(device), data[1].to(device)
predictions = [i(images).data for i in models_ensemble]
avg_predictions = torch.mean(torch.stack(predictions), dim=0)
_, predicted = torch.max(avg_predictions, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
if total != 0:
print('accuracy = {:f}'.format(correct / total))
print('correct: {:d} total: {:d}'.format(correct, total))
To be very precise, I want to save my progress at the end of for batch in train_loader: loop, for say k = 1500.
If anyone can guide me about modifying my code so that I can save my progress and resume it later, then it will be a great and highly appreciated.
Whenever you want to save your training progress, you need to save two things:
Your model's state dict
Your optimizer's state dict
This can be done in the following way:
def save_checkpoint(model, optimizer, save_path, epoch):
torch.save({
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'epoch': epoch
}, save_path)
To resume training, you can restore your model and optimizer's state dict.
def load_checkpoint(model, optimizer, load_path):
checkpoint = torch.load(load_path)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
return model, optimizer, epoch
You can save your model at any point in training, wherever you need to. However, it should be ideal to save after finishing an epoch.
I am literally a beginner of PyTorch.
I trained an autoencoder network so that I can plot the distribution of the latent vectors (the result of encoders).
This is the code that I used for network training.
import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import Dataset
from PIL import Image
import os
import glob
dir_img_decoded = '/media/dohyeong/HDD/mouth_autoencoder/dc_img_2'
if not os.path.exists(dir_img_decoded):
os.mkdir(dir_img_decoded)
dir_check_point = '/media/dohyeong/HDD/mouth_autoencoder/ckpt_2'
if not os.path.exists(dir_check_point):
os.mkdir(dir_check_point)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
num_epochs = 200
batch_size = 150 # up -> GPU memory increase
learning_rate = 1e-3
dir_dataset = '/media/dohyeong/HDD/mouth_autoencoder/mouth_crop/dir_normalized_mouth_cropped_images'
images = glob.glob(os.path.join(dir_dataset, '*.png'))
train_images = images[:-113]
test_images = images[-113:]
train_images.sort()
test_images.sort()
class TrumpMouthDataset(Dataset):
def __init__(self, images):
super(TrumpMouthDataset, self).__init__()
self.images = images
self.transform = transforms.Compose([
# transforms.Resize((28, 28)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
def __getitem__(self, index):
image = Image.open(self.images[index])
return self.transform(image)
def __len__(self):
return len(self.images)
train_dataset = TrumpMouthDataset(train_images)
test_dataset = TrumpMouthDataset(test_images)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
class Autoencoder(nn.Module):
def __init__(self):
super(Autoencoder, self).__init__()
self.encoder = nn.Sequential(
nn.Linear(60000, 60),
nn.ReLU(True),
nn.Linear(60, 3),
nn.ReLU(True),
)
self.decoder = nn.Sequential(
nn.Linear(3, 60),
nn.ReLU(True),
nn.Linear(60, 60000),
nn.Tanh()
)
def forward(self, x):
x = x.view(x.size(0), -1)
encoded = self.encoder(x)
decoded = self.decoder(encoded)
return encoded, decoded
model = Autoencoder().cuda()
if torch.cuda.device_count() > 1:
model = nn.DataParallel(model)
model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
lr=learning_rate,
weight_decay=1e-5)
for epoch in range(num_epochs):
total_loss = 0
for index, imgs in enumerate(train_dataloader):
imgs = imgs.to(device)
# ===================forward=====================
outputs = model(imgs)
imgs_flatten = imgs.view(imgs.size(0), -1)
loss = criterion(outputs, imgs_flatten)
# ===================backward====================
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()
print('{} Epoch, [{}/{}] batch, loss: {:.4f}'.format(epoch, index + 1, len(train_dataloader), loss.item()))
avg_loss = total_loss / len(train_dataset)
print('{} Epoch, avg_loss: {:.4f}'.format(epoch, avg_loss))
if epoch % 10 == 0:
check_point_file = os.path.join(dir_check_point, str(epoch) + ".pth")
torch.save(model.state_dict(), check_point_file)
After training, I tried to get encoded values using this code.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
check_point = '/media/dohyeong/HDD/mouth_autoencoder/290.pth'
model = torch.load(check_point)
for index, imgs in enumerate(train_dataloader):
imgs = imgs.to(device)
# ===================evaluate=====================
encoded, _ = model(imgs)
It finished with this error message.
"TypeError: 'collections.OrderedDict' object is not callable"
May I get some help?
Hi and welcome to the PyTorch community :D
TL;DR
Change model = torch.load(check_point) to model.load_state_dict(torch.load(check_point)).
The only problem is with the line:
model = torch.load(check_point)
The way you saved the checkpoint was:
torch.save(model.state_dict(), check_point_file)
That is, you saved the model's state_dict (which is just a dictionary of the various parameters that together describe the current instance of the model) in check_point_file.
Now, in order to load it back, just reverse the process.
check_point_file contains just the state_dict.
It knows nothing about the internals of the model - what it's architecture is, how it's supposed to work etc.
So, load it back:
state_dict = torch.load(check_point)
This state_dict can now be copied onto your Model instance as follows:
model.load_state_dict(state_dict)
Or, more succinctly,
model.load_state_dict(torch.load(check_point))
You got the error because the torch.load(check_point) returned the state_dict which you assigned to model.
When you subsequently called model(imgs), model was an OrderedDict object (not callable).
Hence the error.
See the Serialization Semantics Notes for more details.
Apart from that, your code sure is thorough for a beginner. Great going!
P.S. Your device agnosticity is brilliant! Perhaps you'd want to take a look at:
the line model = Autoencoder().cuda()
The map_location argument of torch.load()