I am a complete beginner with PyTorch.
I trained an autoencoder network so that I can plot the distribution of the latent vectors (the encoder outputs).
This is the code that I used for network training.
import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import Dataset
from PIL import Image
import os
import glob

dir_img_decoded = '/media/dohyeong/HDD/mouth_autoencoder/dc_img_2'
if not os.path.exists(dir_img_decoded):
    os.mkdir(dir_img_decoded)

dir_check_point = '/media/dohyeong/HDD/mouth_autoencoder/ckpt_2'
if not os.path.exists(dir_check_point):
    os.mkdir(dir_check_point)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

num_epochs = 200
batch_size = 150  # up -> GPU memory increase
learning_rate = 1e-3

dir_dataset = '/media/dohyeong/HDD/mouth_autoencoder/mouth_crop/dir_normalized_mouth_cropped_images'
images = glob.glob(os.path.join(dir_dataset, '*.png'))
train_images = images[:-113]
test_images = images[-113:]
train_images.sort()
test_images.sort()

class TrumpMouthDataset(Dataset):
    def __init__(self, images):
        super(TrumpMouthDataset, self).__init__()
        self.images = images
        self.transform = transforms.Compose([
            # transforms.Resize((28, 28)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

    def __getitem__(self, index):
        image = Image.open(self.images[index])
        return self.transform(image)

    def __len__(self):
        return len(self.images)

train_dataset = TrumpMouthDataset(train_images)
test_dataset = TrumpMouthDataset(test_images)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(60000, 60),
            nn.ReLU(True),
            nn.Linear(60, 3),
            nn.ReLU(True),
        )
        self.decoder = nn.Sequential(
            nn.Linear(3, 60),
            nn.ReLU(True),
            nn.Linear(60, 60000),
            nn.Tanh()
        )

    def forward(self, x):
        x = x.view(x.size(0), -1)
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

model = Autoencoder().cuda()
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model.to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=learning_rate,
                             weight_decay=1e-5)

for epoch in range(num_epochs):
    total_loss = 0
    for index, imgs in enumerate(train_dataloader):
        imgs = imgs.to(device)
        # ===================forward=====================
        # forward() returns (encoded, decoded); the reconstruction is
        # compared against the flattened input
        _, decoded = model(imgs)
        imgs_flatten = imgs.view(imgs.size(0), -1)
        loss = criterion(decoded, imgs_flatten)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print('{} Epoch, [{}/{}] batch, loss: {:.4f}'.format(epoch, index + 1, len(train_dataloader), loss.item()))
    avg_loss = total_loss / len(train_dataset)
    print('{} Epoch, avg_loss: {:.4f}'.format(epoch, avg_loss))
    if epoch % 10 == 0:
        check_point_file = os.path.join(dir_check_point, str(epoch) + ".pth")
        torch.save(model.state_dict(), check_point_file)
After training, I tried to get encoded values using this code.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
check_point = '/media/dohyeong/HDD/mouth_autoencoder/290.pth'
model = torch.load(check_point)

for index, imgs in enumerate(train_dataloader):
    imgs = imgs.to(device)
    # ===================evaluate=====================
    encoded, _ = model(imgs)
It finished with this error message.
"TypeError: 'collections.OrderedDict' object is not callable"
May I get some help?
Hi and welcome to the PyTorch community :D
TL;DR
Change model = torch.load(check_point) to model.load_state_dict(torch.load(check_point)).
The only problem is with the line:
model = torch.load(check_point)
The way you saved the checkpoint was:
torch.save(model.state_dict(), check_point_file)
That is, you saved the model's state_dict (which is just a dictionary of the various parameters that together describe the current instance of the model) in check_point_file.
Now, in order to load it back, just reverse the process.
check_point_file contains just the state_dict.
It knows nothing about the internals of the model - what its architecture is, how it's supposed to work, etc.
So, load it back:
state_dict = torch.load(check_point)
This state_dict can now be copied onto your Model instance as follows:
model.load_state_dict(state_dict)
Or, more succinctly,
model.load_state_dict(torch.load(check_point))
You got the error because torch.load(check_point) returned the state_dict, which you then assigned to model.
When you subsequently called model(imgs), model was an OrderedDict object (not callable).
Hence the error.
See the Serialization Semantics Notes for more details.
Apart from that, your code sure is thorough for a beginner. Great going!
P.S. Your device-agnostic setup is a nice touch! That said, you may want to take a look at:
the line model = Autoencoder().cuda(), which hard-codes a GPU and will fail on a CPU-only machine
the map_location argument of torch.load()
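Putting it all together, here is a minimal sketch of the corrected evaluation flow; the latent-collection part is my addition, matching your stated goal of plotting the latent vectors:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
check_point = '/media/dohyeong/HDD/mouth_autoencoder/290.pth'

# Rebuild the architecture first, then copy the saved parameters onto it.
# (If the checkpoint was saved from an nn.DataParallel wrapper, its keys
# will carry a 'module.' prefix and need to be stripped first.)
model = Autoencoder().to(device)
model.load_state_dict(torch.load(check_point, map_location=device))
model.eval()

encoded_all = []
with torch.no_grad():  # no gradients needed for evaluation
    for imgs in train_dataloader:
        imgs = imgs.to(device)
        encoded, _ = model(imgs)
        encoded_all.append(encoded.cpu())

latents = torch.cat(encoded_all)  # shape: (num_images, 3)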
Related
I want to write a train function inside a class for training a model. The following code raises an error; can anyone give me a hint on how to solve it?
import numpy as np
import os
import sys
sys.executable
sys.version

##define a neuralnet class
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

os.chdir("/Users/zhangzhongheng/Downloads/")
os.getcwd()

# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def model_train(self, dataloader):
        self.train()
        size = len(dataloader.dataset)
        optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            pred = self.forward(X)
            loss = nn.CrossEntropyLoss(pred, y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 100 == 0:
                loss, current = loss.item(), batch * len(X)
                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

Model = NeuralNetwork()
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    Model.model_train(train_dataloader)
print("Done!")
The above code reported the following error:
Epoch 1
-------------------------------
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
Done!
Changing nn.CrossEntropyLoss to nn.CrossEntropyLoss() should solve this problem. nn.CrossEntropyLoss is a module class, so it has to be instantiated before it can be called on (pred, y). As written, pred and y are passed to the constructor instead (as its weight and size_average arguments), and the internal truth-value check on the label tensor is what raises the "Boolean value of Tensor with more than one value is ambiguous" error.
Refer to the official documentation here.
It will look something like this: loss = nn.CrossEntropyLoss()(pred, y)
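That said, the more idiomatic pattern is to instantiate the criterion once, outside the batch loop; a short sketch of the corrected method (loss_fn is my name for it):
def model_train(self, dataloader):
    self.train()
    size = len(dataloader.dataset)
    optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()  # instantiate the module once...
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = self.forward(X)
        loss = loss_fn(pred, y)  # ...then call it on (pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            print(f"loss: {loss.item():>7f} [{batch * len(X):>5d}/{size:>5d}]")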
I'm using an example for training a model on the MNIST dataset from pytorch-lightning's documentation (see here), to which I tried to add a prediction step. However, when performing trainer.predict(model) I get an error:
AttributeError: 'list' object has no attribute 'flatten'
I followed the instructions and examples I found online (adding the predict_step and predict_dataloader functions, and a "predict" stage under the setup function), and it looks pretty simple; however, it doesn't work.
Here's the code I'm running:
import os
import torch
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from pytorch_lightning.loggers import CSVLogger
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchmetrics import Accuracy
from torchvision import transforms
from torchvision.datasets import MNIST

PATH_DATASETS = os.environ.get("PATH_DATASETS", ".")
BATCH_SIZE = 256 if torch.cuda.is_available() else 64

class LitMNIST(LightningModule):
    def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
        super().__init__()

        # Set our init args as class attributes
        self.data_dir = data_dir
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate

        # Hardcode some dataset specific attributes
        self.num_classes = 10
        self.dims = (1, 28, 28)
        channels, width, height = self.dims
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),
            ]
        )

        # Define PyTorch model
        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(channels * width * height, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=0.9),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, self.num_classes),
        )

        self.val_accuracy = Accuracy(task='multiclass', num_classes=10)  # I fixed this since the code from the tutorial didn't work
        self.test_accuracy = Accuracy(task='multiclass', num_classes=10)  # I fixed this since the code from the tutorial didn't work

    def forward(self, x):
        x = self.model(x)
        return F.log_softmax(x, dim=1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.val_accuracy.update(preds, y)

        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", self.val_accuracy, prog_bar=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.test_accuracy.update(preds, y)

        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("test_loss", loss, prog_bar=True)
        self.log("test_acc", self.test_accuracy, prog_bar=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        return self(batch)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    ####################
    # DATA RELATED HOOKS
    ####################

    def prepare_data(self):
        # download
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        if stage == "fit" or stage is None:
            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == "test" or stage is None:
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

        if stage == "predict":
            self.mnist_predict = MNIST(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=BATCH_SIZE)

    def predict_dataloader(self):
        return DataLoader(self.mnist_predict, batch_size=BATCH_SIZE)

model = LitMNIST(hidden_size=2)
trainer = Trainer(
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else None,  # limiting for iPython runs
    max_epochs=3,
    callbacks=[TQDMProgressBar(refresh_rate=20)],
    logger=CSVLogger(save_dir="logs/"),
)
trainer.fit(model)
predict = trainer.predict(model)
What is the problem?
You forgot to separate input and label in the batch. Your mnist_predict dataset yields (image, label) tuples, so the batch that reaches predict_step is a list [x, y]; passing it straight to self(batch) sends that list into nn.Flatten, hence the AttributeError. Split the batch first:
def predict_step(self, batch, batch_idx, dataloader_idx=0):
    x, y = batch
    return self(x)
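With that fix, trainer.predict(model) returns a plain Python list with one output tensor per batch; a short sketch of how you might assemble them afterwards (variable names here are mine):
batch_preds = trainer.predict(model)    # list of per-batch log-probability tensors
log_probs = torch.cat(batch_preds)      # shape: (num_samples, num_classes)
pred_labels = log_probs.argmax(dim=1)   # hard class predictions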
I am trying to save images that I generate during training to the output bucket in SageMaker. I've read that all the information that needs to be saved during training goes into the model.tar.gz file. I've tried saving plots using both model_dir and output_data_dir, to no avail. The model itself is saved properly, but the additional information is not stored with it. I want to reload this additional information (the saved images) during inference, but I have heard that storing everything in model.tar.gz can cause slow inference. I would love some help.
Here is my estimator
from sagemaker.pytorch import PyTorch

estimator = PyTorch(entry_point='XXXXXXXX/AWS/mnist.py',
                    role=role,
                    py_version='py3',
                    framework_version='1.8.0',
                    instance_count=1,
                    instance_type='ml.c5.xlarge',
                    output_path='s3://XXXXX-bucket/',
                    )
and the code in mnist.py:
import os
import pandas as pd
import torch
import matplotlib.pyplot as plt
import argparse
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torchvision.io import read_image
from torch import nn

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X.to(device))
        loss = loss_fn(pred, y.to(device))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X.to(device))
            test_loss += loss_fn(pred, y.to(device)).item()
            correct += (pred.argmax(1) == y.to(device)).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

# Initialize the loss function
if __name__ == '__main__':
    # default to the value in environment variable `SM_MODEL_DIR`. Using args makes the script more portable.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    args, _ = parser.parse_known_args()

    training_data = datasets.FashionMNIST(
        root="data",
        train=True,
        download=True,
        transform=ToTensor()
    )

    test_data = datasets.FashionMNIST(
        root="data",
        train=False,
        download=True,
        transform=ToTensor()
    )

    labels_map = {
        0: "T-Shirt",
        1: "Trouser",
        2: "Pullover",
        3: "Dress",
        4: "Coat",
        5: "Sandal",
        6: "Shirt",
        7: "Sneaker",
        8: "Bag",
        9: "Ankle Boot",
    }

    figure = plt.figure(figsize=(8, 8))
    cols, rows = 3, 3
    for i in range(1, cols * rows + 1):
        sample_idx = torch.randint(len(training_data), size=(1,)).item()
        img, label = training_data[sample_idx]
        figure.add_subplot(rows, cols, i)
        plt.title(labels_map[label])
        plt.axis("off")
        plt.imsave(args.output_data_dir+'plot'+str(i)+'.jpg', img.squeeze(), cmap="gray")

    train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

    # Display image and label.
    train_features, train_labels = next(iter(train_dataloader))
    print(f"Feature batch shape: {train_features.size()}")
    print(f"Labels batch shape: {train_labels.size()}")
    img = train_features[0].squeeze()
    label = train_labels[0]
    plt.imsave(args.output_data_dir+'sample.jpg', img, cmap="gray")
    print("Saved img.")
    print(f"Label: {label}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")

    model = NeuralNetwork().to(device)
    print(model)

    learning_rate = 1e-3
    batch_size = 64
    epochs = 5

    # ... train `model`, then save it to `model_dir`
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    epochs = 1
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_dataloader, model, loss_fn, optimizer)
        test_loop(test_dataloader, model, loss_fn)
    print("Done!")

    with open(os.path.join(args.model_dir, 'model.pth'), 'wb') as f:
        torch.save(model.state_dict(), f)

    plt.plot([1,2,3,4])
    plt.ylabel('some numbers')
    plt.show()
    plt.savefig('test.jpeg')
I suspect there is an issue with string concatenation in plt.imsave. The environment variable SM_OUTPUT_DATA_DIR by default points to /opt/ml/output/data (that's the actual value of args.output_data_dir, since you don't pass this parameter), so the resulting path comes out as something like /opt/ml/output/dataplot1.jpg. The same happens if you use model_dir in the same way. I'd rather use os.path.join, like you're already doing for the model. Here is a nice explanation of these folders and environment variables in SageMaker.
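A minimal sketch of the fix, reusing the file names from your script:
# Join directory and file name instead of concatenating strings, so the
# images land inside /opt/ml/output/data rather than alongside it.
plt.imsave(os.path.join(args.output_data_dir, 'plot' + str(i) + '.jpg'),
           img.squeeze(), cmap="gray")
plt.imsave(os.path.join(args.output_data_dir, 'sample.jpg'), img, cmap="gray")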
I'm new to neural networks and I'm trying to train a CNN model on a custom dataset (cat and dog images in a single directory). So I guess I'm doing the usual stuff found in most tutorials, but just in case I'll give my full code here.
First I generate .csv file to be processed:
import os
import pandas as pd
import torch

device = ("cuda" if torch.cuda.is_available() else "cpu")

train_df = pd.DataFrame(columns=["img_name", "label"])
train_df["img_name"] = os.listdir("train/")
for idx, i in enumerate(os.listdir("train/")):
    if "cat" in i:
        train_df["label"][idx] = 0
    if "dog" in i:
        train_df["label"][idx] = 1

train_df.to_csv(r'train_csv.csv', index=False, header=True)
Then I prepare the dataset:
from torch.utils.data import Dataset
import pandas as pd
import os
from PIL import Image
import torch

class CatsAndDogsDataset(Dataset):
    def __init__(self, root_dir, annotation_file, transform=None):
        self.root_dir = root_dir
        self.annotations = pd.read_csv(annotation_file)
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        img_id = self.annotations.iloc[index, 0]
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
        y_label = torch.tensor(float(self.annotations.iloc[index, 1]))

        if self.transform is not None:
            img = self.transform(img)

        return (img, y_label)
This is my model:
import torch.nn as nn
import torchvision.models as models

class CNN(nn.Module):
    def __init__(self, train_CNN=False, num_classes=1):
        super(CNN, self).__init__()
        self.train_CNN = train_CNN
        self.inception = models.inception_v3(pretrained=True, aux_logits=False)
        self.inception.fc = nn.Linear(self.inception.fc.in_features, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, images):
        features = self.inception(images)
        return self.sigmoid(self.dropout(self.relu(features))).squeeze(1)
This is my hyper-params, transformations and dataloaders:
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

num_epochs = 10
learning_rate = 0.00001
train_CNN = False
batch_size = 32
shuffle = True
pin_memory = True
num_workers = 0

transform = transforms.Compose(
    [
        transforms.Resize((356, 356)),
        transforms.RandomCrop((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

dataset = CatsAndDogsDataset("train", "train_csv.csv", transform=transform)
print(len(dataset))
train_set, validation_set = torch.utils.data.random_split(dataset, [162, 40])
train_loader = DataLoader(dataset=train_set, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory)
validation_loader = DataLoader(dataset=validation_set, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory)

model = CNN().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for name, param in model.inception.named_parameters():
    if "fc.weight" in name or "fc.bias" in name:
        param.requires_grad = True
    else:
        param.requires_grad = train_CNN
and accuracy check:
def check_accuracy(loader, model):
    if loader == train_loader:
        print("Checking accuracy on training data")
    else:
        print("Checking accuracy on validation data")

    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)

            scores = model(x)
            predictions = torch.tensor([1.0 if i >= 0.5 else 0.0 for i in scores]).to(device)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

        print(
            f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
        )
    model.train()
    return f"{float(num_correct)/float(num_samples)*100:.2f}"
And this is my training function:
from tqdm import tqdm

def train():
    model.train()
    for epoch in range(num_epochs):
        loop = tqdm(train_loader, total=len(train_loader), leave=True)
        if epoch % 2 == 0:
            loop.set_postfix(val_acc=check_accuracy(validation_loader, model))
        for imgs, labels in loop:
            imgs = imgs.to(device)
            labels = labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loop.set_description(f"Epoch [{epoch}/{num_epochs}]")
            loop.set_postfix(loss=loss.item())

if __name__ == "__main__":
    train()
0%| | 0/6 [00:00<?, ?it/s]Checking accuracy on validation data
0%| | 0/6 [01:13<?, ?it/s, val_acc=60.00]Got 24 / 40 with accuracy 60.00
Epoch [0/10]: 100%|██████████| 6/6 [06:02<00:00, 60.39s/it, loss=0.693]
Epoch [1/10]: 100%|██████████| 6/6 [04:49<00:00, 48.23s/it, loss=0.693]
...
Epoch [8/10]: 100%|██████████| 6/6 [06:07<00:00, 61.29s/it, loss=0.693]
Epoch [9/10]: 100%|██████████| 6/6 [04:55<00:00, 49.19s/it, loss=0.781]
The model gets trained fine, but when I try to use it for prediction I get different results each time I run this last piece in my Jupyter notebook:
model.eval()
img = Image.open('train/cat.22.png').convert("RGB")
img_t = transform(img)
batch_t = torch.unsqueeze(img_t, 0)
out = model(batch_t)
print(out)
tensor([0.5276], grad_fn=<SqueezeBackward1>)
tensor([0.5000], grad_fn=<SqueezeBackward1>)
tensor([0.5064], grad_fn=<SqueezeBackward1>)
etc. A different result each time for the same image. Is this normal? Why is this happening?
I don't see you loading your trained model. This means every time you initialize the CNN module, the inception.fc layer gets initialized with random weights, which is most probably the reason why you are getting different results on each inference.
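For reference, a minimal sketch of persisting and restoring the fine-tuned weights (the file name is illustrative):
# After training: save the fine-tuned parameters
torch.save(model.state_dict(), "catsdogs_cnn.pth")

# In a fresh session: rebuild the module, then restore the parameters
model = CNN().to(device)
model.load_state_dict(torch.load("catsdogs_cnn.pth", map_location=device))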
Edit: You have a random transform in your transformation pipeline, namely RandomCrop.
According to this answer on the use of model.eval(), I believe you might want to ensure that you have the lower half of the code cell wrapped in a with torch.no_grad(): context. I think it may still be learning/updating parameters unless inside that context.
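For what it's worth, here is a sketch of a fully deterministic inference path; swapping RandomCrop for CenterCrop is my suggestion rather than something from your code, and model.eval() also disables the Dropout layer in your forward:
import torch
import torchvision.transforms as transforms
from PIL import Image

# Deterministic preprocessing: CenterCrop instead of RandomCrop
eval_transform = transforms.Compose([
    transforms.Resize((356, 356)),
    transforms.CenterCrop((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

model.eval()  # inference behaviour for Dropout (and any BatchNorm)
img = Image.open('train/cat.22.png').convert("RGB")
batch_t = eval_transform(img).unsqueeze(0).to(device)
with torch.no_grad():  # no autograd bookkeeping needed for inference
    out = model(batch_t)
print(out)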
I have written an autoencoder using PyTorch and rolled it into a custom sklearn BaseEstimator. I normally train the estimator on a machine with a GPU and save it for later evaluation using pickle. If I try to load the estimator on a CPU-only machine when the model was saved on the GPU, I get the following error:
RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location='cpu' to map your storages to the CPU.
Is there a way to force the PyTorch model to be moved to the CPU before pickling the estimator without an explicit call?
Is there a way to unpickle an estimator that was saved while the model was on the GPU?
The following is an example of my PyTorch model and sklearn-compatible estimator, along with an example of how I am trying to save and load my models.
PyTorch Model
import torch.nn as nn

class _AutoEncoder(nn.Module):
    def __init__(self, input_dim, output_dim, encoder_dim=4):
        super(_AutoEncoder, self).__init__()
        hidden_dim = int((input_dim + encoder_dim) / 2)

        layers = []
        layers.append(nn.Linear(input_dim, hidden_dim))
        layers.append(nn.Linear(hidden_dim, encoder_dim))
        self.encoder = nn.Sequential(*layers)

        layers = []
        layers.append(nn.Linear(encoder_dim, hidden_dim))
        layers.append(nn.Linear(hidden_dim, output_dim))
        self.decoder = nn.Sequential(*layers)

    def forward(self, X):
        return self.decoder(self.encoder(X))
sklearn Compatible Estimator
import warnings
import inspect

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import utils as sk_utils
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data

class AutoEncoder(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        encoder_dim=4,
        n_epochs=200,
        batch_size=None,
        shuffle=True,
        use_cuda=False
    ):
        super(AutoEncoder, self).__init__()
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        for arg, val in values.items():
            setattr(self, arg, val)

        if use_cuda:
            if torch.cuda.is_available():
                self.device = torch.device("cuda")
            else:
                self.device = torch.device("cpu")
                warnings.warn("cuda not available", UserWarning)
        else:
            self.device = torch.device("cpu")

    def fit(self, X, y=None):
        # X, y = sk_utils.check_X_y(X, y, ensure_2d=False, allow_nd=True)
        self._model = self._train_classifier(X, y)
        return self

    def transform(self, X):
        sk_utils.validation.check_is_fitted(self, ['_model'])
        X = sk_utils.check_array(X)
        X = torch.from_numpy(X.astype(np.float32)).to(self.device)
        with torch.no_grad():
            self._model.eval()
            output = self._model.forward(X)
        return output.cpu().numpy()

    def encode(self, X):
        sk_utils.validation.check_is_fitted(self, ['_model'])
        X = sk_utils.check_array(X)
        X = torch.from_numpy(X.astype(np.float32)).to(self.device)
        with torch.no_grad():
            self._model.eval()
            output = self._model.encoder(X)
        return output.cpu().numpy()

    def decode(self, X):
        sk_utils.validation.check_is_fitted(self, ['_model'])
        X = sk_utils.check_array(X)
        X = torch.from_numpy(X.astype(np.float32)).to(self.device)
        with torch.no_grad():
            self._model.eval()
            output = self._model.decoder(X)
        return output.cpu().numpy()

    def _train_classifier(self, x_train, y_train):
        x_train = torch.from_numpy(x_train.astype(np.float32)).to(self.device)
        y_train = torch.from_numpy(y_train.astype(np.float32)).to(self.device)

        input_dim = x_train.shape[-1]
        output_dim = y_train.shape[-1]

        model = _AutoEncoder(input_dim, output_dim, encoder_dim=self.encoder_dim).to(self.device)
        loss_function = nn.MSELoss()
        optimizer = optim.Adam(model.parameters())
        print(model)

        if self.batch_size is None:
            return self._batch_train_simple_classifier(x_train, y_train, model, loss_function, optimizer)
        else:
            return self._minibatch_train_simple_classifier(x_train, y_train, model, loss_function, optimizer)

    def _batch_train_simple_classifier(self, x_train, y_train, model, loss_function, optimizer):
        for epoch in range(1, self.n_epochs+1):
            model.train()
            optimizer.zero_grad()
            outputs = model.forward(x_train)
            loss = loss_function(outputs, y_train)
            loss.backward()
            optimizer.step()

            if epoch % 10 == 0 or epoch == self.n_epochs:
                message = "Train Epoch: {:5d}, Loss: {:15.6f}".format(
                    epoch,
                    loss.item()
                )
                print(message)
        return model

    def _minibatch_train_simple_classifier(self, x_train, y_train, model, loss_function, optimizer):
        train_data = torch.utils.data.TensorDataset(x_train, y_train)
        train_loader = torch.utils.data.DataLoader(train_data, batch_size=self.batch_size, shuffle=self.shuffle)

        for epoch in range(1, self.n_epochs+1):
            for data, target in train_loader:
                model.train()
                optimizer.zero_grad()
                outputs = model.forward(data)
                loss = loss_function(outputs, target)
                loss.backward()
                optimizer.step()

            if epoch % 10 == 0 or epoch == self.n_epochs:
                model.eval()
                outputs = model.forward(x_train)
                loss = loss_function(outputs, y_train)
                message = "Train Epoch: {:5d}, Loss: {:15.6f}".format(
                    epoch,
                    loss.item()
                )
                print(message)
        return model
Training
This is normally done on a machine with a GPU.
from sklearn import datasets as sk_datasets

digits = sk_datasets.load_digits(n_class=10, return_X_y=False)
data = digits.data

ae = AutoEncoder(
    encoder_dim=2,
    n_epochs=100,
    batch_size=128,
    shuffle=True,
    use_cuda=True
)
data_fitted = ae.fit_transform(data, data)
Saving the Estimator
I'd like to find a way to have the PyTorch model moved to the CPU before it is saved, without an explicit call. Maybe a function as part of the AutoEncoder class that gets called as it is pickled?
with open("autoencoder.pkl", "wb") as fp:
# ae._model needs to be moved to the CPU here.
# I don't want to have to call ae._model.cpu() explicitly
pickle.dump(ae, fp)
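Something like this is what I have in mind, as a rough (untested) sketch using pickle's __getstate__/__setstate__ hooks added to the AutoEncoder class above:
class AutoEncoder(BaseEstimator, TransformerMixin):
    # ... everything as above ...

    def __getstate__(self):
        # Called by pickle.dump: move the fitted model to the CPU before
        # its parameters are serialized. Note that nn.Module.cpu() moves
        # the live model too, not just the copy in the returned state.
        state = self.__dict__.copy()
        if "_model" in state:
            state["_model"] = state["_model"].cpu()
        state["device"] = torch.device("cpu")
        return state

    def __setstate__(self, state):
        # Called by pickle.load: by this point everything in the state is
        # CPU-only, so unpickling works on machines without a GPU.
        self.__dict__.update(state)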
Loading
I can not figure out how to load the estimator on a machine without a GPU if it was saved while the PyTorch model was still on the GPU.
# This gives an error if the model was saved while on the GPU,
# and a GPU is not available when loading.
with open("autoencoder.pkl", "rb") as fp:
    model = pickle.load(fp)

# This also gives a similar error. I also would not expect this to
# work, since the pickle file contains an sklearn estimator
# wrapping a PyTorch model.
with open("autoencoder.pkl", "rb") as fp:
    torch.load(fp, map_location="cpu")
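One workaround I've come across for the loading side (but have not verified) is a custom Unpickler that reroutes CUDA tensor storages through torch.load with map_location; it depends on the internal torch.storage._load_from_bytes name, so it may be version-specific:
import io
import pickle
import torch

class CpuUnpickler(pickle.Unpickler):
    # Force any CUDA tensor storages in the pickle stream onto the CPU.
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)

with open("autoencoder.pkl", "rb") as fp:
    model = CpuUnpickler(fp).load()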