Overfitting when fine-tuning BERT sentiment analysis - python

I am a newbie to machine learning in general. I am currently trying to follow a tutorial on sentiment analysis using BERT and Transformers: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
However, when I train the model, it appears to be overfitting.
I do not know how to fix this. I have tried lowering the number of epochs, increasing the batch size, shuffling my data (which is ordered), and increasing the validation split. So far nothing has worked. I have even tried different learning rates, but the one I am using now is the smallest.
Below is my code:
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
MAX_LEN = 40
#Make a PyTorch dataset
class FIDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
#split test and train
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
    df_test,
    test_size=0.5,
    random_state=RANDOM_SEED
)
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = FIDataset(
        texts=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )
BATCH_SIZE = 32
#Load data into train, test, val
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
#Bert model loading
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.1)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        returned = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = returned["pooler_output"]
        output = self.drop(pooled_output)
        return self.out(output)
#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)
model = model.to(device)
#Optimize with AdamW
EPOCHS = 6
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
#Train each Epoch function
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
import torch
history = defaultdict(list)
best_accuracy = 0
if __name__ == '__main__':
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)
        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train)
        )
        print(f'Train loss {train_loss} accuracy {train_acc}')
        val_acc, val_loss = eval_model(
            model,
            val_data_loader,
            loss_fn,
            device,
            len(df_val)
        )
        print(f'Val loss {val_loss} accuracy {val_acc}')
        print()
        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc

Broadly speaking, to reduce overfitting, you can:

1. increase regularization
2. reduce model complexity
3. perform early stopping
4. increase training data

From what you've written, you've already tried 3 and 4. In the case of neural networks, you can increase regularization by increasing dropout. You already have the code for it:
# NOTE: You don't need bert_model here since you're creating one inside
# of SentimentClassifier.
# bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Sentiment classifier based on the BERT model just loaded
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.1)  # <-- INCREASE THIS VALUE
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
I'd recommend trying higher values of the dropout probability, as I noted in your code above ("INCREASE THIS VALUE"). Keep track of the dropout probability and the resulting observed overfitting. Try probabilities of 0.1, 0.2, 0.3, 0.4, 0.5.
Usually, I've found that dropout above 0.5 doesn't do much good.
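A sweep over those values might look like the following sketch, which reuses the SentimentClassifier defined above; note that run_training is a hypothetical stand-in for your existing train_epoch/eval_model loop, not something in your code:

# Hypothetical sweep over dropout probabilities.
# `run_training` stands in for the train/eval loop above and is
# assumed to return the best validation accuracy it observed.
results = {}
for p in [0.1, 0.2, 0.3, 0.4, 0.5]:
    model = SentimentClassifier(3)
    model.drop = nn.Dropout(p=p)  # replace the default p=0.1 module
    model = model.to(device)
    results[p] = run_training(model)

# Pick the probability with the smallest train/validation gap.
print(results)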

Related

Input contains NaN, infinity or a value too large for dtype('float32'). PyTorch

I am trying to train a model, but in vain. I see the error
Input contains NaN, infinity or a value too large for dtype('float32').
I think it may be connected with the MSE loss function, because with MAE it somehow works, and with RMSE it also somehow works (on the second epoch I get RMSE = 10). I can't figure out what I am doing wrong.
# Count Nan
df = pd.read_csv('data.txt.zip', header=None)
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
train_size = 463715
X_train = X[:train_size, :]
y_train = y[:train_size]
X_test = X[train_size:, :]
y_test = y[train_size:]
#ToTensor
X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test)
# Create TensorDataset
train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)
val_num = 92743
train_num = 370972
# Divide train data into train and validation data
train_ds, val_ds = random_split(train_ds, [train_num, val_num])
# Evaluate accuracy
def accuracy(y_true, y_pred):
    return r2_score(y_true, y_pred)
# create Class
class BaselineModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BaselineModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.linear1 = nn.Linear(90, 45)
        self.linear2 = nn.Linear(45, 1)
        self.linear3 = nn.Linear(45, 15)
        self.linear4 = nn.Linear(15, 1)
        self.batch = nn.BatchNorm2d(hidden_size)
        self.relu = nn.ReLU()
        self.lreku = nn.LeakyReLU()
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.elu(self.linear1(x))
        return self.linear2(x)

    def training_step(self, criterion, batch):
        x_train, y_train = batch
        y_pred = self(x_train)
        loss = criterion(y_pred, y_train.unsqueeze(1))
        return loss

    def validation_step(self, criterion, batch):
        x_val, y_val = batch
        y_pred = self(x_val)
        loss = criterion(y_pred, y_val.unsqueeze(1))
        acc = accuracy(y_val, y_pred)
        return {'val_loss': loss, 'val_acc': acc}

    def validation_epoch_end(self, y_pred):
        batch_losses = [x['val_loss'] for x in y_pred]
        epoch_loss = torch.stack(batch_losses).mean()
        batch_accs = [x['val_acc'] for x in y_pred]
        epoch_acc = np.mean(batch_accs)
        #epoch_acc = torch.stack(batch_accs).mean()
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        print(f"Epoch {epoch}, val_loss: {result['val_loss']}, val_acc: {result['val_acc']} ")
model = BaselineModel(input_size = 90, hidden_size = 45, output_size = 1)
# Evaluate
def evaluate(model, criterion, val_loader):
    with torch.no_grad():
        y_pred = [model.validation_step(criterion, batch) for batch in val_loader]
        return model.validation_epoch_end(y_pred)
# Train
def train(model, criterion, optimizer, train_loader, val_loader, lr, epochs):
    history = []
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            loss = model.training_step(criterion, batch)
            loss.backward()
            optimizer.step()
        result = evaluate(model, criterion, val_loader)
        model.epoch_end(epoch, result)
        history.append(result)
    #return history
# Create train_loader & val_loader
batch_size = 128
train_loader = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_ds, batch_size = batch_size, shuffle = True)
# Create parameters and Train
lr = 0.05
optimizer = torch.optim.SGD(model.parameters(), lr, momentum = 0.9)
criterion = F.mse_loss
epochs = 10
train(model, criterion, optimizer, train_loader, val_loader, lr, epochs)
Yes, it is because of your loss function. If the value of the loss function after some epochs becomes very small or very large, then when it is used in backpropagation to train the model, you face this error. To handle that, you should use early stopping to halt the training, i.e., implement a callback. Callbacks provide a way to execute code and interact with the training process automatically.
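A minimal early-stopping sketch grafted onto the train() function from the question, reusing its training_step(), evaluate(), and epoch_end() helpers; the patience and min_delta values are illustrative, not tuned:

def train(model, criterion, optimizer, train_loader, val_loader, lr, epochs,
          patience=3, min_delta=1e-4):
    best_loss = float('inf')
    epochs_without_improvement = 0
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            loss = model.training_step(criterion, batch)
            loss.backward()
            optimizer.step()
        result = evaluate(model, criterion, val_loader)
        model.epoch_end(epoch, result)
        # Halt as soon as validation loss stops improving meaningfully,
        # before it can diverge to NaN/inf.
        if result['val_loss'] < best_loss - min_delta:
            best_loss = result['val_loss']
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Early stopping at epoch {epoch}")
                break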

Keeping all tensors in GPU when training a faster RCNN

I am trying to train a faster-RCNN model for bounding box detection on a custom COCO-like dataset. I am using a GPU, but even though I use .to(device) to push tensors onto the GPU, I keep getting the following error:
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1845 if has_torch_function_variadic(input, weight):
1846 return handle_torch_function(linear, (input, weight), input, weight, bias=bias)
-> 1847 return torch._C._nn.linear(input, weight, bias)
1848
1849
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking arugment for argument mat1 in method wrapper_addmm)
Training snippet:
# Initialize Dataset
train_dataset = TrainDataset('coco_train.json')

def collate_fn(batch):
    return tuple(zip(*batch))

train_data_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=2,
    collate_fn=collate_fn
)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
model.to(device) # EDIT
num_classes = 3 # eyelids, iris + background
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
num_epochs = 40
itr = 1
for epoch in range(num_epochs):
    for images, targets in train_data_loader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        loss_value = losses.item()
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        if itr % 50 == 0:
            print(f"Iteration #{itr} loss: {loss_value}")
        itr += 1
    lr_scheduler.step()
My Dataset's __getitem__() snippet (truncated):
def __getitem__(self, index: int):
    ...
    target = {}
    target["boxes"] = torch.as_tensor(boxes, dtype=torch.float32)
    target["labels"] = torch.as_tensor(labels, dtype=torch.int64)
    target["image_id"] = torch.as_tensor([int(image_id)], dtype=torch.int64)
    target["area"] = torch.as_tensor(area, dtype=torch.float32)
    target["iscrowd"] = torch.as_tensor(iscrowd, dtype=torch.int64)
    image = torchvision.transforms.ToTensor()(image)
    return image, target
You should load your model on the GPU device as well:
model.to(device)
Note that torch.nn.Module.to is an in-place operation.
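Applied to the training snippet above, the order matters: model.to(device) is called before the box predictor is replaced, so the freshly constructed FastRCNNPredictor's parameters stay on the CPU. A minimal sketch of the fixed ordering:

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
in_features = model.roi_heads.box_predictor.cls_score.in_features
# Swap in the new head first...
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
# ...then move the whole model, new head included, to the device.
model.to(device)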

Neural network keep predicting the same number

I have a ROS application where a camera node sends an image via a service to a neural network node. The training and validation dataset I use is the MNIST database. It should be very easy to predict a number, but the neural network returns the same number for every single service request.
ai_service.py
class AiService():
    def __init__(self, save_path):
        self.batch_size = 2800
        self.epochs = 25
        self.learning_rate = 0.01
        self.training_data = torch.utils.data.DataLoader(
            datasets.MNIST(root='./data', train=True, download=True,
                           transform=transforms.Compose([transforms.ToTensor(),
                                                         transforms.Normalize((0.1307,), (0.3081,))])),
            200, shuffle=True)
        self.validation_data = torch.utils.data.DataLoader(
            datasets.MNIST(root='./data', train=False, download=True,
                           transform=transforms.Compose([transforms.ToTensor(),
                                                         transforms.Normalize((0.1307,), (0.3081,))])),
            200, shuffle=True)
        ...

    # Function to train on the MNIST dataset.
    def training(self):
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(self.model.parameters(), self.learning_rate)
        start_time = time()
        for epoch in range(self.epochs):
            running_loss = 0
            # training phase
            for images, labels in self.training_data:
                optimizer.zero_grad()
                image, label = images.to(self.device), labels.to(self.device)
                output = self.model(image)
                loss = criterion(output, label)
                loss.backward()
                optimizer.step()  # optimizing weights
                running_loss += loss.item()
            else:
                print("Epoch {} - Training loss: {:.10f}".format(epoch, running_loss / len(self.training_data)))
        print("\nTraining Time (in minutes): {:.2f} =".format((time() - start_time) / 60))

    def validating(self, request_image):
        self.model.eval()
        tensor_image = self.image_to_tensor(request_image)
        with torch.no_grad():
            output = self.model(tensor_image)
        return output.cpu().data.numpy().argmax()

    def image_to_tensor(self, request_image):
        return transforms.ToTensor()(self.cv_bridge.imgmsg_to_cv2(request_image, 'mono8'))
neural_network.py
class NeuralNetwork(nn.Module):
    # Initializes the neural network by setting up the layers.
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.input_layer = nn.Sequential(nn.Linear(28*28, 512))
        self.hidden_layer1 = nn.Linear(512, 254)
        self.hidden_layer2 = nn.Linear(254, 128)
        self.output_layer = nn.Linear(128, 10)

    def forward(self, x):
        x = self.flatten(x)
        x = F.relu(self.input_layer(x))
        x = F.relu(self.hidden_layer1(x))
        x = F.relu(self.hidden_layer2(x))
        x = self.output_layer(x)
        return F.log_softmax(x, 1)
I get a training accuracy of: [training log screenshot omitted]
My output: [screenshot omitted]
My camera image: [camera image omitted]
Could it be that the picture is not recognized because of the resizing and grayscaling? I added imshow to the image_to_tensor(self, request_image) function, and the image is barely recognisable.
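(For illustration of that suspicion only: the training DataLoaders above apply Normalize((0.1307,), (0.3081,)), while image_to_tensor() only applies ToTensor(), and the camera image is not 28x28. A hypothetical preprocessing that mirrors the training pipeline, with the 28x28 size taken from the nn.Linear(28*28, 512) input layer, might look like this:)

def image_to_tensor(self, request_image):
    cv_image = self.cv_bridge.imgmsg_to_cv2(request_image, 'mono8')
    preprocess = transforms.Compose([
        transforms.ToPILImage(),                     # mono8 numpy array -> PIL 'L' image
        transforms.Resize((28, 28)),                 # match the MNIST input size
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),  # same stats as training
    ])
    return preprocess(cv_image).unsqueeze(0)         # add a batch dimension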

Pytorch convnet isn't learning

I'm new to PyTorch and I'm trying to build a model for a Kaggle competition. I used a pre-trained ResNet, but the training and validation loss don't decrease. I suspect I did something wrong in my implementation:
#================================================================================
class TransferResnet(nn.Module):
    def __init__(self, classes=4):
        super().__init__()
        # Use a pretrained model
        self.network = models.resnet34(pretrained=True)
        # Replace last layer
        num_ftrs = self.network.fc.in_features
        self.network.fc = nn.Sequential(nn.Linear(num_ftrs, 128),
                                        nn.ReLU(),
                                        nn.Dropout(0.50),
                                        nn.Linear(128, classes))

    def forward(self, xb):
        out = self.network(xb)
        return out

    def feed_to_network(self, batch):
        images, labels = batch
        out = self(images)
        out = F.softmax(out, dim=1)
        loss = F.cross_entropy(out, labels)
        return loss, out
#======================================================
def get_scores(labels, prediction, loss=None):
    "Return classification scores"
    accuracy = accuracy_score(labels, prediction)
    f1 = f1_score(labels, prediction,
                  average='weighted', zero_division=0)
    precision = precision_score(labels, prediction,
                                average='weighted', zero_division=0)
    recall = recall_score(labels, prediction,
                          average='weighted', zero_division=0)
    if loss:
        return [accuracy, f1, precision, recall, loss]
    else:
        return [accuracy, f1, precision, recall]

def get_predictions(model, loader):
    """This function takes a model and a data loader,
    returns the list of losses, the predictions and the labels"""
    with torch.no_grad():
        model.eval()
        losses = []
        predictions = []
        labels = []
        for batch in loader:
            loss, out = model.feed_to_network(batch)
            predictions += torch.max(out, dim=1)[1].tolist()
            labels += batch[1].tolist()
            losses.append(loss.item())
    return labels, predictions, sum(losses)/len(losses)
#=================================================================
def fit(epochs, model, train_loader, val_loader,
        opt_func=torch.optim.Adam, lr=3e-4, step_size=100):

    def get_parameter(optimizer, parameter="lr"):
        """Retrieve learning rate or parameter"""
        if parameter == 'lr':
            for param_group in optimizer.param_groups:
                return param_group['lr']

    torch.cuda.empty_cache()
    model.train()
    # Dataframes that will store the metrics
    train_metrics_df = pd.DataFrame(columns=['accuracy', 'f1', 'precision',
                                             'recall', 'loss'])
    valid_metrics_df = pd.DataFrame(columns=['accuracy', 'f1', 'precision',
                                             'recall', 'loss'])
    momentum_list = []
    lr_list = []
    optimizer = opt_func([{"params": model.network.fc.parameters(), "lr": lr},
                          {"params": model.network.layer4.parameters(), "lr": lr/2},
                          {"params": model.network.layer3.parameters(), "lr": lr/4},
                          {"params": model.network.layer2.parameters(), "lr": lr/6},
                          {"params": model.network.layer1.parameters(), "lr": lr/8}], lr)
    for epoch in range(epochs):
        # Training phase
        train_label = []
        train_prediction = []
        train_losses = []
        for batch in tqdm(train_loader):
            loss, out = model.feed_to_network(batch)
            loss.backward()
            #momentum_list.append(get_parameter(optimizer, parameter="momentum"))
            lr_list.append(get_parameter(optimizer, parameter="lr"))
            optimizer.step()
            optimizer.zero_grad()
            # Extract labels, predictions and loss of the training set
            train_prediction += torch.max(out, dim=1)[1].tolist()
            train_label += batch[1].tolist()
            train_losses.append(loss.item())
        # Evaluation phase
        val_labels, val_predictions, val_loss = get_predictions(model, val_loader)
        train_metrics_df.loc[epoch] = get_scores(train_label, train_prediction,
                                                 loss=sum(train_losses)/len(train_losses))
        valid_metrics_df.loc[epoch] = get_scores(val_labels, val_predictions,
                                                 loss=val_loss)
        print_epoch_trainLoss = train_metrics_df.iloc[epoch]["loss"]
        print_epoch_validLoss = valid_metrics_df.iloc[epoch]["loss"]
        print_epoch_validAccu = valid_metrics_df.iloc[epoch]["accuracy"]
        print_epoch_trainAccu = train_metrics_df.iloc[epoch]["accuracy"]
        print(f"Epoch: {epoch+1}, train loss: {print_epoch_trainLoss:.2f}, "
              f"validation loss: {print_epoch_validLoss:.2f}, "
              f"validation accuracy: {print_epoch_validAccu:.2f}, "
              f"training accuracy: {print_epoch_trainAccu:.2f}")
    return train_metrics_df, valid_metrics_df, (momentum_list, lr_list)
All the images are normalized, cropped to proper dimensions (490x490), and some data augmentation is performed (random flips, rotations, etc.). All this code is executed on a GPU using Kaggle notebooks (my own GPU is not enough for this dataset). This is my first implementation of a CNN and I do not know what I did wrong. I also tried training the classifier with a learning rate of 0.1, but the loss does not decrease.

Saving/Loading PyTorch Models Wrapped in sklearn-compatible Estimators

I have written an autoencoder using PyTorch and I have rolled it into a custom sklearn BaseEstimator. I normally train the estimator on a machine with a GPU and save it for later evaluation using pickle. If I try to load, on a CPU-only machine, an estimator whose model was saved while on the GPU, I get the following error:
RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location='cpu' to map your storages to the CPU.
Is there a way to force the PyTorch model to be moved to the CPU before pickling the estimator without an explicit call?
Is there a way to unpickle an estimator that was saved while the model was on the GPU?
The following is an example of my PyTorch model and sklearn-compatible estimator, along with an example of how I am trying to save and load my models.
PyTorch Model
import torch.nn as nn

class _AutoEncoder(nn.Module):
    def __init__(self, input_dim, output_dim, encoder_dim=4):
        super(_AutoEncoder, self).__init__()
        hidden_dim = int((input_dim + encoder_dim) / 2)
        layers = []
        layers.append(nn.Linear(input_dim, hidden_dim))
        layers.append(nn.Linear(hidden_dim, encoder_dim))
        self.encoder = nn.Sequential(*layers)
        layers = []
        layers.append(nn.Linear(encoder_dim, hidden_dim))
        layers.append(nn.Linear(hidden_dim, output_dim))
        self.decoder = nn.Sequential(*layers)

    def forward(self, X):
        return self.decoder(self.encoder(X))
sklearn Compatible Estimator
import warnings
import inspect
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import utils as sk_utils
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
class AutoEncoder(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        encoder_dim=4,
        n_epochs=200,
        batch_size=None,
        shuffle=True,
        use_cuda=False
    ):
        super(AutoEncoder, self).__init__()
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        for arg, val in values.items():
            setattr(self, arg, val)
        if use_cuda:
            if torch.cuda.is_available():
                self.device = torch.device("cuda")
            else:
                self.device = torch.device("cpu")
                warnings.warn("cuda not available", UserWarning)
        else:
            self.device = torch.device("cpu")
    def fit(self, X, y=None):
        # X, y = sk_utils.check_X_y(X, y, ensure_2d=False, allow_nd=True)
        self._model = self._train_classifier(X, y)
        return self

    def transform(self, X):
        sk_utils.validation.check_is_fitted(self, ['_model'])
        X = sk_utils.check_array(X)
        X = torch.from_numpy(X.astype(np.float32)).to(self.device)
        with torch.no_grad():
            self._model.eval()
            output = self._model.forward(X)
        return output.cpu().numpy()

    def encode(self, X):
        sk_utils.validation.check_is_fitted(self, ['_model'])
        X = sk_utils.check_array(X)
        X = torch.from_numpy(X.astype(np.float32)).to(self.device)
        with torch.no_grad():
            self._model.eval()
            output = self._model.encoder(X)
        return output.cpu().numpy()

    def decode(self, X):
        sk_utils.validation.check_is_fitted(self, ['_model'])
        X = sk_utils.check_array(X)
        X = torch.from_numpy(X.astype(np.float32)).to(self.device)
        with torch.no_grad():
            self._model.eval()
            output = self._model.decoder(X)
        return output.cpu().numpy()
    def _train_classifier(self, x_train, y_train):
        x_train = torch.from_numpy(x_train.astype(np.float32)).to(self.device)
        y_train = torch.from_numpy(y_train.astype(np.float32)).to(self.device)
        input_dim = x_train.shape[-1]
        output_dim = y_train.shape[-1]
        model = _AutoEncoder(input_dim, output_dim, encoder_dim=self.encoder_dim).to(self.device)
        loss_function = nn.MSELoss()
        optimizer = optim.Adam(model.parameters())
        print(model)
        if self.batch_size is None:
            return self._batch_train_simple_classifier(x_train, y_train, model, loss_function, optimizer)
        else:
            return self._minibatch_train_simple_classifier(x_train, y_train, model, loss_function, optimizer)

    def _batch_train_simple_classifier(self, x_train, y_train, model, loss_function, optimizer):
        for epoch in range(1, self.n_epochs+1):
            model.train()
            optimizer.zero_grad()
            outputs = model.forward(x_train)
            loss = loss_function(outputs, y_train)
            loss.backward()
            optimizer.step()
            if epoch % 10 == 0 or epoch == self.n_epochs:
                message = "Train Epoch: {:5d}, Loss: {:15.6f}".format(
                    epoch,
                    loss.item()
                )
                print(message)
        return model
    def _minibatch_train_simple_classifier(self, x_train, y_train, model, loss_function, optimizer):
        train_data = torch.utils.data.TensorDataset(x_train, y_train)
        train_loader = torch.utils.data.DataLoader(train_data, batch_size=self.batch_size, shuffle=self.shuffle)
        for epoch in range(1, self.n_epochs+1):
            for data, target in train_loader:
                model.train()
                optimizer.zero_grad()
                outputs = model.forward(data)
                loss = loss_function(outputs, target)
                loss.backward()
                optimizer.step()
            if epoch % 10 == 0 or epoch == self.n_epochs:
                model.eval()
                outputs = model.forward(x_train)
                loss = loss_function(outputs, y_train)
                message = "Train Epoch: {:5d}, Loss: {:15.6f}".format(
                    epoch,
                    loss.item()
                )
                print(message)
        return model
Training
This is normally done on a machine with a GPU.
from sklearn import datasets as sk_datasets
digits = sk_datasets.load_digits(n_class=10, return_X_y=False)
data = digits.data
ae = AutoEncoder(
    encoder_dim=2,
    n_epochs=100,
    batch_size=128,
    shuffle=True,
    use_cuda=True
)
data_fitted = ae.fit_transform(data, data)
Saving the Estimator
I'd like to find a way to have the PyTorch model moved to the CPU before it is saved, without an explicit call. Maybe a function on the AutoEncoder class that gets called as it is pickled?
with open("autoencoder.pkl", "wb") as fp:
# ae._model needs to be moved to the CPU here.
# I don't want to have to call ae._model.cpu() explicitly
pickle.dump(ae, fp)
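(One mechanism that fits the "function that gets called as it is pickled" idea, sketched here as an untested assumption: the pickle protocol calls __getstate__ when dumping and __setstate__ when loading, so two methods on AutoEncoder could move the model to the CPU at save time:)

    def __getstate__(self):
        # Called by pickle when dumping. Note Module.cpu() is in-place,
        # so this also moves the live model off the GPU.
        state = self.__dict__.copy()
        if '_model' in state:
            state['_model'] = state['_model'].cpu()
        state['device'] = torch.device('cpu')
        return state

    def __setstate__(self, state):
        # Called by pickle when loading; the model arrives CPU-resident.
        self.__dict__.update(state)

With these two methods, pickle.dump(ae, fp) would need no explicit .cpu() call, and the resulting file should load on a CPU-only machine.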
Loading
I cannot figure out how to load the estimator on a machine without a GPU if it was saved while the PyTorch model was still on the GPU.
# This gives an error if the model was saved while on the GPU,
# and a GPU is not available when loading.
with open("autoencoder.pkl", "rb") as fp:
    model = pickle.load(fp)

# This gives a similar error. I also would not expect it to
# work, since the pickle file contains an sklearn estimator
# wrapping a PyTorch model.
with open("autoencoder.pkl", "rb") as fp:
    torch.load(fp, map_location="cpu")
