Can anyone help me?
I am facing the error "TypeError: h5py objects cannot be pickled" while running train.py from https://github.com/RoyalSkye/Image-Caption
(my OS is Windows 10)
train.py
#!/usr/bin/env python3
import h5py
import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import *
from transformer import *
from datasets import *
from utils import *
from nltk.translate.bleu_score import corpus_bleu
import argparse
import codecs
import numpy as np
def train(args, train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch):
"""
Performs one epoch's training.
:param train_loader: DataLoader for training data
:param encoder: encoder model
:param decoder: decoder model
:param criterion: loss layer
:param encoder_optimizer: optimizer to update encoder's weights (if fine-tuning)
:param decoder_optimizer: optimizer to update decoder's weights
:param epoch: epoch number
"""
    decoder.train()  # train mode (dropout and batchnorm are used)
encoder.train()
batch_time = AverageMeter() # forward prop. + back prop. time
data_time = AverageMeter() # data loading time
losses = AverageMeter() # loss (per word decoded)
top5accs = AverageMeter() # top5 accuracy
start = time.time()
# Batches
for i, (imgs, caps, caplens) in enumerate(train_loader):
data_time.update(time.time() - start)
# Move to GPU, if available
imgs = imgs.to(device)
caps = caps.to(device)
caplens = caplens.to(device)
# Forward prop.
imgs = encoder(imgs)
# imgs: [batch_size, 14, 14, 2048]
# caps: [batch_size, 52]
# caplens: [batch_size, 1]
scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)
# Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
targets = caps_sorted[:, 1:]
# Remove timesteps that we didn't decode at, or are pads
# pack_padded_sequence is an easy trick to do this
scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data
# print(scores.size())
# print(targets.size())
# Calculate loss
loss = criterion(scores, targets)
# Add doubly stochastic attention regularization
# Second loss, mentioned in paper "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention"
# https://arxiv.org/abs/1502.03044
# In section 4.2.1 Doubly stochastic attention regularization: We know the weights sum to 1 at a given timestep.
# But we also encourage the weights at a single pixel p to sum to 1 across all timesteps T.
# This means we want the model to attend to every pixel over the course of generating the entire sequence.
# Therefore, we want to minimize the difference between 1 and the sum of a pixel's weights across all timesteps.
if args.decoder_mode == "lstm":
loss += args.alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
elif args.decoder_mode == "transformer":
dec_alphas = alphas["dec_enc_attns"]
alpha_trans_c = args.alpha_c / (args.n_heads * args.decoder_layers)
for layer in range(args.decoder_layers): # args.decoder_layers = len(dec_alphas)
cur_layer_alphas = dec_alphas[layer] # [batch_size, n_heads, 52, 196]
for h in range(args.n_heads):
cur_head_alpha = cur_layer_alphas[:, h, :, :]
loss += alpha_trans_c * ((1. - cur_head_alpha.sum(dim=1)) ** 2).mean()
# Back prop.
decoder_optimizer.zero_grad()
if encoder_optimizer is not None:
encoder_optimizer.zero_grad()
loss.backward()
# Clip gradients
if args.grad_clip is not None:
clip_gradient(decoder_optimizer, args.grad_clip)
if encoder_optimizer is not None:
clip_gradient(encoder_optimizer, args.grad_clip)
# Update weights
decoder_optimizer.step()
if encoder_optimizer is not None:
encoder_optimizer.step()
# Keep track of metrics
top5 = accuracy(scores, targets, 5)
losses.update(loss.item(), sum(decode_lengths))
top5accs.update(top5, sum(decode_lengths))
batch_time.update(time.time() - start)
start = time.time()
if i % args.print_freq == 0:
print("Epoch: {}/{} step: {}/{} Loss: {} AVG_Loss: {} Top-5 Accuracy: {} Batch_time: {}s".format(epoch+1, args.epochs, i+1, len(train_loader), losses.val, losses.avg, top5accs.val, batch_time.val))
def validate(args, val_loader, encoder, decoder, criterion):
"""
Performs one epoch's validation.
:param val_loader: DataLoader for validation data.
:param encoder: encoder model
:param decoder: decoder model
:param criterion: loss layer
:return: score_dict {'Bleu_1': 0., 'Bleu_2': 0., 'Bleu_3': 0., 'Bleu_4': 0., 'METEOR': 0., 'ROUGE_L': 0., 'CIDEr': 1.}
"""
decoder.eval() # eval mode (no dropout or batchnorm)
if encoder is not None:
encoder.eval()
batch_time = AverageMeter()
losses = AverageMeter()
top5accs = AverageMeter()
start = time.time()
references = list() # references (true captions) for calculating BLEU-4 score
hypotheses = list() # hypotheses (predictions)
# explicitly disable gradient calculation to avoid CUDA memory error
with torch.no_grad():
# Batches
for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):
# Move to device, if available
imgs = imgs.to(device)
caps = caps.to(device)
caplens = caplens.to(device)
# Forward prop.
if encoder is not None:
imgs = encoder(imgs)
scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)
# Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
targets = caps_sorted[:, 1:]
# Remove timesteps that we didn't decode at, or are pads
# pack_padded_sequence is an easy trick to do this
scores_copy = scores.clone()
scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data
# Calculate loss
loss = criterion(scores, targets)
# Add doubly stochastic attention regularization
if args.decoder_mode == "lstm":
loss += args.alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
elif args.decoder_mode == "transformer":
dec_alphas = alphas["dec_enc_attns"]
alpha_trans_c = args.alpha_c / (args.n_heads * args.decoder_layers)
for layer in range(args.decoder_layers): # args.decoder_layers = len(dec_alphas)
cur_layer_alphas = dec_alphas[layer] # [batch_size, n_heads, 52, 196]
for h in range(args.n_heads):
cur_head_alpha = cur_layer_alphas[:, h, :, :]
loss += alpha_trans_c * ((1. - cur_head_alpha.sum(dim=1)) ** 2).mean()
# Keep track of metrics
losses.update(loss.item(), sum(decode_lengths))
top5 = accuracy(scores, targets, 5)
top5accs.update(top5, sum(decode_lengths))
batch_time.update(time.time() - start)
start = time.time()
# Store references (true captions), and hypothesis (prediction) for each image
# If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
# references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
# References
allcaps = allcaps[sort_ind] # because images were sorted in the decoder
for j in range(allcaps.shape[0]):
img_caps = allcaps[j].tolist()
img_captions = list(
map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
img_caps)) # remove <start> and pads
references.append(img_captions)
# Hypotheses
_, preds = torch.max(scores_copy, dim=2)
preds = preds.tolist()
temp_preds = list()
for j, p in enumerate(preds):
temp_preds.append(preds[j][:decode_lengths[j]]) # remove pads
preds = temp_preds
hypotheses.extend(preds)
assert len(references) == len(hypotheses)
# Calculate BLEU-1~4 scores
# metrics = {}
# weights = (1.0 / 1.0,)
# metrics["bleu1"] = corpus_bleu(references, hypotheses, weights)
# weights = (1.0/2.0, 1.0/2.0,)
# metrics["bleu2"] = corpus_bleu(references, hypotheses, weights)
# weights = (1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0,)
# metrics["bleu3"] = corpus_bleu(references, hypotheses, weights)
# metrics["bleu4"] = corpus_bleu(references, hypotheses)
# Calculate BLEU1~4, METEOR, ROUGE_L, CIDEr scores
metrics = get_eval_score(references, hypotheses)
print("EVA LOSS: {} TOP-5 Accuracy {} BLEU-1 {} BLEU2 {} BLEU3 {} BLEU-4 {} METEOR {} ROUGE_L {} CIDEr {}".format
(losses.avg, top5accs.avg, metrics["Bleu_1"], metrics["Bleu_2"], metrics["Bleu_3"], metrics["Bleu_4"],
metrics["METEOR"], metrics["ROUGE_L"], metrics["CIDEr"]))
return metrics
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Image_Captioning')
# Data parameters
parser.add_argument('--data_folder', default="./dataset/generated_data",
help='folder with data files saved by create_input_files.py.')
parser.add_argument('--data_name', default="coco_5_cap_per_img_5_min_word_freq",
help='base name shared by data files.')
# Model parameters
parser.add_argument('--emb_dim', type=int, default=300, help='dimension of word embeddings.')
parser.add_argument('--attention_dim', type=int, default=512, help='dimension of attention linear layers.')
parser.add_argument('--decoder_dim', type=int, default=512, help='dimension of decoder RNN.')
parser.add_argument('--n_heads', type=int, default=8, help='Multi-head attention.')
parser.add_argument('--dropout', type=float, default=0.1, help='dropout')
parser.add_argument('--decoder_mode', default="transformer", help='which model does decoder use?') # lstm or transformer
parser.add_argument('--attention_method', default="ByPixel", help='which attention method to use?') # ByPixel or ByChannel
parser.add_argument('--encoder_layers', type=int, default=2, help='the number of layers of encoder in Transformer.')
parser.add_argument('--decoder_layers', type=int, default=6, help='the number of layers of decoder in Transformer.')
# Training parameters
parser.add_argument('--epochs', type=int, default=100,
help='number of epochs to train for (if early stopping is not triggered).')
parser.add_argument('--stop_criteria', type=int, default=25, help='training stop if epochs_since_improvement == stop_criteria')
parser.add_argument('--batch_size', type=int, default=32, help='batch_size')
parser.add_argument('--print_freq', type=int, default=100, help='print training/validation stats every __ batches.')
parser.add_argument('--workers', type=int, default=1, help='for data-loading; right now, only 1 works with h5pys.')
parser.add_argument('--encoder_lr', type=float, default=1e-4, help='learning rate for encoder if fine-tuning.')
parser.add_argument('--decoder_lr', type=float, default=1e-4, help='learning rate for decoder.')
    parser.add_argument('--grad_clip', type=float, default=5., help='clip gradients at this absolute value.')
parser.add_argument('--alpha_c', type=float, default=1.,
help='regularization parameter for doubly stochastic attention, as in the paper.')
    parser.add_argument('--fine_tune_encoder', type=bool, default=False, help='whether to fine-tune the encoder or not')
    parser.add_argument('--fine_tune_embedding', type=bool, default=False, help='whether to fine-tune word embeddings or not')
parser.add_argument('--checkpoint', default=None, help='path to checkpoint, None if none.')
parser.add_argument('--embedding_path', default=None, help='path to pre-trained word Embedding.')
args = parser.parse_args()
# load checkpoint, these parameters can't be modified
final_args = {"emb_dim": args.emb_dim,
"attention_dim": args.attention_dim,
"decoder_dim": args.decoder_dim,
"n_heads": args.n_heads,
"dropout": args.dropout,
"decoder_mode": args.decoder_mode,
"attention_method": args.attention_method,
"encoder_layers": args.encoder_layers,
"decoder_layers": args.decoder_layers}
start_epoch = 0
best_bleu4 = 0. # BLEU-4 score right now
epochs_since_improvement = 0 # keeps track of number of epochs since there's been an improvement in validation BLEU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # sets device for model and PyTorch tensors
    cudnn.benchmark = True  # set to true only if inputs to the model are of fixed size; otherwise there is a lot of computational overhead
print(device)
# Read word map
word_map_file = os.path.join(args.data_folder, 'WORDMAP_' + args.data_name + '.json')
with open(word_map_file, 'r') as j:
word_map = json.load(j)
# Initialize / load checkpoint
if args.checkpoint is None:
encoder = CNN_Encoder(attention_method=args.attention_method)
encoder.fine_tune(args.fine_tune_encoder)
encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
lr=args.encoder_lr) if args.fine_tune_encoder else None
if args.decoder_mode == "lstm":
decoder = DecoderWithAttention(attention_dim=args.attention_dim,
embed_dim=args.emb_dim,
decoder_dim=args.decoder_dim,
vocab_size=len(word_map),
dropout=args.dropout)
elif args.decoder_mode == "transformer":
decoder = Transformer(vocab_size=len(word_map), embed_dim=args.emb_dim, encoder_layers=args.encoder_layers,
decoder_layers=args.decoder_layers, dropout=args.dropout,
attention_method=args.attention_method, n_heads=args.n_heads)
decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
lr=args.decoder_lr)
# load pre-trained word embedding
if args.embedding_path is not None:
all_word_embeds = {}
for i, line in enumerate(codecs.open(args.embedding_path, 'r', 'utf-8')):
s = line.strip().split()
all_word_embeds[s[0]] = np.array([float(i) for i in s[1:]])
# change emb_dim
args.emb_dim = list(all_word_embeds.values())[-1].size
word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (len(word_map), args.emb_dim))
for w in word_map:
if w in all_word_embeds:
word_embeds[word_map[w]] = all_word_embeds[w]
elif w.lower() in all_word_embeds:
word_embeds[word_map[w]] = all_word_embeds[w.lower()]
else:
# <pad> <start> <end> <unk>
embedding_i = torch.ones(1, args.emb_dim)
torch.nn.init.xavier_uniform_(embedding_i)
word_embeds[word_map[w]] = embedding_i
word_embeds = torch.FloatTensor(word_embeds).to(device)
decoder.load_pretrained_embeddings(word_embeds)
decoder.fine_tune_embeddings(args.fine_tune_embedding)
print('Loaded {} pre-trained word embeddings.'.format(len(word_embeds)))
else:
checkpoint = torch.load(args.checkpoint, map_location=str(device))
start_epoch = checkpoint['epoch'] + 1
epochs_since_improvement = checkpoint['epochs_since_improvement']
best_bleu4 = checkpoint['metrics']["Bleu_4"]
encoder = checkpoint['encoder']
encoder_optimizer = checkpoint['encoder_optimizer']
decoder = checkpoint['decoder']
decoder_optimizer = checkpoint['decoder_optimizer']
decoder.fine_tune_embeddings(args.fine_tune_embedding)
# load final_args from checkpoint
final_args = checkpoint['final_args']
for key in final_args.keys():
args.__setattr__(key, final_args[key])
if args.fine_tune_encoder is True and encoder_optimizer is None:
print("Encoder_Optimizer is None, Creating new Encoder_Optimizer!")
encoder.fine_tune(args.fine_tune_encoder)
encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
lr=args.encoder_lr)
# Move to GPU, if available
decoder = decoder.to(device)
encoder = encoder.to(device)
print("encoder_layers {} decoder_layers {} n_heads {} dropout {} attention_method {} encoder_lr {} "
"decoder_lr {} alpha_c {}".format(args.encoder_layers, args.decoder_layers, args.n_heads, args.dropout,
args.attention_method, args.encoder_lr, args.decoder_lr, args.alpha_c))
print(encoder)
print(decoder)
# Loss function
criterion = nn.CrossEntropyLoss().to(device)
# Custom dataloaders
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    # pin_memory: If True, the data loader will copy Tensors into CUDA pinned memory before returning them,
    # which speeds up host-to-GPU transfers (custom batch types require a custom pin_memory implementation).
train_loader = torch.utils.data.DataLoader(
CaptionDataset(args.data_folder, args.data_name, 'TRAIN', transform=transforms.Compose([normalize])),
batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
val_loader = torch.utils.data.DataLoader(
CaptionDataset(args.data_folder, args.data_name, 'VAL', transform=transforms.Compose([normalize])),
batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
# Epochs
for epoch in range(start_epoch, args.epochs):
# Decay learning rate if there is no improvement for 5 consecutive epochs, and terminate training after 25
# 8 20
if epochs_since_improvement == args.stop_criteria:
print("the model has not improved in the last {} epochs".format(args.stop_criteria))
break
if epochs_since_improvement > 0 and epochs_since_improvement % 5 == 0:
adjust_learning_rate(decoder_optimizer, 0.8)
if args.fine_tune_encoder and encoder_optimizer is not None:
print(encoder_optimizer)
adjust_learning_rate(encoder_optimizer, 0.8)
# One epoch's training
train(args, train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion,
encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer, epoch=epoch)
# One epoch's validation
metrics = validate(args, val_loader=val_loader, encoder=encoder, decoder=decoder, criterion=criterion)
recent_bleu4 = metrics["Bleu_4"]
# Check if there was an improvement
is_best = recent_bleu4 > best_bleu4
best_bleu4 = max(recent_bleu4, best_bleu4)
if not is_best:
epochs_since_improvement += 1
print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
else:
epochs_since_improvement = 0
# Save checkpoint
save_checkpoint(args.data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
decoder_optimizer, metrics, is_best, final_args)
Traceback (most recent call last):
  File D:\COCO\imge_captioning_transform_github\3\Image-Caption-master\train.py:394 in <module>
    train(args, train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion,
  File D:\COCO\imge_captioning_transform_github\3\Image-Caption-master\train.py:44 in train
    for i, (imgs, caps, caplens) in enumerate(train_loader):
  File ~\anaconda3\envs\my_envir_gpu\lib\site-packages\torch\utils\data\dataloader.py:368 in __iter__
    return self._get_iterator()
  File ~\anaconda3\envs\my_envir_gpu\lib\site-packages\torch\utils\data\dataloader.py:314 in _get_iterator
    return _MultiProcessingDataLoaderIter(self)
  File ~\anaconda3\envs\my_envir_gpu\lib\site-packages\torch\utils\data\dataloader.py:927 in __init__
    w.start()
  File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\process.py:121 in start
    self._popen = self._Popen(self)
  File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\context.py:224 in _Popen
    return _default_context.get_context().Process._Popen(process_obj)
  File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\context.py:327 in _Popen
    return Popen(process_obj)
  File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\popen_spawn_win32.py:93 in __init__
    reduction.dump(process_obj, to_child)
  File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\reduction.py:60 in dump
    ForkingPickler(file, protocol).dump(obj)
  File ~\anaconda3\envs\my_envir_gpu\lib\site-packages\h5py\_hl\base.py:368 in __getnewargs__
    raise TypeError("h5py objects cannot be pickled")
TypeError: h5py objects cannot be pickled
2022-06-30 17:24:41.206091: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-30 17:24:41.525476: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3497 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
2022-06-30 17:24:44.486920: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
type_id: TFT_PRODUCT
args {
type_id: TFT_TENSOR
args {
type_id: TFT_LEGACY_VARIANT
}
}
}
is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
type_id: TFT_PRODUCT
args {
type_id: TFT_TENSOR
args {
type_id: TFT_INT32
}
}
}
while inferring type of node 'cond_40/output/_25'
2022-06-30 17:24:45.077383: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "C:\Users\MSI\anaconda3\envs\my_envir_gpu\lib\multiprocessing\spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "C:\Users\MSI\anaconda3\envs\my_envir_gpu\lib\multiprocessing\spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
EOFError: Ran out of input
I am using Python 3.9 and PyTorch 1.10 with CUDA 11.3 (Windows 10).
Thanks.
I have tried num_workers=0, but I still get the same error.
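For reference, a common workaround for this on Windows (a hedged sketch, not the repo's actual CaptionDataset) is to stop creating the h5py.File handle in the dataset's __init__: spawned DataLoader workers pickle the dataset object, and open h5py objects cannot be pickled, so the file should be opened lazily on first access. The 'images' dataset name below is an assumption:

import h5py
import torch
from torch.utils.data import Dataset

class LazyH5Dataset(Dataset):
    """Opens the HDF5 file on first __getitem__, so the dataset object stays picklable."""
    def __init__(self, h5_path):
        self.h5_path = h5_path
        self.h5 = None  # no h5py object yet, so spawned workers can pickle this dataset
        with h5py.File(h5_path, 'r') as f:
            self.length = len(f['images'])  # 'images' is an assumed dataset name

    def __getitem__(self, i):
        if self.h5 is None:  # each worker process opens its own handle on first access
            self.h5 = h5py.File(self.h5_path, 'r')
        return torch.from_numpy(self.h5['images'][i])

    def __len__(self):
        return self.length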
Related
I have the following code to train a transformer-based model (Hugging Face) for a multi-head regression task (I call it multi-head because the model predicts multiple output scores, not just one).
# select device
device = 'cuda' if cuda.is_available() else 'cpu'
print("DEVICE: ", device)
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
TRAIN_EPOCHS = 1
TEST_EPOCHS = 1
LEARNING_RATE = 2e-05
REG_DROPOUT = 0.1
DISPLAY_STEP_THRESHOLD = 100
MODEL_NAME_CHECKPOINT = 'bert-base-uncased'
MODEL_FOLDER = 'autotune_multihead_regression_model'
# *************** DATA *********************
# load data
train_data = pd.read_csv(f"derived_data/traindev_fold{args.fold}.csv")
test_data = pd.read_csv(f"derived_data/test_fold{args.fold}.csv")
train_data['labels'] = train_data[train_data.columns[2:]].values.tolist()
test_data['labels'] = test_data[test_data.columns[2:]].values.tolist()
print("train data shape: ", train_data.shape)
print("test data shape: ", test_data.shape)
# make datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
dataset = DatasetDict({
'train': train_dataset,
'test': test_dataset
})
# initialize tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_CHECKPOINT)
def preprocess_function(examples):
return tokenizer(examples["full_text"], truncation=True, padding="max_length")
# apply the preprocessing on the entire dataset
encoded_dataset = dataset.map(preprocess_function, batched=True)
# ******************** FINETUNING ********************#
class BERTClass(torch.nn.Module):
def __init__(self):
super(BERTClass, self).__init__()
self.l0 = transformers.BertModel.from_pretrained(MODEL_NAME_CHECKPOINT)
self.l1 = torch.nn.Linear(768, 6)
def forward(self, input_ids, attention_mask, labels):
""""Override the function forward. Note that keys not appearing here will be removed by trainer.
It does not matter if trainer is not used.
"""
# _, output_0 = self.l0(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
_, output_0 = self.l0(input_ids, attention_mask=attention_mask, return_dict=False)
output = self.l1(output_0)
return output
def model_init():
model = BERTClass()
return model
args = TrainingArguments(
MODEL_FOLDER,
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=TRAIN_BATCH_SIZE,
per_device_eval_batch_size=VALID_BATCH_SIZE,
num_train_epochs=5,
weight_decay=0.01,
load_best_model_at_end=True,
# this is an advantage in the sense that the last model is not necessarily the best one.
metric_for_best_model="mean_rmse",
logging_strategy="steps",
logging_steps=100,
push_to_hub=False
)
def mean_rmse(outputs, targets):
""""
:param
outputs: 2D list
targets: 2D list
:returns
a scalar real number
"""
delta = outputs - targets
delta = torch.sqrt((delta ** 2).mean(axis=0))
return delta.mean()
def compute_metrics(eval_pred):
predictions, labels = eval_pred
return {"mean_rmse": mean_rmse(predictions, labels)}
class RegressionTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.get("labels")
outputs = model(**inputs)
loss = torch.nn.MSELoss()(outputs.squeeze(), labels.squeeze())
return (loss, outputs) if return_outputs else loss
temp_dataset = encoded_dataset["train"].select(range(100))
trainer = RegressionTrainer(
model_init(),
args,
train_dataset=encoded_dataset["train"],
eval_dataset=temp_dataset,
tokenizer=tokenizer,
compute_metrics=compute_metrics
)
trainer.train()
In the code, I use a customized model because I want flexibility over the head. Also, I use Trainer to train the model because I want to use hyperparameter_search.
The target (labels) of the dataset is a (row) vector of 6 variables.
Now the training seems to be going well; I can see the loss decreasing.
However, the code crashes when it starts the evaluation.
In the code above, I use part of the training set for evaluation, and it throws:
***** Running Evaluation *****
Num examples = 100
Batch size = 8
Traceback (most recent call last):
File "autotune_multiregression_head_bert.py", line 152, in <module>
trainer.train()
File "/home/ubuntu/anaconda3/envs/pytorch_p37/lib/python3.7/site-packages/transformers/trainer.py", line 1504, in train
ignore_keys_for_eval=ignore_keys_for_eval,
File "/home/ubuntu/anaconda3/envs/pytorch_p37/lib/python3.7/site-packages/transformers/trainer.py", line 1834, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/pytorch_p37/lib/python3.7/site-packages/transformers/trainer.py", line 2052, in _maybe_log_save_evaluate
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/pytorch_p37/lib/python3.7/site-packages/transformers/trainer.py", line 2781, in evaluate
metric_key_prefix=metric_key_prefix,
File "/home/ubuntu/anaconda3/envs/pytorch_p37/lib/python3.7/site-packages/transformers/trainer.py", line 3059, in evaluation_loop
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
File "autotune_multiregression_head_bert.py", line 130, in compute_metrics
return {"mcrmse": mcrmse_fn(predictions, labels)}
File "autotune_multiregression_head_bert.py", line 123, in mcrmse_fn
delta = outputs - targets
ValueError: operands could not be broadcast together with shapes (87,6) (100,6)
I got the same error when evaluating on the test set. I assume some examples fail during evaluation and are not added to the final result, so the final shape of outputs is not consistent with targets? (I debugged and saw that outputs has 87 examples while targets has 100.)
What has gone wrong?
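One hedged reading of the numbers: 100 evaluation examples at batch size 8 give 13 batches, and 100 - 13 = 87, i.e. exactly one row is lost per batch. When compute_loss returns (loss, outputs) and outputs is a bare tensor rather than a dict/ModelOutput, Trainer.prediction_step does logits = outputs[1:], which on a tensor slices off the first row of every batch. If that is what is happening, returning a ModelOutput avoids it; a sketch of a revised forward, assuming the BERTClass above:

from transformers.modeling_outputs import SequenceClassifierOutput

def forward(self, input_ids, attention_mask, labels=None):
    _, pooled = self.l0(input_ids, attention_mask=attention_mask, return_dict=False)
    logits = self.l1(pooled)
    loss = None
    if labels is not None:
        loss = torch.nn.MSELoss()(logits, labels.float())
    # a dict-like ModelOutput lets the Trainer pick out loss and logits by name, not by position
    return SequenceClassifierOutput(loss=loss, logits=logits)

compute_loss would then read outputs.logits (or simply rely on the loss the model computes) instead of treating the output as a bare tensor.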
I'm trying to implement the Laplace posterior approximation on the last layer for the classification results obtained by a BERT model. I get an error regarding input size, and after I fix it by extracting just the embeddings and class labels from BERT to feed into Laplace, I get another bunch of errors regarding input dimensions that I don't know how to debug.
As this is something I didn't find on the internet, and it involves relatively new libraries, I will post just the first error I got, along with code that might help in debugging and useful links.
I will update the post if needed.
Of course, if someone knows how to implement the Laplace posterior approximation with BERT in some other library like scikit-learn or Trax, that would be helpful. Also, some other Transformer classification model with some other confidence approximation would be useful for me. Any help is appreciated!
Code:
# Import
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn
from transformers import BertTokenizer
from transformers import BertModel
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
import time
import os
#Toy Data
data_a_b_c = ["""category a. This is category a. In category a we talk about animals.
This category includes lions, fish, tigers, birds, elephants, mouses, dogs, cats, and all other animals."""] * 60 \
+ ["""category b. This is category b. In category b we talk about people. This category members are
Abraham Maslow, John Lennon, Drazen Petrovic, Nikola Tesla, Slavoljub Penkala, Nenad Bakic and Larry Page."""] * 60 \
+ ["""category c. This is category c. Category c is dedicated to car brands like Lamborgini, Rimac-Buggati, BMW, Mercedes,
Honda, Opel, Wolkswagen, and etc."""] * 60
label_0_1_2 = [0] * 60 + [1] * 60 + [2] * 60
d = {'text': data_a_b_c, 'labels': label_0_1_2}
df = pd.DataFrame(data=d)
print(df.head(3))
print(df.tail(3))
print(df.info())
# Parameters
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
batch_size = 2
learning_rate = 3e-4
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
labels = pd.Series(df.labels.values).to_dict()
num_classes = 3
print(f'Tokenizer: {tokenizer}, Batch size:{batch_size}, Learning rate:{learning_rate}, Epochs:{epochs}')
print('Device: ', device)
print('Number of possible classes: ', num_classes)
# Model Architecture
class TransformerModel(nn.Module):
def __init__(self, num_classes, dropout=0.5):
super(TransformerModel, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(768, num_classes)
self.relu = nn.ReLU()
def forward(self, input_id, mask):
_, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
dropout_output = self.dropout(pooled_output)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
return final_layer
# Prepare Data Function
def prepare_data(data, labels):
texts = tokenizer(data, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
input_ids = texts['input_ids']
attention_mask = texts['attention_mask']
train_dataset = TensorDataset(input_ids, attention_mask, torch.LongTensor(labels))
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
return dataloader
#Run Training Function
def run_training(train_dataloader, val_dataloader, epochs=epochs, lr=learning_rate):
def train(dataloader):
model.train()
total_acc, total_count = 0, 0
log_interval = 128
start_time = time.time()
for idx, (input_id, mask, label) in enumerate(train_dataloader):
# print(idx)
mask = mask.to(device)
input_id = input_id.to(device)
label = label.type(torch.LongTensor).to(device)
output = model(input_id, mask)
optimizer.zero_grad()
loss = criterion(output, label)
loss.backward()
# torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
optimizer.step()
total_acc += (output.argmax(1) == label).sum().item()
total_count += label.size(0)
if idx % log_interval == 0 and idx > 0:
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches '
'| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
total_acc / total_count))
total_acc, total_count = 0, 0
start_time = time.time()
def evaluate(dataloader):
model.eval()
total_acc, total_count = 0, 0
with torch.no_grad():
for idx, (input_id, mask, label) in enumerate(dataloader):
mask = mask.to(device)
input_id = input_id.to(device)
label = label.to(device)
output = model(input_id, mask)
total_acc += (output.argmax(1) == label).sum().item()
total_count += label.size(0)
return total_acc / total_count
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
device = 'cuda'
model.to(device)
total_accu = None
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train(train_dataloader)
accu_val = evaluate(val_dataloader)
if total_accu is not None and total_accu > accu_val:
scheduler.step()
else:
total_accu = accu_val
print('-' * 59)
print('| end of epoch {:3d} | time: {:5.2f}s | '
'valid accuracy {:8.3f} '.format(epoch,
time.time() - epoch_start_time,
accu_val))
print('-' * 59)
# Data Split And Preparation
X_train, X_test, y_train, y_test = train_test_split(df.text.values.tolist(), df.labels.values.tolist(), test_size=0.2, random_state=2)
train_dataloader = prepare_data(X_train, y_train)
val_dataloader = prepare_data(X_test, y_test)
# Run The Model
model = TransformerModel(num_classes)
run_training(train_dataloader, val_dataloader)
print('finished')
# Save And Load The Model (if needed)
PATH = ".../Torch_BERT_model"
torch.save(model, os.path.join(PATH, "Toy_Data_BERT.pth"))
model = torch.load(os.path.join(PATH, "Toy_Data_BERT.pth"))
print(model)
# Laplace
from laplace import Laplace
la = Laplace(model, 'classification', subset_of_weights='last_layer', hessian_structure='full')
la.fit(train_dataloader)
Error I get:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_7144\3779742208.py in <cell line: 2>()
      1 la = Laplace(model, 'classification', subset_of_weights='last_layer', hessian_structure='full')
----> 2 la.fit(train_dataloader)

~\anaconda3\lib\site-packages\laplace\lllaplace.py in fit(self, train_loader, override)
     98
     99         if self.model.last_layer is None:
--> 100             X, _ = next(iter(train_loader))
    101             with torch.no_grad():
    102                 try:

ValueError: too many values to unpack (expected 2)
Useful link for Laplace implementation with examples:
https://aleximmer.github.io/Laplace/#full-example-optimization-of-the-marginal-likelihood-and-prediction
Code that might help in debugging:
for x in train_dataloader:
print("The length of batch is:", len(x))
print()
print("The batch looks like:", x)
print()
print("The length of the first element in the batch is:") #embedding
print(len(x[0]))
print("The length of the second element in the batch is:") #1 if place is filled with word, 0 if it's empty?
print(len(x[1]))
print("The length of the third element in the batch is:") #category
print(len(x[2]))
print()
print("The lengths of the first tensor and second tensor in the first element in the batch is:")
print(len(x[0][0]), len(x[0][1])) # = max_length (512)
print("The lengths of the first tensor and second tensor in the second element in the batch is:")
print(len(x[1][0]), len(x[1][1])) # = max_length (512)
print()
print()
The laplace library expects the dataloader to return two values (X, y) and the model to require exactly one argument to make its prediction (see the library code). But your model's forward pass requires two arguments, namely input_id and mask, and your dataloader returns three values: input_id, mask, and labels.
There are several ways to work around this limitation (e.g. returning a dict with input_ids and attention_mask). The one that requires the least understanding of the internals of the laplace library is to generate the attention mask at runtime in the forward pass (not great for performance):
class TransformerModel(nn.Module):
def __init__(self, num_classes, pad_id, dropout=0.5):
super(TransformerModel, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(768, num_classes)
self.relu = nn.ReLU()
self.pad_id = pad_id
    def forward(self, input_ids):
        mask = (input_ids != self.pad_id).type(input_ids.dtype)
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=mask, return_dict=False)
dropout_output = self.dropout(pooled_output)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
return final_layer
model = TransformerModel(num_classes, tokenizer.pad_token_id)
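The other half of the constraint still applies: the dataloader passed to la.fit must yield (X, y) pairs. A minimal sketch, reusing the tokenizer output from prepare_data (la_dataset and la_loader are names I am introducing here, not part of the original code):

# only two tensors per item, so laplace's `X, _ = next(iter(train_loader))` unpacks cleanly
texts = tokenizer(X_train, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
la_dataset = TensorDataset(texts['input_ids'], torch.LongTensor(y_train))
la_loader = DataLoader(la_dataset, batch_size=batch_size, shuffle=True)
la.fit(la_loader)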
For some reason my RAM usage is steadily increasing while training my VAE, and I cannot seem to pinpoint why.
I have narrowed the problem down to my save_plots function by using psutil.virtual_memory() to check virtual memory between function calls.
Here is the code for the VAE model and initialization of model and training params:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.utils.data import DataLoader
from utils import modelSummary, train_evaluate, plot_training_results
class Encoder(nn.Module):
def __init__(self, latent_dims) -> None:
super(Encoder, self).__init__()
self.conv1 = nn.Conv2d(1, 64, 3, stride = 2, bias = False)
self.batchnorm1 = nn.BatchNorm2d(64)
self.conv2 = nn.Conv2d(64, 128 , 3, stride = 2, bias = False)
self.batchnorm2 = nn.BatchNorm2d(128)
self.conv3 = nn.Conv2d(128, 128, 3, stride = 2) # (#num samples, 64 , 2 , 2)
self.flatten = nn.Flatten(start_dim = 1) # (#num samples, 256)
self.linear1 = nn.Linear(512, 1024)
self.mu = nn.Linear(1024, latent_dims)
self.sigma = nn.Linear(1024, latent_dims)
self.N = torch.distributions.Normal(0, 1)
self.N.loc = self.N.loc.cuda()
self.N.scale = self.N.scale.cuda()
self.kl = 0
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.batchnorm1(x)
x = F.relu(self.conv2(x))
x = self.batchnorm2(x)
x = self.conv3(x)
x = self.flatten(x)
x = F.relu(self.linear1(x))
mu = self.mu(x)
sigma = torch.exp(self.sigma(x))
z = mu + sigma * self.N.sample(mu.shape)
self.kl = (sigma**2 + mu**2 - torch.log(sigma) - 0.5).sum()
return z
class Decoder(nn.Module):
def __init__(self, latent_dims) -> None:
super(Decoder, self).__init__()
self.linear1 = nn.Linear(latent_dims, 512)
self.deconv1 = nn.ConvTranspose2d(32, 128, 3, stride = 3, padding = 1, output_padding = 2, bias = False)
self.batchnorm1 = nn.BatchNorm2d(128)
self.deconv2 = nn.ConvTranspose2d(128, 64, 3, stride = 2, output_padding = 1, bias = False)
self.batchnorm2 = nn.BatchNorm2d(64)
self.deconv3 = nn.ConvTranspose2d(64, 1, 3)
def forward(self, x):
x = F.relu(self.linear1(x))
x = x.view(-1, 32, 4, 4)
x = F.relu(self.deconv1(x))
x = self.batchnorm1(x)
x = F.relu(self.deconv2(x))
x = self.batchnorm2(x)
x = torch.sigmoid(self.deconv3(x))
return x
class VariationalAutoEncoder(nn.Module):
def __init__(self, latent_dims) -> None:
super(VariationalAutoEncoder, self).__init__()
self.encoder = Encoder(latent_dims)
self.decoder = Decoder(latent_dims)
def forward(self, x):
z = self.encoder(x)
return self.decoder(z)
if __name__ == '__main__':
# Initialize Model
latent_dims = 256
model = VariationalAutoEncoder(latent_dims)
modelSummary(model)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")
training_params = {
'num_epochs': 200,
'batch_size': 512,
'loss_function':F.mse_loss,
'optimizer': torch.optim.Adam(model.parameters(), lr=1e-4),
'save_path': 'training_256',
'sample_size': 10,
'plot_every': 1,
'latent_dims' : latent_dims
}
# Load Data
train_dataset = DataLoader(torchvision.datasets.MNIST(root = './data', train = True, download = True, transform = torchvision.transforms.ToTensor()), batch_size = training_params['batch_size'])
validation_dataset = DataLoader(torchvision.datasets.MNIST(root = './data', train = False, download = True, transform = torchvision.transforms.ToTensor()), batch_size = training_params['batch_size'])
metrics = {
'l1': lambda output, target: (torch.abs(output - target).sum())
}
train_results, evaluation_results = train_evaluate(model, device, train_dataset, validation_dataset, training_params, metrics)
plot_training_results(train_results=train_results, validation_results=evaluation_results, training_params=training_params, metrics=metrics)
Here is my utils.py file containing the training loop and other utility functions
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import time
import gc
import numpy as np
import matplotlib.pyplot as plt
def modelSummary(model, verbose=False):
if verbose:
print(model)
total_parameters = 0
for name, param in model.named_parameters():
        num_params = param.numel()  # count every element of the parameter tensor, not just its first dimension
total_parameters += num_params
if verbose:
print(f"Layer: {name}")
print(f"\tNumber of parameters: {num_params}")
print(f"\tShape: {param.shape}")
if total_parameters > 1e5:
print(f"Total number of parameters: {total_parameters/1e6:.2f}M")
else:
print(f"Total number of parameters: {total_parameters/1e3:.2f}K")
def train_epoch(model: nn.Module, device: torch.device, train_dataloader: DataLoader, training_params: dict, metrics: dict):
"""_summary_
Args:
model (nn.Module): Model to be trained by
device (str): device to be trained on
train_dataloader (nn.data.DataLoader): Dataloader object to load batches of dataset
training_params (dict): Dictionary of training parameters containing "batch_size", "loss_function"
"optimizer".
metrics (dict): Dictionary of functional methods that would compute the metric value
Returns:
run_results (dict): Dictionary of metrics computed for the epoch
"""
OPTIMIZER = training_params["optimizer"]
model = model.to(device)
model.train()
# Dictionary holding result of this epoch
run_results = dict()
for metric in metrics:
run_results[metric] = 0.0
run_results["loss"] = 0.0
# Iterate over batches
num_batches = 0
for x, target in train_dataloader:
num_batches += 1
# Move tensors to device
input = x.to(device)
# Forward pass
output = model(input)
# Compute loss
loss = ((output - input)**2).sum() + model.encoder.kl
# Backward pass
OPTIMIZER.zero_grad()
loss.backward()
OPTIMIZER.step()
# Update metrics
run_results["loss"] += loss.detach().item()
for key, func in metrics.items():
run_results[key] += func(output, input).detach().item()
# Clean up memory
del loss
del input
del output
for key in run_results:
run_results[key] /= num_batches
return run_results
def evaluate_epoch(model: nn.Module, device: torch.device, validation_dataloader: DataLoader, training_params: dict, metrics: dict):
"""_summary_
Args:
model (nn.Module): model to evaluate
device (str): device to evaluate on
validation_dataloader (DataLoader): DataLoader for evaluation
training_params (dict): Dictionary of training parameters containing "batch_size", "loss_function"
"optimizer".
metrics (dict): Dictionary of functional methods that would compute the metric value
Returns:
run_results (dict): Dictionary of metrics computed for the epoch
"""
model = model.to(device)
# Dictionary holding result of this epoch
run_results = dict()
for metric in metrics:
run_results[metric] = 0.0
run_results["loss"] = 0.0
# Iterate over batches
with torch.no_grad():
model.eval()
num_batches = 0
for x, target in validation_dataloader:
num_batches += 1
# Move tensors to device
input = x.to(device)
target = target.to(device)
# Forward pass
output = model(input)
# Compute loss
loss = ((output - input)**2).sum() + model.encoder.kl
# Update metrics
run_results["loss"] += loss.detach().item()
for key, func in metrics.items():
run_results[key] += func(output, input).detach().item()
# Clean up memory
del loss
del input
del output
for key in run_results:
run_results[key] /= num_batches
return run_results
def train_evaluate(model: nn.Module, device: torch.device, train_dataloader: DataLoader, validation_dataloader: DataLoader, training_params: dict, metrics: dict):
"""Function to train a model and provide statistics during training
Args:
model (nn.Module): Model to be trained
device (torch.device): Device to be trained on
train_dataset (DataLoader): Dataset to be trained on
validation_dataset (DataLoader): Dataset to be evaluated on
training_params (dict): Dictionary of training parameters containing "num_epochs", "batch_size", "loss_function",
"save_path", "optimizer"
metrics (dict): Dictionary of functional methods that would compute the metric value
Returns:
        train_results, evaluation_results (dict, dict): per-epoch training and validation metric histories
"""
NUM_EPOCHS = training_params["num_epochs"]
BATCH_SIZE = training_params["batch_size"]
SAVE_PATH = training_params["save_path"]
SAMPLE_SIZE = training_params["sample_size"]
PLOT_EVERY = training_params["plot_every"]
LATENT_DIMS = training_params["latent_dims"]
# Initialize metrics
train_results = dict()
train_results['loss'] = np.empty(1)
evaluation_results = dict()
evaluation_results['loss'] = np.empty(1)
for metric in metrics:
train_results[metric] = np.empty(1)
evaluation_results[metric] = np.empty(1)
batch = next(iter(validation_dataloader))
idxs = []
for i in range(SAMPLE_SIZE):
idx = torch.where(batch[1] == i)[0].squeeze()[0]
idxs.append(idx.item())
FIXED_SAMPLES = batch[0][idxs].to(device).detach()
FIXED_NOISE = torch.normal(0, 1, size = (100, LATENT_DIMS)).to(device).detach()
del idxs
del batch
for epoch in range(NUM_EPOCHS):
start = time.time()
print(f"======== Epoch {epoch+1}/{NUM_EPOCHS} ========")
# Train Model
print("Training ... ")
epoch_train_results = train_epoch(model, device, train_dataloader, training_params, metrics)
# Evaluate Model
print("Evaluating ... ")
epoch_evaluation_results = evaluate_epoch(model, device, validation_dataloader, training_params, metrics)
for metric in metrics:
            # np.append returns a new array, so the result must be assigned back
            train_results[metric] = np.append(train_results[metric], epoch_train_results[metric])
            evaluation_results[metric] = np.append(evaluation_results[metric], epoch_evaluation_results[metric])
# Print results of epoch
print(f"Completed Epoch {epoch+1}/{NUM_EPOCHS} in {(time.time() - start):.2f}s")
print(f"Train Loss: {epoch_train_results['loss']:.2f} \t Validation Loss: {epoch_evaluation_results['loss']:.2f}")
# Plot results
if epoch % PLOT_EVERY == 0:
save_plots(FIXED_SAMPLES, FIXED_NOISE, model, device, epoch, training_params)
print(f"Items cleaned up: {gc.collect()}")
# Save model
SAVE = f"{SAVE_PATH}_epoch{epoch + 1}.pt"
torch.save(model.state_dict(), SAVE)
return train_results, evaluation_results
def save_plots(fixed_samples, fixed_noise, model, device, epoch, training_params):
"""Function to save plots of the model
Args:
fixed_samples (torch.Tensor): Samples to be plotted
fixed_noise (torch.Tensor): Noise to be plotted
model (nn.Module): Model to be tested
epoch (int): Epoch number
SAVE_PATH (str): Path to save plots
"""
SAMPLE_SIZE = training_params["sample_size"]
SAVE_PATH = training_params["save_path"]
with torch.no_grad():
model.eval()
fixed_samples = fixed_samples.to(device)
fixed_noise = fixed_noise.to(device)
outputs = model(fixed_samples)
generated_images = model.decoder(fixed_noise)
fig, ax = plt.subplots(2, SAMPLE_SIZE, figsize=(SAMPLE_SIZE * 5,15))
for i in range(SAMPLE_SIZE):
image = fixed_samples[i].detach().cpu().numpy()
output = outputs[i].detach().cpu().numpy()
ax[0][i].imshow(image.reshape(28,28))
ax[1][i].imshow(output.reshape(28,28))
plt.savefig(f"{SAVE_PATH}/training_images/epoch{epoch + 1}.png")
plt.close()
del fig, ax
del output
del outputs
_, axs = plt.subplots(10, 10, figsize=(30, 20))
axs = axs.flatten()
for image, ax in zip(generated_images, axs):
ax.imshow(image.cpu().numpy().reshape(28, 28))
ax.axis('off')
plt.savefig(f"{SAVE_PATH}/generated_images/epoch{epoch + 1}.png")
plt.close()
# Clean up memory
del generated_images
del image
del _, axs
def plot_training_results(train_results, validation_results, training_params, metrics):
"""Function to plot training results
Args:
train_results (dict): Dictionary of training results
validation_results (dict): Dictionary of validation results
"""
plt.plot(train_results['loss'], label='Training Loss')
plt.plot(validation_results['loss'], label='Validation Loss')
for metric in metrics:
plt.plot(train_results[metric], label=f"Train {metric}")
plt.plot(validation_results[metric], label=f"Validation {metric}")
plt.legend()
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.savefig(f"{training_params['save_path']}_training_results.png")
plt.show()
if __name__ == '__main__':
pass
Am I doing something wrong while detaching? Or is it a problem with the number of figures I am saving?
On another side note: when I train by running python VAE.py in a terminal, I run out of memory due to the steady increase mentioned above; however, if I run it in VS Code, it seems to clean up my memory as it nears the maximum. Is there any documentation of this behavior, or am I mistaken?
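One thing worth ruling out (a hedged suggestion, not a confirmed diagnosis): under an interactive matplotlib backend, figures can accumulate in the global figure manager, and a bare plt.close() only closes the "current" figure. Forcing the non-interactive Agg backend and closing each figure object explicitly makes the figure lifetime unambiguous; save_grid below is a hypothetical helper, not part of the original utils.py:

import matplotlib
matplotlib.use('Agg')  # non-interactive backend; no GUI figure manager holds references
import matplotlib.pyplot as plt

def save_grid(images, path, n_cols=10):
    # create, save, and close exactly one figure per call so nothing lingers between epochs
    fig, axs = plt.subplots(max(len(images) // n_cols, 1), n_cols, figsize=(30, 20))
    for image, ax in zip(images, axs.flatten()):
        ax.imshow(image.reshape(28, 28))
        ax.axis('off')
    fig.savefig(path)
    plt.close(fig)  # close this specific figure, not whatever happens to be current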
I am trying to train a Faster R-CNN model for bounding-box detection on a custom COCO-like dataset. I am using a GPU, yet even though I use .to(device) to push tensors onto the GPU, I keep getting the following error:
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1845 if has_torch_function_variadic(input, weight):
1846 return handle_torch_function(linear, (input, weight), input, weight, bias=bias)
-> 1847 return torch._C._nn.linear(input, weight, bias)
1848
1849
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking arugment for argument mat1 in method wrapper_addmm)
Training snippet:
# Initialize Dataset
train_dataset = TrainDataset('coco_train.json')
def collate_fn(batch):
return tuple(zip(*batch))
train_data_loader = DataLoader(
train_dataset,
batch_size=2,
shuffle=True,
num_workers=2,
collate_fn=collate_fn
)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
model.to(device) # EDIT
num_classes = 3 # eyelids, iris + background
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
num_epochs = 40
itr = 1
for epoch in range(num_epochs):
for images, targets in train_data_loader:
images = list(image.to(device) for image in images)
targets = [ { k: v.to(device) for k, v in t.items() } for t in targets ]
loss_dict = model(images, targets)
losses = sum(loss for loss in loss_dict.values())
loss_value = losses.item()
optimizer.zero_grad()
losses.backward()
optimizer.step()
if itr % 50 == 0:
print(f"Iteration #{itr} loss: {loss_value}")
itr += 1
lr_scheduler.step()
My Dataset __getitem__() snippet (truncated):
def __getitem__(self, index : int):
...
target = {}
target["boxes"] = torch.as_tensor(boxes, dtype=torch.float32)
target["labels"] = torch.as_tensor(labels, dtype=torch.int64)
target["image_id"] = torch.as_tensor([ int(image_id) ], dtype=torch.int64)
target["area"] = torch.as_tensor(area, dtype=torch.float32)
target["iscrowd"] = torch.as_tensor(iscrowd, dtype=torch.int64)
image = torchvision.transforms.ToTensor()(image)
return image, target
You should load your model on the GPU device as well:
model.to(device)
Note that torch.nn.Module.to is an in-place operation (unlike torch.Tensor.to, which returns a new tensor).
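Given the # EDIT marker in the snippet, one more ordering detail is likely the culprit: model.to(device) runs before model.roi_heads.box_predictor is replaced, so the freshly constructed FastRCNNPredictor still has its parameters on the CPU, which matches the mat1 device mismatch in the linear layer. Move the model after swapping the head (a reordering of the original lines):

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)  # move everything, including the new head, once the model is final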
This is pytorch-yolo v3 code, which I downloaded from GitHub (https://github.com/eriklindernoren/PyTorch-YOLOv3).
I tuned it for two classes, and while training, there is still an error.
This is the test.py code.
from __future__ import division
from models import *
from utils.utils import *
from utils.datasets import *
from utils.parse_config import *
import os
import sys
import time
import datetime
import argparse
import tqdm
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torch.autograd import Variable
import torch.optim as optim
def evaluate(model, path, iou_thres, conf_thres, nms_thres, img_size, batch_size):
model.eval()
# Get dataloader
dataset = ListDataset(path, img_size=img_size, augment=False, multiscale=False)
dataloader = torch.utils.data.DataLoader(
dataset, batch_size=batch_size, shuffle=False, num_workers=1, collate_fn=dataset.collate_fn
)
Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
labels = []
sample_metrics = [] # List of tuples (TP, confs, pred)
for batch_i, (_, imgs, targets) in enumerate(tqdm.tqdm(dataloader, desc="Detecting objects")):
# Extract labels
labels += targets[:, 1].tolist()
# Rescale target
targets[:, 2:] = xywh2xyxy(targets[:, 2:])
targets[:, 2:] *= img_size
imgs = Variable(imgs.type(Tensor), requires_grad=False)
with torch.no_grad():
outputs = model(imgs)
outputs = non_max_suppression(outputs, conf_thres=conf_thres, nms_thres=nms_thres)
sample_metrics += get_batch_statistics(outputs, targets, iou_threshold=iou_thres)
# Concatenate sample statistics
true_positives, pred_scores, pred_labels = [np.concatenate(x, 0) for x in list(zip(*sample_metrics))]
precision, recall, AP, f1, ap_class = ap_per_class(true_positives, pred_scores, pred_labels, labels)
return precision, recall, AP, f1, ap_class
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=8, help="size of each image batch")
parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file")
parser.add_argument("--data_config", type=str, default="config/coco.data", help="path to data config file")
parser.add_argument("--weights_path", type=str, default="weights/yolov3.weights", help="path to weights file")
parser.add_argument("--class_path", type=str, default="data/coco.names", help="path to class label file")
parser.add_argument("--iou_thres", type=float, default=0.5, help="iou threshold required to qualify as detected")
parser.add_argument("--conf_thres", type=float, default=0.001, help="object confidence threshold")
parser.add_argument("--nms_thres", type=float, default=0.5, help="iou thresshold for non-maximum suppression")
parser.add_argument("--n_cpu", type=int, default=8, help="number of cpu threads to use during batch generation")
parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
opt = parser.parse_args()
print(opt)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_config = parse_data_config(opt.data_config)
valid_path = data_config["valid"]
class_names = load_classes(data_config["names"])
# Initiate model
model = Darknet(opt.model_def).to(device)
if opt.weights_path.endswith(".weights"):
# Load darknet weights
model.load_darknet_weights(opt.weights_path)
else:
# Load checkpoint weights
model.load_state_dict(torch.load(opt.weights_path))
print("Compute mAP...")
precision, recall, AP, f1, ap_class = evaluate(
model,
path=valid_path,
iou_thres=opt.iou_thres,
conf_thres=opt.conf_thres,
nms_thres=opt.nms_thres,
img_size=opt.img_size,
batch_size=8,
)
print("Average Precisions:")
for i, c in enumerate(ap_class):
print(f"+ Class '{c}' ({class_names[c]}) - AP: {AP[i]}")
print(f"mAP: {AP.mean()}")
And this is the trainplate.py code (originally train.py, but I renamed it).
from models import *
from utils.logger import *
from utils.utils import *
from utils.datasets import *
from utils.parse_config import *
from test import evaluate
from terminaltables import AsciiTable
import os
import sys
import time
import datetime
import argparse
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torch.autograd import Variable
import torch.optim as optim
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=3, help="number of epochs")
parser.add_argument("--batch_size", type=int, default=8, help="size of each image batch")
parser.add_argument("--gradient_accumulations", type=int, default=2, help="number of gradient accums before step")
parser.add_argument("--model_def", type=str, default="config/yolov3plate.cfg", help="path to model definition file")
parser.add_argument("--data_config", type=str, default="config/plate.data", help="path to data config file")
parser.add_argument("--pretrained_weights", type=str, help="if specified starts from checkpoint model")
parser.add_argument("--n_cpu", type=int, default=8, help="number of cpu threads to use during batch generation")
parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
parser.add_argument("--checkpoint_interval", type=int, default=1, help="interval between saving model weights")
parser.add_argument("--evaluation_interval", type=int, default=1, help="interval evaluations on validation set")
parser.add_argument("--compute_map", default=False, help="if True computes mAP every tenth batch")
parser.add_argument("--multiscale_training", default=True, help="allow for multi-scale training")
opt = parser.parse_args()
print(opt)
logger = Logger("logs")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.makedirs("output", exist_ok=True)
os.makedirs("checkpoints", exist_ok=True)
# Get data configuration
data_config = parse_data_config(opt.data_config)
train_path = data_config["train"]
valid_path = data_config["valid"]
class_names = load_classes(data_config["names"])
# Initiate model
model = Darknet(opt.model_def).to(device)
model.apply(weights_init_normal)
# If specified we start from checkpoint
if opt.pretrained_weights:
if opt.pretrained_weights.endswith(".pth"):
model.load_state_dict(torch.load(opt.pretrained_weights))
else:
model.load_darknet_weights(opt.pretrained_weights)
# Get dataloader
dataset = ListDataset(train_path, augment=True, multiscale=opt.multiscale_training)
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=opt.batch_size,
shuffle=True,
num_workers=opt.n_cpu,
pin_memory=True,
collate_fn=dataset.collate_fn,
)
optimizer = torch.optim.Adam(model.parameters())
metrics = [
"grid_size",
"loss",
"x",
"y",
"w",
"h",
"conf",
"cls",
"cls_acc",
"recall50",
"recall75",
"precision",
"conf_obj",
"conf_noobj",
]
for epoch in range(opt.epochs):
model.train()
start_time = time.time()
for batch_i, (_, imgs, targets) in enumerate(dataloader):
batches_done = len(dataloader) * epoch + batch_i
imgs = Variable(imgs.to(device))
targets = Variable(targets.to(device), requires_grad=False)
loss, outputs = model(imgs, targets)
loss.backward()
if batches_done % opt.gradient_accumulations:
# Accumulates gradient before each step
optimizer.step()
optimizer.zero_grad()
# ----------------
# Log progress
# ----------------
log_str = "\n---- [Epoch %d/%d, Batch %d/%d] ----\n" % (epoch, opt.epochs, batch_i, len(dataloader))
metric_table = [["Metrics", *[f"YOLO Layer {i}" for i in range(len(model.yolo_layers))]]]
# Log metrics at each YOLO layer
for i, metric in enumerate(metrics):
formats = {m: "%.6f" for m in metrics}
formats["grid_size"] = "%2d"
formats["cls_acc"] = "%.2f%%"
row_metrics = [formats[metric] % yolo.metrics.get(metric, 0) for yolo in model.yolo_layers]
metric_table += [[metric, *row_metrics]]
# Tensorboard logging
tensorboard_log = []
for j, yolo in enumerate(model.yolo_layers):
for name, metric in yolo.metrics.items():
if name != "grid_size":
tensorboard_log += [(f"{name}_{j+1}", metric)]
tensorboard_log += [("loss", loss.item())]
logger.list_of_scalars_summary(tensorboard_log, batches_done)
log_str += AsciiTable(metric_table).table
log_str += f"\nTotal loss {loss.item()}"
# Determine approximate time left for epoch
epoch_batches_left = len(dataloader) - (batch_i + 1)
time_left = datetime.timedelta(seconds=epoch_batches_left * (time.time() - start_time) / (batch_i + 1))
log_str += f"\n---- ETA {time_left}"
print(log_str)
model.seen += imgs.size(0)
if epoch % opt.evaluation_interval == 0:
print("\n---- Evaluating Model ----")
# Evaluate the model on the validation set
precision, recall, AP, f1, ap_class = evaluate(
model,
path=valid_path,
iou_thres=0.5,
conf_thres=0.5,
nms_thres=0.5,
img_size=opt.img_size,
batch_size=8,
)
evaluation_metrics = [
("val_precision", precision.mean()),
("val_recall", recall.mean()),
("val_mAP", AP.mean()),
("val_f1", f1.mean()),
]
logger.list_of_scalars_summary(evaluation_metrics, epoch)
# Print class APs and mAP
ap_table = [["Index", "Class name", "AP"]]
for i, c in enumerate(ap_class):
ap_table += [[c, class_names[c], "%.5f" % AP[i]]]
print(AsciiTable(ap_table).table)
print(f"---- mAP {AP.mean()}")
if epoch % opt.checkpoint_interval == 0:
torch.save(model.state_dict(), f"checkpoints/yolov3_ckpt_%d.pth" % epoch)
Whenever I run the trainplate.py code, I get the following ValueError. What should I do?
---- Evaluating Model ----
Detecting objects: 0it [00:00, ?it/s]
Traceback (most recent call last):
File "C:/Users/jr291/Desktop/연구/PyTorch-YOLOv3/trainplate.py", line 160, in <module>
batch_size=8,
File "C:\Users\jr291\Desktop\연구\PyTorch-YOLOv3\test.py", line 53, in evaluate
true_positives, pred_scores, pred_labels = [np.concatenate(x, 0) for x in list(zip(*sample_metrics))]
ValueError: not enough values to unpack (expected 3, got 0)
Also, the get_batch_statistics function is shown below.
def get_batch_statistics(outputs, targets, iou_threshold):
""" Compute true positives, predicted scores and predicted labels per sample """
batch_metrics = []
for sample_i in range(len(outputs)):
if outputs[sample_i] is None:
continue
output = outputs[sample_i]
pred_boxes = output[:, :4]
pred_scores = output[:, 4]
pred_labels = output[:, -1]
true_positives = np.zeros(pred_boxes.shape[0])
annotations = targets[targets[:, 0] == sample_i][:, 1:]
target_labels = annotations[:, 0] if len(annotations) else []
if len(annotations):
detected_boxes = []
target_boxes = annotations[:, 1:]
for pred_i, (pred_box, pred_label) in enumerate(zip(pred_boxes, pred_labels)):
# If targets are found break
if len(detected_boxes) == len(annotations):
break
# Ignore if label is not one of the target labels
if pred_label not in target_labels:
continue
iou, box_index = bbox_iou(pred_box.unsqueeze(0), target_boxes).max(0)
if iou >= iou_threshold and box_index not in detected_boxes:
true_positives[pred_i] = 1
detected_boxes += [box_index]
batch_metrics.append([true_positives, pred_scores, pred_labels])
return batch_metrics
It seems that this list comprehension: [np.concatenate(x, 0) for x in list(zip(*sample_metrics))] is empty. It is hard to say for certain, since I don't know what sample_metrics looks like; I don't see the definition of get_batch_statistics behind this statement: sample_metrics += get_batch_statistics(outputs, targets, iou_threshold=iou_thres).
But this might help.
A statement like this:
values = [2, 3, 4]
a, b, c = values
means the same as this:
values = [2, 3, 4]
a = values[0]
b = values[1]
c = values[2]
but if your list is values = [1, 2] and you try to unpack it with a, b, c = values, you get an error similar to yours.
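If that is the case here (sample_metrics stays empty because non_max_suppression returns None for every image at conf_thres=0.5), a small guard in evaluate avoids the crash; the early return is my assumption about what the caller should tolerate, not part of the original repo:

# in evaluate(), before unpacking the per-batch statistics
if len(sample_metrics) == 0:
    print("No detections above the confidence threshold; skipping mAP for this evaluation.")
    return None  # trainplate.py would then need to skip logging for this epoch
true_positives, pred_scores, pred_labels = [np.concatenate(x, 0) for x in zip(*sample_metrics)]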