These are pytorch-yolo v3 code. I downloaded it in github. (https://github.com/eriklindernoren/PyTorch-YOLOv3)
I tuned this for two classes. And while I'm doing trainning, there is still an error.
This is test.py code.
from __future__ import division
from models import *
from utils.utils import *
from utils.datasets import *
from utils.parse_config import *
import os
import sys
import time
import datetime
import argparse
import tqdm
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torch.autograd import Variable
import torch.optim as optim
def evaluate(model, path, iou_thres, conf_thres, nms_thres, img_size, batch_size):
model.eval()
# Get dataloader
dataset = ListDataset(path, img_size=img_size, augment=False, multiscale=False)
dataloader = torch.utils.data.DataLoader(
dataset, batch_size=batch_size, shuffle=False, num_workers=1, collate_fn=dataset.collate_fn
)
Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
labels = []
sample_metrics = [] # List of tuples (TP, confs, pred)
for batch_i, (_, imgs, targets) in enumerate(tqdm.tqdm(dataloader, desc="Detecting objects")):
# Extract labels
labels += targets[:, 1].tolist()
# Rescale target
targets[:, 2:] = xywh2xyxy(targets[:, 2:])
targets[:, 2:] *= img_size
imgs = Variable(imgs.type(Tensor), requires_grad=False)
with torch.no_grad():
outputs = model(imgs)
outputs = non_max_suppression(outputs, conf_thres=conf_thres, nms_thres=nms_thres)
sample_metrics += get_batch_statistics(outputs, targets, iou_threshold=iou_thres)
# Concatenate sample statistics
true_positives, pred_scores, pred_labels = [np.concatenate(x, 0) for x in list(zip(*sample_metrics))]
precision, recall, AP, f1, ap_class = ap_per_class(true_positives, pred_scores, pred_labels, labels)
return precision, recall, AP, f1, ap_class
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=8, help="size of each image batch")
parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file")
parser.add_argument("--data_config", type=str, default="config/coco.data", help="path to data config file")
parser.add_argument("--weights_path", type=str, default="weights/yolov3.weights", help="path to weights file")
parser.add_argument("--class_path", type=str, default="data/coco.names", help="path to class label file")
parser.add_argument("--iou_thres", type=float, default=0.5, help="iou threshold required to qualify as detected")
parser.add_argument("--conf_thres", type=float, default=0.001, help="object confidence threshold")
parser.add_argument("--nms_thres", type=float, default=0.5, help="iou thresshold for non-maximum suppression")
parser.add_argument("--n_cpu", type=int, default=8, help="number of cpu threads to use during batch generation")
parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
opt = parser.parse_args()
print(opt)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_config = parse_data_config(opt.data_config)
valid_path = data_config["valid"]
class_names = load_classes(data_config["names"])
# Initiate model
model = Darknet(opt.model_def).to(device)
if opt.weights_path.endswith(".weights"):
# Load darknet weights
model.load_darknet_weights(opt.weights_path)
else:
# Load checkpoint weights
model.load_state_dict(torch.load(opt.weights_path))
print("Compute mAP...")
precision, recall, AP, f1, ap_class = evaluate(
model,
path=valid_path,
iou_thres=opt.iou_thres,
conf_thres=opt.conf_thres,
nms_thres=opt.nms_thres,
img_size=opt.img_size,
batch_size=8,
)
print("Average Precisions:")
for i, c in enumerate(ap_class):
print(f"+ Class '{c}' ({class_names[c]}) - AP: {AP[i]}")
print(f"mAP: {AP.mean()}")
And, this is trainplate.py code. (Originally, this is train.py. But, I renamed it.)
from models import *
from utils.logger import *
from utils.utils import *
from utils.datasets import *
from utils.parse_config import *
from test import evaluate
from terminaltables import AsciiTable
import os
import sys
import time
import datetime
import argparse
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torch.autograd import Variable
import torch.optim as optim
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=3, help="number of epochs")
parser.add_argument("--batch_size", type=int, default=8, help="size of each image batch")
parser.add_argument("--gradient_accumulations", type=int, default=2, help="number of gradient accums before step")
parser.add_argument("--model_def", type=str, default="config/yolov3plate.cfg", help="path to model definition file")
parser.add_argument("--data_config", type=str, default="config/plate.data", help="path to data config file")
parser.add_argument("--pretrained_weights", type=str, help="if specified starts from checkpoint model")
parser.add_argument("--n_cpu", type=int, default=8, help="number of cpu threads to use during batch generation")
parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
parser.add_argument("--checkpoint_interval", type=int, default=1, help="interval between saving model weights")
parser.add_argument("--evaluation_interval", type=int, default=1, help="interval evaluations on validation set")
parser.add_argument("--compute_map", default=False, help="if True computes mAP every tenth batch")
parser.add_argument("--multiscale_training", default=True, help="allow for multi-scale training")
opt = parser.parse_args()
print(opt)
logger = Logger("logs")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.makedirs("output", exist_ok=True)
os.makedirs("checkpoints", exist_ok=True)
# Get data configuration
data_config = parse_data_config(opt.data_config)
train_path = data_config["train"]
valid_path = data_config["valid"]
class_names = load_classes(data_config["names"])
# Initiate model
model = Darknet(opt.model_def).to(device)
model.apply(weights_init_normal)
# If specified we start from checkpoint
if opt.pretrained_weights:
if opt.pretrained_weights.endswith(".pth"):
model.load_state_dict(torch.load(opt.pretrained_weights))
else:
model.load_darknet_weights(opt.pretrained_weights)
# Get dataloader
dataset = ListDataset(train_path, augment=True, multiscale=opt.multiscale_training)
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=opt.batch_size,
shuffle=True,
num_workers=opt.n_cpu,
pin_memory=True,
collate_fn=dataset.collate_fn,
)
optimizer = torch.optim.Adam(model.parameters())
metrics = [
"grid_size",
"loss",
"x",
"y",
"w",
"h",
"conf",
"cls",
"cls_acc",
"recall50",
"recall75",
"precision",
"conf_obj",
"conf_noobj",
]
for epoch in range(opt.epochs):
model.train()
start_time = time.time()
for batch_i, (_, imgs, targets) in enumerate(dataloader):
batches_done = len(dataloader) * epoch + batch_i
imgs = Variable(imgs.to(device))
targets = Variable(targets.to(device), requires_grad=False)
loss, outputs = model(imgs, targets)
loss.backward()
if batches_done % opt.gradient_accumulations:
# Accumulates gradient before each step
optimizer.step()
optimizer.zero_grad()
# ----------------
# Log progress
# ----------------
log_str = "\n---- [Epoch %d/%d, Batch %d/%d] ----\n" % (epoch, opt.epochs, batch_i, len(dataloader))
metric_table = [["Metrics", *[f"YOLO Layer {i}" for i in range(len(model.yolo_layers))]]]
# Log metrics at each YOLO layer
for i, metric in enumerate(metrics):
formats = {m: "%.6f" for m in metrics}
formats["grid_size"] = "%2d"
formats["cls_acc"] = "%.2f%%"
row_metrics = [formats[metric] % yolo.metrics.get(metric, 0) for yolo in model.yolo_layers]
metric_table += [[metric, *row_metrics]]
# Tensorboard logging
tensorboard_log = []
for j, yolo in enumerate(model.yolo_layers):
for name, metric in yolo.metrics.items():
if name != "grid_size":
tensorboard_log += [(f"{name}_{j+1}", metric)]
tensorboard_log += [("loss", loss.item())]
logger.list_of_scalars_summary(tensorboard_log, batches_done)
log_str += AsciiTable(metric_table).table
log_str += f"\nTotal loss {loss.item()}"
# Determine approximate time left for epoch
epoch_batches_left = len(dataloader) - (batch_i + 1)
time_left = datetime.timedelta(seconds=epoch_batches_left * (time.time() - start_time) / (batch_i + 1))
log_str += f"\n---- ETA {time_left}"
print(log_str)
model.seen += imgs.size(0)
if epoch % opt.evaluation_interval == 0:
print("\n---- Evaluating Model ----")
# Evaluate the model on the validation set
precision, recall, AP, f1, ap_class = evaluate(
model,
path=valid_path,
iou_thres=0.5,
conf_thres=0.5,
nms_thres=0.5,
img_size=opt.img_size,
batch_size=8,
)
evaluation_metrics = [
("val_precision", precision.mean()),
("val_recall", recall.mean()),
("val_mAP", AP.mean()),
("val_f1", f1.mean()),
]
logger.list_of_scalars_summary(evaluation_metrics, epoch)
# Print class APs and mAP
ap_table = [["Index", "Class name", "AP"]]
for i, c in enumerate(ap_class):
ap_table += [[c, class_names[c], "%.5f" % AP[i]]]
print(AsciiTable(ap_table).table)
print(f"---- mAP {AP.mean()}")
if epoch % opt.checkpoint_interval == 0:
torch.save(model.state_dict(), f"checkpoints/yolov3_ckpt_%d.pth" % epoch)
Whenever I run the trainplate.py code, I get the following ValueErrorr: What should I do?
---- Evaluating Model ----
Detecting objects: 0it [00:00, ?it/s]
Traceback (most recent call last):
File "C:/Users/jr291/Desktop/연구/PyTorch-YOLOv3/trainplate.py", line 160, in <module>
batch_size=8,
File "C:\Users\jr291\Desktop\연구\PyTorch-YOLOv3\test.py", line 53, in evaluate
true_positives, pred_scores, pred_labels = [np.concatenate(x, 0) for x in list(zip(*sample_metrics))]
ValueError: not enough values to unpack (expected 3, got 0)
Also, get_batch_statistics function is like below.
def get_batch_statistics(outputs, targets, iou_threshold):
""" Compute true positives, predicted scores and predicted labels per sample """
batch_metrics = []
for sample_i in range(len(outputs)):
if outputs[sample_i] is None:
continue
output = outputs[sample_i]
pred_boxes = output[:, :4]
pred_scores = output[:, 4]
pred_labels = output[:, -1]
true_positives = np.zeros(pred_boxes.shape[0])
annotations = targets[targets[:, 0] == sample_i][:, 1:]
target_labels = annotations[:, 0] if len(annotations) else []
if len(annotations):
detected_boxes = []
target_boxes = annotations[:, 1:]
for pred_i, (pred_box, pred_label) in enumerate(zip(pred_boxes, pred_labels)):
# If targets are found break
if len(detected_boxes) == len(annotations):
break
# Ignore if label is not one of the target labels
if pred_label not in target_labels:
continue
iou, box_index = bbox_iou(pred_box.unsqueeze(0), target_boxes).max(0)
if iou >= iou_threshold and box_index not in detected_boxes:
true_positives[pred_i] = 1
detected_boxes += [box_index]
batch_metrics.append([true_positives, pred_scores, pred_labels])
return batch_metrics
It seems that this list of comprehension: [np.concatenate(x, 0) for x in list(zip(*sample_metrics))] is empty. It is hard to say since I don't know how sample_metrics looks like, because I don't see definition of get_batch_statistics in this sentence: sample_metrics += get_batch_statistics(outputs, targets, iou_threshold=iou_thres).
But this might helps.
A statement like this:
list = [2, 3, 4]
a, b, c = list
means same what this:
list = [2, 3, 4]
a = list[0]
b = list[1]
c = list[2]
but if your list is list = [1, 2] and you try to unpack it: a, b, c = list, then you get a similar error to yours.
Related
I want to train a classifier on ImageNet dataset (1000 classes) and I need each batch to contain 64 images from the same class and consecutive batches from different classes. So far based on #shai's suggestion and this post I have
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import numpy as np
import random
import argparse
import torch
import os
class DS(Dataset):
def __init__(self, data, num_classes):
super(DS, self).__init__()
self.data = data
self.indices = [[] for _ in range(num_classes)]
for i, (data, class_label) in enumerate(data):
# create a list of lists, where every sublist containts the indices of
# the samples that belong to the class_label
self.indices[class_label].append(i)
def classes(self):
return self.indices
def __getitem__(self, index):
return self.data[index]
class BatchSampler:
def __init__(self, classes, batch_size):
# classes is a list of lists where each sublist refers to a class and contains
# the sample ids that belond to this class
self.classes = classes
self.n_batches = sum([len(x) for x in classes]) // batch_size
self.min_class_size = min([len(x) for x in classes])
self.batch_size = batch_size
self.class_range = list(range(len(self.classes)))
random.shuffle(self.class_range)
assert batch_size < self.min_class_size, 'batch_size should be at least {}'.format(self.min_class_size)
def __iter__(self):
batches = []
for j in range(self.n_batches):
if j < len(self.class_range):
batch_class = self.class_range[j]
else:
batch_class = random.choice(self.class_range)
batches.append(np.random.choice(self.classes[batch_class], self.batch_size))
return iter(batches)
def main():
# Code about
_train_dataset = DS(train_dataset, train_dataset.num_classes)
_batch_sampler = BatchSampler(_train_dataset.classes(), batch_size=args.batch_size)
_train_loader = DataLoader(dataset=_train_dataset, batch_sampler=_batch_sampler)
labels = []
for i, (inputs, _labels) in enumerate(_train_loader):
labels.append(torch.unique(_labels).item())
print("Unique labels: {}".format(torch.unique(_labels).item()))
labels = set(labels)
print('Length of traversed unique labels: {}'.format(len(labels)))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', nargs='?', default='imagenet',
help='path to dataset (default: imagenet)')
parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark")
parser.add_argument('-b', '--batch-size', default=64, type=int,
metavar='N',
help='mini-batch size (default: 256), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
args = parser.parse_args()
if args.dummy:
print("=> Dummy data is used!")
num_classes = 100
train_dataset = datasets.FakeData(size=12811, image_size=(3, 224, 224),
num_classes=num_classes, transform=transforms.ToTensor())
val_dataset = datasets.FakeData(5000, (3, 224, 224), num_classes, transforms.ToTensor())
else:
traindir = os.path.join(args.data, 'train')
valdir = os.path.join(args.data, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
train_dataset = datasets.ImageFolder(
traindir,
transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
]))
val_dataset = datasets.ImageFolder(
valdir,
transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
]))
# Samplers are initialized to None and train_sampler will be replaced
train_sampler, val_sampler = None, None
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
num_workers=args.workers, pin_memory=True, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(
val_dataset, batch_size=args.batch_size, shuffle=False,
num_workers=args.workers, pin_memory=True, sampler=val_sampler)
main()
which prints: Length of traversed unique labels: 100.
However, creating self.indices in the for loop takes a lot of time. Is there a more efficient way to construct this sampler?
EDIT: yield implementation
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import numpy as np
import random
import argparse
import torch
import os
from tqdm import tqdm
import os.path
class DS(Dataset):
def __init__(self, data, num_classes):
super(DS, self).__init__()
self.data = data
self.data_len = len(data)
indices = [[] for _ in range(num_classes)]
for i, (_, class_label) in tqdm(enumerate(data), total=len(data), miniters=1,
desc='Building class indices dataset..'):
indices[class_label].append(i)
self.indices = indices
def per_class_sample_indices(self):
return self.indices
def __getitem__(self, index):
return self.data[index]
def __len__(self):
return self.data_len
class BatchSampler:
def __init__(self, per_class_sample_indices, batch_size):
# classes is a list of lists where each sublist refers to a class and contains
# the sample ids that belond to this class
self.per_class_sample_indices = per_class_sample_indices
self.n_batches = sum([len(x) for x in per_class_sample_indices]) // batch_size
self.min_class_size = min([len(x) for x in per_class_sample_indices])
self.batch_size = batch_size
self.class_range = list(range(len(self.per_class_sample_indices)))
random.shuffle(self.class_range)
def __iter__(self):
for j in range(self.n_batches):
if j < len(self.class_range):
batch_class = self.class_range[j]
else:
batch_class = random.choice(self.class_range)
if self.batch_size <= len(self.per_class_sample_indices[batch_class]):
batch = np.random.choice(self.per_class_sample_indices[batch_class], self.batch_size)
# batches.append(np.random.choice(self.per_class_sample_indices[batch_class], self.batch_size))
else:
batch = self.per_class_sample_indices[batch_class]
yield batch
def n_batches(self):
return self.n_batches
def main():
file_path = 'a_file_path'
file_name = 'per_class_sample_indices.pt'
if not os.path.exists(os.path.join(file_path, file_name)):
print('File: {} does not exists. Create it.'.format(file_name))
per_class_sample_indices = DS(train_dataset, num_classes).per_class_sample_indices()
torch.save(per_class_sample_indices, os.path.join(file_path, file_name))
else:
per_class_sample_indices = torch.load(os.path.join(file_path, file_name))
print('File: {} exists. Do not create it.'.format(file_name))
batch_sampler = BatchSampler(per_class_sample_indices,
batch_size=args.batch_size)
train_loader = torch.utils.data.DataLoader(
train_dataset,
# batch_size=args.batch_size,
# shuffle=(train_sampler is None),
num_workers=args.workers,
pin_memory=True,
# sampler=train_sampler,
batch_sampler=batch_sampler
)
# We do not use sampler for the validation
# val_loader = torch.utils.data.DataLoader(
# val_dataset, batch_size=args.batch_size, shuffle=False,
# num_workers=args.workers, pin_memory=True, sampler=None)
labels = []
for i, (inputs, _labels) in enumerate(train_loader):
labels.append(torch.unique(_labels).item())
print("Unique labels: {}".format(torch.unique(_labels).item()))
labels = set(labels)
print('Length of traversed unique labels: {}'.format(len(labels)))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', nargs='?', default='imagenet',
help='path to dataset (default: imagenet)')
parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark")
parser.add_argument('-b', '--batch-size', default=64, type=int,
metavar='N',
help='mini-batch size (default: 256), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
args = parser.parse_args()
if args.dummy:
print("=> Dummy data is used!")
num_classes = 100
train_dataset = datasets.FakeData(size=12811, image_size=(3, 224, 224),
num_classes=num_classes, transform=transforms.ToTensor())
val_dataset = datasets.FakeData(5000, (3, 224, 224), num_classes, transforms.ToTensor())
else:
traindir = os.path.join(args.data, 'train')
valdir = os.path.join(args.data, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
train_dataset = datasets.ImageFolder(
traindir,
transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
]))
val_dataset = datasets.ImageFolder(
valdir,
transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
]))
num_classes = len(train_dataset.classes)
main()
A similar post but in TensorFlow can be found here
Your code seems fine. The issue here is not the sampler but the preprocessing step you are required to perform in order to sort out the instance indices by their class. Since this is always the same sort, I recommend you cache this information (the data contained inside of self.indices) on your file system such that you avoid having to reconstruct it on every dataset load. You can do so using either numpy.save or torch.save.
You should write your own batch_sampler class for the DataLoader.
I am trying to save images that I configure during training to the output bucket in sagemaker. I've read that all the information that needs to be saved during training goes into the model.tar.gz file. I've tried saving plots using the model_dir and the output_data_dir to no avail. The model itself is saved properly, but the additional information is not being stored with it. I want to reload this additional information (the saved images) during inference but have heard that storing all the information in the model.tar.gz can cause slow inference. I would love some help.
Here is my estimator
from sagemaker.pytorch import PyTorch
estimator = PyTorch(entry_point='XXXXXXXX/AWS/mnist.py',
role=role,
py_version='py3',
framework_version='1.8.0',
instance_count=1,
instance_type='ml.c5.xlarge',
output_path='s3://XXXXX-bucket/',
)
and the code in mnist.py:
import os
import pandas as pd
import torch
import matplotlib.pyplot as plt
import argparse
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torchvision.io import read_image
from torch import nn
import matplotlib.pyplot as plt
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10),
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
def train_loop(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
# Compute prediction and loss
pred = model(X.to(device))
loss = loss_fn(pred, y.to(device))
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn):
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
pred = model(X.to(device))
test_loss += loss_fn(pred, y.to(device)).item()
correct += (pred.argmax(1) == y.to(device)).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
# Initialize the loss function
if __name__=='__main__':
# default to the value in environment variable `SM_MODEL_DIR`. Using args makes the script more portable.
parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
args, _ = parser.parse_known_args()
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor()
)
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor()
)
labels_map = {
0: "T-Shirt",
1: "Trouser",
2: "Pullover",
3: "Dress",
4: "Coat",
5: "Sandal",
6: "Shirt",
7: "Sneaker",
8: "Bag",
9: "Ankle Boot",
}
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
sample_idx = torch.randint(len(training_data), size=(1,)).item()
img, label = training_data[sample_idx]
figure.add_subplot(rows, cols, i)
plt.title(labels_map[label])
plt.axis("off")
plt.imsave(args.output_data_dir+'plot'+str(i)+'.jpg', img.squeeze(), cmap="gray")
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
# Display image and label.
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
img = train_features[0].squeeze()
label = train_labels[0]
plt.imsave(args.output_data_dir+'sample.jpg', img, cmap="gray")
print("Saved img.")
print(f"Label: {label}")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
model = NeuralNetwork().to(device)
print(model)
learning_rate = 1e-3
batch_size = 64
epochs = 5
# ... train `model`, then save it to `model_dir`
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
epochs = 1
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train_loop(train_dataloader, model, loss_fn, optimizer)
test_loop(test_dataloader, model, loss_fn)
print("Done!")
with open(os.path.join(args.model_dir, 'model.pth'), 'wb') as f:
torch.save(model.state_dict(), f)
plt.plot([1,2,3,4])
plt.ylabel('some numbers')
plt.show()
plt.savefig('test.jpeg')
I suspect there is an issue with string concatenation in plt.imsave because the environment variable SM_OUTPUT_DATA_DIR by default points to /opt/ml/output/data (that's the actual value of args.output_data_dir, since you don't pass this parameter) so the outcome is something like /opt/ml/output/dataplot1.jpg. The same happen if you use the model_dir in the same way. I'd rather use something like os.path.join like you're already doing for the model. here a nice exaplaination about these folders and environment variables in sagemaker.
Can any one help me
I am facing this error "h5py objects cannot be pickled" while running (train.py) on https://github.com/RoyalSkye/Image-Caption
(my OS is Window 10)
train.py
#!/usr/bin/env python3
import h5py
import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import *
from transformer import *
from datasets import *
from utils import *
from nltk.translate.bleu_score import corpus_bleu
import argparse
import codecs
import numpy as np
def train(args, train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch):
"""
Performs one epoch's training.
:param train_loader: DataLoader for training data
:param encoder: encoder model
:param decoder: decoder model
:param criterion: loss layer
:param encoder_optimizer: optimizer to update encoder's weights (if fine-tuning)
:param decoder_optimizer: optimizer to update decoder's weights
:param epoch: epoch number
"""
decoder.train() # train mode (dropout and batchnorm is used)
encoder.train()
batch_time = AverageMeter() # forward prop. + back prop. time
data_time = AverageMeter() # data loading time
losses = AverageMeter() # loss (per word decoded)
top5accs = AverageMeter() # top5 accuracy
start = time.time()
# Batches
for i, (imgs, caps, caplens) in enumerate(train_loader):
data_time.update(time.time() - start)
# Move to GPU, if available
imgs = imgs.to(device)
caps = caps.to(device)
caplens = caplens.to(device)
# Forward prop.
imgs = encoder(imgs)
# imgs: [batch_size, 14, 14, 2048]
# caps: [batch_size, 52]
# caplens: [batch_size, 1]
scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)
# Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
targets = caps_sorted[:, 1:]
# Remove timesteps that we didn't decode at, or are pads
# pack_padded_sequence is an easy trick to do this
scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data
# print(scores.size())
# print(targets.size())
# Calculate loss
loss = criterion(scores, targets)
# Add doubly stochastic attention regularization
# Second loss, mentioned in paper "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention"
# https://arxiv.org/abs/1502.03044
# In section 4.2.1 Doubly stochastic attention regularization: We know the weights sum to 1 at a given timestep.
# But we also encourage the weights at a single pixel p to sum to 1 across all timesteps T.
# This means we want the model to attend to every pixel over the course of generating the entire sequence.
# Therefore, we want to minimize the difference between 1 and the sum of a pixel's weights across all timesteps.
if args.decoder_mode == "lstm":
loss += args.alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
elif args.decoder_mode == "transformer":
dec_alphas = alphas["dec_enc_attns"]
alpha_trans_c = args.alpha_c / (args.n_heads * args.decoder_layers)
for layer in range(args.decoder_layers): # args.decoder_layers = len(dec_alphas)
cur_layer_alphas = dec_alphas[layer] # [batch_size, n_heads, 52, 196]
for h in range(args.n_heads):
cur_head_alpha = cur_layer_alphas[:, h, :, :]
loss += alpha_trans_c * ((1. - cur_head_alpha.sum(dim=1)) ** 2).mean()
# Back prop.
decoder_optimizer.zero_grad()
if encoder_optimizer is not None:
encoder_optimizer.zero_grad()
loss.backward()
# Clip gradients
if args.grad_clip is not None:
clip_gradient(decoder_optimizer, args.grad_clip)
if encoder_optimizer is not None:
clip_gradient(encoder_optimizer, args.grad_clip)
# Update weights
decoder_optimizer.step()
if encoder_optimizer is not None:
encoder_optimizer.step()
# Keep track of metrics
top5 = accuracy(scores, targets, 5)
losses.update(loss.item(), sum(decode_lengths))
top5accs.update(top5, sum(decode_lengths))
batch_time.update(time.time() - start)
start = time.time()
if i % args.print_freq == 0:
print("Epoch: {}/{} step: {}/{} Loss: {} AVG_Loss: {} Top-5 Accuracy: {} Batch_time: {}s".format(epoch+1, args.epochs, i+1, len(train_loader), losses.val, losses.avg, top5accs.val, batch_time.val))
def validate(args, val_loader, encoder, decoder, criterion):
"""
Performs one epoch's validation.
:param val_loader: DataLoader for validation data.
:param encoder: encoder model
:param decoder: decoder model
:param criterion: loss layer
:return: score_dict {'Bleu_1': 0., 'Bleu_2': 0., 'Bleu_3': 0., 'Bleu_4': 0., 'METEOR': 0., 'ROUGE_L': 0., 'CIDEr': 1.}
"""
decoder.eval() # eval mode (no dropout or batchnorm)
if encoder is not None:
encoder.eval()
batch_time = AverageMeter()
losses = AverageMeter()
top5accs = AverageMeter()
start = time.time()
references = list() # references (true captions) for calculating BLEU-4 score
hypotheses = list() # hypotheses (predictions)
# explicitly disable gradient calculation to avoid CUDA memory error
with torch.no_grad():
# Batches
for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):
# Move to device, if available
imgs = imgs.to(device)
caps = caps.to(device)
caplens = caplens.to(device)
# Forward prop.
if encoder is not None:
imgs = encoder(imgs)
scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)
# Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
targets = caps_sorted[:, 1:]
# Remove timesteps that we didn't decode at, or are pads
# pack_padded_sequence is an easy trick to do this
scores_copy = scores.clone()
scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data
# Calculate loss
loss = criterion(scores, targets)
# Add doubly stochastic attention regularization
if args.decoder_mode == "lstm":
loss += args.alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
elif args.decoder_mode == "transformer":
dec_alphas = alphas["dec_enc_attns"]
alpha_trans_c = args.alpha_c / (args.n_heads * args.decoder_layers)
for layer in range(args.decoder_layers): # args.decoder_layers = len(dec_alphas)
cur_layer_alphas = dec_alphas[layer] # [batch_size, n_heads, 52, 196]
for h in range(args.n_heads):
cur_head_alpha = cur_layer_alphas[:, h, :, :]
loss += alpha_trans_c * ((1. - cur_head_alpha.sum(dim=1)) ** 2).mean()
# Keep track of metrics
losses.update(loss.item(), sum(decode_lengths))
top5 = accuracy(scores, targets, 5)
top5accs.update(top5, sum(decode_lengths))
batch_time.update(time.time() - start)
start = time.time()
# Store references (true captions), and hypothesis (prediction) for each image
# If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
# references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
# References
allcaps = allcaps[sort_ind] # because images were sorted in the decoder
for j in range(allcaps.shape[0]):
img_caps = allcaps[j].tolist()
img_captions = list(
map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
img_caps)) # remove <start> and pads
references.append(img_captions)
# Hypotheses
_, preds = torch.max(scores_copy, dim=2)
preds = preds.tolist()
temp_preds = list()
for j, p in enumerate(preds):
temp_preds.append(preds[j][:decode_lengths[j]]) # remove pads
preds = temp_preds
hypotheses.extend(preds)
assert len(references) == len(hypotheses)
# Calculate BLEU-1~4 scores
# metrics = {}
# weights = (1.0 / 1.0,)
# metrics["bleu1"] = corpus_bleu(references, hypotheses, weights)
# weights = (1.0/2.0, 1.0/2.0,)
# metrics["bleu2"] = corpus_bleu(references, hypotheses, weights)
# weights = (1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0,)
# metrics["bleu3"] = corpus_bleu(references, hypotheses, weights)
# metrics["bleu4"] = corpus_bleu(references, hypotheses)
# Calculate BLEU1~4, METEOR, ROUGE_L, CIDEr scores
metrics = get_eval_score(references, hypotheses)
print("EVA LOSS: {} TOP-5 Accuracy {} BLEU-1 {} BLEU2 {} BLEU3 {} BLEU-4 {} METEOR {} ROUGE_L {} CIDEr {}".format
(losses.avg, top5accs.avg, metrics["Bleu_1"], metrics["Bleu_2"], metrics["Bleu_3"], metrics["Bleu_4"],
metrics["METEOR"], metrics["ROUGE_L"], metrics["CIDEr"]))
return metrics
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Image_Captioning')
# Data parameters
parser.add_argument('--data_folder', default="./dataset/generated_data",
help='folder with data files saved by create_input_files.py.')
parser.add_argument('--data_name', default="coco_5_cap_per_img_5_min_word_freq",
help='base name shared by data files.')
# Model parameters
parser.add_argument('--emb_dim', type=int, default=300, help='dimension of word embeddings.')
parser.add_argument('--attention_dim', type=int, default=512, help='dimension of attention linear layers.')
parser.add_argument('--decoder_dim', type=int, default=512, help='dimension of decoder RNN.')
parser.add_argument('--n_heads', type=int, default=8, help='Multi-head attention.')
parser.add_argument('--dropout', type=float, default=0.1, help='dropout')
parser.add_argument('--decoder_mode', default="transformer", help='which model does decoder use?') # lstm or transformer
parser.add_argument('--attention_method', default="ByPixel", help='which attention method to use?') # ByPixel or ByChannel
parser.add_argument('--encoder_layers', type=int, default=2, help='the number of layers of encoder in Transformer.')
parser.add_argument('--decoder_layers', type=int, default=6, help='the number of layers of decoder in Transformer.')
# Training parameters
parser.add_argument('--epochs', type=int, default=100,
help='number of epochs to train for (if early stopping is not triggered).')
parser.add_argument('--stop_criteria', type=int, default=25, help='training stop if epochs_since_improvement == stop_criteria')
parser.add_argument('--batch_size', type=int, default=32, help='batch_size')
parser.add_argument('--print_freq', type=int, default=100, help='print training/validation stats every __ batches.')
parser.add_argument('--workers', type=int, default=1, help='for data-loading; right now, only 1 works with h5pys.')
parser.add_argument('--encoder_lr', type=float, default=1e-4, help='learning rate for encoder if fine-tuning.')
parser.add_argument('--decoder_lr', type=float, default=1e-4, help='learning rate for decoder.')
parser.add_argument('--grad_clip', type=float, default=5., help='clip gradients at an absolute value of.')
parser.add_argument('--alpha_c', type=float, default=1.,
help='regularization parameter for doubly stochastic attention, as in the paper.')
parser.add_argument('--fine_tune_encoder', type=bool, default=False, help='whether fine-tune encoder or not')
parser.add_argument('--fine_tune_embedding', type=bool, default=False, help='whether fine-tune word embeddings or not')
parser.add_argument('--checkpoint', default=None, help='path to checkpoint, None if none.')
parser.add_argument('--embedding_path', default=None, help='path to pre-trained word Embedding.')
args = parser.parse_args()
# load checkpoint, these parameters can't be modified
final_args = {"emb_dim": args.emb_dim,
"attention_dim": args.attention_dim,
"decoder_dim": args.decoder_dim,
"n_heads": args.n_heads,
"dropout": args.dropout,
"decoder_mode": args.decoder_mode,
"attention_method": args.attention_method,
"encoder_layers": args.encoder_layers,
"decoder_layers": args.decoder_layers}
start_epoch = 0
best_bleu4 = 0. # BLEU-4 score right now
epochs_since_improvement = 0 # keeps track of number of epochs since there's been an improvement in validation BLEU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # sets device for model and PyTorch tensors
cudnn.benchmark = True # set to true only if inputs to model are fixed size; otherwise lot of computational overhead
print(device)
# Read word map
word_map_file = os.path.join(args.data_folder, 'WORDMAP_' + args.data_name + '.json')
with open(word_map_file, 'r') as j:
word_map = json.load(j)
# Initialize / load checkpoint
if args.checkpoint is None:
encoder = CNN_Encoder(attention_method=args.attention_method)
encoder.fine_tune(args.fine_tune_encoder)
encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
lr=args.encoder_lr) if args.fine_tune_encoder else None
if args.decoder_mode == "lstm":
decoder = DecoderWithAttention(attention_dim=args.attention_dim,
embed_dim=args.emb_dim,
decoder_dim=args.decoder_dim,
vocab_size=len(word_map),
dropout=args.dropout)
elif args.decoder_mode == "transformer":
decoder = Transformer(vocab_size=len(word_map), embed_dim=args.emb_dim, encoder_layers=args.encoder_layers,
decoder_layers=args.decoder_layers, dropout=args.dropout,
attention_method=args.attention_method, n_heads=args.n_heads)
decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
lr=args.decoder_lr)
# load pre-trained word embedding
if args.embedding_path is not None:
all_word_embeds = {}
for i, line in enumerate(codecs.open(args.embedding_path, 'r', 'utf-8')):
s = line.strip().split()
all_word_embeds[s[0]] = np.array([float(i) for i in s[1:]])
# change emb_dim
args.emb_dim = list(all_word_embeds.values())[-1].size
word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (len(word_map), args.emb_dim))
for w in word_map:
if w in all_word_embeds:
word_embeds[word_map[w]] = all_word_embeds[w]
elif w.lower() in all_word_embeds:
word_embeds[word_map[w]] = all_word_embeds[w.lower()]
else:
# <pad> <start> <end> <unk>
embedding_i = torch.ones(1, args.emb_dim)
torch.nn.init.xavier_uniform_(embedding_i)
word_embeds[word_map[w]] = embedding_i
word_embeds = torch.FloatTensor(word_embeds).to(device)
decoder.load_pretrained_embeddings(word_embeds)
decoder.fine_tune_embeddings(args.fine_tune_embedding)
print('Loaded {} pre-trained word embeddings.'.format(len(word_embeds)))
else:
checkpoint = torch.load(args.checkpoint, map_location=str(device))
start_epoch = checkpoint['epoch'] + 1
epochs_since_improvement = checkpoint['epochs_since_improvement']
best_bleu4 = checkpoint['metrics']["Bleu_4"]
encoder = checkpoint['encoder']
encoder_optimizer = checkpoint['encoder_optimizer']
decoder = checkpoint['decoder']
decoder_optimizer = checkpoint['decoder_optimizer']
decoder.fine_tune_embeddings(args.fine_tune_embedding)
# load final_args from checkpoint
final_args = checkpoint['final_args']
for key in final_args.keys():
args.__setattr__(key, final_args[key])
if args.fine_tune_encoder is True and encoder_optimizer is None:
print("Encoder_Optimizer is None, Creating new Encoder_Optimizer!")
encoder.fine_tune(args.fine_tune_encoder)
encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
lr=args.encoder_lr)
# Move to GPU, if available
decoder = decoder.to(device)
encoder = encoder.to(device)
print("encoder_layers {} decoder_layers {} n_heads {} dropout {} attention_method {} encoder_lr {} "
"decoder_lr {} alpha_c {}".format(args.encoder_layers, args.decoder_layers, args.n_heads, args.dropout,
args.attention_method, args.encoder_lr, args.decoder_lr, args.alpha_c))
print(encoder)
print(decoder)
# Loss function
criterion = nn.CrossEntropyLoss().to(device)
# Custom dataloaders
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
# pin_memory: If True, the data loader will copy Tensors into CUDA pinned memory before returning them.
# If your data elements are a custom type, or your collate_fn returns a batch that is a custom type.
train_loader = torch.utils.data.DataLoader(
CaptionDataset(args.data_folder, args.data_name, 'TRAIN', transform=transforms.Compose([normalize])),
batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
val_loader = torch.utils.data.DataLoader(
CaptionDataset(args.data_folder, args.data_name, 'VAL', transform=transforms.Compose([normalize])),
batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
# Epochs
for epoch in range(start_epoch, args.epochs):
# Decay learning rate if there is no improvement for 5 consecutive epochs, and terminate training after 25
# 8 20
if epochs_since_improvement == args.stop_criteria:
print("the model has not improved in the last {} epochs".format(args.stop_criteria))
break
if epochs_since_improvement > 0 and epochs_since_improvement % 5 == 0:
adjust_learning_rate(decoder_optimizer, 0.8)
if args.fine_tune_encoder and encoder_optimizer is not None:
print(encoder_optimizer)
adjust_learning_rate(encoder_optimizer, 0.8)
# One epoch's training
train(args, train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion,
encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer, epoch=epoch)
# One epoch's validation
metrics = validate(args, val_loader=val_loader, encoder=encoder, decoder=decoder, criterion=criterion)
recent_bleu4 = metrics["Bleu_4"]
# Check if there was an improvement
is_best = recent_bleu4 > best_bleu4
best_bleu4 = max(recent_bleu4, best_bleu4)
if not is_best:
epochs_since_improvement += 1
print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
else:
epochs_since_improvement = 0
# Save checkpoint
save_checkpoint(args.data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
decoder_optimizer, metrics, is_best, final_args)
Traceback (most recent call last):
File D:\COCO\imge_captioning_transform_github\3\Image-Caption-master\train.py:394 in
train(args, train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion,
File D:\COCO\imge_captioning_transform_github\3\Image-Caption-master\train.py:44 in train
for i, (imgs, caps, caplens) in enumerate(train_loader):
File ~\anaconda3\envs\my_envir_gpu\lib\site-packages\torch\utils\data\dataloader.py:368 in iter
return self._get_iterator()
File ~\anaconda3\envs\my_envir_gpu\lib\site-packages\torch\utils\data\dataloader.py:314 in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File ~\anaconda3\envs\my_envir_gpu\lib\site-packages\torch\utils\data\dataloader.py:927 in init
w.start()
File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\process.py:121 in start
self._popen = self._Popen(self)
File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\context.py:224 in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\context.py:327 in _Popen
return Popen(process_obj)
File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\popen_spawn_win32.py:93 in init
reduction.dump(process_obj, to_child)
File ~\anaconda3\envs\my_envir_gpu\lib\multiprocessing\reduction.py:60 in dump
ForkingPickler(file, protocol).dump(obj)
File ~\anaconda3\envs\my_envir_gpu\lib\site-packages\h5py_hl\base.py:368 in getnewargs
raise TypeError("h5py objects cannot be pickled")
TypeError: h5py objects cannot be pickled
2022-06-30 17:24:41.206091: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-30 17:24:41.525476: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3497 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
2022-06-30 17:24:44.486920: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
type_id: TFT_PRODUCT
args {
type_id: TFT_TENSOR
args {
type_id: TFT_LEGACY_VARIANT
}
}
}
is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
type_id: TFT_PRODUCT
args {
type_id: TFT_TENSOR
args {
type_id: TFT_INT32
}
}
}
while inferring type of node 'cond_40/output/_25'
2022-06-30 17:24:45.077383: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
Traceback (most recent call last):
File "", line 1, in
File "C:\Users\MSI\anaconda3\envs\my_envir_gpu\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "C:\Users\MSI\anaconda3\envs\my_envir_gpu\lib\multiprocessing\spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
EOFError: Ran out of input
I am using Python 3.9, PyTorch 1.10 with Cuda 11.3 (WINDOWS 10)
Thanks,
I am trying ( num_workers=0 ) , but still same error
Since Pytorch GPU support for apple silicon was just released, I tried to install PyTorch using the steps on the following link. As of now, only a nightly build is available so I installed it. However, when I run the following code, I get the error.
Link I followed:
https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/
https://pytorch.org/get-started/locally/
Code:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(9216, 128)
self.fc2 = nn.Linear(128, 10)
def forward(self, x):
x = self.conv1(x)
x = F.relu(x)
x = self.conv2(x)
x = F.relu(x)
x = F.max_pool2d(x, 2)
x = self.dropout1(x)
x = torch.flatten(x, 1)
x = self.fc1(x)
x = F.relu(x)
x = self.dropout2(x)
x = self.fc2(x)
output = F.log_softmax(x, dim=1)
return output
def train(args, model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
if args.dry_run:
break
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
def main():
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=14, metavar='N',
help='number of epochs to train (default: 14)')
parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
help='learning rate (default: 1.0)')
parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
help='Learning rate step gamma (default: 0.7)')
parser.add_argument('--device', default='cpu',
help='choose device')
parser.add_argument('--dry-run', action='store_true', default=False,
help='quickly check a single pass')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument('--save-model', action='store_true', default=False,
help='For Saving the current Model')
args = parser.parse_args()
torch.manual_seed(args.seed)
device = torch.device(args.device)
train_kwargs = {'batch_size': args.batch_size}
test_kwargs = {'batch_size': args.test_batch_size}
#if use_cuda:
# cuda_kwargs = {'num_workers': 1,
# 'pin_memory': True,
# 'shuffle': True}
# train_kwargs.update(cuda_kwargs)
# test_kwargs.update(cuda_kwargs)
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
dataset1 = datasets.MNIST('../data', train=True, download=True,
transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
transform=transform)
train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
model = Net().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
for epoch in range(1, args.epochs + 1):
train(args, model, device, train_loader, optimizer, epoch)
test(model, device, test_loader)
scheduler.step()
if args.save_model:
torch.save(model.state_dict(), "mnist_cnn.pt")
if __name__ == '__main__':
main()
# run with --device cpu or --device mps
Error:
RuntimeError: The MPS backend is supported on MacOS 12.3+.Current OS version can be queried using sw_vers. I am not sure how to use this argument sw_vers.
My Macbook specification:
Model: M1 Max
OS Version: 12.2.1
From my inspection, it looks like my OS needs to at least 12.3 for it to run however it says I can use sw_vers to run it on my current OS. I don't want to update because of other libraries compatibility issue. Can anyone figure this out?
I am trying to use torchvision.transforms to apply transformtation the training data, but getting the following traceback error, Traceback (most recent call last):
File "train4.py", line 113, in
targets = torch.tensor([metadata['count'][os.path.split(path)[-1]] for path in paths]) # B
ValueError: too many dimensions 'str'
import argparse
import datetime
import glob
import os
import random
import shutil
import time
from os.path import join
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision.transforms import ToTensor, RandomChoice, Compose, functional
from tqdm import tqdm
from convnet0 import Convnet
from dataset2 import CellsDataset
import pdb
parser = argparse.ArgumentParser('Predicting hits from pixels')
parser.add_argument('name',type=str,help='Name of experiment')
parser.add_argument('data_dir',type=str,help='Path to data directory containing images and gt.csv')
parser.add_argument('--num_steps',type=int,default=20000,help='Number of training iterations')
parser.add_argument('--batchsize',type=int,default=16,help='Size of batch')
parser.add_argument('--weight_decay',type=float,default=0.0,help='Weight decay coefficient (something like 10^-5)')
parser.add_argument('--lr',type=float,default=0.0001,help='Learning rate')
parser.add_argument('--resume',action='store_true',help='Resume experiments from checkpoint directory')
parser.add_argument('--seed',type=int,default=1337,help='RNG seed')
args = parser.parse_args()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
# create output directory tree:
if os.path.isdir(args.name) and not args.resume:
shutil.rmtree(args.name)
logs_path = join(args.name,'logs')
checkpoints_path = join(args.name,'checkpoints')
checkpoint_path = join(checkpoints_path,'checkpoint.pth')
if not os.path.isdir(args.name):
os.mkdir(args.name)
os.mkdir(logs_path)
os.mkdir(checkpoints_path)
# record arguments for future reference:
with open(join(args.name,'arguments.txt'),'w') as fout:
fout.write(str(args))
# create datasets:
toTensor = ToTensor()
dataset_train = CellsDataset(args.data_dir,transform=ToTensor(),return_filenames=True)
# duplicate the dataset and transform the duplicates
transforms = Compose([RandomChoice([functional.hflip,functional.vflip])])
dataset_train.duplicate_and_transform(transforms)
print(dataset_train.Size())
# load metadata:
metadata = pd.read_csv(join(args.data_dir,'gt.csv'))
metadata.set_index('filename', inplace=True)
# assign to train or test
random.shuffle(dataset_train.files)
split_point = int(len(dataset_train) * 0.9) # 90/10 train/val split
dataset_test = CellsDataset(args.data_dir,transform=ToTensor(),return_filenames=True)
dataset_test.files = dataset_train.files[split_point:]
dataset_train.files = dataset_train.files[:split_point]
loader_train = DataLoader(dataset_train,batch_size=args.batchsize,shuffle=True,num_workers=4,pin_memory=True)
loader_test = DataLoader(dataset_test, batch_size=args.batchsize,shuffle=True,num_workers=4,pin_memory=True)
# create model:
model = Convnet()
model.to(device)
# create optimizer:
optimizer = torch.optim.Adam(model.parameters(),lr=args.lr,weight_decay=args.weight_decay)
if args.resume:
try:
checkpoint = torch.load(checkpoint_path)
print('Resuming from checkpoint...')
model.load_state_dict(checkpoint['state_dict'])
globalStep = checkpoint['globalStep']
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
dataset_train.files = checkpoint['train_paths']
dataset_test.files = checkpoint['test_paths']
except FileNotFoundError:
globalStep = 0
# create logger:
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")
writer = SummaryWriter()
# main training loop
global_step = 0
best_test_error = 10000
for epoch in range(15):
print("Epoch %d" % epoch)
model.train()
for images, paths in tqdm(loader_train):
images = images.to(device)
pdb.set_trace()
targets = torch.tensor([metadata['count'][os.path.split(path)[-1]] for path in paths]) # B
targets = targets.float().to(device)
# forward pass:
output = model(images) # B x 1 x 9 x 9 (analogous to a heatmap)
preds = output.sum(dim=[1,2,3]) # predicted cell counts (vector of length B)
# backward pass:
loss = torch.mean((preds - targets)**2)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# logging:
count_error = torch.abs(preds - targets).mean()
writer.add_scalar('train_loss', loss.item(), global_step=global_step)
writer.add_scalar('train_count_error', count_error.item(), global_step=global_step)
print("Step %d, loss=%f, count error=%f" % (global_step,loss.item(),count_error.item()))
global_step += 1
mean_test_error = 0
model.eval()
for images, paths in tqdm(loader_test):
images = images.to(device)
targets = torch.tensor([metadata['count'][os.path.split(path)[-1]] for path in paths]) # B
targets = targets.float().to(device)
# forward pass:
output = model(images) # B x 1 x 9 x 9 (analogous to a heatmap)
preds = output.sum(dim=[1,2,3]) # predicted cell counts (vector of length B)
# logging:
loss = torch.mean((preds - targets)**2)
count_error = torch.abs(preds - targets).mean()
mean_test_error += count_error
writer.add_scalar('test_loss', loss.item(), global_step=global_step)
writer.add_scalar('test_count_error', count_error.item(), global_step=global_step)
global_step += 1
mean_test_error = mean_test_error / len(loader_test)
print("Test count error: %f" % mean_test_error)
if mean_test_error < best_test_error:
best_test_error = mean_test_error
torch.save({'state_dict':model.state_dict(),
'optimizer_state_dict':optimizer.state_dict(),
'globalStep':global_step,
'train_paths':dataset_train.files,
'test_paths':dataset_test.files},checkpoint_path)