I have a file containing paths to images that I would like to load into PyTorch, while utilizing the built-in DataLoader features (multiprocess loading pipeline, data augmentations, and so on).
def create_links():
    data_dir = "/myfolder"
    full_path_list = []
    assert os.path.isdir(data_dir)
    for dirpath, _, filenames in os.walk(data_dir):
        for filename in filenames:
            # join with dirpath (not data_dir), so files in subdirectories get correct paths
            full_path_list.append(os.path.join(dirpath, filename))

    with open(config.data.links_file, 'w+') as links_file:
        for full_path in full_path_list:
            links_file.write(f"{full_path}\n")
def read_links_file_to_list():
    config = ConfigProvider.config()
    links_file_path = config.data.links_file
    if not os.path.isfile(links_file_path):
        raise RuntimeError("did you forget to create a file with links to images? Try using 'create_links()'")
    with open(links_file_path, 'r') as links_file:
        # strip the trailing newline from each line so the paths are directly usable
        return [line.strip() for line in links_file]
So I have a list of files (or a generator, or whatever works), file_list = read_links_file_to_list().
How can I build a PyTorch DataLoader around it, and how would I use it?
What you want is a custom Dataset. The __getitem__ method is where you would apply transforms such as data augmentation. To give you an idea of what it looks like in practice, you can take a look at this custom Dataset I wrote the other day:
class GTSR43Dataset(Dataset):
    """German Traffic Sign Recognition dataset."""
    def __init__(self, root_dir, train_file, transform=None):
        self.root_dir = root_dir
        self.train_file_path = train_file
        self.label_df = pd.read_csv(os.path.join(self.root_dir, self.train_file_path))
        self.transform = transform
        self.classes = list(self.label_df['ClassId'].unique())

    def __getitem__(self, idx):
        """Return (image, target) after resize and preprocessing."""
        img = os.path.join(self.root_dir, self.label_df.iloc[idx, 7])
        X = Image.open(img)
        y = self.class_to_index(self.label_df.iloc[idx, 6])
        if self.transform:
            X = self.transform(X)
        return X, y

    def class_to_index(self, class_name):
        """Returns the index of a given class."""
        return self.classes.index(class_name)

    def index_to_class(self, class_index):
        """Returns the class of a given index."""
        return self.classes[class_index]

    def get_class_count(self):
        """Returns a dict of label occurrences."""
        cls_count = dict(self.label_df.ClassId.value_counts())
        # cls_percent = list(map(lambda x: (1 - x / sum(cls_count)), cls_count))
        return cls_count

    def __len__(self):
        """Returns the length of the dataset."""
        return len(self.label_df)
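Adapting the same pattern to your list of paths is straightforward. Below is a minimal sketch, assuming RGB images and torchvision transforms; ImageListDataset is a hypothetical name, and read_links_file_to_list() is your helper from above:

import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T

class ImageListDataset(Dataset):  # hypothetical name
    def __init__(self, file_list, transform=None):
        self.file_list = list(file_list)
        self.transform = transform

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        img = Image.open(self.file_list[idx]).convert('RGB')
        if self.transform:
            img = self.transform(img)
        return img

dataset = ImageListDataset(read_links_file_to_list(), transform=T.ToTensor())
# multiprocess loading, shuffling, and batching all come from the DataLoader
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
for batch in loader:
    pass  # feed each batch to your model here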
I am training a custom encoder-decoder network, but the training gets stuck at epoch 3: nothing happens for about 2 hours. I will share the Dataset class and the DataLoader object below. (The CUDA version, the GPU, the stalled training output, and the nvidia-smi output were shown in screenshots that are not reproduced here.)
The __init__ and __getitem__ methods of the dataset class look like this:
def __init__(self,
             images_dir,
             annots_dir,
             train=True,
             img_size=(512, 1536),
             stride=4,
             model='custom',
             transforms=None):
    """
    :param root: dataset directory
    :param filenames: filenames inside the root directory
    :param labels: Object Detection Labels
    """
    super().__init__()
    self.images_dir = images_dir
    self.annots_dir = annots_dir
    self.train = train
    self.image_size = img_size
    self.stride = stride
    self.transforms = transforms
    self.model = model
    # Load the image and annotation files from the dataset
    # self.image_files, self.annot_files = self._load_image_and_annot_files()
    self.image_files = [os.path.join(self.images_dir, idx) for idx in os.listdir(self.images_dir)]
    self.annot_files = [os.path.join(self.annots_dir, idx) for idx in os.listdir(self.annots_dir)]
def __getitem__(self, index):
    """
    :param index: index...0 to N
    :return: tensor_image and tensor_label
    """
    # Image filename from _load_image_files()
    # Load Image with _read_matrix() and label
    curr_image_filename = self.image_files[index]
    curr_annot_filename = self.annot_files[index]
    np_image = self._read_matrix(raw_img=curr_image_filename)
    np_image_normalized = np.squeeze(self._normalize_raw_img(np_image))
    # label = self.labels[index]
    boxes, classes, depths, tgts = self._load_annotations(curr_annot_filename)
    # Normalize bounding boxes: range [0, 1]
    targets_normalized = self._normalize_bbox(np_image_normalized, tgts)

    # image and the corresponding label should be a tensor
    torch_image = torch.from_numpy(np_image).reshape(1, 512, 1536).float()  # dtype: torch.float32
    torch_boxes = torch.from_numpy(boxes).type(torch.FloatTensor)
    torch_depths = torch.from_numpy(depths)

    if self.model == 'fasterrcnn':
        # For FasterRCNN: as COCO format
        area = (torch_boxes[:, 3] - torch_boxes[:, 1]) * (torch_boxes[:, 2] - torch_boxes[:, 0])
        iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)
        image_id = torch.Tensor([index])
        torch_classes = torch.from_numpy(classes)
        target = {'boxes': torch_boxes, 'labels': torch_classes.long(),
                  'area': area, 'iscrowd': iscrowd, 'image_id': image_id}
        return torch_image, target
    elif self.model == 'custom':
        if self.train:
            if self.transforms:
                try:
                    tr = self.transforms()
                    transform_image, transform_boxes, labels = tr(np_image, tgts, tgts[:, :4], tgts[:, 4:])
                    transform_targets = np.hstack((np.array(transform_boxes), labels))
                    gt_tensor = gt_creator(img_size=self.image_size,
                                           stride=self.stride,
                                           num_classes=8,
                                           label_lists=transform_targets)
                    return torch.from_numpy(transform_image).float(), gt_tensor
                except IndexError:
                    # NOTE: falling through here returns None, which will break
                    # the DataLoader's collate function downstream
                    pass
            else:
                gt_tensor = gt_creator(img_size=self.image_size,
                                       stride=self.stride,
                                       num_classes=8,
                                       label_lists=targets_normalized)
                return torch_image, gt_tensor
        else:
            return torch_image, targets_normalized
And in the train.py script the DataLoader object is:
train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                           shuffle=True,
                                           batch_size=1,
                                           num_workers=0,
                                           collate_fn=detection_collate,
                                           pin_memory=True)
Why does the training get stuck? Is there an issue with the __getitem__ method? Or the DataLoader?
Thank You.
This happens because torch doesn't restart your dataset; if your data runs out, it stops and waits for more input, so cycling has to be done manually.
I used something along the lines of:

from itertools import cycle

class Dataloader():
    # __init__ and whatever else you need

    def __iter__(self):
        return cycle(self.get_sample())  # get_sample is your current __getitem__
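If you would rather keep the standard torch.utils.data.DataLoader, a minimal sketch of the same manual-cycling idea (train_loader is the loader from the question; num_steps is a hypothetical step budget):

from itertools import cycle

batches = cycle(train_loader)  # re-iterates the loader whenever it runs out
for step in range(num_steps):
    images, targets = next(batches)
    # ... forward/backward pass ...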
In a PyTorch data loader, how could I concatenate an image (let's say x.jpg) band-wise to each and every input image, so that in effect I have a 4-band input (the 3-band input jpg plus the 1-band x.jpg)? How can I implement it?
Please find below an example of my current data loader, which just loads the images. To this, I want to add x.jpg to "image" (i.e. the input image, not the mask).
import os
from PIL import Image
from torch.utils.data import Dataset

class lakeDataSet(Dataset):
    def __init__(self, root, transform):
        super().__init__()
        self.root = root
        self.img_dir = os.path.join(root, 'image-c3/c3-crop')  # 9UAV
        self.mask_dir = os.path.join(root, 'label-c3/c3-crop')
        # self.mask_dir = os.path.join(root, 'test')
        self.files = [fname for fname in os.listdir(self.img_dir) if fname.endswith('.jpg')]
        self.transform = transform

    def __len__(self):
        return len(self.files)

    def __getitem__(self, i):
        fname = self.files[i]
        img_path = os.path.join(self.img_dir, fname)
        mask_path = os.path.join(self.mask_dir, fname)
        img = self.transform(Image.open(img_path))
        mask = self.transform(Image.open(mask_path))
        return img, mask
I suppose self.transform already includes ToTensor; otherwise you should add it as well.
Then you can just concatenate along the first (channel) dimension, like:

x_jpg = self.transform(Image.open('x.jpg'))
img = torch.cat((img, x_jpg), 0)

x.jpg has to have only 1 channel; if it's RGB, then obviously you'll end up with 6 channels instead of 4.
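Putting it together, a sketch of the modified __getitem__ (assuming x.jpg lives under self.root; loading it once in __init__ and caching it would avoid re-reading it on every call):

def __getitem__(self, i):
    fname = self.files[i]
    img = self.transform(Image.open(os.path.join(self.img_dir, fname)))
    mask = self.transform(Image.open(os.path.join(self.mask_dir, fname)))
    x_jpg = self.transform(Image.open(os.path.join(self.root, 'x.jpg')))  # 1-channel extra band
    img = torch.cat((img, x_jpg), 0)  # 3 channels + 1 channel -> 4-channel input
    return img, mask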
I am working on an image retrieval project. To make the model fairer, I want to construct batches that return:
5 images per class, and
75 images per batch.
I have 300 classes in total in my dataset, so obviously only 15 classes can be represented in each batch. The data is balanced, meaning there is an equal number of images per class. I am using PyTorch.
I have created a PyTorch dataset, and I want to add the above functionality to my ImageFolderLoader class, whose code I added below.
IMG_EXTENSIONS = [
    '.jpg', '.JPG', '.jpeg', '.JPEG',
    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
]

def is_image_file(filename):
    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)

def find_classes(dir):
    classes = os.listdir(dir)
    classes.sort()
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    classes = [clss.split('.')[1] for clss in classes]
    return classes, class_to_idx

def make_dataset(dir, class_to_idx):
    images = []
    for target in os.listdir(dir):
        d = os.path.join(dir, target)
        if not os.path.isdir(d):
            continue
        for filename in os.listdir(d):
            if is_image_file(filename):
                path = '{0}/{1}'.format(target, filename)
                item = (path, class_to_idx[target])
                images.append(item)
    return images

def default_loader(path):
    return Image.open(path).convert('RGB')

class ImageFolderLoader(Dataset):
    def __init__(self, root, transform=None, loader=default_loader):
        classes, class_to_idx = find_classes(root)
        imgs = make_dataset(root, class_to_idx)
        self.root = root
        self.imgs = imgs
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.transform = transform
        self.loader = loader

    def __getitem__(self, index):
        path, target = self.imgs[index]
        img = self.loader(os.path.join(self.root, path))
        if self.transform is not None:
            img = self.transform(img)
        return img, target

    def __len__(self):
        return len(self.imgs)
If there is a way to do this, please let me know.
Edit: for anyone who wants to see the solution, I added it below after solving the problem.
I solved the problem by passing a batch_sampler to the DataLoader. For this I used the pytorch-balanced-sampler git project, which allows awesome customization of the batch_sampler; you should visit the repo.
My custom dataset:
IMG_EXTENSIONS = [
    '.jpg', '.JPG', '.jpeg', '.JPEG',
    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
]

def is_image_file(filename):
    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)

def find_classes(dir):
    classes = os.listdir(dir)
    classes.sort()
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    classes = [clss.split('.')[1] for clss in classes]
    return classes, class_to_idx

def make_dataset(dir, class_to_idx):
    images = []
    for target in os.listdir(dir):
        d = os.path.join(dir, target)
        if not os.path.isdir(d):
            continue
        for filename in os.listdir(d):
            if is_image_file(filename):
                path = '{0}/{1}'.format(target, filename)
                item = (path, class_to_idx[target])
                images.append(item)
    # group the items by class: each class id maps to the list of its items
    data_dict = {}
    for item in images:
        cls = item[1]
        if cls not in data_dict.keys():
            data_dict[cls] = [item]
        else:
            data_dict[cls].append(item)
    return images, data_dict

def default_loader(path):
    return Image.open(path).convert('RGB')

class ImageFolderLoader(Dataset):
    def __init__(self, root, transform=None, loader=default_loader):
        classes, class_to_idx = find_classes(root)
        imgs, instance_labels = make_dataset(root, class_to_idx)
        self.instance_labels = instance_labels
        self.root = root
        self.imgs = imgs
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.transform = transform
        self.loader = loader

    def __getitem__(self, index):
        path, target = self.imgs[index]
        img = self.loader(os.path.join(self.root, path))
        if self.transform is not None:
            img = self.transform(img)
        return img, target

    def __len__(self):
        return len(self.imgs)
Then I used the SamplerFactory class from the pytorch-balanced-sampler project; you need to visit the repository to understand the parameters:
train_data = ImageFolderLoader(root=TRAIN_PATH, transform=transform)
batch_sampler = SamplerFactory().get(
    class_idxs=my_list,
    batch_size=75,
    n_batches=146,
    alpha=1,
    kind="fixed"
)
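The post stops at building the sampler; presumably it is then handed to the loader. A minimal sketch of that final step (assuming train_data and batch_sampler from above; note that batch_sampler is mutually exclusive with batch_size, shuffle, and drop_last):

train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_sampler=batch_sampler  # yields 146 batches of 75 images per epoch
)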
There are a few open-ended questions in how this is implemented. For instance, do you want each class to be equally represented regardless of that class's actual frequency? Note that this may give better performance on minority classes at the expense of performance on majority classes.
Also, do you want each example to be used at most once per epoch, or at least once per epoch?
In any case, this will likely be difficult to accomplish with the standard __getitem__ method, because it returns an example with no regard for the other examples in the same batch. You'll likely need to define a custom data-loading object to ensure good data distribution and usage properties, which is a bit unfortunate, because PyTorch's DataLoader and Dataset objects work together quite nicely and efficiently for most simple use cases. Perhaps someone else has a solution that uses these objects.
Here's a solution that uses random sampling with replacement after each batch, so there's no guarantee that every example will be used. Also, it uses looping, so you could probably do better with parallelization.
import random

class ImageFolderLoader(Dataset):
    def __init__(self, root, transform=None, loader=default_loader):
        classes, class_to_idx = find_classes(root)
        imgs = make_dataset(root, class_to_idx)
        # currently, imgs items are of the form (path, class)
        data_dict = {}
        for item in imgs:
            cls = item[1]
            if cls not in data_dict.keys():
                data_dict[cls] = [item]
            else:
                data_dict[cls].append(item)
        # each class is the key for a list of all items belonging to that class
        self.data_dict = data_dict
        self.root = root
        self.imgs = imgs
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.transform = transform
        self.loader = loader

    def get_batch(self):
        img_batch = []
        label_batch = []
        classes = random.sample(range(300), 15)  # pick 15 of the 300 classes
        for cls in classes:
            class_data = self.data_dict[cls]
            selection = random.sample(range(len(class_data)), 5)  # 5 images per class
            for idx in selection:
                img = self.loader(os.path.join(self.root, class_data[idx][0]))
                if self.transform is not None:
                    img = self.transform(img)
                img_batch.append(img)
                label_batch.append(cls)
        img_batch = torch.stack(img_batch)
        label_batch = torch.tensor(label_batch)  # labels are ints, so tensor() rather than stack()
        return img_batch, label_batch

    def __len__(self):
        return len(self.imgs)
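A hypothetical usage of this class (it bypasses the standard DataLoader, since get_batch assembles whole batches itself; num_batches is a hypothetical step budget):

dataset = ImageFolderLoader(root=TRAIN_PATH, transform=transform)
for step in range(num_batches):
    img_batch, label_batch = dataset.get_batch()  # 15 classes x 5 images = 75 images
    # ... training step ...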
I'm solving the problem of object detection on a fruit dataset: https://yadi.sk/d/UPwQB7OZrB48qQ.
I was given the code for my dataset class:
import glob
import os

import cv2
import torch
import xmltodict
from torch.utils.data import Dataset

class2tag = {"apple": 1, "orange": 2, "banana": 3}

class FruitDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.images = []
        self.annotations = []
        self.transform = transform
        for annotation in glob.glob(data_dir + "/*xml"):
            image_fname = os.path.splitext(annotation)[0] + ".jpg"
            self.images.append(cv2.cvtColor(cv2.imread(image_fname), cv2.COLOR_BGR2RGB))
            with open(annotation) as f:
                annotation_dict = xmltodict.parse(f.read())
            bboxes = []
            labels = []
            objects = annotation_dict["annotation"]["object"]
            if not isinstance(objects, list):
                objects = [objects]
            for obj in objects:
                bndbox = obj["bndbox"]
                bbox = [bndbox["xmin"], bndbox["ymin"], bndbox["xmax"], bndbox["ymax"]]
                bbox = list(map(int, bbox))
                bboxes.append(torch.tensor(bbox))
                labels.append(class2tag[obj["name"]])
            self.annotations.append(
                {"boxes": torch.stack(bboxes).float(), "labels": torch.tensor(labels)}
            )

    def __getitem__(self, i):
        if self.transform:
            # the following code is correct if you use albumentations
            # if you use torchvision transforms you have to modify it =)
            res = self.transform(
                image=self.images[i],
                bboxes=self.annotations[i]["boxes"],
                labels=self.annotations[i]["labels"],
            )
            return res["image"], {
                "boxes": torch.tensor(res["bboxes"]),
                "labels": torch.tensor(res["labels"]),
            }
        else:
            return self.images[i], self.annotations[i]

    def __len__(self):
        return len(self.images)
I'm doing my project in Google Colab, so I've mounted Google Drive and unzipped the archive.
from google.colab import drive
drive.mount('/content/drive')
Then I did some augmentations using albumentations:
train_transform = A.Compose([
    A.Flip(p=0.25),
    A.RGBShift(p=0.2),
], bbox_params=A.BboxParams(format='coco'))

val_transform = A.Compose([], bbox_params=A.BboxParams(format='coco'))

train_dataset = FruitDataset("./train_zip/train", transform=train_transform)
val_dataset = FruitDataset("./test_zip/test", transform=val_transform)
However, when I run len(train_dataset), I get 0, and I cannot figure out why my dataset size is 0 or where the problem lies.
I would be very grateful for any advice.
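One sanity check that follows directly from the constructor above: self.images only grows when glob finds .xml files, so a length of 0 usually means the pattern matched nothing. A minimal probe (using the same path as the question):

import glob
print(glob.glob("./train_zip/train" + "/*xml"))  # an empty list means a wrong path or no .xml files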
In the function read_train_sets(), an empty class called DataSets is created. It has no methods or variables. An object called data_sets is then created.
My question is: is data_sets.train an object of the class DataSet?
Or are you creating a method called train() and setting it equal to an object of the DataSet class?
Note that there are two classes in the code, DataSet and DataSets.
import cv2
import os
import glob
from sklearn.utils import shuffle
import numpy as np

def load_train(train_path, image_size, classes):
    images = []
    labels = []
    img_names = []
    cls = []
    print('Going to read training images')
    for fields in classes:
        index = classes.index(fields)
        print('Now going to read {} files (Index: {})'.format(fields, index))
        path = os.path.join(train_path, fields, '*g')
        files = glob.glob(path)
        for fl in files:
            image = cv2.imread(fl)
            image = cv2.resize(image, (image_size, image_size), 0, 0, cv2.INTER_LINEAR)
            image = image.astype(np.float32)
            image = np.multiply(image, 1.0 / 255.0)
            images.append(image)
            label = np.zeros(len(classes))
            label[index] = 1.0
            labels.append(label)
            flbase = os.path.basename(fl)
            img_names.append(flbase)
            cls.append(fields)
    images = np.array(images)
    labels = np.array(labels)
    img_names = np.array(img_names)
    cls = np.array(cls)
    return images, labels, img_names, cls
class DataSet(object):
    def __init__(self, images, labels, img_names, cls):
        self._num_examples = images.shape[0]
        self._images = images
        self._labels = labels
        self._img_names = img_names
        self._cls = cls
        self._epochs_done = 0
        self._index_in_epoch = 0

    @property
    def images(self):
        return self._images

    @property
    def labels(self):
        return self._labels

    @property
    def img_names(self):
        return self._img_names

    @property
    def cls(self):
        return self._cls

    @property
    def num_examples(self):
        return self._num_examples

    @property
    def epochs_done(self):
        return self._epochs_done

    def next_batch(self, batch_size):
        """Return the next `batch_size` examples from this data set."""
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._num_examples:
            # After each epoch we update this
            self._epochs_done += 1
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._num_examples
        end = self._index_in_epoch
        return self._images[start:end], self._labels[start:end], self._img_names[start:end], self._cls[start:end]
def read_train_sets(train_path, image_size, classes, validation_size):
    class DataSets(object):
        pass
    data_sets = DataSets()

    images, labels, img_names, cls = load_train(train_path, image_size, classes)
    images, labels, img_names, cls = shuffle(images, labels, img_names, cls)

    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    validation_img_names = img_names[:validation_size]
    validation_cls = cls[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_img_names = img_names[validation_size:]
    train_cls = cls[validation_size:]

    data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
    data_sets.valid = DataSet(validation_images, validation_labels, validation_img_names, validation_cls)

    return data_sets
You can dynamically assign attributes to your objects in Python. Try inserting hasattr(data_sets, 'train'), which asks whether data_sets has the attribute train, after you assign it, and see what you get. You can also call type(data_sets.train) and convince yourself that it is indeed of type DataSet.

data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)

This is quite clear, since we are assigning an instance of the DataSet class to data_sets.train. With respect to the data_sets object, train and valid are simply two attributes of it. Hope this helps.
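A minimal, self-contained sketch of the same idea (toy names, not from the code above):

class Empty:  # plays the role of the empty DataSets class
    pass

obj = Empty()
obj.train = [1, 2, 3]         # attach an attribute at runtime
print(hasattr(obj, 'train'))  # True
print(type(obj.train))        # <class 'list'>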