Dataloader of different sequence of images in a single folder - python

I'm doing frame generation. For each image in the Dataset/train/ folder (e.g. 1.png) I generated a sequence of 100 images and saved all of them into a single Dataset/frames/train/ folder (as 1_1.png ... 1_100.png). Here is an example of my folder structure:
I have created a custom dataloader that stacks the generated frames as channels to form a sequence. My problem is that I don't want frames from image 2 to overlap with frames from image 1 when a sequence is created. How can I ensure that frames from different images don't overlap?
Here is my custom dataloader:
class LevelSetDataset(Dataset):
Dataset object for CNN models
Temporal is defined implicitly
as the number of channels
- X dimension
[H, W, C=number_of_timestap(t)]
- Y dimension
[W, W, C =(t+1)]
def __init__(self, input_image_path:str,
self.input_image_path = input_image_path
self.target_image_path = target_image_path
self.threshold = threshold
self.num_input_steps = num_input_steps
self.num_future_steps = num_future_steps
self.image_dimension = image_dimension
self.data_transformations= data_transformations
self.istraining_mode = istraining_mode
# get a list of input filenames and sort them (e.g. 1.png, 2.png,..,N.png)
input_image_fp = sorted(glob(os.path.join(self.input_image_path , "*")),
key=lambda x: int(os.path.basename(x).split('.')[0])
# repeat the input image until it matches the number of segmentation
# steps of the target image
self.input_image_fp = [i for i in input_image_fp for _ in range(100)]
# get a list of the target filenames and sort them by the first id and second
# id after the underscore (e.g. 1_1.png, 1_2,..,N_M.png)
self.target_image_fp= sorted(glob(os.path.join(self.target_image_path , "*")),
key=lambda x: (int(os.path.basename(x).split('_')[0]),
# check if in training mode
# to apply transformations
if (self.data_transformations is None) and (self.istraining_mode):
self.data_transformations= torchvision.transforms.Compose([
if (self.data_transformations is None) and (not self.istraining_mode):
self.data_transformations== torchvision.transforms.Compose([
self.transforms = self.data_transformations
def _create_binary_mask(self, x):
    """Binarise ``x`` in place against ``self.threshold`` and return it.

    Elements ``>= self.threshold`` become 1 and elements ``< self.threshold``
    become 0. Both comparisons are evaluated on the original values before
    anything is written. Writing the ones first and comparing afterwards
    zeroed every element whenever the threshold was greater than 1 (for
    example 128 on 0-255 data), because the freshly written ones were then
    below the threshold.

    Elements for which neither comparison holds (NaN) are left untouched.

    Args:
        x: array-like supporting comparison and boolean-mask assignment
            (e.g. a torch tensor or a numpy array); modified in place.

    Returns:
        The same object ``x``.
    """
    # Snapshot both masks first so the writes cannot influence each other.
    at_or_above = x >= self.threshold
    below = x < self.threshold
    x[at_or_above] = 1
    x[below] = 0
    return x
def _stat_norm(self, x):
norm =torchvision.transforms.Compose([torchvision.transforms.Resize(
return norm(x)
def __len__(self):
    """Return the number of valid start indices in the target frame list.

    ``__getitem__`` reads ``target_image_fp`` up to
    ``index + num_input_steps + num_future_steps - 1``, so the trailing
    ``num_input_steps + num_future_steps`` positions are not counted.

    The result is clamped at zero: with fewer frames than one window the raw
    difference is negative, and ``len()`` raises ``ValueError`` for a
    negative ``__len__``.
    """
    window = self.num_input_steps + self.num_future_steps
    return max(0, len(self.target_image_fp) - window)
def __getitem__(self, index):
X = torch.zeros((self.image_dimension, self.image_dimension, self.num_input_steps+1))
for step_idx, step in enumerate(np.arange(index, self.num_input_steps, 1)):
target_image =[step+self.num_input_steps+self.num_future_steps-1])
target_image = self.transforms(target_image)
target_image = self._create_binary_mask(target_image)
X[:, :, step_idx] = target_image # (t+1)
input_img =[index]).convert('L')
# input_img = self.transforms(input_img)
input_img = self.transforms(input_img)
X[:, :, 0] = input_img
target_image =[index+self.num_input_steps+self.num_future_steps-1])
target_image = self.transforms(target_image)
target_image = self._create_binary_mask(target_image)
image_name = self.target_image_fp[index+self.num_input_steps+self.num_future_steps-1].split('/')[-1]
Y = target_image
return X, Y, image_name


Using pytorch for data training visualize(image = image, mask = mask.squeeze()) is giving the image (.jpeg) output, but not mask (.tiff) output

I am doing training using Lits (Liver) Dataset using Pytorch. Images are .jpeg and masks are .tiff images.
After the preprocessing steps (normalization, shape manipulation, etc.), the .tiff masks are not visible; they show up as black images.
visualize(image = image, mask = mask.squeeze()) is giving the image output, but not the mask output.
class Dataset(BaseDataset):
images_dir (str): path to images folder
masks_dir (str): path to segmentation masks folder
class_values (list): values of classes to extract from segmentation mask
augmentation (albumentations.Compose): data transformation pipeline
(e.g. flip, scale, etc.)
preprocessing (albumentations.Compose): data preprocessing
(e.g. normalization, shape manipulation, etc.)
CLASSES = ['background', 'liver', 'tumor']
def __init__(self, image_dir, mask_dir, classes=None, augmentation=None, preprocessing=None):
    """Collect image/mask filenames and store the dataset configuration.

    Args:
        image_dir: directory containing the input images.
        mask_dir: directory containing the masks; each mask filename is the
            image filename with ".jpg" replaced by ".tiff".
        classes: class names to look up in ``self.CLASSES``
            (case-insensitive). ``None`` selects every entry of
            ``self.CLASSES``; previously ``None`` raised ``TypeError``
            because it was iterated directly.
        augmentation: optional callable invoked as
            ``augmentation(image=..., mask=...)`` in ``__getitem__``.
        preprocessing: optional callable invoked the same way after
            augmentation.

    Raises:
        ValueError: if a name in ``classes`` is not in ``self.CLASSES``
            (raised by ``list.index``).
    """
    # NOTE(review): os.listdir returns entries in arbitrary order, so which
    # files fall inside the first 3000 is filesystem-dependent.
    self.images = os.listdir(image_dir)[0:3000]
    # Filenames that do not contain ".jpg" are passed through unchanged.
    self.masks = [name.replace(".jpg", ".tiff") for name in self.images]

    if classes is None:
        classes = self.CLASSES
    self.class_values = [self.CLASSES.index(cls.lower()) for cls in classes]

    self.augmentation = augmentation
    self.preprocessing = preprocessing
    self.image_dir = image_dir
    self.mask_dir = mask_dir
def __getitem__(self, i):
    """Load sample ``i`` and return ``(image, mask)``.

    The image is read with OpenCV and converted from BGR to RGB. The mask is
    read as a single-channel image and given a trailing channel axis. The
    optional ``augmentation`` and ``preprocessing`` callables are then
    applied, in that order, to the pair.

    Raises:
        FileNotFoundError: if OpenCV cannot read the image or the mask.
            ``cv2.imread`` signals failure by returning ``None``, which
            previously surfaced as an unrelated error further down.
    """
    image_path = os.path.join(self.image_dir, self.images[i])
    mask_path = os.path.join(self.mask_dir, self.masks[i])

    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    mask = cv2.imread(mask_path, 0)  # flag 0: load as single-channel grayscale
    if mask is None:
        raise FileNotFoundError(f"could not read mask: {mask_path}")
    mask = np.expand_dims(mask, axis=2)

    # NOTE(review): self.class_values is not applied here; the mask is
    # returned with the raw values stored in the file. Presumably these are
    # small class indices (0/1/2 for CLASSES), which look black when shown on
    # a 0-255 scale, so rescale them for display. Confirm against the data.

    if self.augmentation:
        sample = self.augmentation(image=image, mask=mask)
        image, mask = sample['image'], sample['mask']

    if self.preprocessing:
        sample = self.preprocessing(image=image, mask=mask)
        image, mask = sample['image'], sample['mask']

    return image, mask
def __len__(self):
# Number of samples: one per filename kept in self.images.
return len(self.images)
dataset = Dataset(image_dir = train_frame_path,
mask_dir = train_mask_path,
classes = ['background', 'liver', 'tumor'])
image, mask = dataset[1210]
visualize(image = image, mask = mask.squeeze())

tensorflow: incompatible shapes related to batch size

I have an issue training my tensorflow model which is seemingly related to batch size. If I set the batch size to 1 it executes fine.
If I set the batch size to 6 and provide 13 records I receive this error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [34,2] vs. [32,2]
If I set the batch size to 32 and provide 64 records I receive this error:
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [34,2] vs. [32,2]
I did this last check to see if they had to be multiples of batch size, but it appears not.
The shape of my model input is (960, 960, 3), and the output shape is (2).
Here is the code for my data generator:
class DataGenerator(tf.keras.utils.Sequence):
'Generates data for Keras'
def __init__(self,
target_size=(128, 128), # width. height
self.target_size = target_size
self.batch_size = batch_size = directory
client = MongoClient(CONNECTION_STRING)
# Create the database for our example (we will use the same database throughout the tutorial
db = client[DB_NAME]
col = db[collection_name]
captures = col.find()
if limit is not None:
captures = captures.limit(limit)
self.img_paths = []
self.img_paths_wo_ext = []
df = pd.DataFrame()
self.count = 0
for capture in captures:
img_path = os.path.join(directory, capture['ImageName'])
if os.path.exists(img_path):
df = df.append({'ImageName': img_path, 'X': capture['X'], 'Y': capture['Y']}, ignore_index=True)
print(f"{img_path} for capture {capture['_id']} does not exist")
self.count +=1
df.set_index('ImageName', inplace=True)
self.targets = df
self.shuffle = shuffle
def __len__(self):
    """Return how many batches make up one epoch.

    This is the image count divided by the batch size, rounded up, so a
    final partial batch is counted as a batch.
    """
    total_images = len(self.img_paths)
    batches = np.ceil(total_images / self.batch_size)
    return int(batches)
def __getitem__(self, index):
    """Build and return batch number ``index`` as ``(X, y)``.

    The batch covers positions ``index * batch_size`` up to, but not
    including, ``(index + 1) * batch_size`` of ``self.indexes``, cut off at
    the end of ``self.indexes`` so the last batch may be shorter.
    """
    first = index * self.batch_size
    last = min((index + 1) * self.batch_size, len(self.indexes))
    batch_positions = self.indexes[first:last]

    # Translate positions into the path lists handed to the batch builder.
    paths = [self.img_paths[pos] for pos in batch_positions]
    paths_wo_ext = [self.img_paths_wo_ext[pos] for pos in batch_positions]

    features, labels = self.__data_generation(paths, paths_wo_ext)
    return features, labels
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.img_paths))
if self.shuffle == True:
def __data_generation(self, list_paths, list_paths_wo_ext):
# Builds one batch: an image array x and a target array y for the given paths.
# list_paths_wo_ext is accepted but not used in the lines visible here.
'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
# Initialization
# NOTE(review): x always gets batch_size rows, but the loop below fills only
# len(list_paths) of them. For a shorter final batch the remaining rows stay
# uninitialised and x ends up with more rows than y. Sizing the first
# dimension with min(self.batch_size, len(list_paths)) avoids that.
x = np.empty((self.batch_size, self.target_size[1], self.target_size[0], 3))
# print(list_paths)
# print(self.targets)
# NOTE(review): presumably .loc returns one row per path only while the
# ImageName index is unique; duplicate image paths would yield extra rows in
# y. Verify against the pandas version in use.
y = self.targets.loc[list_paths].values
# Generate data
for i, ID in enumerate(list_paths):
size = None
resize_cache_path = f'{ID}.resized.{self.target_size[0]}x{self.target_size[1]}.png'
resized = None # type: Image
# Store sample
img = # type: Image
img.load() # required for png.split()
except BaseException as ex:
raise Exception(f'Error loading PNG \'{ID}\': {str(ex)}')
if size is not None:
raise Exception(f'Image already loaded for ID: {ID}, paths: {list_paths}, size: {size}')
size = img.size
if os.path.isfile(resize_cache_path):
resized =
resized = img.resize(self.target_size)
x[i, ] = resized
# Rescale the target from original-image pixel coordinates to target_size.
y[i][0] = (y[i][0] / size[0]) * self.target_size[0]
y[i][1] = (y[i][1] / size[1]) * self.target_size[1]
return x, y
What am I doing wrong?
Turns out there were two issues.
First was the initialisation of the numpy array, which needed to be capped at the remaining length of the input for the last batch:
x = np.empty((min(self.batch_size, len(list_paths)), self.target_size[1], self.target_size[0], 3))
Secondly, my input did have duplicates which have since been removed.

Training stuck at Epoch 3 PyTorch

I am training a custom Encoder-Decoder network, but the training gets stuck at Epoch 3. Nothing happens for about 2 hours. I will share the Dataset class and the DataLoader object. The versions of CUDA and the GPU can be seen in the picture below.
Training stuck here:
nvidia-smi output looks like this:
The __getitem__ method of the dataset class looks like this:
def __init__(self,
img_size=(512, 1536),
:param root: dataset directory
:param filenames: filenames inside the root directory
:param labels: Object Detection Labels
self.images_dir = images_dir
self.annots_dir = annots_dir
self.train = train
self.image_size = img_size
self.stride = stride
self.transforms = transforms
self.model = model
# Load the image and annotation files from the dataset
# self.image_files, self.annot_files = self._load_image_and_annot_files()
self.image_files = [os.path.join(self.images_dir, idx) for idx in os.listdir(self.images_dir)]
self.annot_files = [os.path.join(self.annots_dir, idx) for idx in os.listdir(self.annots_dir)]
def __getitem__(self, index):
:param index: index...0 to N
:return: tensor_image and tensor_label
# Image filename from _load_image_files()
# Load Image with _read_matrix() and label
curr_image_filename = self.image_files[index]
curr_annot_filename = self.annot_files[index]
# curr_image_filename = self.image_files[index]
# curr_annot_filename = self.annot_files[index]
np_image = self._read_matrix(raw_img=curr_image_filename)
np_image_normalized = np.squeeze(self._normalize_raw_img(np_image))
# label = self.labels[index]
boxes, classes, depths, tgts = self._load_annotations(curr_annot_filename)
# Normalize bounding boxes: range [0, 1]
targets_normalized = self._normalize_bbox(np_image_normalized, tgts)
# image and the corresponding label should be a tensor
torch_image = torch.from_numpy(np_image).reshape(1, 512, 1536).float() # dtype: torch.float64
torch_boxes = torch.from_numpy(boxes).type(torch.FloatTensor)
torch_depths = torch.from_numpy(depths)
if self.model == 'fasterrcnn':
# For FasterRCNN: As COCO format
area = (torch_boxes[:, 3] - torch_boxes[:, 1]) * (torch_boxes[:, 2] - torch_boxes[:, 0])
iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)
image_id = torch.Tensor([index])
torch_classes = torch.from_numpy(classes)
target = {'boxes': torch_boxes, 'labels': torch_classes.long(),
'area': area, 'iscrowd': iscrowd, 'image_id': image_id}
return torch_image, target
elif self.model == 'custom':
if self.train:
if self.transforms:
tr = self.transforms()
transform_image, transform_boxes, labels = tr.__call__(np_image, tgts, tgts[:, :4], tgts[:, 4:])
transform_targets = np.hstack((np.array(transform_boxes), labels))
gt_tensor = gt_creator(img_size=self.image_size,
return torch.from_numpy(transform_image).float(), gt_tensor
except IndexError:
gt_tensor = gt_creator(img_size=self.image_size,
return torch_image, gt_tensor
return torch_image, targets_normalized
And in the script the DataLoader object is:
train_loader =,
Why does the training get stuck? Is there an issue with the __getitem__ method? Or the DataLoader?
Thank You.
This happens because torch doesn't restart your dataset: if your data runs out, it stops and waits for more input, so cycling has to be done manually.
I used something along the lines of
from itertools import cycle
class Dataloader():
#init and whatever
return cycle(get_sample()) # get_sample is your current getitem

Optimize pytorch data loader for reading small patches in full HD images

I'm training my neural network using PyTorch framework. The data is full HD images (1920x1080). But in each iteration, I just need to crop out a random 256x256 patch from these images. My network is relatively small (5 conv layers), and hence the bottleneck is being caused by loading the data. I've provided my current code below. Is there any way to optimize loading the data and speed up the training?
from pathlib import Path
import numpy
import as data
import Imath
import OpenEXR
class Ours(data.Dataset):
Loads patches of resolution 256x256. Patches are selected such that they contain atleast 1 unknown pixel
def __init__(self, data_dirpath, split_name, patch_size):
    """Index every (video, view) pair found under ``data_dirpath/split_name``.

    For each entry of the split directory, taken in sorted order, 44 samples
    are registered with view numbers ``i * 12 + j`` for ``i`` in 0..3 and
    ``j`` in 0..10. Each sample is stored as ``(entry stem, view number)``.

    Args:
        data_dirpath: dataset root directory.
        split_name: sub-directory of the root holding this split.
        patch_size: edge length used when cropping patches.
    """
    super(Ours, self).__init__()
    self.dataroot = Path(data_dirpath) / split_name
    view_numbers = [row * 12 + col for row in range(4) for col in range(11)]
    self.video_names = [
        (video_dir.stem, view_num)
        for video_dir in sorted(self.dataroot.iterdir())
        for view_num in view_numbers
    ]
    self.patch_size = patch_size
def __getitem__(self, index):
# Returns a dict with two consecutive RGB frames, a mask and a depth map, all
# cropped at the same randomly chosen patch start point.
video_name, view_num = self.video_names[index]
# NOTE(review): the start point is drawn from the full 0..1079 / 0..1919
# range, so a start closer than patch_size to the bottom/right border makes
# the slices in the get_* helpers come out smaller than patch_size.
patch_start_pt = (numpy.random.randint(1080), numpy.random.randint(1920))
# File names are 1-based: view_num + 1 is the current frame, view_num + 2 the next one.
frame1_path = self.dataroot / video_name / f'render/rgb/{view_num + 1:04}.png'
frame2_path = self.dataroot / video_name / f'render/rgb/{view_num + 2:04}.png'
depth_path = self.dataroot / video_name / f'render/depth/{view_num + 1:04}.exr'
mask_path = self.dataroot / video_name / f'render/masks/{view_num + 1:04}.png'
frame1 = self.get_image(frame1_path, patch_start_pt)
frame2 = self.get_image(frame2_path, patch_start_pt)
mask = self.get_mask(mask_path, patch_start_pt)
depth = self.get_depth(depth_path, patch_start_pt, mask)
data_dict = {
'frame1': frame1,
'frame2': frame2,
'mask': mask,
'depth': depth,
return data_dict
def __len__(self):
# Number of samples: one per (video name, view number) pair in self.video_names.
return len(self.video_names)
def get_mask(path: Path, patch_start_point: tuple):
h, w = patch_start_point
mask =[h:h + self.patch_size, w:w + self.patch_size][None]
return mask
def get_image(self, path: Path, patch_start_point: tuple):
h, w = patch_start_point
image =
image = image[h:h + self.patch_size, w:w + self.patch_size, :3]
image = image.astype(numpy.float32) / 255 * 2 - 1
image_cf = numpy.moveaxis(image, [0, 1, 2], [1, 2, 0])
return image_cf
def get_depth(self, path: Path, patch_start_point: tuple, mask: numpy.ndarray):
h, w = patch_start_point
exrfile = OpenEXR.InputFile(path.as_posix())
raw_bytes ='B', Imath.PixelType(Imath.PixelType.FLOAT))
depth_vector = numpy.frombuffer(raw_bytes, dtype=numpy.float32)
height = exrfile.header()['displayWindow'].max.y + 1 - exrfile.header()['displayWindow'].min.y
width = exrfile.header()['displayWindow'].max.x + 1 - exrfile.header()['displayWindow'].min.x
depth = numpy.reshape(depth_vector, (height, width))
depth = depth[h:h + self.patch_size, w:w + self.patch_size]
depth = depth[None]
depth = depth.astype(numpy.float32)
depth = depth * mask
return depth
Finally, I'm creating a DataLoader as follows:
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
What I've tried so far:
I've searched if it is possible to read a part of the image. Unfortunately, I didn't get any leads. Looks like python libraries read the full image.
I'm planning to read more patches from a single image so that I will need to read fewer images. But in PyTorch framework, the get_item() function has to return a single sample, not a batch. So, in each get_item() I can read only a patch.
I'm planning to circumvent this as follows: Read 4 patches in get_item() and return patches of shape (4,3,256,256) instead of (3,256,256). Later when I read a batch using dataloader, I'll get a batch of shape (BS,4,3,256,256) instead of (BS,3,256,256). I can then concatenate the data along dim=1 to convert (BS,4,3,256,256) to (BS*4,3,256,256). Thus I can reduce batch_size (BS) by 4 and hopefully this will speed up data loading by 4 times.
Are there any other options? I'm open to all kind of suggestions. Thanks!

How to use torchvision.transforms for data augmentation of segmentation task in Pytorch?

I am a little bit confused about the data augmentation performed in PyTorch.
Because we are dealing with segmentation tasks, the same data augmentation must be applied to the data and its mask, but some augmentations, such as random rotation, are random.
Keras provides a random seed guarantee that data and mask do the same operation, as shown in the following code:
data_gen_args = dict(featurewise_center=True,
image_datagen = ImageDataGenerator(**data_gen_args)
mask_datagen = ImageDataGenerator(**data_gen_args)
seed = 1
image_generator = image_datagen.flow(train_data, seed=seed, batch_size=1)
mask_generator = mask_datagen.flow(train_label, seed=seed, batch_size=1)
train_generator = zip(image_generator, mask_generator)
I didn't find a similar description in the official Pytorch documentation, so I don't know how to ensure that data and mask can be processed synchronously.
Pytorch does provide such a function, but I want to apply it to a custom Dataloader.
For example:
def __getitem__(self, index):
# Builds an (img, mask) pair of shape (im_ht, im_wd, channel_size) from
# channel_size consecutive slices of one patient's .npy volumes.
img = np.zeros((self.im_ht, self.im_wd, channel_size))
mask = np.zeros((self.im_ht, self.im_wd, channel_size))
# File names are the patient index zero-padded to four characters.
temp_img = np.load(Image_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
temp_label = np.load(Label_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
# self.count[index] is the first slice of this sample within the volume.
for i in range(channel_size):
img[:,:,i] = temp_img[self.count[index] + i]
mask[:,:,i] = temp_label[self.count[index] + i]
if self.transforms:
img = np.uint8(img)
mask = np.uint8(mask)
# NOTE(review): the transform is invoked twice, once per array, so any random
# transform draws its parameters independently for img and for mask.
img = self.transforms(img)
mask = self.transforms(mask)
return img, mask
In this case, img and mask will be transformed separately, because some operations such as random rotation are random, so the correspondence between mask and image may be changed. In other words, the image may have rotated but the mask did not do this.
I used the method from the linked example, but I got an error:
Traceback (most recent call last):
File "", line 87, in <module>
for batch_idx, image, mask in enumerate(train_loader):
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/", line 314, in __next__
batch = self.collate_fn([self.dataset[i] for i in indices])
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/", line 314, in <listcomp>
batch = self.collate_fn([self.dataset[i] for i in indices])
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/", line 103, in __getitem__
return self.dataset[self.indices[idx]]
File "/home/dirk/home/data/dirk/segmentation_unet_pytorch/", line 164, in __getitem__
img, mask = self.transforms(img, mask)
File "/home/dirk/home/data/dirk/segmentation_unet_pytorch/", line 17, in __call__
img, mask = a(img, mask)
TypeError: __call__() takes 2 positional arguments but 3 were given
This is my code for __getitem__():
data_transforms = {
'train': Compose([
train_set = DatasetUnetForTestTransform(fold=args.fold, random_index=args.random_index,transforms=data_transforms['train'])
# __getitem__ in class DatasetUnetForTestTransform
def __getitem__(self, index):
    """Return an ``(img, mask)`` pair built from consecutive volume slices.

    ``channel_size`` slices, starting at ``self.count[index]``, are copied
    into arrays of shape ``(im_ht, im_wd, channel_size)``. When
    ``self.transforms`` is set, both arrays are converted to PIL images,
    passed together through the joint transform, and converted to tensors.
    Without transforms the raw numpy arrays are returned.
    """
    img = np.zeros((self.im_ht, self.im_wd, channel_size))
    mask = np.zeros((self.im_ht, self.im_wd, channel_size))

    # NOTE(review): both volumes are loaded from Label_path. Presumably this
    # is deliberate for this test dataset (image == label); confirm.
    volume_name = '{:0>4}'.format(self.patient_index[index]) + '.npy'
    temp_img = np.load(Label_path + volume_name)
    temp_label = np.load(Label_path + volume_name)

    temp_img, temp_label = crop_data_label_from_0(temp_img, temp_label)

    for i in range(channel_size):
        img[:, :, i] = temp_img[self.count[index] + i]
        mask[:, :, i] = temp_label[self.count[index] + i]

    if self.transforms:
        img = T.ToPILImage()(np.uint8(img))
        mask = T.ToPILImage()(np.uint8(mask))

        # The joint transform takes and returns the pair, so random
        # parameters are shared between image and mask.
        img, mask = self.transforms(img, mask)

        # torch.Tensor has no .copy(); .clone() is the tensor equivalent.
        # NOTE(review): ToTensor rescales 8-bit input to [0, 1], so a mask
        # label of 1 becomes 1/255. Binarise before computing overlap metrics.
        img = T.ToTensor()(img).clone()
        mask = T.ToTensor()(mask).clone()

    return img, mask
I found that after ToTensor, the dice between the same labels becomes 255 instead of 1, how to fix it?
# Dice computation
def DSC_computation(label, pred):
    """Return the Dice similarity coefficient of two masks.

    Both inputs are binarised first: every non-zero element counts as
    foreground. The previous version summed the raw values while counting
    the intersection with ``logical_and``, so masks whose foreground value
    is not exactly 1 gave wrong scores. A foreground of 1/255 (a label of 1
    after ``ToTensor``) produced a Dice of 255 for identical masks, and a
    foreground of 255 produced 1/255.

    Args:
        label: ground-truth mask, anything ``np.asarray`` accepts.
        pred: predicted mask with the same shape as ``label``.

    Returns:
        ``2 * |pred AND label| / (|pred| + |label|)`` as a float in [0, 1].
        Two empty masks agree perfectly and score 1.0 (the previous version
        divided by zero in that case).
    """
    label_fg = np.asarray(label).astype(bool)
    pred_fg = np.asarray(pred).astype(bool)

    total = int(pred_fg.sum()) + int(label_fg.sum())
    if total == 0:
        return 1.0

    inter_sum = int(np.logical_and(pred_fg, label_fg).sum())
    return 2 * float(inter_sum) / total
Feel free to ask if more code is needed to explain the problem.
Transforms that require input parameters, like RandomCrop, have a get_params method that returns the parameters for that particular transformation. These can then be applied to both the image and the mask using the functional interface of transforms:
from torchvision import transforms
import torchvision.transforms.functional as F
i, j, h, w = transforms.RandomCrop.get_params(input, (100, 100))
input = F.crop(input, i, j, h, w)
target = F.crop(target, i, j, h, w)
Sample available here:
Complete example available here for VOC & COCO:
Regarding the error,
ToTensor() was not overridden to handle additional mask argument, so it cannot be in data_transforms. Moreover, __getitem__ does ToTensor of both img and mask before returning them.
data_transforms = {
'train': Compose([
#transforms.ToTensor() => remove this line
torchvision also provides similar functions [document].
Here is a simple example,
import torchvision
from torchvision import transforms
trans = transforms.Compose([transforms.CenterCrop((178, 178)),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
dset = torchvision.datasets.MNIST(data_root, transforms=trans)
A brief example of customizing your own CelebA dataset. Note that, to apply transformations, you need to call the transform list in __getitem__.
class CelebADataset(Dataset):
def __init__(self, root, transforms=None, num=None):
super(CelebADataset, self).__init__()
self.img_root = os.path.join(root, 'img_align_celeba')
self.attr_root = os.path.join(root, 'Anno/list_attr_celeba.txt')
self.transforms = transforms
df = pd.read_csv(self.attr_root, sep='\s+', header=1, index_col=0)
if num is None:
self.labels = df.values
self.img_name = df.index.values
self.labels = df.values[:num]
self.img_name = df.index.values[:num]
def __getitem__(self, index):
img =, self.img_name[index]))
# only use blond_hair, eyeglass, male, smile
indices = [9, 15, 20, 31]
label = np.take(self.labels[index], indices)
label[label==-1] = 0
if self.transforms is not None:
img = self.transforms(img)
return np.asarray(img), label
def __len__(self):
return len(self.labels)
I probably missed something at first glance. The main point of your problem is how to apply "the same" data preprocessing to img and labels. To my understanding, there is no built-in PyTorch function for this, so what I did before was to implement the augmentation myself.
class RandomRotate(object):
def __init__(self, degree): = degree
def __call__(self, img, mask):
rotate_degree = random.random() * 2 * -
return img.rotate(rotate_degree, Image.BILINEAR),
mask.rotate(rotate_degree, Image.NEAREST)
Note that the input should be PIL format. See this for more information.
Another idea is to stack your image and mask along the channel dimensions and then transform them together. Obviously this only works for geometric-type transforms and you need to use the same dtype for both. I use something like this:
# Apply these to image and mask
affine_transforms = transforms.Compose([
# Apply these to image only
image_transforms = transforms.Compose([
# Loader...
def __getitem__(self, index: int):
# Get the image and mask, here shape=(HxW) for both
image = self.images[index]
mask = self.masks[index]
# Stack the image and mask together so they get the same geometric transformations
stacked =[image, mask], dim=0) # shape=(2xHxW)
stacked = self.affine_transforms(stacked)
# Split them back up again
image, mask = torch.chunk(stacked, chunks=2, dim=0)
# Image transforms are only applied to the image
image = self.image_transforms(image)
return image, mask

