In the function read_train_sets() an empty class is created called DataSets. It has no methods or variables. An object called data_sets is then created.
My question is, is data_sets.train an object of the class DataSet().
Or are you creating a method called train() and setting it equal to an object of the DataSet() class.
Note that there are two classes called DataSet and DataSets in the code.
import cv2
import os
import glob
from sklearn.utils import shuffle
import numpy as np
def load_train(train_path, image_size, classes):
images = []
labels = []
img_names = []
cls = []
print('Going to read training images')
for fields in classes:
index = classes.index(fields)
print('Now going to read {} files (Index: {})'.format(fields, index))
path = os.path.join(train_path, fields, '*g')
files = glob.glob(path)
for fl in files:
image = cv2.imread(fl)
image = cv2.resize(image, (image_size, image_size),0,0, cv2.INTER_LINEAR)
image = image.astype(np.float32)
image = np.multiply(image, 1.0 / 255.0)
images.append(image)
label = np.zeros(len(classes))
label[index] = 1.0
labels.append(label)
flbase = os.path.basename(fl)
img_names.append(flbase)
cls.append(fields)
images = np.array(images)
labels = np.array(labels)
img_names = np.array(img_names)
cls = np.array(cls)
return images, labels, img_names, cls
class DataSet(object):
def __init__(self, images, labels, img_names, cls):
self._num_examples = images.shape[0]
self._images = images
self._labels = labels
self._img_names = img_names
self._cls = cls
self._epochs_done = 0
self._index_in_epoch = 0
#property
def images(self):
return self._images
#property
def labels(self):
return self._labels
#property
def img_names(self):
return self._img_names
#property
def cls(self):
return self._cls
#property
def num_examples(self):
return self._num_examples
#property
def epochs_done(self):
return self._epochs_done
def next_batch(self, batch_size):
"""Return the next `batch_size` examples from this data set."""
start = self._index_in_epoch
self._index_in_epoch += batch_size
if self._index_in_epoch > self._num_examples:
# After each epoch we update this
self._epochs_done += 1
start = 0
self._index_in_epoch = batch_size
assert batch_size <= self._num_examples
end = self._index_in_epoch
return self._images[start:end], self._labels[start:end], self._img_names[start:end], self._cls[start:end]
def read_train_sets(train_path, image_size, classes, validation_size):
class DataSets(object):
pass
data_sets = DataSets()
images, labels, img_names, cls = load_train(train_path, image_size, classes)
images, labels, img_names, cls = shuffle(images, labels, img_names, cls)
if isinstance(validation_size, float):
validation_size = int(validation_size * images.shape[0])
validation_images = images[:validation_size]
validation_labels = labels[:validation_size]
validation_img_names = img_names[:validation_size]
validation_cls = cls[:validation_size]
train_images = images[validation_size:]
train_labels = labels[validation_size:]
train_img_names = img_names[validation_size:]
train_cls = cls[validation_size:]
data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
data_sets.valid = DataSet(validation_images, validation_labels, validation_img_names, validation_cls)
return data_sets
You can dynamically assign attributes to your objects in Python. Try inserting hasattr(data_sets, 'train') which asks if data_sets has attribute train after you assign it and see what you get. Also you can call type(data_sets.train) and convince yourself that it is indeed of type DataSet.
data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
This is quite clear since we are assigning a Class object to the data_sets.train
With respect to data_sets object, train and validate will be 2 attributes to it. Hope this helps.
Related
Good afternoon!
I have questions about the following tutorial:
https://pytorch.org/tutorials/beginner/data_loading_tutorial.html 1
I have a similar dataset (images + landmarks). I’ve built the custom dataloader following the tutorial and checked the types of dataloader components (torch.float64 for both images and landmarks).
Then I applied the dataloader to the classification model with this training class:
class Trainer():
def __init__(self,criterion = None,optimizer = None,schedular = None):
self.criterion = criterion
self.optimizer = optimizer
self.schedular = schedular
def train_batch_loop(self,model,train_dataloader):
train_loss = 0.0
train_acc = 0.0
for images,landmarks, labels in train_dataloader:
images = images.to(device)
landmarks = landmarks.to(device)
labels = labels.to(device)
self.optimizer.zero_grad()
I won’t be elaborating further because the training crushes at images = images.to(device) with the following error: AttributeError: ‘str’ object has no attribute 'to’
I don’t understand where this string is coming from if all the dataloader components are torch.float64.
I went back to check the initial data: in the tutorial, the landmarks are summarized in a pandas dataframe with landmark values as int64 and image name as “object”.
In my summary dataframe image name is an “object” as well and landmarks are numpy.float64. Again, no strings anywhere…
Appreciate any advice - what else should I check in addition to dtypes?
There are 30 cats and 48 landmarks for each image
The dataset is defined as follows:
class FaceLandmarksDataset(Dataset):
def __init__(self, data_frame, root_dir, transform=None):
self.data_frame = data_frame
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.data_frame)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
img_name = os.path.join(self.root_dir,
self.data_frame.iloc[idx, 2])
image = io.imread(img_name)
landmarks = self.data_frame.iloc[idx, 3:]
landmarks = np.array([landmarks])
landmarks = landmarks.astype('float').reshape(-1, 2)
labels = self.data_frame.iloc[idx, 1].reshape(1)
sample = {'image': image, 'landmarks': landmarks, 'labels': labels}
if self.transform:
sample = self.transform(sample)
return sample
Hi i made some changes based on your dataset, please make adjustment where you see fit, as i don't have your data i cannot test this out but this should be it based on on my understanding
import torch
from torch.utils.data import Dataset
import os
from skimage import io
import numpy as np
from typing import Dict
class FaceLandmarksDataset(Dataset):
def __init__(self, data_frame, root_dir, transform=None):
self.data_frame = data_frame
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.data_frame)
def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
if torch.is_tensor(idx):
idx = idx.tolist()
img_name = os.path.join(self.root_dir,self.data_frame.iloc[idx]['image_name'])
image = io.imread(img_name)
landmarks = self.data_frame.iloc[idx, 3:].tolist()
landmarks = np.array(landmarks)
landmarks = landmarks.astype('float32').reshape(-1, 2)
labels = self.data_frame.iloc[idx]['label']
# Your transforming your image only therefore pass in the array
if self.transform:
image = self.transform(image)
# Create dictionary after finishing all transforms -> automatically becomes torch tensor when passed
sample = {
'image': image,
'landmarks': landmarks,
'labels': labels
}
return sample
I am working on an NLP project using Seq2Seq. I created a data frame from my dataset then created a batch iterator using data loader, see the following code:
# creates lists containing each pair
original_word_pairs = [[w for w in l.split('\t')] for l in lines[:num_examples]]
data = pd.DataFrame(original_word_pairs, columns=["src", "trg"])
# conver the data to tensors and pass to the Dataloader
# to create a batch iterator
class MyData(Dataset):
def __init__(self, X, y):
self.data = X
self.target = y
# TODO: convert this into torch code is possible
self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]
def __getitem__(self, index):
x = self.data[index]
y = self.target[index]
x_len = self.length[index]
return x,y,x_len
def __len__(self):
return len(self.data)
train_dataset = MyData(input_tensor_train, target_tensor_train)
val_dataset = MyData(input_tensor_val, target_tensor_val)
train_dataset = DataLoader(train_dataset, batch_size = BATCH_SIZE,
drop_last=True,
shuffle=True)
test_dataset= DataLoader(val_dataset, batch_size = BATCH_SIZE,
drop_last=True,
shuffle=True)
That is a part of my code, the thing is I want to use the iterators like this
for i, batch in enumerate(iterator):
src = batch.src
trg = batch.trg
But I got an error "AttributeError: 'list' object has no attribute 'src'"
How can I use the iterator and access a specific column?
You can redefine __getitem__ in your Dataset to return a dictionary:
def __getitem__(self, index):
x = self.data[index]
y = self.target[index]
x_len = self.length[index]
return {"src": x, "trg": y, "x_len": x_len}
The default collate_fn of DataLoader will take care to provide a dictionary containing batches instead of single observations, but you need to convert x_len to a tensor into __getitem__ to make it work (or you can pass a custom collate_fn).
When I use datagenerator below, learned from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class DataGenerator(keras.utils.Sequence):
def __init__(self, file_list,batch_size):
"""Constructor can be expanded,
with batch size, dimentation etc.
"""
self.file_list = file_list
self.batch_size = batch_size
self.on_epoch_end()
def __len__(self):
'Take all batches in each iteration'
return int(np.floor(len(self.file_list) / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# Find list of IDs
list_IDs_temp = [self.list_IDs[k] for k in indexes]
# Generate data
X, y = self.__data_generation(list_IDs_temp)
return X, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.file_list))
def __data_generation(self, file_list_temp):
'Generates data containing batch_size samples'
loc = os.path.abspath('.')
# Generate data
for ID in file_list_temp:
x_file_path = os.path.join(loc, "target",ID)
y_file_path = os.path.join(loc, "newlabel",ID)
# Store sample
X = np.load(x_file_path)
# Store class
y = np.load(y_file_path)
return X, y
I call it using:
batch_size=32
training_generator = DataGenerator(train,batch_size)
validation_generator = DataGenerator(val,batch_size)
bce = tf.keras.losses.BinaryCrossentropy(reduction='sum')
EPOCHS =10
model = tf.keras.models.load_model('./training/20220317.h5',compile=False)
model.compile(optimizer='adam',loss = weightedLoss(100),metrics=['accuracy'])
H = model.fit_generator(generator=training_generator,
validation_data=validation_generator,epochs=EPOCHS)
But when I run the script, it keeps saying :
AttributeError: 'DataGenerator' object has no attribute 'shape'
I tried from tensorflow.python.keras.utils.data_utils import Sequence to fix it, but it doesn't work. What should I do to deal with this error?
Problem solved.
I change from keras.utils import Sequenceto from tensorflow.keras.utils import Sequence
and I use model.fit(x=training_generator)
instead of model.fit_generator
I am training a custom Encoder-Decoder network but the training gets stuck at Epoch 3. Nothing happens for about 2 hours. I will share the Dataset class and the DataLoader object. The version if CUDA and GPU can be seen in the pic below.
Training stuck here:
nvidia-smi output looks like this:
The __getitem__ method of the dataset class looks like this:
def __init__(self,
images_dir,
annots_dir,
train=True,
img_size=(512, 1536),
stride=4,
model='custom',
transforms=None):
"""
:param root: dataset directory
:param filenames: filenames inside the root directory
:param labels: Object Detection Labels
super(CustomDataset).__init__()
self.images_dir = images_dir
self.annots_dir = annots_dir
self.train = train
self.image_size = img_size
self.stride = stride
self.transforms = transforms
self.model = model
# Load the image and annotation files from the dataset
# self.image_files, self.annot_files = self._load_image_and_annot_files()
self.image_files = [os.path.join(self.images_dir, idx) for idx in os.listdir(self.images_dir)]
self.annot_files = [os.path.join(self.annots_dir, idx) for idx in os.listdir(self.annots_dir)]
def __getitem__(self, index):
"""
:param index: index...0 to N
:return: tensor_image and tensor_label
"""
# Image filename from _load_image_files()
# Load Image with _read_matrix() and label
curr_image_filename = self.image_files[index]
curr_annot_filename = self.annot_files[index]
# curr_image_filename = self.image_files[index]
# curr_annot_filename = self.annot_files[index]
np_image = self._read_matrix(raw_img=curr_image_filename)
np_image_normalized = np.squeeze(self._normalize_raw_img(np_image))
# label = self.labels[index]
boxes, classes, depths, tgts = self._load_annotations(curr_annot_filename)
# Normalize bounding boxes: range [0, 1]
targets_normalized = self._normalize_bbox(np_image_normalized, tgts)
# image and the corresponding label should be a tensor
torch_image = torch.from_numpy(np_image).reshape(1, 512, 1536).float() # dtype: torch.float64
torch_boxes = torch.from_numpy(boxes).type(torch.FloatTensor)
torch_depths = torch.from_numpy(depths)
if self.model == 'fasterrcnn':
# For FasterRCNN: As COCO format
area = (torch_boxes[:, 3] - torch_boxes[:, 1]) * (torch_boxes[:, 2] - torch_boxes[:, 0])
iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)
image_id = torch.Tensor([index])
torch_classes = torch.from_numpy(classes)
target = {'boxes': torch_boxes, 'labels': torch_classes.long(),
'area': area, 'iscrowd': iscrowd, 'image_id': image_id}
return torch_image, target
elif self.model == 'custom':
if self.train:
if self.transforms:
try:
tr = self.transforms()
transform_image, transform_boxes, labels = tr.__call__(np_image, tgts, tgts[:, :4], tgts[:, 4:])
transform_targets = np.hstack((np.array(transform_boxes), labels))
gt_tensor = gt_creator(img_size=self.image_size,
stride=self.stride,
num_classes=8,
label_lists=transform_targets)
return torch.from_numpy(transform_image).float(), gt_tensor
except IndexError:
pass
else:
gt_tensor = gt_creator(img_size=self.image_size,
stride=self.stride,
num_classes=8,
label_lists=targets_normalized)
return torch_image, gt_tensor
else:
return torch_image, targets_normalized
And in the train.py script the DataLoader object is:
train_loader = torch.utils.data.DataLoader(dataset=dataset,
shuffle=True,
batch_size=1,
num_workers=0,
collate_fn=detection_collate,
pin_memory=True)
Why does the training get stuck? Is there an issue with the __getitem__ method? Or the DataLoader?
Thank You.
This happens because torch doesnt restart your dataset, if your data runs out it stops and waits for more input so cycling has to be done manually.
I used something along the lines of
from itertools import cycle
class Dataloader():
#init and whatever
self.__iter__():
return cycle(get_sample()) # get_sample is your current getitem
I am working an image retrieval project, for making model more fair i want to construct batches that return:
5 images per class, and
75 images and per batch
I have total 300 classes in my dataset, so it obvious that only 15 classes of images can be contained in each batch.data is balanced this mean there is equal number of images for per class,I am using pytorch.
I have create pytorch dataset and I want to add above functionality in my ImageFolderLoader class whose code I added below.
IMG_EXTENSIONS = [
'.jpg', '.JPG', '.jpeg', '.JPEG',
'.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
]
def is_image_file(filename):
return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
def find_classes(dir):
classes = os.listdir(dir)
classes.sort()
class_to_idx = {classes[i]: i for i in range(len(classes))}
classes = [clss.split('.')[1] for clss in classes]
return classes, class_to_idx
def make_dataset(dir, class_to_idx):
images = []
for target in os.listdir(dir):
d = os.path.join(dir, target)
if not os.path.isdir(d):
continue
for filename in os.listdir(d):
if is_image_file(filename):
path = '{0}/{1}'.format(target, filename)
item = (path, class_to_idx[target])
images.append(item)
return images
def default_loader(path):
return Image.open(path).convert('RGB')
class ImageFolderLoader(Dataset):
def __init__(self, root, transform=None, loader=default_loader,):
classes, class_to_idx = find_classes(root)
imgs = make_dataset(root, class_to_idx)
self.root = root
self.imgs = imgs
self.classes = classes
self.class_to_idx = class_to_idx
self.transform = transform
self.loader = loader
def __getitem__(self, index):
path, target = self.imgs[index]
img = self.loader(os.path.join(self.root, path))
if self.transform is not None:
img = self.transform(img)
return img, target
def __len__(self):
return len(self.imgs)
if there is way to do this then please let me know>.
edit:- Anyone want to see solution for this, i added the solution below after solving this problem.
I solved the problem by including batch_sampler in DataLoader module. for this i used pytorch-balanced-sampler git project, which allows awesome customization for batch_sampler, you should visit this repo.
My custom dataset:
IMG_EXTENSIONS = [
'.jpg', '.JPG', '.jpeg', '.JPEG',
'.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
]
def is_image_file(filename):
return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
def find_classes(dir):
classes = os.listdir(dir)
classes.sort()
class_to_idx = {classes[i]: i for i in range(len(classes))}
classes = [clss.split('.')[1] for clss in classes]
return classes, class_to_idx
def make_dataset(dir, class_to_idx):
images = []
for target in os.listdir(dir):
d = os.path.join(dir, target)
if not os.path.isdir(d):
continue
for filename in os.listdir(d):
if is_image_file(filename):
path = '{0}/{1}'.format(target, filename)
item = (path, class_to_idx[target])
images.append(item)
data_dict = {}
for item in images:
cls = item[1]
if cls not in data_dict.keys():
data_dict[cls] = [item]
else:
data_dict[cls].append(item)
return images,data_dict
def default_loader(path):
return Image.open(path).convert('RGB')
class ImageFolderLoader(Dataset):
def __init__(self, root, transform=None, loader=default_loader):
classes, class_to_idx = find_classes(root)
imgs,instance_labels = make_dataset(root, class_to_idx)
self.instance_labels = instance_labels
self.root = root
self.imgs = imgs
self.classes = classes
self.class_to_idx = class_to_idx
self.transform = transform
self.loader = loader
def __getitem__(self, index):
path, target = self.imgs[index]
img = self.loader(os.path.join(self.root, path))
if self.transform is not None:
img = self.transform(img)
return img, target
def __len__(self):
return len(self.imgs)
Then i used SamplerFactory class from pytorch-balances-sampler project, you need to visit this repository for understand the parameters,
train_data = ImageFolderLoader(root=TRAIN_PATH, transform=transform)
batch_sampler = SamplerFactory().get(
class_idxs=my_list,
batch_size=75,
n_batches=146,
alpha=1,
kind="fixed"
)
There are a few open-ended questions in how this is implemented. For instance, do you want each class to be equally represented regardless of that class's actual frequency? Note that this may give better performance on minority classes at the expense of performance on majority classes.
Also, do you want each example to be used at most once per epoch, or at least once per epoch?
In any case, this will likely be difficult to accomplish with the standard getitem method because it returns an example with no regard for the other examples returned in the same batch. You'll likely need to define a custom dataloader object to ensure good data distribution and usage properties, which is a bit unfortunate because pytorch's dataloader and dataset objects work together quite nicely and efficiently for most simple use cases. Perhaps someone else has a solution that uses these objects.
Here's a solution that uses random sampling with replacement after each batch, so there's no guarantee that every example will be used. Also, it uses looping so you could probably do better with parallelization.
class ImageFolderLoader(Dataset):
def __init__(self, root, transform=None, loader=default_loader,):
classes, class_to_idx = find_classes(root)
imgs = make_dataset(root, class_to_idx)
#currently, imgs items are of the form (path,class)
data_dict = {}
for item in imgs:
cls = item[1]
if cls not in data_dict.keys():
data_dict[cls] = [item]
else:
data_dict[cls].append(item)
# each class is the key for a list of all items belonging to that class
self.data_dict = data_dict
self.root = root
self.imgs = imgs
self.classes = classes
self.class_to_idx = class_to_idx
self.transform = transform
self.loader = loader
def get_batch(self):
img_batch = []
label_batch = []
classes = random.sample((0,300),15)
for cls in classes:
class_data = self.data_dict[cls]
selection = random.sample((0,len(class_data),5)
for idx in selection:
img = self.loader(os.path.join(self.root, class_data[idx][0]))
if self.transform is not None:
img = self.transform(img)
img_batch.append(img)
label_batch.append(cls)
img_batch = torch.stack(img_batch)
label_batch = torch.stack(label_batch)
return img_batch, label_batch
def __len__(self):
return len(self.imgs)