Create iterator from a Data Frame in Python

Create iterator from a Data Frame in Python - python

I am working on an NLP project using Seq2Seq. I created a data frame from my dataset then created a batch iterator using data loader, see the following code:
# creates lists containing each pair
original_word_pairs = [[w for w in l.split('\t')] for l in lines[:num_examples]]
data = pd.DataFrame(original_word_pairs, columns=["src", "trg"])
# conver the data to tensors and pass to the Dataloader
# to create a batch iterator
class MyData(Dataset):
def __init__(self, X, y):
self.data = X
self.target = y
# TODO: convert this into torch code is possible
self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]
def __getitem__(self, index):
x = self.data[index]
y = self.target[index]
x_len = self.length[index]
return x,y,x_len
def __len__(self):
return len(self.data)
train_dataset = MyData(input_tensor_train, target_tensor_train)
val_dataset = MyData(input_tensor_val, target_tensor_val)
train_dataset = DataLoader(train_dataset, batch_size = BATCH_SIZE,
drop_last=True,
shuffle=True)
test_dataset= DataLoader(val_dataset, batch_size = BATCH_SIZE,
drop_last=True,
shuffle=True)
That is a part of my code, the thing is I want to use the iterators like this
for i, batch in enumerate(iterator):
src = batch.src
trg = batch.trg
But I got an error "AttributeError: 'list' object has no attribute 'src'"
How can I use the iterator and access a specific column?

You can redefine __getitem__ in your Dataset to return a dictionary:
def __getitem__(self, index):
x = self.data[index]
y = self.target[index]
x_len = self.length[index]
return {"src": x, "trg": y, "x_len": x_len}
The default collate_fn of DataLoader will take care to provide a dictionary containing batches instead of single observations, but you need to convert x_len to a tensor into __getitem__ to make it work (or you can pass a custom collate_fn).

Related

AttributeError: 'DataGenerator' object has no attribute 'shape' when I use data generators with Keras

When I use datagenerator below, learned from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class DataGenerator(keras.utils.Sequence):
def __init__(self, file_list,batch_size):
"""Constructor can be expanded,
with batch size, dimentation etc.
"""
self.file_list = file_list
self.batch_size = batch_size
self.on_epoch_end()
def __len__(self):
'Take all batches in each iteration'
return int(np.floor(len(self.file_list) / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# Find list of IDs
list_IDs_temp = [self.list_IDs[k] for k in indexes]
# Generate data
X, y = self.__data_generation(list_IDs_temp)
return X, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.file_list))
def __data_generation(self, file_list_temp):
'Generates data containing batch_size samples'
loc = os.path.abspath('.')
# Generate data
for ID in file_list_temp:
x_file_path = os.path.join(loc, "target",ID)
y_file_path = os.path.join(loc, "newlabel",ID)
# Store sample
X = np.load(x_file_path)
# Store class
y = np.load(y_file_path)
return X, y
I call it using:
batch_size=32
training_generator = DataGenerator(train,batch_size)
validation_generator = DataGenerator(val,batch_size)
bce = tf.keras.losses.BinaryCrossentropy(reduction='sum')
EPOCHS =10
model = tf.keras.models.load_model('./training/20220317.h5',compile=False)
model.compile(optimizer='adam',loss = weightedLoss(100),metrics=['accuracy'])
H = model.fit_generator(generator=training_generator,
validation_data=validation_generator,epochs=EPOCHS)
But when I run the script, it keeps saying :
AttributeError: 'DataGenerator' object has no attribute 'shape'
I tried from tensorflow.python.keras.utils.data_utils import Sequence to fix it, but it doesn't work. What should I do to deal with this error?

Problem solved.
I change from keras.utils import Sequenceto from tensorflow.keras.utils import Sequence
and I use model.fit(x=training_generator)
instead of model.fit_generator

tensorflow: incompatible shapes related to batch size

I have an issue training my tensorflow model which is seemingly related to batch size. If I set the batch size to 1 it executes fine.
If I set the batch size to 6 and provide 13 records I receive this error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [34,2] vs. [32,2]
If I set the batch size to 32 and provide 64 records I receive this error:
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [34,2] vs. [32,2]
I did this last check to see if they had to be multiples of batch size, but it appears not.
The shape of my model input is (960, 960, 3), and the output shape is (2).
Here is the code for my data generator:
class DataGenerator(tf.keras.utils.Sequence):
'Generates data for Keras'
def __init__(self,
directory,
collection_name,
batch_size=32,
target_size=(128, 128), # width. height
shuffle=False,
limit=None):
'Initialization'
self.target_size = target_size
self.batch_size = batch_size
self.directory = directory
client = MongoClient(CONNECTION_STRING)
# Create the database for our example (we will use the same database throughout the tutorial
db = client[DB_NAME]
col = db[collection_name]
captures = col.find()
if limit is not None:
captures = captures.limit(limit)
self.img_paths = []
self.img_paths_wo_ext = []
df = pd.DataFrame()
self.count = 0
for capture in captures:
img_path = os.path.join(directory, capture['ImageName'])
if os.path.exists(img_path):
df = df.append({'ImageName': img_path, 'X': capture['X'], 'Y': capture['Y']}, ignore_index=True)
self.img_paths.append(img_path)
self.img_paths_wo_ext.append(os.path.splitext(img_path)[0])
else:
print(f"{img_path} for capture {capture['_id']} does not exist")
self.count +=1
df.set_index('ImageName', inplace=True)
self.targets = df
self.shuffle = shuffle
self.on_epoch_end()
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.ceil(len(self.img_paths) / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
# print(f'index: {index}, batchsize: {self.batch_size}, range:{index*self.batch_size}:{min((index+1)*self.batch_size,len(self.indexes))}, length:{self.indexes}')
indexes = self.indexes[index*self.batch_size:min((index+1)*self.batch_size,len(self.indexes))]
# Find list of IDs
list_paths = [self.img_paths[k] for k in indexes]
list_paths_wo_ext = [self.img_paths_wo_ext[k] for k in indexes]
# Generate data
X, y = self.__data_generation(list_paths, list_paths_wo_ext)
return X, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.img_paths))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation(self, list_paths, list_paths_wo_ext):
'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
# Initialization
x = np.empty((self.batch_size, self.target_size[1], self.target_size[0], 3))
# print(list_paths)
# print(self.targets)
y = self.targets.loc[list_paths].values
# Generate data
for i, ID in enumerate(list_paths):
size = None
resize_cache_path = f'{ID}.resized.{self.target_size[0]}x{self.target_size[1]}.png'
resized = None # type: Image
# Store sample
img = Image.open(ID) # type: Image
try:
img.load() # required for png.split()
except BaseException as ex:
raise Exception(f'Error loading PNG \'{ID}\': {str(ex)}')
if size is not None:
raise Exception(f'Image already loaded for ID: {ID}, paths: {list_paths}, size: {size}')
size = img.size
if os.path.isfile(resize_cache_path):
resized = Image.open(resize_cache_path)
resized.load()
else:
resized = img.resize(self.target_size)
resized.save(resize_cache_path)
x[i, ] = resized
y[i][0] = (y[i][0] / size[0]) * self.target_size[0]
y[i][1] = (y[i][1] / size[1]) * self.target_size[1]
return x, y
What am I doing wrong?

Turns out there were two issues.
First was the initialisation of the numpy array, which needed to be capped at the remaining length of the input for the last batch:
x = np.empty((min(self.batch_size, len(list_paths)), self.target_size[1], self.target_size[0], 3))
Secondly, my input did have duplicates which have since been removed.

Training stuck at Epoch 3 PyTorch

I am training a custom Encoder-Decoder network but the training gets stuck at Epoch 3. Nothing happens for about 2 hours. I will share the Dataset class and the DataLoader object. The version if CUDA and GPU can be seen in the pic below.
Training stuck here:
nvidia-smi output looks like this:
The __getitem__ method of the dataset class looks like this:
def __init__(self,
images_dir,
annots_dir,
train=True,
img_size=(512, 1536),
stride=4,
model='custom',
transforms=None):
"""
:param root: dataset directory
:param filenames: filenames inside the root directory
:param labels: Object Detection Labels
super(CustomDataset).__init__()
self.images_dir = images_dir
self.annots_dir = annots_dir
self.train = train
self.image_size = img_size
self.stride = stride
self.transforms = transforms
self.model = model
# Load the image and annotation files from the dataset
# self.image_files, self.annot_files = self._load_image_and_annot_files()
self.image_files = [os.path.join(self.images_dir, idx) for idx in os.listdir(self.images_dir)]
self.annot_files = [os.path.join(self.annots_dir, idx) for idx in os.listdir(self.annots_dir)]
def __getitem__(self, index):
"""
:param index: index...0 to N
:return: tensor_image and tensor_label
"""
# Image filename from _load_image_files()
# Load Image with _read_matrix() and label
curr_image_filename = self.image_files[index]
curr_annot_filename = self.annot_files[index]
# curr_image_filename = self.image_files[index]
# curr_annot_filename = self.annot_files[index]
np_image = self._read_matrix(raw_img=curr_image_filename)
np_image_normalized = np.squeeze(self._normalize_raw_img(np_image))
# label = self.labels[index]
boxes, classes, depths, tgts = self._load_annotations(curr_annot_filename)
# Normalize bounding boxes: range [0, 1]
targets_normalized = self._normalize_bbox(np_image_normalized, tgts)
# image and the corresponding label should be a tensor
torch_image = torch.from_numpy(np_image).reshape(1, 512, 1536).float() # dtype: torch.float64
torch_boxes = torch.from_numpy(boxes).type(torch.FloatTensor)
torch_depths = torch.from_numpy(depths)
if self.model == 'fasterrcnn':
# For FasterRCNN: As COCO format
area = (torch_boxes[:, 3] - torch_boxes[:, 1]) * (torch_boxes[:, 2] - torch_boxes[:, 0])
iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)
image_id = torch.Tensor([index])
torch_classes = torch.from_numpy(classes)
target = {'boxes': torch_boxes, 'labels': torch_classes.long(),
'area': area, 'iscrowd': iscrowd, 'image_id': image_id}
return torch_image, target
elif self.model == 'custom':
if self.train:
if self.transforms:
try:
tr = self.transforms()
transform_image, transform_boxes, labels = tr.__call__(np_image, tgts, tgts[:, :4], tgts[:, 4:])
transform_targets = np.hstack((np.array(transform_boxes), labels))
gt_tensor = gt_creator(img_size=self.image_size,
stride=self.stride,
num_classes=8,
label_lists=transform_targets)
return torch.from_numpy(transform_image).float(), gt_tensor
except IndexError:
pass
else:
gt_tensor = gt_creator(img_size=self.image_size,
stride=self.stride,
num_classes=8,
label_lists=targets_normalized)
return torch_image, gt_tensor
else:
return torch_image, targets_normalized
And in the train.py script the DataLoader object is:
train_loader = torch.utils.data.DataLoader(dataset=dataset,
shuffle=True,
batch_size=1,
num_workers=0,
collate_fn=detection_collate,
pin_memory=True)
Why does the training get stuck? Is there an issue with the __getitem__ method? Or the DataLoader?
Thank You.

This happens because torch doesnt restart your dataset, if your data runs out it stops and waits for more input so cycling has to be done manually.
I used something along the lines of
from itertools import cycle
class Dataloader():
#init and whatever
self.__iter__():
return cycle(get_sample()) # get_sample is your current getitem

How to create a custom data loader in Pytorch?

I have a file containing paths to images I would like to load into Pytorch, while utilizing the built-in dataloader features (multiprocess loading pipeline, data augmentations, and so on).
def create_links():
data_dir = "/myfolder"
full_path_list = []
assert os.path.isdir(data_dir)
for _, _, filenames in os.walk(data_dir):
for filename in filenames:
full_path_list.append(os.path.join(data_dir, filename))
with open(config.data.links_file, 'w+') as links_file:
for full_path in full_path_list:
links_file.write(f"{full_path}\n")
def read_links_file_to_list():
config = ConfigProvider.config()
links_file_path = config.data.links_file
if not os.path.isfile(links_file_path):
raise RuntimeError("did you forget to create a file with links to images? Try using 'create_links()'")
with open(links_file_path, 'r') as links_file:
return links_file.readlines()
So I have a list of files (or a generator, or whatever works), file_list = read_links_file_to_list().
How can I build a Pytorch dataloader around it, and how would I use it?

What you want is a Custom Dataset. The __getitem__ method is where you would apply transforms such as data-augmentation etc. To give you an idea of what it looks like in practice you can take a look at this Custom Dataset I wrote the other day:
class GTSR43Dataset(Dataset):
"""German Traffic Sign Recognition dataset."""
def __init__(self, root_dir, train_file, transform=None):
self.root_dir = root_dir
self.train_file_path = train_file
self.label_df = pd.read_csv(os.path.join(self.root_dir, self.train_file_path))
self.transform = transform
self.classes = list(self.label_df['ClassId'].unique())
def __getitem__(self, idx):
"""Return (image, target) after resize and preprocessing."""
img = os.path.join(self.root_dir, self.label_df.iloc[idx, 7])
X = Image.open(img)
y = self.class_to_index(self.label_df.iloc[idx, 6])
if self.transform:
X = self.transform(X)
return X, y
def class_to_index(self, class_name):
"""Returns the index of a given class."""
return self.classes.index(class_name)
def index_to_class(self, class_index):
"""Returns the class of a given index."""
return self.classes[class_index]
def get_class_count(self):
"""Return a list of label occurences"""
cls_count = dict(self.label_df.ClassId.value_counts())
# cls_percent = list(map(lambda x: (1 - x / sum(cls_count)), cls_count))
return cls_count
def __len__(self):
"""Returns the length of the dataset."""
return len(self.label_df)

python creating an object of a class

In the function read_train_sets() an empty class is created called DataSets. It has no methods or variables. An object called data_sets is then created.
My question is, is data_sets.train an object of the class DataSet().
Or are you creating a method called train() and setting it equal to an object of the DataSet() class.
Note that there are two classes called DataSet and DataSets in the code.
import cv2
import os
import glob
from sklearn.utils import shuffle
import numpy as np
def load_train(train_path, image_size, classes):
images = []
labels = []
img_names = []
cls = []
print('Going to read training images')
for fields in classes:
index = classes.index(fields)
print('Now going to read {} files (Index: {})'.format(fields, index))
path = os.path.join(train_path, fields, '*g')
files = glob.glob(path)
for fl in files:
image = cv2.imread(fl)
image = cv2.resize(image, (image_size, image_size),0,0, cv2.INTER_LINEAR)
image = image.astype(np.float32)
image = np.multiply(image, 1.0 / 255.0)
images.append(image)
label = np.zeros(len(classes))
label[index] = 1.0
labels.append(label)
flbase = os.path.basename(fl)
img_names.append(flbase)
cls.append(fields)
images = np.array(images)
labels = np.array(labels)
img_names = np.array(img_names)
cls = np.array(cls)
return images, labels, img_names, cls
class DataSet(object):
def __init__(self, images, labels, img_names, cls):
self._num_examples = images.shape[0]
self._images = images
self._labels = labels
self._img_names = img_names
self._cls = cls
self._epochs_done = 0
self._index_in_epoch = 0
#property
def images(self):
return self._images
#property
def labels(self):
return self._labels
#property
def img_names(self):
return self._img_names
#property
def cls(self):
return self._cls
#property
def num_examples(self):
return self._num_examples
#property
def epochs_done(self):
return self._epochs_done
def next_batch(self, batch_size):
"""Return the next `batch_size` examples from this data set."""
start = self._index_in_epoch
self._index_in_epoch += batch_size
if self._index_in_epoch > self._num_examples:
# After each epoch we update this
self._epochs_done += 1
start = 0
self._index_in_epoch = batch_size
assert batch_size <= self._num_examples
end = self._index_in_epoch
return self._images[start:end], self._labels[start:end], self._img_names[start:end], self._cls[start:end]
def read_train_sets(train_path, image_size, classes, validation_size):
class DataSets(object):
pass
data_sets = DataSets()
images, labels, img_names, cls = load_train(train_path, image_size, classes)
images, labels, img_names, cls = shuffle(images, labels, img_names, cls)
if isinstance(validation_size, float):
validation_size = int(validation_size * images.shape[0])
validation_images = images[:validation_size]
validation_labels = labels[:validation_size]
validation_img_names = img_names[:validation_size]
validation_cls = cls[:validation_size]
train_images = images[validation_size:]
train_labels = labels[validation_size:]
train_img_names = img_names[validation_size:]
train_cls = cls[validation_size:]
data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
data_sets.valid = DataSet(validation_images, validation_labels, validation_img_names, validation_cls)
return data_sets

You can dynamically assign attributes to your objects in Python. Try inserting hasattr(data_sets, 'train') which asks if data_sets has attribute train after you assign it and see what you get. Also you can call type(data_sets.train) and convince yourself that it is indeed of type DataSet.

data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
This is quite clear since we are assigning a Class object to the data_sets.train
With respect to data_sets object, train and validate will be 2 attributes to it. Hope this helps.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Create iterator from a Data Frame in Python - python

Related

AttributeError: 'DataGenerator' object has no attribute 'shape' when I use data generators with Keras

tensorflow: incompatible shapes related to batch size

Training stuck at Epoch 3 PyTorch

How to create a custom data loader in Pytorch?

python creating an object of a class

Categories

Resources