I am training on the LiTS (Liver) dataset using PyTorch. The images are .jpeg files and the masks are .tiff files.
After the preprocessing steps (normalization, shape manipulation, etc.), the .tiff masks are not visible; they show up as black images.
visualize(image = image, mask = mask.squeeze()) displays the image correctly, but not the mask.
class Dataset(BaseDataset):
'''
Args:
image_dir (str): path to images folder
mask_dir (str): path to segmentation masks folder
classes (list): names of the classes to extract from the segmentation mask
augmentation (albumentations.Compose): data transformation pipeline
(e.g. flip, scale, etc.)
preprocessing (albumentations.Compose): data preprocessing
(e.g. normalization, shape manipulation, etc.)
'''
CLASSES = ['background', 'liver', 'tumor']
def __init__(self, image_dir, mask_dir, classes = None, augmentation= None, preprocessing=None):
self.images = os.listdir(image_dir)[0:3000]
#self.masks = list(map(lambda x: x.replace(".jpg", "_mask.png"), self.images)) #only for 512x512
#self.masks = list(map(lambda x: x.replace(".jpg", ".png"), self.images))
self.masks = list(map(lambda x: x.replace(".jpg", ".tiff"), self.images))
self.class_values = [self.CLASSES.index(cls.lower()) for cls in classes]
self.augmentation = augmentation
self.preprocessing = preprocessing
self.image_dir = image_dir
self.mask_dir = mask_dir
def __getitem__(self, i):
# read data
image = cv2.imread(self.image_dir + '/' + self.images[i])
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
mask = cv2.imread(self.mask_dir + '/' + self.masks[i], 0)
mask = np.expand_dims(mask, axis = 2)
# masks = [(mask == v) for v in self.class_values]
# mask = np.stack(masks, axis=-1).astype('float')
# print(mask.shape)
# # extract certain classes from mask (e.g. cars)
# masks = [(mask == v) for v in self.class_values]
# mask = np.stack(masks, axis=-1).astype('float')
if self.augmentation:
sample = self.augmentation(image = image, mask= mask)
image, mask = sample['image'], sample['mask']
if self.preprocessing:
sample = self.preprocessing(image = image, mask= mask)
image, mask = sample['image'], sample['mask']
return image, mask
def __len__(self):
return len(self.images)
dataset = Dataset(image_dir = train_frame_path,
mask_dir = train_mask_path,
classes = ['background', 'liver', 'tumor'])
image, mask = dataset[1210]
visualize(image = image, mask = mask.squeeze())
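A quick way to check whether the mask is really empty or only displayed on the wrong scale (a minimal sketch, assuming the LiTS masks store the class ids 0, 1 and 2 directly; an 8-bit viewer renders those values as almost black):
import numpy as np
import matplotlib.pyplot as plt

image, mask = dataset[1210]
print(np.unique(mask))  # expect [0 1 2] for background / liver / tumor

# Pin the display range so the labels 0-2 cover the full grayscale range
plt.imshow(mask.squeeze(), cmap='gray', vmin=0, vmax=2)
plt.show()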
I'm training my neural network using the PyTorch framework. The data consists of full-HD images (1920x1080), but in each iteration I only need to crop out a random 256x256 patch from these images. My network is relatively small (5 conv layers), so the bottleneck is loading the data. I've provided my current code below. Is there any way to optimize data loading and speed up training?
Code:
from pathlib import Path
import numpy
import skimage.io
import torch.utils.data as data
import Imath
import OpenEXR
class Ours(data.Dataset):
"""
Loads patches of resolution 256x256. Patches are selected such that they contain at least 1 unknown pixel
"""
def __init__(self, data_dirpath, split_name, patch_size):
super(Ours, self).__init__()
self.dataroot = Path(data_dirpath) / split_name
self.video_names = []
for video_path in sorted(self.dataroot.iterdir()):
for i in range(4):
for j in range(11):
view_num = i * 12 + j
self.video_names.append((video_path.stem, view_num))
self.patch_size = patch_size
return
def __getitem__(self, index):
video_name, view_num = self.video_names[index]
patch_start_pt = (numpy.random.randint(1080), numpy.random.randint(1920))
frame1_path = self.dataroot / video_name / f'render/rgb/{view_num + 1:04}.png'
frame2_path = self.dataroot / video_name / f'render/rgb/{view_num + 2:04}.png'
depth_path = self.dataroot / video_name / f'render/depth/{view_num + 1:04}.exr'
mask_path = self.dataroot / video_name / f'render/masks/{view_num + 1:04}.png'
frame1 = self.get_image(frame1_path, patch_start_pt)
frame2 = self.get_image(frame2_path, patch_start_pt)
mask = self.get_mask(mask_path, patch_start_pt)
depth = self.get_depth(depth_path, patch_start_pt, mask)
data_dict = {
'frame1': frame1,
'frame2': frame2,
'mask': mask,
'depth': depth,
}
return data_dict
def __len__(self):
return len(self.video_names)
def get_mask(self, path: Path, patch_start_point: tuple):
h, w = patch_start_point
mask = skimage.io.imread(path.as_posix())[h:h + self.patch_size, w:w + self.patch_size][None]
return mask
def get_image(self, path: Path, patch_start_point: tuple):
h, w = patch_start_point
image = skimage.io.imread(path.as_posix())
image = image[h:h + self.patch_size, w:w + self.patch_size, :3]
image = image.astype(numpy.float32) / 255 * 2 - 1
image_cf = numpy.moveaxis(image, [0, 1, 2], [1, 2, 0])
return image_cf
def get_depth(self, path: Path, patch_start_point: tuple, mask: numpy.ndarray):
h, w = patch_start_point
exrfile = OpenEXR.InputFile(path.as_posix())
raw_bytes = exrfile.channel('B', Imath.PixelType(Imath.PixelType.FLOAT))
depth_vector = numpy.frombuffer(raw_bytes, dtype=numpy.float32)
height = exrfile.header()['displayWindow'].max.y + 1 - exrfile.header()['displayWindow'].min.y
width = exrfile.header()['displayWindow'].max.x + 1 - exrfile.header()['displayWindow'].min.x
depth = numpy.reshape(depth_vector, (height, width))
depth = depth[h:h + self.patch_size, w:w + self.patch_size]
depth = depth[None]
depth = depth.astype(numpy.float32)
depth = depth * mask
return depth
Finally, I'm creating a DataLoader as follows:
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
What I've tried so far:
I've searched for a way to read only a part of the image. Unfortunately, I didn't get any leads; the Python libraries I looked at appear to decode the full image.
I'm planning to read more patches from a single image so that I need to read fewer images. But in PyTorch, the __getitem__() method has to return a single sample, not a batch, so in each __getitem__() call I can read only one patch.
I'm planning to circumvent this as follows: read 4 patches in __getitem__() and return them with shape (4,3,256,256) instead of (3,256,256). Later, when I read a batch through the DataLoader, I'll get a batch of shape (BS,4,3,256,256) instead of (BS,3,256,256). I can then fold the patch dimension into the batch dimension to reshape (BS,4,3,256,256) into (BS*4,3,256,256). Thus I can reduce the batch size (BS) by a factor of 4, and hopefully this will speed up data loading by about 4 times.
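A minimal sketch of that reshaping step, assuming __getitem__ stacks the four random crops of one decoded frame into a tensor of shape (4, 3, 256, 256); the helper name fold_patches is only illustrative:
import torch

def fold_patches(batch_tensor: torch.Tensor) -> torch.Tensor:
    # (BS, 4, C, 256, 256) -> (BS*4, C, 256, 256)
    bs, k, c, h, w = batch_tensor.shape
    return batch_tensor.reshape(bs * k, c, h, w)

for batch in train_data_loader:
    frame1 = fold_patches(batch['frame1'])  # (BS*4, 3, 256, 256)
    frame2 = fold_patches(batch['frame2'])
    mask = fold_patches(batch['mask'])
    depth = fold_patches(batch['depth'])
    # ... forward/backward pass as usual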
Are there any other options? I'm open to all kinds of suggestions. Thanks!
I am trying to perform edge detection on my images of soil grains using the holistically-nested edge detection (HED) method, as shown below. However, when an image contains both fine and coarse soil grains, the region of fine particles does not come out clearly. My idea is therefore to cut the image into smaller rectangular areas in both directions, run HED on each portion, and accumulate the edged portions onto a black copy of the image.
I faced an error after repeating the HED algorithm in a for loop that divides the image width into 5 portions and the height into 4 portions, and I can't fix it.
Here is the algorithm I used:
# import the necessary packages
import argparse
import cv2
import os
import easygui
path = easygui.fileopenbox()
print(path)
hdir = os.path.dirname(path)
print(hdir)
hfilename = os.path.basename(path)
print(hfilename)
hname = os.path.splitext(hfilename)[0]
print(hname)
houtname = hname+"_out.jpg"
print(houtname)
hout = os.path.sep.join([hdir,houtname])
print(hout)
# # construct the argument parser and parse the arguments
# ap = argparse.ArgumentParser()
# ap.add_argument("-d", "--edge-detector", type=str, required=True,
# help="path to OpenCV's deep learning edge detector")
# ap.add_argument("-i", "--image", type=str, required=True,
# help="path to input image")
# args = vars(ap.parse_args())
class CropLayer(object):
def __init__(self, params, blobs):
# initialize our starting and ending (x, y)-coordinates of
# the crop
self.startX = 0
self.startY = 0
self.endX = 0
self.endY = 0
def getMemoryShapes(self, inputs):
# the crop layer will receive two inputs -- we need to crop
# the first input blob to match the shape of the second one,
# keeping the batch size and number of channels
(inputShape, targetShape) = (inputs[0], inputs[1])
(batchSize, numChannels) = (inputShape[0], inputShape[1])
(H, W) = (targetShape[2], targetShape[3])
# compute the starting and ending crop coordinates
self.startX = int((inputShape[3] - targetShape[3]) / 2)
self.startY = int((inputShape[2] - targetShape[2]) / 2)
self.endX = self.startX + W
self.endY = self.startY + H
# return the shape of the volume (we'll perform the actual
# crop during the forward pass)
return [[batchSize, numChannels, H, W]]
def forward(self, inputs):
# use the derived (x, y)-coordinates to perform the crop
return [inputs[0][:, :, self.startY:self.endY,
self.startX:self.endX]]
# load our serialized edge detector from disk
print("[INFO] loading edge detector...")
fpath = os.path.abspath(__file__)
fdir = os.path.dirname(fpath)
print(fdir)
protoPath = os.path.sep.join([fdir,"hed_model", "deploy.prototxt"])
print(protoPath)
modelPath = os.path.sep.join([fdir,"hed_model","hed_pretrained_bsds.caffemodel"])
print(modelPath)
net = cv2.dnn.readNetFromCaffe(protoPath, modelPath)
# register our new layer with the model
cv2.dnn_registerLayer("Crop", CropLayer)
# load the input image and grab its dimensions
image = cv2.imread(r'D:\My work\MASTERS WORK\GSD files\Sample E photos\SampleE_#1_26pxfor1mm.jpg')
im_copy = image.copy()*0
(H, W) = image.shape[:2]
# print(image.shape[:2])
# image.shape[:2] =(H*3, W*3)
# image = cv2.resize(image,0.5)
h=0
w=0
for m in range(0,H ,int(H/5)):
for n in range(0,W,int(W/3)):
gray = image[h:m,w:n]
# convert the image to grayscale, blur it, and perform Canny
# edge detection
print("[INFO] performing Canny edge detection...")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
canny = cv2.Canny(blurred, 30, 150)
# construct a blob out of the input image for the Holistically-Nested
# Edge Detector
# cc = cv2.cvtColor(canny, cv2.COLOR_GRAY2BGR)
# image = image+cc
# mean = (104.00698793, 116.66876762, 122.67891434),
blob = cv2.dnn.blobFromImage(image, scalefactor=1.0, size=((m-h), (n-w)),
# mean=(230, 120, 50),
mean=(104.00698793, 116.66876762, 122.67891434),
swapRB=False, crop=False)
print( blob)
cv2.waitKey(0)
# set the blob as the input to the network and perform a forward pass
# to compute the edges
print("[INFO] performing holistically-nested edge detection...")
net.setInput(blob)
hed = net.forward()
hed = cv2.resize(hed[0, 0], ((m-h), (n-w)))
hed = (255 * hed).astype("uint8")
# Adding the edge detection for each portion to the copy image as follows
im_copy = im_copy + hed
h+=int(H/5)
w+=int(W/4)
# show the output edge detection results for Canny and
# Holistically-Nested Edge Detection
cv2.imshow("Input", image)
cv2.imshow("Canny", canny)
cv2.imshow("HED", hed)
cv2.waitKey(0)
cv2.imshow('Frame ',im_copy)
cv2.imwrite(hout, im_copy)
cv2.waitKey(0)
I then use this edge image in further analysis of the image.
The error I got when running the algorithm:
net = cv2.dnn.readNetFromCaffe(protoPath, modelPath)
cv2.error: OpenCV(4.1.1) C:\projects\opencv-python\opencv\modules\dnn\src\caffe\caffe_io.cpp:1121: error: (-2:Unspecified error) FAILED: fs.is_open(). Can't open "D:\My work\MASTERS WORK\hed_model\deploy.prototxt" in function 'cv::dnn::ReadProtoFromTextFile'
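The message only says that OpenCV could not open deploy.prototxt at the path it was given, so a small existence check before readNetFromCaffe (a sketch reusing the same variables as the script above) makes the failing path explicit, e.g. when the hed_model folder is not next to the script:
import os
import cv2

protoPath = os.path.sep.join([fdir, "hed_model", "deploy.prototxt"])
modelPath = os.path.sep.join([fdir, "hed_model", "hed_pretrained_bsds.caffemodel"])

# Fail early with a readable message instead of the generic caffe_io error
for p in (protoPath, modelPath):
    if not os.path.isfile(p):
        raise FileNotFoundError(f"HED model file not found: {p}")

net = cv2.dnn.readNetFromCaffe(protoPath, modelPath)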
I'm currently making an object detection app that can detect whether tires are damaged or not. For this I'm using Google's AutoML Edge, which exports a TFLite model. Now I want to use this model in my code, but apparently the coordinates it predicts are normalized, and I'm stuck on denormalizing them.
Have a look at my code here:
import tensorflow as tf
import numpy as np
import cv2
MODEL_PATH = 'Resources/model_v1_OD.tflite'
LABEL_PATH = 'Resources/model_v1_OD.txt'
class TFTireModel():
labels = []
interpreter = None
input_details = []
output_details = []
height = 0
width = 0
def __init__(self):
with open(LABEL_PATH, 'r') as f:
self.labels = [line.strip() for line in f.readlines()]
# Init TFlite interpreter
self.interpreter = tf.lite.Interpreter(model_path=MODEL_PATH)
self.interpreter.allocate_tensors()
# Get input and output tensors.
self.input_details = self.interpreter.get_input_details()
self.output_details = self.interpreter.get_output_details()
# Get input dimensions
self.height = self.input_details[0]['shape'][1]
self.width = self.input_details[0]['shape'][2]
def predict(self, img, threshold=0.3):
# Resize image to input dimensions
img = cv2.resize(img, (self.width, self.height))
img = np.expand_dims(img, axis=0)
img = (2.0 / 255.0) * img - 1.0
img = img.astype('uint8')
# Predict image
self.interpreter.set_tensor(self.input_details[0]['index'], img)
self.interpreter.invoke()
# get results
boxes = self.interpreter.get_tensor(
self.output_details[0]['index'])
print(f"boxes: {boxes}")
classes = self.interpreter.get_tensor(
self.output_details[1]['index'])
scores = self.interpreter.get_tensor(
self.output_details[2]['index'])
num = self.interpreter.get_tensor(
self.output_details[3]['index'])
# Get output
output =self._boxes_coordinates(boxes=np.squeeze(boxes[0]),
classes=np.squeeze(classes[0]+1).astype(np.int32),
scores=np.squeeze(scores[0]),
im_width=self.width,
im_height=self.height,
min_score_thresh=threshold)
print(f"output: {output}")
# Format output
return output
def _boxes_coordinates(self,
boxes,
classes,
scores,
im_width,
im_height,
max_boxes_to_draw=4,
min_score_thresh=0.4):
print(f"width: {im_width}, height {im_height}" )
if not max_boxes_to_draw:
max_boxes_to_draw = boxes.shape[0]
number_boxes = min(max_boxes_to_draw, boxes.shape[0])
tire_boxes = []
# person_labels = []
for i in range(number_boxes):
if scores is None or scores[i] > min_score_thresh:
box = tuple(boxes[i].tolist())
ymin, xmin, ymax, xmax = box
xmin, ymin, xmax, ymax = (int(xmin * im_width), int(xmax * im_width), int(ymin * im_height), int(ymax * im_height)) #TODO: DO A LOOP
#tire_boxes.append([(ymin, xmin, ymax, xmax), scores[i], self.labels[classes[i]]]) #More complete
tire_boxes.append((xmin, ymin, xmax, ymax))
return tire_boxes
Things go wrong at:
boxes = self.interpreter.get_tensor(
self.output_details[0]['index'])
print(f"boxes: {boxes}"
boxes: [[[ 0.00263482 0.50020593 0.3734043 0.83953816]
[ 0.12580797 0.14952084 0.65327024 0.61710536]
[ 0.13584864 0.38896233 0.6485662 0.85324436]
[ 0.31914377 0.3945622 0.87147605 0.8458656 ]
[ 0.01334581 0.03666234 0.46443292 0.55461186]
[ 0.1018104 -0.08279537 0.6541427 0.37984413]
The output here is normalized, and I don't know how to denormalize it. The desired output is in terms of the image width and height, as can be seen in the _boxes_coordinates function.
The outputs from TFLite's Object Detection models are in the format:
[top/Height, left/Width, bottom/Height, right/Width]
So if you know the dimensions of your image, you can compute the boundaries of each detected object.
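In other words, the boxes only need to be scaled by the original image size. A minimal sketch (the helper name denormalize_boxes is illustrative, not part of TFLite):
def denormalize_boxes(boxes, im_width, im_height):
    # boxes: iterable of [ymin, xmin, ymax, xmax] values in [0, 1]
    pixel_boxes = []
    for ymin, xmin, ymax, xmax in boxes:
        pixel_boxes.append((
            int(xmin * im_width),   # left
            int(ymin * im_height),  # top
            int(xmax * im_width),   # right
            int(ymax * im_height),  # bottom
        ))
    return pixel_boxes

# e.g. the first detection printed above, on a 640x480 input image:
# denormalize_boxes([[0.00263482, 0.50020593, 0.3734043, 0.83953816]], 640, 480)
# -> [(320, 1, 537, 179)]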
I am a little bit confused about the data augmentation performed in PyTorch.
Because we are dealing with a segmentation task, the data and the mask need to receive the same augmentation, but some operations, such as random rotation, are random.
Keras provides a random seed to guarantee that data and mask undergo the same operations, as shown in the following code:
data_gen_args = dict(featurewise_center=True,
featurewise_std_normalization=True,
rotation_range=25,
horizontal_flip=True,
vertical_flip=True)
image_datagen = ImageDataGenerator(**data_gen_args)
mask_datagen = ImageDataGenerator(**data_gen_args)
seed = 1
image_generator = image_datagen.flow(train_data, seed=seed, batch_size=1)
mask_generator = mask_datagen.flow(train_label, seed=seed, batch_size=1)
train_generator = zip(image_generator, mask_generator)
I didn't find a similar description in the official PyTorch documentation, so I don't know how to ensure that data and mask are processed synchronously.
PyTorch does provide such functions, but I want to apply them to a custom DataLoader.
For example:
def __getitem__(self, index):
img = np.zeros((self.im_ht, self.im_wd, channel_size))
mask = np.zeros((self.im_ht, self.im_wd, channel_size))
temp_img = np.load(Image_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
temp_label = np.load(Label_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
for i in range(channel_size):
img[:,:,i] = temp_img[self.count[index] + i]
mask[:,:,i] = temp_label[self.count[index] + i]
if self.transforms:
img = np.uint8(img)
mask = np.uint8(mask)
img = self.transforms(img)
mask = self.transforms(mask)
return img, mask
In this case, img and mask will be transformed separately, because some operations, such as random rotation, are random, so the correspondence between mask and image may be broken. In other words, the image may have been rotated while the mask was not.
EDIT 1
I used the method in augmentations.py, but I got an error:
Traceback (most recent call last):
File "test_transform.py", line 87, in <module>
for batch_idx, image, mask in enumerate(train_loader):
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 314, in __next__
batch = self.collate_fn([self.dataset[i] for i in indices])
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 314, in <listcomp>
batch = self.collate_fn([self.dataset[i] for i in indices])
File "/home/dirk/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataset.py", line 103, in __getitem__
return self.dataset[self.indices[idx]]
File "/home/dirk/home/data/dirk/segmentation_unet_pytorch/data.py", line 164, in __getitem__
img, mask = self.transforms(img, mask)
File "/home/dirk/home/data/dirk/segmentation_unet_pytorch/augmentations.py", line 17, in __call__
img, mask = a(img, mask)
TypeError: __call__() takes 2 positional arguments but 3 were given
This is my code for __getitem__():
data_transforms = {
'train': Compose([
RandomHorizontallyFlip(),
RandomRotate(degree=25),
transforms.ToTensor()
]),
}
train_set = DatasetUnetForTestTransform(fold=args.fold, random_index=args.random_index,transforms=data_transforms['train'])
# __getitem__ in class DatasetUnetForTestTransform
def __getitem__(self, index):
img = np.zeros((self.im_ht, self.im_wd, channel_size))
mask = np.zeros((self.im_ht, self.im_wd, channel_size))
temp_img = np.load(Image_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
temp_label = np.load(Label_path + '{:0>4}'.format(self.patient_index[index]) + '.npy')
temp_img, temp_label = crop_data_label_from_0(temp_img, temp_label)
for i in range(channel_size):
img[:,:,i] = temp_img[self.count[index] + i]
mask[:,:,i] = temp_label[self.count[index] + i]
if self.transforms:
img = T.ToPILImage()(np.uint8(img))
mask = T.ToPILImage()(np.uint8(mask))
img, mask = self.transforms(img, mask)
img = T.ToTensor()(img).copy()
mask = T.ToTensor()(mask).copy()
return img, mask
EDIT 2
I found that after ToTensor, the Dice score between identical labels becomes 255 instead of 1. How can I fix this?
# Dice computation
def DSC_computation(label, pred):
pred_sum = pred.sum()
label_sum = label.sum()
inter_sum = np.logical_and(pred, label).sum()
return 2 * float(inter_sum) / (pred_sum + label_sum)
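A possible explanation, assuming the masks are stored as uint8 with values 0/1: ToTensor rescales them to 0 and 1/255, so pred.sum() and label.sum() shrink by a factor of 255 while np.logical_and still counts whole pixels, which makes the ratio come out as 255. A sketch (not the original code) that binarizes inside the function avoids depending on the stored scale:
def DSC_computation(label, pred):
    # Compare boolean masks so the stored value scale (1, 1/255 or 255) does not matter
    pred_bin = pred > 0
    label_bin = label > 0
    inter_sum = np.logical_and(pred_bin, label_bin).sum()
    return 2 * float(inter_sum) / (pred_bin.sum() + label_bin.sum())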
Feel free to ask if more code is needed to explain the problem.
Transforms which require input parameters, like RandomCrop, have a get_params method that returns the parameters for that particular transformation. These can then be applied to both the image and the mask using the functional interface of transforms:
from torchvision import transforms
import torchvision.transforms.functional as F
i, j, h, w = transforms.RandomCrop.get_params(input, (100, 100))
input = F.crop(input, i, j, h, w)
target = F.crop(target, i, j, h, w)
Sample available here:
https://github.com/pytorch/vision/releases/tag/v0.2.0
Complete example available here for VOC & COCO:
https://github.com/pytorch/vision/blob/master/references/segmentation/transforms.py
https://github.com/pytorch/vision/blob/master/references/segmentation/train.py
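Extending that idea to the random flip and rotation from the question, a hedged sketch of a paired transform built on the functional interface (the class name PairedRandomRotateFlip is illustrative, not part of torchvision):
import random
from torchvision import transforms
import torchvision.transforms.functional as F

class PairedRandomRotateFlip:
    """Apply the same random horizontal flip and rotation to an image and its mask."""
    def __init__(self, degrees=25):
        self.degrees = (-degrees, degrees)

    def __call__(self, img, mask):
        if random.random() < 0.5:      # one shared coin flip for both
            img, mask = F.hflip(img), F.hflip(mask)
        angle = transforms.RandomRotation.get_params(self.degrees)
        img = F.rotate(img, angle)     # default nearest resampling also suits the mask
        mask = F.rotate(mask, angle)
        return img, mask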
Regarding the error:
ToTensor() was not overridden to handle the additional mask argument, so it cannot be in data_transforms. Moreover, __getitem__ already applies ToTensor to both img and mask before returning them.
data_transforms = {
'train': Compose([
RandomHorizontallyFlip(),
RandomRotate(degree=25),
#transforms.ToTensor() => remove this line
]),
}
torchvision also provides similar functions [document].
Here is a simple example,
import torchvision
from torchvision import transforms
trans = transforms.Compose([transforms.CenterCrop((178, 178)),
transforms.Resize(128),
transforms.RandomRotation(20),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
dset = torchvision.datasets.MNIST(data_root, transform=trans)
EDIT
A brief example of customizing your own CelebA dataset. Note that, to apply the transformations, you need to call the transform list in __getitem__.
class CelebADataset(Dataset):
def __init__(self, root, transforms=None, num=None):
super(CelebADataset, self).__init__()
self.img_root = os.path.join(root, 'img_align_celeba')
self.attr_root = os.path.join(root, 'Anno/list_attr_celeba.txt')
self.transforms = transforms
df = pd.read_csv(self.attr_root, sep=r'\s+', header=1, index_col=0)
#print(df.columns.tolist())
if num is None:
self.labels = df.values
self.img_name = df.index.values
else:
self.labels = df.values[:num]
self.img_name = df.index.values[:num]
def __getitem__(self, index):
img = Image.open(os.path.join(self.img_root, self.img_name[index]))
# only use blond_hair, eyeglass, male, smile
indices = [9, 15, 20, 31]
label = np.take(self.labels[index], indices)
label[label==-1] = 0
if self.transforms is not None:
img = self.transforms(img)
return np.asarray(img), label
def __len__(self):
return len(self.labels)
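For completeness, constructing it might look like the following (the root path is a placeholder and the transform list mirrors the one above):
celeba_trans = transforms.Compose([transforms.CenterCrop((178, 178)),
                                   transforms.Resize(128),
                                   transforms.ToTensor()])
dset = CelebADataset(root='path/to/celeba', transforms=celeba_trans)
img, label = dset[0]  # the transforms are applied inside __getitem__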
EDIT 2
I probably missed something at first glance. The main point of your problem is how to apply "the same" data preprocessing to img and labels. To my understanding, there is no PyTorch built-in function for this. So, what I did before is to implement the augmentation myself.
import random
from PIL import Image

class RandomRotate(object):
def __init__(self, degree):
self.degree = degree
def __call__(self, img, mask):
rotate_degree = random.random() * 2 * self.degree - self.degree
return (img.rotate(rotate_degree, Image.BILINEAR),
mask.rotate(rotate_degree, Image.NEAREST))
Note that the input should be PIL format. See this for more information.
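To chain several of these paired transforms (which is what the Compose in augmentations.py from the traceback does with img, mask = a(img, mask)), a small paired Compose is enough; a sketch:
class PairedCompose:
    # Like torchvision's Compose, but every transform takes and returns (img, mask)
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, mask):
        for t in self.transforms:
            img, mask = t(img, mask)
        return img, mask

# e.g. PairedCompose([RandomHorizontallyFlip(), RandomRotate(degree=25)])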
Another idea is to stack your image and mask along the channel dimension and then transform them together. Obviously this only works for geometric transforms, and you need to use the same dtype for both. I use something like this:
# Apply these to image and mask
affine_transforms = transforms.Compose([
transforms.RandomAffine(degrees=180),
...
])
# Apply these to image only
image_transforms = transforms.Compose([
transforms.GaussianBlur(kernel_size=3),
...
])
# Loader...
def __getitem__(self, index: int):
# Get the image and mask, here shape=(HxW) for both
image = self.images[index]
mask = self.masks[index]
# Stack the image and mask together so they get the same geometric transformations
stacked = torch.stack([image, mask], dim=0) # shape=(2xHxW)
stacked = self.affine_transforms(stacked)
# Split them back up again
image, mask = torch.chunk(stacked, chunks=2, dim=0)
# Image transforms are only applied to the image
image = self.image_transforms(image)
return image, mask