I'm currently making an object detection app that detects whether tires are damaged or not. For this I'm using Google's AutoML Edge, which exports a TFLite model. Now I want to use this model in my code, but apparently the coordinates it predicts are normalized, and I'm stuck on denormalizing them.
Have a look at my code here:
import tensorflow as tf
import numpy as np
import cv2

MODEL_PATH = 'Resources/model_v1_OD.tflite'
LABEL_PATH = 'Resources/model_v1_OD.txt'


class TFTireModel():
    labels = []
    interpreter = None
    input_details = []
    output_details = []
    height = 0
    width = 0

    def __init__(self):
        with open(LABEL_PATH, 'r') as f:
            self.labels = [line.strip() for line in f.readlines()]
        # Init TFlite interpreter
        self.interpreter = tf.lite.Interpreter(model_path=MODEL_PATH)
        self.interpreter.allocate_tensors()
        # Get input and output tensors.
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()
        # Get input dimensions
        self.height = self.input_details[0]['shape'][1]
        self.width = self.input_details[0]['shape'][2]

    def predict(self, img, threshold=0.3):
        # Resize image to input dimensions
        img = cv2.resize(img, (self.width, self.height))
        img = np.expand_dims(img, axis=0)
        img = (2.0 / 255.0) * img - 1.0
        img = img.astype('uint8')
        # Predict image
        self.interpreter.set_tensor(self.input_details[0]['index'], img)
        self.interpreter.invoke()
        # Get results
        boxes = self.interpreter.get_tensor(self.output_details[0]['index'])
        print(f"boxes: {boxes}")
        classes = self.interpreter.get_tensor(self.output_details[1]['index'])
        scores = self.interpreter.get_tensor(self.output_details[2]['index'])
        num = self.interpreter.get_tensor(self.output_details[3]['index'])
        # Get output
        output = self._boxes_coordinates(boxes=np.squeeze(boxes[0]),
                                         classes=np.squeeze(classes[0] + 1).astype(np.int32),
                                         scores=np.squeeze(scores[0]),
                                         im_width=self.width,
                                         im_height=self.height,
                                         min_score_thresh=threshold)
        print(f"output: {output}")
        # Format output
        return output

    def _boxes_coordinates(self,
                           boxes,
                           classes,
                           scores,
                           im_width,
                           im_height,
                           max_boxes_to_draw=4,
                           min_score_thresh=0.4):
        print(f"width: {im_width}, height {im_height}")
        if not max_boxes_to_draw:
            max_boxes_to_draw = boxes.shape[0]
        number_boxes = min(max_boxes_to_draw, boxes.shape[0])
        tire_boxes = []
        # person_labels = []
        for i in range(number_boxes):
            if scores is None or scores[i] > min_score_thresh:
                box = tuple(boxes[i].tolist())
                ymin, xmin, ymax, xmax = box
                xmin, ymin, xmax, ymax = (int(xmin * im_width), int(xmax * im_width), int(ymin * im_height), int(ymax * im_height))  # TODO: DO A LOOP
                # tire_boxes.append([(ymin, xmin, ymax, xmax), scores[i], self.labels[classes[i]]])  # More complete
                tire_boxes.append((xmin, ymin, xmax, ymax))
        return tire_boxes
Things go wrong at:
boxes = self.interpreter.get_tensor(self.output_details[0]['index'])
print(f"boxes: {boxes}")
boxes: [[[ 0.00263482 0.50020593 0.3734043 0.83953816]
[ 0.12580797 0.14952084 0.65327024 0.61710536]
[ 0.13584864 0.38896233 0.6485662 0.85324436]
[ 0.31914377 0.3945622 0.87147605 0.8458656 ]
[ 0.01334581 0.03666234 0.46443292 0.55461186]
[ 0.1018104 -0.08279537 0.6541427 0.37984413]
The output here is normalized, and I don't know how to denormalize it. The desired output is coordinates scaled to the image width and height, as attempted in the _boxes_coordinates function.
The outputs of TFLite's object detection models are in the format
[top / height, left / width, bottom / height, right / width]
i.e. each box is [ymin, xmin, ymax, xmax], with every value expressed as a fraction of the corresponding image dimension. So if you know the dimensions of your image, you can compute the pixel boundaries of each detected object by multiplying back.
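For illustration, here is a minimal sketch of that conversion (the helper name is mine; im_width and im_height should be the dimensions of the image you intend to draw on, e.g. the original frame rather than the resized model input):

import numpy as np

def denormalize_boxes(boxes, im_width, im_height):
    # boxes: array of [ymin, xmin, ymax, xmax] fractions, as printed above
    pixel_boxes = []
    for ymin, xmin, ymax, xmax in np.squeeze(boxes):
        # values can fall slightly outside [0, 1] (note the -0.08 above), so clip first
        ymin, xmin, ymax, xmax = np.clip([ymin, xmin, ymax, xmax], 0.0, 1.0)
        pixel_boxes.append((int(xmin * im_width), int(ymin * im_height),
                            int(xmax * im_width), int(ymax * im_height)))
    return pixel_boxes

Note also that the tuple unpacking inside _boxes_coordinates above appears to put the scaled values into the wrong variables (ymin receives xmax * im_width, for example), so even with the correct scaling the returned tuples would come out scrambled.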
I am using the following code to generate the Grad-CAM maps for the input images. I am not able to resize the subplots to visualise them properly, as shown in the figure below.
Code:
import cv2
import numpy as np
import matplotlib.pyplot as plt


def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))


for i in range(5):
    img = train_loader.dataset[i][0].to(device)
    mdlOut = model(img)
    # img = torch.tensor(img, requires_grad=True)
    img = img.clone().detach().requires_grad_(True)  # new line
    out = model(img)
    idx = out.argmax()
    yc = out[0, idx]
    yc.backward()
    rawGrad = img.grad.cpu().detach().numpy()
    grad = NormalizeData(rawGrad)
    avgGrad = np.mean(np.abs(grad), axis=(1, 2))
    img = NormalizeData(img.cpu().detach().numpy())
    CAM = NormalizeData(avgGrad[0] * img[0, :, :] + avgGrad[1] * img[1, :, :] + avgGrad[2] * img[2, :, :])
    plt.subplot(20, 2, 2 * i + 1)
    plt.imshow(CAM, cmap='jet')
    plt.subplot(20, 2, 2 * i + 2)
    plt.imshow(np.einsum('kli->lik', img))
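For reference, a minimal sketch of one way to size such a grid with plt.subplots (the figsize values, and the cam_list/img_list names, are illustrative rather than taken from the code above):

import matplotlib.pyplot as plt

n_rows = 5  # one row per image/CAM pair
fig, axes = plt.subplots(n_rows, 2, figsize=(8, 4 * n_rows))  # (width, height) in inches
for row in range(n_rows):
    axes[row, 0].imshow(cam_list[row], cmap='jet')  # cam_list: hypothetical list of CAM arrays
    axes[row, 1].imshow(img_list[row])              # img_list: hypothetical list of HWC images
    axes[row, 0].axis('off')
    axes[row, 1].axis('off')
fig.tight_layout()
plt.show()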
I am currently trying to train my own model for image recognition, and I came across Jason Brownlee's tutorial on training a model. I made some adjustments myself to train more than one class, and I got stuck when the training starts, as there is no progress on the first epoch. Below is my code. My current dataset is only 32 photos of cars and bicycles, all in JPG format, and it has been annotated. I hope I can get some help, and if you need more information I would be glad to provide it.
Below are the minor adjustments made to the original:
class ModelDataset(Dataset):
    # load the dataset definitions
    def load_dataset(self, dataset_dir, is_train=True):
        # define class
        lines = []
        with open('OpenLabeling-master/main/class_list.txt') as f:
            lines = f.readlines()
        count = 1
        for line in lines:
            self.add_class("dataset", count, line)
            count += 1
        # define data locations
        images_dir = dataset_dir + '/images/'
        annotations_dir = dataset_dir + '/annots/'
        # find all images
        for filename in listdir(images_dir):
            # extract image id
            image_id = filename[:-4]
            # skip bad images
            if image_id in ['00090']:
                continue
            # skip all images after 150 if we are building the train set
            if is_train and int(image_id) >= 150:
                continue
            # skip all images before 150 if we are building the test/val set
            if not is_train and int(image_id) < 150:
                continue
            img_path = images_dir + filename
            ann_path = annotations_dir + image_id + '.xml'
            # add to dataset
            self.add_image('dataset', image_id=image_id, path=img_path, annotation=ann_path, class_ids=[0, 1, 2])

    # load all bounding boxes for an image
    def extract_boxes(self, filename):
        # load and parse the file
        root = ElementTree.parse(filename)
        boxes = list()
        # extract each bounding box
        box_class_list = list()
        for box in root.findall('.//bndbox'):
            xmin = int(box.find('xmin').text)
            ymin = int(box.find('ymin').text)
            xmax = int(box.find('xmax').text)
            ymax = int(box.find('ymax').text)
            coors = [xmin, ymin, xmax, ymax]
            # extract image dimensions
            # for name in root.findall('.//name'):
            #     box_class_list.append(name.text)
        width = int(root.find('.//size/width').text)
        height = int(root.find('.//size/height').text)
        return boxes, width, height

    # load the masks for an image
    def load_mask(self, image_id):
        # get details of image
        info = self.image_info[image_id]
        # define box file location
        path = info['annotation']
        # load XML
        boxes, w, h = self.extract_boxes(path)
        # create one array for all masks, each on a different channel
        masks = zeros([h, w, len(boxes)], dtype='uint8')
        # create masks
        class_ids = list()
        for i in range(len(boxes)):
            box = boxes[i]
            print(box)
            row_s, row_e = box[1], box[3]
            col_s, col_e = box[0], box[2]
            if i == 0:
                masks[row_s:row_e, col_s:col_e, i] = 1
                class_ids.append(self.class_names.index('Car'))
            else:
                masks[row_s:row_e, col_s:col_e, i] = 2
                class_ids.append(self.class_names.index('Bicycle'))
        return masks, asarray(class_ids, dtype='int32')
My config is as follows:
Configurations:
BACKBONE resnet101
BACKBONE_STRIDES [4, 8, 16, 32, 64]
BATCH_SIZE 1
BBOX_STD_DEV [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE None
DETECTION_MAX_INSTANCES 100
DETECTION_MIN_CONFIDENCE 0.7
DETECTION_NMS_THRESHOLD 0.3
FPN_CLASSIF_FC_LAYERS_SIZE 1024
GPU_COUNT 1
GRADIENT_CLIP_NORM 5.0
IMAGES_PER_GPU 1
IMAGE_CHANNEL_COUNT 3
IMAGE_MAX_DIM 1024
IMAGE_META_SIZE 15
IMAGE_MIN_DIM 100
IMAGE_MIN_SCALE 0
IMAGE_RESIZE_MODE square
IMAGE_SHAPE [1024 1024 3]
LEARNING_MOMENTUM 0.9
LEARNING_RATE 0.001
LOSS_WEIGHTS {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE 14
MASK_SHAPE [28, 28]
MAX_GT_INSTANCES 100
MEAN_PIXEL [123.7 116.8 103.9]
MINI_MASK_SHAPE (56, 56)
NAME trained_cfg
NUM_CLASSES 3
POOL_SIZE 7
POST_NMS_ROIS_INFERENCE 1000
POST_NMS_ROIS_TRAINING 2000
PRE_NMS_LIMIT 6000
ROI_POSITIVE_RATIO 0.33
RPN_ANCHOR_RATIOS [0.5, 1, 2]
RPN_ANCHOR_SCALES (32, 64, 128, 256, 512)
RPN_ANCHOR_STRIDE 1
RPN_BBOX_STD_DEV [0.1 0.1 0.2 0.2]
RPN_NMS_THRESHOLD 0.7
RPN_TRAIN_ANCHORS_PER_IMAGE 256
STEPS_PER_EPOCH 100
TOP_DOWN_PYRAMID_SIZE 256
TRAIN_BN False
TRAIN_ROIS_PER_IMAGE 100
USE_MINI_MASK True
USE_RPN_ROIS True
VALIDATION_STEPS 50
WEIGHT_DECAY 0.0001
Code to prep the model for training:
model = MaskRCNN(mode='training', model_dir='./', config=config)
model.load_weights('mask_rcnn_coco.h5', by_name=True, exclude=["mrcnn_class_logits", "mrcnn_bbox_fc", "mrcnn_bbox", "mrcnn_mask"])
model.train(train_set, test_set, learning_rate=config.LEARNING_RATE, epochs=5, layers='heads')
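A quick sanity check one could run before training is to call load_mask on a few samples and confirm the masks are non-empty; a minimal sketch, assuming train_set has already been built with load_dataset() and prepare() from the mrcnn Dataset API:

# Hypothetical pre-training check: confirm each sample yields at least one mask
for image_id in train_set.image_ids[:5]:
    masks, class_ids = train_set.load_mask(image_id)
    print(image_id, masks.shape, class_ids)  # expect masks.shape == (h, w, num_boxes) with num_boxes > 0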
I managed to get it to work and the epoch started running. The issue was with the code that was edited; I have shown a sample of the code changes I made below. It somehow didn't append the coordinates inside the extract_boxes method, which caused an infinite loop during the print statement.
    # load all bounding boxes for an image
    def extract_boxes(self, filename):
        # load and parse the file
        root = ElementTree.parse(filename)
        boxes = list()
        # extract each bounding box
        box_class_list = list()
        for box in root.findall('.//object'):
            name = box.find('name').text
            xmin = int(box.find('./bndbox/xmin').text)
            ymin = int(box.find('./bndbox/ymin').text)
            xmax = int(box.find('./bndbox/xmax').text)
            ymax = int(box.find('./bndbox/ymax').text)
            coors = [xmin, ymin, xmax, ymax, name]
            if name == 'Car' or name == 'Bicycle':
                boxes.append(coors)
        width = int(root.find('.//size/width').text)
        height = int(root.find('.//size/height').text)
        return boxes, width, height

    # load the masks for an image
    def load_mask(self, image_id):
        # get details of image
        y = 1
        info = self.image_info[image_id]
        # define box file location
        path = info['annotation']
        # print(path)
        # print("_____")
        # load XML
        boxes, w, h = self.extract_boxes(path)
        # print(boxes)
        # create one array for all masks, each on a different channel
        masks = zeros([h, w, len(boxes)], dtype='uint8')
        # create masks
        class_ids = list()
        for i in range(len(boxes)):
            box = boxes[i]
            row_s, row_e = box[1], box[3]
            col_s, col_e = box[0], box[2]
            if i == 0:
                masks[row_s:row_e, col_s:col_e, i] = 1
                class_ids.append(self.class_names.index('Car'))
            else:
                masks[row_s:row_e, col_s:col_e, i] = 2
                class_ids.append(self.class_names.index('Bicycle'))
        return masks, asarray(class_ids, dtype='int32')
First of all, I already loaded my model to predict on an inference set that I had prepared, but I got an error when trying to predict and show the results.
So here is my code:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from numpy import expand_dims
from tensorflow.io import read_file, decode_image
from tensorflow.image import resize  # assumed imports, not shown in the original post


def load_img(filename):
    img = read_file(filename)                          # Load data
    img = decode_image(img, channels=3)                # Convert to RGB
    img = resize(img, size=[img_height, img_height])   # Resize image
    img = np.array(img)[:, :, 1]
    img = img / 255.                                   # Rescale images
    return img


inf1 = load_img(r'ML2\COVID-19\inf_set\covid\covid - 1.jpeg')
inf2 = load_img(r'ML2\COVID-19\inf_set\covid\covid - 2.jpeg')
inf3 = load_img(r'ML2\COVID-19\inf_set\normal\Normal - 1.jpeg')
inf4 = load_img(r'ML2\COVID-19\inf_set\normal\Normal - 2.jpeg')
inf5 = load_img(r'ML2\COVID-19\inf_set\pneumonia\Pneumonia - 1.jpeg')
inf6 = load_img(r'ML2\COVID-19\inf_set\pneumonia\Pneumonia - 2.jpeg')

plt.figure(figsize=(35, 5))
plt.suptitle('Prediction Results', fontsize=15)
counter = 1
for i in [inf1, inf2, inf3, inf4, inf5, inf6]:
    plt.subplot(1, 6, counter)
    res = int(tf.round(model.predict(x=expand_dims(i, axis=0))))
    plt.imshow(i)
    plt.title(f"Prediction: {label_data[res]}")
    plt.axis('off')
    counter += 1
plt.show()
And here is the error notification I got.
So, I need help to solve this problem. Thank you in advance.
You are squashing the image size in this line:
img = np.array(img)[:,:,1]
This is why your image becomes size (220, 220) instead of (220, 220, 1), which after expand_dims would have had the proper input shape (1, 220, 220, 1).
You could change the load_img function, or you could solve this by doing
res = int(tf.round(model.predict(x=expand_dims(i, axis=[0,3]))))
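For completeness, a sketch of the first option, changing load_img so the channel axis is kept (this reuses the names from the question and assumes the model expects single-channel input of shape (220, 220, 1)):

def load_img(filename):
    img = read_file(filename)
    img = decode_image(img, channels=3)
    img = resize(img, size=[img_height, img_height])
    img = np.array(img)[:, :, 1:2]  # slicing with 1:2 keeps the channel axis: (H, W, 1) instead of (H, W)
    img = img / 255.
    return img

With that change, expand_dims(i, axis=0) alone produces the (1, 220, 220, 1) shape mentioned above.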
I'm building a CV app using OpenCV 4.5.1 and torchvision 0.9.1. When I apply this transform
test_transform = transforms.Compose([
    landmarks.CenterCrop((512, 512)),
    landmarks.ToTensor()
])
to the image and try to
img = img.permute(1, 2, 0).byte().numpy()
cv.circle(img=img, center=tuple(keypoints[i]), radius=2, color=(127, 255, 255), thickness=-1)
I receive an error from cv.circle()
TypeError: Expected Ptr<cv::UMat> for argument 'img'
Without CenterCrop, everything works just fine:
test_transform = transforms.Compose([
    landmarks.ToTensor()
])
Transforms applied look like:
ToTensor:
class ToTensor:
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, landmarks = sample['image'], sample['landmarks']
        image = image.transpose((2, 0, 1))
        return {'image': torch.from_numpy(image).float(),
                'landmarks': torch.from_numpy(landmarks).view(-1).float()}
CenterCrop:
class CenterCrop:
    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if isinstance(output_size, int):
            self.output_size = (output_size, output_size)
        else:
            assert len(output_size) == 2
            self.output_size = output_size

    def __call__(self, sample):
        image, landmarks = sample['image'], sample['landmarks']
        h, w = image.shape[:2]
        new_h, new_w = self.output_size
        top = (h - new_h) // 2
        left = (w - new_w) // 2
        image = image[top: top + new_h,
                      left: left + new_w]
        landmarks = landmarks - [left, top]
        return {'image': image, 'landmarks': landmarks}
My question:
How can this behavior of cv.circle() be explained, given that everything except the image size (reduced when the crop is applied) matches between the images with and without the CenterCrop transform? Thank you.
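One plausible explanation worth checking (this is an assumption on my part, not something confirmed in the post): OpenCV drawing functions reject NumPy arrays that are not C-contiguous, and the combination of cropping, transpose and permute can leave the final .numpy() result non-contiguous only in the CenterCrop case. A minimal check and workaround, reusing the names from the question:

import numpy as np

img = img.permute(1, 2, 0).byte().numpy()
print(img.flags['C_CONTIGUOUS'])  # check whether the array is still contiguous after the crop
img = np.ascontiguousarray(img)   # force a contiguous copy so OpenCV can wrap it as a Mat
center = tuple(int(c) for c in keypoints[i])  # cv.circle also needs integer coordinates
cv.circle(img=img, center=center, radius=2, color=(127, 255, 255), thickness=-1)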
I'm training my neural network using PyTorch framework. The data is full HD images (1920x1080). But in each iteration, I just need to crop out a random 256x256 patch from these images. My network is relatively small (5 conv layers), and hence the bottleneck is being caused by loading the data. I've provided my current code below. Is there any way to optimize loading the data and speed up the training?
Code:
from pathlib import Path

import numpy
import skimage.io
import torch.utils.data as data

import Imath
import OpenEXR


class Ours(data.Dataset):
    """
    Loads patches of resolution 256x256. Patches are selected such that they contain at least 1 unknown pixel.
    """

    def __init__(self, data_dirpath, split_name, patch_size):
        super(Ours, self).__init__()
        self.dataroot = Path(data_dirpath) / split_name
        self.video_names = []
        for video_path in sorted(self.dataroot.iterdir()):
            for i in range(4):
                for j in range(11):
                    view_num = i * 12 + j
                    self.video_names.append((video_path.stem, view_num))
        self.patch_size = patch_size
        return

    def __getitem__(self, index):
        video_name, view_num = self.video_names[index]
        patch_start_pt = (numpy.random.randint(1080), numpy.random.randint(1920))

        frame1_path = self.dataroot / video_name / f'render/rgb/{view_num + 1:04}.png'
        frame2_path = self.dataroot / video_name / f'render/rgb/{view_num + 2:04}.png'
        depth_path = self.dataroot / video_name / f'render/depth/{view_num + 1:04}.exr'
        mask_path = self.dataroot / video_name / f'render/masks/{view_num + 1:04}.png'

        frame1 = self.get_image(frame1_path, patch_start_pt)
        frame2 = self.get_image(frame2_path, patch_start_pt)
        mask = self.get_mask(mask_path, patch_start_pt)
        depth = self.get_depth(depth_path, patch_start_pt, mask)

        data_dict = {
            'frame1': frame1,
            'frame2': frame2,
            'mask': mask,
            'depth': depth,
        }
        return data_dict

    def __len__(self):
        return len(self.video_names)

    def get_mask(self, path: Path, patch_start_point: tuple):
        h, w = patch_start_point
        mask = skimage.io.imread(path.as_posix())[h:h + self.patch_size, w:w + self.patch_size][None]
        return mask

    def get_image(self, path: Path, patch_start_point: tuple):
        h, w = patch_start_point
        image = skimage.io.imread(path.as_posix())
        image = image[h:h + self.patch_size, w:w + self.patch_size, :3]
        image = image.astype(numpy.float32) / 255 * 2 - 1
        image_cf = numpy.moveaxis(image, [0, 1, 2], [1, 2, 0])
        return image_cf

    def get_depth(self, path: Path, patch_start_point: tuple, mask: numpy.ndarray):
        h, w = patch_start_point
        exrfile = OpenEXR.InputFile(path.as_posix())
        raw_bytes = exrfile.channel('B', Imath.PixelType(Imath.PixelType.FLOAT))
        depth_vector = numpy.frombuffer(raw_bytes, dtype=numpy.float32)
        height = exrfile.header()['displayWindow'].max.y + 1 - exrfile.header()['displayWindow'].min.y
        width = exrfile.header()['displayWindow'].max.x + 1 - exrfile.header()['displayWindow'].min.x
        depth = numpy.reshape(depth_vector, (height, width))
        depth = depth[h:h + self.patch_size, w:w + self.patch_size]
        depth = depth[None]
        depth = depth.astype(numpy.float32)
        depth = depth * mask
        return depth
Finally, I'm creating a DataLoader as follows:
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
What I've tried so far:
I've searched for a way to read only part of an image. Unfortunately, I didn't get any leads; it looks like the Python libraries read the full image.
I'm planning to read more patches from a single image so that I need to read fewer images. But in the PyTorch framework, the __getitem__() function has to return a single sample, not a batch. So in each __getitem__() call I can read only one patch.
I'm planning to circumvent this as follows: read 4 patches in __getitem__() and return patches of shape (4, 3, 256, 256) instead of (3, 256, 256). Later, when I read a batch using the DataLoader, I'll get a batch of shape (BS, 4, 3, 256, 256) instead of (BS, 3, 256, 256). I can then collapse the first two dimensions to convert (BS, 4, 3, 256, 256) to (BS*4, 3, 256, 256), as sketched below. Thus I can reduce the batch size (BS) by a factor of 4, and hopefully this will speed up data loading by 4 times.
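For illustration, a minimal sketch of that collapsing step, assuming the DataLoader yields dictionaries of tensors shaped (BS, 4, 3, 256, 256) as described (the loop variable names are mine):

for batch in train_data_loader:
    frame1 = batch['frame1']                           # shape: (BS, 4, 3, 256, 256)
    bs, n_patches, c, h, w = frame1.shape
    frame1 = frame1.reshape(bs * n_patches, c, h, w)   # shape: (BS*4, 3, 256, 256)
    # ... repeat for frame2, mask and depth, then run the usual training step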
Are there any other options? I'm open to all kinds of suggestions. Thanks!