I am currently trying to train my own model for image recognition and I chance upon Jason Brownlee tutorial on training a model, I made some adjustment myself to training more than 1 class and I got stuck when the training starts as there is no progress on the first epoch. below is my code. My current dataset is only 32 photos of car and bicycle all in JPG format and has been annotated. Hope I can get some help and if you need more information I would be glad to provide it.
below is the minor adjustment that was made from the original
class ModelDataset(Dataset):
# load the dataset definitions
def load_dataset(self, dataset_dir, is_train=True):
# define class
lines = []
with open('OpenLabeling-master/main/class_list.txt') as f:
lines = f.readlines()
count = 1
for line in lines:
self.add_class("dataset", count, line)
count += 1
# define data locations
images_dir = dataset_dir + '/images/'
annotations_dir = dataset_dir + '/annots/'
# find all images
for filename in listdir(images_dir):
# extract image id
image_id = filename[:-4]
# skip bad images
if image_id in ['00090']:
continue
# skip all images after 150 if we are building the train set
if is_train and int(image_id) >= 150:
continue
# skip all images before 150 if we are building the test/val set
if not is_train and int(image_id) < 150:
continue
img_path = images_dir + filename
ann_path = annotations_dir + image_id + '.xml'
# add to dataset
self.add_image('dataset', image_id=image_id, path=img_path, annotation=ann_path,class_ids=[0,1,2])
# load all bounding boxes for an image
def extract_boxes(self, filename):
# load and parse the file
root = ElementTree.parse(filename)
boxes = list()
# extract each bounding box
box_class_list = list()
for box in root.findall('.//bndbox'):
xmin = int(box.find('xmin').text)
ymin = int(box.find('ymin').text)
xmax = int(box.find('xmax').text)
ymax = int(box.find('ymax').text)
coors = [xmin, ymin, xmax, ymax]
# extract image dimensions
# for name in root.findall('.//name'):
# box_class_list.append(name.text)
width = int(root.find('.//size/width').text)
height = int(root.find('.//size/height').text)
return boxes, width, height
# load the masks for an image
def load_mask(self, image_id):
# get details of image
info = self.image_info[image_id]
# define box file location
path = info['annotation']
# load XML
boxes, w, h = self.extract_boxes(path)
# create one array for all masks, each on a different channel
masks = zeros([h, w, len(boxes)], dtype='uint8')
# create masks
class_ids = list()
for i in range(len(boxes)):
box = boxes[i]
print(box)
row_s, row_e = box[1], box[3]
col_s, col_e = box[0], box[2]
if i == 0:
masks[row_s:row_e, col_s:col_e, i] = 1
class_ids.append(self.class_names.index('Car'))
else:
masks[row_s:row_e, col_s:col_e, i] = 2
class_ids.append(self.class_names.index('Bicycle'))
return masks, asarray(class_ids, dtype='int32')
my config are as follows
Configurations:
BACKBONE resnet101
BACKBONE_STRIDES [4, 8, 16, 32, 64]
BATCH_SIZE 1
BBOX_STD_DEV [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE None
DETECTION_MAX_INSTANCES 100
DETECTION_MIN_CONFIDENCE 0.7
DETECTION_NMS_THRESHOLD 0.3
FPN_CLASSIF_FC_LAYERS_SIZE 1024
GPU_COUNT 1
GRADIENT_CLIP_NORM 5.0
IMAGES_PER_GPU 1
IMAGE_CHANNEL_COUNT 3
IMAGE_MAX_DIM 1024
IMAGE_META_SIZE 15
IMAGE_MIN_DIM 100
IMAGE_MIN_SCALE 0
IMAGE_RESIZE_MODE square
IMAGE_SHAPE [1024 1024 3]
LEARNING_MOMENTUM 0.9
LEARNING_RATE 0.001
LOSS_WEIGHTS {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE 14
MASK_SHAPE [28, 28]
MAX_GT_INSTANCES 100
MEAN_PIXEL [123.7 116.8 103.9]
MINI_MASK_SHAPE (56, 56)
NAME trained_cfg
NUM_CLASSES 3
POOL_SIZE 7
POST_NMS_ROIS_INFERENCE 1000
POST_NMS_ROIS_TRAINING 2000
PRE_NMS_LIMIT 6000
ROI_POSITIVE_RATIO 0.33
RPN_ANCHOR_RATIOS [0.5, 1, 2]
RPN_ANCHOR_SCALES (32, 64, 128, 256, 512)
RPN_ANCHOR_STRIDE 1
RPN_BBOX_STD_DEV [0.1 0.1 0.2 0.2]
RPN_NMS_THRESHOLD 0.7
RPN_TRAIN_ANCHORS_PER_IMAGE 256
STEPS_PER_EPOCH 100
TOP_DOWN_PYRAMID_SIZE 256
TRAIN_BN False
TRAIN_ROIS_PER_IMAGE 100
USE_MINI_MASK True
USE_RPN_ROIS True
VALIDATION_STEPS 50
WEIGHT_DECAY 0.0001
code to prep the model for training
model = MaskRCNN(mode='training', model_dir='./', config=config)
model.load_weights('mask_rcnn_coco.h5', by_name=True, exclude=["mrcnn_class_logits", "mrcnn_bbox_fc", "mrcnn_bbox", "mrcnn_mask"])
model.train(train_set, test_set, learning_rate=config.LEARNING_RATE, epochs=5, layers='heads')
I managed to get it to work and the epoch started working. the issue was with the code that was edited, I have shown a sample of the code changes I did. it somehow didn't append the coordinate inside the extract_boxes method which cause an infinite loop during the print statement.
# load all bounding boxes for an image
def extract_boxes(self, filename):
# load and parse the file
root = ElementTree.parse(filename)
boxes = list()
# extract each bounding box
box_class_list = list()
for box in root.findall('.//object'):
name = box.find('name').text
xmin = int(box.find('./bndbox/xmin').text)
ymin = int(box.find('./bndbox/ymin').text)
xmax = int(box.find('./bndbox/xmax').text)
ymax = int(box.find('./bndbox/ymax').text)
coors = [xmin, ymin, xmax, ymax, name]
if name=='Car' or name=='Bicycle':
boxes.append(coors)
width = int(root.find('.//size/width').text)
height = int(root.find('.//size/height').text)
return boxes, width, height
# load the masks for an image
def load_mask(self, image_id):
# get details of image
y = 1
info = self.image_info[image_id]
# define box file location
path = info['annotation']
# print(path)
# print("_____")
# load XML
boxes, w, h = self.extract_boxes(path)
# print(boxes)
# create one array for all masks, each on a different channel
masks = zeros([h, w, len(boxes)], dtype='uint8')
# create masks
class_ids = list()
for i in range(len(boxes)):
box = boxes[i]
row_s, row_e = box[1], box[3]
col_s, col_e = box[0], box[2]
if i == 0:
masks[row_s:row_e, col_s:col_e, i] = 1
class_ids.append(self.class_names.index('Car'))
else:
masks[row_s:row_e, col_s:col_e, i] = 2
class_ids.append(self.class_names.index('Bicycle'))
return masks, asarray(class_ids, dtype='int32')
Related
I have two csv files that i am uploading from an ec2 instance to the s3 bucket along with a few other files. All the other files are being uploaded just fine but my csv files, though it is uploaded, there seems ot be no data inside it even though the local copy of the file on the instance is showing the data. im not sure why its saying 0 bytes on the bucket.
the csv file is part of another larger program. here is the code.
from boto3.session import Session
import botocore
import boto3
import zipfile
import darknet
import os
import cv2
import glob
import csv
import numpy as np
global lat_start, lon_start
import shutil
#HELPER FUNCTION DEFINITIONS
ACCESS_KEY = '*********'
SECRET_KEY = '******D'
def image_detection(image_path, network, class_names, class_colors, thresh):
# Darknet doesn't accept numpy images.
# Create one with image we reuse for each detect
width = darknet.network_width(network)
height = darknet.network_height(network)
darknet_image = darknet.make_image(width, height, 3)
image = cv2.imread(image_path)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_resized = cv2.resize(image_rgb, (width, height),interpolation=cv2.INTER_LINEAR)
darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
detections = darknet.detect_image(network, class_names, darknet_image, thresh=thresh)
darknet.free_image(darknet_image)
image = darknet.draw_boxes(detections, image_resized, class_colors)
return cv2.cvtColor(image, cv2.COLOR_BGR2RGB), detections
def discretize_line(lat_start, lon_start, d_element, d, bearing):
# d_element -> how many element we need in a line secment
# global lat_start, lon_start
R = 6371.0*1000.0
# -1 because in case of 10 elements/points we also want len(lat_array) the same
dstep = d/(d_element-1) #0.6524896365354135 #2.0 # meters
dist_list = np.ones(int(d/dstep))*dstep
# print(dist_list)
brg = np.radians(bearing)
# if d%dstep != 0:
# dist_list = np.append(dist_list, d%dstep)
# This will append lat and lon into array which contains
# small segments of distance
lat_array = np.array([np.radians(lat_start)]) # rads
lon_array = np.array([np.radians(lon_start)]) # rads
# lat_array = np.array([])
# lon_array = np.array([])
for i, dist in enumerate(dist_list):
## last element make the waypoint shifted, so we break it
if i >= (d_element):
break
lat1 = lat_array[i]
lon1 = lon_array[i]
# print(dist)
Ad = dist/R
lat2 = np.arcsin(np.sin(lat1)*np.cos(Ad) + np.cos(lat1)*np.sin(Ad)*np.cos(brg))
lon2 = lon1 + np.arctan2( (np.sin(brg)*np.sin(Ad)*np.cos(lat1)) , (np.cos(Ad) - np.sin(lat1)*np.sin(lat2)))
lat_array = np.append(lat_array, lat2)
lon_array = np.append(lon_array, lon2)
# print(i)
return lat_array, lon_array
def get_distance_bearing(lat1, lon1, lat2, lon2):
# global lat_start, lon_start
R = 6371.0*1000.0
lat_start = np.radians(lat1)
lon_start = np.radians(lon1)
lat_end = np.radians(lat2)
lon_end = np.radians(lon2)
dLat = lat_end - lat_start
dLon = lon_end - lon_start
a = np.sin(dLat/2.0)*np.sin(dLat/2.0) + np.cos(lat_start)*np.cos(lat_end)*np.sin(dLon/2.0)*np.sin(dLon/2.0)
c = 2.0*np.arctan2(np.sqrt(a),np.sqrt(1-a))
d = c*R
y = np.sin(dLon)*np.cos(lat_end)
x = np.cos(lat_start)*np.sin(lat_end) - np.sin(lat_start)*np.cos(lat_end)*np.cos(dLon)
bearing = np.degrees(np.arctan2(y,x))
return d, bearing
def upload_to_aws(local_file, bucket, s3_file):
s3 = boto3.client('s3', aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY)
try:
s3.upload_file(local_file, bucket, s3_file)
print("Upload Successful")
return True
except FileNotFoundError:
print("The file was not found")
return False
except NoCredentialsError:
print("Credentials not available")
return False
##END OF FUNCTION DEFINITIONS ##
#Unzip the zip file and its contents
print("unzipping")
path_to_zip_file = "/home/ubuntu/pano/Zip/Videos.zip"
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
zip_ref.extractall("/home/ubuntu/pano/Video")
print("Finished Unzipping")
#End of Unzip
# CSV open and declaration##
data_file_path = "/home/ubuntu/pano/stack/quantity.csv"
data_file = open(data_file_path, "w+")
dataCSVWriter = csv.writer(data_file, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
dataCSVWriter.writerow(['lat', 'lon', 'Quantity'])
#CSV for lane thumbnail
thumbnail_data_file_path = "/home/ubuntu/pano/stack/lane_thumbnail.csv"
thumbnail_data_file = open(thumbnail_data_file_path, "w+")
thumbnail_dataCSVWriter = csv.writer(thumbnail_data_file, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
thumbnail_dataCSVWriter.writerow(['lat', 'lon'])
#Define start and end point lists
#start_point_list = [(35.841454251754755, 139.52427014959153),(35.84147944801779, 139.52420150963678)]
start_point_list = [(36.12083710338884, 139.21630320454503),(36.12080527337101, 139.2164926108044)]
#end_point_list = [(35.84151350159559, 139.52424466860762),(35.84144222040454, 139.52422739581436)]
end_point_list = [(36.12083735438514, 139.2164757318577),(36.12081575161991, 139.21630345327617)]
wp_lat_array = np.array([])
wp_lon_array = np.array([])
##Split th eline into points and it is stored in lat array lon array
"""for i in range(len(start_point_list)):
## input two points and find a slicing waypoint between it
distance, bearing_deg = get_distance_bearing(start_point_list[i][0], start_point_list[i][1], end_point_list[i][0], end_point_list[i][1])
print(distance)
lat_array, lon_array = discretize_line(start_point_list[i][0], start_point_list[i][1], float(d_element[i]), distance, bearing_deg)"""
#Initialize the detector variables and paths
quantity_bottles_frame = []
config_file = "/home/ubuntu/darknet_bottle_example/yolov4_bottle_can.cfg"
data_file = "/home/ubuntu/darknet_bottle_example/obj_bottle_can.data"
weights = "/home/ubuntu/darknet_bottle_example/yolov4_bottle_can_best.weights"
network, class_names, class_colors = darknet.load_network(
config_file,
data_file,
weights,
batch_size=1
)
image_dir = "/home/ubuntu/pano/Frames"
#1.Split into frames
path = "/home/ubuntu/pano/Video/Panorama/Videos"
j = 0
"""Order of events
1. Split into frames
2. Rotate images if needed
3. Running through detctor
4. Calculate count and draw bounding boxes
5. Store these images in respective directoies
6. Take start point of lane and end point and split into many coordinates in between based on number of frames
7. Write to csv file
8. Stack the images per lane
9. Empty the Frames folder after every lane
10. Upload stacked images and csv to cloud """
# Parameter to change is fps in the ffmpeg command. Change accoprding to need based on reference
for filename in os.listdir(path):
if (filename.endswith(".mp4")): #or .avi, .mpeg, whatever.
j += 1
path1 = path + filename
print(path1)
os.system("ffmpeg -i /home/ubuntu/pano/Video/Panorama/Videos/{0} -vf fps=0.07 /home/ubuntu/pano/Frames/{1}-%3d.jpg".format(filename,j))
#2. Rotate images if needed
frames_path = "/home/ubuntu/pano/Frames/*.jpg"
list_images = glob.glob(frames_path)
list_sorted = sorted(list_images)
#for image in list_sorted:
#read the image
# temp = cv2.imread(image)
# image1 = cv2.rotate(temp, cv2.ROTATE_90_COUNTERCLOCKWISE)
# cv2.imwrite("{0}".format(image), image1)
## according to how many partial panorama we have in each lane
d_element =[len(list_images)]
print(f"Now detecting objects in lane {j}")
#3. Running through detctor
frame_number = 1
for image in sorted(os.listdir(image_dir)):
#Path to the input images for the detector i.e Frames
quantity_frame = 0
image_name = f"{image}"
ext = '.jpg'
input_image_name = image_name
image_path = os.path.join(image_dir, input_image_name)
print(image_path)
#Path to output images to be stored after running through detector
output_dir = f"/home/ubuntu/pano/lane{j}"
output_name = "yolo_" + image_name
output_path = os.path.join(output_dir, output_name)
# image = load_images(image_path)
dn_frame_width = 416
dn_frame_height = 416
frame = cv2.imread(image_path)
frame_width = frame.shape[1]
frame_height = frame.shape[0]
#### Passing the image to darknet
image, detections = image_detection(image_path, network, class_names, class_colors, thresh=0.05)
#cv2.imwrite(f'/home/ubuntu/temp/Inference{frame_number}.jpg', image)
#cv2.imwrite(f'/home/ubuntu/temp/orignal_detect{frame_number}.jpg', frame)
###Based on the detections, running them through a loop to draw bounding box and also incrememnt count of object in the frame
#4. Calculate count and draw bounding boxes
for i in range(len(detections)):
xc_percent = detections[i][2][0]/dn_frame_width
yc_percent = detections[i][2][1]/dn_frame_height
w_percent = detections[i][2][2]/dn_frame_width
h_percent = detections[i][2][3]/dn_frame_height
xc = xc_percent*frame_width
yc = yc_percent*frame_height
w = w_percent*frame_width
h = h_percent*frame_height
xmin = xc - w/2.0
ymin = yc - h/2.0
xmax = xc + w/2.0
ymax = yc + h/2.0
#If object is detected, increase the count of the object in the frame
if detections[i][0] == "bottle":
cv2.rectangle(frame, (int(xmin),int(ymin)),(int(xmax),int(ymax)),(0,0,255),2)
cv2.putText(frame, "bottle", (int(xmin), int(ymin-10)), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,0,255), 2)
quantity_frame += 1
elif detections[i][0] == "can":
cv2.rectangle(frame, (int(xmin),int(ymin)),(int(xmax),int(ymax)),(255,0,0),2)
cv2.putText(frame, "can", (int(xmin), int(ymin-10)), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255,0,0), 2)
else:
print(f"{image} has no objects ")
print(f"Quantity in frame {frame_number} = {quantity_frame}")
#5. Store these images in respective directoies
cv2.imwrite(output_path, frame)
quantity_bottles_frame.append(quantity_frame)
frame_number += 1
###Split the points into equidistant points between start point and end point
##6. Take start point of lane and end point and split into many coordinates in between based on number of frames
distance, bearing_deg = get_distance_bearing(start_point_list[j-1][0], start_point_list[j-1][1], end_point_list[j-1][0], end_point_list[j-1][1])
print(distance)
lat_array, lon_array = discretize_line(start_point_list[j-1][0], start_point_list[j-1][1], float(d_element[0]), distance, bearing_deg)
lat_csv = []
lon_csv = []
##Convery those points into degrees
for lat,lon in zip(lat_array, lon_array):
lat_degrees = "{:}".format(np.degrees(lat))
lon_degrees = "{:}".format(np.degrees(lon))
lat_csv.append(lat_degrees)
lon_csv.append(lon_degrees)
#lat_csv = "{:}".format(np.degrees(lat))
#lon_csv = "{:}".format(np.degrees(lon))
##7.Write each row in the csv file
for k in range(d_element[0]):
dataCSVWriter.writerow([lat_csv[k], lon_csv[k], quantity_bottles_frame[k]])
#if k != d_element[0]-1:
# dataCSVWriter.writerow([lat_csv[k], lon_csv[k], quantity_bottles_frame[k], "-", "-" ])
if k ==d_element[0]-1:
print(lat_csv[int(d_element[0]/2)])
thumbnail_dataCSVWriter.writerow([ lat_csv[int(d_element[0]/2)],lon_csv[int(d_element[0]/2)]])
#####8.STACKING THE IMAGES ######
images = []
stacking_input = f"/home/ubuntu/pano/lane{j}/*.jpg"
list_images = glob.glob(stacking_input)
#print(list_images)
stacking_input_reverse = sorted(list_images, reverse = True)
print(stacking_input_reverse)
for image in stacking_input_reverse:
img = cv2.imread(image)
images.append(img)
final_image = cv2.hconcat(images)
image_name = f"cloud_lane{j}_stack.jpg"
stacking_output = f"/home/ubuntu/pano/stack"
output_path = os.path.join(stacking_output, image_name)
cv2.imwrite(output_path, final_image)
##### 9. DELETE FRAMES AFTER ONE ITERATION OF LOOP #####
for f in os.listdir(image_dir):
del_path = "/home/ubuntu/pano/Frames/" + f
os.remove(del_path)
else:
continue
#Close csv file
#data_file.close()
#thumbnail_data_file.close()
### 10. Upload to s3 bucket ####
stack_path = "/home/ubuntu/pano/stack"
for file in sorted(os.listdir(stack_path)):
print(f"Uploading {file}")
uploaded = upload_to_aws(f'/home/ubuntu/pano/stack/{file}', 'fbt-pano-test', f'{file}')
Do i need to close the csv file in any way? Or does s3 not support csv upload through boto3?
I found it. Turns out, the csv files werent closed at the end. So i moved the upload to s3 part to another program. now python closes the csv files at the end of this program automatically. and so when the upload program runs next, it gets uploaded properly.
I am new to Pytorch and so far it has been incredible. I am using it to count the number of pills in an image. I have found that in the majority of my images the max number of objects that it detects is 100. For the picture below it reaches a max count of 100 with the confidence around .6. After that it doesn't increase anymore even down to .1 confidence. I haven't been able to find anything in the docs or any other places online. I am using the fasterrcnn_resnet50_fpn model. Below is the code that load the trained model and evaluate the image. Any tips or even different packages that would be able to count all objects would be super helpful.
## Loading the trained module
loaded_model = get_model(num_classes = 2)
loaded_model.load_state_dict(torch.load('Pillcount/model'))
os.chdir('../pytorchobjdet/vision')
class CountDataset(torch.utils.data.Dataset):
def __init__(self, root, data_file, transforms=None):
self.root = root
self.transforms = transforms
self.imgs = sorted(os.listdir(os.path.join(root, "count")))
self.path_to_data_file = data_file
def __getitem__(self, idx):
# load images and bounding boxes
img_path = os.path.join(self.root, "count", self.imgs[idx])
img = Image.open(img_path).convert("RGB")
box_list = parse_one_annot(self.path_to_data_file,
self.imgs[idx])
boxes = torch.as_tensor(box_list, dtype=torch.float32)
num_objs = len(box_list)
# there is only one class
labels = torch.ones((num_objs,), dtype=torch.int64)
image_id = torch.tensor([idx])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:,
0])
# suppose all instances are not crowd
iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
if self.transforms is not None:
img, target = self.transforms(img, target)
return img, target
def __len__(self):
return len(self.imgs)
dataset_count = CountDataset(root='../../Pill_Object_Detection',
data_file = "../../Pill_Object_Detection/count_labels.csv",
transforms = get_transform(train=False))
idx = 1
img, _ = dataset_count[idx]
#put the model in evaluation mode
loaded_model.eval()
with torch.no_grad():
prediction = loaded_model([img])
image = Image.fromarray(img.mul(255).permute(1, 2,0).byte().numpy())
draw = ImageDraw.Draw(image)
# draw groundtruth
count = 0
for element in range(len(prediction[0]["boxes"])):
boxes = prediction[0]["boxes"][element].cpu().numpy()
score = np.round(prediction[0]["scores"][element].cpu().numpy(),
decimals= 4)
if score > 0.6:
draw.rectangle([(boxes[0], boxes[1]), (boxes[2], boxes[3])],
outline ="red", width =3)
draw.text((boxes[0], boxes[1]), text = str(score))
count +=1
print(f'count = {count}')
image
The advice from the comment above was very helpful. I used the YOLO5vs model and it did an incredible job. This tutorial had a super easy set up that had you upload the annotated images into roboflow, and then it had some google colab tutorials set up for almost all of the current object detectors out there. Here is the result. I just need to give better quality training data but it did extremely well for the few pictures that I gave it. It can count well over 150 objects in the same image no problem.
I'm training my neural network using PyTorch framework. The data is full HD images (1920x1080). But in each iteration, I just need to crop out a random 256x256 patch from these images. My network is relatively small (5 conv layers), and hence the bottleneck is being caused by loading the data. I've provided my current code below. Is there any way to optimize loading the data and speed up the training?
Code:
from pathlib import Path
import numpy
import skimage.io
import torch.utils.data as data
import Imath
import OpenEXR
class Ours(data.Dataset):
"""
Loads patches of resolution 256x256. Patches are selected such that they contain atleast 1 unknown pixel
"""
def __init__(self, data_dirpath, split_name, patch_size):
super(Ours, self).__init__()
self.dataroot = Path(data_dirpath) / split_name
self.video_names = []
for video_path in sorted(self.dataroot.iterdir()):
for i in range(4):
for j in range(11):
view_num = i * 12 + j
self.video_names.append((video_path.stem, view_num))
self.patch_size = patch_size
return
def __getitem__(self, index):
video_name, view_num = self.video_names[index]
patch_start_pt = (numpy.random.randint(1080), numpy.random.randint(1920))
frame1_path = self.dataroot / video_name / f'render/rgb/{view_num + 1:04}.png'
frame2_path = self.dataroot / video_name / f'render/rgb/{view_num + 2:04}.png'
depth_path = self.dataroot / video_name / f'render/depth/{view_num + 1:04}.exr'
mask_path = self.dataroot / video_name / f'render/masks/{view_num + 1:04}.png'
frame1 = self.get_image(frame1_path, patch_start_pt)
frame2 = self.get_image(frame2_path, patch_start_pt)
mask = self.get_mask(mask_path, patch_start_pt)
depth = self.get_depth(depth_path, patch_start_pt, mask)
data_dict = {
'frame1': frame1,
'frame2': frame2,
'mask': mask,
'depth': depth,
}
return data_dict
def __len__(self):
return len(self.video_names)
#staticmethod
def get_mask(path: Path, patch_start_point: tuple):
h, w = patch_start_point
mask = skimage.io.imread(path.as_posix())[h:h + self.patch_size, w:w + self.patch_size][None]
return mask
def get_image(self, path: Path, patch_start_point: tuple):
h, w = patch_start_point
image = skimage.io.imread(path.as_posix())
image = image[h:h + self.patch_size, w:w + self.patch_size, :3]
image = image.astype(numpy.float32) / 255 * 2 - 1
image_cf = numpy.moveaxis(image, [0, 1, 2], [1, 2, 0])
return image_cf
def get_depth(self, path: Path, patch_start_point: tuple, mask: numpy.ndarray):
h, w = patch_start_point
exrfile = OpenEXR.InputFile(path.as_posix())
raw_bytes = exrfile.channel('B', Imath.PixelType(Imath.PixelType.FLOAT))
depth_vector = numpy.frombuffer(raw_bytes, dtype=numpy.float32)
height = exrfile.header()['displayWindow'].max.y + 1 - exrfile.header()['displayWindow'].min.y
width = exrfile.header()['displayWindow'].max.x + 1 - exrfile.header()['displayWindow'].min.x
depth = numpy.reshape(depth_vector, (height, width))
depth = depth[h:h + self.patch_size, w:w + self.patch_size]
depth = depth[None]
depth = depth.astype(numpy.float32)
depth = depth * mask
return depth
Finally, I'm creating a DataLoader as follows:
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
What I've tried so far:
I've searched if it is possible to read a part of the image. Unfortunately, I didn't get any leads. Looks like python libraries read the full image.
I'm planning to read more patches from a single image so that I will need to read fewer images. But in PyTorch framework, the get_item() function has to return a single sample, not a batch. So, in each get_item() I can read only a patch.
I'm planning to circumvent this as follows: Read 4 patches in get_item() and return patches of shape (4,3,256,256) instead of (3,256,256). Later when I read a batch using dataloader, I'll get a batch of shape (BS,4,3,256,256) instead of (BS,3,256,256). I can then concatenate the data along dim=1 to convert (BS,4,3,256,256) to (BS*4,3,256,256). Thus I can reduce batch_size (BS) by 4 and hopefully this will speed up data loading by 4 times.
Are there any other options? I'm open to all kind of suggestions. Thanks!
I'm doing frame generation. For each images in Dataset/train/ folder (e.g 1.png ) I generated a sequence of 100 images and save all of them into a single Dataset/frames/train/ as (1_1.png...1_100.png), here is an example of my folder structure:
Dataset:
train:
1.png
2.png
3.png
.
.
N.png
frames:
train:
1_1.png
1_2.png
.
.
N_100.png
2_1.png
2_2.png
.
.
N_100.png
I have created my custom dataloader where I stack the frames generated as channels to form a sequence, but my problem I don't want to frames from image 2 to overlap with frames from 1 when create a sequence, How can I ensure the different frames don't overlap?
Here is my custom dataloader:
class LevelSetDataset(Dataset):
"""
Dataset object for CNN models
Temporal is defined implicitly
as the number of channels
example:
- X dimension
[H, W, C=number_of_timestap(t)]
- Y dimension
[W, W, C =(t+1)]
"""
def __init__(self, input_image_path:str,
target_image_path:str,
threshold:float=0.5,
num_input_steps:int=3,
num_future_steps:int=1,
image_dimension:int=32,
data_transformations=None,
istraining_mode:bool=True
):
self.input_image_path = input_image_path
self.target_image_path = target_image_path
self.threshold = threshold
self.num_input_steps = num_input_steps
self.num_future_steps = num_future_steps
self.image_dimension = image_dimension
self.data_transformations= data_transformations
self.istraining_mode = istraining_mode
# get a list of input filenames as sort them (e.g. 1.png, 2.png,..,N.png)
input_image_fp = sorted(glob(os.path.join(self.input_image_path , "*")),
key=lambda x: int(os.path.basename(x).split('.')[0])
)
# repeat the input image untill it matches the number of segmentation
# step of the target image
self.input_image_fp = [i for i in input_image_fp for _ in range(100)]
# get a list of the target filenames and sort them by the first id and second
# id after the underscore (e.g. 1_1.png, 1_2,..,N_M.png)
self.target_image_fp= sorted(glob(os.path.join(self.target_image_path , "*")),
key=lambda x: (int(os.path.basename(x).split('_')[0]),
int(os.path.basename(x).split('_')[1].split('.')[0]))
)
# check if in training mode
# to apply transformations
if (self.data_transformations is None) and (self.istraining_mode):
self.data_transformations= torchvision.transforms.Compose([
torchvision.transforms.Resize(size=(self.image_dimension,self.image_dimension),
interpolation=Image.BILINEAR),
torchvision.transforms.RandomHorizontalFlip(p=0.5),
torchvision.transforms.RandomVerticalFlip(p=0.5),
torchvision.transforms.ToTensor()
])
if (self.data_transformations is None) and (not self.istraining_mode):
self.data_transformations== torchvision.transforms.Compose([
torchvision.transforms.Resize(size=(self.image_dimension,self.image_dimension),
interpolation=Image.BILINEAR),
torchvision.transforms.ToTensor()
])
self.transforms = self.data_transformations
def _create_binary_mask(self, x):
x[x>=self.threshold] = 1
x[x <self.threshold] = 0
return x
def _stat_norm(self, x):
norm =torchvision.transforms.Compose([torchvision.transforms.Resize(
size=(self.image_dimension,self.image_dimension),
interpolation=Image.BILINEAR),
torchvision.transforms.ToTensor()])
return norm(x)
def __len__(self):
return len(self.target_image_fp) - (self.num_input_steps+self.num_future_steps)
def __getitem__(self, index):
X = torch.zeros((self.image_dimension, self.image_dimension, self.num_input_steps+1))
for step_idx, step in enumerate(np.arange(index, self.num_input_steps, 1)):
target_image = Image.open(self.target_image_fp[step+self.num_input_steps+self.num_future_steps-1])
target_image = self.transforms(target_image)
target_image = self._create_binary_mask(target_image)
X[:, :, step_idx] = target_image # (t+1)
input_img = Image.open(self.input_image_fp[index]).convert('L')
# input_img = self.transforms(input_img)
input_img = self.transforms(input_img)
X[:, :, 0] = input_img
target_image = Image.open(self.target_image_fp[index+self.num_input_steps+self.num_future_steps-1])
target_image = self.transforms(target_image)
target_image = self._create_binary_mask(target_image)
image_name = self.target_image_fp[index+self.num_input_steps+self.num_future_steps-1].split('/')[-1]
Y = target_image
return X, Y, image_name
I'm currently making a object detection app, which is able to detect if tires are damaged or not. For this I'm using Google's AutoML edge, which exports a TFlite model. Now I wanna implement this model in my code, but apparently the coordinates it predicts are normalized and I'm stuck in denormalizing them
Have a Look at my code here:
import tensorflow as tf
import numpy as np
import cv2
MODEL_PATH = 'Resources/model_v1_OD.tflite'
LABEL_PATH = 'Resources/model_v1_OD.txt'
class TFTireModel():
labels = []
intepreter = None
input_details = []
output_details = []
height = 0
width = 0
def __init__(self):
with open(LABEL_PATH, 'r') as f:
self.labels = [line.strip() for line in f.readlines()]
# Init TFlite interpreter
self.interpreter = tf.lite.Interpreter(model_path=MODEL_PATH)
self.interpreter.allocate_tensors()
# Get input and output tensors.
self.input_details = self.interpreter.get_input_details()
self.output_details = self.interpreter.get_output_details()
# Get input dimensions
self.height = self.input_details[0]['shape'][1]
self.width = self.input_details[0]['shape'][2]
def predict(self, img, threshold=0.3):
# Resize image to input dimensions
img = cv2.resize(img, (self.width, self.height))
img = np.expand_dims(img, axis=0)
img = (2.0 / 255.0) * img - 1.0
img = img.astype('uint8')
# Predict image
self.interpreter.set_tensor(self.input_details[0]['index'], img)
self.interpreter.invoke()
# get results
boxes = self.interpreter.get_tensor(
self.output_details[0]['index'])
print(f"boxes: {boxes}")
classes = self.interpreter.get_tensor(
self.output_details[1]['index'])
scores = self.interpreter.get_tensor(
self.output_details[2]['index'])
num = self.interpreter.get_tensor(
self.output_details[3]['index'])
# Get output
output =self._boxes_coordinates(boxes=np.squeeze(boxes[0]),
classes=np.squeeze(classes[0]+1).astype(np.int32),
scores=np.squeeze(scores[0]),
im_width=self.width,
im_height=self.height,
min_score_thresh=threshold)
print(f"output: {output}")
# Format output
return output
def _boxes_coordinates(self,
boxes,
classes,
scores,
im_width,
im_height,
max_boxes_to_draw=4,
min_score_thresh=0.4):
print(f"width: {im_width}, height {im_height}" )
if not max_boxes_to_draw:
max_boxes_to_draw = boxes.shape[0]
number_boxes = min(max_boxes_to_draw, boxes.shape[0])
tire_boxes = []
# person_labels = []
for i in range(number_boxes):
if scores is None or scores[i] > min_score_thresh:
box = tuple(boxes[i].tolist())
ymin, xmin, ymax, xmax = box
xmin, ymin, xmax, ymax = (int(xmin * im_width), int(xmax * im_width), int(ymin * im_height), int(ymax * im_height)) #TODO: DO A LOOP
#tire_boxes.append([(ymin, xmin, ymax, xmax), scores[i], self.labels[classes[i]]]) #More complete
tire_boxes.append((xmin, ymin, xmax, ymax))
return tire_boxes
Things go wrong at:
boxes = self.interpreter.get_tensor(
self.output_details[0]['index'])
print(f"boxes: {boxes}"
boxes: [[[ 0.00263482 0.50020593 0.3734043 0.83953816]
[ 0.12580797 0.14952084 0.65327024 0.61710536]
[ 0.13584864 0.38896233 0.6485662 0.85324436]
[ 0.31914377 0.3945622 0.87147605 0.8458656 ]
[ 0.01334581 0.03666234 0.46443292 0.55461186]
[ 0.1018104 -0.08279537 0.6541427 0.37984413]
Since here the output is normalized, and I don't know how to denormalize it. The desired output is percentages of the width and height as can be seen in the _boxes_coordinates function.
The outputs from TFLite's Object Detection models are in the format:
[top/Height, left/Width, bottom/Height, right/Width]
So if you know the dimensions of your image, you can compute the boundaries of each computed object.