How to create an area to detect people in python - python

I have a script, developed in Python + TensorFlow, this script is able to detect people as below:
import numpy as np
import tensorflow as tf
import cv2
import time
class DetectorAPI:
def __init__(self, path_to_ckpt):
self.path_to_ckpt = path_to_ckpt
self.detection_graph = tf.Graph()
with self.detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(self.path_to_ckpt, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
self.default_graph = self.detection_graph.as_default()
self.sess = tf.Session(graph=self.detection_graph)
# Definite input and output Tensors for detection_graph
self.image_tensor = self.detection_graph.get_tensor_by_name('image_tensor:0')
self.detection_boxes = self.detection_graph.get_tensor_by_name('detection_boxes:0')
self.detection_scores = self.detection_graph.get_tensor_by_name('detection_scores:0')
self.detection_classes = self.detection_graph.get_tensor_by_name('detection_classes:0')
self.num_detections = self.detection_graph.get_tensor_by_name('num_detections:0')
def processFrame(self, image):
shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image, axis=0)
# Actual detection.
start_time = time.time()
(boxes, scores, classes, num) = self.sess.run(
[self.detection_boxes, self.detection_scores, self.detection_classes, self.num_detections],
feed_dict={self.image_tensor: image_np_expanded})
end_time = time.time()
print("Elapsed Time:", end_time-start_time)
im_height, im_width,_ = image.shape
boxes_list = [None for i in range(boxes.shape[1])]
for i in range(boxes.shape[1]):
boxes_list[i] = (int(boxes[0,i,0] * im_height),
int(boxes[0,i,1]*im_width),
int(boxes[0,i,2] * im_height),
int(boxes[0,i,3]*im_width))
return boxes_list, scores[0].tolist(), [int(x) for x in classes[0].tolist()], int(num[0])
def close(self):
self.sess.close()
self.default_graph.close()
if __name__ == "__main__":
model_path = 'modelo/frozen_inference_graph.pb'
odapi = DetectorAPI(path_to_ckpt=model_path)
threshold = 0.7
cap = cv2.VideoCapture('http://81.198.213.128:82/mjpg/video.mjpg')
while True:
r, img = cap.read()
img = cv2.resize(img, (1280, 720))
boxes, scores, classes, num = odapi.processFrame(img)
for i in range(len(boxes)):
if classes[i] == 1 and scores[i] > threshold:
box = boxes[i]
cv2.rectangle(img,(box[1],box[0]),(box[3],box[2]),(255,0,0),2)
cv2.imshow("Preview", img)
key = cv2.waitKey(1)
if key & 0xFF == ord('q'):
break
But this code captures the entire frame of the screen and I need the detection to occur only in an expecific position.
Example:
I need the detection to occur only within the red frame!
How can I do this?

Just pass only the red region of the frame into your model.
Assuming you have the top, bot, left, right coordinate of the region in the frame:
while True:
r, img = cap.read()
img = img[top:bot, left:right]
img = cv2.resize(img, (1280, 720))
boxes, scores, classes, num = odapi.processFrame(img)
...

Related

ValueError: could not broadcast input array from shape (320,320,3) into shape (640,640,3)

I've been trying to run a detection model on a raspberry pi but when I try I get the error that:
could not broadcast input array from shape (320,320,3) into shape (640,640,3)
when I run this
import re
import cv2
from tflite_runtime.interpreter import Interpreter
import numpy as np
CAMERA_WIDTH = 640
CAMERA_HEIGHT = 480
def load_labels(path='labels.txt'):
"""Loads the labels file. Supports files with or without index numbers."""
with open(path, 'r', encoding='utf-8') as f:
lines = f.readlines()
labels = {}
for row_number, content in enumerate(lines):
pair = re.split(r'[:\s]+', content.strip(), maxsplit=1)
if len(pair) == 2 and pair[0].strip().isdigit():
labels[int(pair[0])] = pair[1].strip()
else:
labels[row_number] = pair[0].strip()
return labels
def set_input_tensor(interpreter, image):
"""Sets the input tensor."""
tensor_index = interpreter.get_input_details()[0]['index']
input_tensor = interpreter.tensor(tensor_index)()[0]
input_tensor[:, :] = np.expand_dims((image-255)/255, axis=0)
def get_output_tensor(interpreter, index):
"""Returns the output tensor at the given index."""
output_details = interpreter.get_output_details()[index]
tensor = np.squeeze(interpreter.get_tensor(output_details['index']))
return tensor
def detect_objects(interpreter, image, threshold):
"""Returns a list of detection results, each a dictionary of object info."""
set_input_tensor(interpreter, image)
interpreter.invoke()
# Get all output details
boxes = get_output_tensor(interpreter, 0)
classes = get_output_tensor(interpreter, 1)
scores = get_output_tensor(interpreter, 2)
count = int(get_output_tensor(interpreter, 3))
results = []
for i in range(count):
if scores[i] >= threshold:
result = {
'bounding_box': boxes[i],
'class_id': classes[i],
'score': scores[i]
}
results.append(result)
return results
def main():
labels = load_labels()
interpreter = Interpreter('detect.tflite')
interpreter.allocate_tensors()
_, input_height, input_width, _ = interpreter.get_input_details()[0]['shape']
cap = cv2.VideoCapture(0)
while cap.isOpened():
ret, frame = cap.read()
img = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), (320,320))
res = detect_objects(interpreter, img, 0.8)
print(res)
for result in res:
ymin, xmin, ymax, xmax = result['bounding_box']
xmin = int(max(1,xmin * CAMERA_WIDTH))
xmax = int(min(CAMERA_WIDTH, xmax * CAMERA_WIDTH))
ymin = int(max(1, ymin * CAMERA_HEIGHT))
ymax = int(min(CAMERA_HEIGHT, ymax * CAMERA_HEIGHT))
cv2.rectangle(frame,(xmin, ymin),(xmax, ymax),(0,255,0),3)
cv2.putText(frame,labels[int(result['class_id'])],(xmin, min(ymax, CAMERA_HEIGHT-20)), cv2.FONT_HERSHEY_SIMPLEX, 0.5,(255,255,255),2,cv2.LINE_AA)
cv2.imshow('Pi Feed', frame)
if cv2.waitKey(10) & 0xFF ==ord('q'):
cap.release()
cv2.destroyAllWindows()
if __name__ == "__main__":
main()
the model is an SSD Mobilenet 640x640 and the images for the model were taken on the raspberry pi as 1028x720 but were downscaled during model training. But I still get this error and I'm not sure how to fix it.

Is it possible to run two YOLO (yolov4) object detection models in a single application?

I understand that it would be much simpler to just train YOLOv4 accordingly but:
I have limited computational resources and was hoping I could combine pre-trained models I found online, saving my time, or train a model and combine it with other models available online.
If I have one '.weights' file of a custom object detector that detects traffic signs and another '.weights' file of detector that detect pedestrians. Is there a way to combine these models, so that when run on a video/image (or in real-time capture), it detects pedestrians and traffic signs simultaneously.
By combining I mean, either to edit the 'weights' file somehow to achieve this, or editing the python code (while running the detector) that gets this done. (or any other way)
If not possible - is there any way to make them run in a sequence, efficiently?
Yes, It is possible to do that. Create two different python files and load the model in them. Then create a new file and initialise both of them in the new file. Then take the continous video / image feed as an input, the file will give output of detected traffic signs. Use this output as an input to your pedestrian problem.
import numpy as np
import pandas as pd
import cv2
import time
margin = 20
import detection_cls
run = detection_cls.Ppe_Detection_1()
class Ppe_Detection():
def __init__(self):
self.weightfile = 'yolov4_pretrained.weights'
self.cfgfile = 'cfg/yolov4_pretrained.cfg'
self.PpeNet = cv2.dnn.readNet(self.weightfile,self.cfgfile)
self.classes = self.get_classes()
layer_names = self.PpeNet.getLayerNames()
self.output_layers = [layer_names[i - 1] for i in self.PpeNet.getUnconnectedOutLayers()]
# = [self.PpeNet.getLayerNames()[(i[0] - 1)] for i in self.PpeNet.getUnconnectedOutLayers()]
def get_classes(self):
# self.classes = []
with open("coco.names","r") as f:
self.classes_val = [line.strip() for line in f.readlines()]
return self.classes_val
def detection(self,img):
start = time.perf_counter()
height,width,channels = img.shape
# Detecting objects
blob = cv2.dnn.blobFromImage(img,0.00392,(416,416),(0,0,0),True,crop=False)
self.PpeNet.setInput(blob)
outs = self.PpeNet.forward(self.output_layers)
time_took = time.perf_counter() - start
fps = str(int(1/time_took))
# getting the list
class_ids = []
confidences = []
boxes = []
for out in outs:
for detection in out:
# print(detection)
scores = detection[5:]
class_id = np.argmax(scores)
confidence = scores[class_id]
if confidence > 0.4:
# object detected
center_x = int(detection[0]*width)
center_y = int(detection[1]*height)
w = int(detection[2]*width)
h = int(detection[3]*height)
# Rectangle Coordinates
x = int(center_x-w/2)
y = int(center_y-h/2)
boxes.append([x,y,w,h])
confidences.append(float(confidence))
class_ids.append(class_id)
indexes = cv2.dnn.NMSBoxes(boxes,confidences,0.4,0.4)
info = []
if len(indexes) > 0:
for i in indexes.flatten():
x,y = boxes[i][0], boxes[i][1]
w,h = boxes[i][2], boxes[i][3]
conf = confidences[i]
if x<0:
x = 0
if y < 0:
y = 0
type = '{}'.format(self.classes[class_ids[i]])
info.append([x,y,w,h,type,conf])
new_frame_time = time.time()
font = cv2.FONT_HERSHEY_PLAIN
for i in range(len(boxes)):
for i in indexes:
x,y,w,h = boxes[i]
label = str(self.classes[class_ids[i]])
color = (0,0,145)
# cv2.rectangle(img,(x,y),(x+w,y+h),color,2)
cv2.putText(img,fps,(10,30),font,3,color,3)
cv2.putText(img,label,(x,y+30),font,3,color,3)
# print(boxes,confidences,class_ids)
# print("opened")
# print(info)
return info
def cropping_detection(self):
count = 0
cropped_frame = []
value_img = []
cap = cv2.VideoCapture(0)
while cap.isOpened():
r,f = cap.read()
# counts +=1
detection = ppe.detection(f)
try:
x,y,w,h,cls,conf = detection[0]
except Exception as e:
pass
# for i in range(len(detection)):
# print(detection[i])
try:
if cls=='person':
print("Person detected")
cropped_img = f[y-margin:y+h+margin,x-margin:x+w+margin]
# print(cropped_img)
# value = ppe.detection(cropped_img)
final = run.detection(cropped_img)
print('final',final)
# print(value)
# print(cropped_img)
# value_img.append(value)
# cropped_frame.append(cropped_img)
# cv2.imwrite("data/frame%d.jpg" % count, cropped_img)
cv2.imshow("Hi",cropped_img)
if cv2.waitKey(1) & 0xff==ord('q'):
break
# count += 1
else:
pass
except Exception as e:
print("In except part")
# cap.release()
# cv2.destroyAllWindows()
# print(cropped_img)
return value,cropped_img
ppe = Ppe_Detection()
ppe.cropping_detection()
# ppe.get_classes()
# ppe.run_video()
# take_photo()
# inference_image()
The function that I've imported here can be found below
import numpy as np
import pandas as pd
import cv2
import time
class Ppe_Detection():
def __init__(self):
self.weightfile = 'backup/yolov3_best.weights'
self.cfgfile = 'yolov3.cfg'
self.PpeNet = cv2.dnn.readNet(self.weightfile,self.cfgfile)
self.classes = self.get_classes()
layer_names = self.PpeNet.getLayerNames()
self.output_layers = [layer_names[i - 1] for i in self.PpeNet.getUnconnectedOutLayers()]
# = [self.PpeNet.getLayerNames()[(i[0] - 1)] for i in self.PpeNet.getUnconnectedOutLayers()]
def get_classes(self):
# self.classes = []
with open("obj.names","r") as f:
self.classes_val = [line.strip() for line in f.readlines()]
return self.classes_val
def print_all(self):
print(self.classes)
def detection(self,img):
start = time.perf_counter()
height,width,channels = img.shape
# Detecting objects
blob = cv2.dnn.blobFromImage(img,0.00392,(416,416),(0,0,0),True,crop=False)
self.PpeNet.setInput(blob)
outs = self.PpeNet.forward(self.output_layers)
time_took = time.perf_counter() - start
fps = str(int(1/time_took))
# getting the list
class_ids = []
confidences = []
boxes = []
for out in outs:
for detection in out:
scores = detection[5:]
class_id = np.argmax(scores)
confidence = scores[class_id]
if confidence > 0.4:
# object detected
center_x = int(detection[0]*width)
center_y = int(detection[1]*width)
w = int(detection[2]*width)
h = int(detection[3]*height)
# Rectangle Coordinates
x = int(center_x-w/2)
y = int(center_y-h/2)
boxes.append([x,y,w,h])
confidences.append(float(confidence))
class_ids.append(class_id)
indexes = cv2.dnn.NMSBoxes(boxes,confidences,0.4,0.4)
info = []
if len(indexes) > 0:
for i in indexes.flatten():
x,y = boxes[i][0], boxes[i][1]
w,h = boxes[i][2], boxes[i][3]
conf = confidences[i]
if x<0:
x = 0
if y < 0:
y = 0
type = '{}'.format(self.classes[class_ids[i]])
info.append([x,y,w,h,type,conf])
new_frame_time = time.time()
font = cv2.FONT_HERSHEY_PLAIN
for i in range(len(boxes)):
for i in indexes:
x,y,w,h = boxes[i]
label = str(self.classes[class_ids[i]])
color = (0,255,145)
cv2.rectangle(img,(x,y),(x+w,y+h),color,2)
cv2.putText(img,fps,(10,30),font,3,color,3)
cv2.putText(img,label,(x,y+30),font,3,color,3)
# print(boxes,confidences,class_ids)
# print("opened")
print(info)
return info
ppe = Ppe_Detection()
ppe.get_classes()
ppe.print_all()
#############
def inference_image():
img = cv2.imread("sumit_off.jpg")
ppe.detection(img)
cv2.imshow("Image",img)
cv2.waitKey(0)
cv2.destroyAllWindows()
def run_video():
cap = cv2.VideoCapture(0)
while cap.isOpened():
r,f = cap.read()
try:
info = ppe.detection(f)
except Exception as e:
print("______",e)
cv2.imshow("image",f)
if cv2.waitKey(1) & 0xFF == ord("q") :
break
cap.release()
cv2.destroyAllWindows()
run_video()

How use for loop many variable?

I am using Yolov5. I want change my webcam -> lancamera
class LoadStreams: # multiple IP or RTSP cameras
def __init__(self, sources='streams.txt', img_size=640):
self.mode = 'images'
self.img_size = img_size
if os.path.isfile(sources):
with open(sources, 'r') as f:
sources = [x.strip() for x in f.read().splitlines() if len(x.strip())]
else:
sources = [sources]
n = len(sources)
self.imgs = [None] * n
self.sources = sources
for i, s in enumerate(sources):
# Start the thread to read frames from the video stream
print('%g/%g: %s... ' % (i + 1, n, s), end='')
cap = cv2.VideoCapture(eval(s) if s.isnumeric() else s)
assert cap.isOpened(), 'Failed to open %s' % s
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) % 100
_, self.imgs[i] = cap.read() # guarantee first frame
thread = Thread(target=self.update, args=([i, cap]), daemon=True)
print(' success (%gx%g at %.2f FPS).' % (w, h, fps))
thread.start()
print('') # newline
# check for common shapes
s = np.stack([letterbox(x, new_shape=self.img_size)[0].shape for x in self.imgs], 0) # inference shapes
self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
if not self.rect:
print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
def update(self, index, cap):
# Read next stream frame in a daemon thread
n = 0
while cap.isOpened():
n += 1
# _, self.imgs[index] = cap.read()
cap.grab()
if n == 4: # read every 4th frame
_, self.imgs[index] = cap.retrieve()
n = 0
time.sleep(0.01) # wait time
def __iter__(self):
self.count = -1
return self
def __next__(self):
self.count += 1
img0 = self.imgs.copy()
if cv2.waitKey(1) == ord('q'): # q to quit
cv2.destroyAllWindows()
raise StopIteration
# Letterbox
img = [letterbox(x, new_shape=self.img_size, auto=self.rect)[0] for x in img0]
# Stack
img = np.stack(img, 0)
# Convert
img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416
img = np.ascontiguousarray(img)
return self.sources, img, img0, None
def __len__(self):
return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years
this code return 'self.sources, img, img0, None'
if webcam:
view_img = True
cudnn.benchmark = True # set True to speed up constant image size inference
dataset = LoadStreams(source, img_size=imgsz)
print((dataset))
I use 'dataset'
for path, img, im0s, vid_cap in dataset:
img = torch.from_numpy(img).to(device)
img = img.half() if half else img.float() # uint8 to fp16/32
img /= 255.0 # 0 - 255 to 0.0 - 1.0
How to use for path, img, im0s, vid_cap in dataset: ??
my lancam code
def livecame():
vimba = Vimba()
vimba.startup()
system = vimba.system()
system.run_feature_command("GeVDiscoveryAllOnce")
time.sleep(0.1)
camera_ids = vimba.camera_ids()
# for cam_id in camera_ids:
# print("Camera found: ", cam_id)
print(camera_ids[0])
c0 = vimba.camera(camera_ids[0])
c0.open()
pixel_format = c0.feature("PixelFormat")
pixel_format.value = "BayerBG8"
try:
c0.StreamBytesPerSecond = 100000000
except:
pass
frame = c0.new_frame()
frame.announce()
c0.start_capture()
try:
frame.queue_for_capture()
success = True
except:
success = False
c0.run_feature_command("AcquisitionStart")
c0.run_feature_command("AcquisitionStop")
frame.wait_for_capture(1000)
frame_data = frame.buffer_data()
k = cv2.waitKey(1)
if k == 0x1b:
cv2.destroyAllWindows()
if success:
img = np.ndarray(buffer=frame_data,
dtype=np.uint8,
shape=(frame.data.height, frame.data.width, 1))
img = cv2.cvtColor(img, cv2.COLOR_BAYER_BG2RGB)
img0 = img.copy()
img = img.tolist()
img = [letterbox(x, new_shape=(800,400), auto= True)[0] for x in img0]
#img = np.ascontiguousarray(img)
img = np.stack(img, 0)
#img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416
img = np.ascontiguousarray(img)
return ['0'], img, img0
but I use dataset = new_file.livecame()
I can see error ValueError: not enough values to unpack (expected 3, got 1)
in for path, img, im0s, vid_cap in dataset:
how to use many variable? in for loop?
In Python OpenCV, one way is simply to use zip.
for component in zip(contours, hierarchy):
cntr = component[0]
hier = component[1]

How can I convert pytorch cpu-based transformation to cuda-based?

The original issue for the code is availablehere.
I am using this repository for a line segmentation project and I developed this code to get an input (whether image or video) and draw road lines on it and give it in output:
import argparse
import sys
from time import time, clock
from os.path import splitext, basename, exists
from model import SCNN
from utils.check_extension import is_video, is_image
from utils.transforms import *
# I will put all the necessary code for utils.transforms after this
# ------------------------------------------------ SCNN parameters
time1 = time()
net = SCNN(input_size=(800, 288), pretrained=False)
mean = (0.3598, 0.3653, 0.3662) # CULane mean, std
std = (0.2573, 0.2663, 0.2756)
transform_img = Resize((800, 288))
transform_to_net = Compose(ToTensor(), Normalize(mean=mean, std=std))
# ------------------------------------------------ Arguments
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--weights', type=str,
default='models/vgg_SCNN_DULR_w9.pth',
help='path to vgg models')
parser.add_argument('--input', type=str, default='demo/line_3.mp4',
help='path to image file')
parser.add_argument('--output', type=str, default='public/',
help='path to the output directory')
args = parser.parse_args()
return args
def main():
args = parse_args()
filename, extension = splitext(basename(args.input))
print("Loading file [{}] ....".format(filename))
if not exists(args.input):
print("file [{}] is not recognized".format(args.input))
sys.exit()
if is_video(extension):
video_capture = cv2.VideoCapture()
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output = args.output + filename + '.avi'
if video_capture.open(args.input):
property_id = int(cv2.CAP_PROP_FRAME_COUNT)
total_frames = int(cv2.VideoCapture.get(video_capture, property_id))
frame_no = 1
width, height = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)), \
int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = video_capture.get(cv2.CAP_PROP_FPS)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
save_dict = torch.load(args.weights, map_location=device)
net.load_state_dict(save_dict['net'])
net.eval()
# can't write out mp4, so try to write into an AVI file
video_writer = cv2.VideoWriter(output, fourcc, fps, (width, height))
while video_capture.isOpened():
start = time()
ret, frame = video_capture.read()
if not ret:
break
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = transform_img({'img': frame})['img']
x = transform_to_net({'img': frame})['img']
x.unsqueeze_(0)
stop1 = time()
print('stop1: ', stop1 - start)
seg_pred, exist_pred = net(x)[:2]
seg_pred = seg_pred.detach().cpu().numpy()
exist_pred = exist_pred.detach().cpu().numpy()
seg_pred = seg_pred[0]
stop2 = time()
print('stop2: ', stop2 - stop1)
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
lane_img = np.zeros_like(frame)
color = np.array([[255, 125, 0], [0, 255, 0], [0, 0, 255], [0, 255, 255]], dtype='uint8')
coord_mask = np.argmax(seg_pred, axis=0)
for i in range(0, 4):
if exist_pred[0, i] > 0.5:
lane_img[coord_mask == (i + 1)] = color[i]
img = cv2.addWeighted(src1=lane_img, alpha=0.8, src2=frame, beta=1., gamma=0.)
img = cv2.resize(img, (width, height))
stop3 = time()
print('stop3: ', stop3 - stop2)
# if frame_no % 20 == 0:
# print('# {}/{} frames processed!'.format(frame_no, total_frames))
frame_no += 1
video_writer.write(img)
end = time()
print('Whole loop: {} seconds'.format(end - start))
print('------------')
print('------------')
print('# All frames processed ')
video_capture.release()
video_writer.release()
elif is_image(extension):
img = cv2.imread(args.input)
height, width, _ = img.shape
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = transform_img({'img': img})['img']
x = transform_to_net({'img': img})['img']
x.unsqueeze_(0)
save_dict = torch.load(args.weights, map_location='cpu')
net.load_state_dict(save_dict['net'])
net.eval()
seg_pred, exist_pred = net(x)[:2]
seg_pred = seg_pred.detach().cpu().numpy()
exist_pred = exist_pred.detach().cpu().numpy()
seg_pred = seg_pred[0]
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
lane_img = np.zeros_like(img)
color = np.array([[255, 125, 0], [0, 255, 0], [0, 0, 255], [0, 255, 255]], dtype='uint8')
coord_mask = np.argmax(seg_pred, axis=0)
for i in range(0, 4):
if exist_pred[0, i] > 0.5:
lane_img[coord_mask == (i + 1)] = color[i]
img = cv2.addWeighted(src1=lane_img, alpha=0.8, src2=img, beta=1., gamma=0.)
img = cv2.resize(img, (width, height))
output = args.output + filename + '.jpg'
cv2.imwrite(output, img)
else:
print("file format [{}] is not supported".format(args.input))
sys.exit()
if __name__ == '__main__':
main()
The code which belong to Resize, ToTensor, Normalize, Compose are here:
class Compose(CustomTransform):
"""
All transform in Compose should be able to accept two non None variable, img and boxes
"""
def __init__(self, *transforms):
self.transforms = [*transforms]
def __call__(self, sample):
for t in self.transforms:
sample = t(sample)
return sample
def __iter__(self):
return iter(self.transforms)
def modules(self):
yield self
for t in self.transforms:
if isinstance(t, Compose):
for _t in t.modules():
yield _t
else:
yield t
class Normalize(CustomTransform):
def __init__(self, mean, std):
self.transform = Normalize_th(mean, std)
def __call__(self, sample):
img = sample.get('img')
img = self.transform(img)
_sample = sample.copy()
_sample['img'] = img
return _sample
class ToTensor(CustomTransform):
def __init__(self, dtype=torch.float):
self.dtype=dtype
def __call__(self, sample):
img = sample.get('img')
segLabel = sample.get('segLabel', None)
exist = sample.get('exist', None)
img = img.transpose(2, 0, 1)
img = torch.from_numpy(img).type(self.dtype) / 255.
if segLabel is not None:
segLabel = torch.from_numpy(segLabel).type(torch.long)
if exist is not None:
exist = torch.from_numpy(exist).type(torch.float32) # BCEloss requires float tensor
_sample = sample.copy()
_sample['img'] = img
_sample['segLabel'] = segLabel
_sample['exist'] = exist
return _sample
class Resize(CustomTransform):
def __init__(self, size):
if isinstance(size, int):
size = (size, size)
self.size = size #(W, H)
def __call__(self, sample):
img = sample.get('img')
segLabel = sample.get('segLabel', None)
img = cv2.resize(img, self.size, interpolation=cv2.INTER_CUBIC)
if segLabel is not None:
segLabel = cv2.resize(segLabel, self.size, interpolation=cv2.INTER_NEAREST)
_sample = sample.copy()
_sample['img'] = img
_sample['segLabel'] = segLabel
return _sample
def reset_size(self, size):
if isinstance(size, int):
size = (size, size)
self.size = size
The code works fine but I found out that its too slow for testing in real-time application. I added some time measurement to see if I can find out the bottlenecks and this is the output for one loop:
------------
stop1: 0.002989053726196289
stop2: 1.4032211303710938
stop3: 0.004946708679199219
Whole loop: 1.41636061668396 seconds
These lines happened to be the most computationally expensive lines:
seg_pred, exist_pred = net(x)[:2]
seg_pred = seg_pred.detach().cpu().numpy()
exist_pred = exist_pred.detach().cpu().numpy()
seg_pred = seg_pred[0]
Now I am stuck with this issue that how I can modify the code to improve the computation speed.
Initially I thought of modifying the code to allow cuda computation. I asked the main author how I can modify the code for cuda version in here and he pointed out to these lines:
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = transform_img({'img': frame})['img']
x = transform_to_net({'img': frame})['img']
x.unsqueeze_(0)
Unfortunately my experience with pytorch is not much, so I am asking for help now.
I hope the information I shared suffices for the readers. Any help would be appreciated
Thanks
Set device:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
what he means his putting the data on device:
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = transform_img({'img': frame})['img']
x = transform_to_net({'img': frame})['img']
x.unsqueeze_(0).to(device)

How to get the video file length in Yolo v3

I wanted to find out how the video frame length was calculated in the below code.
[UPD] Before I was thinking it was done by Yolo, but later I realized it was OpenCV that dealt with number of frames in a video file.
"""
Class definition of YOLO_v3 style detection model on image and video
"""
import colorsys
import os
from timeit import default_timer as timer
import numpy as np
from keras import backend as K
from keras.models import load_model
from keras.layers import Input
from PIL import Image, ImageFont, ImageDraw
from yolo3.model import yolo_eval, yolo_body, tiny_yolo_body
from yolo3.utils import letterbox_image
import os
from keras.utils import multi_gpu_model
class YOLO(object):
_defaults = {
"model_path": 'model_data/yolo.h5',
"anchors_path": 'model_data/yolo_anchors.txt',
"classes_path": 'model_data/coco_classes.txt',
"score" : 0.3,
"iou" : 0.45,
"model_image_size" : (416, 416),
"gpu_num" : 1,
}
#classmethod
def get_defaults(cls, n):
if n in cls._defaults:
return cls._defaults[n]
else:
return "Unrecognized attribute name '" + n + "'"
def __init__(self, **kwargs):
self.__dict__.update(self._defaults) # set up default values
self.__dict__.update(kwargs) # and update with user overrides
self.class_names = self._get_class()
self.anchors = self._get_anchors()
self.sess = K.get_session()
self.boxes, self.scores, self.classes = self.generate()
def _get_class(self):
classes_path = os.path.expanduser(self.classes_path)
with open(classes_path) as f:
class_names = f.readlines()
class_names = [c.strip() for c in class_names]
return class_names
def _get_anchors(self):
anchors_path = os.path.expanduser(self.anchors_path)
with open(anchors_path) as f:
anchors = f.readline()
anchors = [float(x) for x in anchors.split(',')]
return np.array(anchors).reshape(-1, 2)
def generate(self):
model_path = os.path.expanduser(self.model_path)
assert model_path.endswith('.h5'), 'weights must be a .h5 file.'
# Load model, or construct model and load weights.
num_anchors = len(self.anchors)
num_classes = len(self.class_names)
is_tiny_version = num_anchors==6 # default setting
try:
self.yolo_model = load_model(model_path, compile=False)
except:
self.yolo_model = tiny_yolo_body(Input(shape=(None,None,3)), num_anchors//2, num_classes) \
if is_tiny_version else yolo_body(Input(shape=(None,None,3)), num_anchors//3, num_classes)
self.yolo_model.load_weights(self.model_path) # make sure model, anchors and classes match
else:
assert self.yolo_model.layers[-1].output_shape[-1] == \
num_anchors/len(self.yolo_model.output) * (num_classes + 5), \
'Mismatch between model and given anchor and class sizes'
print('{} model, anchors, and classes loaded.'.format(model_path))
# Generate colors for drawing bounding boxes.
hsv_tuples = [(x / len(self.class_names), 1., 1.)
for x in range(len(self.class_names))]
self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
self.colors = list(
map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
self.colors))
np.random.seed(10101) # Fixed seed for consistent colors across runs.
np.random.shuffle(self.colors) # Shuffle colors to decorrelate adjacent classes.
np.random.seed(None) # Reset seed to default.
# Generate output tensor targets for filtered bounding boxes.
self.input_image_shape = K.placeholder(shape=(2, ))
if self.gpu_num>=2:
self.yolo_model = multi_gpu_model(self.yolo_model, gpus=self.gpu_num)
boxes, scores, classes = yolo_eval(self.yolo_model.output, self.anchors,
len(self.class_names), self.input_image_shape,
score_threshold=self.score, iou_threshold=self.iou)
return boxes, scores, classes
def detect_image(self, image):
start = timer()
if self.model_image_size != (None, None):
assert self.model_image_size[0]%32 == 0, 'Multiples of 32 required'
assert self.model_image_size[1]%32 == 0, 'Multiples of 32 required'
boxed_image = letterbox_image(image, tuple(reversed(self.model_image_size)))
else:
new_image_size = (image.width - (image.width % 32),
image.height - (image.height % 32))
boxed_image = letterbox_image(image, new_image_size)
image_data = np.array(boxed_image, dtype='float32')
print(image_data.shape)
image_data /= 255.
image_data = np.expand_dims(image_data, 0) # Add batch dimension.
out_boxes, out_scores, out_classes = self.sess.run(
[self.boxes, self.scores, self.classes],
feed_dict={
self.yolo_model.input: image_data,
self.input_image_shape: [image.size[1], image.size[0]],
K.learning_phase(): 0
})
print('Found {} boxes for {}'.format(len(out_boxes), 'img'))
font = ImageFont.truetype(font='font/FiraMono-Medium.otf',
size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
thickness = (image.size[0] + image.size[1]) // 300
for i, c in reversed(list(enumerate(out_classes))):
predicted_class = self.class_names[c]
box = out_boxes[i]
score = out_scores[i]
label = '{} {:.2f}'.format(predicted_class, score)
draw = ImageDraw.Draw(image)
label_size = draw.textsize(label, font)
top, left, bottom, right = box
top = max(0, np.floor(top + 0.5).astype('int32'))
left = max(0, np.floor(left + 0.5).astype('int32'))
bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
right = min(image.size[0], np.floor(right + 0.5).astype('int32'))
print(label, (left, top), (right, bottom))
if top - label_size[1] >= 0:
text_origin = np.array([left, top - label_size[1]])
else:
text_origin = np.array([left, top + 1])
# My kingdom for a good redistributable image drawing library.
for i in range(thickness):
draw.rectangle(
[left + i, top + i, right - i, bottom - i],
outline=self.colors[c])
draw.rectangle(
[tuple(text_origin), tuple(text_origin + label_size)],
fill=self.colors[c])
draw.text(text_origin, label, fill=(0, 0, 0), font=font)
del draw
end = timer()
print(end - start)
return image
def close_session(self):
self.sess.close()
def detect_video(yolo, video_path, output_path=""):
import cv2
video_path = './input.mp4'
vid = cv2.VideoCapture(video_path)
if not vid.isOpened():
raise IOError("Couldn't open webcam or video")
video_FourCC = int(vid.get(cv2.CAP_PROP_FOURCC))
video_fps = vid.get(cv2.CAP_PROP_FPS)
video_size = (int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)))
isOutput = True if output_path != "" else False
if isOutput:
print("!!! TYPE:", type(output_path), type(video_FourCC), type(video_fps), type(video_size))
out = cv2.VideoWriter(output_path, video_FourCC, video_fps, video_size)
accum_time = 0
curr_fps = 0
fps = "FPS: ??"
prev_time = timer()
while True:
return_value, frame = vid.read()
image = Image.fromarray(frame)
image = yolo.detect_image(image)
result = np.asarray(image)
curr_time = timer()
exec_time = curr_time - prev_time
prev_time = curr_time
accum_time = accum_time + exec_time
curr_fps = curr_fps + 1
if accum_time == 10 : mouseBrush(image)
if accum_time > 1:
accum_time = accum_time - 1
fps = "FPS: " + str(curr_fps)
curr_fps = 0
cv2.putText(result, text=fps, org=(3, 15), fontFace=cv2.FONT_HERSHEY_SIMPLEX,
fontScale=0.50, color=(255, 0, 0), thickness=2)
cv2.namedWindow("result", cv2.WINDOW_NORMAL)
cv2.imshow("result", result)
if isOutput:
out.write(result)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
yolo.close_session()
Actually, this code is just one part of the all Yolo3 model, but I think the part that deals with the number of video frames is included here.
If you mean the current FPS. This is the part showing the current FPS in string.
while True:
return_value, frame = vid.read()
image = Image.fromarray(frame)
image = yolo.detect_image(image)
result = np.asarray(image)
curr_time = timer()
exec_time = curr_time - prev_time
prev_time = curr_time
accum_time = accum_time + exec_time
curr_fps = curr_fps + 1
if accum_time > 1:
accum_time = accum_time - 1
fps = "FPS: " + str(curr_fps)
curr_fps = 0
cv2.putText(result, text=fps, org=(3, 15), fontFace=cv2.FONT_HERSHEY_SIMPLEX,
fontScale=0.50, color=(255, 0, 0), thickness=2)
cv2.namedWindow("result", cv2.WINDOW_NORMAL)
cv2.imshow("result", result)
if curr_fps == 10: # Stops at 10th frame.
time.sleep(60) # Delay for 1 minute (60 seconds).
if isOutput:
out.write(result)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
I needed the frame number to control every 10th frame in the video file, and thanks to above comments, I figured out that the line I was looking for is:
curr_fps = curr_fps + 1
UPD: The following line calculated the number of frames in a video file.
NumberOfFrame = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))

Categories

Resources