I'm running object detection on a webcam stream or on video files, using Python on Ubuntu 18.04. So far, I've been running the inference frame by frame, using this code:
import cv2
import mxnet as mx
import gluoncv as gcv

def main():
    ctx = mx.gpu(0)
    # Load pretrained model
    net = gcv.model_zoo.get_model('ssd_512_mobilenet1.0_coco', pretrained=True)
    net.hybridize()
    # Load the webcam handler
    cap = cv2.VideoCapture(0)
    count_frame = 0
    while True:
        print(f"Frame: {count_frame}")
        # Load frame from the camera
        ret, frame = cap.read()
        # print(type(frame))
        if (cv2.waitKey(25) & 0xFF == ord('q')) or (ret == False):
            cv2.destroyAllWindows()
            cap.release()
            print("Done!!!")
            break
        # Image pre-processing
        frame = mx.nd.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).astype('uint8')
        frame_nd, frame_np = gcv.data.transforms.presets.ssd.transform_test(frame, short=512, max_size=700)
        # Run frame through network
        frame_nd = frame_nd.as_in_context(ctx)
        class_IDs, scores, bounding_boxes = net(frame_nd)
        # Display result with cv
        img = gcv.utils.viz.cv_plot_bbox(frame_np, bounding_boxes[0], scores[0], class_IDs[0], thresh=0.3, class_names=net.classes)
        gcv.utils.viz.cv_plot_image(img)
        count_frame += 1
    cv2.destroyAllWindows()
    cap.release()

if __name__ == "__main__":
    main()
I wanted to try an alternative version, where I don't perform the detection frame by frame but on batches of frames.
I tried it this way:
create an empty list at the beginning;
append every frame (after image pre-processing) to the list;
after N frames (say, N=50), convert the list to an mx.nd.array;
feed said array to the model.
So, with this code:
def main():
    ctx = mx.gpu(0)
    # Load a pretrained model
    net = gcv.model_zoo.get_model('ssd_512_mobilenet1.0_coco', pretrained=True)
    net.hybridize()
    # Load the webcam handler
    cap = cv2.VideoCapture(0)
    count_frame = 0
    batch = []
    while True:
        print(f"Frame: {count_frame}")
        # Load frame from the camera
        ret, frame = cap.read()
        if (cv2.waitKey(25) & 0xFF == ord('q')) or (ret == False):
            cv2.destroyAllWindows()
            cap.release()
            print("Done!!!")
            break
        # Image pre-processing
        frame = mx.nd.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).astype('uint8')
        frame_nd, frame_np = gcv.data.transforms.presets.ssd.transform_test(frame, short=512, max_size=700)
        batch.append(frame_nd)
        if (count_frame > 0) and (count_frame % 50 == 0):
            print("\tStarting detection.")
            batch_nd = mx.nd.array(batch)
            batch_nd = batch_nd.as_in_context(ctx)
            class_IDs, scores, bounding_boxes = net(batch_nd)
            print("\tDetection performed.")
        count_frame += 1
    cv2.destroyAllWindows()
    cap.release()

if __name__ == "__main__":
    main()
The problem is that, when I run it, the execution gets completely stuck when reaching the line:
batch_nd = mx.nd.array(batch)
For reference, this is the output:
Frame: 0
Frame: 1
Frame: 2
Frame: 3
Frame: 4
Frame: 5
Frame: 6
Frame: 7
Frame: 8
Frame: 9
Frame: 10
Frame: 11
Frame: 12
Frame: 13
Frame: 14
Frame: 15
Frame: 16
Frame: 17
Frame: 18
Frame: 19
Frame: 20
Starting detection.
Any clue what I'm doing wrong? Is there a better way to send batches of frames to the model?
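One variant I have been considering (just a sketch, untested; it assumes every frame_nd returned by transform_test has the same shape, which should hold for a fixed-resolution webcam) is to join the per-frame arrays along their existing batch dimension with mx.nd.concat, instead of calling mx.nd.array on a Python list of NDArrays:

    # untested sketch: replacement for the detection branch inside the loop
    if (count_frame > 0) and (count_frame % 50 == 0):
        print("\tStarting detection.")
        # each frame_nd is (1, 3, H, W); concatenating along dim 0 gives (N, 3, H, W)
        batch_nd = mx.nd.concat(*batch, dim=0)
        batch_nd = batch_nd.as_in_context(ctx)
        class_IDs, scores, bounding_boxes = net(batch_nd)
        print("\tDetection performed.")
        batch = []  # start collecting the next batch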
I am trying to run simple object detection on a webcam using YOLOv5, but I keep getting the error below.
zsh: segmentation fault
The camera appears to open, then shuts off immediately, and the code exits with the above error.
Here is my code:
import cv2
import numpy as np
import torch
from PIL import Image

def object_detector():
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    # mmocr = MMOCR(det='TextSnake', recog='SAR')
    cam = cv2.VideoCapture(0)
    while True:
        ret, frame = cam.read()
        # ocr_result = mmocr.readtext(frame, output='demo/cam.jpg', export='demo/', print_result=True, imshow=True)
        # print("RESULT \n ", ocr_result)
        frame = frame[:, :, [2, 1, 0]]
        frame = Image.fromarray(frame)
        frame = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)
        # ocr_result = mmocr.readtext(frame, output='demo/cam.jpg', export='demo/', print_result=True, imshow=True)
        # print("RESULT \n ", ocr_result)
        result = model(frame, size=640)
        # Results
        # crops = result.crop(save=True)
        cv2.imshow('YOLO', np.squeeze(result.render()))
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cam.release()
    cv2.destroyAllWindows()
What am I doing wrong and how can I fix it?
You're not testing the return value from cam.read(). Ensure that ret indicates success and that frame is not None before you proceed.
You need to check that an image was actually returned in the first place. The first value returned by cam.read() tells you whether a frame has been received. This is how you can make use of it:
...
while True:
    ret, frame = cam.read()
    if ret:
        frame = frame[:, :, [2,1,0]]
        ...
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cam.release()
cv2.destroyAllWindows()
I am attempting to save the output of the code proposed here: https://www.programmersought.com/article/34317272452/
The output video is created, but when I try to play it, I get this message: 'an error occurred, could not demultiplex stream'. I have also tried several ways to save the video; for example, I used the shape of the frames for the writer size, but that is not recognized either. Could anyone help me figure out what I am doing wrong?
import cv2

def read_video(video):
    # Saliency detection algorithm
    saliency_algorithm = "BinWangApr2014"
    start_frame = 0
    if saliency_algorithm is None or video is None:
        print('Please set saliency_algorithm and video')
        return
    cap = cv2.VideoCapture(video)
    video = cv2.VideoWriter('out_j.avi', cv2.VideoWriter_fourcc(*'XVID'), 5, (360, 360))
    # Set the video start frame
    cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
    _, frame = cap.read()
    if frame is None:
        print('Please set saliency_algorithm and video')
        return
    image = frame.copy()
    if saliency_algorithm.find("BinWangApr2014") == 0:
        saliency_algorithm = cv2.saliency.MotionSaliencyBinWangApr2014_create()
        # set the size of the data structure
        saliency_algorithm.setImagesize(image.shape[1], image.shape[0])
        # Initialization
        saliency_algorithm.init()
        paused = False
        while True:
            if not paused:
                _, frame = cap.read()
                if frame is None:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                start = cv2.getTickCount()
                success, saliencyMap = saliency_algorithm.computeSaliency(frame)
                duration = (cv2.getTickCount() - start) / cv2.getTickFrequency()
                # print("computeBinaryMap cost time is: {} ms".format(duration * 1000))
                video.write(frame)
                # cv2.imshow('image', frame)
                cv2.imshow('saliencyMap', saliencyMap * 255)
            c = cv2.waitKey(2)
            c = chr(c) if c != -1 else 0
            if c == 'q':
                break
            if c == 'p':
                paused = not paused
            # video.write(frame)
        cv2.destroyAllWindows()
        video.release()
        # camera.stop()
    return
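For completeness, this is the kind of writer setup I understand should match grayscale frames (a sketch based on my reading of the cv2.VideoWriter documentation, not tested; the input path is a placeholder): frameSize has to equal the (width, height) of every frame passed to write(), and isColor has to be False when the frames are single-channel.

import cv2

cap = cv2.VideoCapture('input.avi')  # placeholder path
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = None
while True:
    ok, frame = cap.read()
    if not ok:
        break
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    if out is None:
        # frameSize is (width, height) and must match every written frame;
        # isColor=False because the frames are single-channel after cvtColor
        h, w = gray.shape[:2]
        out = cv2.VideoWriter('out_j.avi', fourcc, 5, (w, h), isColor=False)
    out.write(gray)
cap.release()
if out is not None:
    out.release()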
Thanks in advance
I'm doing a violence detection project and I have trained my model using ResNet, but when I just print the probability it gives an error.
# import packages done

# load model
model = load_model('E:\Docs & Other\C-VS\ViolenceDetection\Resources\VDresnet152v2.h5')
img_width, img_hight = 224, 224

# start webcam
cap = cv2.VideoCapture(0)

# start reading images and prediction
while True:
    # read image from webcam
    responce, color_img = cap.read()
    # if responce is False, break the loop
    if responce == False:
        break
    # resize image to the network input size
    color_img = cv2.resize(color_img, (224, 224))
    color_img = color_img.reshape(1, 224, 224, 3)
    pred_prob = model.predict(color_img)
    print(pred_prob[0][0].round(2))
    pred = np.argmax(pred_prob)
    # display image
    cv2.imshow('LIVE', color_img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the VideoCapture object
cap.release()
cv2.destroyAllWindows()
What am I doing wrong? When I run this code, it gives an error.
After this line:
color_img = color_img.reshape(1,224,224,3)
color_img is a 4D tensor (with separate color planes!), not usable with cv2.imshow().
Please do not "recycle" variable names, especially if their context changes:
color_img = cv2.resize(color_img,(224,224))
color_tensor = color_img.reshape(1,224,224,3)
pred_prob = model.predict(color_tensor)
...
cv2.imshow('LIVE', color_img)
I am working with a 2h-long 4K MP4 video shot at 25 fps. I am using OpenCV in Python 3. From the video, I need to extract 3 circular ROIs.
Because of the large number of frames (212831), processing the whole video frame by frame with the code below takes over 24h on a 64 GB RAM workstation. Is there a way to speed up the processing or a workaround that does something slightly different to a similar effect?
Here is the code.
import numpy as np
import cv2
import sys
import time

# name of source video and paths
video = 'C0023_2hanalysis'
input_vidpath = 'path_to_video/' + video + '.MP4'
output_vidpath = 'path_to_video/' + video + '-withROI.MP4'
codec = 'avc1'

# set ROI coordinates extrapolated from last video frame as well as fixed parameters for analysis (radius)
x = 1188  # in pixels
y = 1204  # in pixels
radius = 75

# set parameters for output video as identical to input
fps = 25.0
scaling = 1.0  # output vs input video speed (?)

## import video
cap = cv2.VideoCapture(input_vidpath)
if cap.isOpened() == False:
    sys.exit('Video file cannot be read! Please check input_vidpath to ensure it is correctly pointing to the video file')

## Video writer class to output video
fourcc = cv2.VideoWriter_fourcc(*codec)  # concatenate the 4 chars to a fourcc code, i.e. the 4-char name of the codec used to compress the frames
# adjust output frame size to scaling if any is applied
# (frame shape is given as height,width, so the output needs to be re-ordered to match VideoWriter arguments)
o_height = cap.read()[1].shape[0]
o_width = cap.read()[1].shape[1]
output_framesize = (int(o_width*scaling), int(o_height*scaling))
out = cv2.VideoWriter(filename=output_vidpath, fourcc=0x7634706d, fps=fps, frameSize=output_framesize, isColor=True)

## apply ROI frame by frame and thread them back into output video
start = time.time()
f = -1
last = 0
while True:
    # Capture frame-by-frame
    ret, frame = cap.read()  # 'return' value (T/F) and frame
    this = cap.get(1)  # get 'CV_CAP_PROP_POS_FRAMES'
    if ret == True:
        # frame = cv2.resize(frame, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_LINEAR)  # no need to resize in this case
        # Apply mask to area of interest
        mask = np.zeros((o_height, o_width), np.uint8)
        mask = cv2.circle(mask, (x, y), radius, 255, thickness=-1)  # image, row and column coord of centre of circle, radius, color (white), thickness
        frame[mask == 0] = 0
        out.write(frame)
        key = cv2.waitKey(1) & 0xFF
        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break
    f += 1
    if f % 1000 == 0:
        print(f)
    if last == this:
        break
    last = this

## When everything done, release the capture
cap.release()
out.release()
cv2.destroyAllWindows()
cv2.waitKey(1)

## End time and duration
end = time.time()
duration = end - start
print("--- %s seconds ---" % duration)
This is a common mistake. You shouldn't call waitKey(1) when you want to process the frames as fast as possible. That function basically adds a short sleep after processing each frame, and that sleep time can be much longer than the processing time.
You just need to remove it; you can still kill the process if you want to stop in the middle.
import numpy as np
import cv2
import sys
import time

# name of source video and paths
video = 'C0023_2hanalysis'
input_vidpath = 'path_to_video/' + video + '.MP4'
output_vidpath = 'path_to_video/' + video + '-withROI.MP4'
codec = 'avc1'

# set ROI coordinates extrapolated from last video frame as well as fixed parameters for analysis (radius)
x = 1188  # in pixels
y = 1204  # in pixels
radius = 75

# set parameters for output video as identical to input
fps = 25.0
scaling = 1.0  # output vs input video speed (?)

## import video
cap = cv2.VideoCapture(input_vidpath)
if cap.isOpened() == False:
    sys.exit('Video file cannot be read! Please check input_vidpath to ensure it is correctly pointing to the video file')

## Video writer class to output video
fourcc = cv2.VideoWriter_fourcc(*codec)  # concatenate the 4 chars to a fourcc code, i.e. the 4-char name of the codec used to compress the frames
# adjust output frame size to scaling if any is applied
# (frame shape is given as height,width, so the output needs to be re-ordered to match VideoWriter arguments)
o_height = cap.read()[1].shape[0]
o_width = cap.read()[1].shape[1]
output_framesize = (int(o_width*scaling), int(o_height*scaling))
out = cv2.VideoWriter(filename=output_vidpath, fourcc=0x7634706d, fps=fps, frameSize=output_framesize, isColor=True)

## apply ROI frame by frame and thread them back into output video
start = time.time()
f = -1
last = 0
while True:
    # Capture frame-by-frame
    ret, frame = cap.read()  # 'return' value (T/F) and frame
    this = cap.get(1)  # get 'CV_CAP_PROP_POS_FRAMES'
    if ret == True:
        # frame = cv2.resize(frame, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_LINEAR)  # no need to resize in this case
        # Apply mask to area of interest
        mask = np.zeros((o_height, o_width), np.uint8)
        mask = cv2.circle(mask, (x, y), radius, 255, thickness=-1)  # image, row and column coord of centre of circle, radius, color (white), thickness
        frame[mask == 0] = 0
        out.write(frame)
    f += 1
    if f % 1000 == 0:
        print(f)
    if last == this:
        break
    last = this

## When everything done, release the capture
cap.release()
out.release()
cv2.destroyAllWindows()
cv2.waitKey(1)

## End time and duration
end = time.time()
duration = end - start
print("--- %s seconds ---" % duration)
I have a project in which I should implement a video stabilizer using FFT and filters (LPF or HPF).
Here is part of the code that I want to modify:
import cv2
import numpy as np

# Create a VideoCapture object and read from input file
cap = cv2.VideoCapture('Vibrated2.avi')

# load data
data = np.loadtxt('Vibrated2.txt', delimiter=',')

# Define the codec and create VideoWriter object
frame_width = int(cap.get(3))
frame_height = int(cap.get(4))
out = cv2.VideoWriter('Stabilized.avi', cv2.VideoWriter_fourcc('M','J','P','G'), 25, (frame_width, frame_height))

# Check if camera opened successfully
if cap.isOpened() == False:
    print("Error opening video stream or file")

def transform(frame, param):
    ti = cv2.getRotationMatrix2D((0,0), 0, 1)
    ti[0,2] += param[0]
    ti[1,2] += param[1]
    frame = cv2.warpAffine(frame, ti, frame.shape[1:-4:-1])
    return frame

num = 0
# Read until video is completed
while cap.isOpened():
    # Capture frame-by-frame
    ret, frame = cap.read()
    if ret == True:
        # apply transformation
        frame = transform(frame, data[num])
        print(frame, "dn")
        num += 1
        # Display the resulting frame
        cv2.imshow('Frame', frame)
        # Write the frame into the file 'output.avi'
        out.write(frame)
        # Press Q on keyboard to exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break
    # Break the loop
    else:
        break

# When everything done, release the video capture object
cap.release()
# Closes all the frames
cv2.destroyAllWindows()
There is a video named Vibrated1.avi,
and a text file named Vibrated1.txt which contains the frame-to-frame differences; it looks like this:
0.341486, -0.258215
0.121945, 1.27605
-0.0811261, 0.78985
...
I don't know how and where I should add filters to this code to remove the video vibration.
Could anyone help me?
I can write short pseudocode with C++ parts:
cv::Mat homoFiltered = cv::Mat::eye(3, 3, CV_32F);
const double alpha = 0.9;
cv::Mat a1 = cv::Mat::eye(3, 3, CV_32F) * alpha;
cv::Mat a2 = cv::Mat::eye(3, 3, CV_32F) * (1. - alpha);

while cap >> frame:
    cv::Mat homo = CalcHomography(frame)
    homoFiltered = a1 * (homoFiltered * homo) + a2 * homo;
    cv::warpPerspective(..., homoFiltered, ...)
alpha - low pass smoothing coefficient in [0; 1]
a1 and a2 - matrices for applying alpha and (1 - alpha) to the transformation matrix
homoFiltered - filtered transformation matrix
homo - current matrix between Frame(t-1) and Frame(t)
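Roughly the same idea in Python, adapted to the translation-only offsets in the question's text file (a sketch only; the alpha value, the trajectory-smoothing scheme, and the reuse of the question's file names are assumptions, not tested code):

import cv2
import numpy as np

# Sketch: exponential low-pass filter on the accumulated camera trajectory.
# data[i] = (dx, dy) offset between frame i-1 and frame i (from the .txt file).
cap = cv2.VideoCapture('Vibrated2.avi')
data = np.loadtxt('Vibrated2.txt', delimiter=',')
w, h = int(cap.get(3)), int(cap.get(4))
out = cv2.VideoWriter('Stabilized.avi', cv2.VideoWriter_fourcc(*'MJPG'), 25, (w, h))

alpha = 0.9              # low-pass smoothing coefficient in [0, 1]; higher = smoother
raw = np.zeros(2)        # accumulated (shaky) trajectory
smooth = np.zeros(2)     # low-pass filtered trajectory

num = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret or num >= len(data):
        break
    raw += data[num]                                # integrate per-frame offsets
    smooth = alpha * smooth + (1 - alpha) * raw     # exponential moving average
    correction = smooth - raw                       # shift that removes the jitter
    ti = np.float32([[1, 0, correction[0]],
                     [0, 1, correction[1]]])        # translation-only affine matrix
    out.write(cv2.warpAffine(frame, ti, (w, h)))
    num += 1

cap.release()
out.release()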