Multiprocess Video Processing - Python

I would like to do video processing on neighboring frames. More specifically, I would like to compute the mean squared error between neighboring frames:
mean_squared_error(prev_frame,frame)
I know how to compute this in a linear, straightforward way: I use the imutils package, which uses a queue to decouple loading the frames from processing them. By storing them in a queue, I don't need to wait for them before I can process them. ... but I want to be even faster...
# import the necessary packages to read the video
from imutils.video import FileVideoStream
# package to compute mean squared error
from skimage.metrics import mean_squared_error

if __name__ == '__main__':
    # SPECIFY PATH TO VIDEO FILE
    file = "VIDEO_PATH.mp4"
    # START IMUTILS VIDEO STREAM
    print("[INFO] starting video file thread...")
    fvs = FileVideoStream(file).start()  # optionally: FileVideoStream(file, transform=<preprocess fn>)
    # INITIALIZE LIST to store the results
    mean_square_error_list = []
    # READ PREVIOUS FRAME
    prev_frame = fvs.read()
    # LOOP over frames from the video file stream
    while fvs.more():
        # GRAB THE NEXT FRAME from the threaded video file stream
        frame = fvs.read()
        # COMPUTE the metric
        metric_val = mean_squared_error(prev_frame, frame)
        mean_square_error_list.append(1 - metric_val)  # append to list
        # UPDATE previous frame variable
        prev_frame = frame
Now my question is: how can I multiprocess the computation of the metric to increase speed and save time?
My operating system is Windows 10 and I am using Python 3.8.0.

There are too many aspects of making things faster; I'll only focus on the multiprocessing part.
As you don't want to read the whole video at once, we have to read it frame by frame.
I'll be using OpenCV (cv2) to read the frames and NumPy to calculate the MSE and save the results to disk.
First, we can start without any multiprocessing so we can benchmark our results. I'm using a 1920x1080, 60 FPS video, duration 1:29, size 100 MB.
import os
import time

import cv2
import numpy as np

filename = '2.mp4'

def process_video():
    cap = cv2.VideoCapture(filename)
    proc_frames = 0
    mse = []
    prev_frame = None
    ret = True
    while ret:
        ret, frame = cap.read()  # reading frames sequentially
        if not ret:
            break
        if prev_frame is not None:
            # cast to float so the uint8 subtraction doesn't wrap around
            c_mse = np.mean(np.square(prev_frame.astype(np.float64) - frame))
            mse.append(c_mse)
        prev_frame = frame
        proc_frames += 1
    np.save('data/sp.npy', np.array(mse))
    cap.release()
    return

if __name__ == "__main__":
    os.makedirs('data', exist_ok=True)  # partial results are stored here
    t1 = time.time()
    process_video()
    t2 = time.time()
    print(t2 - t1)
On my system, it runs for 142 secs.
Now we can take the multiprocessing approach: make some segments (based on how many CPU cores we have) and process those segmented frames in parallel.
import os
import time
import multiprocessing as mp

import cv2
import numpy as np

filename = '2.mp4'

def process_video(group_number):
    cap = cv2.VideoCapture(filename)
    num_processes = mp.cpu_count()
    frame_jump_unit = cap.get(cv2.CAP_PROP_FRAME_COUNT) // num_processes
    # jump to the start of this worker's segment
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_jump_unit * group_number)
    proc_frames = 0
    mse = []
    prev_frame = None
    while proc_frames < frame_jump_unit:
        ret, frame = cap.read()
        if not ret:
            break
        if prev_frame is not None:
            # cast to float so the uint8 subtraction doesn't wrap around
            c_mse = np.mean(np.square(prev_frame.astype(np.float64) - frame))
            mse.append(c_mse)
        prev_frame = frame
        proc_frames += 1
    np.save('data/' + str(group_number) + '.npy', np.array(mse))
    cap.release()
    return

if __name__ == "__main__":
    os.makedirs('data', exist_ok=True)
    t1 = time.time()
    num_processes = mp.cpu_count()
    print(f'CPU: {num_processes}')

    # only meta-data
    cap = cv2.VideoCapture(filename)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_jump_unit = cap.get(cv2.CAP_PROP_FRAME_COUNT) // num_processes
    cap.release()

    p = mp.Pool(num_processes)
    p.map(process_video, range(num_processes))

    # merging: stitch the per-segment results together and fill in the
    # MSE values that fall on the segment boundaries
    final_mse = []
    for i in range(num_processes):
        na = np.load(f'data/{i}.npy')
        final_mse.extend(na)
        try:
            cap = cv2.VideoCapture(filename)  # you could also take it outside the loop to reduce some overhead
            frame_no = frame_jump_unit * (i + 1) - 1
            print(frame_no)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
            _, frame1 = cap.read()
            # cap.set(cv2.CAP_PROP_POS_FRAMES, frame_jump_unit * (i + 1))
            _, frame2 = cap.read()
            c_mse = np.mean(np.square(frame1.astype(np.float64) - frame2))
            final_mse.append(c_mse)
            cap.release()
        except Exception:
            print('failed in 1 case')
            # in the last few frames, nothing left
            pass
    t2 = time.time()
    print(t2 - t1)
    np.save('data/final_mse.npy', np.array(final_mse))
I'm just using numpy's save to store the partial results; you can try something better.
This one runs for 49.56 secs with my cpu_count = 12. There are definitely some bottlenecks that can be avoided to make it run faster.
The only issue with my implementation was that it missed the MSE for the frames where the video was segmented, but that is easy to add: since OpenCV lets us seek to an individual frame at any location in O(1), we can jump to the boundary frames, calculate their MSE separately, and merge it into the final result. [Check the updated code; it fixes the merging part.]
You can write a simple sanity check to ensure both produce the same result:
import numpy as np
a = np.load('data/sp.npy')
b = np.load('data/final_mse.npy')
print(a.shape)
print(b.shape)
print(a[:10])
print(b[:10])
for i in range(len(a)):
    if a[i] != b[i]:
        print(i)
Now, some additional speedups can come from using a CUDA-enabled OpenCV build, ffmpeg, adding a queuing mechanism on top of the multiprocessing, etc.
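A minimal sketch of that queuing-plus-multiprocessing idea, in case it helps: one reader process decodes sequentially and feeds neighboring-frame pairs into a bounded queue, while worker processes compute the MSE. The 'video.mp4' path and the queue size are placeholders, and pickling full frames through a queue has real overhead, so whether this beats the segment approach depends on frame size and core count.
import multiprocessing as mp

import cv2
import numpy as np

FILENAME = 'video.mp4'  # placeholder path

def reader(frame_queue, num_workers):
    # Decode sequentially and feed neighboring-frame pairs to the workers.
    cap = cv2.VideoCapture(FILENAME)
    idx, prev = 0, None
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if prev is not None:
            frame_queue.put((idx, prev, frame))
        prev = frame
        idx += 1
    cap.release()
    for _ in range(num_workers):      # one poison pill per worker
        frame_queue.put(None)

def worker(frame_queue, result_queue):
    while True:
        item = frame_queue.get()
        if item is None:              # poison pill: sign off and exit
            result_queue.put(None)
            break
        idx, prev, cur = item
        # cast to float so the uint8 subtraction doesn't wrap around
        mse = np.mean(np.square(prev.astype(np.float64) - cur))
        result_queue.put((idx, mse))

if __name__ == '__main__':
    num_workers = max(mp.cpu_count() - 1, 1)   # leave a core for the reader
    frame_queue = mp.Queue(maxsize=64)         # bounded, so decoding can't outrun the workers
    result_queue = mp.Queue()
    procs = [mp.Process(target=reader, args=(frame_queue, num_workers))]
    procs += [mp.Process(target=worker, args=(frame_queue, result_queue))
              for _ in range(num_workers)]
    for p in procs:
        p.start()
    results, done = [], 0
    while done < num_workers:                  # drain until every worker has signed off
        item = result_queue.get()
        if item is None:
            done += 1
        else:
            results.append(item)
    for p in procs:
        p.join()
    mse = [v for _, v in sorted(results)]      # restore frame order
    np.save('queued_mse.npy', np.array(mse))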

Related

Python multiprocessing with a while loop and shared resources

I'm new to programming and I can't seem to figure out how to correctly optimise my project. I have a function which takes 2 images and uses OpenCV to stitch them together. This process usually takes 0.5 seconds per stitched image; I would like to optimise it so that the images are stitched together at a faster rate.
At the moment I have 2 arrays, each containing 800 images, and a function called stitch_images which processes each image set. For this function I'm using a while loop to go through each image and stitch it to its corresponding image - this seems to be causing me issues, as the while loop is blocking the process. I'm also using 2 shared global variables which contain the images.
Theoretically what I would like to achieve is 4 processes, where each process takes a set of images and works on it --> effectively reducing the computational time to 1/4th.
My question is: how would I go about achieving this? I understand that there are multiple ways of doing this in Python, such as threading, multiprocessing, and queues. Which would be the best option for me? If there is an easy way to implement this, would anyone have any example code?
This is my current setup:
import multiprocessing
import time
import cv2

# Global variables:
frames_1 = []
frames_2 = []
panorama = []

# converting the video into frames for individual image processing
def convert_video_to_frames():
    cap = cv2.VideoCapture("Sample_video_1.mp4")
    ret = True
    while ret:
        ret, img = cap.read()  # read one frame from the 'capture' object; img is (H, W, C)
        if ret:
            frames_1.append(img)
    cap = cv2.VideoCapture("Sample_video_2.mp4")
    ret = True
    while ret:
        ret, img = cap.read()  # read one frame from the 'capture' object; img is (H, W, C)
        if ret:
            frames_2.append(img)
    return frames_1, frames_2

# converting final output images back to video
def convert_frames_to_video():
    print("now creating stitched image video")
    height, width, layers = panorama[0].shape
    size = (width, height)
    out = cv2.VideoWriter('project.avi', cv2.VideoWriter_fourcc(*'DIVX'), 15, size)
    for i in range(len(panorama)):
        out.write(panorama[i])
    out.release()

def stitch_images():
    print("image processing starting...")
    stitcher = cv2.Stitcher_create(cv2.STITCHER_PANORAMA)
    while len(frames_1) != 0:
        status, result = stitcher.stitch((frames_1.pop(0), frames_2.pop(0)))
        if status == 0:  # pass
            panorama.append(result)
        else:
            print("image stitching failed")

if __name__ == '__main__':
    convert_video_to_frames()  # dummy function
    start = time.perf_counter()
    stitch_images()
    finish = time.perf_counter()
    print(f'finished in {round(finish - start, 2)} seconds(s)')
    print("now converting images to video...")
    convert_frames_to_video()
Also, I've attempted to use multiprocessing and adding locks to achieve this, but adding:
p1 = multiprocessing.Process(target=stitch_images)
p2 = multiprocessing.Process(target=stitch_images)
p1.start()
p2.start()
p1.join()
p2.join()
but when I run this, it seems to skip the while loop altogether?
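A likely cause of the "skipped" loop: on Windows, multiprocessing spawns a fresh interpreter for each Process, so each child gets its own empty copies of the global frames_1/frames_2 lists (the code that fills them only ran in the parent), and while len(frames_1) != 0 is false immediately. One way around it is to stop sharing globals and instead pass each worker its own chunk of frame pairs through a multiprocessing.Pool. A rough, untested sketch; the process count and chunksize are guesses to tune, and it reuses convert_video_to_frames from the question:
import multiprocessing
import cv2

_stitcher = None

def _init_worker():
    # one Stitcher per worker process; OpenCV objects can't be pickled,
    # so they must be created after the process starts
    global _stitcher
    _stitcher = cv2.Stitcher_create(cv2.STITCHER_PANORAMA)

def stitch_pair(pair):
    # pair is a (frame_1, frame_2) tuple of numpy arrays
    status, result = _stitcher.stitch(pair)
    return result if status == cv2.Stitcher_OK else None

if __name__ == '__main__':
    frames_1, frames_2 = convert_video_to_frames()  # as defined in the question
    pairs = list(zip(frames_1, frames_2))
    with multiprocessing.Pool(processes=4, initializer=_init_worker) as pool:
        results = pool.map(stitch_pair, pairs, chunksize=8)
    panorama = [r for r in results if r is not None]
Note that shipping full frames to the workers costs pickling time, so the speedup will be less than a clean 4x.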

How to edit videos in python in OpenCV in relatively short time?

So I am doing something like this:
fourcc = cv2.VideoWriter_fourcc(*'avi')
source = cv2.VideoCapture('video.avi')
while source.isOpened():
    ret, frame = source.read()
    if type(frame) != type(None):
        for line in frame:
            for pixel in line:
                someedit()
The problem I have is that 5 seconds of 300x400 video takes around 5 minutes, even if someedit() is something basic like pixel[0]+1. Is Python generally too slow to do something like this, or is there a workaround?
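Pure-Python per-pixel loops are the problem here: every pixel access goes through the interpreter. The usual workaround is to vectorize the edit with NumPy so the whole frame is processed in C, as the snippet below does with the same +1 edit.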
# import deps
import cv2
import numpy as np

# read the video
source = cv2.VideoCapture('videoPath')
# flag
ret = True
# iterate through all the frames of the video clip present
while source.isOpened() and ret:
    # read the frame
    ret, frame = source.read()
    # check the flag
    if ret:
        # add 1 to the 0th channel
        frame[:, :, 0] = 1 + frame[:, :, 0]
print("Done")

OpenCV code snippet running slower inside Python multiprocessing process

I was doing some tests with multiprocessing to parallelize face detection and recognition, and I came across a strange behaviour in which detectMultiScale() (which performs the face detection) was running slower inside a child process than in the parent process (just calling the function).
Thus, I wrote the code below, in which 10 images are enqueued and the face detection is then performed sequentially with one of two approaches: just calling the detection function, or running it inside a single new process. For each detectMultiScale() call, the time of execution is printed. Executing this code gives me an average of 0.22s per call with the first approach and 0.54s with the second. Also, the total time to process the 10 images is greater with the second approach.
I don't know why the same code snippet runs slower inside the new process. If only the total time were greater I would understand (considering the overhead of setting up a new process), but this I don't get. For the record, I'm running it on a Raspberry Pi 3B+.
import cv2
import multiprocessing
from time import time, sleep

def detect(face_cascade, img_queue, bnd_queue):
    while True:
        image = img_queue.get()
        if image is not None:
            gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            ti = time()
            ########################################
            faces = face_cascade.detectMultiScale(
                gray_image,
                scaleFactor=1.1,
                minNeighbors=3,
                minSize=(130, 130))
            ########################################
            tf = time()
            print('det time: ' + str(tf-ti))
            if len(faces) > 0:
                max_bounds = (0, 0, 0, 0)
                max_size = 0
                for (x, y, w, h) in faces:
                    if w*h > max_size:
                        max_size = w*h
                        max_bounds = (x, y, w, h)
            img_queue.task_done()
            bnd_queue.put('bound')
        else:
            img_queue.task_done()
            break

face_cascade = cv2.CascadeClassifier('../lbpcascade_frontalface_improved.xml')
cam = cv2.VideoCapture(0)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 2592)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 1944)
cam.set(cv2.CAP_PROP_BUFFERSIZE, 1)

img_queue = multiprocessing.JoinableQueue()

i = 0
while i < 10:
    is_there_frame, image = cam.read()
    if is_there_frame:
        image = image[0:1944, 864:1728]
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img_queue.put(image)
        i += 1

bnd_queue = multiprocessing.JoinableQueue()
num_process = 1

ti = time()
# MULTIPROCESSING PROCESS APPROACH
for _ in range(num_process):
    p = multiprocessing.Process(target=detect, args=(face_cascade, img_queue, bnd_queue))
    p.start()
for _ in range(num_process):
    img_queue.put(None)
#
# FUNCTION CALL APPROACH
#img_queue.put(None)
#while not img_queue.empty():
#    detect(face_cascade, img_queue, bnd_queue)

img_queue.join()
tf = time()
print('TOTAL TIME: ' + str(tf-ti))

while not bnd_queue.empty():
    bound = bnd_queue.get()
    if bound != 'bound':
        print('ERROR')
    bnd_queue.task_done()
I am having the same issue, and I think the reason is that the task is somewhat I/O-bound, plus there is the overhead created by multiprocessing itself.
You can also read the article here: https://www.pyimagesearch.com/2019/09/09/multiprocessing-with-opencv-and-python/
The problem you mentioned specifically with the detectMultiScale() method is the same as mine. I have also tried serializing, making variables global, and making them class-level, but nothing helped.
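One diagnostic worth trying (a guess on my part, not a confirmed fix): OpenCV maintains its own internal thread pool, and its size can differ between the parent and a freshly started child process. Printing and pinning it in both places tells you whether the slowdown is a threading artifact:
import cv2

# run this both in the parent and at the top of detect() in the child
print('OpenCV threads:', cv2.getNumThreads(), 'of', cv2.getNumberOfCPUs(), 'CPUs')
cv2.setNumThreads(cv2.getNumberOfCPUs())  # or 1, to rule threading in/out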

OPENCV: Extract the lifetime for a frame in a video with variable frame rate

I'm using python OpenCV to read frames from a video file that has a variable frame rate.
I need to understand how the wall time changes between frames (i.e. I want to be able to write the timestamp on every frame). My understanding is that the underlying video format stores how long each frame lasts i.e. the video file contains the information to tell the video player how long it should display the given frame before moving to the next one.
I would like to programmatically access this data using the python OpenCV interface. An example (that does not work) of what this might look like is:
import numpy as np
import cv2 as cv

cap = cv.VideoCapture('my_variable_frame_rate_video.mp4')
while cap.isOpened():
    ret, frame = cap.read()
    frame_length_ms = ret.frame_length()  # obviously doesn't work
    print("The length of the frame in ms is {}".format(frame_length_ms))
    # Should print: The length of the frame in ms is 23
Things that don't work
Reading the source frame rate from the capture device:
fps = cap.get(cv2.cv.CV_CAP_PROP_FPS)
Counting the total number of frames:
fps = cap.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT)/total_length
Neither of these works because we are dealing with a variable frame rate.
One idea could be to measure the time elapsed between frame reads and write it on each frame:
import cv2
import time

path = "video.mp4"
video = cv2.VideoCapture(path)

ret, frame = video.read()
font = cv2.FONT_HERSHEY_SIMPLEX
i = 0
while ret:
    start = time.time()
    ret, frame = video.read()
    end = time.time()
    if not ret:  # last read failed; no frame to annotate
        break
    seconds = end - start
    cv2.putText(frame, str(seconds), (10, 500), font, 4, (255, 255, 255), 2, cv2.LINE_AA)
    cv2.imwrite(str(i) + ".jpg", frame)
    i += 1
video.release()
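Note that timing read() measures decoding time, not how long the frame is meant to be displayed. For the actual presentation timestamps, OpenCV exposes cv2.CAP_PROP_POS_MSEC; with most backends, querying it right before read() gives the timestamp (in ms) of the frame that read() returns, so differencing successive values recovers each frame's duration even at a variable frame rate. A sketch (backend support varies, so verify on your file):
import cv2

cap = cv2.VideoCapture('my_variable_frame_rate_video.mp4')
prev_ts = None
while True:
    # query the timestamp right before decoding; this is usually the
    # presentation time (ms) of the frame that read() returns next
    ts = cap.get(cv2.CAP_PROP_POS_MSEC)
    ret, frame = cap.read()
    if not ret:
        break
    if prev_ts is not None:
        print("The length of the frame in ms is {:.1f}".format(ts - prev_ts))
    prev_ts = ts
cap.release()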

Number of frames in a video using PyAV

I use the library PyAV because it's one of the fastest available with Python.
Here is a simple example of code I would like to use:
import av

video = av.open("My_Super_Video.mp4")
total_frames = # ????
for i, frame in enumerate(video.decode(video=0)):
    img = frame.to_image()  # PIL image
    print("Frame: %d/%d ..." % (i, total_frames))
I could obviously use other libraries to load the video; however, I would prefer using PyAV if possible due to its processing speed.
Question 1: Is it possible to obtain the number of frames with PyAV ? If yes, how ?
Question 2: In that case, I would consider using another library to load and process the video frame by frame. Which library would allow me to do the above with the highest speed possible? I know the following, but don't know how they compare:
PIMS: on top of PyAV, could it add some interesting features?
MoviePy (limited to videos which fit in RAM), but what about performance?
Imageio (probably the same limitation as above), but what about performance?
OpenCV (probably the same limitation as above), but what about performance?
Others?
To get the number of frames of the first video stream, do:
container = av.open("My_Super_Video.mp4")
total_frames = container.streams.video[0].frames
You can get the number of frames in a stream with the Stream.frames attribute.
Source: http://docs.mikeboers.com/pyav/develop/api/stream.html#av.stream.Stream
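One caveat worth noting (per the PyAV docs): Stream.frames comes from container metadata and is 0 when the container doesn't record a frame count, so a fallback estimate from the stream duration can be useful. A sketch; stream.duration is expressed in stream.time_base units:
import av

with av.open("My_Super_Video.mp4") as container:
    stream = container.streams.video[0]
    total_frames = stream.frames
    if total_frames == 0 and stream.duration is not None:
        # estimate from duration (in time_base units) and average frame rate
        total_frames = int(stream.duration * stream.time_base * stream.average_rate)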
Old question, but only partly answered. Let me answer the second question as well.
Question 1: Is it possible to obtain the number of frames with PyAV ? If yes, how ?
import av
with av.open("My_Super_Video.mp4") as container:
total_frames = container.streams.video[0].frames
Question 2: In the case, I would consider using another library to load and process the video frame by frame. Which library would allow me to do the above with the highest speed as possible. I know the followings, but don't know how they compare: [...]
ImageIO timings: 0.497
PyAV timings: 0.908
MoviePy timings: 0.766
OpenCV timings: 0.766
OpenCV timings: 0.569 (no conversion to RGB)
ImageIO is the fastest; hands down. OpenCV comes close (14% slower), but only if you can do your processing in BGR. If you have to work in RGB then the conversion costs you dearly (54% slower 🥵).
That said, it is highly workload-dependent and you should always benchmark with your specific setup. In practice, the difference is often negligible compared to how much time you spend processing each frame.
Here is the benchmark code for those interested:
import cv2
import av
import imageio.v3 as iio
from moviepy.editor import VideoFileClip
from PIL import Image
from timeit import Timer
# create a test video (roughly 11 sec and sane encoding)
frames = iio.imread("imageio:cockatoo.mp4", plugin="pyav")
iio.imwrite("test_video.mp4", frames, plugin="pyav", codec="h264")
def iio_read():
total_frames = iio.improps("test_video.mp4", plugin="pyav").shape[0]
for idx, frame in enumerate(iio.imiter("test_video.mp4", plugin="pyav")):
foo = Image.fromarray(frame)
# Note: I will not print in the benchmark. This will skew the result
# print("Frame: %d/%d ..." % (idx, total_frames))
def av_read():
with av.open("test_video.mp4") as container:
total_frames = container.streams.video[0].frames
for frame in container.decode(video=0):
foo = frame.to_image()
def moviepy_read():
# Can not read frame_count
for frame in VideoFileClip("test_video.mp4").iter_frames():
foo = Image.fromarray(frame)
def cv2_read():
cap = cv2.VideoCapture("test_video.mp4")
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
success, frame = cap.read()
idx = 0
while success:
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
foo = Image.fromarray(frame)
success, frame = cap.read()
idx += 1
def cv2_read2():
cap = cv2.VideoCapture("test_video.mp4")
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
success, frame = cap.read()
idx = 0
while success:
foo = Image.fromarray(frame)
success, frame = cap.read()
idx += 1
repeats = 10
time_moviepy = min(Timer("moviepy_read()", globals=globals()).repeat(repeats, number=1))
time_cv2 = min(Timer("cv2_read()", globals=globals()).repeat(repeats, number=1))
time_cv2_no_convert = min(Timer("cv2_read2()", globals=globals()).repeat(repeats, number=1))
time_iio = min(Timer("iio_read()", globals=globals()).repeat(repeats, number=1))
time_av = min(Timer("av_read()", globals=globals()).repeat(repeats, number=1))
print(
f"""
ImageIO timings: {time_iio:.3f}
PyAV timings: {time_av:.3f}
MoviePy timings: {time_moviepy:.3f}
OpenCV timings: {time_cv2:.3f}
OpenCV timings: {time_cv2_no_convert:.3f} (no conversion to RGB)
"""
)
Package Versions:
av==10.0.0
moviepy==1.0.3
Pillow==9.4.0
opencv-python==4.7.0.68
imageio==2.25.0
