To start, this is a continuation of this question: Multithreading degrades GPU performance. That question was never resolved because no one was able to reproduce the results, so I have created a new question with code that reproduces the slower results described there.
To recap: when using cv2.VideoCapture with multi-threading, the inference time for Detectron2 is much higher than when multi-threading is disabled.
For additional context, I am running on Windows with an RTX 3070, so inference times may differ slightly for anyone rerunning this.
Here is the code:
import time
import cv2
from queue import Queue
from threading import Thread

from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor


class FileVideoStream:
    def __init__(self, path, queueSize=15):
        self.stream = cv2.VideoCapture(path)
        self.stopped = False
        self.Q = Queue(maxsize=queueSize)

    def start(self):
        t = Thread(target=self.update, args=())
        t.daemon = True
        t.start()
        return self

    def update(self):
        while True:
            if self.stopped:
                self.stream.release()
                return
            if not self.Q.full():
                (grabbed, frame) = self.stream.read()
                if not grabbed:
                    self.stop()
                    return
                self.Q.put(frame)

    def stop(self):
        # signal the reader thread to stop and release the capture
        self.stopped = True


cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
)
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # set threshold for this model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
)
cfg.MODEL.DEVICE = "cuda"

predictor = DefaultPredictor(cfg)


def threading_example():
    print("Threading Example:")
    fvs = FileVideoStream(r"DemoVideo.mp4")
    fvs.start()
    # allow time for thread to fill the queue
    time.sleep(1)
    for i in range(5):
        img = fvs.Q.get()
        start = time.time()
        p = predictor(img)
        end = time.time()
        print(f"Frame {i} Prediction: {(end - start):.2f}s")
    fvs.stopped = True


def non_threading_example():
    print("Non-Threading Example:")
    video = cv2.VideoCapture(r"DemoVideo.mp4")
    for i in range(5):
        _, img = video.read()
        start = time.time()
        p = predictor(img)
        end = time.time()
        print(f"Frame {i} Prediction: {(end - start):.2f}s")


non_threading_example()
threading_example()
This produces the following output:
Non-Threading Example:
Frame 0 Prediction: 1.41s
Frame 1 Prediction: 0.14s
Frame 2 Prediction: 0.14s
Frame 3 Prediction: 0.14s
Frame 4 Prediction: 0.14s
Threading Example:
Frame 0 Prediction: 10.55s
Frame 1 Prediction: 10.41s
Frame 2 Prediction: 10.77s
Frame 3 Prediction: 10.64s
Frame 4 Prediction: 10.27s
EDIT: In response to a comment, I've added code to test whether inference itself slows down when run inside a thread; that does not appear to be the case.
def infer_5(img):
    for i in range(5):
        start = time.time()
        p = predictor(img)
        end = time.time()
        print(f"Frame {i}: {(end - start):.2f}s")


def system_load():
    img = cv2.imread(r"Image.jpg")
    t = Thread(target=infer_5, args=(img,))
    t.start()
Frame 0: 7.51s
Frame 1: 0.39s
Frame 2: 0.15s
Frame 3: 0.15s
Frame 4: 0.15s
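For completeness, here is the timing variant I can use to rule out asynchronous CUDA execution skewing these numbers (just a sketch; it assumes torch is importable, which it is since Detectron2 runs on PyTorch):

import torch

def timed_inference(img, n=5):
    predictor(img)  # warm-up call so model loading / cuDNN autotuning is excluded
    for i in range(n):
        torch.cuda.synchronize()   # make sure all previous GPU work has finished
        start = time.time()
        p = predictor(img)
        torch.cuda.synchronize()   # wait for this prediction to finish on the GPU
        end = time.time()
        print(f"Frame {i} Prediction: {(end - start):.2f}s")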
Related
I would like to implement a status bar that gives the user feedback on the progress of the script. I have built a script that loops through a video directory and, for every video it finds, runs a process against it. While the initial code works, the problem is that when the code finishes using a trained TensorFlow model, it fails to release the GPU memory reserved for the process. My solution is to run a separate script that has values passed through to it from the initial script. Any help would be appreciated.
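To make the "separate script" idea concrete, this is roughly what I mean by handing each video off to its own process so the GPU memory is returned when that process exits (a sketch only; the function body is illustrative, and the model file name is the one used in my code below):

import multiprocessing as mp

def process_one_video(video_path, gpu_id):
    # everything TensorFlow-related happens inside the child process, so the
    # GPU memory it reserves is freed when the child terminates
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    import tensorflow as tf  # imported here so the parent never touches the GPU
    model = tf.keras.models.load_model('SQ-D4096-V7.h5', compile=False)
    # ... run detection on video_path and report results back ...

if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # fresh interpreter per video
    for video in ["/Data/2_DeInter/Untitled_5.avi"]:
        p = ctx.Process(target=process_one_video, args=(video, 0))
        p.start()
        p.join()  # memory is returned to the driver once the child exits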
Here is the code I referenced for building a multi-bar progress output:
import multiprocessing
import random
from concurrent.futures import ProcessPoolExecutor
from time import sleep

#from rich import progress
from rich.progress import Progress


def long_running_fn(progress, task_id):
    len_of_task = random.randint(3, 20)  # take some random length of time
    for n in range(0, len_of_task):
        sleep(1)  # sleep for a bit to simulate work
        progress[task_id] = {"progress": n + 1, "total": len_of_task}


if __name__ == "__main__":
    n_workers = 8  # set this to the number of cores you have on your machine
    with Progress() as progress:
        futures = []  # keep track of the jobs
        with multiprocessing.Manager() as manager:
            # this is the key - we share some state between our
            # main process and our worker functions
            _progress = manager.dict()
            overall_progress_task = progress.add_task("[green]All jobs progress:")
            with ProcessPoolExecutor(max_workers=n_workers) as executor:
                for n in range(0, 20):  # iterate over the jobs we need to run
                    # set visible false so we don't have a lot of bars all at once:
                    task_id = progress.add_task(f"task {n}", visible=False)
                    futures.append(executor.submit(long_running_fn, _progress, task_id))

                # monitor the progress:
                while (n_finished := sum([future.done() for future in futures])) < len(
                    futures
                ):
                    progress.update(
                        overall_progress_task, completed=n_finished, total=len(futures)
                    )
                    for task_id, update_data in _progress.items():
                        latest = update_data["progress"]
                        total = update_data["total"]
                        # update the progress bar for this task:
                        progress.update(
                            task_id,
                            completed=latest,
                            total=total,
                            visible=latest < total,
                        )

                # raise any errors:
                for future in futures:
                    future.result()
This gives a result that looks like this:
[screenshot: progress bars]
When I implement this in my code, the progress bars work, but only until the GPU memory fills up, at which point I am unable to keep the code running. If I run a two-script variant without a progress bar, everything works perfectly.
Here is the code that I currently have. I apologize in advance; it's a work in progress.
def detect_frames(progress, task_id, gpu, filename, xl, size):
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)  # "1"
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            try:
                tf.config.experimental.set_virtual_device_configuration(gpu,
                    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*size)])  # 1024  # starts mem limit at 1GB, 5715 is for the R720
            except RuntimeError as e:
                print(e)
    model = tensorflow.keras.models.load_model('SQ-D4096-V7.h5', compile=False)
    labels = ['Corrupt', 'Good']  # needs to be in same order as text file
    start_time = time.time()
    f3 = 0
    T = 0   # time
    T2 = 0  # time2 for testing
    cap = cv.VideoCapture(filename)
    fps = cap.get(cv.CAP_PROP_FPS)
    totalNoFrames = cap.get(cv.CAP_PROP_FRAME_COUNT)  # /30
    durationInSeconds = (totalNoFrames / fps) / fps
    good = []
    corrupt = []
    bad = []
    count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        count += 120
        cap.set(cv.CAP_PROP_POS_FRAMES, count)
        fnum = cap.get(cv.CAP_PROP_POS_FRAMES)
        progress[task_id] = {"progress": fnum, "total": totalNoFrames}
        # ... rest of def ...
if __name__ == "__main__":
CWD = os.path.dirname(os.path.realpath(__file__))
nvidia_smi.nvmlInit()
try:
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
#gpus = len(i)
#print("Num GPU: ",gpus)
handle = nvmlDeviceGetHandleByIndex(i)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
card = nvmlDeviceGetName(handle)
print("Device", i, ":", nvmlDeviceGetName(handle))
print("Total memory: %sGB" %(info.total//(1.024*10**9)))
print("Free memory: %sGB" %(info.free//(1.024*10**9)))
print("Used memory: %sGB" %(info.used//(1.024*10**9)))
Num = (info.free//(1.024*10**9))-1
print("Max number of scripts at once: ",Num)
#print("Recommended Max number of scripts at once, per GPU: ",Num-1)
except NVMLError as error:
print(error)
numgpu = deviceCount
nvidia_smi.nvmlShutdown()
# Input area
#%%
#Where are the videos located?
#file = glob.glob("/Data/2_DeInter/*.avi") #for production
file = glob.glob("/Data/2_DeInter/Untitled_5.avi") #for Testing
#file = sys.argv[1]
#print(file)
#How many videos do you want to process per GPU?
#Num = 12
NumProc = 1
#nvidia_smi.nvmlInit()
if (str(card) == "b'NVIDIA GeForce GTX 1050 Ti'") and (NumProc > 2 ):
print("Unfortunately the 1050 Ti can only process two streams per card:")
print("Changing number of processes to 2!")
NumProc = 2
#if NumProc == 1:
# MemFree = ((Num//NumProc)-1)
#else:
# MemFree = (Num//NumProc)
print("NumProc: ",NumProc)
MemFree = 1/((Num*1.2)/NumProc)
print("Memory free:",MemFree)
size = (Num/NumProc)#MemFree #2 #1.2 #size of the tensorflow model
print('Memory size Used: ',size)
#print(size)
print("\n")
n_workers = NumProc#*numgpu #2
#%%
gpu = 0
x = 0
T = 0
nvidia_smi.nvmlInit()
with Progress() as progress:
futures = [] # keep track of the jobs
# with multiprocessing.Manager() as manager:
# # this is the key - we share some state between our
# # main process and our worker functions
# _progress = manager.dict()
# overall_progress_task = progress.add_task("[green]All jobs progress:")
with multiprocessing.Manager() as manager:
# this is the key - we share some state between our
# main process and our worker functions
_progress = manager.dict()
overall_progress_task = progress.add_task("[green]All jobs progress:")
#for n in range(len(file)):
with ProcessPoolExecutor(max_workers=n_workers) as executor:
#with ProcessPoolExecutor(max_workers=n_workers) as executor:
while True:
#for n in range(len(file)):
#task_id = progress.add_task(f"task {x}", visible=False)
task_id = progress.add_task(f"Processing video: {Path(file[x]).name}", visible=False)
#print("X: %s"%x)
#while True:
#for gpu in range(deviceCount):
#print("GPU: ",gpu)
print("X:",x)
# if x > NumProc: #clear the tf models
# tf.keras.backend.clear_session()
# tf.compat.v1.reset_default_graph()
# del model
# load.model()
#print("Mem Free: ",info.free//(size*10**9)-1)
#with Progress() as progress:
# futures = []
while info.free/(size*10**9) <= MemFree:
#print("Mem Free: ",info.free/(1.024*10**9))
#print("Mem Free: ",info.free/(size*10**9))
#print("No Free Memory!")
#print("Checking GPU: %s for space"%gpu)
#print("Update Progress Bar")
if (n_finished := sum([future.done() for future in futures])) < len(futures):
progress.update(overall_progress_task, completed=n_finished, total=len(futures))
for task_id, update_data in _progress.items():
latest = update_data["progress"]
total = update_data["total"]
# update the progress bar for this task:
progress.update(task_id,completed=latest,total=total,visible=latest < total,)
time.sleep(1)
gpu = gpu+1
if gpu == deviceCount:
gpu = 0
handle = nvmlDeviceGetHandleByIndex(gpu)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
if info.free/(size*10**9) > MemFree:
tf.config.experimental.set_virtual_device_configuration(gpu,
futures.append(executor.submit(detect_frames, _progress, task_id, gpu, file[x], x, size))
if (n_finished := sum([future.done() for future in futures])) < len(futures):
progress.update(overall_progress_task, completed=n_finished, total=len(futures))
for task_id, update_data in _progress.items():
latest = update_data["progress"]
total = update_data["total"]
# update the progress bar for this task:
progress.update(task_id,completed=latest,total=total,visible=latest < total,)
time.sleep(5)
#print("Update Progress Bar")
if (n_finished := sum([future.done() for future in futures])) < len(futures):
progress.update(overall_progress_task, completed=n_finished, total=len(futures))
for task_id, update_data in _progress.items():
latest = update_data["progress"]
total = update_data["total"]
# update the progress bar for this task:
progress.update(task_id,completed=latest,total=total,visible=latest < total,)
x = x+1
gpu = gpu+1
if gpu == deviceCount:
gpu = 0
#break
if x == len(file):
print("End of file list")
#print("Update Progress Bar")
while (n_finished := sum([future.done() for future in futures])) < len(futures):
progress.update(overall_progress_task, completed=n_finished, total=len(futures))
for task_id, update_data in _progress.items():
latest = update_data["progress"]
total = update_data["total"]
# update the progress bar for this task:
progress.update(task_id,completed=latest,total=total,visible=latest < total,)
break
handle = nvmlDeviceGetHandleByIndex(gpu)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
nvidia_smi.nvmlShutdown()
`
I have implemented a Python app that performs real-time car-model and license-plate recognition from a camera using YOLOv3 / OpenCV with GPU. Frames are captured in a thread, giving a decent FPS.
I am trying to update it to stream from 4 cameras and show the frames in a PySimpleGUI window. However, I can't figure out how to make the camera streams run in parallel. It seems that each stream waits for the previous one to finish before delivering a frame, so the overall FPS drops to 1/4.
if __name__ == '__main__':
    Results.num_cam = 2
    # initialize the video streams and allow them to warm up
    img_size_x, img_size_y = 1280, 800
    window = create_window(img_size_x, img_size_y, Results.num_cam)
    print("[INFO] starting cameras...")
    prev_frame_time = 0
    prev_frame_time = 0
    if Results.num_cam == 2:
        webcam1 = VideoStream(src='rtsp://admin:VVZOZR#192.168.1.11:554/11').start()
        webcam2 = VideoStream(src='rtsp://admin:VVZOZR#192.168.1.11:554/11').start()
    else:
        webcam1 = VideoStream(src='rtsp://admin:VVZOZR#192.168.1.11:554/11').start()
        webcam2 = VideoStream(src='rtsp://admin:VVZOZR#192.168.1.11:554/11').start()
        webcam3 = VideoStream(src='rtsp://admin:VVZOZR#192.168.1.11:554/11').start()
        webcam4 = VideoStream(src='rtsp://admin:VVZOZR#192.168.1.11:554/11').start()
    time.sleep(2.0)
    Results.car_m, Results.car_l, _ = yolo_cars()
    Results.pl_m, Results.pl_l, _ = yolo_plate()
    # initialize the two motion detectors, along with the total
    # number of frames read
    results = Results()
    total = 0
    while True:
        # initialize image element objects
        if Results.num_cam == 2:
            cam_view = window['-FRAME1-']
            cam1_view = window['-FRAME2-']
        else:
            cam_view = window['-FRAME1-']
            cam1_view = window['-FRAME2-']
            cam2_view = window['-FRAME3-']
            cam3_view = window['-FRAME4-']
        # initialize the list of frames that have been processed
        frames = []
        event, ts = window.read(timeout=0.05)
        t1 = time.perf_counter()
        if event == sg.WINDOW_CLOSED:
            break
        # loop over the frames and their respective motion detectors
        if Results.num_cam == 2:
            res_all = (results, results)
            web_all = (webcam1, webcam2)
        else:
            res_all = (results, results, results, results)
            web_all = (webcam1, webcam2, webcam3, webcam4)
        for (stream, res) in zip(web_all, res_all):
            # read the next frame from the video stream and resize
            frame = stream.read()
            res_inst = res.process_frame(frame, True)
            if total < 32:
                frames.append(res_inst.frame)
                continue
            # update the frames list
            frames.append(res_inst.frame)
        # increment the total number of frames read and grab the
        # current timestamp
        total += 1
        if Results.num_cam == 2:
            for (frame, name) in zip(frames, (cam_view, cam1_view)):  # , 'Webcam3', 'Webcam4')):
                name.update(frame)
        else:
            for (frame, name) in zip(frames, (cam_view, cam1_view, cam2_view, cam3_view)):
                name.update(frame)
        new_frame_time = time.time()
        fps = 1 / (new_frame_time - prev_frame_time)
        prev_frame_time = new_frame_time
        fps = int(fps)
        window['-FR-'].update(str(fps))
    # do a bit of cleanup
    print("[INFO] cleaning up...")
    for web in web_all:
        web.stop()
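For reference, the structure I am aiming for is roughly one worker thread per camera, each reading and processing frames into its own queue, with the GUI loop only draining the queues. Here is a sketch (VideoStream and process_frame are the pieces from the code above; the other names are illustrative and untested):

from queue import Queue
from threading import Thread

def camera_worker(stream, results, out_queue):
    # read and process frames for one camera, independently of the others
    while True:
        frame = stream.read()
        if frame is None:
            continue
        res_inst = results.process_frame(frame, True)
        if not out_queue.full():
            out_queue.put(res_inst.frame)

# one queue and one daemon thread per camera
queues = [Queue(maxsize=2) for _ in web_all]
for stream, q in zip(web_all, queues):
    Thread(target=camera_worker, args=(stream, results, q), daemon=True).start()

# in the GUI loop: show the latest processed frame from each queue
for q, view in zip(queues, (cam_view, cam1_view, cam2_view, cam3_view)):
    if not q.empty():
        view.update(q.get())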
I am working on a project to predict pedestrian paths using their history. I am using this paper and trying to run it and implement the missing annotations in
https://github.com/JunweiLiang/Multiverse/blob/master/SimAug
I used YOLOv5 for this, but the video is 30 FPS and it takes a long time to process all of those frames, so I want to reduce the rate, for example to only 2 frames per second, and then run the pipeline:
(detection --> tracking) --> segmentation --> prediction and so on.
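The sampling I have in mind is roughly this (just a sketch of the arithmetic, using names from the code below; target_fps is illustrative):

# keep roughly target_fps frames per second out of a cap_fps stream
target_fps = 2
skip = max(1, round(cap_fps / target_fps))  # e.g. 30 / 2 -> keep every 15th frame
if i % skip != 0:
    _ = frames_queue.get()    # drop this frame
else:
    img = frames_queue.get()  # keep and process this frame
i += 1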
Here is basically what I did:
import numpy as np
import cv2
import datetime
import queue
from threading import Thread

# global variables
stop_thread = False  # controls thread execution


def start_capture_thread(cap, queue):
    # global stop_thread
    i = 0
    # continuously read frames from the camera
    while True:
        _, img = cap.read()
        queue.put(img)
        # cv2.imwrite('Images/frame{:d}.jpg'.format(i), img)
        i = i + 1
        # if (stop_thread):
        #     break


def main():
    global stop_thread

    # create display window
    cv2.namedWindow("webcam", cv2.WINDOW_NORMAL)

    # initialize webcam capture object
    cap = cv2.VideoCapture(0)
    #cap = cv2.VideoCapture(0 + cv2.CAP_DSHOW)

    # retrieve properties of the capture object
    cap_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    cap_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    cap_fps = cap.get(cv2.CAP_PROP_FPS)
    print('* Capture width:', cap_width)
    print('* Capture height:', cap_height)
    print('* Capture FPS:', cap_fps)

    # create a queue
    frames_queue = queue.Queue(maxsize=0)

    # start the capture thread: reads frames from the camera (non-stop) and stores the result in img
    t = Thread(target=start_capture_thread, args=(cap, frames_queue,), daemon=True)  # a daemon thread is killed when the application exits
    t.start()

    # initialize time and frame count variables
    last_time = datetime.datetime.now()
    frames = 0
    cur_fps = 0
    i = 0

    while True:
        if frames_queue.empty():
            continue

        if i % 5 != 0:
            _ = frames_queue.get()
            i += 1
            continue

        # blocks until the entire frame is read
        frames += 1

        # # measure runtime: current_time - last_time
        # delta_time = datetime.datetime.now() - last_time
        # elapsed_time = delta_time.total_seconds()
        # # compute fps but avoid division by zero
        # if (elapsed_time != 0):
        #     cur_fps = np.around(frames / elapsed_time, 1)

        # retrieve an image from the queue
        img = frames_queue.get()
        cv2.imwrite('Images/frame{:d}.jpg'.format(i), img)
        i += 1

        # for i in range(10):
        #     _ = frames_queue.get()
        #     i += 1

        # TODO: process the image here if needed
        # draw FPS text and display image
        # if (img is not None):
        #     cv2.putText(img, 'FPS: ' + str(cur_fps), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
        #     cv2.imshow("webcam", img)

        # wait 1ms for ESC to be pressed
        key = cv2.waitKey(1)
        if key == 27:
            stop_thread = True
            break

    # release resources
    cv2.destroyAllWindows()
    cap.release()


if __name__ == "__main__":
    main()
This works, so I want to use it in track.py in YOLOv5 instead of this line:
dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt and not jit)
but I can't modify it easily, because it is consumed in the code like this:
for frame_idx, (path, img, im0s, vid_cap, s) in enumerate(dataset):
and I couldn't fully understand that line.
So I wanted to modify the LoadStreams function in https://github.com/ultralytics/yolov5/blob/master/utils/datasets.py, but I am stuck here.
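What I am effectively trying to write is a small generator that yields the same 5-tuple shape that loop expects, but fed from my frame queue. Here is a sketch of the idea (the preprocessing mirrors what LoadStreams does in the YOLOv5 repo and would need to be checked against the version in use; the path string it yields is a placeholder):

import numpy as np
from utils.augmentations import letterbox  # YOLOv5 helper (lives in utils/datasets.py in older versions)

def queue_dataset(frames_queue, img_size=640, stride=32):
    """Yield (path, img, im0s, vid_cap, s) like LoadStreams, but from my frame queue."""
    i = 0
    while True:
        im0 = frames_queue.get()               # original BGR frame from the capture thread
        if i % 15 != 0:                        # keep ~2 of every 30 frames
            i += 1
            continue
        img = letterbox(im0, img_size, stride=stride, auto=True)[0]  # resize + pad
        img = img.transpose((2, 0, 1))[::-1]   # HWC BGR -> CHW RGB
        img = np.ascontiguousarray(img)
        yield f"frame_{i}.jpg", img, im0, None, ""
        i += 1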
I am making a PyQt5 real-time application that streams images and their segmentation results to an interface. It has 3 components running as a cycle:
(1) pull out a frame
(2) feed the frame into the segmentation module
(3) print the original frame and the segmentation result on the interface
In particular, (2) is the bottleneck. I tried running (2) in the main thread and compared its speed and CPU consumption against running (2) in the background (using QThread). Strangely, I found that running it in a QThread is ~20 ms slower per frame and more CPU intensive.
Method 1: Wrapping (1) and (2) in a QObject Worker and Running It in the Main Thread
I don't actually want to run all of this in the main thread, because it locks up the window, but I do it here for speed testing. Below is my implementation of the QObject worker (FrameStore is an object that collects key results from the QObject and passes them to the GUI):
class FrameObject(QObject):
    frame_signal = pyqtSignal(FrameStore)

    def __init__(self, parent=None):
        super().__init__()
        self.load_img()
        self.IS_SILENT = True
        self.model_config = ModelMetaConfig()
        self.frame_store = FrameStore()

    def load_img(self):
        self.PATH = os.path.join(os.getcwd(), 'test_cases', 'test.jpg')
        rgb_img = cv2.imread(self.PATH)
        self.rgb_img = rgb_img[:, :, ::-1]

    #pyqtSlot()
    def run(self):
        end_loop = time.time()
        while True:
            crt_t = time.time()
            print('time = {:10.4f} s'.format(time.time()))
            print('unexplain = {:10.4f} s'.format(crt_t - end_loop))

            torch.cuda.synchronize()
            start = time.time()
            seg_out = self.model_config.raw_predict(self.rgb_img, self.IS_SILENT)
            seg_out = self.model_config.process_predict(seg_out, self.IS_SILENT)
            torch.cuda.synchronize()
            end = time.time()
            print('model run = {:10.4f} s'.format(end - start))

            start = time.time()
            # store key data at a snapshot
            self.frame_store.rgb_img = self.rgb_img
            self.frame_store.seg_out = seg_out
            #self.frame_signal.emit(self.frame_store)
            end_loop = time.time()
            self.frame_signal.emit(self.frame_store)
            print('emit = {:10.4f} s'.format(end_loop - start))
The screenshot below shows ~60% CPU usage with a frame time of ~120 ms:
Screenshot: ~60% CPU Usage + ~120 ms Frame Time
Method 2: Wrapping (1) and (2) in a QThread
My implementation of QThread is similar to the QObject above:
class FrameThread(QThread):
    frame_signal = pyqtSignal(FrameStore)

    def __init__(self, parent=None):
        super().__init__()
        self.load_img()
        self.IS_SILENT = True
        self.model_config = ModelMetaConfig()
        self.frame_store = FrameStore()

    def load_img(self):
        self.PATH = os.path.join(os.getcwd(), 'test_cases', 'test.jpg')
        rgb_img = cv2.imread(self.PATH)
        self.rgb_img = rgb_img[:, :, ::-1]

    def run(self):
        end_loop = time.time()
        while True:
            crt_t = time.time()
            print('time = {:10.4f} s'.format(time.time()))
            print('unexplain = {:10.4f} s'.format(crt_t - end_loop))

            torch.cuda.synchronize()
            start = time.time()
            seg_out = self.model_config.raw_predict(self.rgb_img, self.IS_SILENT)
            seg_out = self.model_config.process_predict(seg_out, self.IS_SILENT)
            torch.cuda.synchronize()
            end = time.time()
            print('model run = {:10.4f} s'.format(end - start))

            start = time.time()
            # store key data at a snapshot
            self.frame_store.rgb_img = self.rgb_img
            self.frame_store.rgb2_img = self.rgb_img
            self.frame_store.rgb3_img = self.rgb_img
            self.frame_store.rgb4_img = self.rgb_img
            self.frame_store.seg_out = seg_out
            self.frame_signal.emit(self.frame_store)
            end_loop = time.time()
            print('emit = {:10.4f} s'.format(end_loop - start))
But it turns out the frame time is ~20-30 ms longer, with 100% CPU usage, as shown below:
Screenshot: 100% CPU Usage + ~145 ms Frame Rate
Reproducing My Result
I have uploaded a simplified version of my application to my repo. You can reproduce my result by running $ python main.py. Note that the application requires a GPU and an installation of PyTorch.
To switch between method 1 and method 2, just toggle the comments on the following lines in main.py:
def __init__(self):
    super().__init__()
    # set up window
    self.title = 'Simply Reproducing the Slow Speed'
    self.top = 100
    self.left = 100
    self.width = 1280
    self.height = 1280
    self.init_window()
    #self.init_qobject()  # <- uncomment this to run QObject in main thread
    #self.init_thread()   # <- uncomment this to run QThread
A brief outline of key scripts:
model_utils.py: wrapping all segmentation procedures into a class
object_utils.py: a QObject worker class computing (1) and (2)
thread_utils.py: a QThread class computing (1) and (2)
main.py: main interface application
Question
The slowdown and extra CPU usage are quite annoying to me. Could anyone suggest an alternative implementation that delegates the work to the background more efficiently?
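The only alternative I am aware of is the standard worker-object pattern, where the QObject worker from Method 1 is moved to a QThread instead of subclassing QThread. A sketch of what I mean is below (I have not verified that it behaves any differently in my case; update_frame is a placeholder for my GUI slot):

from PyQt5.QtCore import QThread

# reuse the FrameObject worker from Method 1, but run it in a background thread
self.worker_thread = QThread()
self.worker = FrameObject()
self.worker.moveToThread(self.worker_thread)

# start the processing loop once the thread's event loop is running
self.worker_thread.started.connect(self.worker.run)
self.worker.frame_signal.connect(self.update_frame)  # update_frame: my GUI slot (placeholder)
self.worker_thread.start()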
I would like to have several processes, each loading different images one at a time and performing inference (for example with VGG16).
I am using Keras with the TensorFlow backend and one GPU (GTX 1070). Here is the code:
import tensorflow as tf
import multiprocessing
from multiprocessing import Pool, Process, Queue
import os
from os.path import isfile, join
from PIL import Image
import time
from keras.applications.vgg16 import VGG16
import numpy as np
from keras.backend.tensorflow_backend import set_session

test_path = 'test path to images ...'
output = Queue()


def worker(file_names, output):
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.25
    config.gpu_options.visible_device_list = "0"
    set_session(tf.Session(config=config))

    inference_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3), pooling='avg')
    model_image_size = (224, 224)

    times = []
    for file_name in file_names:
        image = Image.open(os.path.join(test_path, file_name))
        im_width = image.size[0]
        im_height = image.size[1]
        m = (im_width - im_height) // 2
        image = image.crop((m, 0, im_width - m, im_height))
        image = image.resize((model_image_size), Image.BICUBIC)
        image = np.array(image, dtype='float32')
        image /= 255.
        image = np.expand_dims(image, 0)  # Add batch dimension.

        start = time.time()
        res = inference_model.predict(image)
        end = time.time()
        elapsed_time = end - start
        print("elapsed time", elapsed_time)
        times.append(elapsed_time)

    average_time = np.mean(times[2:])
    print("average time ", average_time)


if __name__ == '__main__':
    file_names = [f for f in os.listdir(test_path) if isfile(join(test_path, f))]
    file_names.sort()

    num_workers = 3
    processes = [Process(target=worker, args=(file_names[x::num_workers], output)) for x in range(num_workers)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
I have noticed that the per-image inference times are slower with multiple processes than with a single process. For example, with a single process the inference time per image is 0.012 s. When running 3 processes I would expect the same result; however, the average inference time per image is almost 0.02 s. What could be the reason for that? (Maybe CUDA context switching?) Is there a way to solve this?