How to make mediapipe pose estimation faster (python)

I'm making a pose estimation script for my game. However, it runs at only 20-30 fps and doesn't use the whole CPU even though there is no fps limit. It doesn't fully use the GPU either. Can someone help me?
Here is resource usage while playing a dance video: https://imgur.com/a/6yI2TWg
Here is my code:
import cv2
import mediapipe as mp
import time

inFile = '/dev/video0'
capture = cv2.VideoCapture(inFile)
FramesVideo = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))  # Number of frames inside video
FrameCount = 0  # Currently playing frame
prevTime = 0

# some objects for mediapipe
mpPose = mp.solutions.pose
mpDraw = mp.solutions.drawing_utils
pose = mpPose.Pose()

while True:
    FrameCount += 1

    # read image and convert to rgb
    success, img = capture.read()
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # process image
    results = pose.process(imgRGB)
    if results.pose_landmarks:
        mpDraw.draw_landmarks(img, results.pose_landmarks, mpPose.POSE_CONNECTIONS)

        # get landmark positions
        landmarks = []
        for id, lm in enumerate(results.pose_landmarks.landmark):
            h, w, c = img.shape
            cx, cy = int(lm.x * w), int(lm.y * h)
            cv2.putText(img, str(id), (cx, cy), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 0), 1)
            landmarks.append((cx, cy))

    # calculate and print fps
    frameTime = time.time()
    fps = 1 / (frameTime - prevTime)
    prevTime = frameTime
    cv2.putText(img, str(int(fps)), (30, 50), cv2.FONT_HERSHEY_PLAIN, 3, (255, 0, 0), 3)

    # show image
    cv2.imshow('Video', img)
    cv2.waitKey(1)

    if FrameCount == FramesVideo - 1:
        capture.release()
        cv2.destroyAllWindows()
        break

Set the model_complexity of mp.Pose to 0.
As the documentation states:
MODEL_COMPLEXITY
Complexity of the pose landmark model: 0, 1 or 2. Landmark accuracy as well as inference latency generally go up with the model complexity. Default to 1.
This is the best speed-up I've found; try it first.
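For reference, a minimal sketch of passing that option (model_complexity is the constructor argument documented above; the other names follow the question's code):

import mediapipe as mp

mpPose = mp.solutions.pose
# model_complexity=0 selects the lightest pose landmark model:
# lower inference latency at the cost of some landmark accuracy.
pose = mpPose.Pose(model_complexity=0)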

Related

Implement multiprocessing to test two videos simultaneously in opencv for object detection

I am implementing an object detection model using a YOLO algorithm with PyTorch and OpenCV. Running my model on a single video works fine, but whenever I try to use multiprocessing to test more videos at once it freezes. Can you please explain what is wrong with this code?
import torch
import cv2
import time
from multiprocessing import Process

model = torch.hub.load('ultralytics/yolov5', 'custom', path='runs/best.pt', force_reload=True)

def detectObject(video, name):
    cap = cv2.VideoCapture(video)
    while cap.isOpened():
        pTime = time.time()
        ret, img = cap.read()
        cTime = time.time()
        fps = str(int(1 / (cTime - pTime)))
        if img is None:
            break
        else:
            results = model(img)
            labels = results.xyxyn[0][:, -1].cpu().numpy()
            cord = results.xyxyn[0][:, :-1].cpu().numpy()
            n = len(labels)
            x_shape, y_shape = img.shape[1], img.shape[0]
            for i in range(n):
                row = cord[i]
                # If score is less than 0.3 we avoid making a prediction.
                if row[4] < 0.3:
                    continue
                x1 = int(row[0] * x_shape)
                y1 = int(row[1] * y_shape)
                x2 = int(row[2] * x_shape)
                y2 = int(row[3] * y_shape)
                bgr = (0, 255, 0)  # color of the box
                classes = model.names  # Get the name of label index
                label_font = cv2.FONT_HERSHEY_COMPLEX  # Font for the label.
                cv2.rectangle(img, (x1, y1), (x2, y2), bgr, 2)  # Plot the boxes
                cv2.putText(img, classes[int(labels[i])], (x1, y1), label_font, 2, bgr, 2)
                cv2.putText(img, f'FPS={fps}', (8, 70), label_font, 3, (100, 255, 0), 3, cv2.LINE_AA)
            img = cv2.resize(img, (700, 700))
            cv2.imshow(name, img)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    cap.release()

Videos = ['../Dataset/Test1.mp4', '../Dataset/Test2.mp4']
for i in Videos:
    process = Process(target=detectObject, args=(i, str(i)))
    process.start()
Every time I run that code it freezes.
Here is the output:
Downloading: "https://github.com/ultralytics/yolov5/archive/master.zip" to /home/com/.cache/torch/hub/master.zip
YOLOv5 🚀 2022-6-27 Python-3.9.9 torch-1.11.0+cu102 CPU
Fusing layers...
YOLOv5s summary: 213 layers, 7023610 parameters, 0 gradients
Adding AutoShape...
I got it to work by adding torch multiprocessing code.
from torch.multiprocessing import Pool, Process, set_start_method

try:
    set_start_method('spawn', force=True)
except RuntimeError:
    pass

videos = ['videos/video1.mp4', 'videos/video2.mp4']
for i in videos:
    process = Process(target=detectObject, args=(i, str(i)))
    process.start()
I was able to run on multiple videos at once this way.
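For completeness, a minimal sketch of how the spawn start method is usually combined with an if __name__ == '__main__': guard (with 'spawn', each child process re-imports the main module, so the process-launching code must be guarded; loading the model inside the worker is an assumption here, not something the answer above shows):

import cv2
import torch
from torch.multiprocessing import Process, set_start_method

def detectObject(video, name):
    # assumption: load the model inside the worker so every process gets its own copy
    model = torch.hub.load('ultralytics/yolov5', 'custom', path='runs/best.pt')
    cap = cv2.VideoCapture(video)
    while cap.isOpened():
        ret, img = cap.read()
        if img is None:
            break
        results = model(img)
        cv2.imshow(name, cv2.resize(img, (700, 700)))
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()

if __name__ == '__main__':
    # 'spawn' re-imports this module in each child; guarding avoids recursive spawning
    set_start_method('spawn', force=True)
    videos = ['videos/video1.mp4', 'videos/video2.mp4']
    processes = [Process(target=detectObject, args=(v, str(v))) for v in videos]
    for p in processes:
        p.start()
    for p in processes:
        p.join()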

How to remove background and video from facial landmarks

I'm using this code to detect the 468 facial landmarks on a face:
import cv2
import mediapipe as mp
import time

cap = cv2.VideoCapture(0)
pTime = 0
mpDraw = mp.solutions.drawing_utils
mpFaceMesh = mp.solutions.face_mesh
faceMesh = mpFaceMesh.FaceMesh(max_num_faces=2)
drawSpec = mpDraw.DrawingSpec(thickness=1, circle_radius=2)

while True:
    success, img = cap.read()
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = faceMesh.process(imgRGB)
    if results.multi_face_landmarks:
        for faceLms in results.multi_face_landmarks:
            mpDraw.draw_landmarks(img, faceLms, mpFaceMesh.FACEMESH_CONTOURS,
                                  drawSpec, drawSpec)
            for id, lm in enumerate(faceLms.landmark):
                #print(lm)
                ih, iw, ic = img.shape
                x, y = int(lm.x * iw), int(lm.y * ih)
                print(id, x, y)
    cTime = time.time()
    fps = 1 / (cTime - pTime)
    pTime = cTime
    cv2.putText(img, f'FPS: {int(fps)}', (20, 70), cv2.FONT_HERSHEY_PLAIN,
                3, (255, 0, 0), 3)
    cv2.imshow("Image", img)
    cv2.waitKey(1)
When I run this script, I can see the facial landmarks drawn on my face, but what I want is to display the facial landmarks on a black background without actually showing the face.
How can I achieve that?
You can create an image of black pixels (value 0) with the same dimensions as the frame captured by the camera, and carry out your drawings there (this needs import numpy as np in addition to the imports above):
# create the dark image
black = np.zeros(img.shape, np.uint8)

# replace `img` with `black` while drawing the landmarks
mpDraw.draw_landmarks(black, faceLms, mpFaceMesh.FACEMESH_CONTOURS, drawSpec, drawSpec)

# display the result
cv2.imshow("Result", black)

How to get one specific value (with a specific index) from this iterable enumerate(results.pose_landmarks.landmark)?

import cv2  # OpenCV is the library that we will be using for image processing
import mediapipe as mp  # Mediapipe is the framework that will allow us to get our pose estimation
import time

mpDraw = mp.solutions.drawing_utils
mpPose = mp.solutions.pose
pose = mpPose.Pose()
#pose = mpPose.Pose(static_image_mode=False, upper_body_only=True)  # ONLY UPPER_BODY_TRACKING

#cap = cv2.VideoCapture(0)
cap = cv2.VideoCapture('PoseVideos/1_girl_choreography.mp4')
pTime = 0  # previous time

while True:
    success, img = cap.read()  # that will give us our image and then we can call cv2.imshow()
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # convert our image to RGB because Mediapipe uses that format
    results = pose.process(imgRGB)  # we simply send this image to our model
    #print(enumerate(results.pose_landmarks.landmark))  # <enumerate object at 0x0000012312DD1A00>

    # so then we check whether anything was detected
    if results.pose_landmarks:
        mpDraw.draw_landmarks(img, results.pose_landmarks, mpPose.POSE_CONNECTIONS)
        for id, lm in enumerate(results.pose_landmarks.landmark):
            h, w, c = img.shape  # get dimensions (h height, w width) and the c channels of the image
            print(id)
            print(lm)
            cx, cy = int(lm.x * w), int(lm.y * h)
            cv2.circle(img, (cx, cy), 5, (255, 0, 0), cv2.FILLED)

    cTime = time.time()
    fps = 1 / (cTime - pTime)
    pTime = cTime
    cv2.putText(img, str(int(fps)), (70, 50), cv2.FONT_HERSHEY_PLAIN, 3, (255, 0, 0), 3)
    cv2.imshow("Image", img)
    cv2.waitKey(1)
I want to do what the for loop does, but instead of doing it for all the elements (ids) inside the enumerate(), do it only for id = 25, because in this case that is the only point that interests me.
What would I need to change in the loop that uses this enumerate(results.pose_landmarks.landmark) as its iterable?
How would I access an id[25] and an lm[25]?
I tried with itertools, but it doesn't work in this case:
import itertools

gen = (id, lm for id, lm in enumerate(results.pose_landmarks.landmark))
specific_id, specific_lm = 25, 25  # index
print(next(itertools.islice(gen, specific_id, None)))
You can directly access results.pose_landmarks.landmark, no need to enumerate.
lm = results.pose_landmarks.landmark[25]
results.pose_landmarks.landmark is a list
You could try:
for id, lm in enumerate(results.pose_landmarks.landmark):
    if not id == 25:
        continue
    ...
Not elegant, but gets the job done.
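Putting either answer back into the question's loop, a short sketch of reading just landmark index 25 and converting it to pixel coordinates (names follow the question's code):

if results.pose_landmarks:
    h, w, c = img.shape
    lm = results.pose_landmarks.landmark[25]   # direct index access, no enumerate needed
    cx, cy = int(lm.x * w), int(lm.y * h)      # normalized coordinates -> pixels
    cv2.circle(img, (cx, cy), 5, (255, 0, 0), cv2.FILLED)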

How to track my cursor to a coordinate in real time? (DIY aimbot)

I need to move my cursor to the nose of a human to create a DIY aimbot with pose detection.
(Just for fun, not intending to cheat; there are many better and easier options than making my own.)
I already have the first part of the code: it shows your screen and the skeleton, as well as the exact coordinates of the nose, with no problem.
But the method I'm using to move my cursor to that point is not working.
I'm using mouse.move and have tried other things like pyautogui and tkinter.
It doesn't give me an error, but it still doesn't work.
import cv2
import mediapipe as mp
import numpy as np
import time
import pyautogui
import mouse

mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

# display screen resolution, get it from your OS settings
SCREEN_SIZEX = (1920)
SCREEN_SIZEY = (1080)

# define the codec
fourcc = cv2.VideoWriter_fourcc(*"XVID")
# create the video writer object
out = cv2.VideoWriter("output.avi", fourcc, 20.0, (SCREEN_SIZEX, SCREEN_SIZEY))

with mp_pose.Pose(min_detection_confidence=0.1, min_tracking_confidence=0.9) as pose:
    while True:
        # make a screenshot
        img = pyautogui.screenshot()
        # convert these pixels to a proper numpy array to work with OpenCV
        frame = np.array(img)
        # convert colors from BGR to RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Recolor image to RGB
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False
        # Make detection
        results = pose.process(image)
        # Recolor back to BGR
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        try:
            landmarks = results.pose_landmarks.landmark
            lndmark = landmarks[mp_pose.PoseLandmark.NOSE.value]
            x = [landmarks[mp_pose.PoseLandmark.NOSE.value].x]
            y = [landmarks[mp_pose.PoseLandmark.NOSE.value].y]
            #print(x)
            #print(y)
            mouse.move(x, y)
        except:
            pass

        # Render detections
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS,
                                  mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=2),
                                  mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2)
                                  )

        # write the frame
        out.write(frame)

        pTime = 0
        cTime = time.time()
        fps = 1 / (cTime - pTime)
        pTime = cTime
        cv2.putText(image, str(int(fps)), (20, 50), cv2.FONT_HERSHEY_PLAIN, 3,
                    (255, 0, 0), 3)

        cv2.imshow('Mediapipe Feed', image)
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

    out.release()
    cv2.destroyAllWindows()

# for lndmark in mp_pose.PoseLandmark:
#     print(lndmark)
this is the part that doesn't work:
try:
    landmarks = results.pose_landmarks.landmark
    lndmark = landmarks[mp_pose.PoseLandmark.NOSE.value]
    x = [landmarks[mp_pose.PoseLandmark.NOSE.value].x]
    y = [landmarks[mp_pose.PoseLandmark.NOSE.value].y]
    mouse.move(x, y)
except:
    pass
I would assume it's because x and y are supposed to be numbers, or somehow it can't read or process them correctly,
but it doesn't give me an error, so I'm asking here hoping one of you has already figured this one out.
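For what it's worth, a hedged sketch of one likely fix: MediaPipe landmark coordinates are normalized to [0, 1], and the bare except: pass swallows whatever error mouse.move might raise, so passing single-element lists of fractions fails silently. Scaling by the screen resolution already defined in the script and passing plain ints could look like this (untested sketch, reusing the question's names):

try:
    landmarks = results.pose_landmarks.landmark
    nose = landmarks[mp_pose.PoseLandmark.NOSE.value]
    # landmark x/y are normalized to [0, 1]; scale them to screen pixels
    px = int(nose.x * SCREEN_SIZEX)
    py = int(nose.y * SCREEN_SIZEY)
    mouse.move(px, py)  # plain integers, not single-element lists
except AttributeError:
    # no pose detected in this frame
    pass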

OpenCv facial recognition app detection problem

This is a facial recognition app I created using Python, OpenCV, and a Haar cascade classifier. The app works well at classifying trained persons; however, it has a problem of recognizing unknown persons as a known, previously trained person. How can I fix this problem?
this is the dataset creation code
import cv2
import os
import time

cam = cv2.VideoCapture(0)
cam.set(3, 640)  # set video width
cam.set(4, 480)  # set video height
face_detector = cv2.CascadeClassifier('Cascades/haarcascade_frontalface_default.xml')

# For each person, enter one numeric face id
face_id = input('\n enter user id end press <return> ==> ')
print("\n [INFO] Initializing face capture. Look the camera and wait ...")

# Initialize individual sampling face count
count = 0
while True:
    ret, img = cam.read()
    img = cv2.flip(img, 1)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_detector.detectMultiScale(gray, 1.3, 5)
    width_d, height_d = 150, 150
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
        count += 1
        # Save the captured image into the datasets folder
        cv2.imwrite("dataset/User." + str(face_id) + '.' + str(count) + ".jpg",
                    cv2.resize(gray[y:y+h, x:x+w], (width_d, height_d)))
        cv2.imshow('image', img)
    k = cv2.waitKey(100) & 0xff  # Press 'ESC' for exiting video
    if k == 27:
        break
    elif count >= 400:  # Take 400 face samples and stop video
        break

# Do a bit of cleanup
print("\n [INFO] Exiting Program and cleanup stuff")
cam.release()
cv2.destroyAllWindows()
This is the training phase
import cv2
import numpy as np
from PIL import Image
import os

# Path for face image database
path = 'dataset'
recognizer = cv2.face.LBPHFaceRecognizer_create()
detector = cv2.CascadeClassifier("Cascades/haarcascade_frontalface_default.xml")

# function to get the images and label data
def getImagesAndLabels(path):
    width_d, height_d = 150, 150  # Declare your own width and height
    imagePaths = [os.path.join(path, f) for f in os.listdir(path)]
    faceSamples = []
    ids = []
    for imagePath in imagePaths:
        PIL_img = Image.open(imagePath).convert('L')  # convert it to grayscale
        img_numpy = np.array(PIL_img, 'uint8')
        id = int(os.path.split(imagePath)[-1].split(".")[1])
        faces = detector.detectMultiScale(img_numpy)
        for (x, y, w, h) in faces:
            faceSamples.append(cv2.resize(img_numpy[y:y+h, x:x+w], (width_d, height_d)))
            ids.append(id)
    return faceSamples, ids

print("\n [INFO] Training faces. It will take a few seconds. Wait ...")
faces, ids = getImagesAndLabels(path)
recognizer.train(faces, np.array(ids))

# Save the model into trainer/trainer.yml
recognizer.write('trainer/trainer.yml')  # recognizer.save() worked on Mac, but not on Pi

# Print the number of faces trained and end program
print("\n [INFO] {0} faces trained. Exiting Program".format(len(np.unique(ids))))
This is the recognizing phase
import cv2
import numpy as np
import os

recognizer = cv2.face.LBPHFaceRecognizer_create()
recognizer.read('trainer/trainer.yml')
cascadePath = "Cascades/haarcascade_frontalface_default.xml"
faceCascade = cv2.CascadeClassifier(cascadePath)
font = cv2.FONT_HERSHEY_SIMPLEX

# initiate id counter
id = 0
# names related to ids: example ==> Marcelo: id=1, etc
names = ['Mamdouh Alaa', 'Dr.Ahmed Seddawy', 'Dr.Ismail Abdulghaffar']

# Initialize and start realtime video capture
cam = cv2.VideoCapture(0)
cam.set(3, 1366)  # set video width
cam.set(4, 768)   # set video height

# Define min window size to be recognized as a face
minW = 0.1 * cam.get(3)
minH = 0.1 * cam.get(4)

while True:
    ret, img = cam.read()
    img = cv2.flip(img, 1)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    width_d, height_d = 150, 150
    faces = faceCascade.detectMultiScale(
        gray,
        scaleFactor=1.2,
        minNeighbors=5,
        minSize=(int(minW), int(minH)),
    )
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
        id, confidence = recognizer.predict(cv2.resize(gray[y:y+h, x:x+w], (width_d, height_d)))
        # Check if confidence is less than 100 ==> "0" is a perfect match
        if confidence < 100:
            id = names[id]
            confidence = " {0}%".format(round(100 - confidence))
        else:
            id = "unknown person"
            confidence = " {0}%".format(round(100 - confidence))
        cv2.putText(img, str(id), (x+5, y-5), font, 1, (255, 255, 255), 2)
        cv2.putText(img, str(confidence), (x+5, y+h-5), font, 1, (255, 255, 0), 1)
    cv2.imshow('IFR', img)
    k = cv2.waitKey(10) & 0xff  # Press 'ESC' for exiting video
    if k == 27:
        break

# Do a bit of cleanup
print("\n [INFO] Exiting Program and cleanup stuff")
cam.release()
cv2.destroyAllWindows()
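No answer is shown here, but one common mitigation is to tighten the acceptance threshold in the recognition loop: LBPH's confidence value is a distance (0 is a perfect match, larger is worse), and accepting everything below 100 lets almost any face through as "known". A sketch of a stricter check, with a threshold value that is purely an assumption and would need tuning on your own data:

UNKNOWN_THRESHOLD = 55  # assumption: tune on your own data; lower is stricter

id, confidence = recognizer.predict(cv2.resize(gray[y:y+h, x:x+w], (width_d, height_d)))
# LBPH confidence is a distance, so small values mean a close match
if confidence < UNKNOWN_THRESHOLD:
    label = names[id]
else:
    label = "unknown person"
cv2.putText(img, str(label), (x+5, y-5), font, 1, (255, 255, 255), 2)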
