Counting people/objects on a Raspberry Pi with OpenCV, TensorFlow and Python - python

My code can now detect objects, including people. The problem I am facing is that I want to count the people moving into and out of a classroom, so that I know how many people are in the room at any time. How can I do this? Something like this:
How can I make each detected object (drawn as a rectangle) count as "in" or "out" when it crosses the red and blue lines?
import os
import cv2
import numpy as np
from picamera.array import PiRGBArray
from picamera import PiCamera
import tensorflow as tf
import argparse
import sys
# Set up camera constants
IM_WIDTH = 1280
#IM_WIDTH = 640 Use smaller resolution for
#IM_HEIGHT = 480 slightly faster framerate
# Select camera type (if user enters --usbcam when calling this script,
# a USB webcam will be used)
camera_type = 'picamera'
parser = argparse.ArgumentParser()
parser.add_argument('--usbcam', help='Use a USB webcam instead of picamera',
args = parser.parse_args()
if args.usbcam:
camera_type = 'usb'
# This is needed since the working directory is the object_detection folder.
# Import utilites
from utils import label_map_util
from utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = 'ssdlite_mobilenet_v2_coco_2018_05_09'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH,MODEL_NAME,'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH,'data','mscoco_label_map.pbtxt')
# Number of classes the object detector can identify
## Load the label map.
# Label maps map indices to category names, so that when the convolution
# network predicts `5`, we know that this corresponds to `airplane`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map,
max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph =
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes =
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Initialize frame rate calculation
frame_rate_calc = 1
freq = cv2.getTickFrequency()
# Initialize camera and perform object detection.
# The camera has to be set up and used differently depending on if it's a
# Picamera or USB webcam.
# I know this is ugly, but I basically copy+pasted the code for the object
# detection loop twice, and made one work for Picamera and the other work
# for USB.
### Picamera ###
if camera_type == 'picamera':
# Initialize Picamera and grab reference to the raw capture
camera = PiCamera()
camera.resolution = (IM_WIDTH,IM_HEIGHT)
camera.framerate = 10
rawCapture = PiRGBArray(camera, size=(IM_WIDTH,IM_HEIGHT))
for frame1 in camera.capture_continuous(rawCapture, format="bgr",use_video_port=True):
t1 = cv2.getTickCount()
# Acquire frame and expand frame dimensions to have shape: [1, None, None, 3]
# i.e. a single-column array, where each item in the column has the pixel RGB value
frame = frame1.array
frame_expanded = np.expand_dims(frame, axis=0)
# Perform the actual detection by running the model with the image as input
(boxes, scores, classes, num) =
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: frame_expanded})
# Draw the results of the detection (aka 'visulaize the results')
# Blue line
cv2.line(frame, (IM_WIDTH // 2, 0), (IM_WIDTH // 2 , IM_WIDTH), (250, 0, 1), 2)
# Red line
cv2.line(frame, (IM_WIDTH // 2 - 50, 0), (IM_WIDTH // 2 - 50, IM_WIDTH), (0, 0, 255), 2)
# FPS Text
cv2.putText(frame,"FPS: {0:.2f}".format(frame_rate_calc),(30,50),font,1,(255,255,0),2,cv2.LINE_AA)
# All the results have been drawn on the frame, so it's time to display it.
cv2.imshow('Object detector', frame)
t2 = cv2.getTickCount()
time1 = (t2-t1)/freq
frame_rate_calc = 1/time1
# Press 'q' to quit
if cv2.waitKey(1) == ord('q'):


# Import packages
import os
import argparse
import cv2
import numpy as np
import sys
import time
from threading import Thread
import importlib.util
# Define VideoStream class to handle streaming of video from webcam in separate processing thread
# Source - Adrian Rosebrock, PyImageSearch:
class VideoStream:
    """Camera object that streams frames from a webcam on a background thread.

    The capture device is opened with ``cv2.VideoCapture(0)``. After
    ``start()`` a worker thread keeps overwriting ``self.frame`` with the
    newest frame, so ``read()`` never blocks on the camera.
    """

    def __init__(self, resolution=(640, 480), framerate=30):
        """Open camera 0, request MJPG at ``resolution`` and read one frame.

        Args:
            resolution: Requested ``(width, height)`` of the stream.
            framerate: Accepted for interface compatibility; nothing in this
                class applies it to the capture device.
        """
        # Initialize the camera image stream
        self.stream = cv2.VideoCapture(0)
        self.stream.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc(*'MJPG'))
        self.stream.set(cv2.CAP_PROP_FRAME_WIDTH, resolution[0])
        self.stream.set(cv2.CAP_PROP_FRAME_HEIGHT, resolution[1])

        # Read first frame from the stream so read() has something to
        # return before the worker thread has produced a frame
        (self.grabbed, self.frame) = self.stream.read()

        # Variable to control when the camera is stopped
        self.stopped = False

    def start(self):
        """Start the thread that reads frames from the video stream."""
        Thread(target=self.update, args=()).start()
        return self

    def update(self):
        """Keep grabbing frames until ``stop()`` is called, then release."""
        while True:
            # If the camera is stopped, stop the thread
            if self.stopped:
                # Close camera resources
                self.stream.release()
                return

            # Otherwise, grab the next frame from the stream
            (self.grabbed, self.frame) = self.stream.read()

    def read(self):
        """Return the most recent frame."""
        return self.frame

    def stop(self):
        """Indicate that the camera and thread should be stopped."""
        self.stopped = True
# Define and parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('--modeldir', help='Folder the .tflite file is located in',
parser.add_argument('--graph', help='Name of the .tflite file, if different than detect.tflite',
parser.add_argument('--labels', help='Name of the labelmap file, if different than labelmap.txt',
parser.add_argument('--threshold', help='Minimum confidence threshold for displaying detected objects',
parser.add_argument('--resolution', help='Desired webcam resolution in WxH. If the webcam does not support the resolution entered, errors may occur.',
parser.add_argument('--edgetpu', help='Use Coral Edge TPU Accelerator to speed up detection',
args = parser.parse_args()
MODEL_NAME = args.modeldir
GRAPH_NAME = args.graph
LABELMAP_NAME = args.labels
min_conf_threshold = float(args.threshold)
resW, resH = args.resolution.split('x')
imW, imH = int(resW), int(resH)
use_TPU = args.edgetpu
# Import TensorFlow libraries
# If tflite_runtime is installed, import interpreter from tflite_runtime, else import from regular tensorflow
# If using Coral Edge TPU, import the load_delegate library
pkg = importlib.util.find_spec('tflite_runtime')
if pkg:
from tflite_runtime.interpreter import Interpreter
if use_TPU:
from tflite_runtime.interpreter import load_delegate
from tensorflow.lite.python.interpreter import Interpreter
if use_TPU:
from tensorflow.lite.python.interpreter import load_delegate
GRAPH_NAME = 'edgetpu.tflite'
# Get path to current working directory
CWD_PATH = os.getcwd()
# Path to .tflite file, which contains the model that is used for object detection
# Path to label map file
# Load the label map
with open(PATH_TO_LABELS, 'r') as f:
labels = [line.strip() for line in f.readlines()]
# Have to do a weird fix for label map if using the COCO "starter model" from
# First label is '???', which has to be removed.
if labels[0] == '???':
# Load the Tensorflow Lite model.
# If using Edge TPU, use special load_delegate argument
if use_TPU:
interpreter = Interpreter(model_path=PATH_TO_CKPT,
interpreter = Interpreter(model_path=PATH_TO_CKPT)
# Get model details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
height = input_details[0]['shape'][1]
width = input_details[0]['shape'][2]
floating_model = (input_details[0]['dtype'] == np.float32)
input_mean = 127.5
input_std = 127.5
# Check output layer name to determine if this model was created with TF2 or TF1,
# because outputs are ordered differently for TF2 and TF1 models
outname = output_details[0]['name']
if ('StatefulPartitionedCall' in outname): # This is a TF2 model
boxes_idx, classes_idx, scores_idx = 1, 3, 0
else: # This is a TF1 model
boxes_idx, classes_idx, scores_idx = 0, 1, 2
# Initialize frame rate calculation
frame_rate_calc = 1
freq = cv2.getTickFrequency()
# Initialize video stream
videostream = VideoStream(resolution=(imW,imH),framerate=30).start()
#for frame1 in camera.capture_continuous(rawCapture, format="bgr",use_video_port=True):
while True:
# Start timer (for calculating frame rate)
t1 = cv2.getTickCount()
# Grab frame from video stream
frame1 =
frame1 = cv2.flip(frame1,0)
# Acquire frame and resize to expected shape [1xHxWx3]
frame = frame1.copy()
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame_resized = cv2.resize(frame_rgb, (width, height))
input_data = np.expand_dims(frame_resized, axis=0)
# Normalize pixel values if using a floating model (i.e. if model is non-quantized)
if floating_model:
input_data = (np.float32(input_data) - input_mean) / input_std
# Perform the actual detection by running the model with the image as input
# Retrieve detection results
boxes = interpreter.get_tensor(output_details[boxes_idx]['index'])[0] # Bounding box coordinates of detected objects
classes = interpreter.get_tensor(output_details[classes_idx]['index'])[0] # Class index of detected objects
scores = interpreter.get_tensor(output_details[scores_idx]['index'])[0] # Confidence of detected objects
# Loop over all detections and draw detection box if confidence is above minimum threshold
for i in range(len(scores)):
if ((scores[i] > min_conf_threshold) and (scores[i] <= 1.0)):
# Get bounding box coordinates and draw box
# Interpreter can return coordinates that are outside of image dimensions, need to force them to be within image using max() and min()
ymin = int(max(1,(boxes[i][0] * imH)))
xmin = int(max(1,(boxes[i][1] * imW)))
ymax = int(min(imH,(boxes[i][2] * imH)))
xmax = int(min(imW,(boxes[i][3] * imW)))
cv2.rectangle(frame, (xmin,ymin), (xmax,ymax), (10, 255, 0), 2)
# Draw label
object_name = labels[int(classes[i])] # Look up object name from "labels" array using class index
label = '%s: %d%%' % (object_name, int(scores[i]*100)) # Example: 'person: 72%'
labelSize, baseLine = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2) # Get font size
label_ymin = max(ymin, labelSize[1] + 10) # Make sure not to draw label too close to top of window
cv2.rectangle(frame, (xmin, label_ymin-labelSize[1]-10), (xmin+labelSize[0], label_ymin+baseLine-10), (255, 255, 255), cv2.FILLED) # Draw white box to put label text in
cv2.putText(frame, label, (xmin, label_ymin-7), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2) # Draw label text
# Draw circle in center
xcenter = xmin + (int(round((xmax - xmin) / 2)))
ycenter = ymin + (int(round((ymax - ymin) / 2))), (xcenter, ycenter), 5, (0,0,255), thickness=-1)
# Print info
print('Object ' + str(i) + ': ' + object_name + ' at (' + str(xcenter) + ', ' + str(ycenter) + ')')
# Draw framerate in corner of frame
cv2.putText(frame,'FPS: {0:.2f}'.format(frame_rate_calc),(30,50),cv2.FONT_HERSHEY_SIMPLEX,1,(255,255,0),2,cv2.LINE_AA)
# All the results have been drawn on the frame, so it's time to display it.
cv2.imshow('Object detector', frame)
# Press 'q' to quit
if cv2.waitKey(1) == ord('q'):
# Clean up
I would like to modify this code, but I can't limit the detection to one object at a time.
The object detection and the model itself work well, but I want to limit the result to a single detection instead of several.
This is the whole code, which I got from the internet.
I also tried adding an argument (parser.add_argument with --maxResult), but I don't know how to apply it with tflite_runtime.
I am new to Python and to using the Raspberry Pi.

I am using mask-detection code from GitHub (slightly modified) and I am trying to close the video window with a timer (or something that works like a timer). The code detects a mask and shows the text "Mask" or "No Mask". I want to close the window once the label has shown "Mask" for 5 seconds.
I tried using this timer code:
def startTimer():
global count
timer = threading.Timer(1, startTimer)
count += 1
if count > 5:
but it doesn't work well. There are 3 problems with it:
1. The timer runs, but it does not count in seconds.
2. The timer should stop when count reaches 5, but it goes up to 9 before the window closes.
3. When the label shows "Mask", the video stops.
I want to know how to close the window with a timer in this code, and how to fix these 3 problems.
I would really appreciate your help.
from imutils import video
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import load_model
from import VideoStream
import numpy as np
import argparse
import imutils
import time
import cv2
import os
import threading
from tensorflow.python.ops.math_ops import truediv
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
def detect_and_predict_mask(frame, faceNet, maskNet):
    """Detect faces in ``frame`` and run the mask classifier on each one.

    Args:
        frame: BGR image array; ``frame.shape[:2]`` is ``(height, width)``.
        faceNet: OpenCV DNN face detector, driven via ``setInput``/``forward``.
        maskNet: Classifier exposing ``predict(batch, batch_size=...)``.

    Returns:
        A 2-tuple ``(locs, preds)``. ``locs`` is a list of
        ``(startX, startY, endX, endY)`` boxes; ``preds`` is the output of
        ``maskNet.predict`` for those faces, or ``[]`` when no face passed
        the confidence filter.

    Note:
        Reads the module-level ``args["confidence"]`` threshold.
    """
    # grab the dimensions of the frame and then construct a blob
    # from it
    (h, w) = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(frame, 1.0, (300, 300),
                                 (104.0, 180.0, 124.0))

    # pass the blob through the network and obtain the face detections;
    # forward() needs the blob to be set as the network input first
    faceNet.setInput(blob)
    detections = faceNet.forward()

    # initialize our list of faces, their corresponding locations,
    # and the list of predictions from our face mask network
    faces = []
    locs = []
    preds = []

    # loop over the detections
    for i in range(0, detections.shape[2]):
        # extract the confidence (i.e., probability) associated with
        # the detection
        confidence = detections[0, 0, i, 2]

        # filter out weak detections by ensuring the confidence is
        # greater than the minimum confidence
        if confidence <= args["confidence"]:
            continue

        # compute the (x, y)-coordinates of the bounding box for
        # the object
        box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
        (startX, startY, endX, endY) = box.astype("int")

        # ensure the bounding boxes fall within the dimensions of
        # the frame
        (startX, startY) = (max(0, startX), max(0, startY))
        (endX, endY) = (min(w - 1, endX), min(h - 1, endY))

        # extract the face ROI; a degenerate box yields an empty ROI,
        # which cv2.cvtColor / cv2.resize cannot process, so skip it
        face = frame[startY:endY, startX:endX]
        if face.size == 0:
            continue

        # convert the ROI from BGR to RGB channel ordering, resize it
        # to 224x224, and preprocess it
        face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
        face = cv2.resize(face, (224, 224))
        face = img_to_array(face)
        face = preprocess_input(face)

        # add the face and bounding boxes to their respective lists;
        # both must be appended together so zip(locs, preds) lines up
        faces.append(face)
        locs.append((startX, startY, endX, endY))

    # only make a predictions if at least one face was detected
    if len(faces) > 0:
        # for faster inference we'll make batch predictions on *all*
        # faces at the same time rather than one-by-one predictions
        # in the above `for` loop
        faces = np.array(faces, dtype="float32")
        preds = maskNet.predict(faces, batch_size=32)

    # return a 2-tuple of the face locations and their corresponding
    # predictions
    return (locs, preds)
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-f", "--face", type=str,
help="path to face detector model directory")
ap.add_argument("-m", "--model", type=str,
help="path to trained face mask detector model")
ap.add_argument("-c", "--confidence", type=float, default=0.5,
help="minimum probability to filter weak detections")
args = vars(ap.parse_args())
# load our serialized face detector model from disk
print("[INFO] loading face detector model...")
prototxtPath = os.path.sep.join([args["face"], "deploy.prototxt"])
weightsPath = os.path.sep.join([args["face"],
faceNet = cv2.dnn.readNet(prototxtPath, weightsPath)
# load the face mask detector model from disk
print("[INFO] loading face mask detector model...")
maskNet = load_model(args["model"])
# initialize the video stream and allow the camera sensor to warm up
print("[INFO] starting video stream...")
vs = cv2.VideoCapture(0 + cv2.CAP_DSHOW)
count = 0
Maskon = False
def startTimer():
global count
timer = threading.Timer(1, startTimer)
count += 1
if count > 5:
# loop over the frames from the video stream
while True:
# grab the frame from the threaded video stream and resize it
# to have a maximum width of 400 pixels
check, frame =
if isinstance(frame, np.ndarray):
# detect faces in the frame and determine if they are wearing a
# face mask or not
(locs, preds) = detect_and_predict_mask(frame, faceNet, maskNet)
# loop over the detected face locations and their corresponding
# locations
for (box, pred) in zip(locs, preds):
# unpack the bounding box and predictions
(startX, startY, endX, endY) = box
(mask, withoutMask) = pred
# determine the class label and color we'll use to draw
# the bounding box and text
label = "Mask" if mask > withoutMask else "No Mask"
color = (0, 255, 0) if label == "Mask" else (0, 0, 255)
if label == "Mask":
Maskon = True
Maskon = False
# include the probability in the label
label = "{}: {:.2f}%".format(label, max(mask, withoutMask) * 100)
# display the label and bounding box rectangle on the output
# frame
cv2.putText(frame, label, (startX, startY - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 2)
cv2.rectangle(frame, (startX, startY), (endX, endY), color, 2)
if Maskon == True:
# show the output frame
cv2.imshow('Frame', frame)
key = cv2.waitKey(1) & 0xFF
if count == 5:
# do a bit of cleanup
I made it work!
It does not count in seconds, but it behaves the way I wanted.
from imutils import video
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import load_model
from import VideoStream
import numpy as np
import argparse
import imutils
import time
import cv2
import os
import threading
from tensorflow.python.ops.math_ops import truediv
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
def detect_and_predict_mask(frame, faceNet, maskNet):
    """Detect faces in ``frame`` and run the mask classifier on each one.

    Args:
        frame: BGR image array; ``frame.shape[:2]`` is ``(height, width)``.
        faceNet: OpenCV DNN face detector, driven via ``setInput``/``forward``.
        maskNet: Classifier exposing ``predict(batch, batch_size=...)``.

    Returns:
        A 2-tuple ``(locs, preds)``. ``locs`` is a list of
        ``(startX, startY, endX, endY)`` boxes; ``preds`` is the output of
        ``maskNet.predict`` for those faces, or ``[]`` when no face passed
        the confidence filter.

    Note:
        Reads the module-level ``args["confidence"]`` threshold.
    """
    # grab the dimensions of the frame and then construct a blob
    # from it
    (h, w) = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(frame, 1.0, (300, 300),
                                 (104.0, 180.0, 124.0))

    # pass the blob through the network and obtain the face detections;
    # forward() needs the blob to be set as the network input first
    faceNet.setInput(blob)
    detections = faceNet.forward()

    # initialize our list of faces, their corresponding locations,
    # and the list of predictions from our face mask network
    faces = []
    locs = []
    preds = []

    # loop over the detections
    for i in range(0, detections.shape[2]):
        # extract the confidence (i.e., probability) associated with
        # the detection
        confidence = detections[0, 0, i, 2]

        # filter out weak detections by ensuring the confidence is
        # greater than the minimum confidence
        if confidence <= args["confidence"]:
            continue

        # compute the (x, y)-coordinates of the bounding box for
        # the object
        box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
        (startX, startY, endX, endY) = box.astype("int")

        # ensure the bounding boxes fall within the dimensions of
        # the frame
        (startX, startY) = (max(0, startX), max(0, startY))
        (endX, endY) = (min(w - 1, endX), min(h - 1, endY))

        # extract the face ROI; a degenerate box yields an empty ROI,
        # which cv2.cvtColor / cv2.resize cannot process, so skip it
        face = frame[startY:endY, startX:endX]
        if face.size == 0:
            continue

        # convert the ROI from BGR to RGB channel ordering, resize it
        # to 224x224, and preprocess it
        face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
        face = cv2.resize(face, (224, 224))
        face = img_to_array(face)
        face = preprocess_input(face)

        # add the face and bounding boxes to their respective lists;
        # both must be appended together so zip(locs, preds) lines up
        faces.append(face)
        locs.append((startX, startY, endX, endY))

    # only make a predictions if at least one face was detected
    if len(faces) > 0:
        # for faster inference we'll make batch predictions on *all*
        # faces at the same time rather than one-by-one predictions
        # in the above `for` loop
        faces = np.array(faces, dtype="float32")
        preds = maskNet.predict(faces, batch_size=32)

    # return a 2-tuple of the face locations and their corresponding
    # predictions
    return (locs, preds)
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-f", "--face", type=str,
help="path to face detector model directory")
ap.add_argument("-m", "--model", type=str,
help="path to trained face mask detector model")
ap.add_argument("-c", "--confidence", type=float, default=0.5,
help="minimum probability to filter weak detections")
args = vars(ap.parse_args())
# load our serialized face detector model from disk
print("[INFO] loading face detector model...")
prototxtPath = os.path.sep.join([args["face"], "deploy.prototxt"])
weightsPath = os.path.sep.join([args["face"],
faceNet = cv2.dnn.readNet(prototxtPath, weightsPath)
# load the face mask detector model from disk
print("[INFO] loading face mask detector model...")
maskNet = load_model(args["model"])
# initialize the video stream and allow the camera sensor to warm up
print("[INFO] starting video stream...")
vs = cv2.VideoCapture(0 + cv2.CAP_DSHOW)
count = 0
Maskon = False
def startTimer():
global count
timer = threading.Timer(1, startTimer)
count += 1
if count > 5:
# loop over the frames from the video stream
while True:
# grab the frame from the threaded video stream and resize it
# to have a maximum width of 400 pixels
check, frame =
if isinstance(frame, np.ndarray):
# detect faces in the frame and determine if they are wearing a
# face mask or not
(locs, preds) = detect_and_predict_mask(frame, faceNet, maskNet)
# loop over the detected face locations and their corresponding
# locations
for (box, pred) in zip(locs, preds):
# unpack the bounding box and predictions
(startX, startY, endX, endY) = box
(mask, withoutMask) = pred
# determine the class label and color we'll use to draw
# the bounding box and text
label = "Mask" if mask > withoutMask else "No Mask"
color = (0, 255, 0) if label == "Mask" else (0, 0, 255)
if label == "Mask":
Maskon = True
Maskon = False
# include the probability in the label
label = "{}: {:.2f}%".format(label, max(mask, withoutMask) * 100)
# display the label and bounding box rectangle on the output
# frame
cv2.putText(frame, label, (startX, startY - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 2)
cv2.rectangle(frame, (startX, startY), (endX, endY), color, 2)
# show the output frame
cv2.imshow('Frame', frame)
key = cv2.waitKey(1) & 0xFF
if Maskon == True:
count = 0
if count > 70:
# do a bit of cleanup
This code does not count in real seconds, and I am still trying to make it do so.
I'd really appreciate your help.

I am trying to detect a simple object and then detect its color. I have created my own custom Haar cascade, but it converts the camera's image to grayscale. Maybe I could detect the object through a mask? I have not found any articles about this online.
Here is my code if you need it:
import cv2
import numpy as np
#path = 'haarcascades/haarcascade_eye.xml' # PATH OF THE CASCADE
path = 'haarcascades/Azuolas.xml' # PATH OF THE CASCADE
#path = 'haarcascades/haarcascade_frontalface_default.xml' # PATH OF THE CASCADE
#path = 'haarcascades/haarcascade_smile.xml' # PATH OF THE CASCAD
objectName = 'Azuolas' # OBJECT NAME TO DISPLAY
frameWidth= 640 # DISPLAY WIDTH
frameHeight = 480 # DISPLAY HEIGHT
color= (255,0,255)
cap = cv2.VideoCapture(0)
cap.set(3, frameWidth)
cap.set(4, frameHeight)
def empty(a):
cv2.createTrackbar("Min Area","Result",0,100000,empty)
cascade = cv2.CascadeClassifier(path)
while True:
cameraBrightness = cv2.getTrackbarPos("Brightness", "Result")
cap.set(10, cameraBrightness)
success, img =
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
scaleVal =1 + (cv2.getTrackbarPos("Scale", "Result") /1000)
neig=cv2.getTrackbarPos("Neig", "Result")
objects = cascade.detectMultiScale(gray,scaleVal, neig)
for (x,y,w,h) in objects:
print(objectName ,"is in my fov")
area = w*h
minArea = cv2.getTrackbarPos("Min Area", "Result")
if area >minArea:
roi_color = img[y:y+h, x:x+w]
cv2.imshow("Result", img)
if cv2.waitKey(1) & 0xFF == ord('q'):
Thanks in advance!!!!
I'm familiar with cv2, but haven't used it in years, so bear with me here. What I think you're saying is that when you call cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) it's overwriting the original img to grayscale, even though you've assigned it to the variable gray.
You probably need to copy the image before converting it to grayscale (see "Clone an image in cv2 Python"):
gray = cv2.cvtColor( img.copy(), cv2.COLOR_BGR2GRAY )

I created a neural network (CNN) that predicts a person's gender in real time using OpenCV. Everything works perfectly, but when I run the code the OpenCV video lags a lot, even though my webcam is not that bad. Here is my code:
Real-time Face Gender Recognition using a Convolutional Neural Network (CNN) and cv2
Here we run predictions with the saved, trained model
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import load_model
import numpy as np
import cv2
import os
import cvlib as cv
import imutils
# load the model
model = load_model('gender_detection.model')

# open the webcam and initialise the capture
webcam = cv2.VideoCapture(0, cv2.CAP_DSHOW)

classes = ['hombre', 'mujer']

# loop through frames
while webcam.isOpened():
    # read frame from webcam
    status, frame = webcam.read()
    # a failed read leaves no usable frame; stop instead of passing it
    # on to cv2.flip
    if not status:
        break
    #webcam.set(cv2.CAP_PROP_FPS, 1000)

    frame = cv2.flip(frame, 1)

    # apply face detection: cvlib only finds where the faces are; the
    # gender is decided by the neural network below
    face, confidence = cv.detect_face(frame)

    # loop through detected faces
    for f in face:
        # get corner points of face rectangle
        startX, startY = f[0], f[1]
        endX, endY = f[2], f[3]

        # draw the rectangle over the face
        cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)

        # crop the detected face region
        face_crop = np.copy(frame[startY:endY, startX:endX])

        # skip crops too small to classify
        if face_crop.shape[0] < 10 or face_crop.shape[1] < 10:
            continue

        # preprocessing for gender detection model
        face_crop = cv2.resize(face_crop, (96, 96))
        face_crop = face_crop.astype("float") / 255.0
        face_crop = img_to_array(face_crop)
        face_crop = np.expand_dims(face_crop, axis=0)

        # apply gender detection on the face with the model; this call
        # runs once per face per frame inside the display loop
        conf = model.predict(face_crop)[0]

        # get the label with the highest score
        best_idx = np.argmax(conf)
        label = classes[best_idx]

        label = "{}: {:.2f}".format(label, conf[best_idx] * 100)

        Y = startY - 10 if startY - 10 > 10 else startY + 10

        # write label and confidence above the face rectangle
        cv2.putText(frame, label, (startX, Y), cv2.FONT_HERSHEY_SIMPLEX,
                    0.7, (0, 255, 0), 2)

    # display output
    cv2.imshow("Gender Detection", frame)

    # press "Q" to stop
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# release resources
webcam.release()
cv2.destroyAllWindows()
I also tried to use cv2.CAP_PROP_FPS, but that only helps a little bit, not much.
I've had this same problem using openCV video capture with text-detection. It's not the webcam quality, but the fact that openCV can only show you frames as fast as you can process them in your gender detection. The solution that worked for me is to use multi-threading.
You can create a thread for the OpenCV video capture, then another thread for your image processing. The caveat: you can't magically make your image processing happen quicker without making changes to the image processing itself. It takes as long as it takes. What you can do is allow OpenCV to work on its own and send the frames to an exchange class, then allow the image processing to grab a frame and work at its own pace while cv2 continues as normal.
Here is a (shortened) version of my class for OCR image processing. You can see that in start() I'm creating a thread pointed at the ocr() process. This is where your gender identification process can go.
class OCR:
    """Worker that processes frames taken from a video-stream exchange.

    ``start()`` runs ``ocr()`` on its own thread. That loop reads the latest
    frame from ``self.exchange`` (set via ``set_exchange``), so processing
    runs independently of the capture thread.
    """

    # def __init__(self, exchange: VideoStream, language=None):
    def __init__(self):
        # Source of frames; stays None until set_exchange() is called
        self.exchange = None
        # Loop flag read by ocr(); must exist before the thread starts
        self.stopped = False
        # init stuff for OCR not relevant to my example, but note that it
        # takes a VideoStream class called exchange which is where this class
        # grabs frames to process

    def start(self):
        """Launch the processing loop on a new thread and return self."""
        Thread(target=self.ocr, args=()).start()
        return self

    def set_exchange(self, video_stream):
        """Set the object whose ``frame`` attribute the loop reads."""
        self.exchange = video_stream

    def ocr(self):
        """Process the newest frame repeatedly until ``stopped`` is set."""
        while not self.stopped:
            if self.exchange is not None:
                frame = self.exchange.frame
                # # # OCR stuff goes here
Now here is the VideoStream class that grabs frames at its own pace, in a different thread. The image processing class (OCR) can then take these frames at its own pace and the two don't affect each other's performance.
class VideoStream:
    """Class for CV2 video capture. The start() method will create a new
    thread to read the video stream"""

    def __init__(self, src=0):
        """Open capture source ``src`` and read an initial frame."""
        self.stream = cv2.VideoCapture(src)
        # Prime self.frame so consumers have a frame before the thread runs
        (self.grabbed, self.frame) = self.stream.read()
        # self._boxes = None
        self.stopped = False

    def start(self):
        """Launch the frame-grabbing loop on a new thread and return self."""
        Thread(target=self.get, args=()).start()
        return self

    def get(self):
        """Keep replacing ``self.frame`` with the newest frame until stopped."""
        while not self.stopped:
            (self.grabbed, self.frame) = self.stream.read()

    def get_video_dimensions(self):
        """Return the capture's ``(width, height)`` as integers."""
        width = self.stream.get(cv2.CAP_PROP_FRAME_WIDTH)
        height = self.stream.get(cv2.CAP_PROP_FRAME_HEIGHT)
        return int(width), int(height)

    def stop_process(self):
        """Ask the frame-grabbing loop to exit."""
        self.stopped = True
Then you can do your CV2 imshow loop as you normally would.
exchange = VideoStream(0).start()
ocr = OCR().start()
# Give the processing thread its frame source; until an exchange is set
# the OCR loop has nothing to read.
ocr.set_exchange(exchange)

while True:  # Begins a loop for the real-time OCR display
    pressed_key = cv2.waitKey(1) & 0xFF
    if pressed_key == ord('q'):
        # NOTE(review): stop_stream_ocr is defined outside this snippet;
        # presumably it stops both threads - confirm.
        stop_stream_ocr(exchange, ocr)
        break

    # Always show the newest captured frame, independent of how far the
    # processing thread has got
    frame = exchange.frame
    cv2.imshow("Video Get Frame", frame)
Note that you can't really control what thread CV2 decides to use in its internal workings, but this approach will allow you to display your webcam at its natural fps and while image processing happens in the background.

I have a video (mp4) that I read into a NumPy array and edit. Now I want to convert the array back into a video.
I already tried to use VideoWriter, but the created mp4 was always corrupt:
# Frames kept in memory so they can be written back out as a video.
frame_array = []

with detection_graph.as_default():
    with tf.Session(graph=detection_graph) as sess:
        # The graph tensors do not change between frames, so look them up
        # once instead of on every iteration.
        image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
        boxes_tensor = detection_graph.get_tensor_by_name('detection_boxes:0')
        scores_tensor = detection_graph.get_tensor_by_name('detection_scores:0')
        classes_tensor = detection_graph.get_tensor_by_name('detection_classes:0')
        num_tensor = detection_graph.get_tensor_by_name('num_detections:0')

        frame_index = 0
        while cap.isOpened():
            ret, image_np = cap.read()
            # A failed read means the video is finished; without this
            # break the loop would spin while the capture stays open.
            if not ret:
                break

            image_np_expanded = np.expand_dims(image_np, axis=0)
            (boxes, scores, classes, num_detections) = sess.run(
                [boxes_tensor, scores_tensor, classes_tensor, num_tensor],
                feed_dict={image_tensor: image_np_expanded})

            cv2.imwrite('pictures/' + str(frame_index) + '.jpg', image_np)
            # Keep the frame; otherwise there is nothing to write later.
            frame_array.append(image_np)
            frame_index += 1

print("Everything detected")

if frame_array:
    # VideoWriter silently drops frames whose size differs from the size
    # it was opened with, so take the size from the frames themselves.
    frame_height, frame_width = frame_array[0].shape[:2]
    # 'mp4v' matches the .mp4 container; 'DIVX' is normally paired with
    # .avi and commonly yields an unplayable .mp4.
    out = cv2.VideoWriter('video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 30,
                          (frame_width, frame_height))
    for frame in frame_array:
        out.write(frame)
    # The file is only finalised on release(); skipping it leaves the
    # output truncated.
    out.release()
The created mp4 was always corrupt and couldn't be opened. Every picture saved with cv2.imwrite is fine and viewable.

