YUV frame averaging using opencv - python

I am trying to read a yuv video, then perform frame averaging, however, I run into problems when doing so, my code is shown below (this was made by combining a yuv reader and well as code to split the frame into the separate y, u,v components as well as adding what I thought was the way to proceed with this)
I get an error that the argument should be int or none not float but doesn't reference the line, but I do know this has to do with the reading the yuv file part. I am trying to average every two frames within the video. The YUV file is 10bit, 420
import cv2
import numpy as np
class readYUV:
def __init__(self, filename, size):
self.height, self.width = size
self.frame_len = self.width * self.height * 3 / 2
self.f = open(filename, 'rb')
self.shape = (int(self.height*1.5), self.width)
def read_raw(self):
raw = self.f.read(self.frame_len)
yuv = np.frombuffer(raw, dtype=np.uint8)
yuv = yuv.reshape(self.shape)
except Exception as e:
print (str(e))
return False, None
return True, yuv
def read(self):
ret, yuv = self.read_raw()
if not ret:
return ret, yuv
bgr = cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR_NV21)
return ret, bgr
def make_lut_u():
return np.array([[[i, 255 - i, 0] for i in range(256)]], dtype=np.uint8)
def make_lut_v():
return np.array([[[0, 255 - i, i] for i in range(256)]], dtype=np.uint8)
# otherwise, split the frame into its respective channels
def splitter(img):
cv2.imshow("frame", img)
img_yuv = cv2.cvtColor(img, cv2.COLOR_BGR2YUV)
y, u, v = cv2.split(img_yuv)
lut_u, lut_v = make_lut_u(), make_lut_v()
# Convert back to BGR so we can apply the LUT and stack the images
y = cv2.cvtColor(y, cv2.COLOR_GRAY2BGR)
u = cv2.cvtColor(u, cv2.COLOR_GRAY2BGR)
v = cv2.cvtColor(v, cv2.COLOR_GRAY2BGR)
u_mapped = cv2.LUT(u, lut_u)
v_mapped = cv2.LUT(v, lut_v)
result = np.vstack([img, y, u_mapped, v_mapped])
return result
if __name__ == "__main__":
filename = "file.yuv"
size = (3840, 2160)
cap = readYUV(filename, size)
while 1:
ret, frame = cap.read()
result1 = splitter(frame)
ret2, frame2 = cap.read()
result2 = splitter(frame2)
result = (result1 +result2)/2
cv2.imwrite('average.png', result)
Template matching with updating templates from previous detection

I am trying to write a code where after matching with a given template, the detected part of that frame becomes the template for the next frame.
temp = "image.png"
while True:
_, frame = cap.read()
copy = frame.copy()
for pt in zip(*loc[::-1]):
point = pt
cropped_image = copy[point[1]:point[1]+h, point[0]:point[0]+w]
temp = cropped_image #update the template
but after writing this code the template matching is going in totally wrong direction, even though if i remove the "temp = cropped_image" then the cropped_image is actually good.
You can find x,y,w,h of the matched image with cv2.minMaxLoc()
import cv2
src = cv2.imread("source.png", cv2.IMREAD_GRAYSCALE)
templit = cv2.imread("initial_template.png", cv2.IMREAD_GRAYSCALE)
result = cv2.matchTemplate(src, templit, cv2.TM_SQDIFF_NORMED)
minVal, maxVal, minLoc, maxLoc = cv2.minMaxLoc(result)
x, y = minLoc
h, w = templit.shape
cropped_img = src[y: y + h, x: x + w]
//do template matching again with cropped_image

ValueError: could not broadcast input array from shape (320,320,3) into shape (640,640,3)

I've been trying to run a detection model on a raspberry pi but when I try I get the error that:
could not broadcast input array from shape (320,320,3) into shape (640,640,3)
when I run this
import re
import cv2
from tflite_runtime.interpreter import Interpreter
import numpy as np
def load_labels(path='labels.txt'):
"""Loads the labels file. Supports files with or without index numbers."""
with open(path, 'r', encoding='utf-8') as f:
lines = f.readlines()
labels = {}
for row_number, content in enumerate(lines):
pair = re.split(r'[:\s]+', content.strip(), maxsplit=1)
if len(pair) == 2 and pair[0].strip().isdigit():
labels[int(pair[0])] = pair[1].strip()
labels[row_number] = pair[0].strip()
return labels
def set_input_tensor(interpreter, image):
"""Sets the input tensor."""
tensor_index = interpreter.get_input_details()[0]['index']
input_tensor = interpreter.tensor(tensor_index)()[0]
input_tensor[:, :] = np.expand_dims((image-255)/255, axis=0)
def get_output_tensor(interpreter, index):
"""Returns the output tensor at the given index."""
output_details = interpreter.get_output_details()[index]
tensor = np.squeeze(interpreter.get_tensor(output_details['index']))
return tensor
def detect_objects(interpreter, image, threshold):
"""Returns a list of detection results, each a dictionary of object info."""
set_input_tensor(interpreter, image)
# Get all output details
boxes = get_output_tensor(interpreter, 0)
classes = get_output_tensor(interpreter, 1)
scores = get_output_tensor(interpreter, 2)
count = int(get_output_tensor(interpreter, 3))
results = []
for i in range(count):
if scores[i] >= threshold:
result = {
'bounding_box': boxes[i],
'class_id': classes[i],
'score': scores[i]
return results
def main():
labels = load_labels()
interpreter = Interpreter('detect.tflite')
_, input_height, input_width, _ = interpreter.get_input_details()[0]['shape']
cap = cv2.VideoCapture(0)
while cap.isOpened():
ret, frame = cap.read()
img = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), (320,320))
res = detect_objects(interpreter, img, 0.8)
for result in res:
ymin, xmin, ymax, xmax = result['bounding_box']
xmin = int(max(1,xmin * CAMERA_WIDTH))
xmax = int(min(CAMERA_WIDTH, xmax * CAMERA_WIDTH))
ymin = int(max(1, ymin * CAMERA_HEIGHT))
ymax = int(min(CAMERA_HEIGHT, ymax * CAMERA_HEIGHT))
cv2.rectangle(frame,(xmin, ymin),(xmax, ymax),(0,255,0),3)
cv2.putText(frame,labels[int(result['class_id'])],(xmin, min(ymax, CAMERA_HEIGHT-20)), cv2.FONT_HERSHEY_SIMPLEX, 0.5,(255,255,255),2,cv2.LINE_AA)
cv2.imshow('Pi Feed', frame)
if cv2.waitKey(10) & 0xFF ==ord('q'):
if __name__ == "__main__":
the model is an SSD Mobilenet 640x640 and the images for the model were taken on the raspberry pi as 1028x720 but were downscaled during model training. But I still get this error and I'm not sure how to fix it.

How use for loop many variable?

I am using Yolov5. I want change my webcam -> lancamera
class LoadStreams: # multiple IP or RTSP cameras
def __init__(self, sources='streams.txt', img_size=640):
self.mode = 'images'
self.img_size = img_size
if os.path.isfile(sources):
with open(sources, 'r') as f:
sources = [x.strip() for x in f.read().splitlines() if len(x.strip())]
sources = [sources]
n = len(sources)
self.imgs = [None] * n
self.sources = sources
for i, s in enumerate(sources):
# Start the thread to read frames from the video stream
print('%g/%g: %s... ' % (i + 1, n, s), end='')
cap = cv2.VideoCapture(eval(s) if s.isnumeric() else s)
assert cap.isOpened(), 'Failed to open %s' % s
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) % 100
_, self.imgs[i] = cap.read() # guarantee first frame
thread = Thread(target=self.update, args=([i, cap]), daemon=True)
print(' success (%gx%g at %.2f FPS).' % (w, h, fps))
print('') # newline
# check for common shapes
s = np.stack([letterbox(x, new_shape=self.img_size)[0].shape for x in self.imgs], 0) # inference shapes
self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
if not self.rect:
print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
def update(self, index, cap):
# Read next stream frame in a daemon thread
n = 0
while cap.isOpened():
n += 1
# _, self.imgs[index] = cap.read()
if n == 4: # read every 4th frame
_, self.imgs[index] = cap.retrieve()
n = 0
time.sleep(0.01) # wait time
def __iter__(self):
self.count = -1
return self
def __next__(self):
self.count += 1
img0 = self.imgs.copy()
if cv2.waitKey(1) == ord('q'): # q to quit
raise StopIteration
# Letterbox
img = [letterbox(x, new_shape=self.img_size, auto=self.rect)[0] for x in img0]
# Stack
img = np.stack(img, 0)
# Convert
img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416
img = np.ascontiguousarray(img)
return self.sources, img, img0, None
def __len__(self):
return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years
this code return 'self.sources, img, img0, None'
if webcam:
view_img = True
cudnn.benchmark = True # set True to speed up constant image size inference
dataset = LoadStreams(source, img_size=imgsz)
I use 'dataset'
for path, img, im0s, vid_cap in dataset:
img = torch.from_numpy(img).to(device)
img = img.half() if half else img.float() # uint8 to fp16/32
img /= 255.0 # 0 - 255 to 0.0 - 1.0
How to use for path, img, im0s, vid_cap in dataset: ??
my lancam code
def livecame():
vimba = Vimba()
system = vimba.system()
camera_ids = vimba.camera_ids()
# for cam_id in camera_ids:
# print("Camera found: ", cam_id)
c0 = vimba.camera(camera_ids[0])
pixel_format = c0.feature("PixelFormat")
pixel_format.value = "BayerBG8"
c0.StreamBytesPerSecond = 100000000
frame = c0.new_frame()
success = True
success = False
frame_data = frame.buffer_data()
k = cv2.waitKey(1)
if k == 0x1b:
if success:
img = np.ndarray(buffer=frame_data,
shape=(frame.data.height, frame.data.width, 1))
img = cv2.cvtColor(img, cv2.COLOR_BAYER_BG2RGB)
img0 = img.copy()
img = img.tolist()
img = [letterbox(x, new_shape=(800,400), auto= True)[0] for x in img0]
#img = np.ascontiguousarray(img)
img = np.stack(img, 0)
#img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416
img = np.ascontiguousarray(img)
return ['0'], img, img0
but I use dataset = new_file.livecame()
I can see error ValueError: not enough values to unpack (expected 3, got 1)
in for path, img, im0s, vid_cap in dataset:
how to use many variable? in for loop?
In Python OpenCV, one way is simply to use zip.
for component in zip(contours, hierarchy):
cntr = component[0]
hier = component[1]

How can I convert pytorch cpu-based transformation to cuda-based?

The original issue for the code is availablehere.
I am using this repository for a line segmentation project and I developed this code to get an input (whether image or video) and draw road lines on it and give it in output:
import argparse
import sys
from time import time, clock
from os.path import splitext, basename, exists
from model import SCNN
from utils.check_extension import is_video, is_image
from utils.transforms import *
# I will put all the necessary code for utils.transforms after this
# ------------------------------------------------ SCNN parameters
time1 = time()
net = SCNN(input_size=(800, 288), pretrained=False)
mean = (0.3598, 0.3653, 0.3662) # CULane mean, std
std = (0.2573, 0.2663, 0.2756)
transform_img = Resize((800, 288))
transform_to_net = Compose(ToTensor(), Normalize(mean=mean, std=std))
# ------------------------------------------------ Arguments
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--weights', type=str,
help='path to vgg models')
parser.add_argument('--input', type=str, default='demo/line_3.mp4',
help='path to image file')
parser.add_argument('--output', type=str, default='public/',
help='path to the output directory')
args = parser.parse_args()
return args
def main():
args = parse_args()
filename, extension = splitext(basename(args.input))
print("Loading file [{}] ....".format(filename))
if not exists(args.input):
print("file [{}] is not recognized".format(args.input))
if is_video(extension):
video_capture = cv2.VideoCapture()
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output = args.output + filename + '.avi'
if video_capture.open(args.input):
property_id = int(cv2.CAP_PROP_FRAME_COUNT)
total_frames = int(cv2.VideoCapture.get(video_capture, property_id))
frame_no = 1
width, height = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)), \
fps = video_capture.get(cv2.CAP_PROP_FPS)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
save_dict = torch.load(args.weights, map_location=device)
# can't write out mp4, so try to write into an AVI file
video_writer = cv2.VideoWriter(output, fourcc, fps, (width, height))
while video_capture.isOpened():
start = time()
ret, frame = video_capture.read()
if not ret:
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = transform_img({'img': frame})['img']
x = transform_to_net({'img': frame})['img']
stop1 = time()
print('stop1: ', stop1 - start)
seg_pred, exist_pred = net(x)[:2]
seg_pred = seg_pred.detach().cpu().numpy()
exist_pred = exist_pred.detach().cpu().numpy()
seg_pred = seg_pred[0]
stop2 = time()
print('stop2: ', stop2 - stop1)
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
lane_img = np.zeros_like(frame)
color = np.array([[255, 125, 0], [0, 255, 0], [0, 0, 255], [0, 255, 255]], dtype='uint8')
coord_mask = np.argmax(seg_pred, axis=0)
for i in range(0, 4):
if exist_pred[0, i] > 0.5:
lane_img[coord_mask == (i + 1)] = color[i]
img = cv2.addWeighted(src1=lane_img, alpha=0.8, src2=frame, beta=1., gamma=0.)
img = cv2.resize(img, (width, height))
stop3 = time()
print('stop3: ', stop3 - stop2)
# if frame_no % 20 == 0:
# print('# {}/{} frames processed!'.format(frame_no, total_frames))
frame_no += 1
end = time()
print('Whole loop: {} seconds'.format(end - start))
print('# All frames processed ')
elif is_image(extension):
img = cv2.imread(args.input)
height, width, _ = img.shape
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = transform_img({'img': img})['img']
x = transform_to_net({'img': img})['img']
save_dict = torch.load(args.weights, map_location='cpu')
seg_pred, exist_pred = net(x)[:2]
seg_pred = seg_pred.detach().cpu().numpy()
exist_pred = exist_pred.detach().cpu().numpy()
seg_pred = seg_pred[0]
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
lane_img = np.zeros_like(img)
color = np.array([[255, 125, 0], [0, 255, 0], [0, 0, 255], [0, 255, 255]], dtype='uint8')
coord_mask = np.argmax(seg_pred, axis=0)
for i in range(0, 4):
if exist_pred[0, i] > 0.5:
lane_img[coord_mask == (i + 1)] = color[i]
img = cv2.addWeighted(src1=lane_img, alpha=0.8, src2=img, beta=1., gamma=0.)
img = cv2.resize(img, (width, height))
output = args.output + filename + '.jpg'
cv2.imwrite(output, img)
print("file format [{}] is not supported".format(args.input))
if __name__ == '__main__':
The code which belong to Resize, ToTensor, Normalize, Compose are here:
class Compose(CustomTransform):
All transform in Compose should be able to accept two non None variable, img and boxes
def __init__(self, *transforms):
self.transforms = [*transforms]
def __call__(self, sample):
for t in self.transforms:
sample = t(sample)
return sample
def __iter__(self):
return iter(self.transforms)
def modules(self):
yield self
for t in self.transforms:
if isinstance(t, Compose):
for _t in t.modules():
yield _t
yield t
class Normalize(CustomTransform):
def __init__(self, mean, std):
self.transform = Normalize_th(mean, std)
def __call__(self, sample):
img = sample.get('img')
img = self.transform(img)
_sample = sample.copy()
_sample['img'] = img
return _sample
class ToTensor(CustomTransform):
def __init__(self, dtype=torch.float):
def __call__(self, sample):
img = sample.get('img')
segLabel = sample.get('segLabel', None)
exist = sample.get('exist', None)
img = img.transpose(2, 0, 1)
img = torch.from_numpy(img).type(self.dtype) / 255.
if segLabel is not None:
segLabel = torch.from_numpy(segLabel).type(torch.long)
if exist is not None:
exist = torch.from_numpy(exist).type(torch.float32) # BCEloss requires float tensor
_sample = sample.copy()
_sample['img'] = img
_sample['segLabel'] = segLabel
_sample['exist'] = exist
return _sample
class Resize(CustomTransform):
def __init__(self, size):
if isinstance(size, int):
size = (size, size)
self.size = size #(W, H)
def __call__(self, sample):
img = sample.get('img')
segLabel = sample.get('segLabel', None)
img = cv2.resize(img, self.size, interpolation=cv2.INTER_CUBIC)
if segLabel is not None:
segLabel = cv2.resize(segLabel, self.size, interpolation=cv2.INTER_NEAREST)
_sample = sample.copy()
_sample['img'] = img
_sample['segLabel'] = segLabel
return _sample
def reset_size(self, size):
if isinstance(size, int):
size = (size, size)
self.size = size
The code works fine but I found out that its too slow for testing in real-time application. I added some time measurement to see if I can find out the bottlenecks and this is the output for one loop:
stop1: 0.002989053726196289
stop2: 1.4032211303710938
stop3: 0.004946708679199219
Whole loop: 1.41636061668396 seconds
These lines happened to be the most computationally expensive lines:
seg_pred, exist_pred = net(x)[:2]
seg_pred = seg_pred.detach().cpu().numpy()
exist_pred = exist_pred.detach().cpu().numpy()
seg_pred = seg_pred[0]
Now I am stuck with this issue that how I can modify the code to improve the computation speed.
Initially I thought of modifying the code to allow cuda computation. I asked the main author how I can modify the code for cuda version in here and he pointed out to these lines:
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = transform_img({'img': frame})['img']
x = transform_to_net({'img': frame})['img']
Unfortunately my experience with pytorch is not much, so I am asking for help now.
I hope the information I shared suffices for the readers. Any help would be appreciated
Set device:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
what he means his putting the data on device:
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = transform_img({'img': frame})['img']
x = transform_to_net({'img': frame})['img']

Face Recognition with multiple faces don't detect multiple faces

I've written a small program, which detects faces and saves them to an Train file for the recognition.
I have some trouble with this algorithm. Sometimes it throws the error, that the LBPH::Train was feed with empty data, which is wrong.
OpenCV Error: Unsupported format or combination of formats (Empty training data was given. You'll need more than one sample to learn a model.) in cv::LBPH::train, file ........\opencv\modules\contrib\src\facerec.cpp, line 917
Traceback (most recent call last):
Moreover the algorithm detects multiple faces, but recognizes it just as the same face, which is wrong.
Could someone give me a hint on what I'm missing?
import cv2
import os
import numpy as np
import sys
i = 0
global allFaces
global first
first = True
allFaces = []
cap = cv2.VideoCapture(0)
faceCascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
recognizer = cv2.createLBPHFaceRecognizer()
id = 0
class Face:
def __init__(self, id, face):
self.id = id
self.face = face
self.gatheredFaces = []
def main(self):
def getFace(self):
return self.face
def setKnownFace(self):
self.known = False
def getKownFace(self):
return self.knwon
def getId(self):
return self.id
def setFacesInfo(self, frame, face):
x,y,h,w = face
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
self.gatheredFaces.append(gray[y:y+h, x:x+w])
# count = 0
# while (count != 10):
# gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
# cv2.imshow("frame in set", frame)
# faces = faceCascade.detectMultiScale(gray)
# for face in faces:
# self.gatheredFaces.append(gray[y:y+h,x:x+w])
# cv2.imshow("gathered Faces", self.gatheredFaces[0])
# cv2.imwrite("dataSet/User"+ str(self.getId()) +".jpg", gray)
# count = count+1
# cv2.waitKey(30)
def getFacesInfo(self):
return self.gatheredFaces
def trainDetector(self):
faceSamples = []
Ids = []
print("laenge von gathered FAces")
for (i) in range(len(allFaces)):
temp = allFaces[i].getFacesInfo()
for (j) in range(len(temp)):
imageNP = np.array(temp[j], 'uint8')
id = allFaces[i].getId()
faces = faceCascade.detectMultiScale(imageNP)
for (x,y,h,w) in faces:
recognizer.train(faceSamples, np.array(Ids))
def updateDetector(self):
faceSamples = []
Ids = []
for (i) in range(len(allFaces)):
temp = allFaces[i].getFacesInfo()
for (j) in range(len(temp)):
imageNP = np.array(temp[j], 'uint8')
id = allFaces[i].getId()
faces = faceCascade.detectMultiScale(imageNP)
for (x,y,h,w) in faces:
recognizer.update(faceSamples, np.array(Ids))
while True:
ret, frame = cap.read()
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
cv2.imshow("actual Frame", frame)
cv2.imshow("gray", gray)
faces = faceCascade.detectMultiScale(gray, 1.3, 5)
for face in faces:
x,y,h,w = face
temp = Face(id, frame[y:y+h,x:x+w])
temp = None
id = id+1
detector = cv2.SIFT()
flannParam = dict(algorithm = FLANN_INDEX_KDTREE, tree = 5)
flann = cv2.FlannBasedMatcher(flannParam,{})
trainImg = allFaces[0].getFace()
trainKP, trainDecs = detector.detectAndCompute(trainImg, None)
if((len(allFaces)==1) and first):
print("only one object in allFaces")
for i in range(10):
allFaces[0].setFacesInfo(frame, face)
first = False
for(i) in range(len(allFaces)):
QueryImg = cv2.cvtColor(allFaces[i].getFace(), cv2.COLOR_BGR2GRAY)
queryKP, queryDesc = detector.detectAndCompute(QueryImg, None)
matches = flann.knnMatch(queryDesc, trainDecs, k = 2)
goodMatch = []
for m, n in matches:
if(m.distance < 0.75 * n.distance):
if(len(goodMatch) > 30):
print("good match")
tp = []
qp = []
for m in goodMatch:
tp, qp = np.float32((tp, qp))
H, status = cv2.findHomography(tp, qp, cv2.RANSAC, 3.0)
print ("bad match")
for i in range(10):
allFaces[len(allFaces)-1].setFacesInfo(frame, face)
for (x,y,w,h) in faces:
cv2.rectangle(frame, (x,y), (x+w,y+h), (0,0,255),2)
tempid, conf = recognizer.predict(gray[y:y+h,x:x+w])
cv2.cv.PutText(cv2.cv.fromarray(frame), str(tempid),(x,y+h),font,(0,0,255))
cv2.imshow("detectedFace", frame)

