I'm trying to save both, the depth and color images of the Intel Realsense D435i camera in a list of 300 images. Then I will use multiprocessing to save this chunk of 300 images onto my disk. But every time I try, the program successfully appends 15 images in the list and then I get this error:
Frame didn't arrived within 5000
I made sure I had the 64 bit version on python 3.6 installed and the camera streams perfectly well when I do not try to save the images in a list. The real-sense viewer works great too. I also tried with different resolutions and frame rates but it doesn't seem to work either. What is interesting is if I only save the color images, I will not get the same error, instead I will get the same color image over and over in the list.
if __name__ == '__main__':
pipeline = rs.pipeline()
config = rs.config()
config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
config.enable_stream(rs.stream.color, 1280, 720, rs.format.bgr8, 30)
profile = pipeline.start(config)
depth_sensor = profile.get_device().first_depth_sensor()
depth_sensor.set_option(
rs.option.visual_preset, 3
) # Set high accuracy for depth sensor
depth_scale = depth_sensor.get_depth_scale()
align_to = rs.stream.color
align = rs.align(align_to)
# Init variables
im_count = 0
image_chunk = []
image_chunk2 = []
# sentinel = True
try:
while True:
# Wait for a coherent pair of frames: depth and color
frames = pipeline.wait_for_frames()
aligned_frames = align.process(frames)
aligned_depth_frame = aligned_frames.get_depth_frame()
color_frame = aligned_frames.get_color_frame()
if not aligned_depth_frame or not color_frame:
print("problem here")
raise RuntimeError("Could not acquire depth or color frames.")
depth_image = np.asanyarray(aligned_depth_frame.get_data())
color_image = np.asanyarray(color_frame.get_data())
image_chunk.append(color_image)
image_chunk2.append(depth_image)
except Exception as e:
print(e)
finally:
# Stop streaming
pipeline.stop()
I simply need it to save 300 images in a row, that's all, so I am quite troubled as to what is causing this issue.
Holding onto the frame locks the memory, and eventually it hits a limit, which prevents acquiring more images. Even though you are creating an image, the data is still from the frame. You need to clone the image after you create it to release the link to the frame's memory.
depth_image = np.asanyarray(aligned_depth_frame.get_data())
color_image = np.asanyarray(color_frame.get_data())
depth_image = depth_image.copy()
color_image = color_image.copy()
image_chunk.append(color_image)
image_chunk2.append(depth_image)
Read more on frames and memory management here:
https://dev.intelrealsense.com/docs/frame-management
I created a wrapper class to extract the various elements out of the frame set that can't be recreated later. It's a bit heavy, but shows some common operations that might be helpful for others:
colorizer = None
align_to_depth = None
align_to_color = None
pointcloud = rs.pointcloud()
class IntelD435ImagePacket:
"""
Class that contains image and associated processing data.
"""
#property
def frame_id(self):
return self._frame_id
#property
def timestamp(self):
return self._timestamp
#property
def image_color(self):
return self._image_color
#property
def image_depth(self):
return self._image_depth
#property
def image_color_aligned(self):
return self._image_color_aligned
#property
def image_depth_aligned(self):
return self._image_depth_aligned
#property
def image_depth_colorized(self):
if not self._image_depth_colorized:
self._image_depth_colorized = cv2.applyColorMap(self.image_depth, cv2.COLORMAP_JET);
return self._image_depth_colorized
#property
def intrinsics(self):
return self._intrinsics
#property
def pointcloud(self):
return self._pointcloud
#property
def pointcloud_texture(self):
return self._pointcloud_texture
def _rs_intrinsics_to_opencv_matrix(self, rs_intrinsics):
fx = rs_intrinsics.fx
fy = rs_intrinsics.fy
cx = rs_intrinsics.ppx
cy = rs_intrinsics.ppy
s = 0 # skew
return np.array([fx, s, cx,
0, fy, cy,
0, 0, 1]).reshape(3, 3)
def __init__(self, frame_set, frame_id=None, timestamp=None, *args, **kwargs):
global colorizer
if not colorizer:
colorizer = rs.colorizer()
colorizer.set_option(rs.option.color_scheme, 0)
global align_to_depth
if not align_to_depth:
align_to_depth = rs.align(rs.stream.depth)
global align_to_color
if not align_to_color:
align_to_color = rs.align(rs.stream.color)
global pointcloud
if not pointcloud:
pointcloud = rs.pointcloud()
# Get intrinsics
profile = frame_set.get_profile()
video_stream_profile = profile.as_video_stream_profile()
rs_intrinsics = video_stream_profile.get_intrinsics()
self._intrinsics = self._rs_intrinsics_to_opencv_matrix(rs_intrinsics)
# Get pointcloud
depth_frame = frame_set.get_depth_frame()
color_frame = frame_set.get_color_frame()
pointcloud.map_to(color_frame)
points = pointcloud.calculate(depth_frame)
vtx = np.asanyarray(points.get_vertices())
points_arr = vtx.view(np.float32).reshape(vtx.shape + (-1,)).copy()
self._pointcloud = points_arr
# Get pointcloud texture mapping
tex = np.asanyarray(points.get_texture_coordinates())
color_map_arr = tex.view(np.float32).reshape(tex.shape + (-1,)).copy()
self._pointcloud_texture = color_map_arr
# Extract color image
color_frame = frame_set.get_color_frame()
self._image_color = np.asanyarray(color_frame.get_data()).copy()
# Extract depth image
depth_frame = frame_set.get_depth_frame()
self._image_depth = np.asanyarray(depth_frame.get_data()).copy()
# Align the color frame to depth frame and extract color image
color_frame_aligned = align_to_depth.process(frame_set).get_color_frame()
self._image_color_aligned = np.asanyarray(color_frame_aligned.get_data()).copy()
# Align the depth frame to color frame and extract depth image
depth_frame_aligned = align_to_color.process(frame_set).get_depth_frame()
self._image_depth_aligned = np.asanyarray(depth_frame_aligned.get_data()).copy()
self._image_depth_colorized = None
if frame_id:
self._frame_id = frame_id
else:
self._frame_id = frame_set.frame_number
if timestamp:
self._timestamp = timestamp
else:
self._timestamp = frame_set.timestamp
self.__dict__.update(kwargs)
Related
So I'm trying to resize an image and maintain its ratio so that it perfectly fits in 1980x1080 in the moviepy library.
Currently, I'm doing this with a function like this:
def FitClip(size):
#size is basicly clip.size
clipRes = size
#print(size)
v = ''
if clipRes[0] >= clipRes[1]:
toresize = 1980
v = 'h'
else:
toresize = 1080
v = 'v'
return [toresize, v]
and I'm calling it like this:
def generate_clip_var(clip_name, start_time):
clip_audio = AudioFileClip(f"out/{clip_name}.mp3").set_start(start_time + 2)
clip_video = ImageClip(f"out/{clip_name}.jpg").set_duration(1).set_start(start_time)
if FitClip(clip_video.size)[1] == 'v':
clip_video = ImageClip(f"out/{clip_name}.jpg").set_duration(clip_audio.duration + 1).set_position("center").set_audio(clip_audio).resize(height = FitClip(clip_video.size)[0]).set_start(start_time)
else:
clip_video = ImageClip(f"out/{clip_name}.jpg").set_duration(clip_audio.duration + 1).set_position("center").set_audio(clip_audio).resize(width = FitClip(clip_video.size)[0]).set_start(start_time)
return [clip_audio, clip_video]
My problem is that whenever image is too small or too big it just goes outside bounds.
help
You could try the moviepy native function resize():
from moviepy.video.fx.resize import resize
def generate_clip_var(clip_name, start_time):
clip_audio = AudioFileClip(f"out/{clip_name}.mp3").set_start(start_time + 2)
clip_video = ImageClip(f"out/{clip_name}.jpg").set_duration(1).set_start(start_time)
# Resize the clip_video object to fit within a 1980x1080 frame while maintaining its aspect ratio
clip_video = resize(clip_video, width=1980, height=1080)
# Set the duration and audio of the resized clip_video object
clip_video = clip_video.set_duration(clip_audio.duration + 1).set_position("center").set_audio(clip_audio).set_start(start_time)
return [clip_audio, clip_video]
I've followed the End-to-End image classification tutorial for tensorflow lite and have created and saved my model as '/path/to/model.tflite'.
What I haven't been able to figure out is how to load it.
I'm looking for some kind of syntax that is similar to this:
from tflite_model_maker import image_classifier
from tflite_model_maker.image_classifier import DataLoader
model = image_classifier.Load('/path/to/model.tflite')
I'm sure I'm missing something obvious here. This is definitely not the first place I've looked at. This seems to be the best place for me to find what I need, but the syntax used confuses me.
What do I want to be able to do with the model?
test = DataLoader.from_folder('/path/to/testImages')
loss, accuracy = model.evaluate(test)
# A helper function that returns 'red'/'black' depending on if its two input
# parameter matches or not.
def get_label_color(val1, val2):
if val1 == val2:
return 'black'
else:
return 'red'
# Then plot 100 test images and their predicted labels.
# If a prediction result is different from the label provided label in "test"
# dataset, we will highlight it in red color.
test_data = data
plt.figure(figsize=(20, 20))
predicts = model.predict_top_k(test_data)
for i, (image, label) in enumerate(test_data.gen_dataset().unbatch().take(100)):
ax = plt.subplot(10, 10, i+1)
plt.xticks([])
plt.yticks([])
plt.grid(False)
plt.imshow(image.numpy(), cmap=plt.cm.gray)
predict_label = predicts[i][0][0]
color = get_label_color(predict_label,
test_data.index_to_label[label.numpy()])
ax.xaxis.label.set_color(color)
plt.xlabel('Predicted: %s' % predict_label)
plt.show()
From the syntax above it seems the model isn't just a file but is a type/class/method depending on what name is most suitable for python.
Feels like this should only take one line of code but I haven't been able to find it anywhere.
Managed to do a simple version of it. The images coming up as a stream doesn't work for me using cv2 with Windows as it does for the pi. So instead I created a webpage in the same directory as this script. This generates an image with the bounding box, using a specified tflite model. This is in no way ideal.
It uses a webcam to get the image and saves the image to the directory the script is run in. It then renames the file so it can be viewed by the webpage I setup to view it.
The majority of this code comes from the TFLite Object Detection Raspberry Pi sample.
import time, os
from PIL import Image
from tflite_support import metadata
import platform
from typing import List, NamedTuple
import json
import cv2 as cv2
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
Interpreter = tf.lite.Interpreter
load_delegate = tf.lite.experimental.load_delegate
class ObjectDetectorOptions(NamedTuple):
"""A config to initialize an object detector."""
enable_edgetpu: bool = False
"""Enable the model to run on EdgeTPU."""
label_allow_list: List[str] = None
"""The optional allow list of labels."""
label_deny_list: List[str] = None
"""The optional deny list of labels."""
max_results: int = -1
"""The maximum number of top-scored detection results to return."""
num_threads: int = 1
"""The number of CPU threads to be used."""
score_threshold: float = 0.0
"""The score threshold of detection results to return."""
class Rect(NamedTuple):
"""A rectangle in 2D space."""
left: float
top: float
right: float
bottom: float
class Category(NamedTuple):
"""A result of a classification task."""
label: str
score: float
index: int
class Detection(NamedTuple):
"""A detected object as the result of an ObjectDetector."""
bounding_box: Rect
categories: List[Category]
def edgetpu_lib_name():
"""Returns the library name of EdgeTPU in the current platform."""
return {
'Darwin': 'libedgetpu.1.dylib',
'Linux': 'libedgetpu.so.1',
'Windows': 'edgetpu.dll',
}.get(platform.system(), None)
class ObjectDetector:
"""A wrapper class for a TFLite object detection model."""
_OUTPUT_LOCATION_NAME = 'location'
_OUTPUT_CATEGORY_NAME = 'category'
_OUTPUT_SCORE_NAME = 'score'
_OUTPUT_NUMBER_NAME = 'number of detections'
def __init__(
self,
model_path: str,
options: ObjectDetectorOptions = ObjectDetectorOptions()
) -> None:
"""Initialize a TFLite object detection model.
Args:
model_path: Path to the TFLite model.
options: The config to initialize an object detector. (Optional)
Raises:
ValueError: If the TFLite model is invalid.
OSError: If the current OS isn't supported by EdgeTPU.
"""
# Load metadata from model.
displayer = metadata.MetadataDisplayer.with_model_file(model_path)
# Save model metadata for preprocessing later.
model_metadata = json.loads(displayer.get_metadata_json())
process_units = model_metadata['subgraph_metadata'][0]['input_tensor_metadata'][0]['process_units']
mean = 0.0
std = 1.0
for option in process_units:
if option['options_type'] == 'NormalizationOptions':
mean = option['options']['mean'][0]
std = option['options']['std'][0]
self._mean = mean
self._std = std
# Load label list from metadata.
file_name = displayer.get_packed_associated_file_list()[0]
label_map_file = displayer.get_associated_file_buffer(file_name).decode()
label_list = list(filter(lambda x: len(x) > 0, label_map_file.splitlines()))
self._label_list = label_list
# Initialize TFLite model.
if options.enable_edgetpu:
if edgetpu_lib_name() is None:
raise OSError("The current OS isn't supported by Coral EdgeTPU.")
interpreter = Interpreter(
model_path=model_path,
experimental_delegates=[load_delegate(edgetpu_lib_name())],
num_threads=options.num_threads)
else:
interpreter = Interpreter(
model_path=model_path, num_threads=options.num_threads)
interpreter.allocate_tensors()
input_detail = interpreter.get_input_details()[0]
# From TensorFlow 2.6, the order of the outputs become undefined.
# Therefore we need to sort the tensor indices of TFLite outputs and to know
# exactly the meaning of each output tensor. For example, if
# output indices are [601, 599, 598, 600], tensor names and indices aligned
# are:
# - location: 598
# - category: 599
# - score: 600
# - detection_count: 601
# because of the op's ports of TFLITE_DETECTION_POST_PROCESS
# (https://github.com/tensorflow/tensorflow/blob/a4fe268ea084e7d323133ed7b986e0ae259a2bc7/tensorflow/lite/kernels/detection_postprocess.cc#L47-L50).
sorted_output_indices = sorted(
[output['index'] for output in interpreter.get_output_details()])
self._output_indices = {
self._OUTPUT_LOCATION_NAME: sorted_output_indices[0],
self._OUTPUT_CATEGORY_NAME: sorted_output_indices[1],
self._OUTPUT_SCORE_NAME: sorted_output_indices[2],
self._OUTPUT_NUMBER_NAME: sorted_output_indices[3],
}
self._input_size = input_detail['shape'][2], input_detail['shape'][1]
self._is_quantized_input = input_detail['dtype'] == np.uint8
self._interpreter = interpreter
self._options = options
def detect(self, input_image: np.ndarray) -> List[Detection]:
"""Run detection on an input image.
Args:
input_image: A [height, width, 3] RGB image. Note that height and width
can be anything since the image will be immediately resized according
to the needs of the model within this function.
Returns:
A Person instance.
"""
image_height, image_width, _ = input_image.shape
input_tensor = self._preprocess(input_image)
self._set_input_tensor(input_tensor)
self._interpreter.invoke()
# Get all output details
boxes = self._get_output_tensor(self._OUTPUT_LOCATION_NAME)
classes = self._get_output_tensor(self._OUTPUT_CATEGORY_NAME)
scores = self._get_output_tensor(self._OUTPUT_SCORE_NAME)
count = int(self._get_output_tensor(self._OUTPUT_NUMBER_NAME))
return self._postprocess(boxes, classes, scores, count, image_width,
image_height)
def _preprocess(self, input_image: np.ndarray) -> np.ndarray:
"""Preprocess the input image as required by the TFLite model."""
# Resize the input
input_tensor = cv2.resize(input_image, self._input_size)
# Normalize the input if it's a float model (aka. not quantized)
if not self._is_quantized_input:
input_tensor = (np.float32(input_tensor) - self._mean) / self._std
# Add batch dimension
input_tensor = np.expand_dims(input_tensor, axis=0)
return input_tensor
def _set_input_tensor(self, image):
"""Sets the input tensor."""
tensor_index = self._interpreter.get_input_details()[0]['index']
input_tensor = self._interpreter.tensor(tensor_index)()[0]
input_tensor[:, :] = image
def _get_output_tensor(self, name):
"""Returns the output tensor at the given index."""
output_index = self._output_indices[name]
tensor = np.squeeze(self._interpreter.get_tensor(output_index))
return tensor
def _postprocess(self, boxes: np.ndarray, classes: np.ndarray,
scores: np.ndarray, count: int, image_width: int,
image_height: int) -> List[Detection]:
"""Post-process the output of TFLite model into a list of Detection objects.
Args:
boxes: Bounding boxes of detected objects from the TFLite model.
classes: Class index of the detected objects from the TFLite model.
scores: Confidence scores of the detected objects from the TFLite model.
count: Number of detected objects from the TFLite model.
image_width: Width of the input image.
image_height: Height of the input image.
Returns:
A list of Detection objects detected by the TFLite model.
"""
results = []
# Parse the model output into a list of Detection entities.
for i in range(count):
if scores[i] >= self._options.score_threshold:
y_min, x_min, y_max, x_max = boxes[i]
bounding_box = Rect(
top=int(y_min * image_height),
left=int(x_min * image_width),
bottom=int(y_max * image_height),
right=int(x_max * image_width))
class_id = int(classes[i])
category = Category(
score=scores[i],
label=self._label_list[class_id], # 0 is reserved for background
index=class_id)
result = Detection(bounding_box=bounding_box, categories=[category])
results.append(result)
# Sort detection results by score ascending
sorted_results = sorted(
results,
key=lambda detection: detection.categories[0].score,
reverse=True)
# Filter out detections in deny list
filtered_results = sorted_results
if self._options.label_deny_list is not None:
filtered_results = list(
filter(
lambda detection: detection.categories[0].label not in self.
_options.label_deny_list, filtered_results))
# Keep only detections in allow list
if self._options.label_allow_list is not None:
filtered_results = list(
filter(
lambda detection: detection.categories[0].label in self._options.
label_allow_list, filtered_results))
# Only return maximum of max_results detection.
if self._options.max_results > 0:
result_count = min(len(filtered_results), self._options.max_results)
filtered_results = filtered_results[:result_count]
return filtered_results
_MARGIN = 10 # pixels
_ROW_SIZE = 10 # pixels
_FONT_SIZE = 1
_FONT_THICKNESS = 1
_TEXT_COLOR = (0, 0, 255) # red
def visualize(
image: np.ndarray,
detections: List[Detection],
) -> np.ndarray:
"""Draws bounding boxes on the input image and return it.
Args:
image: The input RGB image.
detections: The list of all "Detection" entities to be visualize.
Returns:
Image with bounding boxes.
"""
for detection in detections:
# Draw bounding_box
start_point = detection.bounding_box.left, detection.bounding_box.top
end_point = detection.bounding_box.right, detection.bounding_box.bottom
cv2.rectangle(image, start_point, end_point, _TEXT_COLOR, 3)
# Draw label and score
category = detection.categories[0]
class_name = category.label
probability = round(category.score, 2)
result_text = class_name + ' (' + str(probability) + ')'
text_location = (_MARGIN + detection.bounding_box.left,
_MARGIN + _ROW_SIZE + detection.bounding_box.top)
cv2.putText(image, result_text, text_location, cv2.FONT_HERSHEY_PLAIN,
_FONT_SIZE, _TEXT_COLOR, _FONT_THICKNESS)
return image
# ---------------------------------- #
# This is where the custom code starts
# ---------------------------------- #
# Load the TFLite model
TFLITE_MODEL_PATH='object.tflite'
DETECTION_THRESHOLD = 0.5 # 50% threshold required before identifying
options = ObjectDetectorOptions(
num_threads=4,
score_threshold=DETECTION_THRESHOLD,
)
# Close camera if already open
try:
cap.release()
except:
print("",end="") # do nothing
detector = ObjectDetector(model_path=TFLITE_MODEL_PATH, options=options)
cap = cv2.VideoCapture(0) #webcam
counter = 0 # Store many times model has run
while cap.isOpened():
success, image = cap.read()
if not success:
sys.exit(
'ERROR: Unable to read from webcam. Please verify your webcam settings.'
)
image = cv2.flip(image, 1)
# Convert the image from BGR to RGB as required by the TFLite model.
rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#image.thumbnail((512, 512), Image.ANTIALIAS)
image_np = np.asarray(image)
# Run object detection estimation using the model.
detections = detector.detect(image_np)
# Draw keypoints and edges on input image
image_np = visualize(image_np, detections)
if counter == 10: # <- Change this to decide how many iterations
cap.release()
break
image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
plt.imsave('tmp.jpg',image_np) # Saves the image
os.replace("tmp.jpg", "web.jpg",) # Renames it for the webpage
counter += 1
print(counter)
cap.release()
Here's the HTML for the document placed in the same directory as the python file, I saved it as index.html and opened in the browser while running the python script above.
<!DOCTYPE html>
<html>
<head>
<title>Object Detection</title>
</head>
<body>
<h1>Object Detection</h1>
<p>This displays images saved during detection process</p>
<canvas id="x" width="700px" height="500px"></canvas>
<script>
var newImage = new Image();
newImage.src = "web.jpg";
var canvas = document.getElementById("x");
var context = canvas.getContext("2d");
newImage.onload = function() {
context.drawImage(newImage, 0, 0);
console.log("trigger")
setTimeout(timedRefresh, 1000);
};
function timedRefresh() {
// just change src attribute, will always trigger the onload callback
try {
newImage.src = ("web.jpg#" + new Date().getTime());
}catch(e){
console.log(e);
}
}
setTimeout(timedRefresh, 100);
</script>
</body>
</html>
It's incredibly slow, not ideal in many ways and probably breaks many good coding conventions. It was only used locally, would definitely not use this for a production environment nor recommend its use. Just needed a quick proof of concept and this worked for that.
I tried to check the length of my training data to train the model but I got this error. I am implementing this in PyTorch. I have 3 main functions. dataset, extract beat and extract signal. can someone help to fix this issue, please?
This is my dataset class
class MyDataset(Dataset):
def __init__(self, patient_ids,bih2aami=True):#This method runs once when we call this class, and we pass the data or its references here with the label data.
self.patient_ids = patient_ids # list of patients ID
self.directory="C:\\Users\\User\\Downloads\\list\mit-bih-arrhythmia-database-1.0.0\\" # path
self.nb_qrs = 99 #number of beats extracted for each patient, found that each recording had at least 99 normal beats
self.idx_tuples = flatten([[(patient_idx, rpeak_idx) for rpeak_idx in range(self.nb_qrs)]
for patient_idx in range(len(patient_ids))])
self.bih2aami=bih2aami
#if bih2aami==True:
# self.y = self.bih2aami(self.y)
def __len__(self):#returns the size of the data set.
return len(self.idx_tuples) # length of the dataset
def __getitem__(self, idx): # get one sample from the dataset
patient_idx, rpeak_idx = self.idx_tuples[idx]
patient_id = self.patient_ids[patient_idx]
file = self.directory + patient_id
signal, normal_qrs_pos = get_signal(file)
qrs_pos = normal_qrs_pos[rpeak_idx]
beat, label = extract_beat(signal, qrs_pos)
#sample = {'signal': torch.tensor(beat).float(),
# 'label': torch.tensor(label).float()}
print(patient_id, patient_idx, beat.shape,label.shape) # bug : what if label null ??
X, y = torch.tensor(beat).float(), torch.tensor(label).float()
return X,y
Get signal function
def get_signal(file):
record = wfdb.rdrecord(file, channels=[0])
df = pd.DataFrame(record.p_signal, columns=record.sig_name)
lead = df.columns[0]
signal = df[lead] #getting the 1D signal
annotation = wfdb.rdann(file, 'atr') #getting the annotation
relabeled_ann = bih2lamedo(annotation.symbol)
annotations = pd.DataFrame(relabeled_ann,annotation.sample)
normal_qrs_pos = list(annotations[annotations[0]=='N'].index) #normal beats
#normal_qrs_pos = list(annotations[annotations[0]!='O'].index) #beats
#normal_qrs_pos = list(annotations.index) #normal beats
return signal, normal_qrs_pos
Get beat function
def extract_beat(signal, win_pos, qrs_positions, win_msec=40, fs=360, start_beat=36, end_beat=108):
"""
win_pos position at which you place the window of your beat
qrs_positions (list) the qrs indices from the annotations (read them from the atr file)-->obtained from annotation.sample
win_msec in milliseconds
"""
#extract signal
signal = np.array(signal)
#print(signal.shape)
#beat_array = np.zeros(start_beat+end_beat)#number of channels
start = int(max(win_pos-start_beat,0))
stop=start+start_beat+end_beat
#print(beat_array.shape,signal.shape)
beat = signal[start:stop]
#compute the nearest neighbor of win_pos among qrs_positions
tolerance = fs*win_msec//1000 #samples at a distance <tolrance are matched
nbr = NearestNeighbors(n_neighbors=1).fit(qrs_positions)
distances, indices = nbr.kneighbors(np.array([[win_pos]]).reshape(-1,1))
#label
if distances[0][0] <= tolerance:
label = 1
else:
label = 0
print(distances[0],tolerance,label)
return beat, label
I'm trying to add a high FPS screen recorder to my application.
I use Python 3.7 on Windows.
The modules and methods I've tried are mss (python-mss) and d3dshot, but I'm still only achieving 15-19 FPS for a long video (more than 20 seconds).
The resolution I'm recording at is 1920 x 1080.
What is the best way to optimize screen recording? I've tried to use the multiprocessing library, but it seems like it's still not fast enough. I'm not sure I'm using it in the optimal way, what are some ways I could use it to improve processing performance?
Using OBS Studio, I'm able to get 30 FPS, no matter how long the video is. My objective is to achieve the same results with my own code.
Here is what I've written so far:
from multiprocessing import Process, Queue
from time import sleep, time
import cv2
import d3dshot
import numpy as np
def grab(queue):
d = d3dshot.create(capture_output="numpy", frame_buffer_size=500)
d.capture()
sleep(0.1)
c=0
begin = time()
while time() - begin < 30:
starter = time()
frame = d.get_latest_frame()
queue.put(frame)
c+=1
ender = time()
sleep(max(0, 1/60 - (ender -starter)))
# Tell the other worker to stop
queue.put(None)
final=time()
print(c/(final-begin))
d.stop()
def save(queue):
SCREEN_SIZE = 1920, 1080
# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'DIVX') # In Windows: DIVX
out = cv2.VideoWriter(r"output.avi",fourcc, 30.0, (SCREEN_SIZE))
# type: (Queue) -> None
last_img = None
while "there are screenshots":
img = queue.get()
if img is None:
break
if img is last_img:
continue
out.write(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
last_img = img
if __name__ == "__main__":
# The screenshots queue
queue = Queue() # type: Queue
# 2 processes: one for grabing and one for saving PNG files
Process(target=grab, args=(queue,)).start()
Process(target=save, args=(queue,)).start()
The goal is to capture a game, while performing automated keyboard and mouse actions.
I have faced the same problem in trying to get high speed recording for games. This was the fastest solution I was able to find for Windows. The code is using raw buffer objects and leads to around ~27 FPS. I cannot find the original post on which this code is based, but if someone finds it I will add the reference.
Note that the framerate will significantly increase if you make the region smaller than 1920x1080.
"""
Alternative screen capture device, when there is no camera of webcam connected
to the desktop.
"""
import logging
import sys
import time
import cv2
import numpy as np
if sys.platform == 'win32':
import win32gui, win32ui, win32con, win32api
else:
logging.warning(f"Screen capture is not supported on platform: `{sys.platform}`")
from collections import namedtuple
class ScreenCapture:
"""
Captures a fixed region of the total screen. If no region is given
it will take the full screen size.
region_ltrb: Tuple[int, int, int, int]
Specific region that has to be taken from the screen using
the top left `x` and `y`, bottom right `x` and `y` (ltrb coordinates).
"""
__region = namedtuple('region', ('x', 'y', 'width', 'height'))
def __init__(self, region_ltrb=None):
self.region = region_ltrb
self.hwin = win32gui.GetDesktopWindow()
# Time management
self._time_start = time.time()
self._time_taken = 0
self._time_average = 0.04
def __getitem__(self, item):
return self.screenshot()
def __next__(self):
return self.screenshot()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
if exc_type and isinstance(exc_val, StopIteration):
return True
return False
#staticmethod
def screen_dimensions():
""" Retrieve total screen dimensions. """
left = win32api.GetSystemMetrics(win32con.SM_XVIRTUALSCREEN)
top = win32api.GetSystemMetrics(win32con.SM_YVIRTUALSCREEN)
height = win32api.GetSystemMetrics(win32con.SM_CYVIRTUALSCREEN)
width = win32api.GetSystemMetrics(win32con.SM_CXVIRTUALSCREEN)
return left, top, height, width
#property
def fps(self):
return int(1 / self._time_average) * (self._time_average > 0)
#property
def region(self):
return self._region
#property
def size(self):
return self._region.width, self._region.height
#region.setter
def region(self, value):
if value is None:
self._region = self.__region(*self.screen_dimensions())
else:
assert len(value) == 4, f"Region requires 4 input, x, y of left top, and x, y of right bottom."
left, top, x2, y2 = value
width = x2 - left + 1
height = y2 - top + 1
self._region = self.__region(*list(map(int, (left, top, width, height))))
def screenshot(self, color=None):
"""
Takes a part of the screen, defined by the region.
:param color: cv2.COLOR_....2...
Converts the created BGRA image to the requested image output.
:return: np.ndarray
An image of the region in BGRA values.
"""
left, top, width, height = self._region
hwindc = win32gui.GetWindowDC(self.hwin)
srcdc = win32ui.CreateDCFromHandle(hwindc)
memdc = srcdc.CreateCompatibleDC()
bmp = win32ui.CreateBitmap()
bmp.CreateCompatibleBitmap(srcdc, width, height)
memdc.SelectObject(bmp)
memdc.BitBlt((0, 0), (width, height), srcdc, (left, top), win32con.SRCCOPY)
signed_ints_array = bmp.GetBitmapBits(True)
img = np.frombuffer(signed_ints_array, dtype='uint8')
img.shape = (height, width, 4)
srcdc.DeleteDC()
memdc.DeleteDC()
win32gui.ReleaseDC(self.hwin, hwindc)
win32gui.DeleteObject(bmp.GetHandle())
# This makes sure that the FPS are taken in comparison to screenshots rates and vary only slightly.
self._time_taken, self._time_start = time.time() - self._time_start, time.time()
self._time_average = self._time_average * 0.95 + self._time_taken * 0.05
if color is not None:
return cv2.cvtColor(img, color)
return img
def show(self, screenshot=None):
""" Displays an image to the screen. """
image = screenshot if screenshot is not None else self.screenshot()
cv2.imshow('Screenshot', image)
if cv2.waitKey(1) & 0xff == ord('q'):
raise StopIteration
return image
def close(self):
""" Needs to be called before exiting when `show` is used, otherwise an error will occur. """
cv2.destroyWindow('Screenshot')
def scale(self, src: np.ndarray, size: tuple):
return cv2.resize(src, size, interpolation=cv2.INTER_LINEAR_EXACT)
def save(self, path, screenshot=None):
""" Store the current screenshot in the provided path. Full path, with img name is required.) """
image = screenshot if screenshot is not None else self.screenshot
cv2.imwrite(filename=path, img=image)
if __name__ == '__main__':
# Example usage when displaying.
with ScreenCapture((0, 0, 1920, 1080)) as capture:
for _ in range(100):
capture.show()
print(f"\rCapture framerate: {capture.fps}", end='')
# Example usage as generator.
start_time = time.perf_counter()
for frame, screenshot in enumerate(ScreenCapture((0, 0, 1920, 1080)), start=1):
print(f"\rFPS: {frame / (time.perf_counter() - start_time):3.0f}", end='')
Edit
I noticed some small mistake in the window show function, and the self.screenshot calls in the __getitem__ and __next__ method. These have been resolved.
Next to the for example using the ScreenCapture as a context manager, I added an example of using it as a generator.
I am new to Pytorch and so far it has been incredible. I am using it to count the number of pills in an image. I have found that in the majority of my images the max number of objects that it detects is 100. For the picture below it reaches a max count of 100 with the confidence around .6. After that it doesn't increase anymore even down to .1 confidence. I haven't been able to find anything in the docs or any other places online. I am using the fasterrcnn_resnet50_fpn model. Below is the code that load the trained model and evaluate the image. Any tips or even different packages that would be able to count all objects would be super helpful.
## Loading the trained module
loaded_model = get_model(num_classes = 2)
loaded_model.load_state_dict(torch.load('Pillcount/model'))
os.chdir('../pytorchobjdet/vision')
class CountDataset(torch.utils.data.Dataset):
def __init__(self, root, data_file, transforms=None):
self.root = root
self.transforms = transforms
self.imgs = sorted(os.listdir(os.path.join(root, "count")))
self.path_to_data_file = data_file
def __getitem__(self, idx):
# load images and bounding boxes
img_path = os.path.join(self.root, "count", self.imgs[idx])
img = Image.open(img_path).convert("RGB")
box_list = parse_one_annot(self.path_to_data_file,
self.imgs[idx])
boxes = torch.as_tensor(box_list, dtype=torch.float32)
num_objs = len(box_list)
# there is only one class
labels = torch.ones((num_objs,), dtype=torch.int64)
image_id = torch.tensor([idx])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:,
0])
# suppose all instances are not crowd
iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
if self.transforms is not None:
img, target = self.transforms(img, target)
return img, target
def __len__(self):
return len(self.imgs)
dataset_count = CountDataset(root='../../Pill_Object_Detection',
data_file = "../../Pill_Object_Detection/count_labels.csv",
transforms = get_transform(train=False))
idx = 1
img, _ = dataset_count[idx]
#put the model in evaluation mode
loaded_model.eval()
with torch.no_grad():
prediction = loaded_model([img])
image = Image.fromarray(img.mul(255).permute(1, 2,0).byte().numpy())
draw = ImageDraw.Draw(image)
# draw groundtruth
count = 0
for element in range(len(prediction[0]["boxes"])):
boxes = prediction[0]["boxes"][element].cpu().numpy()
score = np.round(prediction[0]["scores"][element].cpu().numpy(),
decimals= 4)
if score > 0.6:
draw.rectangle([(boxes[0], boxes[1]), (boxes[2], boxes[3])],
outline ="red", width =3)
draw.text((boxes[0], boxes[1]), text = str(score))
count +=1
print(f'count = {count}')
image
The advice from the comment above was very helpful. I used the YOLO5vs model and it did an incredible job. This tutorial had a super easy set up that had you upload the annotated images into roboflow, and then it had some google colab tutorials set up for almost all of the current object detectors out there. Here is the result. I just need to give better quality training data but it did extremely well for the few pictures that I gave it. It can count well over 150 objects in the same image no problem.