I converted the PyTorch Real-ESRGAN model to a model.engine file using the C++ code <here>. After conversion, inference works well in C++. But when I try to run inference on an image with this model.engine in Python, it gives me a black image; the image is shown below.
GPU : RTX 3090
OS : Ubuntu20.04
Cuda version : 11.4
TensorRT version : 22
import os
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
class HostDeviceMem(object):
    """Pair a host-side buffer with its device-side counterpart."""

    def __init__(self, host_mem, device_mem):
        # Both halves are stored as given; nothing is copied or validated.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        return "Host:\n{}\nDevice:\n{}".format(str(self.host), str(self.device))

    def __repr__(self):
        # The repr is the same text as str().
        return str(self)
def load_engine(trt_runtime, engine_path):
    """Read the file at ``engine_path`` and deserialize it with ``trt_runtime``.

    The file is read in binary mode and closed before its bytes are handed to
    ``trt_runtime.deserialize_cuda_engine``; whatever that call returns is
    returned unchanged.
    """
    with open(engine_path, "rb") as engine_file:
        serialized = engine_file.read()
    return trt_runtime.deserialize_cuda_engine(serialized)
def allocate_buffers(engine, batch_size=1):
    """Allocate one host/device buffer pair per engine binding.

    For every binding yielded by iterating ``engine`` a host buffer is created
    with ``cuda.pagelocked_empty`` and a device buffer of the same byte size
    with ``cuda.mem_alloc``. The element count is the volume of the binding
    shape multiplied by ``batch_size``.

    Returns a tuple ``(inputs, outputs, bindings, stream)`` where ``inputs``
    and ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the
    device buffers converted with ``int()`` in binding order, and ``stream``
    is a new ``cuda.Stream``.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        n_elems = trt.volume(engine.get_binding_shape(binding)) * batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_buf = cuda.pagelocked_empty(n_elems, np_dtype)
        dev_buf = cuda.mem_alloc(host_buf.nbytes)
        # The bindings list carries the device buffer as an integer.
        bindings.append(int(dev_buf))
        # Route the pair to the input or the output list.
        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_buf, dev_buf))
    return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host output buffers.

    Args:
        context: Execution context; its ``execute_async`` method is called.
        bindings: Device buffer addresses passed through to ``execute_async``.
        inputs: Objects with ``host`` and ``device`` attributes, copied host -> device.
        outputs: Objects with ``host`` and ``device`` attributes, copied device -> host.
        stream: CUDA stream used for the copies and the execution.
        batch_size: Forwarded to ``execute_async`` (previously ignored and
            always sent as 1).

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(
        batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
    )
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait for the queued copies and the execution to finish before reading results.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
def preprocess_image(input_image_path):
    """Load an image from disk with ``cv2.imread`` and return it unchanged.

    Args:
        input_image_path: Path of the image file to read.

    Returns:
        The array returned by ``cv2.imread``.

    Raises:
        OSError: If ``cv2.imread`` returned ``None`` (it does not raise on a
            missing or unreadable file), so the failure surfaces here instead
            of as an AttributeError in the caller.
    """
    image_raw = cv2.imread(input_image_path)
    if image_raw is None:
        raise OSError("Could not read image: {!r}".format(input_image_path))
    return image_raw
def process_image(arr, w, h):
    """Resize ``arr`` to ``(w, h)`` and return it as a flat, scaled CHW array.

    The input is cast to uint8, resized with bilinear resampling through PIL,
    transposed with axes ``(2, 0, 1)``, multiplied by ``1/255`` and flattened.
    The shape before flattening is printed.
    """
    resized = Image.fromarray(np.uint8(arr)).resize(
        size=(w, h), resample=Image.BILINEAR
    )
    # HWC -> CHW
    chw = np.array(resized).transpose((2, 0, 1))
    # Scale by 1/255 (the original comment says the model expects [0.0, 1.0]).
    scaled = (1.0 / 255.0) * chw
    print(scaled.shape)
    return scaled.ravel()
def predict(image):
    """Run the engine on one image file and return the raw host outputs.

    Uses the module-level ``inputs``, ``outputs``, ``bindings``, ``stream``
    and ``context`` objects.

    Args:
        image: Path of the image file, passed to ``preprocess_image``.

    Returns:
        The list returned by ``do_inference``.
    """
    img = preprocess_image(image)
    print(img.shape)
    # NOTE(review): the image is copied exactly as loaded; `process_image`
    # (resize, HWC -> CHW, 1/255 scaling) is never applied here. Confirm which
    # layout, dtype and value range the engine input expects.
    np.copyto(inputs[0].host, img.ravel())
    # Fetch output from the model
    output = do_inference(
        context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream
    )
    print(output)
    # And return results
    return output
# -------------- MODEL PARAMETERS --------------------------------
# NOTE(review): the original banner named DETECTNET_V2, but the engine loaded below
# is 'real-esrgan_f32.engine'. These two values are not referenced by the code shown here.
model_h = 1536
model_w = 1536
import ctypes
# Load the shared library at PLUGIN_LIBRARY before the engine is deserialized.
PLUGIN_LIBRARY = "build/libmyplugins.so"
ctypes.CDLL(PLUGIN_LIBRARY)
# TensorRT logger singleton
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_engine_path = 'build/real-esrgan_f32.engine'
trt_runtime = trt.Runtime(TRT_LOGGER)
trt_engine = load_engine(trt_runtime, trt_engine_path)
# This allocates memory for network inputs/outputs on both CPU and GPU
inputs, outputs, bindings, stream = allocate_buffers(trt_engine)
# Execution context is needed for inference
context = trt_engine.create_execution_context()
# Take the first output buffer returned by predict().
output = predict('image.jpg')[0]
# NOTE(review): assumes the flat output holds exactly 3072*3072*3 values in HWC order
# and in a value range cv2.imwrite can store directly -- TODO confirm against the
# engine's output binding (shape, dtype, layout, scaling).
output = output.reshape(3072,3072,3)
print(output.shape)
print(output.dtype)
cv2.imwrite('output.jpg', output)
Related
Unsupported: ONNX export of convolution for kernel of unknown shape. [Caused by the value 'x.47 defined in (%x.47 : Float(*, *, *, *, strides=[12168000, 67600, 260, 1], requires_grad=0, device=cpu) = onnx::Slice(%874, %875, %876, %877, %878), scope: torch_utils.persistence.persistent_class..Decorator::/torch_utils.persistence.persistent_class..Decorator::synthesis/torch_utils.persistence.persistent_class..Decorator::first_stage/torch_utils.persistence.persistent_class..Decorator::enc_conv.1/torch_utils.persistence.persistent_class..Decorator::conv # /Users/QSoft019/Documents/ai-image-research/MAT/torch_utils/ops/upfirdn2d.py:190:0
)' (type 'Tensor') in the TorchScript graph. The containing node has kind 'onnx::Slice'.]
github: https://github.com/fenglinglwb/mat
There is no error when running generate_image.py with the pretrained file, but when converting to ONNX there are many warnings.
Finally, it stopped at the line
assert isinstance(groups, int) and (groups >= 1)
in file MAT/torch_utils/ops/conv2d_resample.py
I commented out that line, but it still stopped in the file venv/lib/python3.8/site-packages/torch/onnx/symbolic_opset9.py because the weight_size (kernel_shape) variable was full of None values.
I found that many integer variables became tensors when converting to ONNX.
This caused the warnings; the groups variable became a tensor, too.
Am I making a mistake somewhere?
My function:
def convert_torch_to_onnx_(onnx_path, image_path, model=None, torch_path=None):
    """
    Converts a PyTorch model to ONNX.

    :param onnx_path: ONNX model path to save
    :param image_path: Path to test image to use in export progress
    :param model: Already-loaded PyTorch model; used only when torch_path is None
    :param torch_path: Torch model path to load; takes precedence over model
    :raises ValueError: if neither model nor torch_path is given
    """
    # Fail early with a clear message instead of an AttributeError on None
    # when the model attributes are read below.
    if torch_path is None and model is None:
        raise ValueError("Either 'model' or 'torch_path' must be provided")
    from datasets.mask_generator_512 import RandomMask
    if torch_path is not None:
        pytorch_model = get_torch_model(torch_path)
    else:
        pytorch_model = model
    device = torch.device('cpu')
    image = read_image(image_path)
    # Scale pixel values with /127.5 - 1 and add a leading batch axis.
    torch_image = (torch.from_numpy(image).float().to(device) / 127.5 - 1).unsqueeze(0)
    label = torch.zeros([1, pytorch_model.c_dim], device=device)
    resolution = 512
    mask = RandomMask(resolution)  # adjust the masking ratio by using 'hole_range'
    mask = torch.from_numpy(mask).float().to(device).unsqueeze(0)
    z = torch.from_numpy(np.random.randn(1, pytorch_model.z_dim)).to(device)
    truncation_psi = 1
    noise_mode = 'const'
    # NOTE(review): truncation_psi (int) and noise_mode (str) are passed as export
    # inputs alongside the tensors; confirm torch.onnx.export accepts them for this model.
    torch.onnx.export(
        pytorch_model,
        (torch_image, mask, z, label, truncation_psi, noise_mode),
        onnx_path,
        verbose=True,
        export_params=True,
        # do_constant_folding=False,
        # input_names=['input'],
        opset_version=11,
        # output_names=['output']
    )
and the generate_images function provided by the author (the default values of the input variables were edited):
def generate_images(
    # network_pkl: str = 'pretrained/CelebA-HQ_512.pkl',
    network_pkl: str = '/Downloads/MAT/models/Places_512_FullData.pkl',
    dpath: str = 'test_sets/CelebA-HQ/images',
    # mpath=None,
    mpath: str = 'test_sets/CelebA-HQ/masks',
    resolution: int = 512,
    truncation_psi: float = 1,
    noise_mode: str = 'const',
    outdir: str = 'samples',
    model: bool = False,
):
    """
    Generate images using pretrained network pickle.

    Loads the 'G_ema' network from ``network_pkl``, copies its parameters into a
    freshly built ``Generator`` and runs it on every .png/.jpg found in ``dpath``.
    When ``mpath`` is not None one mask file per image is read from it; otherwise
    a mask is produced by ``RandomMask``. When ``model`` is truthy the rebuilt
    generator is returned right after the weights are copied and no image is
    processed.
    """
    # Fixed seed applied to random, NumPy and torch.
    seed = 240  # pick up a random number
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    print(f'Loading data from: {dpath}')
    img_list = sorted(glob.glob(dpath + '/*.png') + glob.glob(dpath + '/*.jpg'))
    if mpath is not None:
        print(f'Loading mask from: {mpath}')
        mask_list = sorted(glob.glob(mpath + '/*.png') + glob.glob(mpath + '/*.jpg'))
        # Images and masks are paired by position in the two sorted lists.
        assert len(img_list) == len(mask_list), 'illegal mapping'
    print(f'Loading networks from: {network_pkl}')
    device = torch.device('cpu')
    # device = torch.device('cuda')
    with dnnlib.util.open_url(network_pkl) as f:
        G_saved = legacy.load_network_pkl(f)['G_ema'].to(device).eval().requires_grad_(False)  # type: ignore
    # The network resolution is capped at 512.
    net_res = 512 if resolution > 512 else resolution
    G = Generator(z_dim=512, c_dim=0, w_dim=512, img_resolution=net_res, img_channels=3).to(device).eval().requires_grad_(False)
    copy_params_and_buffers(G_saved, G, require_all=True)
    # Early exit: hand back the generator itself instead of generating images.
    if model:
        return G
    os.makedirs(outdir, exist_ok=True)
    # no Labels.
    label = torch.zeros([1, G.c_dim], device=device)
    # Any resolution other than 512 overrides the requested noise mode.
    if resolution != 512:
        noise_mode = 'random'
    with torch.no_grad():
        for i, ipath in enumerate(img_list):
            iname = os.path.basename(ipath).replace('.jpg', '.png')
            print(f'Prcessing: {iname}')
            image = read_image(ipath)
            # Scale pixel values with /127.5 - 1 and add a leading batch axis.
            image = (torch.from_numpy(image).float().to(device) / 127.5 - 1).unsqueeze(0)
            if mpath is not None:
                # Grayscale mask divided by 255, with two leading axes added.
                mask = cv2.imread(mask_list[i], cv2.IMREAD_GRAYSCALE).astype(np.float32) / 255.0
                mask = torch.from_numpy(mask).float().to(device).unsqueeze(0).unsqueeze(0)
            else:
                mask = RandomMask(resolution)  # adjust the masking ratio by using 'hole_range'
                mask = torch.from_numpy(mask).float().to(device).unsqueeze(0)
            z = torch.from_numpy(np.random.randn(1, G.z_dim)).to(device)
            output = G(image, mask, z, label, truncation_psi=truncation_psi, noise_mode=noise_mode)
            # Move channels last, rescale with *127.5 + 127.5, round, clamp to 0..255, cast to uint8.
            output = (output.permute(0, 2, 3, 1) * 127.5 + 127.5).round().clamp(0, 255).to(torch.uint8)
            output = output[0].cpu().numpy()
            # NOTE(review): the visible code ends here; nothing shown writes `output`
            # or uses `iname`/`outdir` afterwards.
I'm trying to use the Edge TPU USB Accelerator with multiprocessing.
The first process puts images into a queue but does not use the Edge TPU. The second process gets an image from the queue and copies it into the input tensor. But it seems that the process stops after invoke() is called on the tf.lite.Interpreter.
def load_model(model_file_path: str):
    """Create a TFLite interpreter for ``model_file_path`` and allocate its tensors.

    The Edge TPU delegate 'libedgetpu.so.1.0' is loaded when possible. If
    loading it raises ValueError, the error is logged and the interpreter is
    built without any delegate. Previously the failure left ``delegate``
    unbound and the next line raised a NameError.

    Args:
        model_file_path: Path of the .tflite model file.

    Returns:
        The interpreter, after ``allocate_tensors()`` has been called.
    """
    try:
        delegates = [tf.lite.experimental.load_delegate('libedgetpu.so.1.0')]
    except ValueError:
        logging.error("EdgeTPU could not be loaded")
        # NOTE(review): a model compiled for the Edge TPU presumably cannot run
        # without the delegate; in that case the calls below will fail -- verify.
        delegates = None
    interpreter = tf.lite.Interpreter(model_file_path, experimental_delegates=delegates)
    interpreter.allocate_tensors()
    return interpreter
def get_detection_results(interpreter):
    """Read boxes, classes, scores and count from the interpreter outputs.

    Two output orderings are handled. When output 3 has exactly one element
    the order is taken as (boxes, classes, scores, count) = outputs (0, 1, 2, 3);
    otherwise outputs (1, 3, 0, 2) are used. Class ids are converted to names
    with ``DataUtil.convert_class_ids_to_names``.

    Returns:
        Tuple ``(boxes, classes, scores, count)`` with ``count`` as int.
    """
    details = interpreter.get_output_details()
    if get_output_tensor(interpreter, details[3]).size == 1:
        box_i, cls_i, score_i, count_i = 0, 1, 2, 3
    else:
        box_i, cls_i, score_i, count_i = 1, 3, 0, 2
    boxes = get_output_tensor(interpreter, details[box_i])
    classes = get_output_tensor(interpreter, details[cls_i])
    scores = get_output_tensor(interpreter, details[score_i])
    count = int(get_output_tensor(interpreter, details[count_i]))
    # some conversions
    classes = DataUtil.convert_class_ids_to_names(classes)
    return boxes, classes, scores, count
def set_input_tensor(interpreter, image):
    """Copy ``image`` into the first element of the interpreter's first input tensor.

    Height and width of ``image`` must equal those of the input tensor;
    otherwise an error is logged and nothing is written.
    """
    input_index = interpreter.get_input_details()[0]['index']
    target = interpreter.tensor(input_index)()[0]
    src_h, src_w, _ = image.shape
    dst_h, dst_w, _ = target.shape
    # Guard clause: refuse images whose spatial size differs from the tensor's.
    if (src_h, src_w) != (dst_h, dst_w):
        logging.error("Image is not the correct size")
        return
    target[:, :, :] = image
This is the problematic class:
class ImgProcessor:
    """Holds the detection work/result queues and the detection interpreter."""

    def __init__(self):
        # Queue of (frame, shape) items waiting for detection.
        self.detect_work_queue = multiprocessing.Queue()
        # Queue of (boxes, classes, scores) results.
        self.detect_result_queue = multiprocessing.Queue()
        # NOTE(review): the interpreter is created in whichever process constructs
        # this object; if `detect` runs in another process, presumably the
        # interpreter has to be created there instead -- verify.
        self.detect_interpreter = TfUtil.load_model(models_path)
This method will be called in a loop for each image of a video:
def process_img(self, frame: np.ndarray):
    """Queue ``frame`` together with its shape for the detection worker."""
    work_item = (frame, frame.shape)
    self.detect_work_queue.put(work_item)
This method is running as the second process:
def detect(self, detect_queue, detect_result_queue, lock):
    # Worker loop: take (frame, shape) items from `detect_queue`, run the
    # interpreter on each frame and put (boxes, classes, scores) on
    # `detect_result_queue`. The loop never exits on its own.
    # NOTE(review): `lock` and `shape` are not used in the visible body.
    # NOTE(review): self.detect_interpreter was created in __init__; if this method
    # runs in a different process, presumably the interpreter must be created inside
    # that process -- verify, this may be why invoke() stops.
    while True:
        frame, shape = detect_queue.get()
        # paste the image to nn
        TfUtil.set_input_tensor(self.detect_interpreter, frame)
        # run detection
        self.detect_interpreter.invoke()  # Error
        boxes, classes, scores, _ = TfUtil.get_detection_results(self.detect_interpreter)
        detect_result_queue.put((boxes, classes, scores))
Does anyone know whether tf.lite with the Edge TPU is perhaps incompatible with multiprocessing, or what I'm doing wrong?
I have just started learning the PyTorch version of YOLO v5 and was able to build a model, so I then tried to implement a Flask application for real-time prediction using this trained model.
Class for loading the model and predicting:
class Model(object):
    """Loads a checkpoint's 'model' entry and runs predictions with it."""

    def __init__(self, model):
        """Load the network stored under the 'model' key of checkpoint ``model``.

        Args:
            model: Path of the checkpoint file passed to ``torch.load``.
        """
        self.device = torch_utils.select_device()
        print(self.device)
        model = torch.load(model, map_location=self.device)['model']
        # Half precision is switched off by the leading ``False``.
        self.half = False and self.device.type != 'cpu'
        print('half = ' + str(self.half))
        # Cast the weights explicitly so they always match the input dtype
        # chosen in predict(). Weights left in float16 with float32 input is
        # what raised "Input type ... and weight type ... should be the same".
        model = model.half() if self.half else model.float()
        # Use the selected device instead of an unconditional .cuda(), and
        # switch to inference mode.
        self.loaded_model = model.to(self.device).eval()

    def predict(self, img):
        """Run the model on ``img`` and return the first element of its output.

        Args:
            img: Image tensor (or array convertible with ``torch.as_tensor``)
                with values in 0..255; a 3-dimensional input gets a leading
                batch axis. The caller's tensor is not modified.

        Returns:
            ``self.loaded_model(img, augment=False)[0]``, also stored in
            ``self.preds``.
        """
        img = torch.as_tensor(img)
        img = img.half() if self.half else img.float()  # uint8 to fp16/32
        # Out-of-place division so a float tensor owned by the caller is left untouched.
        img = img / 255.0  # 0 - 255 to 0.0 - 1.0
        print(img.ndimension())
        if img.ndimension() == 3:
            img = img.unsqueeze(0)
        print(self.loaded_model)
        img = img.to(self.device)
        self.preds = self.loaded_model(img, augment=False)[0]
        # Print the result; the original called self.predict() here without
        # arguments, which always raised a TypeError.
        print(self.preds)
        return self.preds
Camera class for reading frames from camera or video
# Module-level Model instance built from "weights/best.pt"; Camera.get_frame calls model.predict on it.
model = Model("weights/best.pt")
class Camera(object):
    """Reads frames from capture device 0 and runs the module-level model on them."""

    def __init__(self):
        # self.video = cv2.VideoCapture('facial_exp.mkv')
        self.video = cv2.VideoCapture(0)

    def __del__(self):
        self.video.release()

    def get_frame(self):
        """Grab one frame, run ``model.predict`` on it and return the frame as JPEG bytes.

        Returns:
            The JPEG-encoded bytes of the original (unresized) frame.

        Raises:
            RuntimeError: If no frame could be read from the video source.
        """
        # torch is needed only to build the input tensor.
        import torch

        ok, fr = self.video.read()
        if not ok:
            raise RuntimeError("Could not read a frame from the video source")
        # Actually use the resized frame (it used to be computed and then discarded).
        resized = cv2.resize(fr, (640, 640), interpolation=cv2.INTER_AREA)
        # NOTE(review): assumes the model expects RGB channel order; OpenCV
        # frames are BGR -- confirm against how the model was trained.
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        # uint8 NCHW tensor in 0..255; model.predict does the float cast and the
        # /255 scaling itself, so the pixels must not be pre-scaled here.
        image = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
        # Single prediction call. The former extra call with a NumPy NHWC array
        # could not work because predict() needs tensor methods.
        pred = model.predict(image)
        print(pred)
        _, jpeg = cv2.imencode('.jpg', fr)
        return jpeg.tobytes()
Some of the commented-out lines are things I tried, but every time the line below
self.preds = self.loaded_model(img, augment=False)[0] throws the following error:
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.cuda.HalfTensor) should be the same
Any idea or guidance for solving this error would be appreciated. Thank you.
This error means that the input type is float32 while the weight type (of your model) is float16.
For example, this line below was executed:
model.half() # so the weight type is float16
but this line below was not executed:
img = img.half() # so the input type is float32
Please check your code.
For more information about 'half', refer to torch.Tensor.to() and torch.nn.Module.to().
I was facing the same error, basically I was not sending my model to the GPU, so moving the model to the GPU device solved the error:
model = model.to(device)
I fine-tuned two MobileNet models on different datasets, based on the TensorFlow object_detection API example from here. When I use eager mode (tf.executing_eagerly() is True) with only one model, inference runs at 0.036 seconds per image. When I load two models, Keras requires converting to graph mode (tf.executing_eagerly() is False), and inference runs at 1.8 seconds per image. What am I doing wrong?
def inference(pipeline_config, checkpoint_path, num_classes=3):
    """Build a detection model from a pipeline config and restore its checkpoint.

    Args:
        pipeline_config: Path of the pipeline config file.
        checkpoint_path: Checkpoint prefix passed to ``Checkpoint.restore``.
        num_classes: Value written to ``model_config.ssd.num_classes`` before
            the model is built. Defaults to 3, the value that used to be
            hard-coded.

    Returns:
        The built detection model after one dummy forward pass.
    """
    print('Building model and restoring weights', flush=True)
    # Load pipeline config and build a detection model.
    configs = config_util.get_configs_from_pipeline_file(pipeline_config)
    model_config = configs['model']
    model_config.ssd.num_classes = num_classes
    detection_model = model_builder.build(
        model_config=model_config, is_training=False)
    ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
    # expect_partial(): the checkpoint may hold values the model does not use.
    ckpt.restore(checkpoint_path).expect_partial()
    # Run model through a dummy image so that variables are created
    image, shapes = detection_model.preprocess(tf.zeros([1, 320, 320, 3]))
    prediction_dict = detection_model.predict(image, shapes)
    _ = detection_model.postprocess(prediction_dict, shapes)
    print('Weights restored!')
    return detection_model
def get_model_detection_function(detection_model):
    """Return a callable that runs preprocess, predict and postprocess on ``detection_model``."""

    # The tf.function decorator is left disabled here, so the returned
    # callable is a plain Python function.
    def detect(input_tensor):
        """Run detection on an input image.

        Args:
          input_tensor: Per the original documentation, a [1, height, width, 3]
            tf.float32 Tensor; it is handed to the model's own ``preprocess``.

        Returns:
          Whatever ``detection_model.postprocess`` returns (documented
          originally as a dict with `detection_boxes`, `detection_classes`
          and `detection_scores`).
        """
        preprocessed, true_shapes = detection_model.preprocess(input_tensor)
        raw_predictions = detection_model.predict(preprocessed, true_shapes)
        detections = detection_model.postprocess(raw_predictions, true_shapes)
        return detections

    return detect
def mainProcess():
    # Graph-mode variant: each model gets its own tf.Graph and tf.compat.v1.Session.
    print('Loading model 1...')
    g1 = tf.Graph()
    s1 = tf.compat.v1.Session(graph=g1)
    with g1.as_default(), s1.as_default():
        detection_model_1 = inference('config_1/pipeline.config', 'Checkpoint_1/ckpt-1')
        detect_fn_1 = get_model_detection_function(detection_model_1)
        # NOTE(review): the initializer runs after inference() called ckpt.restore();
        # confirm the restored weights are what the session ends up with.
        s1.run(tf.compat.v1.global_variables_initializer())
    print('Loading model 2...')
    g2 = tf.Graph()
    s2 = tf.compat.v1.Session(graph=g2)
    # Unlike model 1, only the graph (not the session) is made default here.
    with g2.as_default():
        detection_model_2 = inference('config_2/pipeline.config', 'Checkpoint_2/ckpt-1')
        detect_fn_2 = get_model_detection_function(detection_model_2)
        s2.run(tf.compat.v1.global_variables_initializer())
    for i, f in enumerate(listdir('images_dir/')):
        ...
        ... read the image
        ...
        with g1.as_default():
            with s1.as_default():
                sec = time.time()
                input_tensor = tf.convert_to_tensor(test_img, dtype=tf.float32)
                # NOTE(review): detect_fn_1 is called on every iteration while g1 is the
                # default graph, so it presumably adds new ops to g1 each time -- a
                # likely cause of the slow per-image time; verify (e.g. build the
                # detection ops once on a placeholder and only call s1.run in the loop).
                detections = detect_fn_1(input_tensor)
                detections = s1.run(detections)
                curr = time.time()
                print("Finished iterating in: " + str(curr - sec) + " seconds")
        # the same for detection_model_2
For eager mode with only one model the mainProcess is:
def mainProcess():
    # Eager-mode variant with a single model: no explicit graph or session is created.
    print('Loading model...')
    detection_model_1 = inference('config_1/pipeline.config', 'Checkpoint_1/ckpt-1')
    detect_fn_1 = get_model_detection_function(detection_model_1)
    for i, f in enumerate(listdir('images_dir/')):
        ...
        ... read the image
        ...
        # The timed span covers tensor conversion, detection and both prints.
        sec = time.time()
        input_tensor = tf.convert_to_tensor(test_img, dtype=tf.float32)
        detections = detect_fn_1(input_tensor)
        # .numpy() is called directly on the first entry of each result.
        print(detections['detection_boxes'][0].numpy())
        print(detections['detection_scores'][0].numpy())
        curr = time.time()
        print("Finished iterating in: " + str(curr - sec) + " seconds")
While following the deepdream IPython notebook, which is here: https://github.com/google/deepdream/blob/master/dream.ipynb, I successfully ran the code and initialized the network, until I got this error:
I0218 20:53:01.108750 12174 net.cpp:283] Network initialization done.
I0218 20:53:06.017426 12174 net.cpp:816] Ignoring source layer data
I0218 20:53:06.139768 12174 net.cpp:816] Ignoring source layer loss
Traceback (most recent call last):
File "/home/andrew/PycharmProjects/deepmeme/deepmeme.py", line 122, in <module>
<IPython.core.display.Image object>
frame = deepdream(net, frame)
File "/home/andrew/PycharmProjects/deepmeme/deepmeme.py", line 78, in deepdream
octaves = [preprocess(net, base_img)]
File "/home/andrew/PycharmProjects/deepmeme/deepmeme.py", line 43, in preprocess
return np.float32(np.rollaxis(img, 2)[::-1]) - net.transformer.mean['data']
KeyError: 'data'
This is my code for the python file:
import sys
sys.path.append("/home/andrew/caffe/python")
from cStringIO import StringIO
import numpy as np
import scipy.ndimage as nd
import PIL.Image
from IPython.display import clear_output, Image, display
from google.protobuf import text_format
import caffe
# If your GPU supports CUDA and Caffe was built with CUDA support,
# uncomment the following to run Caffe operations on the GPU.
# caffe.set_mode_gpu()
# caffe.set_device(0) # select GPU device if multiple devices exist
def showarray(a, fmt='jpeg'):
    """Display array ``a`` inline as an image encoded in format ``fmt``.

    Values are clipped to 0..255 and cast to uint8 before encoding.
    """
    pixels = np.uint8(np.clip(a, 0, 255))
    buf = StringIO()
    PIL.Image.fromarray(pixels).save(buf, fmt)
    encoded = buf.getvalue()
    display(Image(data=encoded))
model_path = '/home/andrew/caffe/models/bvlc_reference_caffenet/' # substitute your path here
net_fn = model_path + 'deploy.prototxt'
# Same weights file the Classifier call below used to name literally
# (param_fn previously pointed at a different file name and was never used).
param_fn = model_path + 'caffenet_train_iter_500.caffemodel'
# Patching model to be able to compute gradients.
# Note that you can also manually add "force_backward: true" line to "deploy.prototxt".
model = caffe.io.caffe_pb2.NetParameter()
with open(net_fn) as net_file:
    text_format.Merge(net_file.read(), model)
model.force_backward = True
# The patched definition is written to the working directory, as before.
patched_fn = 'deploy.prototxt'
with open(patched_fn, 'w') as patched_file:
    patched_file.write(str(model))
# Load the *patched* definition. The original loaded the unpatched file from
# model_path, so force_backward never took effect. caffe.TEST is no longer
# passed: the third positional parameter of caffe.Classifier is image_dims.
# A mean is supplied because preprocess/deprocess/make_step read
# net.transformer.mean['data']; its absence caused the KeyError: 'data'.
net = caffe.Classifier(patched_fn, param_fn,
                       mean=np.float32([104.0, 116.0, 122.0]),  # ImageNet mean; replace with your training set's mean
                       channel_swap=(2, 1, 0))  # same setting as the deepdream notebook
# a couple of utility functions for converting to and from Caffe's input image layout
def preprocess(net, img):
    """Move axis 2 of ``img`` to the front, reverse that axis, cast to float32
    and subtract ``net.transformer.mean['data']``."""
    channels_first = np.rollaxis(img, 2)
    flipped = np.float32(channels_first[::-1])
    return flipped - net.transformer.mean['data']
def deprocess(net, img):
    """Add ``net.transformer.mean['data']`` back to ``img``, reverse its first
    axis and stack the resulting planes along a new last axis."""
    restored = img + net.transformer.mean['data']
    return np.dstack(restored[::-1])
def objective_L2(dst):
    # Objective used as the default by make_step: copies the blob's data into
    # its diff in place.
    dst.diff[:] = dst.data
def make_step(net, step_size=1.5, end='inception_4c/output',
              jitter=32, clip=True, objective=objective_L2):
    '''Basic gradient ascent step.

    Shifts the 'data' blob by a random offset, runs forward up to blob `end`,
    lets `objective` set the diff of that blob, runs backward from `end`, and
    adds the gradient (scaled by step_size over its mean absolute value) to
    the input in place. The shift is undone afterwards; with `clip` the input
    is limited to [-mean, 255 - mean].
    '''
    src = net.blobs['data']  # input image is stored in Net's 'data' blob
    dst = net.blobs[end]
    # Random offsets in [-jitter, jitter] for the last two axes.
    ox, oy = np.random.randint(-jitter, jitter+1, 2)
    src.data[0] = np.roll(np.roll(src.data[0], ox, -1), oy, -2)  # apply jitter shift
    net.forward(end=end)
    objective(dst)  # specify the optimization objective
    net.backward(start=end)
    g = src.diff[0]
    # apply normalized ascent step to the input image
    # NOTE(review): divides by the mean absolute gradient; if the gradient is all
    # zero this is a division by zero -- confirm backward is actually computed.
    src.data[:] += step_size/np.abs(g).mean() * g
    src.data[0] = np.roll(np.roll(src.data[0], -ox, -1), -oy, -2)  # unshift image
    if clip:
        # Requires a 'data' entry in net.transformer.mean.
        bias = net.transformer.mean['data']
        src.data[:] = np.clip(src.data, -bias, 255-bias)
def deepdream(net, base_img, iter_n=10, octave_n=4, octave_scale=1.4,
              end='inception_4c/output', clip=True, **step_params):
    # Runs make_step `iter_n` times on each of `octave_n` progressively larger
    # versions of `base_img`, carrying the produced detail from one octave to
    # the next, and returns the deprocessed final image. Extra keyword
    # arguments are forwarded to make_step.
    # prepare base images for all octaves
    octaves = [preprocess(net, base_img)]
    for i in xrange(octave_n-1):
        # Each further octave is the previous one zoomed by 1/octave_scale in the last two axes.
        octaves.append(nd.zoom(octaves[-1], (1, 1.0/octave_scale,1.0/octave_scale), order=1))
    src = net.blobs['data']
    detail = np.zeros_like(octaves[-1])  # allocate image for network-produced details
    # Iterate from the last (smallest) octave back to the first.
    for octave, octave_base in enumerate(octaves[::-1]):
        h, w = octave_base.shape[-2:]
        if octave > 0:
            # upscale details from the previous octave
            h1, w1 = detail.shape[-2:]
            detail = nd.zoom(detail, (1, 1.0*h/h1,1.0*w/w1), order=1)
        src.reshape(1,3,h,w)  # resize the network's input image size
        src.data[0] = octave_base+detail
        for i in xrange(iter_n):
            make_step(net, end=end, clip=clip, **step_params)
            # visualization
            vis = deprocess(net, src.data[0])
            if not clip:  # adjust image contrast if clipping is disabled
                vis = vis*(255.0/np.percentile(vis, 99.98))
            showarray(vis)
            print octave, i, end, vis.shape
            clear_output(wait=True)
        # extract details produced on the current octave
        detail = src.data[0]-octave_base
    # returning the resulting image
    return deprocess(net, src.data[0])
# Load the start image as float32 and show it.
img = np.float32(PIL.Image.open('/home/andrew/caffe/examples/images/cat.jpg'))
showarray(img)
# The result of this call is not used.
net.blobs.keys()
frame = img
frame_i = 0
h, w = frame.shape[:2]
s = 0.05  # scale coefficient
# Feed each result back in as the next input, saving every frame.
# NOTE(review): deepdream is called with its default end='inception_4c/output';
# confirm the loaded network has a blob of that name.
# NOTE(review): the 'frames/' directory is not created by the code shown here.
for i in xrange(100):
    frame = deepdream(net, frame)
    PIL.Image.fromarray(np.uint8(frame)).save("frames/%04d.jpg"%frame_i)
    # Affine transform with scale (1-s) and offsets (h*s/2, w*s/2) before the next pass.
    frame = nd.affine_transform(frame, [1-s,1-s,1], [h*s/2,w*s/2,0], order=1)
    frame_i += 1
Image(filename='frames/0029.jpg')
Does anybody know what's happening? I am using my own data that I successfully trained a model with.
From the deepdream iPython notebook:
net = caffe.Classifier('tmp.prototxt', param_fn,
mean = np.float32([104.0, 116.0, 122.0]), # ImageNet mean, training set dependent
channel_swap = (2,1,0)) # the reference model has channels in BGR order instead of RGB
vs your:
net = caffe.Classifier('/home/andrew/caffe/models/bvlc_reference_caffenet/deploy.prototxt', '/home/andrew/caffe/models/bvlc_reference_caffenet/caffenet_train_iter_500.caffemodel', caffe.TEST)
You do not seem to include a mean when you create a caffe.Classifier.
See the definition of caffe.Classifier.
If you don't have a mean, you could probably just remove the mention of mean from preprocess/deprocess:
def preprocess(net, img):
    """Move axis 2 of ``img`` to the front, reverse that axis and cast to float32.

    ``net`` is accepted for signature compatibility and is not used.
    """
    channels_first = np.rollaxis(img, 2)
    return np.float32(channels_first[::-1])
def deprocess(net, img):
    """Reverse the first axis of ``img`` and stack the planes along a new last axis.

    ``net`` is accepted for signature compatibility and is not used.
    """
    reversed_planes = img[::-1]
    return np.dstack(reversed_planes)