cuMemcpyHtoDAsync failed: invalid argument when using TensorRT (Python)

I am trying to copy an np array to the GPU using TensorRT in Python but I keep getting the error 'cuMemcpyHtoDAsync failed: invalid argument'. The array has the correct format (float32) and size, but the error remains. Does anyone have an idea of what I am doing wrong or how I can fix this error?
import tensorrt as trt
import pycuda.driver as cuda
import numpy as np
import cv2

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    cuda.init()
    device = cuda.Device(0)
    ctx = device.make_context()
    stream = cuda.Stream()
    # stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(host_mem)
        else:
            outputs.append(host_mem)
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp, i, stream) for inp, i in zip(bindings[:len(inputs)], inputs)]
    # Run inference.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out, o, stream) for out, o in zip(outputs, bindings[len(inputs):])]
    # Synchronize the stream
    stream.synchronize()

def detect_objects(image, engine, context, threshold=0.5):
    # Preprocess the image
    image = cv2.resize(image, (640, 640))
    image = np.transpose(image, (2, 0, 1))
    image = np.expand_dims(image, axis=0)
    # Allocate buffers
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    #inputs[0] = np.ascontiguousarray(image)
    inputs[0] = np.ascontiguousarray(image, dtype=np.float32) / 255.0
    print(inputs[0].shape)
    print(inputs[0].dtype)
    # Run inference
    do_inference(context, bindings, inputs, outputs, stream)
    # Postprocess the outputs
    outputs = outputs[0]
    outputs = outputs[outputs[:, 0] > threshold]
    # Get the bounding boxes
    boxes = outputs[:, 1:]
    return boxes

# Load the engine
engine = trt.Runtime(trt.Logger(trt.Logger.WARNING)).deserialize_cuda_engine(open("Modelle/best.engine", "rb").read())
context = engine.create_execution_context()
# Read the image
image = cv2.imread("Test.jpg")
# Detect objects in the image
boxes = detect_objects(image, engine, context)
print(boxes)
Or am I doing something fundamentally wrong when loading the TensorRT engine file? Is there another way to detect objects in an image?
Thanks
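For reference, the buffer-handling pattern from the TensorRT Python samples (the same HostDeviceMem-style pairing that appears further down this page) keeps a page-locked host buffer and a device buffer for every binding and copies the preprocessed image into the existing host buffer rather than replacing it. A minimal sketch of that pattern, not necessarily the fix for the error above:
import pycuda.driver as cuda

# Sketch of the samples-style transfer loop; `inputs` and `outputs` are
# assumed to hold (host_mem, device_mem) pairs, unlike the code above,
# which keeps only the host buffers.
def do_inference_pairs(context, bindings, inputs, outputs, stream):
    for host_mem, device_mem in inputs:
        cuda.memcpy_htod_async(device_mem, host_mem, stream)  # host -> device
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    for host_mem, device_mem in outputs:
        cuda.memcpy_dtoh_async(host_mem, device_mem, stream)  # device -> host
    stream.synchronize()
    return [host_mem for host_mem, _ in outputs]
The preprocessed image is then copied into the existing page-locked input buffer (for example with np.copyto), so its size and dtype keep matching the device allocation.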

Related

Python Harvesters Image Acquisition GigeCam

I am trying to get an image from my GigE camera. In the camera's own software it works just fine, but when I do it with Harvesters my image has a weird grid and I don't know why it is there or how to remove it. I need this for a stereo vision project. Any idea?
Don't mind the brightness; I tried it with a higher exposure as well and it did not change a thing. :D
import genicam.genapi as ge
import cv2
from harvesters.core import Harvester
import matplotlib.pyplot as plt
import numpy as np

# Create a Harvester object:
h = Harvester()
# Load a GenTL Producer; you can load many more if you want to:
h.add_file("C:/Program Files\MATRIX VISION/mvIMPACT Acquire/bin/x64/mvGenTLProducer.cti")
# Enumerate the available devices that GenTL Producers can handle:
h.update()
# Select a target device and create an ImageAcquire object that
# controls the device:
ia = h.create(0)
ia2 = h.create(1)
# Configure the target device; it looks very small but this is just
# for demonstration:
ia.remote_device.node_map.Width.value = 1456
ia.remote_device.node_map.Height.value = 1088
# ia.remote_device.node_map.PixelFormat.symbolics
ia.remote_device.node_map.PixelFormat.value = 'BayerRG8'
ia2.remote_device.node_map.Width.value = 1456
ia2.remote_device.node_map.Height.value = 1088
# ia2.remote_device.node_map.PixelFormat.symbolics
ia2.remote_device.node_map.PixelFormat.value = 'BayerRG8'
ia.remote_device.node_map.ChunkSelector.value = 'ExposureTime'
ia.remote_device.node_map.ExposureTime.set_value(100000.0)
ia2.remote_device.node_map.ChunkSelector.value = 'ExposureTime'
ia2.remote_device.node_map.ExposureTime.set_value(100000.0)
# Allow the ImageAcquire object to start image acquisition:
ia.start()
ia2.start()
# We are going to fetch a buffer filled up with an image:
# Note that you'll have to queue the buffer back to the
# ImageAcquire object once you consumed the buffer; the
# with statement takes care of it on behalf of you:
while True:
    with ia.fetch() as buffer:
        component = buffer.payload.components[0]
        _2d = component.data.reshape(1088, 1456)
        img = _2d
        img = cv2.resize(img, (640, 480))
        cv2.imshow('right', img)
        cv2.imwrite('test_left.png', img)
        cv2.waitKey(10)
    with ia2.fetch() as buffer:
        component = buffer.payload.components[0]
        _2d = component.data.reshape(component.height, component.width)
        img2 = _2d
        img2 = cv2.resize(img2, (640, 480))
        cv2.imshow('left', img2)
        cv2.imwrite('test_right.png', img2)
        cv2.waitKey(10)
ia.stop()
ia2.stop()
ia.destroy()
ia2.destroy()
h.reset()
I just had to convert it to gray or RGB with cvtColor, and it's working.
Thanks anyway.
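For anyone hitting the same grid pattern, the conversion meant here looks roughly like this (a minimal sketch, assuming the BayerRG8 pixel format set in the script above):
import cv2
import numpy as np

# Placeholder for one raw BayerRG8 frame (in the script above this is the
# reshaped `_2d` array fetched from the camera).
raw = np.zeros((1088, 1456), dtype=np.uint8)

# Debayer before resizing/displaying; the exact COLOR_Bayer* constant may
# need adjusting to match the sensor's Bayer layout.
rgb = cv2.cvtColor(raw, cv2.COLOR_BayerRG2RGB)
gray = cv2.cvtColor(raw, cv2.COLOR_BayerRG2GRAY)
img = cv2.resize(rgb, (640, 480))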

Running TensorFlow on a server (Python)

I am trying to use TensorFlow on an iPage server running CentOS 7.
I don't know if there is a GPU or not, but I get this error message:
kernel driver does not appear to be running on this host /proc/driver/nvidia/version does not exist
I tried this code:
from tensorflow import config
config.set_soft_device_placement = True
and also this code, trying to prevent TensorFlow from using the GPU:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
but I kept receiving the same error message
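One thing worth checking (an assumption about the script, not something shown here): environment variables like CUDA_VISIBLE_DEVICES are typically set before the first tensorflow import so they are already in place when TensorFlow initializes CUDA, and in the TF 2.x API soft device placement is enabled by calling the function rather than assigning to it. A minimal sketch:
import os
# Set this before TensorFlow is imported so it is seen when CUDA initializes.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf
# Enable soft device placement by calling the function (TF 2.x API).
tf.config.set_soft_device_placement(True)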
Note:
I am trying to run a face recognition script on the server that will be used by PHP through the terminal with the exec() function. The image is passed as a base64 string and decoded, and then either a new face is added or existing faces are fetched and compared; the result is then saved in the database.
And this is my script:
import face_recognition as fr
from tensorflow.keras.models import model_from_json
import numpy as np
from PIL import Image
import base64
import io
import json
import cv2
from tensorflow import config

config.set_soft_device_placement = True

def check_if_spoof(image_string_base64):
    # decoding the image
    msg = base64.b64decode(image_string_base64)
    buf = io.BytesIO(msg)
    img = Image.open(buf).convert("RGB")
    opencv_image = np.array(img)
    test_image = opencv_image[:, :, ::-1].copy()
    json_file = open('antispoofing_models/antispoofing_model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    # load antispoofing model weights
    model.load_weights('antispoofing_models/antispoofing_model.h5')
    resized_face = cv2.resize(test_image, (160, 160))
    resized_face = resized_face.astype("float") / 255.0
    # resized_face = img_to_array(resized_face)
    resized_face = np.expand_dims(resized_face, axis=0)
    # pass the face ROI through the trained liveness detector
    # model to determine if the face is "real" or "fake"
    preds = model(resized_face)[0]
    if preds > 0.5:
        return True
    else:
        return False

def decode_img(image_string_base64):
    msg = base64.b64decode(image_string_base64)
    buf = io.BytesIO(msg)
    img = Image.open(buf)
    return img

def resize_image(image_string_base64, height=500):
    image = decode_img(image_string_base64)
    height_percent = (height / float(image.size[1]))
    width_size = int((float(image.size[0]) * float(height_percent)))
    image = image.resize((width_size, height), Image.NEAREST)
    return np.array(image)

def add_person(image_string_base64):
    if check_if_spoof(image_string_base64):
        return {'error': 'image is fake'}
    img = np.array(decode_img(image_string_base64))
    img_encodings = fr.face_encodings(img, num_jitters=4)
    return np.array(img_encodings)

def get_person(image_string_base64, data, tolerance=0.35):
    if check_if_spoof(image_string_base64):
        return {'error': 'image is fake'}
    data = json.loads(data)
    resized_img = resize_image(image_string_base64)
    img_encoding = fr.face_encodings(resized_img)
    ############## testing purpose only #########
    # return fr.compare_faces(data,img_encoding)
    #############################################
    # type of received data is: { id ->number : encoding->list }
    for user in data:
        for id_, encoding in user:
            if True in fr.compare_faces(encoding, img_encoding):
                return id_
    return {'error': "user not existed"}

How to use "model.trt" in Python

I have a PyTorch model that I exported to ONNX and converted to a TensorRT engine with the following command:
trtexec --onnx=model.onnx --batch=400 --saveEngine=model.trt
All of this works, but how do I now load this model.trt in Python and run inference?
The official documentation has a lot of examples. The basic steps to follow are:
ONNX parser: takes a trained model in ONNX format as input and populates a network object in TensorRT
Builder: takes a network in TensorRT and generates an engine that is optimized for the target platform
Engine: takes input data, performs inferences and emits inference output
Logger: object associated with the builder and engine to capture errors, warnings and other information during the build and inference phases
An example for the engine is:
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from onnx import ModelProto
import onnx
import numpy as np
import matplotlib.pyplot as plt
from time import time

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)

batch_size = 1
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
inp_shape = [batch_size, 3, 1024, 1024]  # the shape I was using

def build_engine(onnx_path, shape=inp_shape):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_builder_config() as config, \
         builder.create_network(explicit_batch) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        if builder.platform_has_fast_fp16:
            builder.fp16_mode = True
        builder.max_workspace_size = (1 << 30)
        #builder.max_workspace_size = (3072 << 20)
        #profile = builder.create_optimization_profile()
        #config.max_workspace_size = (3072 << 20)
        #config.add_optimization_profile(profile)
        print("parsing")
        with open(onnx_path, 'rb') as model:
            print("onnx found")
            if not parser.parse(model.read()):
                print("parse failed")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
            #parser.parse(model.read())
        last_layer = network.get_layer(network.num_layers - 1)
        # Check if the last layer recognizes its output
        if not last_layer.get_output(0):
            # If not, then mark the output using the TensorRT API
            network.mark_output(last_layer.get_output(0))
        network.get_input(0).shape = shape
        engine = builder.build_cuda_engine(network)
        return engine

def save_engine(engine, file_name):
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)

def load_engine(trt_runtime, plan_path):
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

if __name__ == "__main__":
    onnx_path = "./path/to/your/model.onnx"
    engine_name = "./path/to/engine.plan"
    model = ModelProto()
    with open(onnx_path, "rb") as f:
        model.ParseFromString(f.read())
    d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
    d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
    d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
    shape = [batch_size, d0, d1, d2]
    print(shape)
    print("trying to build engine")
    engine = build_engine(onnx_path, shape)
    save_engine(engine, engine_name)
    print("finished")
Follow this page for another example and information.
Found an answer based on this tutorial.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda

cuda.init()  # initialize the CUDA driver before creating a context
dev = cuda.Device(0)
ctx = dev.make_context()

try:
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    with open("model.trt", 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    with engine.create_execution_context() as context:
        # get sizes of input and output and allocate memory required for input data and for output data
        for binding in engine:
            if engine.binding_is_input(binding):  # we expect only one input
                input_shape = engine.get_binding_shape(binding)
                input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
                device_input = cuda.mem_alloc(input_size)
            else:  # and one output
                output_shape = engine.get_binding_shape(binding)
                # create page-locked memory buffers (i.e. won't be swapped to disk)
                host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
                device_output = cuda.mem_alloc(host_output.nbytes)

        stream = cuda.Stream()
        host_input = np.array(batch, dtype=np.float32, order='C')  # `batch` is your preprocessed input, not defined in this snippet
        cuda.memcpy_htod_async(device_input, host_input, stream)
        context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_output, device_output, stream)
        stream.synchronize()

        # postprocess results
        output_data = host_output.reshape(engine.max_batch_size, output_shape[0]).T
finally:
    ctx.pop()

an illegal memory access was encountered using PyCUDA and TensorRT

I am using TensorRT in Python code, so I use PyCUDA.
In the following inference code, an 'illegal memory access was encountered' error happens at stream.synchronize().
def infer(engine, x, batch_size, context):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    img = np.array(x).ravel()
    np.copyto(inputs[0].host, 1.0 - img / 255.0)
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
What could be wrong?
EDIT:
My program is a combination of TensorFlow and TensorRT code.
The error happens only when I run
self.graph = tf.get_default_graph()
self.persistent_sess = tf.Session(graph=self.graph, config=tf_config)
before running infer(). If I don't run those two lines, I have no issue.
The issue here is that I have two Python files,
say tensorrtcode.py and tensorflowcode.py.
tensorrtcode.py has only TensorRT code.
def infer(engine, x, batch_size, context):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    img = np.array(x).ravel()
    np.copyto(inputs[0].host, 1.0 - img / 255.0)
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def main():
    .....
    infer(......)
    .....
Then tensorflowcode.py has only TensorFlow APIs and executes with a session.
self.graph = tf.get_default_graph()
self.persistent_sess = tf.Session(graph=self.graph, config=tf_config)
The problem comes when I need to interface the TensorFlow class with the TensorRT code and
declare the TensorFlow code's class instance inside TensorRT's main as
def main():
    .....
    t_flow_code = tensorflowclass()
    infer(......)
    .....
then I get the 'illegal memory access was encountered' error at stream.synchronize().
The problem is solved by adding another session in the TensorRT code just before t_flow_code = tensorflowclass().
I don't understand why I need it, as the TensorFlow class already has its own session for execution. Why do I need another session before interfacing with the class in the TensorRT code?
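In code, the workaround described above would look roughly like this (a sketch only; tensorflowclass, infer, and tf_config are the names used above, and the guess is that the extra session simply makes TensorFlow set up its CUDA context before any PyCUDA/TensorRT work happens):
import tensorflow as tf

def main(engine, x, batch_size, context, tf_config=None):
    # Workaround: create a throwaway TensorFlow session first, before any
    # PyCUDA/TensorRT calls and before instantiating the TensorFlow class.
    warmup_sess = tf.Session(config=tf_config)

    t_flow_code = tensorflowclass()               # TensorFlow wrapper class from above
    return infer(engine, x, batch_size, context)  # TensorRT inference from above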

Inference time of TensorFlow Object Detection

I have deployed my object detection model to Google Kubernetes Engine. My model is trained using the faster_rcnn_resnet101_pets configuration. The inference time of my model is very high (~10 seconds total time for prediction) even though I am using an Nvidia Tesla K80 GPU in my cluster node. I am using gRPC for getting predictions from the model. The script for making prediction requests is:
import argparse
import os
import time
import sys
import tensorflow as tf
from PIL import Image
import numpy as np
from grpc.beta import implementations
sys.path.append("..")
from object_detection.core.standard_fields import \
    DetectionResultFields as dt_fields
from object_detection.utils import label_map_util
from argparse import RawTextHelpFormatter
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc

tf.logging.set_verbosity(tf.logging.INFO)

WIDTH = 1024
HEIGHT = 768

def load_image_into_numpy_array(input_image):
    image = Image.open(input_image)
    image = image.resize((WIDTH, HEIGHT), Image.ANTIALIAS)
    (im_width, im_height) = image.size
    image_arr = np.array(image.getdata()).reshape(
        (im_height, im_width, 3)).astype(np.uint8)
    image.close()
    return image_arr

def load_input_tensor(input_image):
    image_np = load_image_into_numpy_array(input_image)
    image_np_expanded = np.expand_dims(image_np, axis=0).astype(np.uint8)
    tensor = tf.contrib.util.make_tensor_proto(image_np_expanded)
    return tensor

def main(args):
    start_main = time.time()
    host, port = args.server.split(':')
    channel = implementations.insecure_channel(host, int(port))._channel
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = args.model_name
    input_tensor = load_input_tensor(args.input_image)
    request.inputs['inputs'].CopyFrom(input_tensor)
    start = time.time()
    result = stub.Predict(request, 60.0)
    end = time.time()
    output_dict = {}
    output_dict[dt_fields.detection_classes] = np.squeeze(
        result.outputs[dt_fields.detection_classes].float_val).astype(np.uint8)
    output_dict[dt_fields.detection_boxes] = np.reshape(
        result.outputs[dt_fields.detection_boxes].float_val, (-1, 4))
    output_dict[dt_fields.detection_scores] = np.squeeze(
        result.outputs[dt_fields.detection_scores].float_val)
    category_index = label_map_util.create_category_index_from_labelmap(args.label_map,
                                                                        use_display_name=True)
    classes = output_dict[dt_fields.detection_classes]
    scores = output_dict[dt_fields.detection_scores]
    classes.shape = (1, 300)
    scores.shape = (1, 300)
    print("prediction time : " + str(end - start))
    objects = []
    threshold = 0.5  # in order to get higher percentages you need to lower this number; usually at 0.01 you get 100% predicted objects
    for index, value in enumerate(classes[0]):
        object_dict = {}
        if scores[0, index] > threshold:
            object_dict[(category_index.get(value)).get('name').encode('utf8')] = \
                scores[0, index]
            objects.append(object_dict)
    print(objects)
    end_main = time.time()
    print("Overall Time : " + str(end_main - start_main))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Object detection grpc client.",
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('--server',
                        type=str,
                        default='localhost:9000',
                        help='PredictionService host:port')
    parser.add_argument('--model_name',
                        type=str,
                        default="my-model",
                        help='Name of the model')
    parser.add_argument('--input_image',
                        type=str,
                        default='./test_images/123.jpg',
                        help='Path to input image')
    parser.add_argument('--output_directory',
                        type=str,
                        default='./',
                        help='Path to output directory')
    parser.add_argument('--label_map',
                        type=str,
                        default="./data/object_detection.pbtxt",
                        help='Path to label map file')
    args = parser.parse_args()
    main(args)
I have used kubectl port forwarding for testing purposes so the request port is set to localhost:9000.
The output is :
prediction time : 6.690936326980591
[{b'goi_logo': 0.9999970197677612}]
Overall Time : 10.25893259048462
What can I do to make my inference faster? I have seen inference times reported on the order of milliseconds, so in comparison 10 seconds is a very long duration and unfit for production environments. I understand that port forwarding is slow. What other method can I use? I need to make this client available to the world as an API endpoint.
As previous answers stated, you should indeed try to do multiple requests, because tf-serving has some overhead on the first request(s). You can avoid this by using a warm-up script.
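A minimal warm-up sketch, reusing the stub and helper from the question's client (the dummy image path and request count are placeholders):
from tensorflow_serving.apis import predict_pb2

# Hypothetical warm-up helper: send a few requests before timing anything so
# TF Serving's first-request overhead is paid outside the measured path.
# `stub` and `load_input_tensor` are the ones from the client script above.
def warm_up(stub, model_name, dummy_image_path="./test_images/123.jpg", n=3):
    request = predict_pb2.PredictRequest()
    request.model_spec.name = model_name
    request.inputs['inputs'].CopyFrom(load_input_tensor(dummy_image_path))
    for _ in range(n):
        stub.Predict(request, 60.0)  # results are discarded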
To add some extra options:
From tf-serving v1.8 you can also use an HTTP REST API. Then you can call the service that you have created on your GKE cluster from a Google Compute Engine instance to reduce the connection lag. In my case it gave a big speed-up because my local connection was mediocre at best. Besides the HTTP REST API being easier to debug, you can also send much bigger requests. The gRPC limit seems to be 1.5 MB while the HTTP one is a lot higher.
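For reference, a REST call to TF Serving looks roughly like this (a sketch; the host, the default REST port 8501, the model name, and the b64 input layout are assumptions about your deployment):
import base64
import json
import requests

# Hypothetical endpoint; TF Serving's REST API listens on port 8501 by default.
url = "http://localhost:8501/v1/models/my-model:predict"

with open("image.jpg", "rb") as f:
    b64_image = base64.b64encode(f.read()).decode("utf-8")

# For image models the instance is often sent as a b64-wrapped string.
payload = {"instances": [{"b64": b64_image}]}
response = requests.post(url, data=json.dumps(payload))
print(response.json())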
Are you sending b64-encoded images? Sending the images themselves is a lot slower than sending b64-encoded strings. The way I handled this is sending b64-encoded strings of the images and creating some extra layers in front of my network that transform the strings back into jpeg images and then process them through the model. Some code to help you on your way:
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model
import numpy as np
import cv2
import tensorflow as tf
from keras.layers import Input, Lambda
from keras import backend as K

base_model = InceptionV3(
    weights='imagenet',
    include_top=True)
model = Model(
    inputs=base_model.input,
    outputs=base_model.get_layer('avg_pool').output)

def prepare_image(image_str_tensor):
    #image = tf.squeeze(tf.cast(image_str_tensor, tf.string), axis=[0])
    image_str_tensor = tf.cast(image_str_tensor, tf.string)
    image = tf.image.decode_jpeg(image_str_tensor,
                                 channels=3)
    #image = tf.divide(image, 255)
    #image = tf.expand_dims(image, 0)
    image = tf.image.convert_image_dtype(image, tf.float32)
    return image

def prepare_image_batch(image_str_tensor):
    return tf.map_fn(prepare_image, image_str_tensor, dtype=tf.float32)

# IF BYTE STR
model.layers.pop(0)
print(model.layers[0])
input_img = Input(dtype=tf.string,
                  name='string_input',
                  shape=())
outputs = Lambda(prepare_image_batch)(input_img)
outputs = model(outputs)
inception_model = Model(input_img, outputs)
inception_model.compile(optimizer="sgd", loss='categorical_crossentropy')
weights = inception_model.get_weights()
Besides that, I would say use a bigger GPU. I have basic YOLO (Keras implementation) now running on a P100 with about 0.4 s latency when called from a Compute Engine instance. We noticed that the Darknet implementation (in C++) is a lot faster than the Keras implementation, though.
