How to use "model.trt" in Python

How to use "model.trt" in Python - python

I have a pytorch model that I exported to ONNX and converted to a tensorflow model with the following command:
trtexec --onnx=model.onnx --batch=400 --saveEngine=model.trt
All of this works, but how do I now load this model.trt in python and run the inference?

The official documentation has a lot of examples. The basic steps to follow are:
ONNX parser: takes a trained model in ONNX format as input and populates a network object in TensorRT
Builder: takes a network in TensorRT and generates an engine that is optimized for the target platform
Engine: takes input data, performs inferences and emits inference output
Logger: object associated with the builder and engine to capture errors, warnings and other information during the build and inference phases
An example for the engine is:
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from onnx import ModelProto
import onnx
import numpy as np
import matplotlib.pyplot as plt
from time import time
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)
#batch_size = 1
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
#inp_shape = [batch_size, 3, 1024, 1024] # the shape I was using
def build_engine(onnx_path, shape = inp_shape):
with trt.Builder(TRT_LOGGER) as builder,builder.create_builder_config() as config,\
builder.create_network(explicit_batch) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
if builder.platform_has_fast_fp16:
builder.fp16_mode = True
builder.max_workspace_size = (1 << 30)
#builder.max_workspace_size = (3072 << 20)
#profile = builder.create_optimization_profile()
#config.max_workspace_size = (3072 << 20)
#config.add_optimization_profile(profile)
print("parsing")
with open(onnx_path, 'rb') as model:
print("onnx found")
if not parser.parse(model.read()):
print("parse failed")
for error in range(parser.num_errors):
print(parser.get_error(error))
#parser.parse(model.read())
last_layer = network.get_layer(network.num_layers - 1)
# Check if last layer recognizes it's output
if not last_layer.get_output(0):
# If not, then mark the output using TensorRT API
network.mark_output(last_layer.get_output(0))
network.get_input(0).shape = shape
engine = builder.build_cuda_engine(network)
return engine
def save_engine(engine, file_name):
buf = engine.serialize()
with open(file_name, 'wb') as f:
f.write(buf)
def load_engine(trt_runtime, plan_path):
with open(engine_path, 'rb') as f:
engine_data = f.read()
engine = trt_runtime.deserialize_cuda_engine(engine_data)
return engine
if __name__ == "__main__":
onnx_path = "./path/to/your/model.onnx"
engine_name = "./path/to/engine.plan"
model = ModelProto()
with open(onnx_path, "rb") as f:
model.ParseFromString(f.read())
d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
shape = [batch_size , d0, d1 ,d2]
print(shape)
print("trying to build engine")
engine = build_engine(onnx_path,shape)
save_engine(engine,engine_name)
print("finished")
Follow this page for another example and information.

Found an answer based on this tutorial.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
dev = cuda.Device(0)
ctx = dev.make_context()
try:
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
with open("model.trt", 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())
with engine.create_execution_context() as context:
# get sizes of input and output and allocate memory required for input data and for output data
for binding in engine:
if engine.binding_is_input(binding): # we expect only one input
input_shape = engine.get_binding_shape(binding)
input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize # in bytes
device_input = cuda.mem_alloc(input_size)
else: # and one output
output_shape = engine.get_binding_shape(binding)
# create page-locked memory buffers (i.e. won't be swapped to disk)
host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
device_output = cuda.mem_alloc(host_output.nbytes)
stream = cuda.Stream()
host_input = np.array(batch, dtype=np.float32, order='C')
cuda.memcpy_htod_async(device_input, host_input, stream)
context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
cuda.memcpy_dtoh_async(host_output, device_output, stream)
stream.synchronize()
# postprocess results
output_data = host_output.reshape(engine.max_batch_size, output_shape[0]).T
finally:
ctx.pop()

Related

cuMemcpyHtoDAsync failed: invalid argument by using TensorRT (Python)

I am trying to copy an np array to the GPU using TensorRT in Python but I keep getting the error 'cuMemcpyHtoDAsync failed: invalid argument'. The array has the correct format (float32) and size, but the error remains. Does anyone have an idea of what I am doing wrong or how I can fix this error?
import tensorrt as trt
import pycuda.driver as cuda
import numpy as np
import cv2
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
cuda.init()
device = cuda.Device(0)
ctx = device.make_context()
stream = cuda.Stream()
# stream = cuda.Stream()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(host_mem)
else:
outputs.append(host_mem)
return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream):
# Transfer input data to the GPU.
[cuda.memcpy_htod_async(inp, i, stream) for inp, i in zip(bindings[:len(inputs)], inputs)]
# Run inference.
context.execute_async(bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out, o, stream) for out, o in zip(outputs, bindings[len(inputs):])]
# Synchronize the stream
stream.synchronize()
def detect_objects(image, engine, context, threshold=0.5):
# Preprocess the image
image = cv2.resize(image, (640, 640))
image = np.transpose(image, (2, 0, 1))
image = np.expand_dims(image, axis=0)
# Allocate buffers
inputs, outputs, bindings, stream = allocate_buffers(engine)
#inputs[0] = np.ascontiguousarray(image)
inputs[0] = np.ascontiguousarray(image, dtype=np.float32) / 255.0
print(inputs[0].shape)
print(inputs[0].dtype)
# Run inference
do_inference(context, bindings, inputs, outputs, stream)
# Postprocess the outputs
outputs = outputs[0]
outputs = outputs[outputs[:, 0] > threshold]
# Get the bounding boxes
boxes = outputs[:, 1:]
return boxes
# Load the engine
engine = trt.Runtime(trt.Logger(trt.Logger.WARNING)).deserialize_cuda_engine(open("Modelle/best.engine", "rb").read())
context = engine.create_execution_context()
# Read the image
image = cv2.imread("Test.jpg")
# Detect objects in the image
boxes = detect_objects(image, engine, context)
print (boxes)
or am I doing something fundamentally wrong when loading the tensorRT file? Is there another way to index an object on an image?
Thanks

Running python script with multiple values of command line arguments

I have a python script for pre-processing audio and it has frame length, frame step and fft length as the command line arguments. I am able to run the code if I have single values of these arguments. I wanted to know if there is a way in which I can run the python script with multiple values of the arguments? For example, get the output if values of fft lengths are 128, 256 and 512 instead of just one value.
The code for pre-processing is as follows:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.io import wavfile
import os
import time
import pickle
import random
import argparse
import configlib
from configlib import config as C
import mfccwithpaddingandcmd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow import keras
from tensorflow.python.keras import Sequential
from tensorflow.keras.layers import Dense,Conv2D,MaxPooling2D,Flatten,Dropout,BatchNormalization,LSTM,Lambda,Reshape,Bidirectional,GRU
from tensorflow.keras.callbacks import TensorBoard
start = time.time()
classes = ['blinds','fan','light','music','tv']
#dire = r"/mnt/beegfs/home/gehani/test_speech_command/"
parser = configlib.add_parser("Preprocessing config")
parser.add_argument("-dir","--dire", metavar="", help="Directory for the audio files")
def pp():
data_list=[] #To save paths of all the audio files.....all audio files in list format in data_list
#data_list-->folder-->files in folder
for index,label in enumerate(classes):
class_list=[]
if label=='silence': #creating silence folder and storing 1sec noise audio files
silence_path = os.path.join(C["dire"],'silence')
if not os.path.exists(silence_path):
os.mkdir(silence_path)
silence_stride = 2000
#sample_rate = 16000
folder = os.path.join(C["dire"],'_background_noise_') #all silence are kept in the background_noise folder
for file_ in os.listdir(folder):
if '.wav' in file_:
load_path = os.path.join(folder,file_)
sample_rate,y = wavfile.read(load_path)
for i in range(0,len(y)-sample_rate,silence_stride):
file_path = "silence/{}_{}.wav".format(file_[:-4],i)
y_slice = y[i:i+sample_rate]
wavfile.write(os.path.join(C["dire"],file_path),sample_rate,y_slice)
class_list.append(file_path)
else:
folder = os.path.join(C["dire"],label)
for file_ in os.listdir(folder):
file_path = '{}/{}'.format(label,file_) #Ex: up/c9b653a0_nohash_2.wav
class_list.append(file_path)
random.shuffle(class_list) #To shuffle files
data_list.append(class_list) #if not a silence file then just append to the datalist
X = []
Y = []
preemphasis = 0.985
print("Feature Extraction Started")
for i,class_list in enumerate(data_list): #datalist = all files, class list = folder name in datalist, sample = path to the audio file in that particular class list
for j,samples in enumerate(class_list): #samples are of the form classes_name/audio file
if(samples.endswith('.wav')):
sample_rate,audio = wavfile.read(os.path.join(C["dire"],samples))
if(audio.size<sample_rate):
audio = np.pad(audio,(sample_rate-audio.size,0),mode="constant")
#print("****")
#print(sample_rate)
#print(preemphasis)
#print(audio.shape)
coeff = mfccwithpaddingandcmd.mfcc(audio,sample_rate,preemphasis) # 0.985 = preemphasis
#print("****")
#print(coeff)
#print("****")
X.append(coeff)
#print(X)
if(samples.split('/')[0] in classes):
Y.append(samples.split('/')[0])
elif(samples.split('/')[0]=='_background_noise_'):
Y.append('silence')
#print(len(X))
#print(len(Y))
#X= coefficient array and Y = name of the class
A = np.zeros((len(X),X[0].shape[0],X[0][0].shape[0]),dtype='object')
for i in range(0,len(X)):
A[i] = np.array(X[i]) #Converting list X into array A
end1 = time.time()
print("Time taken for feature extraction:{}sec".format(end1-start))
MLB = MultiLabelBinarizer() # one hot encoding for converting labels into binary form
MLB.fit(pd.Series(Y).fillna("missing").str.split(', '))
Y_MLB = MLB.transform(pd.Series(Y).fillna("missing").str.split(', '))
MLB.classes_ #Same like classes array
print(Y_MLB.shape)
pickle_out = open("A_all.pickle","wb") #Writes array A to a file A.pickle
pickle.dump(A, pickle_out) #pickle is the file containing the extracted features
pickle_out.close()
pickle_out = open("Y_all.pickle","wb")
pickle.dump(Y_MLB, pickle_out)
pickle_out.close()
pickle_in = open("Y_all.pickle","rb")
Y = pickle.load(pickle_in)
X = tf.keras.utils.normalize(X)
X_train,X_valtest,Y_train,Y_valtest = train_test_split(X,Y,test_size=0.2,random_state=37)
X_val,X_test,Y_val,Y_test = train_test_split(X_valtest,Y_valtest,test_size=0.5,random_state=37)
print(X_train.shape,X_val.shape,X_test.shape,Y_train.shape,Y_val.shape,Y_test.shape)
if __name__ == "__main__":
configlib.parse(save_fname="last_arguments.txt")
print("Running with configuration:")
configlib.print_config()
pp()
The code for MFCC is as follows:
import tensorflow as tf
import scipy.io.wavfile as wav
import numpy as np
import matplotlib.pyplot as plt
import pickle
import argparse
import configlib
from configlib import config as C
# Configuration arguments
parser = configlib.add_parser("MFCC config")
parser.add_argument("-fl","--frame_length", type=int, default=400, metavar="", help="Frame Length")
parser.add_argument("-fs","--frame_step", type=int, default=160, metavar="", help="Frame Step")
parser.add_argument("-fft","--fft_length", type=int, default=512, metavar="", help="FFT length")
#args = parser.parse_args()
def Preemphasis(signal,pre_emp):
return np.append(signal[0],signal[1:]-pre_emp*signal[:-1])
def Paddinggg(framelength,framestep,samplerate):
frameStart = np.arange(0,samplerate,framestep)
frameEnd = frameStart + framelength
padding = min(frameEnd[(frameEnd > samplerate)]) - samplerate
return padding
def mfcc(audio,sample_rate,pre_emp):
audio = np.pad(audio,(Paddinggg(C["frame_length"],C["frame_step"],sample_rate),0),mode='reflect')
audio = audio.astype('float32')
#Normalization
audio = tf.keras.utils.normalize(audio)
#Preemphasis
audio = Preemphasis(audio,pre_emp)
stfts = tf.signal.stft(audio,C["frame_length"],C["frame_step"],C["fft_length"],window_fn=tf.signal.hann_window)
spectrograms = tf.abs(stfts)
num_spectrogram_bins = stfts.shape[-1]
lower_edge_hertz, upper_edge_hertz, num_mel_bins = 0.0, sample_rate/2.0, 32
linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,upper_edge_hertz)
mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel_weight_matrix, 1)
mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))
# Compute a stabilized log to get log-magnitude mel-scale spectrograms.
log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
# Compute MFCCs from log_mel_spectrograms and take the first 13.
return log_mel_spectrograms
print("End")
And the code for configlib is as follows:
from typing import Dict, Any
import logging
import pprint
import sys
import argparse
# Logging for config library
logger = logging.getLogger(__name__)
# Our global parser that we will collect arguments into
parser = argparse.ArgumentParser(description=__doc__, fromfile_prefix_chars="#")
# Global configuration dictionary that will contain parsed arguments
# It is also this variable that modules use to access parsed arguments
config:Dict[str, Any] = {}
def add_parser(title: str, description: str = ""):
"""Create a new context for arguments and return a handle."""
return parser.add_argument_group(title, description)
def parse(save_fname: str = "") -> Dict[str, Any]:
"""Parse given arguments."""
config.update(vars(parser.parse_args()))
logging.info("Parsed %i arguments.", len(config))
# Optionally save passed arguments
if save_fname:
with open(save_fname, "w") as fout:
fout.write("\n".join(sys.argv[1:]))
logging.info("Saving arguments to %s.", save_fname)
return config
def print_config():
"""Print the current config to stdout."""
pprint.pprint(config)
I use the following command to run my python file:
python3.7 preprocessingwithpaddingandcmd.py -fl 1103 -fs 88 -fft 512 -dir /mnt/beegfs/home/gehani/appliances_audio_one_channel
Should I be writing a shell script or python has some options for it?
EDIT 1
I tried using
parser.add_argument('-fft', '--fft_length', type=int, default=[], nargs=3)
for getting fft length from the command line and used the command
run preprocessingwithpaddingandcmd -dir filepath -fl 1765 -fs 1102 -fft 512 218 64
to run it. But, it gives me this error: ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Can anyone please help?

I found you can do it by these. mfcc features extraction
You can create your own mfcc features extraction or you can limit window lengths and ceptrums that is enough for simple works except you need logarithms scales where you can use target matrix ( convolution ) or else.
It is logarithms when you use FFT or alternative derivation but mfcc is only extraction where I will provide the sample output in picture.
[ Sample ]:
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import tensorflow as tf
import matplotlib.pyplot as plt
(rate,sig) = wav.read("F:\\temp\\Python\\Speech\\temple_of_love-sisters_of_mercy.wav")
mfcc_feat = mfcc(signal=sig, samplerate=rate, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True)
fbank_feat = logfbank(sig,rate)
plt.plot( mfcc_feat[50:42000,0] )
plt.xlabel("sample")
plt.show()
plt.close()
input('...')

Running tensorflow on server Python

trying to use tensorflow on ipage server with operating system centOs 7
I don't know if there is a GPU or not , but I get this error message
kernel driver does not appear to be running on this host /proc/driver/nvidia/version does not exist
I tried this code
from tensorflow import config
config.set_soft_device_placement = True
and also this code trying to prevent tensorflow from using GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
but I kept receiving the same error message
Note :
I am trying to run face recognition script on server to be used by php through terminal using exec() function by passing the image as string base64 and then decoding it and then adding new face or fetching faces and comparing and then will be saved in the database
and this is my script :
import face_recognition as fr
from tensorflow.keras.models import model_from_json
import numpy as np
from PIL import Image
import base64
import io
import json
import cv2
from tensorflow import config
config.set_soft_device_placement = True
def check_if_spoof(image_string_base64):
# decoding the image
msg = base64.b64decode(image_string_base64)
buf = io.BytesIO(msg)
img = Image.open(buf).convert("RGB")
opencv_image = np.array(img)
test_image = opencv_image[:,:,::-1].copy()
json_file = open('antispoofing_models/antispoofing_model.json','r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load antispoofing model weights
model.load_weights('antispoofing_models/antispoofing_model.h5')
resized_face = cv2.resize(test_image,(160,160))
resized_face = resized_face.astype("float") / 255.0
# resized_face = img_to_array(resized_face)
resized_face = np.expand_dims(resized_face, axis=0)
# pass the face ROI through the trained liveness detector
# model to determine if the face is "real" or "fake"
preds = model(resized_face)[0]
if preds> 0.5:
return True
else:
return False
def decode_img(image_string_base64):
msg = base64.b64decode(image_string_base64)
buf = io.BytesIO(msg)
img = Image.open(buf)
return img
def resize_image(image_string_base64, height=500):
image = decode_img(image_string_base64)
height_percent = (height / float(image.size[1]))
width_size = int((float(image.size[0]) * float(height_percent)))
image = image.resize((width_size, height), Image.NEAREST)
return np.array(image)
def add_person(image_string_base64):
if check_if_spoof(image_string_base64):
return {'error':'image is fake'}
img = np.array(decode_img(image_string_base64))
img_encodings = fr.face_encodings(img, num_jitters=4)
return np.array(img_encodings)
def get_person(image_string_base64, data, tolerance=0.35):
if check_if_spoof(image_string_base64):
return {'error':'image is fake'}
data = json.loads(data)
resized_img = resize_image(image_string_base64)
img_encoding = fr.face_encodings(resized_img)
############## testing purpose only #########
# return fr.compare_faces(data,img_encoding)
#############################################
# type of recieved data is : { id ->number : encoding->list}
for user in data:
for id_ , encoding in user:
if True in fr.compare_faces(encoding, img_encoding):
return id_
return {'error':"user not existed"}

Cannot get prediction from google AI platform with Pytorch

I've deployed a custom Pytorch model to the Google AI platform for prediction, but when I try to make a prediction request with image data using gcloud tools I get the following error in response:
{
"error": "Prediction failed: unknown error."
}
I've tried to encode my image data in b64 format or to place it into a multidimensional python array, by doing the following:
pil_im = Image.open('Pic512.png')
pil_im = pil_im.resize((224,224)).convert('RGB')
im_arr = np.asarray(pil_im)
py_arr = im_arr.tolist()
json_instance_1 = {'instances': py_arr}
with open('json_instance_1.json', 'w') as f:
json.dump(json_instance_1, f)
I converted it into b64 like so, after adjusting my Predictor code accordingly:
with open('Pic512.png', 'rb') as f:
byte_im = f.read()
json_instance = {'instances': {'b64': base64.b64encode(byte_im).decode()}}
with open('json_instance.json', 'w') as f:
json.dump(json_instance, f)
I've tried converting with different file formats and similar methods, but all of them give me the same error.
My predictor module:
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
import torch
from torchvision import transforms
from torch.nn import functional as F
from PIL import Image
# from sklearn.externals import joblib
import numpy as np
import os
import io
import base64
class MyPredictor(object):
"""An example Predictor for an AI Platform custom prediction routine."""
def __init__(self, model, preprocessor, device):
"""Stores artifacts for prediction. Only initialized via `from_path`.
"""
self._resnet = model
self._mtcnn_mult = preprocessor
self._device = device
self.get_std_tensor = transforms.Compose([
np.float32,
np.uint8,
transforms.ToTensor(),
])
self.tensor2pil = transforms.ToPILImage(mode='RGB')
self.trans_resnet = transforms.Compose([
transforms.Resize((100, 100)),
np.float32,
transforms.ToTensor()
])
def predict(self, instances, **kwargs):
pil_transform = transforms.Resize((512, 512))
imarr = np.uint8(np.array(instances))
# img_bytes_string = io.BytesIO(base64.b64decode(instances))
pil_im = Image.fromarray(imarr)
# pil_im = Image.open(img_bytes_string)
image = pil_im.convert('RGB')
pil_im_512 = pil_transform(image)
boxes, _ = self._mtcnn_mult.detect(pil_im_512)
box = boxes[0]
face_tensor = extract_face(pil_im_512, box, margin=40)
std_tensor = self.get_std_tensor(face_tensor.permute(1, 2, 0))
cropped_pil_im = self.tensor2pil(std_tensor)
face_tensor = self.trans_resnet(cropped_pil_im)
face_tensor4d = face_tensor.unsqueeze(0)
face_tensor4d = face_tensor4d.to(self._device)
self._resnet.eval()
prediction = self._resnet(face_tensor4d)
preds = F.softmax(prediction, dim=1).detach().numpy().reshape(-1)
print('probability of (class1, class2) = ({:.4f}, {:.4f})'.format(preds[0], preds[1]))
return {'probs':preds.tolist()}
#classmethod
def from_path(cls, model_dir):
device_path = os.path.join(model_dir, 'device_cpu.pt')
device = torch.load(device_path)
model_path = os.path.join(model_dir, 'FullResNetRefinedExtra_no_norm_100x100_8634.pt')
classifier = torch.load(model_path, map_location=device)
mtcnn_path = os.path.join(model_dir, 'mtcnn_mult.pt')
mtcnn_mult = torch.load(mtcnn_path)
return cls(classifier, mtcnn_mult, device)
When I test the class locally everything works, so I assume it's a problem related the serialisation and deserialisation on the side of Google Platform. How can I resolve this issue?

Inference time using of Tensorflow Object Detection

I have deployed my object detection model to Google Kubernetes Engine. My model is trained using faster_rcnn_resnet101_pets configuration. The inference time of my model is very high (~10 seconds total time for prediction and ) even though I am using a Nvidia Tesla K80 GPU in my cluster node. I am using gRPC for getting predicitons from the model. The script for making prediciton requests is :
import argparse
import os
import time
import sys
import tensorflow as tf
from PIL import Image
import numpy as np
from grpc.beta import implementations
sys.path.append("..")
from object_detection.core.standard_fields import \
DetectionResultFields as dt_fields
from object_detection.utils import label_map_util
from argparse import RawTextHelpFormatter
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc
tf.logging.set_verbosity(tf.logging.INFO)
WIDTH = 1024
HEIGHT = 768
def load_image_into_numpy_array(input_image):
image = Image.open(input_image)
image = image.resize((WIDTH, HEIGHT), Image.ANTIALIAS)
(im_width, im_height) = image.size
image_arr = np.array(image.getdata()).reshape(
(im_height, im_width, 3)).astype(np.uint8)
image.close()
return image_arr
def load_input_tensor(input_image):
image_np = load_image_into_numpy_array(input_image)
image_np_expanded = np.expand_dims(image_np, axis=0).astype(np.uint8)
tensor = tf.contrib.util.make_tensor_proto(image_np_expanded)
return tensor
def main(args):
start_main = time.time()
host, port = args.server.split(':')
channel = implementations.insecure_channel(host, int(port))._channel
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
request = predict_pb2.PredictRequest()
request.model_spec.name = args.model_name
input_tensor = load_input_tensor(args.input_image)
request.inputs['inputs'].CopyFrom(input_tensor)
start = time.time()
result = stub.Predict(request, 60.0)
end = time.time()
output_dict = {}
output_dict[dt_fields.detection_classes] = np.squeeze(
result.outputs[dt_fields.detection_classes].float_val).astype(np.uint8)
output_dict[dt_fields.detection_boxes] = np.reshape(
result.outputs[dt_fields.detection_boxes].float_val, (-1, 4))
output_dict[dt_fields.detection_scores] = np.squeeze(
result.outputs[dt_fields.detection_scores].float_val)
category_index = label_map_util.create_category_index_from_labelmap(args.label_map,
use_display_name=True)
classes = output_dict[dt_fields.detection_classes]
scores = output_dict[dt_fields.detection_scores]
classes.shape = (1, 300)
scores.shape = (1, 300)
print("prediction time : " + str(end-start))
objects = []
threshold = 0.5 # in order to get higher percentages you need to lower this number; usually at 0.01 you get 100% predicted objects
for index, value in enumerate(classes[0]):
object_dict = {}
if scores[0, index] > threshold:
object_dict[(category_index.get(value)).get('name').encode('utf8')] = \
scores[0, index]
objects.append(object_dict)
print(objects)
end_main = time.time()
print("Overall Time : " + str(end_main-start_main))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Object detection grpc client.",
formatter_class=RawTextHelpFormatter)
parser.add_argument('--server',
type=str,
default='localhost:9000',
help='PredictionService host:port')
parser.add_argument('--model_name',
type=str,
default="my-model",
help='Name of the model')
parser.add_argument('--input_image',
type=str,
default='./test_images/123.jpg',
help='Path to input image')
parser.add_argument('--output_directory',
type=str,
default='./',
help='Path to output directory')
parser.add_argument('--label_map',
type=str,
default="./data/object_detection.pbtxt",
help='Path to label map file')
args = parser.parse_args()
main(args)
I have used kubectl port forwarding for testing purposes so the request port is set to localhost:9000.
The output is :
prediction time : 6.690936326980591
[{b'goi_logo': 0.9999970197677612}]
Overall Time : 10.25893259048462
What can I do to make my inference faster? I have seen that the inference time is in the order of milliseconds so in comparison 10 seconds is a very long duration and unfit for production environments. I understand that port forwarding is slow. What is another method that I can use? I need to make this client available to the world as an API endpoint.

As previous answers stated, you should indeed try to do multiple requests because tf-serving needs some overhead the first time(s). You can prevent this by using a warm-up script.
To add some extra options:
from tf-serving v1.8 you can also use a http rest API service. Then you can call the service that you have created on your GKE from a google compute engine to reduce the connection lag. In my case it had a big speed-up because my local connection was mediocre at best. Next to http rest api being more workable to debug, you can also send much bigger requests. The grpc limit seems to be 1.5 mb while the http one is a lot higher.
Are you sending b64 encoded images? Sending the images themselves is a lot slower than sending b64 encoded strings. The way I handled this is sending b64 encoded strings from the images and create some extra layers in front of my network that transform the string to jpeg images again and then process them through the model. Some code to help you on your way:
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model
import numpy as np
import cv2
import tensorflow as tf
from keras.layers import Input, Lambda
from keras import backend as K
base_model = InceptionV3(
weights='imagenet',
include_top=True)
model = Model(
inputs=base_model.input,
outputs=base_model.get_layer('avg_pool').output)
def prepare_image(image_str_tensor):
#image = tf.squeeze(tf.cast(image_str_tensor, tf.string), axis=[0])
image_str_tensor = tf.cast(image_str_tensor, tf.string)
image = tf.image.decode_jpeg(image_str_tensor,
channels=3)
#image = tf.divide(image, 255)
#image = tf.expand_dims(image, 0)
image = tf.image.convert_image_dtype(image, tf.float32)
return image
def prepare_image_batch(image_str_tensor):
return tf.map_fn(prepare_image, image_str_tensor, dtype=tf.float32)
# IF BYTE STR
model.layers.pop(0)
print(model.layers[0])
input_img = Input(dtype= tf.string,
name ='string_input',
shape = ()
)
outputs = Lambda(prepare_image_batch)(input_img)
outputs = model(outputs)
inception_model = Model(input_img, outputs)
inception_model.compile(optimizer = "sgd", loss='categorical_crossentropy')
weights = inception_model.get_weights()
Next to that, I would say use a bigger gpu. I have basic yolo (keras implementation) now running on a P100 with about 0.4s latency when called from a compute engine. We noticed that the darknet implementation (in c++) is a lot faster than the keras implementation tho.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to use "model.trt" in Python - python

I have a pytorch model that I exported to ONNX and converted to a tensorflow model with the following command: trtexec --onnx=model.onnx --batch=400 --saveEngine=model.trt All of this works, but how do I now load this model.trt in python and run the inference?

Related

cuMemcpyHtoDAsync failed: invalid argument by using TensorRT (Python)

Running python script with multiple values of command line arguments

Running tensorflow on server Python

Cannot get prediction from google AI platform with Pytorch

Inference time using of Tensorflow Object Detection

Categories

Resources