I have deployed my object detection model to Google Kubernetes Engine. My model is trained using the faster_rcnn_resnet101_pets configuration. The inference time of my model is very high (~10 seconds in total for the prediction request and the client-side processing) even though I am using an Nvidia Tesla K80 GPU in my cluster node. I am using gRPC for getting predictions from the model. The script for making prediction requests is:
import argparse
import os
import time
import sys
import tensorflow as tf
from PIL import Image
import numpy as np
from grpc.beta import implementations
sys.path.append("..")
from object_detection.core.standard_fields import \
DetectionResultFields as dt_fields
from object_detection.utils import label_map_util
from argparse import RawTextHelpFormatter
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc
# Emit INFO-level TensorFlow log messages.
tf.logging.set_verbosity(tf.logging.INFO)

# Size in pixels that every input image is resized to before it is sent.
WIDTH, HEIGHT = 1024, 768
def load_image_into_numpy_array(input_image):
    """Load an image file as a ``(HEIGHT, WIDTH, 3)`` uint8 array.

    The image is converted to RGB first so greyscale, palette and RGBA
    files still produce three channels, then resized to WIDTH x HEIGHT.

    Args:
        input_image: Path or file object accepted by ``PIL.Image.open``.

    Returns:
        numpy.ndarray of dtype uint8 with shape ``(HEIGHT, WIDTH, 3)``.
    """
    # The context manager closes the file even when decoding fails.
    with Image.open(input_image) as image:
        resized = image.convert('RGB').resize((WIDTH, HEIGHT), Image.ANTIALIAS)
        # Converting the image directly uses its pixel buffer; going through
        # image.getdata() creates a Python object per pixel and is far slower.
        return np.array(resized, dtype=np.uint8)
def load_input_tensor(input_image):
    """Turn an image file into a batched uint8 TensorProto.

    The image is loaded with ``load_image_into_numpy_array`` and a leading
    batch axis of size one is added before serialisation.
    """
    pixels = load_image_into_numpy_array(input_image)
    batch = pixels[np.newaxis, ...].astype(np.uint8)
    return tf.contrib.util.make_tensor_proto(batch)
def main(args):
    """Send one image to TensorFlow Serving and print the detections.

    Prints the round-trip time of the Predict call, the list of
    ``{class_name: score}`` dicts whose score exceeds the threshold, and the
    overall run time.

    Args:
        args: Parsed command-line namespace providing ``server``
            (``host:port``), ``model_name``, ``input_image`` and
            ``label_map``.
    """
    start_main = time.time()
    host, port = args.server.split(':')
    channel = implementations.insecure_channel(host, int(port))._channel
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

    request = predict_pb2.PredictRequest()
    request.model_spec.name = args.model_name
    request.inputs['inputs'].CopyFrom(load_input_tensor(args.input_image))

    # Only the request/response round trip counts as prediction time.
    start = time.time()
    result = stub.Predict(request, 60.0)  # 60 second deadline
    end = time.time()

    # Flatten to 1-D rather than forcing a fixed (1, 300) shape, so models
    # exported with a different number of detections work too. int64 avoids
    # the wrap-around a uint8 cast causes for class ids above 255.
    classes = np.asarray(
        result.outputs[dt_fields.detection_classes].float_val).astype(
            np.int64).reshape(-1)
    scores = np.asarray(
        result.outputs[dt_fields.detection_scores].float_val).reshape(-1)

    category_index = label_map_util.create_category_index_from_labelmap(
        args.label_map, use_display_name=True)
    print("prediction time : " + str(end-start))

    threshold = 0.5  # lower this value to report less confident detections
    objects = []
    for class_id, score in zip(classes, scores):
        if score <= threshold:
            continue
        category = category_index.get(int(class_id))
        if category is None:
            # The label map has no entry for this id; report it and move on
            # instead of crashing on None.get(...).
            tf.logging.warning("class id %d is missing from the label map",
                               int(class_id))
            continue
        objects.append({category.get('name').encode('utf8'): score})
    print(objects)

    end_main = time.time()
    print("Overall Time : " + str(end_main-start_main))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Object detection grpc client.",
        formatter_class=RawTextHelpFormatter)
    # Every option is a plain string: (flag, default value, help text).
    string_options = (
        ('--server', 'localhost:9000', 'PredictionService host:port'),
        ('--model_name', "my-model", 'Name of the model'),
        ('--input_image', './test_images/123.jpg', 'Path to input image'),
        ('--output_directory', './', 'Path to output directory'),
        ('--label_map', "./data/object_detection.pbtxt",
         'Path to label map file'),
    )
    for flag, default_value, help_text in string_options:
        parser.add_argument(flag, type=str, default=default_value,
                            help=help_text)
    args = parser.parse_args()
    main(args)
I have used kubectl port forwarding for testing purposes so the request port is set to localhost:9000.
The output is :
prediction time : 6.690936326980591
[{b'goi_logo': 0.9999970197677612}]
Overall Time : 10.25893259048462
What can I do to make my inference faster? I have seen that the inference time is in the order of milliseconds so in comparison 10 seconds is a very long duration and unfit for production environments. I understand that port forwarding is slow. What is another method that I can use? I need to make this client available to the world as an API endpoint.
As previous answers stated, you should indeed try to do multiple requests because tf-serving needs some overhead the first time(s). You can prevent this by using a warm-up script.
To add some extra options:
From tf-serving v1.8 you can also use an HTTP REST API service. Then you can call the service that you have created on your GKE cluster from a Google Compute Engine instance to reduce the connection lag. In my case this gave a big speed-up because my local connection was mediocre at best. Besides the HTTP REST API being easier to debug, you can also send much bigger requests. The gRPC limit seems to be 1.5 MB while the HTTP one is a lot higher.
Are you sending b64 encoded images? Sending the images themselves is a lot slower than sending b64 encoded strings. The way I handled this is sending b64 encoded strings from the images and create some extra layers in front of my network that transform the string to jpeg images again and then process them through the model. Some code to help you on your way:
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model
import numpy as np
import cv2
import tensorflow as tf
from keras.layers import Input, Lambda
from keras import backend as K
# ImageNet-weighted InceptionV3, cut off at the 'avg_pool' layer so the model
# outputs that layer's activations instead of class probabilities.
base_model = InceptionV3(weights='imagenet', include_top=True)
model = Model(inputs=base_model.input,
              outputs=base_model.get_layer('avg_pool').output)
def prepare_image(image_str_tensor):
    """Decode one JPEG-encoded string tensor into a float32 RGB image."""
    jpeg_bytes = tf.cast(image_str_tensor, tf.string)
    decoded = tf.image.decode_jpeg(jpeg_bytes, channels=3)
    return tf.image.convert_image_dtype(decoded, tf.float32)
def prepare_image_batch(image_str_tensor):
    """Apply ``prepare_image`` to every element of a batch of strings."""
    decoded_batch = tf.map_fn(prepare_image, image_str_tensor,
                              dtype=tf.float32)
    return decoded_batch
# IF BYTE STR
# Remove the first entry of the layer list, then put a string input and the
# JPEG-decoding Lambda in front of the feature extractor.
model.layers.pop(0)
print(model.layers[0])
input_img = Input(dtype=tf.string, name='string_input', shape=())
outputs = model(Lambda(prepare_image_batch)(input_img))
inception_model = Model(input_img, outputs)
inception_model.compile(optimizer="sgd", loss='categorical_crossentropy')
weights = inception_model.get_weights()
Next to that, I would say use a bigger GPU. I have basic YOLO (Keras implementation) now running on a P100 with about 0.4 s latency when called from a Compute Engine instance. We noticed that the Darknet implementation (in C++) is a lot faster than the Keras implementation, though.
Related
I am trying to make a website that can make predictions on images using tensorflow, flask, and python.
This is my code:
from flask import Flask, render_template
import os
import numpy as np
import pandas as pd
app = Flask(__name__)


# The decorator must be live (not a comment) for Flask to register the route.
@app.route('/')
def index():
    """Serve the landing page rendered from ``index.html``."""
    return render_template('index.html')
import tensorflow as tf
import tensorflow_hub as hub
model = tf.keras.models.load_model(MODEL_PATH)
IMG_SIZE = 224
BATCH_SIZE = 32
custom_path = "http://t1.gstatic.com/licensed-image?q=tbn:ANd9GcQd6lM4HtInRF3cxw6h3MgUZIIiJCdMgFvXKrhaJrbw61tN3aYpMIVBi0dx0KPv1sdCrLk0sBhPeNVt8m0"
# tf.io.read_file has no 'http' file system, so download the image to the
# local Keras cache first. get_file reuses an existing file of the same name,
# so change the file name when the URL changes.
custom_image_file = tf.keras.utils.get_file("custom_image.jpg",
                                            origin=custom_path)
# Pass a list of paths: Dataset.from_tensor_slices needs rank >= 1 input.
custom_data = create_data_batches([custom_image_file], test_data=True)
custom_preds = model.predict(custom_data)
# Get custom image prediction labels
custom_pred_labels = [get_pred_label(pred) for pred in custom_preds]
print(custom_pred_labels)
# The decorator must be live (not a comment) for Flask to register the route.
@app.route('/my-link/')
def my_link():
    """Return the labels predicted for the custom image as plain text."""
    return f"The predictions are: {custom_pred_labels}"


if __name__ == '__main__':
    app.run(host="localhost", port=3000, debug=True)
The process_image function:
def process_image(image_path, img_size=IMG_SIZE):
    """
    Reads the file at `image_path`, decodes it as a 3-channel JPEG and returns
    it as a float32 Tensor resized to `img_size` x `img_size`.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize(as_float, size=[img_size, img_size])
The needed part of the create_data_batches function:
def create_data_batches(X, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
    """
    Creates batches out of data out of image (X) and label (y) pairs.
    Shuffles the data if it's training data but doesn't shuffle if it's validation data.
    Also accepts test data as input (no labels)

    For test data, X may be a single file path or a sequence of file paths.
    Only the test-data branch is shown in this excerpt; for any other mode
    this excerpt falls through and returns None.
    """
    if test_data:
        print("Creating test data batches...")
        # A bare path is a rank-0 tensor, which from_tensor_slices rejects
        # ("Unbatching a tensor is only supported for rank >= 1").
        if isinstance(X, (str, bytes)):
            X = [X]
        data = tf.data.Dataset.from_tensor_slices(tf.constant(X))  # only filepaths (no labels)
        # Honour the caller's batch_size instead of the module constant.
        data_batch = data.map(process_image).batch(batch_size)
        return data_batch
The get_image_label function:
def get_image_label(image_path, label):
    """
    Processes the image at `image_path` with `process_image` and returns it
    together with the unchanged `label` as an (image, label) tuple.
    """
    return process_image(image_path), label
The get_pred_label function:
def get_pred_label(prediction_probabilites):
    """
    Returns the entry of `unique_breeds` at the position of the highest
    prediction probability.
    """
    best_index = np.argmax(prediction_probabilites)
    return unique_breeds[best_index]
Now when I run this, I get the following error:
ValueError: Unbatching a tensor is only supported for rank >= 1
I tried turning it into a list as one of the solutions I found said:
custom_path = ["http://t1.gstatic.com/licensed-image?q=tbn:ANd9GcQd6lM4HtInRF3cxw6h3MgUZIIiJCdMgFvXKrhaJrbw61tN3aYpMIVBi0dx0KPv1sdCrLk0sBhPeNVt8m0"]
But when I run that, I get this error:
UNIMPLEMENTED: File system scheme 'http' not implemented (file: 'http://t1.gstatic.com/licensed-image?q=tbn:ANd9GcQd6lM4HtInRF3cxw6h3MgUZIIiJCdMgFvXKrhaJrbw61tN3aYpMIVBi0dx0KPv1sdCrLk0sBhPeNVt8m0')
Any help would be appreciated.
I am trying to copy an np array to the GPU using TensorRT in Python but I keep getting the error 'cuMemcpyHtoDAsync failed: invalid argument'. The array has the correct format (float32) and size, but the error remains. Does anyone have an idea of what I am doing wrong or how I can fix this error?
import tensorrt as trt
import pycuda.driver as cuda
import numpy as np
import cv2
class _DevicePtr(int):
    """Integer device address that keeps its pycuda allocation alive.

    pycuda frees device memory when the DeviceAllocation object is deleted,
    so storing only ``int(allocation)`` leaves a dangling address. This int
    subclass carries the allocation along with the address.
    """

    def __new__(cls, allocation):
        self = super().__new__(cls, int(allocation))
        self.allocation = allocation
        return self


def allocate_buffers(engine):
    """Allocate a host and a device buffer for every engine binding.

    Args:
        engine: Deserialized TensorRT engine.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: page-locked host
        arrays for input and output bindings, the device addresses of all
        bindings in engine order, and the CUDA stream to run on.
    """
    cuda.init()
    # NOTE(review): a new context is created and left current on every call
    # and is never popped; consider creating it once, before the engine is
    # deserialized.
    ctx = cuda.Device(0).make_context()
    stream = cuda.Stream()

    inputs, outputs, bindings = [], [], []
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Store the address together with its allocation so the device
        # memory is not released while the bindings list is in use.
        bindings.append(_DevicePtr(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(host_mem)
        else:
            outputs.append(host_mem)
    return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream):
    """Copy inputs to the device, run the engine and copy outputs back.

    The first ``len(inputs)`` entries of ``bindings`` are paired with
    ``inputs`` and the remaining entries with ``outputs``. All work is queued
    on ``stream`` and the call returns after the stream is synchronized.
    """
    n_inputs = len(inputs)
    # Transfer input data to the GPU.
    for device_ptr, host_buf in zip(bindings[:n_inputs], inputs):
        cuda.memcpy_htod_async(device_ptr, host_buf, stream)
    # Run inference.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for host_buf, device_ptr in zip(outputs, bindings[n_inputs:]):
        cuda.memcpy_dtoh_async(host_buf, device_ptr, stream)
    # Synchronize the stream
    stream.synchronize()
def detect_objects(image, engine, context, threshold=0.5):
    """Run the engine on one image and return the rows above ``threshold``.

    Args:
        image: HxWx3 image array as returned by ``cv2.imread``.
        engine: Deserialized TensorRT engine.
        context: Execution context created from ``engine``.
        threshold: Minimum value of column 0 for a row to be kept.

    Returns:
        2-D array with columns 1.. of every kept output row.

    Raises:
        ValueError: If the preprocessed image does not fit the input buffer.
    """
    # Preprocess the image: resize, HWC -> CHW, add batch axis, scale to 0..1
    resized = cv2.resize(image, (640, 640))
    chw = np.transpose(resized, (2, 0, 1))
    batch = np.ascontiguousarray(np.expand_dims(chw, axis=0), dtype=np.float32) / 255.0
    # Allocate buffers
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    # Fill the page-locked buffer in place instead of replacing it with a
    # pageable array; the async copy reads from this buffer.
    flat = batch.ravel()
    if flat.size > inputs[0].size:
        raise ValueError("preprocessed image has %d values but the input "
                         "binding holds only %d" % (flat.size, inputs[0].size))
    np.copyto(inputs[0][:flat.size], flat)
    print(batch.shape)
    print(batch.dtype)
    # Run inference
    do_inference(context, bindings, inputs, outputs, stream)
    # Postprocess the outputs: the host buffer is flat, so restore one row
    # per detection using the last dimension of the first output binding.
    output_shape = next(tuple(engine.get_binding_shape(binding))
                        for binding in engine
                        if not engine.binding_is_input(binding))
    detections = outputs[0].reshape(-1, output_shape[-1])
    # NOTE(review): assumes column 0 is the confidence and the remaining
    # columns describe the box - confirm against the exported model's layout.
    detections = detections[detections[:, 0] > threshold]
    # Get the bounding boxes
    boxes = detections[:, 1:]
    return boxes
# Load the engine; the runtime is kept in a variable so it stays alive for as
# long as the engine is used, and the file is closed once it has been read.
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
with open("Modelle/best.engine", "rb") as engine_file:
    engine = runtime.deserialize_cuda_engine(engine_file.read())
context = engine.create_execution_context()
# Read the image; cv2.imread returns None instead of raising when it fails.
image = cv2.imread("Test.jpg")
if image is None:
    raise FileNotFoundError("Test.jpg")
# Detect objects in the image
boxes = detect_objects(image, engine, context)
print (boxes)
or am I doing something fundamentally wrong when loading the tensorRT file? Is there another way to index an object on an image?
Thanks
I am trying to use TensorFlow on an iPage server running CentOS 7.
I don't know whether the server has a GPU, but I get this error message:
kernel driver does not appear to be running on this host /proc/driver/nvidia/version does not exist
I tried this code
from tensorflow import config
# set_soft_device_placement is a function; assigning True to it only
# overwrites the attribute and changes no setting.
config.set_soft_device_placement(True)
and also this code trying to prevent tensorflow from using GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
but I kept receiving the same error message
Note :
I am trying to run a face recognition script on the server. PHP calls it through the terminal using the exec() function, passing the image as a base64 string. The script decodes the image and then either adds a new face or fetches the stored faces and compares them; the result is then saved in the database.
and this is my script :
import face_recognition as fr
from tensorflow.keras.models import model_from_json
import numpy as np
from PIL import Image
import base64
import io
import json
import cv2
from tensorflow import config
# set_soft_device_placement is a function; assigning True to it only
# overwrites the attribute and changes no setting.
config.set_soft_device_placement(True)
def check_if_spoof(image_string_base64):
    """Run the antispoofing model on a base64-encoded image.

    Args:
        image_string_base64: Base64 text of an image file readable by PIL.

    Returns:
        True when the model output for the image is above 0.5, else False.
        The callers in this file treat True as "image is fake".
    """
    # decoding the image
    msg = base64.b64decode(image_string_base64)
    img = Image.open(io.BytesIO(msg)).convert("RGB")
    # Reverse the channel axis (RGB -> BGR) and copy to get a contiguous array.
    test_image = np.array(img)[:, :, ::-1].copy()
    # The context manager closes the file even if reading or parsing fails.
    with open('antispoofing_models/antispoofing_model.json', 'r') as json_file:
        model = model_from_json(json_file.read())
    # load antispoofing model weights
    model.load_weights('antispoofing_models/antispoofing_model.h5')
    resized_face = cv2.resize(test_image, (160, 160))
    resized_face = resized_face.astype("float") / 255.0
    resized_face = np.expand_dims(resized_face, axis=0)
    # pass the face ROI through the trained liveness detector
    # model to determine if the face is "real" or "fake"
    preds = model(resized_face)[0]
    return bool(preds > 0.5)
def decode_img(image_string_base64):
    """Decode base64 image text and open it as a PIL image."""
    raw_stream = io.BytesIO(base64.b64decode(image_string_base64))
    return Image.open(raw_stream)
def resize_image(image_string_base64, height=500):
    """Decode a base64 image and scale it to ``height`` pixels tall.

    The width is scaled by the same factor and truncated to an int. The
    result is returned as a numpy array.
    """
    picture = decode_img(image_string_base64)
    orig_width, orig_height = picture.size[0], picture.size[1]
    scale = height / float(orig_height)
    new_width = int(float(orig_width) * float(scale))
    return np.array(picture.resize((new_width, height), Image.NEAREST))
def add_person(image_string_base64):
    """Return the face encodings of a base64 image as a numpy array.

    Returns an error dict instead when ``check_if_spoof`` flags the image.
    """
    if check_if_spoof(image_string_base64):
        return {'error':'image is fake'}
    pixels = np.array(decode_img(image_string_base64))
    return np.array(fr.face_encodings(pixels, num_jitters=4))
def get_person(image_string_base64, data, tolerance=0.35):
    """Find the stored user whose face matches the given image.

    Args:
        image_string_base64: Base64 text of the image to look up.
        data: JSON text holding either one ``{id: encoding}`` mapping or a
            list of such mappings; ``encoding`` is a list of numbers (or a
            list of such lists).
        tolerance: Maximum face distance that still counts as a match.

    Returns:
        The id of the first matching user, or an ``{'error': ...}`` dict when
        the image is flagged as fake or no user matches. Only the first face
        found in the image is compared.
    """
    if check_if_spoof(image_string_base64):
        return {'error':'image is fake'}
    records = json.loads(data)
    # Accept a single mapping as well as a list of mappings.
    if isinstance(records, dict):
        records = [records]
    found = fr.face_encodings(resize_image(image_string_base64))
    if not found:
        # No face in the picture, so nobody can match.
        return {'error':"user not existed"}
    probe = found[0]
    # type of received data is : { id ->number : encoding->list}
    for user in records:
        for id_, encoding in user.items():
            # compare_faces takes a list of known encodings and ONE encoding
            # to check; atleast_2d turns a single stored encoding into that.
            known = np.atleast_2d(np.asarray(encoding, dtype=float))
            if known.size == 0:
                continue
            if any(fr.compare_faces(known, probe, tolerance=tolerance)):
                return id_
    return {'error':"user not existed"}
I have a PyTorch model that I exported to ONNX and converted to a TensorRT engine with the following command:
trtexec --onnx=model.onnx --batch=400 --saveEngine=model.trt
All of this works, but how do I now load this model.trt in python and run the inference?
The official documentation has a lot of examples. The basic steps to follow are:
ONNX parser: takes a trained model in ONNX format as input and populates a network object in TensorRT
Builder: takes a network in TensorRT and generates an engine that is optimized for the target platform
Engine: takes input data, performs inferences and emits inference output
Logger: object associated with the builder and engine to capture errors, warnings and other information during the build and inference phases
An example for the engine is:
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from onnx import ModelProto
import onnx
import numpy as np
import matplotlib.pyplot as plt
from time import time
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)
# Batch dimension of the engine input; the build script below reads it.
batch_size = 1
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
# Default input shape for build_engine. It must be defined because the
# default argument is evaluated when the function is created. Adjust it to
# your model.
inp_shape = [batch_size, 3, 1024, 1024]
def build_engine(onnx_path, shape = inp_shape):
    """Parse an ONNX file and build a TensorRT engine from it.

    Args:
        onnx_path: Path of the ONNX model file.
        shape: Shape assigned to the first network input.

    Returns:
        The built engine, or None when the ONNX file could not be parsed.
    """
    with trt.Builder(TRT_LOGGER) as builder, builder.create_builder_config() as config, \
            builder.create_network(explicit_batch) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        if builder.platform_has_fast_fp16:
            builder.fp16_mode = True
        builder.max_workspace_size = (1 << 30)
        print("parsing")
        with open(onnx_path, 'rb') as model:
            print("onnx found")
            if not parser.parse(model.read()):
                print("parse failed")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                # Do not try to build an engine from a network that failed
                # to parse.
                return None
        last_layer = network.get_layer(network.num_layers - 1)
        # Check if last layer recognizes it's output
        # NOTE(review): when the check is true, get_output(0) is falsy and
        # that same value is what gets marked - confirm this is intended.
        if not last_layer.get_output(0):
            # If not, then mark the output using TensorRT API
            network.mark_output(last_layer.get_output(0))
        network.get_input(0).shape = shape
        engine = builder.build_cuda_engine(network)
        return engine
def save_engine(engine, file_name):
    """Serialize ``engine`` and write the bytes to ``file_name``."""
    serialized = engine.serialize()
    with open(file_name, 'wb') as out_file:
        out_file.write(serialized)
def load_engine(trt_runtime, plan_path):
    """Read a serialized engine file and deserialize it.

    Args:
        trt_runtime: Object providing ``deserialize_cuda_engine(bytes)``.
        plan_path: Path of the serialized engine file.

    Returns:
        Whatever ``trt_runtime.deserialize_cuda_engine`` returns.
    """
    # Open the path that was passed in; the previous body referenced an
    # undefined name (engine_path) and raised NameError.
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    return trt_runtime.deserialize_cuda_engine(engine_data)
if __name__ == "__main__":
    onnx_path = "./path/to/your/model.onnx"
    engine_name = "./path/to/engine.plan"

    # Read the ONNX file to take dimensions 1..3 of the first graph input.
    model = ModelProto()
    with open(onnx_path, "rb") as f:
        model.ParseFromString(f.read())
    input_dims = model.graph.input[0].type.tensor_type.shape.dim
    d0, d1, d2 = (input_dims[axis].dim_value for axis in (1, 2, 3))
    shape = [batch_size , d0, d1 ,d2]
    print(shape)

    print("trying to build engine")
    engine = build_engine(onnx_path,shape)
    save_engine(engine,engine_name)
    print("finished")
Follow this page for another example and information.
Found an answer based on this tutorial.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
# pycuda's driver API must be initialised before any other driver call when
# pycuda.autoinit is not imported.
cuda.init()
dev = cuda.Device(0)
ctx = dev.make_context()
try:
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    with open("model.trt", 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    with engine.create_execution_context() as context:
        # get sizes of input and output and allocate memory required for input data and for output data
        for binding in engine:
            if engine.binding_is_input(binding):  # we expect only one input
                input_shape = engine.get_binding_shape(binding)
                input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
                device_input = cuda.mem_alloc(input_size)
            else:  # and one output
                output_shape = engine.get_binding_shape(binding)
                # create page-locked memory buffers (i.e. won't be swapped to disk)
                host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
                device_output = cuda.mem_alloc(host_output.nbytes)
        stream = cuda.Stream()
        # `batch` is the input data; it has to be defined before this script runs.
        host_input = np.array(batch, dtype=np.float32, order='C')
        cuda.memcpy_htod_async(device_input, host_input, stream)
        context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_output, device_output, stream)
        # Wait until the copies and the inference queued on the stream finish.
        stream.synchronize()
        # postprocess results
        output_data = host_output.reshape(engine.max_batch_size, output_shape[0]).T
finally:
    # Always remove the context pushed by make_context above.
    ctx.pop()
I've deployed a custom Pytorch model to the Google AI platform for prediction, but when I try to make a prediction request with image data using gcloud tools I get the following error in response:
{
"error": "Prediction failed: unknown error."
}
I've tried to encode my image data in b64 format or to place it into a multidimensional python array, by doing the following:
# Load the picture, resize it to 224x224 and force three RGB channels.
pil_im = Image.open('Pic512.png').resize((224,224)).convert('RGB')
im_arr = np.asarray(pil_im)
# Nested Python lists are needed because numpy arrays are not JSON-serialisable.
py_arr = im_arr.tolist()
json_instance_1 = {'instances': py_arr}
with open('json_instance_1.json', 'w') as out_file:
    json.dump(json_instance_1, out_file)
I converted it into b64 like so, after adjusting my Predictor code accordingly:
# Read the raw file bytes and store them as base64 text under the 'b64' key.
with open('Pic512.png', 'rb') as image_file:
    byte_im = image_file.read()
b64_text = base64.b64encode(byte_im).decode()
# NOTE(review): the prediction request format documents 'instances' as a
# list - confirm whether this dict should be wrapped in [...].
json_instance = {'instances': {'b64': b64_text}}
with open('json_instance.json', 'w') as out_file:
    json.dump(json_instance, out_file)
I've tried converting with different file formats and similar methods, but all of them give me the same error.
My predictor module:
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
import torch
from torchvision import transforms
from torch.nn import functional as F
from PIL import Image
# from sklearn.externals import joblib
import numpy as np
import os
import io
import base64
class MyPredictor(object):
    """An example Predictor for an AI Platform custom prediction routine."""

    def __init__(self, model, preprocessor, device):
        """Stores artifacts for prediction. Only initialized via `from_path`.

        Args:
            model: Classifier applied to the cropped face.
            preprocessor: Face detector providing ``detect(image)``.
            device: Torch device the face tensor is moved to.
        """
        self._resnet = model
        self._mtcnn_mult = preprocessor
        self._device = device
        self.get_std_tensor = transforms.Compose([
            np.float32,
            np.uint8,
            transforms.ToTensor(),
        ])
        self.tensor2pil = transforms.ToPILImage(mode='RGB')
        self.trans_resnet = transforms.Compose([
            transforms.Resize((100, 100)),
            np.float32,
            transforms.ToTensor()
        ])

    def predict(self, instances, **kwargs):
        """Classify the face found in one image.

        Args:
            instances: Nested list (or array) of pixel values for one image.
            **kwargs: Unused.

        Returns:
            ``{'probs': [...]}`` with the softmax probabilities.

        Raises:
            ValueError: If the detector finds no face in the image.
        """
        pil_transform = transforms.Resize((512, 512))
        imarr = np.uint8(np.array(instances))
        pil_im = Image.fromarray(imarr)
        image = pil_im.convert('RGB')
        pil_im_512 = pil_transform(image)
        boxes, _ = self._mtcnn_mult.detect(pil_im_512)
        # Without this check a missing face fails on `boxes[0]` with an
        # opaque error; raise something the caller can understand.
        if boxes is None or len(boxes) == 0:
            raise ValueError('No face detected in the input image.')
        box = boxes[0]
        face_tensor = extract_face(pil_im_512, box, margin=40)
        std_tensor = self.get_std_tensor(face_tensor.permute(1, 2, 0))
        cropped_pil_im = self.tensor2pil(std_tensor)
        face_tensor = self.trans_resnet(cropped_pil_im)
        face_tensor4d = face_tensor.unsqueeze(0)
        face_tensor4d = face_tensor4d.to(self._device)
        self._resnet.eval()
        # No gradients are needed for inference.
        with torch.no_grad():
            prediction = self._resnet(face_tensor4d)
        # .cpu() makes the conversion work when the model runs on a GPU.
        preds = F.softmax(prediction, dim=1).detach().cpu().numpy().reshape(-1)
        print('probability of (class1, class2) = ({:.4f}, {:.4f})'.format(preds[0], preds[1]))
        return {'probs':preds.tolist()}

    # Must be a real classmethod: it is called on the class, not an instance.
    @classmethod
    def from_path(cls, model_dir):
        """Create a predictor from the artifacts stored in ``model_dir``.

        Loads ``device_cpu.pt``, the classifier checkpoint and
        ``mtcnn_mult.pt`` with ``torch.load``.
        """
        device_path = os.path.join(model_dir, 'device_cpu.pt')
        device = torch.load(device_path)
        model_path = os.path.join(model_dir, 'FullResNetRefinedExtra_no_norm_100x100_8634.pt')
        classifier = torch.load(model_path, map_location=device)
        mtcnn_path = os.path.join(model_dir, 'mtcnn_mult.pt')
        mtcnn_mult = torch.load(mtcnn_path)
        return cls(classifier, mtcnn_mult, device)
When I test the class locally everything works, so I assume it's a problem related to the serialisation and deserialisation on the side of Google Platform. How can I resolve this issue?