Tensorflow serving returns [[[0]]] - python

I'm trying to use Tensorflow Serving to make predictions for my model.
This is the client that I'm using:
#!/usr/bin/env python2.7
"""A client that talks to tensorflow_model_server loaded with deepspeech model.
The client queries the service with the given audio and prints a ranked list
of decoded outputs to the standard output, one per line.
Typical usage example:
deepspeech_client.py --server=localhost:9000 --file audio.wav
import os
import sys
local_tf = os.path.join(os.path.dirname(os.path.dirname(os.path.join(os.path.abspath(__file__)))), 'local_tf')
import threading
from grpc.beta import implementations
import numpy as np
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
from util.text import ndarray_to_text
from util.audio import audiofile_to_input_vector
tf.app.flags.DEFINE_string('server', '', 'PredictionService host:port')
tf.app.flags.DEFINE_string('file', '', 'Wave audio file')
# These need to match the constants used when training the deepspeech model
tf.app.flags.DEFINE_string('n_input', 26, 'Number of MFCC features')
tf.app.flags.DEFINE_string('n_context', 9, 'Number of frames of context')
FLAGS = tf.app.flags.FLAGS
def _create_rpc_callback(event):
def _callback(result_future):
exception = result_future.exception()
if exception:
print exception
results = tf.contrib.util.make_ndarray(result_future.result().outputs['outputs'])
for result in results[0]:
print ndarray_to_text(result)
return _callback
def do_inference(hostport, audio):
host, port = hostport.split(':')
channel = implementations.insecure_channel(host, int(port))
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
request = predict_pb2.PredictRequest()
request.model_spec.name = 'deepspeech'
event = threading.Event()
result_future = stub.Predict.future(request, 5.0) # 5 seconds
if event.is_set() != True:
def main(_):
if not FLAGS.server:
print 'please specify server host:port'
if not FLAGS.file:
print 'pleace specify an audio file'
audio_waves = audiofile_to_input_vector(
FLAGS.file, FLAGS.n_input, FLAGS.n_context)
audio = np.array([ audio_waves ])
do_inference(FLAGS.server, audio)
if __name__ == '__main__':
It's taken directly from an older commit of the Mozilla DeepSpeech implementation.
I have a trained model (from the same commit) that I'm trying to use to make predictions. I'm using the LDC193S1.wav file to try and make predictions.
When I run the model, I get back a response that is equal to
dtype: DT_INT64 tensor_shape { dim { size: 1 } dim { size: 1 } dim { size: 1 } } int64_val: 0 [[[0]]]
What can I do to fix this?


Computer crashing when using python tools in same script

I am attempting to use the speech recognition toolkit VOSK and the speech diarization package Resemblyzer to transcribe audio and then identify the speakers in the audio.
I can do both things individually, but I run into issues when trying to do them in a single Python script.
I used the following guide when setting up the diarization system:
Computer specs are as follows:
Intel(R) Core(TM) i3-7100 CPU @ 3.90GHz, 3912 MHz, 2 Core(s), 4 Logical Processor(s)
The following is my code. I am not too sure whether using threading is appropriate, or whether I even implemented it correctly. How can I best optimize this code to achieve the results I am looking for without crashing?
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import json
import sys
import os
import subprocess
import datetime
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
from resemblyzer.hparams import sampling_rate
from spectralcluster import SpectralClusterer
import threading
import queue
import gc
def recognition(queue, audio, FRAME_RATE):
model = Model("Vosk_Models/vosk-model-small-en-us-0.15")
rec = KaldiRecognizer(model, FRAME_RATE)
result = rec.Result()
transcript = json.loads(result)#["text"]
#return transcript
def diarization(queue, audio):
wav = preprocess_wav(audio)
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
clusterer = SpectralClusterer(
labels = clusterer.predict(cont_embeds)
def create_labelling(labels, wav_splits, sample_rate=None):
    """Collapse per-window speaker labels into contiguous speaker segments.

    Args:
        labels: Sequence of speaker labels, one per entry of ``wav_splits``.
        wav_splits: Sequence of slice-like objects whose ``start``/``stop``
            are sample offsets of each window.
        sample_rate: Samples per second used to convert sample offsets to
            seconds. Defaults to the module-level ``sampling_rate``.

    Returns:
        A list of ``(speaker_label, start_seconds, end_seconds)`` tuples in
        chronological order; an empty list when ``wav_splits`` is empty.
    """
    rate = sampling_rate if sample_rate is None else sample_rate
    # Each window is represented by the time (in seconds) of its mid-point.
    times = [((s.start + s.stop) / 2) / rate for s in wav_splits]
    labelling = []
    start_time = 0
    for i, mid_point in enumerate(times):
        # The speaker changed: close the previous speaker's segment here.
        if i > 0 and labels[i] != labels[i - 1]:
            labelling.append((str(labels[i - 1]), start_time, mid_point))
            start_time = mid_point
        # Last window: close whatever segment is still open.
        if i == len(times) - 1:
            labelling.append((str(labels[i]), start_time, mid_point))
    return labelling
labelling = create_labelling(labels, wav_splits)
def identify_speaker(queue1, queue2):
    """Print the words attributed to each speaker segment.

    Args:
        queue1: Queue-like object whose ``get()`` returns the transcript
            dict. Its ``'result'`` entry, when present, is a list of dicts
            carrying at least ``'word'`` and ``'start'``.
        queue2: Queue-like object whose ``get()`` returns the labelling: an
            iterable of ``(speaker_id, start, end)`` sequences.
    """
    transcript = queue1.get()
    labelling = queue2.get()
    # The transcript may carry no 'result' entry (e.g. when no words were
    # recognised); treat that as an empty word list instead of raising.
    word_entries = transcript.get('result', [])
    for speaker in labelling:
        speaker_id, seg_start, seg_end = speaker[0], speaker[1], speaker[2]
        # Half-open interval: a word starting exactly on a boundary belongs
        # to the segment that begins there, so it is neither dropped nor
        # printed twice.
        words = [w['word'] for w in word_entries
                 if seg_start <= w['start'] < seg_end]
        print("Speaker", speaker_id, ":", ' '.join(words), "\n")
def main():
queue1 = queue.Queue()
queue2 = queue.Queue()
FRAME_RATE = 16000
podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
podcast = podcast.set_channels(CHANNELS)
podcast = podcast.set_frame_rate(FRAME_RATE)
first_thread = threading.Thread(target=recognition, args=(queue1, podcast, FRAME_RATE))
second_thread = threading.Thread(target=diarization, args=(queue2, podcast))
third_thread = threading.Thread(target=identify_speaker, args=(queue1, queue2))
# transcript = recognition(podcast,FRAME_RATE)
# labelling = diarization(podcast)
# print(identify_speaker(transcript, labelling))
if __name__ == '__main__':
When I say crash I mean everything freezes, I have to hold down the power button on the desktop and turn it back on again. No blue/blank screen, just frozen in my IDE looking at my code. Any help in resolving this issue would be greatly appreciated.
Pydub's AudioSegment was not returning a suitable type for the Resemblyzer function preprocess_wav.
podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
preprocess_wav instead requires a Numpy Array / Path.
audio_file_path = 'Podcast_Audio/WAV-Film-Release-Clip.wav'
wav_fpath = Path(audio_file_path)
wav = preprocess_wav(wav_fpath)
Additionally preprocess_wav functionality can be achieved using Librosa if desired.
import librosa
def preprocess_wav(waveform, sr, target_sr=16000):
    """Resample a waveform and peak-normalise it to float32.

    Args:
        waveform: Array of audio samples.
        sr: Sampling rate of ``waveform`` in Hz.
        target_sr: Sampling rate to resample to, in Hz (default 16000).

    Returns:
        A ``float32`` array scaled so its largest absolute sample is 1.
        Silent (all-zero) or empty input is returned unscaled rather than
        being divided by zero.
    """
    # Only invoke the resampler when the rate actually changes.
    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
    waveform = np.asarray(waveform, dtype=np.float32)
    if waveform.size == 0:
        return waveform
    # A plain Python float keeps the division result in float32.
    peak = float(np.max(np.abs(waveform)))
    if peak > 0:
        waveform = waveform / peak
    return waveform
waveform, sr = librosa.load('Podcast_Audio/WAV-Film-Release-Clip.wav')
wav = preprocess_wav(waveform, sr)

How to use "model.trt" in Python

I have a PyTorch model that I exported to ONNX and converted to a TensorRT engine with the following command:
trtexec --onnx=model.onnx --batch=400 --saveEngine=model.trt
All of this works, but how do I now load this model.trt in python and run the inference?
The official documentation has a lot of examples. The basic steps to follow are:
ONNX parser: takes a trained model in ONNX format as input and populates a network object in TensorRT
Builder: takes a network in TensorRT and generates an engine that is optimized for the target platform
Engine: takes input data, performs inferences and emits inference output
Logger: object associated with the builder and engine to capture errors, warnings and other information during the build and inference phases
An example for the engine is:
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from onnx import ModelProto
import onnx
import numpy as np
import matplotlib.pyplot as plt
from time import time
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)
#batch_size = 1
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
#inp_shape = [batch_size, 3, 1024, 1024] # the shape I was using
def build_engine(onnx_path, shape = inp_shape):
with trt.Builder(TRT_LOGGER) as builder,builder.create_builder_config() as config,\
builder.create_network(explicit_batch) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
if builder.platform_has_fast_fp16:
builder.fp16_mode = True
builder.max_workspace_size = (1 << 30)
#builder.max_workspace_size = (3072 << 20)
#profile = builder.create_optimization_profile()
#config.max_workspace_size = (3072 << 20)
with open(onnx_path, 'rb') as model:
print("onnx found")
if not parser.parse(model.read()):
print("parse failed")
for error in range(parser.num_errors):
last_layer = network.get_layer(network.num_layers - 1)
# Check if last layer recognizes it's output
if not last_layer.get_output(0):
# If not, then mark the output using TensorRT API
network.get_input(0).shape = shape
engine = builder.build_cuda_engine(network)
return engine
def save_engine(engine, file_name):
    """Serialize ``engine`` and write the resulting bytes to ``file_name``.

    Args:
        engine: Object exposing ``serialize()``, which returns a bytes-like
            buffer.
        file_name: Path of the file to create or overwrite.
    """
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        # Without this write the file would be created but left empty.
        f.write(buf)
def load_engine(trt_runtime, plan_path):
    """Read a serialized engine file and deserialize it.

    Args:
        trt_runtime: Object exposing ``deserialize_cuda_engine(bytes)``.
        plan_path: Path of the serialized engine file to read.

    Returns:
        Whatever ``trt_runtime.deserialize_cuda_engine`` returns for the
        file's contents.
    """
    # Open the path that was passed in; the previous code referenced an
    # undefined name here.
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    return trt_runtime.deserialize_cuda_engine(engine_data)
if __name__ == "__main__":
onnx_path = "./path/to/your/model.onnx"
engine_name = "./path/to/engine.plan"
model = ModelProto()
with open(onnx_path, "rb") as f:
d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
shape = [batch_size , d0, d1 ,d2]
print("trying to build engine")
engine = build_engine(onnx_path,shape)
Follow this page for another example and information.
Found an answer based on this tutorial.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
dev = cuda.Device(0)
ctx = dev.make_context()
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
with open("model.trt", 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())
with engine.create_execution_context() as context:
# get sizes of input and output and allocate memory required for input data and for output data
for binding in engine:
if engine.binding_is_input(binding): # we expect only one input
input_shape = engine.get_binding_shape(binding)
input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize # in bytes
device_input = cuda.mem_alloc(input_size)
else: # and one output
output_shape = engine.get_binding_shape(binding)
# create page-locked memory buffers (i.e. won't be swapped to disk)
host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
device_output = cuda.mem_alloc(host_output.nbytes)
stream = cuda.Stream()
host_input = np.array(batch, dtype=np.float32, order='C')
cuda.memcpy_htod_async(device_input, host_input, stream)
context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
cuda.memcpy_dtoh_async(host_output, device_output, stream)
# postprocess results
output_data = host_output.reshape(engine.max_batch_size, output_shape[0]).T

How to deploy Keras-yolo model to the web with Flask?

I've successfully trained a model on my own dataset using the Keras yolov3 GitHub project (link)
and I've got good predictions:
I would like to deploy this model on the web using flask to make it work with a stream or with IP cameras.
I have seen many tutorials that explain how to do that but, in reality, I did not find what I am looking for.
How can I get started?
You can use flask-restful to design a simple rest API.
You can use opencv VideoCapture to grab the video stream and get frames.
import numpy as np
import cv2
# Open a sample video available in sample-videos
vcap = cv2.VideoCapture('URL')
The client will take an image/ frame, encode it using base64, add other details like height, width, and make a request.
import numpy as np
import base64
import zlib
import requests
import time
t1 = time.time()
for _ in range(1000): # 1000 continuous request
frame = np.random.randint(0,256, (416,416,3), dtype=np.uint8) # dummy rgb image
# replace frame with your image
# compress
data = frame # zlib.compress(frame)
data = base64.b64encode(data)
data_send = data
#data2 = base64.b64decode(data)
#data2 = zlib.decompress(data2)
#fdata = np.frombuffer(data2, dtype=np.uint8)
r = requests.post("", json={'imgb64' : data_send.decode(), 'w': 416, 'h': 416})
# make a post request
# print the response here
t2 = time.time()
Your server will load the darknet model, and when it receives a post request it will simply return the model output.
from flask import Flask, request
from flask_restful import Resource, Api, reqparse
import json
import numpy as np
import base64
# compression
import zlib
# load keras model
# load_model('model.h5')
app = Flask(__name__)
api = Api(app)
parser = reqparse.RequestParser()
parser.add_argument('imgb64', location='json', help = 'type error')
parser.add_argument('w', type = int, location='json', help = 'type error')
parser.add_argument('h', type = int, location='json', help = 'type error')
class Predict(Resource):
def post(self):
data = parser.parse_args()
if data['imgb64'] == "":
return {
'message':'No file found',
img = data['imgb64']
w = data['w']
h = data['h']
data2 = img.encode()
data2 = base64.b64decode(data2)
#data2 = zlib.decompress(data2)
fdata = np.frombuffer(data2, dtype=np.uint8).reshape(w, h, -1)
# do model inference here
if img:
return json.dumps({
'mean': np.mean(fdata),
'channel': fdata.shape[-1],
'message':'darknet processed',
return {
'message':'Something when wrong',
if __name__ == '__main__':
app.run(debug=True, host = '', port = 5000, threaded=True)
In the # do model inference here part, just use your detect/predict function.
If you want to use native darknet, https://github.com/zabir-nabil/tf-model-server4-yolov3
If you want to use gRPC instead of REST, https://github.com/zabir-nabil/simple-gRPC

How to deploy a simple neural network from A-Z in MXNet

I am trying to build and deploy a simple neural network in MXNet and deploy it on a server using mxnet-model-server.
The biggest issue is to deploy the model - model server crashes after uploading the .mar file but I have no idea what the problem could be.
I used the following code to create a custom (but very simple) neural network for testing:
from __future__ import print_function
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
data_ctx = mx.cpu()
model_ctx = mx.cpu()
# fix the seed
num_examples = 1000
X = mx.random.uniform(shape=(num_examples, 49))
y = mx.random.uniform(shape=(num_examples, 1))
dataset_train = mx.gluon.data.dataset.ArrayDataset(X, y)
dataset_test = dataset_train
data_loader_train = mx.gluon.data.DataLoader(dataset_train, batch_size=25)
data_loader_test = mx.gluon.data.DataLoader(dataset_test, batch_size=25)
num_outputs = 2
net = gluon.nn.HybridSequential()
with net.name_scope():
net.add(gluon.nn.Dense(49, activation="relu"))
net.add(gluon.nn.Dense(64, activation="relu"))
net.collect_params().initialize(mx.init.Normal(sigma=.1), ctx=model_ctx)
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .01})
epochs = 1
smoothing_constant = .01
for e in range(epochs):
cumulative_loss = 0
for i, (data, label) in enumerate(data_loader_train):
data = data.as_in_context(model_ctx).reshape((-1, 49))
label = label.as_in_context(model_ctx)
with autograd.record():
output = net(data)
loss = softmax_cross_entropy(output, label)
cumulative_loss += nd.sum(loss).asscalar()
Next, I exported the model using:
The result is a .json file and a .params file.
I created a signature.json
"inputs": [
"data_name": "data",
"data_shape": [
The model handler is the same from the mxnet tutorial:
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
# http://www.apache.org/licenses/LICENSE-2.0
# or in the "license" file accompanying this file. This file is distributed
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
ModelHandler defines a base model handler.
import logging
import time
class ModelHandler(object):
A base Model handler implementation.
def __init__(self):
self.error = None
self._context = None
self._batch_size = 0
self.initialized = False
def initialize(self, context):
Initialize model. This will be called during model loading time
:param context: Initial context contains model server system properties.
self._context = context
self._batch_size = context.system_properties["batch_size"]
self.initialized = True
def preprocess(self, batch):
Transform raw input into model input data.
:param batch: list of raw requests, should match batch size
:return: list of preprocessed model input data
assert self._batch_size == len(batch), "Invalid input batch size: {}".format(len(batch))
return None
def inference(self, model_input):
Internal inference methods
:param model_input: transformed model input data
:return: list of inference output in NDArray
return None
def postprocess(self, inference_output):
Return predict result in batch.
:param inference_output: list of inference output
:return: list of predict results
return ["OK"] * self._batch_size
def handle(self, data, context):
Custom service entry point function.
:param data: list of objects, raw input from request
:param context: model server context
:return: list of outputs to be send back to client
self.error = None # reset earlier errors
preprocess_start = time.time()
data = self.preprocess(data)
inference_start = time.time()
data = self.inference(data)
postprocess_start = time.time()
data = self.postprocess(data)
end_time = time.time()
metrics = context.metrics
metrics.add_time("PreprocessTime", round((inference_start - preprocess_start) * 1000, 2))
metrics.add_time("InferenceTime", round((postprocess_start - inference_start) * 1000, 2))
metrics.add_time("PostprocessTime", round((end_time - postprocess_start) * 1000, 2))
return data
except Exception as e:
logging.error(e, exc_info=True)
request_processor = context.request_processor
request_processor.report_status(500, "Unknown inference error")
return [str(e)] * self._batch_size
Next, I created the .mar file using:
model-archiver --model-name my_project --model-path my_project --handler ssd_service:handle
Starting the model on the server:
mxnet-model-server --start --model_store my_project --models ssd=my_project.mar
I literally followed every tutorial on:
However, the server is crashing: the workers die, the backend workers die, the workers are disconnected, and I get: Load model failed: ssd, error: worker died
I have absolutely no clue what to do so I would be very glad if you helped me out!
I tried out your code and it works fine on my laptop. If I run: curl -X POST -F "data=[0 1 2 3 4]", I get: OK%
I can only guess why it doesn't work on your machine:
Notice that model-store argument should be written with - not with _ as it is in your example. My command to run mxnet-model-server looks like this: mxnet-model-server --start --model-store ./ --models ssd=my_project.mar
Which version of mxnet-model-server do you use? The latest is 1.0.2, but I have 1.0.1 installed, so maybe you want to downgrade and try it out: pip install mxnet-model-server==1.0.1.
Same question for the MXNet version. In my case I use the nightly build, which I get via pip install mxnet --pre. I see that your model is very basic, so it shouldn't depend much on the version... Nevertheless, install 1.4.0 (the current one) just in case.
Not sure, but hope it will help you.

Inference time using of Tensorflow Object Detection

I have deployed my object detection model to Google Kubernetes Engine. My model is trained using the faster_rcnn_resnet101_pets configuration. The inference time of my model is very high (~10 seconds total time for prediction) even though I am using an Nvidia Tesla K80 GPU in my cluster node. I am using gRPC for getting predictions from the model. The script for making prediction requests is:
import argparse
import os
import time
import sys
import tensorflow as tf
from PIL import Image
import numpy as np
from grpc.beta import implementations
from object_detection.core.standard_fields import \
DetectionResultFields as dt_fields
from object_detection.utils import label_map_util
from argparse import RawTextHelpFormatter
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc
WIDTH = 1024
HEIGHT = 768
def load_image_into_numpy_array(input_image):
    """Load an image file as a ``(HEIGHT, WIDTH, 3)`` uint8 RGB array.

    Args:
        input_image: Path or file object accepted by ``Image.open``.

    Returns:
        A ``numpy.uint8`` array of shape ``(HEIGHT, WIDTH, 3)``.
    """
    # The context manager closes the underlying file once pixels are copied.
    with Image.open(input_image) as source:
        # Force three channels so greyscale, palette and RGBA inputs do not
        # break the expected (H, W, 3) layout. LANCZOS is the same filter as
        # the ANTIALIAS alias, which newer Pillow releases no longer provide.
        resized = source.convert('RGB').resize((WIDTH, HEIGHT), Image.LANCZOS)
    # Direct array conversion avoids building a Python list of pixel tuples
    # through getdata().
    return np.array(resized, dtype=np.uint8)
def load_input_tensor(input_image):
    """Load an image and wrap it as a batch-of-one uint8 tensor proto.

    Args:
        input_image: Image source forwarded to
            ``load_image_into_numpy_array``.

    Returns:
        The result of ``tf.contrib.util.make_tensor_proto`` for the image
        with a leading batch axis added.
    """
    pixels = load_image_into_numpy_array(input_image)
    # Prepend the batch dimension and keep the uint8 dtype.
    batch = np.expand_dims(pixels, 0).astype(np.uint8)
    return tf.contrib.util.make_tensor_proto(batch)
def main(args):
start_main = time.time()
host, port = args.server.split(':')
channel = implementations.insecure_channel(host, int(port))._channel
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
request = predict_pb2.PredictRequest()
request.model_spec.name = args.model_name
input_tensor = load_input_tensor(args.input_image)
start = time.time()
result = stub.Predict(request, 60.0)
end = time.time()
output_dict = {}
output_dict[dt_fields.detection_classes] = np.squeeze(
output_dict[dt_fields.detection_boxes] = np.reshape(
result.outputs[dt_fields.detection_boxes].float_val, (-1, 4))
output_dict[dt_fields.detection_scores] = np.squeeze(
category_index = label_map_util.create_category_index_from_labelmap(args.label_map,
classes = output_dict[dt_fields.detection_classes]
scores = output_dict[dt_fields.detection_scores]
classes.shape = (1, 300)
scores.shape = (1, 300)
print("prediction time : " + str(end-start))
objects = []
threshold = 0.5 # in order to get higher percentages you need to lower this number; usually at 0.01 you get 100% predicted objects
for index, value in enumerate(classes[0]):
object_dict = {}
if scores[0, index] > threshold:
object_dict[(category_index.get(value)).get('name').encode('utf8')] = \
scores[0, index]
end_main = time.time()
print("Overall Time : " + str(end_main-start_main))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Object detection grpc client.",
help='PredictionService host:port')
help='Name of the model')
help='Path to input image')
help='Path to output directory')
help='Path to label map file')
args = parser.parse_args()
I have used kubectl port forwarding for testing purposes so the request port is set to localhost:9000.
The output is :
prediction time : 6.690936326980591
[{b'goi_logo': 0.9999970197677612}]
Overall Time : 10.25893259048462
What can I do to make my inference faster? I have seen that the inference time is in the order of milliseconds so in comparison 10 seconds is a very long duration and unfit for production environments. I understand that port forwarding is slow. What is another method that I can use? I need to make this client available to the world as an API endpoint.
As previous answers stated, you should indeed try to do multiple requests because tf-serving needs some overhead the first time(s). You can prevent this by using a warm-up script.
To add some extra options:
from tf-serving v1.8 you can also use a http rest API service. Then you can call the service that you have created on your GKE from a google compute engine to reduce the connection lag. In my case it had a big speed-up because my local connection was mediocre at best. Next to http rest api being more workable to debug, you can also send much bigger requests. The grpc limit seems to be 1.5 mb while the http one is a lot higher.
Are you sending b64 encoded images? Sending the images themselves is a lot slower than sending b64 encoded strings. The way I handled this is sending b64 encoded strings from the images and create some extra layers in front of my network that transform the string to jpeg images again and then process them through the model. Some code to help you on your way:
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model
import numpy as np
import cv2
import tensorflow as tf
from keras.layers import Input, Lambda
from keras import backend as K
base_model = InceptionV3(
model = Model(
def prepare_image(image_str_tensor):
#image = tf.squeeze(tf.cast(image_str_tensor, tf.string), axis=[0])
image_str_tensor = tf.cast(image_str_tensor, tf.string)
image = tf.image.decode_jpeg(image_str_tensor,
#image = tf.divide(image, 255)
#image = tf.expand_dims(image, 0)
image = tf.image.convert_image_dtype(image, tf.float32)
return image
def prepare_image_batch(image_str_tensor):
    """Apply ``prepare_image`` to every element of a batch.

    Args:
        image_str_tensor: Tensor whose elements are each passed to
            ``prepare_image``.

    Returns:
        The ``tf.map_fn`` result, with element dtype ``tf.float32``.
    """
    decoded_batch = tf.map_fn(
        prepare_image,
        image_str_tensor,
        dtype=tf.float32,
    )
    return decoded_batch
input_img = Input(dtype= tf.string,
name ='string_input',
shape = ()
outputs = Lambda(prepare_image_batch)(input_img)
outputs = model(outputs)
inception_model = Model(input_img, outputs)
inception_model.compile(optimizer = "sgd", loss='categorical_crossentropy')
weights = inception_model.get_weights()
Besides that, I would say use a bigger GPU. I have basic YOLO (Keras implementation) now running on a P100 with about 0.4s latency when called from a compute engine. We noticed that the darknet implementation (in C++) is a lot faster than the Keras implementation, though.

