import keras
from keras.models import Model
from keras import backend as K
from keras.layers import Dense, Activation
import tensorflow as tf
from tensorflow.contrib import tensorrt as tftrt

import copy
import numpy as np
import sys
import time


class FrozenGraph(object):
    """Frozen TensorFlow graph exported from a Keras model.

    Attributes:
        x_name: single-element list with the name of the input placeholder.
        y_name: single-element list with the name of the output op.
        frozen: GraphDef whose variables were converted to constants and
            whose training-only nodes were removed.
    """

    def __init__(self, model, shape):
        """Freeze `model` behind a fresh float32 input placeholder.

        Args:
            model: Keras model; it is called on the new placeholder to
                build the inference sub-graph.
            shape: indexable holding the three per-sample input dimensions
                (the script below passes (224, 224, 3)). A leading `None`
                batch dimension is prepended.
        """
        shape = (None, shape[0], shape[1], shape[2])
        x_name = 'image_tensor_x'
        # NOTE: using the session as a context manager closes the shared
        # Keras session when the block exits, so the Keras model cannot be
        # run through that session afterwards. Kept as in the original.
        with K.get_session() as sess:
            x_tensor = tf.placeholder(tf.float32, shape, x_name)
            # Learning phase 0 = inference; set before the model is applied
            # to the placeholder so the new nodes are built in test mode.
            K.set_learning_phase(0)
            y_tensor = model(x_tensor)
            # Tensor names look like "<op_name>:<index>"; drop the last two
            # characters (":0") to obtain the op name.
            y_name = y_tensor.name[:-2]
            graph = sess.graph.as_graph_def()
            graph0 = tf.graph_util.convert_variables_to_constants(
                sess, graph, [y_name])
            graph1 = tf.graph_util.remove_training_nodes(graph0)

        self.x_name = [x_name]
        self.y_name = [y_name]
        self.frozen = graph1


class TfEngine(object):
    """Runs a `FrozenGraph` in its own TensorFlow graph and session."""

    def __init__(self, graph):
        """Import the frozen graph and open a session on it.

        Args:
            graph: object exposing `frozen` (a GraphDef) plus `x_name` and
                `y_name` (single-element lists of op names), e.g. a
                `FrozenGraph`.
        """
        g = tf.Graph()
        with g.as_default():
            x_op, y_op = tf.import_graph_def(
                graph_def=graph.frozen,
                return_elements=graph.x_name + graph.y_name)
            self.x_tensor = x_op.outputs[0]
            self.y_tensor = y_op.outputs[0]

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=0.5,
            allow_growth=True)
        config = tf.ConfigProto(gpu_options=gpu_options)

        self.sess = tf.Session(graph=g, config=config)

    def infer(self, x):
        """Run the whole input `x` through the graph in a single call.

        Args:
            x: array fed to the input tensor.

        Returns:
            The value of the output tensor as returned by `Session.run`.
        """
        y = self.sess.run(self.y_tensor, feed_dict={self.x_tensor: x})
        return y


class TftrtEngine(TfEngine):
    """`TfEngine` whose graph was first optimized by TF-TRT.

    Attributes:
        tftrt_graph: the GraphDef returned by `create_inference_graph`.
        batch_size: number of samples fed per `Session.run` call in `infer`.
    """

    def __init__(self, graph, batch_size, precision):
        """Convert `graph` with TF-TRT and load the result.

        Args:
            graph: `FrozenGraph`-like object (see `TfEngine.__init__`).
            batch_size: passed to TF-TRT as `max_batch_size` and used as the
                chunk size in `infer`.
            precision: TF-TRT `precision_mode` (the script below uses
                'FP32' and 'FP16').
        """
        tftrt_graph = tftrt.create_inference_graph(
            graph.frozen,
            outputs=graph.y_name,
            max_batch_size=batch_size,
            max_workspace_size_bytes=1 << 30,  # 1 GiB
            precision_mode=precision,
            minimum_segment_size=2)

        self.tftrt_graph = tftrt_graph

        # A shallow copy is sufficient: only `frozen` is rebound on the copy
        # and the parent constructor just reads the attributes. A deep copy
        # would duplicate the whole frozen GraphDef (weights included) only
        # to discard it immediately.
        opt_graph = copy.copy(graph)
        opt_graph.frozen = tftrt_graph
        super(TftrtEngine, self).__init__(opt_graph)
        self.batch_size = batch_size

    def infer(self, x):
        """Run `x` through the engine in chunks of `self.batch_size`.

        Args:
            x: array whose first axis indexes samples.

        Returns:
            float32 array with one row per sample; the trailing dimensions
            follow the model output. For an empty `x` the result has shape
            (0, 1), as before.
        """
        num_tests = x.shape[0]
        batch_size = self.batch_size
        y = None
        for i in range(0, num_tests, batch_size):
            x_part = x[i: i + batch_size]
            y_part = self.sess.run(self.y_tensor,
                                   feed_dict={self.x_tensor: x_part})
            if y is None:
                # Size the buffer from the first result instead of assuming
                # a single output column.
                y = np.empty((num_tests,) + tuple(y_part.shape[1:]),
                             np.float32)
            y[i: i + batch_size] = y_part
        if y is None:
            # No batches were run (empty input): keep the historical shape.
            y = np.empty((num_tests, 1), np.float32)
        return y


# NOTE(review): `pooling` is documented by Keras as applying only when
# include_top=False, so it is presumably a no-op here - confirm.
mobilenet = keras.applications.mobilenet.MobileNet(
    input_shape=(224, 224, 3), weights='imagenet', pooling='max')
new_output = mobilenet.get_layer('global_average_pooling2d_1').output
# NOTE: softmax over a single unit always evaluates to 1.0, so every
# prediction printed below is 1.0. Left unchanged because this script only
# compares inference time; use 'sigmoid' for a meaningful 1-unit output.
new_output = Dense(1, activation='softmax')(new_output)
model = Model(inputs=mobilenet.input, outputs=new_output)
model.summary()

frozen_graph = FrozenGraph(model, (224, 224, 3))
x_test = np.random.random((10, 224, 224, 3))

# Each timestamp is taken before printing the predictions so that console
# I/O is not counted as inference time.
tf_engine = TfEngine(frozen_graph)
t0 = time.time()
y_tf = tf_engine.infer(x_test)
t1 = time.time()
print(y_tf)
print('Tensorflow time', t1 - t0)

batch_size = 1
tftrt_engine = TftrtEngine(frozen_graph, batch_size, 'FP32')
t0 = time.time()
y_tftrt = tftrt_engine.infer(x_test)
t1 = time.time()
print(y_tftrt)
print('TFTRT time', t1 - t0)

tftrt_engine = TftrtEngine(frozen_graph, batch_size, 'FP16')
t0 = time.time()
y_tftrt = tftrt_engine.infer(x_test)
t1 = time.time()
print(y_tftrt)
print('TFTRT_FP16 time', t1 - t0)