TensorRT

From the NVIDIA TensorRT documentation:

NVIDIA® TensorRT™ is an SDK for optimizing trained deep learning models to enable high-performance inference.

A typical use of TensorRT is to convert a trained model in order to improve its inference performance. For instance, the more recent Ampere GPUs have much better low-precision floating-point performance, which can be exploited at the inference stage.
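Whether reduced precision pays off depends on the GPU generation. As a quick check (a minimal sketch, not part of the example below; it assumes TensorFlow 2.4+, where tf.config.experimental.get_device_details is available), one can query the device's compute capability before choosing FP16:

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # returns a dict that may contain 'compute_capability' as (major, minor)
    details = tf.config.experimental.get_device_details(gpus[0])
    major, minor = details.get('compute_capability', (0, 0))
    print(f'GPU compute capability: {major}.{minor}')
    # Volta (7.0) and newer have fast FP16 Tensor Cores; Ampere (8.0) also adds TF32
    use_fp16 = major >= 7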

Example with TensorFlow/Keras

Here is an example of converting a Keras model, taken from the TensorRT documentation:

import os
import numpy as np
import tensorflow as tf
from timeit import repeat
from tensorflow import keras
from tensorflow.python.compiler.tensorrt import trt_convert as trt
tmpdir = os.environ['TMPDIR']
# ^-- preparation


model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(10)
])

model.compile(optimizer='adam',
              loss=keras.losses.SparseCategoricalCrossentropy(
                  from_logits=True),
              metrics=['accuracy'])

mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
x_train = tf.cast(x_train, dtype=tf.float32)
y_train = tf.cast(y_train, dtype=tf.float32)
x_test = tf.cast(x_test, dtype=tf.float32)
y_test = tf.cast(y_test, dtype=tf.float32)
model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test,  y_test, verbose=2)
# ^-- regular Keras model training


model.save(f'{tmpdir}/keras_model')
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=f'{tmpdir}/keras_model',
    precision_mode=trt.TrtPrecisionMode.FP16,
    # or
    # precision_mode=trt.TrtPrecisionMode.FP32
)
trt_func = converter.convert()
# ^-- trt conversion of the model


random_input = tf.constant(np.random.rand(100, 28, 28), dtype=tf.float32)
for i in range(10):
    # warm up both functions for inference
    model(random_input[(a := np.random.randint(90)):a+10])
    trt_func(random_input[(a := np.random.randint(90)):a+10])

rep_keras = repeat(
    lambda: model(random_input[(a := np.random.randint(90)):a+10]),
    number=1000, repeat=7)
rep_trt = repeat(
    lambda: trt_func(random_input[(a := np.random.randint(90)):a+10]),
    number=1000, repeat=7)
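# with number=1000, each value from timeit.repeat (total seconds for 1000
# calls) is numerically equal to the mean time per call in milliseconds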
print(f'Keras:    {np.mean(rep_keras):.6f}({np.std(rep_keras):.6f})ms')
print(f'TensorRT: {np.mean(rep_trt):.6f}({np.std(rep_trt):.6f})ms')
# ^-- timing the models


# In jupyter one can instead do:
# %timeit model(random_input[(a:=np.random.randint(90)):a+10])
# %timeit trt_func(random_input[(a:=np.random.randint(90)):a+10])
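
The converted function can also be written out as a SavedModel and reloaded later, which avoids repeating the conversion. A minimal sketch (assuming the same tmpdir as above; the exact input keyword of the serving signature depends on the Keras layer names and can be inspected via structured_input_signature):

converter.save(f'{tmpdir}/trt_model')
# ^-- persist the converted model as a SavedModel

loaded = tf.saved_model.load(f'{tmpdir}/trt_model')
infer = loaded.signatures['serving_default']
print(infer.structured_input_signature)
# ^-- restore it later and inspect the expected input name/shape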

On A100 GPUs we can see a significant improvement in inference speed:

Keras:    2.287080(0.009547)ms
TensorRT: 1.493052(0.008913)ms