TensorRT
From the NVIDIA TensorRT documentation:
NVIDIA® TensorRT™ is an SDK for optimizing trained deep learning models to
enable high-performance inference.
A typical use of TensorRT is to convert an already trained model in order to
improve its inference performance. For instance, the more recent Ampere GPUs
have much better low-precision floating-point performance, which can be
exploited at the inference stage.
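Whether reduced precision pays off depends on the GPU a job actually runs on.
As a quick check, one can query the compute capability of the visible GPUs from
TensorFlow; this is a small sketch (the fields returned by get_device_details
can vary between devices and TensorFlow versions), and Ampere cards such as the
A100 report a major version of 8:
| import tensorflow as tf

for gpu in tf.config.list_physical_devices('GPU'):
    # get_device_details may omit fields depending on the device/TF version
    details = tf.config.experimental.get_device_details(gpu)
    name = details.get('device_name')
    cc = details.get('compute_capability')  # e.g. (8, 0) on an A100
    print(f'{gpu.name}: {name}, compute capability {cc}')
|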
Example with TensorFlow/Keras
Here is an example, adapted from the TensorRT documentation, of converting a
Keras model:
| import os
import numpy as np
import tensorflow as tf
from timeit import repeat
from tensorflow import keras
from tensorflow.python.compiler.tensorrt import trt_convert as trt
tmpdir = os.environ['TMPDIR']
# ^-- preparation
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(10)
])
model.compile(optimizer='adam',
              loss=keras.losses.SparseCategoricalCrossentropy(
                  from_logits=True),
              metrics=['accuracy'])
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
x_train = tf.cast(x_train, dtype=tf.float32)
y_train = tf.cast(y_train, dtype=tf.float32)
x_test = tf.cast(x_test, dtype=tf.float32)
y_test = tf.cast(y_test, dtype=tf.float32)
model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test, verbose=2)
# ^-- regular keras model training
model.save(f'{tmpdir}/keras_model')
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=f'{tmpdir}/keras_model',
    precision_mode=trt.TrtPrecisionMode.FP16,
    # or
    # precision_mode=trt.TrtPrecisionMode.FP32
)
trt_func = converter.convert()
# ^-- trt conversion of the model
random_input = tf.constant(np.random.rand(100, 28, 28), dtype=tf.float32)
for i in range(10):
    # warm up both functions for inference
    model(random_input[(a := np.random.randint(90)):a+10])
    trt_func(random_input[(a := np.random.randint(90)):a+10])
rep_keras = repeat(
    lambda: model(random_input[(a := np.random.randint(90)):a+10]),
    number=1000, repeat=7)
rep_trt = repeat(
    lambda: trt_func(random_input[(a := np.random.randint(90)):a+10]),
    number=1000, repeat=7)
# each measurement is seconds per 1000 calls, i.e. milliseconds per call
print(f'Keras: {np.mean(rep_keras):.6f}({np.std(rep_keras):.6f})ms')
print(f'TensorRT: {np.mean(rep_trt):.6f}({np.std(rep_trt):.6f})ms')
# ^-- timing the models
# In Jupyter one can instead do:
# %timeit model(random_input[(a:=np.random.randint(90)):a+10])
# %timeit trt_func(random_input[(a:=np.random.randint(90)):a+10])
|
On an A100 GPU we can see a significant improvement in inference speed:
Keras: 2.287080(0.009547)ms
TensorRT: 1.493052(0.008913)ms
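The converted function can also be written back to disk so the conversion does
not have to be redone on every startup. Continuing from the example above (same
imports, converter and tmpdir), here is a sketch of the save-and-reload step;
the optional build() call pre-builds the TensorRT engines for a given input
shape instead of relying on warm-up calls later:
| def input_fn():
    # yield one batch of the shape the engines should be built for
    yield (tf.constant(np.random.rand(10, 28, 28), dtype=tf.float32),)

converter.build(input_fn=input_fn)     # optional: pre-build the TRT engines
converter.save(f'{tmpdir}/trt_model')  # write the converted SavedModel

loaded = tf.saved_model.load(f'{tmpdir}/trt_model')
trt_func = loaded.signatures['serving_default']
# inputs are passed as keyword arguments named after the SavedModel signature
|
Loading the converted SavedModel again requires a TensorFlow build with
TensorRT support, but not the original Keras training code.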