TensorRT 优化主要有三种方式:
# Install TensorRT (pip is the recommended way). The PyPI package is named
# `tensorrt`; `nvidia-tensorrt` is the legacy name.
pip install tensorrt
# Alternatively, download a TensorRT installer from the NVIDIA website:
# https://developer.nvidia.com/tensorrt
# Install ONNX and ONNX Runtime (used for the conversion step)
pip install onnx onnxruntime-gpu
# Install PyCUDA (imported by tensorrt_inference.py)
pip install pycuda
# Install PyTorch (if it is not installed yet)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
创建转换脚本 convert_to_onnx.py:
import torch
import torch.onnx
from lightglue import LightGlue, SuperPoint
import argparse
def convert_superpoint_to_onnx(model, output_path, input_shape=(1, 1, 480, 640)):
    """Export a SuperPoint model to an ONNX file.

    Args:
        model: Module to export. It is invoked with one positional dict
            argument that holds the image tensor under the key ``"image"``.
        output_path: Destination path of the ONNX file.
        input_shape: Shape of the random dummy image used for the export.
    """
    model.eval()

    # Create the dummy input on the device the model lives on, so the export
    # also works when the model was placed on the CPU. A module without
    # parameters falls back to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    dummy_input = torch.randn(*input_shape, device=device)

    torch.onnx.export(
        model,
        # torch.onnx.export interprets a dict in the last position of `args`
        # as keyword arguments. The trailing empty dict makes the first dict
        # a positional argument of the model's forward().
        ({"image": dummy_input}, {}),
        output_path,
        input_names=["image"],
        output_names=["keypoints", "descriptors", "scores"],
        dynamic_axes={
            "image": {0: "batch_size"},
            "keypoints": {0: "batch_size", 1: "num_keypoints"},
            "descriptors": {0: "batch_size", 1: "num_keypoints"},
            "scores": {0: "batch_size", 1: "num_keypoints"},
        },
        opset_version=11,
        do_constant_folding=True,
    )
    print(f"SuperPoint ONNX model saved to {output_path}")
def convert_lightglue_to_onnx(model, output_path):
    """Export a LightGlue model to an ONNX file.

    The model is invoked with one positional dict argument of the form
    ``{"image0": {...}, "image1": {...}}``, where each inner dict carries
    ``"keypoints"`` and ``"descriptors"`` tensors.

    Args:
        model: Module to export.
        output_path: Destination path of the ONNX file.
    """
    model.eval()

    # Create the dummy features on the device the model lives on, so the
    # export also works on the CPU. A module without parameters falls back
    # to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # The matcher consumes the features of two images: 128 keypoints each,
    # with 2-D coordinates and 256-D descriptors.
    dummy_kpts0 = torch.randn(1, 128, 2, device=device)
    dummy_kpts1 = torch.randn(1, 128, 2, device=device)
    dummy_desc0 = torch.randn(1, 128, 256, device=device)
    dummy_desc1 = torch.randn(1, 128, 256, device=device)

    torch.onnx.export(
        model,
        # torch.onnx.export interprets a dict in the last position of `args`
        # as keyword arguments. The trailing empty dict makes the feature
        # dict a positional argument of the model's forward().
        (
            {
                "image0": {"keypoints": dummy_kpts0, "descriptors": dummy_desc0},
                "image1": {"keypoints": dummy_kpts1, "descriptors": dummy_desc1},
            },
            {},
        ),
        output_path,
        input_names=["keypoints0", "descriptors0", "keypoints1", "descriptors1"],
        output_names=["matches", "scores"],
        dynamic_axes={
            "keypoints0": {0: "batch_size", 1: "num_keypoints0"},
            "descriptors0": {0: "batch_size", 1: "num_keypoints0"},
            "keypoints1": {0: "batch_size", 1: "num_keypoints1"},
            "descriptors1": {0: "batch_size", 1: "num_keypoints1"},
        },
        opset_version=11,
        do_constant_folding=True,
    )
    print(f"LightGlue ONNX model saved to {output_path}")
if __name__ == "__main__":
    import os

    # Command-line options of the export script.
    cli = argparse.ArgumentParser()
    cli.add_argument("--max_keypoints", type=int, default=128)
    cli.add_argument("--output_dir", type=str, default="./models")
    options = cli.parse_args()

    # Use the GPU when one is available, otherwise stay on the CPU.
    target_device = "cpu"
    if torch.cuda.is_available():
        target_device = "cuda"

    # Instantiate both models in eval mode on the selected device.
    extractor = SuperPoint(max_num_keypoints=options.max_keypoints)
    extractor = extractor.eval().to(target_device)
    matcher = LightGlue(features="superpoint")
    matcher = matcher.eval().to(target_device)

    # Export to ONNX inside the (possibly newly created) output directory.
    os.makedirs(options.output_dir, exist_ok=True)
    superpoint_path = os.path.join(options.output_dir, "superpoint.onnx")
    convert_superpoint_to_onnx(
        extractor,
        superpoint_path,
        input_shape=(1, 1, 480, 640),
    )

    # NOTE: exporting LightGlue is more involved and may need special
    # handling, so the call is left disabled.
    # convert_lightglue_to_onnx(matcher, os.path.join(options.output_dir, "lightglue.onnx"))
创建转换脚本 convert_onnx_to_tensorrt.py:
import tensorrt as trt
import numpy as np
def build_engine(onnx_file_path, engine_file_path, precision="fp16"):
    """Build a TensorRT engine from an ONNX model and save it to disk.

    Args:
        onnx_file_path: Path of the ONNX model to parse.
        engine_file_path: Path the serialized engine is written to.
        precision: ``"fp32"``, ``"fp16"`` or ``"int8"``. A reduced precision
            is enabled only when the builder reports fast support for it;
            otherwise a message is printed and the build continues without
            that flag.

    Returns:
        The built engine, or ``None`` when parsing or building failed.
    """
    # NOTE(review): no optimization profile is configured here. ONNX models
    # exported with dynamic axes presumably need one before TensorRT can
    # build them -- confirm against the TensorRT documentation.
    trt_logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(trt_logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, trt_logger)

    # Parse the ONNX file and report every parser error on failure.
    with open(onnx_file_path, 'rb') as model:
        parsed = parser.parse(model.read())
    if not parsed:
        for index in range(parser.num_errors):
            print(parser.get_error(index))
        return None

    # Configure the builder with a 1 GiB workspace. Use the memory-pool API
    # when the installed TensorRT provides it, else the older attribute.
    config = builder.create_builder_config()
    workspace_bytes = 1 << 30
    if hasattr(config, "set_memory_pool_limit"):
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
    else:
        config.max_workspace_size = workspace_bytes

    # Select the precision, saying so explicitly when the request is ignored.
    if precision == "fp16":
        if builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
            print("FP16 precision enabled")
        else:
            print("FP16 requested but not supported on this platform; flag not set")
    elif precision == "int8":
        # NOTE(review): no INT8 calibrator is attached to the config here --
        # confirm whether the model carries its own quantization information.
        if builder.platform_has_fast_int8:
            config.set_flag(trt.BuilderFlag.INT8)
            print("INT8 precision enabled")
        else:
            print("INT8 requested but not supported on this platform; flag not set")

    # Build the engine, preferring the serialized-network API when present.
    print("Building TensorRT engine... This may take a while...")
    if hasattr(builder, "build_serialized_network"):
        serialized = builder.build_serialized_network(network, config)
        if serialized is None:
            print("Failed to build engine")
            return None
        # Deserialize so callers still receive an engine object.
        engine = trt.Runtime(trt_logger).deserialize_cuda_engine(serialized)
    else:
        engine = builder.build_engine(network, config)
        if engine is None:
            print("Failed to build engine")
            return None
        serialized = engine.serialize()

    # Save the serialized engine.
    with open(engine_file_path, 'wb') as f:
        f.write(serialized)
    print(f"TensorRT engine saved to {engine_file_path}")
    return engine
if __name__ == "__main__":
    import argparse

    # Command-line interface: input model, output engine and precision mode.
    cli = argparse.ArgumentParser()
    option_specs = (
        ("--onnx", {"required": True, "help": "ONNX model path"}),
        ("--engine", {"required": True, "help": "Output TensorRT engine path"}),
        ("--precision", {"default": "fp16", "choices": ["fp32", "fp16", "int8"]}),
    )
    for flag, extra in option_specs:
        cli.add_argument(flag, type=str, **extra)
    options = cli.parse_args()

    build_engine(options.onnx, options.engine, options.precision)
创建 tensorrt_inference.py:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import torch
class TensorRTInference:
    """Wrapper that loads a serialized TensorRT engine and runs inference.

    The wrapper uses the binding-based engine API (``get_binding_shape``,
    ``binding_is_input``, ``execute_async_v2``) together with PyCUDA
    page-locked host buffers and device allocations. Only the first input
    binding is fed by :meth:`infer`.
    """

    def __init__(self, engine_path):
        """Load the engine stored at ``engine_path`` and allocate buffers.

        Raises:
            RuntimeError: If the engine file cannot be deserialized.
            ValueError: If a binding has a dynamic (negative) dimension.
        """
        self.engine_path = engine_path
        self.engine = None
        self.context = None
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = None
        self._load_engine()

    def _load_engine(self):
        """Deserialize the engine and allocate one buffer pair per binding."""
        trt_logger = trt.Logger(trt.Logger.WARNING)

        # Keep the runtime referenced for as long as the engine is in use.
        self._runtime = trt.Runtime(trt_logger)
        with open(self.engine_path, 'rb') as f:
            serialized = f.read()
        self.engine = self._runtime.deserialize_cuda_engine(serialized)
        if self.engine is None:
            raise RuntimeError(
                f"Failed to deserialize TensorRT engine: {self.engine_path}"
            )

        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()

        # Allocate a page-locked host buffer and a device buffer per binding.
        for binding in self.engine:
            shape = self.engine.get_binding_shape(binding)
            if any(dim < 0 for dim in shape):
                # A negative dimension gives no fixed element count to size
                # the buffers with.
                raise ValueError(
                    f"Binding {binding!r} has a dynamic shape {tuple(shape)}; "
                    "fixed-size buffers cannot be allocated for it"
                )
            size = trt.volume(shape) * self.engine.max_batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            buffers = {'host': host_mem, 'device': device_mem}
            if self.engine.binding_is_input(binding):
                self.inputs.append(buffers)
            else:
                self.outputs.append(buffers)

    def infer(self, input_data):
        """Run the engine on ``input_data`` and return its outputs.

        Args:
            input_data: Array-like whose flattened element count equals the
                size of the first input binding.

        Returns:
            A list with one flat NumPy array per output binding. Each array
            is a copy, so it stays valid after later ``infer`` calls.

        Raises:
            ValueError: If ``input_data`` has the wrong number of elements.
        """
        host_in = self.inputs[0]['host']
        flat = np.asarray(input_data).ravel()
        if flat.size != host_in.size:
            # np.copyto would broadcast a single element silently.
            raise ValueError(
                f"Input has {flat.size} elements, expected {host_in.size}"
            )

        # Copy the input to the GPU.
        np.copyto(host_in, flat)
        cuda.memcpy_htod_async(self.inputs[0]['device'], host_in, self.stream)

        # Run inference.
        self.context.execute_async_v2(
            bindings=self.bindings,
            stream_handle=self.stream.handle
        )

        # Copy every output back to the host, then wait for the stream.
        for output in self.outputs:
            cuda.memcpy_dtoh_async(output['host'], output['device'], self.stream)
        self.stream.synchronize()

        # The host buffers are reused by the next call, so hand out copies.
        return [output['host'].copy() for output in self.outputs]
# Install Torch-TensorRT (imported below as `torch_tensorrt`)
pip install torch-tensorrt
import torch
import torch_tensorrt

# NOTE: `model` is not defined in this snippet; it must already exist.
# NOTE(review): presumably it should be in eval mode and on the GPU, since
# the example input below is a CUDA tensor -- confirm.

# Compile the model. ir="ts" selects the TorchScript frontend so that the
# result can be serialized with torch.jit.save below.
trt_model = torch_tensorrt.compile(
    model,
    ir="ts",
    inputs=[torch.randn(1, 1, 480, 640).cuda()],
    enabled_precisions={torch.float, torch.half},  # FP32 and FP16
    workspace_size=1 << 30,  # 1 GiB
)
# Save the compiled model
torch.jit.save(trt_model, "model_trt.ts")
在模型加载部分添加 TensorRT 支持:
# Add this right after the PyTorch models have been loaded.
if opt.use_tensorrt:
    try:
        from tensorrt_inference import TensorRTInference

        # Load both TensorRT engines: extractor first, then matcher.
        engine_files = (
            "models/superpoint_fp16.engine",
            "models/lightglue_fp16.engine",
        )
        trt_models = [TensorRTInference(engine_file) for engine_file in engine_files]

        # Swap out the original models only once both engines have loaded.
        extractor, matcher = trt_models
        print("Using TensorRT optimized models")
    except Exception as e:
        # Any failure leaves the PyTorch models in place.
        print(f"Failed to load TensorRT models: {e}")
        print("Falling back to PyTorch models")