# TensorRT 优化指南

## 方案概述

TensorRT 优化主要有三种方式:

1. **ONNX → TensorRT**(最稳定,推荐)
2. **Torch-TensorRT**(PyTorch 原生支持)
3. **torch2trt**(简单但可能不兼容)

## 前置要求

### 1. 硬件要求

- NVIDIA GPU(支持 CUDA)
- CUDA 11.8+ 或 12.0+
- cuDNN 8.6+

### 2. 软件安装

```bash
# Install TensorRT (pip is recommended; the old "nvidia-tensorrt" package name is retired)
pip install tensorrt

# Or download the TensorRT package from the NVIDIA website
# https://developer.nvidia.com/tensorrt

# Install ONNX and ONNX Runtime (used for export and validation)
pip install onnx onnxruntime-gpu

# Install PyCUDA (used by the inference wrapper in step 3)
pip install pycuda

# Install PyTorch (if it is not installed yet)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
```

## 方案 1:ONNX → TensorRT(推荐)

### 步骤 1:将 PyTorch 模型转换为 ONNX

创建转换脚本 `convert_to_onnx.py`:

```python
import argparse
import os

import torch
import torch.onnx
from lightglue import LightGlue, SuperPoint


def _model_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(model.parameters(), None)
    return first_param.device if first_param is not None else torch.device("cpu")


def convert_superpoint_to_onnx(model, output_path, input_shape=(1, 1, 480, 640)):
    """Export a SuperPoint model to ONNX.

    Args:
        model: The SuperPoint module to export.
        output_path: Destination path of the ONNX file.
        input_shape: Shape of the dummy image used for tracing.
    """
    model.eval()
    # Create the dummy input on the model's own device so the export also
    # works when the script fell back to CPU.
    dummy_input = torch.randn(*input_shape, device=_model_device(model))

    torch.onnx.export(
        model,
        # The model is fed one dict as its positional input. torch.onnx.export
        # treats a trailing dict in `args` as keyword arguments, so an empty
        # dict is appended to keep the data dict positional.
        ({"image": dummy_input}, {}),
        output_path,
        input_names=["image"],
        output_names=["keypoints", "descriptors", "scores"],
        dynamic_axes={
            "image": {0: "batch_size"},
            "keypoints": {0: "batch_size", 1: "num_keypoints"},
            "descriptors": {0: "batch_size", 1: "num_keypoints"},
            "scores": {0: "batch_size", 1: "num_keypoints"},
        },
        opset_version=11,
        do_constant_folding=True,
    )
    print(f"SuperPoint ONNX model saved to {output_path}")


def convert_lightglue_to_onnx(model, output_path):
    """Export a LightGlue model to ONNX.

    Args:
        model: The LightGlue module to export.
        output_path: Destination path of the ONNX file.
    """
    model.eval()
    device = _model_device(model)

    # The matcher needs the features of two images.
    dummy_kpts0 = torch.randn(1, 128, 2, device=device)
    dummy_kpts1 = torch.randn(1, 128, 2, device=device)
    dummy_desc0 = torch.randn(1, 128, 256, device=device)
    dummy_desc1 = torch.randn(1, 128, 256, device=device)

    torch.onnx.export(
        model,
        # Trailing empty dict: see convert_superpoint_to_onnx.
        (
            {
                "image0": {"keypoints": dummy_kpts0, "descriptors": dummy_desc0},
                "image1": {"keypoints": dummy_kpts1, "descriptors": dummy_desc1},
            },
            {},
        ),
        output_path,
        input_names=["keypoints0", "descriptors0", "keypoints1", "descriptors1"],
        output_names=["matches", "scores"],
        dynamic_axes={
            "keypoints0": {0: "batch_size", 1: "num_keypoints0"},
            "descriptors0": {0: "batch_size", 1: "num_keypoints0"},
            "keypoints1": {0: "batch_size", 1: "num_keypoints1"},
            "descriptors1": {0: "batch_size", 1: "num_keypoints1"},
        },
        opset_version=11,
        do_constant_folding=True,
    )
    print(f"LightGlue ONNX model saved to {output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max_keypoints", type=int, default=128)
    parser.add_argument("--output_dir", type=str, default="./models")
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Create the models.
    extractor = SuperPoint(max_num_keypoints=args.max_keypoints).eval().to(device)
    matcher = LightGlue(features="superpoint").eval().to(device)

    # Export to ONNX.
    os.makedirs(args.output_dir, exist_ok=True)

    convert_superpoint_to_onnx(
        extractor,
        os.path.join(args.output_dir, "superpoint.onnx"),
        input_shape=(1, 1, 480, 640)
    )

    # NOTE: exporting LightGlue is more involved and may need special handling.
    # convert_lightglue_to_onnx(matcher, os.path.join(args.output_dir, "lightglue.onnx"))
```

### 步骤 2:将 ONNX 转换为 TensorRT

创建转换脚本 `convert_onnx_to_tensorrt.py`(使用 `build_serialized_network`;旧的 `builder.build_engine` 和 `config.max_workspace_size` 已在 TensorRT 10 中移除):

```python
import tensorrt as trt
import numpy as np


def _add_dynamic_shape_profile(builder, network, config, dynamic_shapes):
    """Add an optimization profile for every network input with dynamic dims.

    TensorRT refuses to build a network that has dynamic input dimensions
    unless an optimization profile covers them.

    Args:
        builder: The trt.Builder that creates the profile.
        network: The parsed network definition.
        config: The builder config that receives the profile.
        dynamic_shapes: Dict mapping input name -> (min, opt, max) shapes.
            Dynamic inputs missing from the dict get every dynamic dim
            pinned to 1.
    """
    profile = builder.create_optimization_profile()
    has_dynamic_input = False

    for index in range(network.num_inputs):
        tensor = network.get_input(index)
        dims = tuple(tensor.shape)
        if -1 not in dims:
            continue
        has_dynamic_input = True

        if tensor.name in dynamic_shapes:
            min_shape, opt_shape, max_shape = dynamic_shapes[tensor.name]
        else:
            pinned = tuple(1 if dim == -1 else dim for dim in dims)
            min_shape = opt_shape = max_shape = pinned
            print(f"Input '{tensor.name}' has dynamic dims; pinning it to {pinned}")
        profile.set_shape(tensor.name, min_shape, opt_shape, max_shape)

    if has_dynamic_input:
        config.add_optimization_profile(profile)


def build_engine(onnx_file_path, engine_file_path, precision="fp16",
                 workspace_size=1 << 30, dynamic_shapes=None):
    """Build a TensorRT engine from an ONNX model and save it to disk.

    Args:
        onnx_file_path: Path of the ONNX model to parse.
        engine_file_path: Path the serialized engine is written to.
        precision: "fp32", "fp16" or "int8".
        workspace_size: Builder workspace limit in bytes (default 1 GB).
        dynamic_shapes: Optional dict mapping input name -> (min, opt, max)
            shapes for inputs with dynamic dimensions.

    Returns:
        The deserialized engine, or None when parsing or building fails.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse the ONNX file.
    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    # Configure the builder.
    config = builder.create_builder_config()
    if hasattr(config, "set_memory_pool_limit"):
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size)
    else:
        # TensorRT < 8.4 only has the legacy attribute.
        config.max_workspace_size = workspace_size

    # Select the precision.
    if precision == "fp16":
        if builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
            print("FP16 precision enabled")
        else:
            print("FP16 is not supported on this platform; building in FP32")
    elif precision == "int8":
        # NOTE: INT8 normally also needs calibration data
        # (config.int8_calibrator) or a Q/DQ-quantized ONNX model.
        if builder.platform_has_fast_int8:
            config.set_flag(trt.BuilderFlag.INT8)
            print("INT8 precision enabled")
        else:
            print("INT8 is not supported on this platform; building in FP32")

    _add_dynamic_shape_profile(builder, network, config, dynamic_shapes or {})

    # Build the engine.
    print("Building TensorRT engine... This may take a while...")
    serialized_engine = builder.build_serialized_network(network, config)

    if serialized_engine is None:
        print("Failed to build engine")
        return None

    # Save the engine.
    with open(engine_file_path, 'wb') as f:
        f.write(serialized_engine)

    print(f"TensorRT engine saved to {engine_file_path}")
    return trt.Runtime(logger).deserialize_cuda_engine(serialized_engine)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--onnx", type=str, required=True, help="ONNX model path")
    parser.add_argument("--engine", type=str, required=True, help="Output TensorRT engine path")
    parser.add_argument("--precision", type=str, default="fp16", choices=["fp32", "fp16", "int8"])
    args = parser.parse_args()

    build_engine(args.onnx, args.engine, args.precision)
```

### 步骤 3:创建 TensorRT 推理包装器

创建 `tensorrt_inference.py`(使用 TensorRT 8.5+ 的按名称张量 API 和 `execute_async_v3`;旧的 binding API 已在 TensorRT 10 中移除):

```python
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import torch


class TensorRTInference:
    """Run inference with a serialized TensorRT engine.

    Buffers are allocated once for fixed tensor shapes. Inputs with dynamic
    dimensions are set to the maximum shape of optimization profile 0.
    """

    def __init__(self, engine_path):
        """Load the engine stored at `engine_path` and allocate its buffers."""
        self.engine_path = engine_path
        self.runtime = None
        self.engine = None
        self.context = None
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = None

        self._load_engine()

    def _load_engine(self):
        """Deserialize the engine and allocate host/device buffers.

        Raises:
            RuntimeError: If the engine cannot be deserialized, or an output
                shape is still dynamic after the input shapes are set.
        """
        logger = trt.Logger(trt.Logger.WARNING)

        # Deserialize the engine; the runtime is kept on the instance.
        with open(self.engine_path, 'rb') as f:
            self.runtime = trt.Runtime(logger)
            self.engine = self.runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            raise RuntimeError(
                f"Failed to deserialize TensorRT engine: {self.engine_path}"
            )

        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()

        names = [
            self.engine.get_tensor_name(index)
            for index in range(self.engine.num_io_tensors)
        ]
        input_names = [
            name for name in names
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
        ]

        # Dynamic inputs must get a concrete shape before output shapes are known.
        for name in input_names:
            if -1 in tuple(self.engine.get_tensor_shape(name)):
                max_shape = self.engine.get_tensor_profile_shape(name, 0)[2]
                self.context.set_input_shape(name, max_shape)

        # Allocate the input and output buffers.
        for name in names:
            shape = tuple(self.context.get_tensor_shape(name))
            if -1 in shape:
                raise RuntimeError(
                    f"Tensor '{name}' has a data-dependent shape {shape}; "
                    "fixed-size buffers cannot be allocated for it"
                )
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))

            host_mem = cuda.pagelocked_empty(trt.volume(shape), dtype)
            # mem_alloc rejects zero-byte requests, so allocate at least one byte.
            device_mem = cuda.mem_alloc(max(host_mem.nbytes, 1))

            self.bindings.append(int(device_mem))
            self.context.set_tensor_address(name, int(device_mem))

            buffer = {'name': name, 'shape': shape, 'host': host_mem, 'device': device_mem}
            if name in input_names:
                self.inputs.append(buffer)
            else:
                self.outputs.append(buffer)

    def infer(self, input_data):
        """Run one inference pass.

        Args:
            input_data: One array for a single-input engine, or a list/tuple
                of arrays in the engine's input order.

        Returns:
            A list with one flat NumPy array per engine output. The arrays
            are copies, so a later call does not overwrite them.

        Raises:
            ValueError: If the number or size of the inputs does not match
                the engine.
            RuntimeError: If TensorRT reports that execution failed.
        """
        if isinstance(input_data, (list, tuple)):
            arrays = list(input_data)
        else:
            arrays = [input_data]
        if len(arrays) != len(self.inputs):
            raise ValueError(
                f"Engine expects {len(self.inputs)} input(s), got {len(arrays)}"
            )

        # Copy the input data to the GPU.
        for buffer, array in zip(self.inputs, arrays):
            flat = np.asarray(array, dtype=buffer['host'].dtype).ravel()
            if flat.size != buffer['host'].size:
                raise ValueError(
                    f"Input '{buffer['name']}' expects {buffer['host'].size} "
                    f"elements (shape {buffer['shape']}), got {flat.size}"
                )
            np.copyto(buffer['host'], flat)
            cuda.memcpy_htod_async(buffer['device'], buffer['host'], self.stream)

        # Run inference.
        if not self.context.execute_async_v3(stream_handle=self.stream.handle):
            raise RuntimeError("TensorRT execution failed")

        # Copy the outputs back to the CPU.
        for buffer in self.outputs:
            cuda.memcpy_dtoh_async(buffer['host'], buffer['device'], self.stream)

        self.stream.synchronize()

        # The pinned host buffers are reused by the next call, so return copies.
        return [buffer['host'].copy() for buffer in self.outputs]
```

## 方案 2:Torch-TensorRT(更简单,但可能不稳定)

### 安装

```bash
pip install torch-tensorrt
```

### 使用示例

```python
import torch
import torch_tensorrt

# Compile the model (`model` is an eval-mode PyTorch module already on the GPU).
trt_model = torch_tensorrt.compile(
    model,
    ir="ts",  # TorchScript front end, so the result can be saved with torch.jit.save
    inputs=[torch.randn(1, 1, 480, 640).cuda()],
    enabled_precisions={torch.float, torch.half},  # FP32 and FP16
    workspace_size=1 << 30,  # 1 GB
)

# Save the compiled model.
torch.jit.save(trt_model, "model_trt.ts")
```

## 集成到现有代码

### 修改 demo_lightglue_camera_position_async.py

在模型加载部分添加 TensorRT 支持(需要先在命令行参数中增加 `use_tensorrt` 开关):

```python
# Add this after the models are loaded.
if opt.use_tensorrt:
    try:
        from tensorrt_inference import TensorRTInference

        # Load the TensorRT engines.
        extractor_trt = TensorRTInference("models/superpoint_fp16.engine")
        matcher_trt = TensorRTInference("models/lightglue_fp16.engine")

        # Replace the original models.
        extractor = extractor_trt
        matcher = matcher_trt

        print("Using TensorRT optimized models")
    except Exception as e:
        print(f"Failed to load TensorRT models: {e}")
        print("Falling back to PyTorch models")
```

注意:`TensorRTInference` 只提供 `infer()` 方法,输入输出都是 NumPy 数组,接口与原来的 PyTorch 模型不同。替换之后,调用 `extractor` 和 `matcher` 的代码需要增加一层适配(把张量转换为 NumPy 数组,并把展平的输出还原成原来的形状),不能直接沿用原有的调用方式。

## 性能预期

- **FP16 TensorRT**: 比 FP16 PyTorch 快 1.5-2 倍
- **INT8 TensorRT**: 比 FP16 PyTorch 快 2-4 倍(但精度可能下降)

## 注意事项

1. **模型兼容性**: 某些操作可能不被 TensorRT 支持,需要修改模型
2. **动态形状**: TensorRT 对动态形状支持有限。带动态维度的输入必须在构建引擎时提供 optimization profile,否则需要固定输入尺寸;输出形状依赖数据内容(例如关键点数量可变)的模型无法直接使用上面的固定缓冲区包装器
3. **精度损失**: INT8 量化可能导致精度下降,并且通常需要校准数据
4. **构建时间**: TensorRT 引擎构建需要较长时间(几分钟到几十分钟)

## 推荐流程

1. 想快速验证时,先尝试 **Torch-TensorRT**(最简单)
2. 如果失败,或者用于正式部署,使用 **ONNX → TensorRT**(最稳定,推荐)
3. 如果还不够快,尝试 **INT8 量化**