tensorrt_optimization_guide.md 10.0 KB

TensorRT 优化指南

方案概述

TensorRT 优化主要有三种方式:

  1. ONNX → TensorRT(最稳定,推荐)
  2. Torch-TensorRT(PyTorch 原生支持)
  3. torch2trt(简单但可能不兼容)

前置要求

1. 硬件要求

  • NVIDIA GPU(支持 CUDA)
  • CUDA 11.8+ 或 12.0+
  • cuDNN 8.6+

2. 软件安装

# 安装 TensorRT(推荐使用 pip;PyPI 上的包名为 tensorrt,旧包名 nvidia-tensorrt 已弃用)
pip install tensorrt

# 或者从 NVIDIA 官网下载 TensorRT 安装包
# https://developer.nvidia.com/tensorrt

# 安装 ONNX 和 ONNX Runtime(用于导出并验证 ONNX 模型)
pip install onnx onnxruntime-gpu

# 安装 PyCUDA(步骤 3 的推理包装器 tensorrt_inference.py 需要)
pip install pycuda

# 安装 PyTorch(如果还没有)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

方案 1:ONNX → TensorRT(推荐)

步骤 1:将 PyTorch 模型转换为 ONNX

创建转换脚本 convert_to_onnx.py

import torch
import torch.onnx
from lightglue import LightGlue, SuperPoint
import argparse

def convert_superpoint_to_onnx(model, output_path, input_shape=(1, 1, 480, 640)):
    """Export a SuperPoint model to an ONNX file.

    Args:
        model: The ``torch.nn.Module`` to export. It is switched to eval
            mode in place.
        output_path: Destination path of the ``.onnx`` file.
        input_shape: Shape of the random dummy image used for tracing.

    The dummy image is created on the device that holds the model's
    parameters, so the export also works when the model lives on the CPU.
    """
    model.eval()

    # Follow the model's device instead of assuming CUDA; a parameter-less
    # model falls back to CUDA when available, otherwise CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dummy_input = torch.randn(*input_shape, device=device)

    # NOTE(review): a bare dict passed as ``args`` may be interpreted by
    # torch.onnx.export as keyword arguments of forward(). If the model's
    # forward() expects the dict itself as a single positional argument this
    # probably needs to be ``({"image": dummy_input}, {})`` -- confirm against
    # the model's forward() signature.
    torch.onnx.export(
        model,
        ({"image": dummy_input}),
        output_path,
        input_names=["image"],
        output_names=["keypoints", "descriptors", "scores"],
        # Batch size and keypoint count are declared as dynamic axes.
        dynamic_axes={
            "image": {0: "batch_size"},
            "keypoints": {0: "batch_size", 1: "num_keypoints"},
            "descriptors": {0: "batch_size", 1: "num_keypoints"},
            "scores": {0: "batch_size", 1: "num_keypoints"},
        },
        opset_version=11,
        do_constant_folding=True,
    )
    print(f"SuperPoint ONNX model saved to {output_path}")

def convert_lightglue_to_onnx(model, output_path, num_keypoints=128, descriptor_dim=256):
    """Export a LightGlue matcher to an ONNX file.

    Args:
        model: The ``torch.nn.Module`` to export. It is switched to eval
            mode in place.
        output_path: Destination path of the ``.onnx`` file.
        num_keypoints: Number of dummy keypoints generated per image for
            tracing.
        descriptor_dim: Length of each dummy descriptor vector.

    The dummy features are created on the device that holds the model's
    parameters, so the export also works when the model lives on the CPU.
    """
    model.eval()

    # Follow the model's device instead of assuming CUDA; a parameter-less
    # model falls back to CUDA when available, otherwise CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The matcher consumes keypoints and descriptors of two images.
    dummy_kpts0 = torch.randn(1, num_keypoints, 2, device=device)
    dummy_kpts1 = torch.randn(1, num_keypoints, 2, device=device)
    dummy_desc0 = torch.randn(1, num_keypoints, descriptor_dim, device=device)
    dummy_desc1 = torch.randn(1, num_keypoints, descriptor_dim, device=device)

    # NOTE(review): a bare dict passed as ``args`` may be interpreted by
    # torch.onnx.export as keyword arguments of forward(); confirm against
    # the model's forward() signature (``(data, {})`` may be required).
    torch.onnx.export(
        model,
        ({
            "image0": {"keypoints": dummy_kpts0, "descriptors": dummy_desc0},
            "image1": {"keypoints": dummy_kpts1, "descriptors": dummy_desc1},
        }),
        output_path,
        input_names=["keypoints0", "descriptors0", "keypoints1", "descriptors1"],
        output_names=["matches", "scores"],
        # Batch size and the per-image keypoint counts are dynamic axes.
        dynamic_axes={
            "keypoints0": {0: "batch_size", 1: "num_keypoints0"},
            "descriptors0": {0: "batch_size", 1: "num_keypoints0"},
            "keypoints1": {0: "batch_size", 1: "num_keypoints1"},
            "descriptors1": {0: "batch_size", 1: "num_keypoints1"},
        },
        opset_version=11,
        do_constant_folding=True,
    )
    print(f"LightGlue ONNX model saved to {output_path}")

if __name__ == "__main__":
    import os

    # Command-line options of the export script.
    cli = argparse.ArgumentParser()
    cli.add_argument("--max_keypoints", type=int, default=128)
    cli.add_argument("--output_dir", type=str, default="./models")
    args = cli.parse_args()

    # Prefer the GPU when one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Instantiate both networks in eval mode on the chosen device.
    extractor = SuperPoint(max_num_keypoints=args.max_keypoints).eval().to(device)
    matcher = LightGlue(features="superpoint").eval().to(device)

    # Make sure the output directory exists, then export SuperPoint.
    os.makedirs(args.output_dir, exist_ok=True)
    superpoint_path = os.path.join(args.output_dir, "superpoint.onnx")
    convert_superpoint_to_onnx(extractor, superpoint_path, input_shape=(1, 1, 480, 640))

    # NOTE: exporting LightGlue is more involved and may need special
    # handling, so the call is left disabled:
    # convert_lightglue_to_onnx(matcher, os.path.join(args.output_dir, "lightglue.onnx"))

步骤 2:将 ONNX 转换为 TensorRT

创建转换脚本 convert_onnx_to_tensorrt.py

import tensorrt as trt
import numpy as np

def build_engine(onnx_file_path, engine_file_path, precision="fp16",
                 workspace_size=1 << 30, dynamic_shapes=None):
    """Build a TensorRT engine from an ONNX model and save it to disk.

    Args:
        onnx_file_path: Path of the ONNX model to parse.
        engine_file_path: Path the serialized engine is written to.
        precision: ``"fp16"`` or ``"int8"`` enable the matching builder flag
            when the platform supports it; any other value builds in FP32.
        workspace_size: Builder workspace limit in bytes (default 1 GiB).
        dynamic_shapes: Optional mapping
            ``{input_name: (min_shape, opt_shape, max_shape)}`` used for the
            optimization profile. An input that has dynamic dimensions but is
            not listed gets each dynamic dimension pinned to 1.

    Returns:
        The built engine, or ``None`` when parsing or building fails.
    """
    logger = trt.Logger(trt.Logger.WARNING)

    builder = trt.Builder(logger)
    # Keep requesting an explicit-batch network where the flag exists; fall
    # back to no flags if a TensorRT release no longer exposes it.
    explicit_batch = getattr(trt.NetworkDefinitionCreationFlag, "EXPLICIT_BATCH", None)
    network_flags = 0 if explicit_batch is None else 1 << int(explicit_batch)
    network = builder.create_network(network_flags)
    parser = trt.OnnxParser(network, logger)

    # Parse the ONNX file and report every parser error on failure.
    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            return None

    config = builder.create_builder_config()
    # ``max_workspace_size`` was replaced by memory-pool limits in newer
    # TensorRT releases; use whichever API this installation provides.
    if hasattr(config, "set_memory_pool_limit"):
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size)
    else:
        config.max_workspace_size = workspace_size

    # Enable reduced precision, and say so when it is not available instead
    # of silently building an FP32 engine.
    if precision == "fp16":
        if getattr(builder, "platform_has_fast_fp16", True):
            config.set_flag(trt.BuilderFlag.FP16)
            print("FP16 precision enabled")
        else:
            print("Fast FP16 is not supported on this platform; building in FP32")
    elif precision == "int8":
        # NOTE(review): INT8 normally also needs a calibrator or a
        # quantized (Q/DQ) model; none is configured here -- confirm.
        if getattr(builder, "platform_has_fast_int8", True):
            config.set_flag(trt.BuilderFlag.INT8)
            print("INT8 precision enabled")
        else:
            print("Fast INT8 is not supported on this platform; building in FP32")

    # Inputs with dynamic dimensions need an optimization profile, otherwise
    # the build fails.
    dynamic_shapes = dynamic_shapes or {}
    profile = builder.create_optimization_profile()
    profile_used = False
    for index in range(network.num_inputs):
        tensor = network.get_input(index)
        dims = tuple(tensor.shape)
        if tensor.name in dynamic_shapes:
            min_shape, opt_shape, max_shape = dynamic_shapes[tensor.name]
        elif any(dim < 0 for dim in dims):
            pinned = tuple(1 if dim < 0 else dim for dim in dims)
            min_shape = opt_shape = max_shape = pinned
            print(f"Input '{tensor.name}' has dynamic shape {dims}; pinning it to {pinned}")
        else:
            continue
        profile.set_shape(tensor.name, min_shape, opt_shape, max_shape)
        profile_used = True
    if profile_used:
        config.add_optimization_profile(profile)

    print("Building TensorRT engine... This may take a while...")
    if hasattr(builder, "build_serialized_network"):
        # Current API: build the serialized plan, then deserialize it so the
        # function still returns an engine object.
        serialized = builder.build_serialized_network(network, config)
        engine = None
        if serialized is not None:
            engine = trt.Runtime(logger).deserialize_cuda_engine(serialized)
    else:
        # Legacy API kept for old TensorRT installations.
        engine = builder.build_engine(network, config)
        serialized = engine.serialize() if engine is not None else None

    if engine is None:
        print("Failed to build engine")
        return None

    with open(engine_file_path, 'wb') as f:
        f.write(serialized)

    print(f"TensorRT engine saved to {engine_file_path}")
    return engine

if __name__ == "__main__":
    import argparse

    # Declare the options as a table and register them in one loop.
    option_specs = (
        ("--onnx", {"required": True, "help": "ONNX model path"}),
        ("--engine", {"required": True, "help": "Output TensorRT engine path"}),
        ("--precision", {"default": "fp16", "choices": ["fp32", "fp16", "int8"]}),
    )
    cli = argparse.ArgumentParser()
    for flag, extra in option_specs:
        cli.add_argument(flag, type=str, **extra)
    opts = cli.parse_args()

    build_engine(opts.onnx, opts.engine, opts.precision)

步骤 3:创建 TensorRT 推理包装器

创建 tensorrt_inference.py

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import torch

class TensorRTInference:
    """Run a serialized TensorRT engine through PyCUDA.

    The constructor deserializes the engine, creates an execution context
    and a CUDA stream, and allocates one page-locked host buffer plus one
    device buffer per engine input/output. Only engines whose I/O shapes are
    fully static are supported.
    """

    def __init__(self, engine_path):
        """Load the engine stored at ``engine_path`` and allocate buffers.

        Raises:
            RuntimeError: If the engine cannot be deserialized.
            ValueError: If an engine tensor has a dynamic dimension.
        """
        self.engine_path = engine_path
        self.engine = None
        self.context = None
        self.inputs = []    # [{'host': ndarray, 'device': allocation}, ...]
        self.outputs = []   # same layout as ``inputs``
        self.bindings = []  # device addresses in engine tensor order
        self.stream = None
        self._load_engine()

    def _describe_io(self):
        """Return ``(name, shape, dtype, is_input)`` for each I/O tensor."""
        engine = self.engine
        if hasattr(engine, "num_io_tensors"):
            # Name-based tensor API of newer TensorRT releases.
            names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
            return [
                (
                    name,
                    engine.get_tensor_shape(name),
                    engine.get_tensor_dtype(name),
                    engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT,
                )
                for name in names
            ]
        # Legacy binding API.
        return [
            (
                name,
                engine.get_binding_shape(name),
                engine.get_binding_dtype(name),
                engine.binding_is_input(name),
            )
            for name in engine
        ]

    def _load_engine(self):
        """Deserialize the engine and allocate its input/output buffers."""
        logger = trt.Logger(trt.Logger.WARNING)

        with open(self.engine_path, 'rb') as f:
            serialized = f.read()
        # Keep the runtime referenced for as long as the engine is in use.
        self._runtime = trt.Runtime(logger)
        self.engine = self._runtime.deserialize_cuda_engine(serialized)
        if self.engine is None:
            raise RuntimeError(
                f"Failed to deserialize TensorRT engine: {self.engine_path}"
            )

        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()

        # ``max_batch_size`` is not available on every TensorRT release;
        # default to 1 when it is missing.
        batch = getattr(self.engine, "max_batch_size", 1)
        uses_tensor_addresses = hasattr(self.context, "set_tensor_address")

        for name, shape, dtype, is_input in self._describe_io():
            if any(dim < 0 for dim in tuple(shape)):
                # A negative dimension would yield a negative buffer size.
                raise ValueError(
                    f"Tensor '{name}' has dynamic shape {tuple(shape)}; "
                    "only engines with static I/O shapes are supported"
                )
            size = trt.volume(shape) * batch

            host_mem = cuda.pagelocked_empty(size, trt.nptype(dtype))
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            self.bindings.append(int(device_mem))
            if uses_tensor_addresses:
                # The v3 execution API looks buffers up by tensor name.
                self.context.set_tensor_address(name, int(device_mem))

            buffers = {'host': host_mem, 'device': device_mem}
            if is_input:
                self.inputs.append(buffers)
            else:
                self.outputs.append(buffers)

    def infer(self, input_data):
        """Run one inference pass.

        Args:
            input_data: A single array-like / ``torch.Tensor`` for a
                single-input engine, or a list/tuple with one entry per
                engine input (in engine input order).

        Returns:
            A list with one flat NumPy array per engine output. The arrays
            are copies, so they stay valid after the next call.

        Raises:
            ValueError: If the number of inputs or an input's element count
                does not match the engine.
            RuntimeError: If TensorRT reports that execution failed.
        """
        if isinstance(input_data, (list, tuple)):
            arrays = list(input_data)
        else:
            arrays = [input_data]
        if len(arrays) != len(self.inputs):
            raise ValueError(
                f"Engine expects {len(self.inputs)} input(s), got {len(arrays)}"
            )

        # Stage every input in its page-locked host buffer before touching
        # the GPU, so invalid input is rejected without side effects there.
        for index, (buffers, array) in enumerate(zip(self.inputs, arrays)):
            if isinstance(array, torch.Tensor):
                array = array.detach().cpu().numpy()
            flat = np.asarray(array).ravel()
            if flat.size != buffers['host'].size:
                raise ValueError(
                    f"Input {index} has {flat.size} elements, "
                    f"engine expects {buffers['host'].size}"
                )
            np.copyto(buffers['host'], flat)

        for buffers in self.inputs:
            cuda.memcpy_htod_async(buffers['device'], buffers['host'], self.stream)

        if hasattr(self.context, "execute_async_v3"):
            ok = self.context.execute_async_v3(stream_handle=self.stream.handle)
        else:
            ok = self.context.execute_async_v2(
                bindings=self.bindings,
                stream_handle=self.stream.handle
            )
        if not ok:
            raise RuntimeError("TensorRT execution failed")

        for buffers in self.outputs:
            cuda.memcpy_dtoh_async(buffers['host'], buffers['device'], self.stream)

        self.stream.synchronize()
        # Copy out of the reusable pinned buffers so later calls do not
        # overwrite results the caller still holds.
        return [buffers['host'].copy() for buffers in self.outputs]

方案 2:Torch-TensorRT(更简单,但可能不稳定)

安装

pip install torch-tensorrt

使用示例

import torch_tensorrt

# Compile the model with Torch-TensorRT using one example input.
example_input = torch.randn(1, 1, 480, 640).cuda()
allowed_precisions = {torch.float, torch.half}  # FP32 and FP16
trt_model = torch_tensorrt.compile(
    model,
    inputs=[example_input],
    enabled_precisions=allowed_precisions,
    workspace_size=1 << 30,  # 1 GiB
)

# Save the compiled model.
# NOTE(review): torch.jit.save expects a TorchScript module; confirm that the
# installed torch_tensorrt version returns one from compile().
torch.jit.save(trt_model, "model_trt.ts")

集成到现有代码

修改 demo_lightglue_camera_position_async.py

在模型加载部分添加 TensorRT 支持:

# Add this right after the PyTorch models have been loaded.
if opt.use_tensorrt:
    try:
        from tensorrt_inference import TensorRTInference

        # Load both engines first; the PyTorch models are only replaced once
        # both loads have succeeded.
        extractor_trt = TensorRTInference("models/superpoint_fp16.engine")
        matcher_trt = TensorRTInference("models/lightglue_fp16.engine")

        # NOTE(review): the replacements are TensorRTInference objects, not
        # the original model objects -- confirm the code that later calls
        # extractor / matcher is adapted to their interface.
        extractor, matcher = extractor_trt, matcher_trt
        print("Using TensorRT optimized models")
    except Exception as exc:
        # Best effort: any failure keeps the PyTorch models in place.
        print(f"Failed to load TensorRT models: {exc}")
        print("Falling back to PyTorch models")

性能预期

  • FP16 TensorRT: 比 FP16 PyTorch 快 1.5-2 倍
  • INT8 TensorRT: 比 FP16 PyTorch 快 2-4 倍(但精度可能下降)

注意事项

  1. 模型兼容性: 某些操作可能不被 TensorRT 支持,需要修改模型
  2. 动态形状: 含动态维度的 ONNX 模型在构建引擎时必须提供优化配置(optimization profile);如果不需要可变尺寸,导出时固定输入尺寸最简单
  3. 精度损失: INT8 量化可能导致精度下降
  4. 构建时间: TensorRT 引擎构建需要较长时间(几分钟到几十分钟)

推荐流程

  1. 先尝试 Torch-TensorRT(最简单)
  2. 如果失败,使用 ONNX → TensorRT(更稳定)
  3. 如果还不够快,尝试 INT8 量化