hpi.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import ctypes.util
  15. import importlib.resources
  16. import importlib.util
  17. import json
  18. import platform
  19. from collections import defaultdict
  20. from functools import lru_cache
  21. from typing import Any, Dict, List, Literal, Optional, Tuple, Union
  22. from pydantic import BaseModel, Field
  23. from typing_extensions import Annotated, TypeAlias
  24. from ...utils import logging
  25. from ...utils.deps import function_requires_deps, is_paddle2onnx_plugin_available
  26. from ...utils.env import get_paddle_cuda_version, get_paddle_version
  27. from ...utils.flags import USE_PIR_TRT
  28. from .misc import is_mkldnn_available
  29. from .model_paths import ModelPaths
class PaddleInferenceInfo(BaseModel):
    """Optional per-model information attached to the `paddle_infer` backend.

    Both fields default to `None`, i.e. no information is available.
    """

    # Maps a string key to a list of integer lists.
    # NOTE(review): presumably input name -> [min, opt, max] shapes for
    # TensorRT dynamic shapes -- confirm against the consumer of this model.
    trt_dynamic_shapes: Optional[Dict[str, List[List[int]]]] = None
    # Same layout as above but with float values.
    # NOTE(review): presumably concrete input data used alongside the
    # dynamic shapes -- confirm against the consumer of this model.
    trt_dynamic_shape_input_data: Optional[Dict[str, List[List[float]]]] = None
class TensorRTInfo(BaseModel):
    """Optional per-model information attached to the `tensorrt` backend."""

    # Maps a string key to a list of integer lists; `None` means unspecified.
    # NOTE(review): presumably input name -> [min, opt, max] shapes -- confirm.
    dynamic_shapes: Optional[Dict[str, List[List[int]]]] = None
class InferenceBackendInfoCollection(BaseModel):
    """Container grouping the optional per-backend information models."""

    # Information for the Paddle Inference backend, if any.
    paddle_infer: Optional[PaddleInferenceInfo] = None
    # Information for the TensorRT backend, if any.
    tensorrt: Optional[TensorRTInfo] = None
# Does using `TypedDict` make things more convenient?
class HPIInfo(BaseModel):
    """High-performance-inference information; wraps the backend info collection."""

    # Per-backend information; `None` when nothing is provided.
    backend_configs: Optional[InferenceBackendInfoCollection] = None
# For multi-backend inference only
# Closed set of backend identifiers accepted wherever an `InferenceBackend`
# is expected (e.g. `HPIConfig.backend`).
InferenceBackend: TypeAlias = Literal[
    "paddle", "openvino", "onnxruntime", "tensorrt", "om"
]
class OpenVINOConfig(BaseModel):
    """Configuration model for the `openvino` backend."""

    # Number of CPU threads; defaults to 10.
    cpu_num_threads: int = 10
class ONNXRuntimeConfig(BaseModel):
    """Configuration model for the `onnxruntime` backend."""

    # Number of CPU threads; defaults to 10.
    cpu_num_threads: int = 10
class TensorRTConfig(BaseModel):
    """Configuration model for the `tensorrt` backend."""

    # Numeric precision; only "fp32" (default) and "fp16" are accepted.
    precision: Literal["fp32", "fp16"] = "fp32"
    # Whether dynamic shapes are used; enabled by default.
    use_dynamic_shapes: bool = True
    # Maps a string key to a list of integer lists; `None` means unspecified.
    # NOTE(review): presumably input name -> [min, opt, max] shapes -- confirm.
    dynamic_shapes: Optional[Dict[str, List[List[int]]]] = None
    # TODO: Control caching behavior
class OMConfig(BaseModel):
    """Configuration model for the `om` backend; currently declares no fields."""

    pass
class HPIConfig(BaseModel):
    """User-facing configuration consumed by the backend suggestion logic."""

    # Model name; populated from the input key `model_name` (pydantic alias).
    pdx_model_name: Annotated[str, Field(alias="model_name")]
    # Device type string; the suggestion logic in this file compares it
    # against "cpu", "gpu" and "npu".
    device_type: str
    # Optional device index; `None` when unspecified.
    device_id: Optional[int] = None
    # Enabled by default.
    # NOTE(review): presumably toggles automatic backend/config selection;
    # not read in the visible part of this file -- confirm at the call site.
    auto_config: bool = True
    # Explicitly requested backend; `None` lets the suggestion logic choose.
    backend: Optional[InferenceBackend] = None
    # Free-form backend options; merged over the suggested configuration, so
    # user-provided keys take precedence.
    backend_config: Optional[Dict[str, Any]] = None
    # Optional HPI information attached to this configuration.
    hpi_info: Optional[HPIInfo] = None
    # When true and the Paddle2ONNX plugin is available, a Paddle-format model
    # is treated as if an ONNX model were available too.
    auto_paddle2onnx: bool = True
    # TODO: Add more validation logic here
class ModelInfo(BaseModel):
    """Model name paired with its optional HPI information."""

    # Name of the model.
    name: str
    # HPI information for the model; `None` when unavailable.
    hpi_info: Optional[HPIInfo] = None
# Closed set of model format identifiers; the suggestion logic in this file
# tests these strings for membership in `model_paths`.
ModelFormat: TypeAlias = Literal["paddle", "onnx", "om"]
  70. @lru_cache(1)
  71. def _get_hpi_model_info_collection():
  72. with importlib.resources.open_text(
  73. __package__, "hpi_model_info_collection.json", encoding="utf-8"
  74. ) as f:
  75. hpi_model_info_collection = json.load(f)
  76. return hpi_model_info_collection
  77. @function_requires_deps("ultra-infer")
  78. def suggest_inference_backend_and_config(
  79. hpi_config: HPIConfig,
  80. model_paths: ModelPaths,
  81. ) -> Union[Tuple[InferenceBackend, Dict[str, Any]], Tuple[None, str]]:
  82. # TODO: The current strategy is naive. It would be better to consider
  83. # additional important factors, such as NVIDIA GPU compute capability and
  84. # device manufacturers. We should also allow users to provide hints.
  85. from ultra_infer import (
  86. is_built_with_om,
  87. is_built_with_openvino,
  88. is_built_with_ort,
  89. is_built_with_trt,
  90. )
  91. is_onnx_model_available = "onnx" in model_paths
  92. # TODO: Give a warning if the Paddle2ONNX plugin is not available but
  93. # can be used to select a better backend.
  94. if hpi_config.auto_paddle2onnx and is_paddle2onnx_plugin_available():
  95. is_onnx_model_available = is_onnx_model_available or "paddle" in model_paths
  96. available_backends = []
  97. if "paddle" in model_paths:
  98. available_backends.append("paddle")
  99. if (
  100. is_built_with_openvino()
  101. and is_onnx_model_available
  102. and hpi_config.device_type == "cpu"
  103. ):
  104. available_backends.append("openvino")
  105. if (
  106. is_built_with_ort()
  107. and is_onnx_model_available
  108. and hpi_config.device_type in ("cpu", "gpu")
  109. ):
  110. available_backends.append("onnxruntime")
  111. if (
  112. is_built_with_trt()
  113. and is_onnx_model_available
  114. and hpi_config.device_type == "gpu"
  115. ):
  116. available_backends.append("tensorrt")
  117. if is_built_with_om() and "om" in model_paths and hpi_config.device_type == "npu":
  118. available_backends.append("om")
  119. if not available_backends:
  120. return None, "No inference backends are available."
  121. if hpi_config.backend is not None and hpi_config.backend not in available_backends:
  122. return None, f"Inference backend {repr(hpi_config.backend)} is unavailable."
  123. paddle_version = get_paddle_version()
  124. if paddle_version[:3] >= (3, 1, 0):
  125. logging.debug(
  126. "Paddle version %s is not supported yet. The prior knowledge of Paddle 3.1.1 will be used.",
  127. paddle_version,
  128. )
  129. paddle_version = (3, 1, 1, None)
  130. if (3, 0) <= paddle_version[:2] <= (3, 1) and paddle_version[3] is None:
  131. if paddle_version[2] == 0:
  132. paddle_version = f"paddle{paddle_version[0]}{paddle_version[1]}"
  133. else:
  134. paddle_version = (
  135. f"paddle{paddle_version[0]}{paddle_version[1]}{paddle_version[2]}"
  136. )
  137. else:
  138. return (
  139. None,
  140. f"{paddle_version} is not a supported Paddle version.",
  141. )
  142. if hpi_config.device_type == "cpu":
  143. uname = platform.uname()
  144. arch = uname.machine.lower()
  145. if arch == "x86_64":
  146. key = "cpu_x64"
  147. else:
  148. return None, f"{repr(arch)} is not a supported architecture."
  149. elif hpi_config.device_type == "gpu":
  150. # TODO: Is it better to also check the runtime versions of CUDA and
  151. # cuDNN, and the versions of CUDA and cuDNN used to build `ultra-infer`?
  152. cuda_version = get_paddle_cuda_version()
  153. if not cuda_version:
  154. return None, "No CUDA version was found."
  155. cuda_version = cuda_version[0]
  156. key = f"gpu_cuda{cuda_version}"
  157. else:
  158. return None, f"{repr(hpi_config.device_type)} is not a supported device type."
  159. hpi_model_info_collection = _get_hpi_model_info_collection()
  160. if key not in hpi_model_info_collection:
  161. return None, "No prior knowledge can be utilized."
  162. hpi_model_info_collection_for_env = hpi_model_info_collection[key][paddle_version]
  163. if hpi_config.pdx_model_name not in hpi_model_info_collection_for_env:
  164. return None, f"{repr(hpi_config.pdx_model_name)} is not a known model."
  165. supported_pseudo_backends = hpi_model_info_collection_for_env[
  166. hpi_config.pdx_model_name
  167. ].copy()
  168. if not (is_mkldnn_available() and hpi_config.device_type == "cpu"):
  169. for pb in supported_pseudo_backends[:]:
  170. if pb.startswith("paddle_mkldnn"):
  171. supported_pseudo_backends.remove(pb)
  172. # XXX
  173. if not (
  174. USE_PIR_TRT
  175. and importlib.util.find_spec("tensorrt")
  176. and ctypes.util.find_library("nvinfer")
  177. and hpi_config.device_type == "gpu"
  178. ):
  179. for pb in supported_pseudo_backends[:]:
  180. if pb.startswith("paddle_tensorrt"):
  181. supported_pseudo_backends.remove(pb)
  182. supported_backends = []
  183. backend_to_pseudo_backends = defaultdict(list)
  184. for pb in supported_pseudo_backends:
  185. if pb.startswith("paddle"):
  186. backend = "paddle"
  187. elif pb.startswith("tensorrt"):
  188. backend = "tensorrt"
  189. else:
  190. backend = pb
  191. if available_backends is not None and backend not in available_backends:
  192. continue
  193. supported_backends.append(backend)
  194. backend_to_pseudo_backends[backend].append(pb)
  195. if not supported_backends:
  196. return None, "No inference backend can be selected."
  197. if hpi_config.backend is not None:
  198. if hpi_config.backend not in supported_backends:
  199. return (
  200. None,
  201. f"{repr(hpi_config.backend)} is not a supported inference backend.",
  202. )
  203. suggested_backend = hpi_config.backend
  204. else:
  205. # Prefer the first one.
  206. suggested_backend = supported_backends[0]
  207. pseudo_backends = backend_to_pseudo_backends[suggested_backend]
  208. if hpi_config.backend_config is not None:
  209. requested_base_pseudo_backend = None
  210. if suggested_backend == "paddle":
  211. if "run_mode" in hpi_config.backend_config:
  212. if hpi_config.backend_config["run_mode"].startswith("mkldnn"):
  213. requested_base_pseudo_backend = "paddle_mkldnn"
  214. elif hpi_config.backend_config["run_mode"].startswith("trt"):
  215. requested_base_pseudo_backend = "paddle_tensorrt"
  216. if requested_base_pseudo_backend:
  217. for pb in pseudo_backends:
  218. if pb.startswith(requested_base_pseudo_backend):
  219. break
  220. else:
  221. return None, "Unsupported backend configuration."
  222. pseudo_backend = pseudo_backends[0]
  223. suggested_backend_config = {}
  224. if suggested_backend == "paddle":
  225. assert pseudo_backend in (
  226. "paddle",
  227. "paddle_fp16",
  228. "paddle_mkldnn",
  229. "paddle_tensorrt",
  230. "paddle_tensorrt_fp16",
  231. ), pseudo_backend
  232. if pseudo_backend == "paddle":
  233. suggested_backend_config.update({"run_mode": "paddle"})
  234. elif pseudo_backend == "paddle_fp16":
  235. suggested_backend_config.update({"run_mode": "paddle_fp16"})
  236. elif pseudo_backend == "paddle_mkldnn":
  237. suggested_backend_config.update({"run_mode": "mkldnn"})
  238. elif pseudo_backend == "paddle_tensorrt":
  239. suggested_backend_config.update({"run_mode": "trt_fp32"})
  240. elif pseudo_backend == "paddle_tensorrt_fp16":
  241. # TODO: Check if the target device supports FP16.
  242. suggested_backend_config.update({"run_mode": "trt_fp16"})
  243. elif suggested_backend == "tensorrt":
  244. assert pseudo_backend in ("tensorrt", "tensorrt_fp16"), pseudo_backend
  245. if pseudo_backend == "tensorrt_fp16":
  246. suggested_backend_config.update({"precision": "fp16"})
  247. if hpi_config.backend_config is not None:
  248. suggested_backend_config.update(hpi_config.backend_config)
  249. return suggested_backend, suggested_backend_config