# quant_config.py
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
  6. from __future__ import annotations
  7. import copy
  8. import logging
  9. from pathlib import Path
  10. from typing import Any
  11. import numpy as np
  12. import onnx
  13. from ...calibrate import CalibrationDataReader, CalibrationMethod
  14. from ...quant_utils import QuantType
  15. from ...quantize import StaticQuantConfig
  16. from ...tensor_quant_overrides import TensorQuantOverridesHelper
  17. from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer
  18. Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
  19. Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
  20. Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
  21. OP_TYPES_TO_EXCLUDE = {"Cast"}
  22. MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB
  23. def warn_unable_to_override(
  24. node: onnx.NodeProto,
  25. what_str: str,
  26. tensor_name: str,
  27. io_kind: str,
  28. ):
  29. logging.warning(
  30. f"Unable to override {what_str} for {node.op_type} node's {io_kind} "
  31. "because it has already been overridden! Check the initial quantization overrides provided "
  32. "to get_qnn_qdq_config() if the generated QDQ model does not run on QNN EP. "
  33. f"Node name: {node.name}, {io_kind} name: {tensor_name}"
  34. )
  35. def get_qnn_qdq_config(
  36. model_input: str | Path | onnx.ModelProto,
  37. calibration_data_reader: CalibrationDataReader,
  38. calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
  39. activation_type: QuantType = QuantType.QUInt8,
  40. weight_type: QuantType = QuantType.QUInt8,
  41. per_channel: bool = False,
  42. init_overrides: dict[str, list[dict[str, Any]]] | None = None,
  43. add_qtype_converts: bool = True,
  44. activation_symmetric: bool = False,
  45. weight_symmetric: bool | None = None,
  46. keep_removable_activations: bool = False,
  47. stride: int | None = None,
  48. calibration_providers: list[str] | None = None,
  49. op_types_to_quantize: list[str] | None = None,
  50. nodes_to_exclude: list[str] | None = None,
  51. ) -> StaticQuantConfig:
  52. """
  53. Returns a static quantization configuration suitable for running QDQ models on QNN EP.
  54. This is done primarily by setting tensor-level quantization overrides.
  55. Params:
  56. model_input: Path to the input model file or ModelProto.
  57. calibration_data_reader: Calibration data reader.
  58. calibrate_methode: The calibration method. Defaults to MinMax.
  59. activation_type: The default activation quantization type. Defaults to QUInt8.
  60. weight_type: The default weight quantization type. Defaults to QUInt8.
  61. per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
  62. Defaults to false. Alternatively, use the tensor-level `init_overrides` to select individual operators
  63. and their quantization axes.
  64. If set, the quantization tool uses per-channel quantization for the following operator types and inputs:
  65. - Conv:
  66. - input[1] on axis 0
  67. - input[2] (bias) on axis 0
  68. - ConvTranspose:
  69. - input[1] on axis 1
  70. - input[2] (bias) on axis 0
  71. init_overrides: Initial tensor-level quantization overrides. Defaults to None. This function updates of a copy
  72. of these overrides with any necessary adjustments and includes them in the returned
  73. configuration object (i.e., config.extra_options['TensorQuantOverrides']).
  74. The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
  75. contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
  76. each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
  77. key must be present in the first dictionary for per-channel quantization.
  78. Each dictionary contains optional overrides with the following keys and values.
  79. 'quant_type' = QuantType : The tensor's quantization data type.
  80. 'axis' = Int : The per-channel axis. Must be present for per-channel weights.
  81. 'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
  82. 'zero_point' = Int : The zero-point value to use. Must also specify `scale` is set.
  83. 'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
  84. set `scale` or `zero_point`.
  85. 'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
  86. set `scale` or `zero_point`. Only valid for initializers.
  87. 'rmax' = Float : Override the maximum real tensor value in calibration data.
  88. Invalid if also set `scale` or `zero_point`.
  89. 'rmin' = Float : Override the minimum real tensor value in calibration data.
  90. Invalid if also set `scale` or `zero_point`.
  91. 'convert' = Dict : A nested dictionary with the same keys for an activation
  92. tensor that should be converted to another quantization type.
  93. 'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
  94. other nodes get the original type. If not specified,
  95. assume all consumer nodes get the converted type.
  96. add_qtype_converts: True if this function should automatically add "convert" entries to the provided
  97. `init_overrides` to ensure that operators use valid input/output types (activations only).
  98. Ex: if you override the output of an Add to 16-bit, this option ensures that the activation inputs
  99. of the Add are also up-converted to 16-bit and that data types for surrounding ops are converted
  100. appropriately. Refer to the documentation in mixed_precision_overrides_utils.py for additional details.
  101. activation_symmetric: True if activations should be quantized symmetrically (i.e, rmax == -rmin) by default.
  102. Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uin16,
  103. the zero-point values are 128 and 32,768, respectively.
  104. weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
  105. Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
  106. keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
  107. be removed, and will be explicitly represented in the QDQ model. If false, these activations
  108. are automatically removed if activations are asymmetrically quantized. Keeping these activations
  109. is necessary if optimizations or EP transformations will later remove
  110. QuantizeLinear/DequantizeLinear operators from the model.
  111. calibration_providers: Execution providers to run the session during calibration. Default is None which uses
  112. [ "CPUExecutionProvider" ].
  113. op_types_to_quantize: If set to None, all operator types will be quantized except for OP_TYPES_TO_EXCLUDE
  114. nodes_to_exclude: List of nodes names to exclude from quantization. The nodes in this list will be excluded from
  115. quantization when it is not None.
  116. Returns:
  117. A StaticQuantConfig object
  118. """
  119. if weight_symmetric is None:
  120. weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16}
  121. model = (
  122. model_input
  123. if isinstance(model_input, onnx.ModelProto)
  124. else onnx.load_model(model_input, load_external_data=False)
  125. )
  126. op_types = set()
  127. model_has_external_data = False
  128. name_to_initializer = {}
  129. # Build map of initializers (name -> initializer) and
  130. # check if the model has external data.
  131. for initializer in model.graph.initializer:
  132. name_to_initializer[initializer.name] = initializer
  133. if onnx.external_data_helper.uses_external_data(initializer):
  134. model_has_external_data = True
  135. overrides_helper = TensorQuantOverridesHelper(copy.deepcopy(init_overrides) if init_overrides else {})
  136. if not overrides_helper.empty() and add_qtype_converts:
  137. # Fix mixed-precision overrides.
  138. overrides_fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(
  139. overrides_helper, model, activation_type
  140. )
  141. overrides_fixer.apply(activation_type, activation_symmetric)
  142. # Setup quantization overrides for specific operator types to ensure compatibility with QNN EP.
  143. qnn_compat = QnnCompatibilityOverrides(
  144. activation_type,
  145. weight_type,
  146. activation_symmetric,
  147. weight_symmetric,
  148. per_channel,
  149. overrides_helper,
  150. name_to_initializer,
  151. )
  152. op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
  153. nodes_to_exclude_set = set(nodes_to_exclude) if nodes_to_exclude else None
  154. for node in model.graph.node:
  155. if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
  156. continue
  157. if nodes_to_exclude_set and node.name in nodes_to_exclude_set:
  158. continue
  159. op_types.add(node.op_type)
  160. qnn_compat.process_node(node)
  161. extra_options = {
  162. "MinimumRealRange": 0.0001,
  163. "DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
  164. "QDQKeepRemovableActivations": keep_removable_activations,
  165. "TensorQuantOverrides": overrides_helper.get_dict(),
  166. "ActivationSymmetric": activation_symmetric,
  167. "WeightSymmetric": weight_symmetric,
  168. "CalibStridedMinMax": stride,
  169. }
  170. # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
  171. # on Q/DQ operators if using 16-bit or 4-bit quantization.
  172. onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
  173. if onnx_opset.version < 21:
  174. opset21_types = Q16_TYPES.union(Q4_TYPES)
  175. overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
  176. if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
  177. extra_options["UseQDQContribOps"] = True
  178. return StaticQuantConfig(
  179. calibration_data_reader,
  180. calibrate_method=calibrate_method,
  181. activation_type=activation_type,
  182. weight_type=weight_type,
  183. op_types_to_quantize=(
  184. op_types_to_quantize if op_types_to_quantize else list(op_types.difference(OP_TYPES_TO_EXCLUDE))
  185. ),
  186. nodes_to_exclude=nodes_to_exclude,
  187. per_channel=per_channel,
  188. use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
  189. calibration_providers=calibration_providers,
  190. extra_options=extra_options,
  191. )
  192. class QnnCompatibilityOverrides:
  193. """
  194. Helper that processes nodes to generate quantization overrides that make the resulting QDQ model
  195. compatible with QNN EP.
  196. """
  197. def __init__(
  198. self,
  199. default_activation_qtype: QuantType,
  200. default_weight_qtype: QuantType,
  201. activation_symmetric: bool,
  202. weight_symmetric: bool,
  203. per_channel: bool,
  204. overrides: TensorQuantOverridesHelper,
  205. initializers: dict[str, onnx.TensorProto],
  206. ):
  207. self.default_activation_qtype = default_activation_qtype
  208. self.default_weight_qtype = default_weight_qtype
  209. self.activation_symmetric = activation_symmetric
  210. self.weight_symmetric = weight_symmetric
  211. self.per_channel = per_channel
  212. self.overrides = overrides
  213. self.initializers = initializers
  214. self.process_fns = {
  215. "MatMul": self._process_matmul,
  216. "LayerNormalization": self._process_layernorm,
  217. "Sigmoid": self._process_sigmoid,
  218. "Tanh": self._process_tanh,
  219. }
  220. def process_node(self, node: onnx.NodeProto):
  221. process_fn = self.process_fns.get(node.op_type)
  222. if process_fn is not None:
  223. process_fn(node)
  224. def _make_static_inputs_use_default_weight_type(self, node: onnx.NodeProto):
  225. """
  226. Overrides initializer input(s) to use the default weight type if:
  227. - The default weight type is 8-bit
  228. - One of the inputs is a 16-bit activation
  229. - The other input is an initializer (per-tensor quantized)
  230. This is necessary because the quantization tool does not assign MatMul or LayerNorm initializer
  231. inputs the default weight type. Instead, it assigns the default activation type.
  232. """
  233. if self.default_weight_qtype not in Q8_TYPES:
  234. return
  235. input_16bit_act_name = None
  236. input_weight_name = None
  237. # Loop through first 2 inputs to find a 16-bit activation and a (per-tensor) weight.
  238. for i in range(2):
  239. input_name = node.input[i]
  240. if not input_name:
  241. continue
  242. is_weight = input_name in self.initializers
  243. qtype_info = self.overrides.get_node_input_qtype_info(
  244. input_name,
  245. node.name,
  246. default_qtype=None if is_weight else self.default_activation_qtype,
  247. )
  248. if qtype_info.axis is not None:
  249. return # Don't process MatMul with a per-channel quantized input.
  250. if (
  251. is_weight
  252. and qtype_info.quant_type == self.default_weight_qtype
  253. and qtype_info.symmetric == self.weight_symmetric
  254. ):
  255. return # Return. Weight is already overridden to use the desired weight type.
  256. if is_weight:
  257. input_weight_name = input_name
  258. elif qtype_info.quant_type in Q16_TYPES:
  259. input_16bit_act_name = input_name
  260. # Override initializer input to use the default weight type.
  261. if input_16bit_act_name and input_weight_name:
  262. did_update = self.overrides.update_tensor_overrides(
  263. input_weight_name,
  264. {"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
  265. overwrite=False,
  266. )
  267. if not did_update:
  268. warn_unable_to_override(node, "quant_type/symmetric", input_weight_name, "input weight")
  269. def _process_matmul(self, node: onnx.NodeProto):
  270. assert node.op_type == "MatMul", f"Expected MatMul, but got {node.op_type}"
  271. if not self.per_channel:
  272. self._make_static_inputs_use_default_weight_type(node)
  273. return
  274. # QNN does not support per-channel MatMul. However, the ORT quantization tool attempts to use per-channel
  275. # quantization for MatMul by default *if* the global per_channel setting is enabled. So, we need to
  276. # provide explicit per-tensor quantization overrides for MatMul if per_channel is enabled and
  277. # the user did not provide any other overrides.
  278. for input_name in node.input:
  279. is_weight_no_overrides = input_name in self.initializers and input_name not in self.overrides
  280. if is_weight_no_overrides:
  281. self.overrides.update_tensor_overrides(
  282. input_name,
  283. {"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
  284. )
  285. def _process_layernorm(self, node: onnx.NodeProto):
  286. assert node.op_type == "LayerNormalization", f"Expected LayerNormalization, but got {node.op_type}"
  287. if not self.per_channel:
  288. self._make_static_inputs_use_default_weight_type(node)
  289. return
  290. has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
  291. has_bias_no_overrides = (
  292. len(node.input) > 2
  293. and node.input[2]
  294. and node.input[2] in self.initializers
  295. and node.input[2] not in self.overrides
  296. )
  297. if has_weight_no_overrides or has_bias_no_overrides:
  298. # TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
  299. # tries to makes it per-channel if the weight is also per-channel.
  300. raise ValueError(
  301. "get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
  302. " Please try using custom overrides that make bias per-tensor quantized."
  303. )
  304. def _process_sigmoid(self, node: onnx.NodeProto):
  305. """
  306. Overrides 16-bit Sigmoid's output scale and zero-point as per QNN requirements.
  307. """
  308. assert node.op_type == "Sigmoid", f"Expected Sigmoid, but got {node.op_type}"
  309. output_type = self.overrides.get_node_output_qtype_info(
  310. node.output[0], self.default_activation_qtype
  311. ).quant_type
  312. if output_type == QuantType.QUInt16:
  313. self.overrides.update_tensor_overrides(
  314. node.output[0],
  315. {
  316. "quant_type": output_type,
  317. "scale": np.array(1.0 / 65536.0, dtype=np.float32),
  318. "zero_point": np.array(0, dtype=np.uint16),
  319. },
  320. )
  321. elif output_type == QuantType.QInt16:
  322. self.overrides.update_tensor_overrides(
  323. node.output[0],
  324. {
  325. "quant_type": output_type,
  326. "scale": np.array(1.0 / 32768.0, dtype=np.float32),
  327. "zero_point": np.array(0, dtype=np.int16),
  328. },
  329. )
  330. def _process_tanh(self, node: onnx.NodeProto):
  331. """
  332. Overrides 16-bit Tanh's output scale and zero-point as per QNN requirements.
  333. """
  334. assert node.op_type == "Tanh", f"Expected Tanh, but got {node.op_type}"
  335. output_type = self.overrides.get_node_output_qtype_info(
  336. node.output[0], self.default_activation_qtype
  337. ).quant_type
  338. if output_type == QuantType.QUInt16:
  339. self.overrides.update_tensor_overrides(
  340. node.output[0],
  341. {
  342. "quant_type": output_type,
  343. "scale": np.array(1.0 / 32768.0, dtype=np.float32),
  344. "zero_point": np.array(32768, dtype=np.uint16),
  345. },
  346. )
  347. elif output_type == QuantType.QInt16:
  348. self.overrides.update_tensor_overrides(
  349. node.output[0],
  350. {
  351. "quant_type": output_type,
  352. "scale": np.array(1.0 / 32768.0, dtype=np.float32),
  353. "zero_point": np.array(0, dtype=np.int16),
  354. },
  355. )