| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406 |
- # -------------------------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License. See License.txt in the project root for
- # license information.
- # --------------------------------------------------------------------------
- from __future__ import annotations
- import copy
- import logging
- from pathlib import Path
- from typing import Any
- import numpy as np
- import onnx
- from ...calibrate import CalibrationDataReader, CalibrationMethod
- from ...quant_utils import QuantType
- from ...quantize import StaticQuantConfig
- from ...tensor_quant_overrides import TensorQuantOverridesHelper
- from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer
- Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
- Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
- Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
- OP_TYPES_TO_EXCLUDE = {"Cast"}
- MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB
- def warn_unable_to_override(
- node: onnx.NodeProto,
- what_str: str,
- tensor_name: str,
- io_kind: str,
- ):
- logging.warning(
- f"Unable to override {what_str} for {node.op_type} node's {io_kind} "
- "because it has already been overridden! Check the initial quantization overrides provided "
- "to get_qnn_qdq_config() if the generated QDQ model does not run on QNN EP. "
- f"Node name: {node.name}, {io_kind} name: {tensor_name}"
- )
- def get_qnn_qdq_config(
- model_input: str | Path | onnx.ModelProto,
- calibration_data_reader: CalibrationDataReader,
- calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
- activation_type: QuantType = QuantType.QUInt8,
- weight_type: QuantType = QuantType.QUInt8,
- per_channel: bool = False,
- init_overrides: dict[str, list[dict[str, Any]]] | None = None,
- add_qtype_converts: bool = True,
- activation_symmetric: bool = False,
- weight_symmetric: bool | None = None,
- keep_removable_activations: bool = False,
- stride: int | None = None,
- calibration_providers: list[str] | None = None,
- op_types_to_quantize: list[str] | None = None,
- nodes_to_exclude: list[str] | None = None,
- ) -> StaticQuantConfig:
- """
- Returns a static quantization configuration suitable for running QDQ models on QNN EP.
- This is done primarily by setting tensor-level quantization overrides.
- Params:
- model_input: Path to the input model file or ModelProto.
- calibration_data_reader: Calibration data reader.
- calibrate_methode: The calibration method. Defaults to MinMax.
- activation_type: The default activation quantization type. Defaults to QUInt8.
- weight_type: The default weight quantization type. Defaults to QUInt8.
- per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
- Defaults to false. Alternatively, use the tensor-level `init_overrides` to select individual operators
- and their quantization axes.
- If set, the quantization tool uses per-channel quantization for the following operator types and inputs:
- - Conv:
- - input[1] on axis 0
- - input[2] (bias) on axis 0
- - ConvTranspose:
- - input[1] on axis 1
- - input[2] (bias) on axis 0
- init_overrides: Initial tensor-level quantization overrides. Defaults to None. This function updates of a copy
- of these overrides with any necessary adjustments and includes them in the returned
- configuration object (i.e., config.extra_options['TensorQuantOverrides']).
- The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
- contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
- each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
- key must be present in the first dictionary for per-channel quantization.
- Each dictionary contains optional overrides with the following keys and values.
- 'quant_type' = QuantType : The tensor's quantization data type.
- 'axis' = Int : The per-channel axis. Must be present for per-channel weights.
- 'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
- 'zero_point' = Int : The zero-point value to use. Must also specify `scale` is set.
- 'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
- set `scale` or `zero_point`.
- 'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
- set `scale` or `zero_point`. Only valid for initializers.
- 'rmax' = Float : Override the maximum real tensor value in calibration data.
- Invalid if also set `scale` or `zero_point`.
- 'rmin' = Float : Override the minimum real tensor value in calibration data.
- Invalid if also set `scale` or `zero_point`.
- 'convert' = Dict : A nested dictionary with the same keys for an activation
- tensor that should be converted to another quantization type.
- 'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
- other nodes get the original type. If not specified,
- assume all consumer nodes get the converted type.
- add_qtype_converts: True if this function should automatically add "convert" entries to the provided
- `init_overrides` to ensure that operators use valid input/output types (activations only).
- Ex: if you override the output of an Add to 16-bit, this option ensures that the activation inputs
- of the Add are also up-converted to 16-bit and that data types for surrounding ops are converted
- appropriately. Refer to the documentation in mixed_precision_overrides_utils.py for additional details.
- activation_symmetric: True if activations should be quantized symmetrically (i.e, rmax == -rmin) by default.
- Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uin16,
- the zero-point values are 128 and 32,768, respectively.
- weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
- Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
- keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
- be removed, and will be explicitly represented in the QDQ model. If false, these activations
- are automatically removed if activations are asymmetrically quantized. Keeping these activations
- is necessary if optimizations or EP transformations will later remove
- QuantizeLinear/DequantizeLinear operators from the model.
- calibration_providers: Execution providers to run the session during calibration. Default is None which uses
- [ "CPUExecutionProvider" ].
- op_types_to_quantize: If set to None, all operator types will be quantized except for OP_TYPES_TO_EXCLUDE
- nodes_to_exclude: List of nodes names to exclude from quantization. The nodes in this list will be excluded from
- quantization when it is not None.
- Returns:
- A StaticQuantConfig object
- """
- if weight_symmetric is None:
- weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16}
- model = (
- model_input
- if isinstance(model_input, onnx.ModelProto)
- else onnx.load_model(model_input, load_external_data=False)
- )
- op_types = set()
- model_has_external_data = False
- name_to_initializer = {}
- # Build map of initializers (name -> initializer) and
- # check if the model has external data.
- for initializer in model.graph.initializer:
- name_to_initializer[initializer.name] = initializer
- if onnx.external_data_helper.uses_external_data(initializer):
- model_has_external_data = True
- overrides_helper = TensorQuantOverridesHelper(copy.deepcopy(init_overrides) if init_overrides else {})
- if not overrides_helper.empty() and add_qtype_converts:
- # Fix mixed-precision overrides.
- overrides_fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(
- overrides_helper, model, activation_type
- )
- overrides_fixer.apply(activation_type, activation_symmetric)
- # Setup quantization overrides for specific operator types to ensure compatibility with QNN EP.
- qnn_compat = QnnCompatibilityOverrides(
- activation_type,
- weight_type,
- activation_symmetric,
- weight_symmetric,
- per_channel,
- overrides_helper,
- name_to_initializer,
- )
- op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
- nodes_to_exclude_set = set(nodes_to_exclude) if nodes_to_exclude else None
- for node in model.graph.node:
- if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
- continue
- if nodes_to_exclude_set and node.name in nodes_to_exclude_set:
- continue
- op_types.add(node.op_type)
- qnn_compat.process_node(node)
- extra_options = {
- "MinimumRealRange": 0.0001,
- "DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
- "QDQKeepRemovableActivations": keep_removable_activations,
- "TensorQuantOverrides": overrides_helper.get_dict(),
- "ActivationSymmetric": activation_symmetric,
- "WeightSymmetric": weight_symmetric,
- "CalibStridedMinMax": stride,
- }
- # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
- # on Q/DQ operators if using 16-bit or 4-bit quantization.
- onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
- if onnx_opset.version < 21:
- opset21_types = Q16_TYPES.union(Q4_TYPES)
- overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
- if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
- extra_options["UseQDQContribOps"] = True
- return StaticQuantConfig(
- calibration_data_reader,
- calibrate_method=calibrate_method,
- activation_type=activation_type,
- weight_type=weight_type,
- op_types_to_quantize=(
- op_types_to_quantize if op_types_to_quantize else list(op_types.difference(OP_TYPES_TO_EXCLUDE))
- ),
- nodes_to_exclude=nodes_to_exclude,
- per_channel=per_channel,
- use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
- calibration_providers=calibration_providers,
- extra_options=extra_options,
- )
- class QnnCompatibilityOverrides:
- """
- Helper that processes nodes to generate quantization overrides that make the resulting QDQ model
- compatible with QNN EP.
- """
- def __init__(
- self,
- default_activation_qtype: QuantType,
- default_weight_qtype: QuantType,
- activation_symmetric: bool,
- weight_symmetric: bool,
- per_channel: bool,
- overrides: TensorQuantOverridesHelper,
- initializers: dict[str, onnx.TensorProto],
- ):
- self.default_activation_qtype = default_activation_qtype
- self.default_weight_qtype = default_weight_qtype
- self.activation_symmetric = activation_symmetric
- self.weight_symmetric = weight_symmetric
- self.per_channel = per_channel
- self.overrides = overrides
- self.initializers = initializers
- self.process_fns = {
- "MatMul": self._process_matmul,
- "LayerNormalization": self._process_layernorm,
- "Sigmoid": self._process_sigmoid,
- "Tanh": self._process_tanh,
- }
- def process_node(self, node: onnx.NodeProto):
- process_fn = self.process_fns.get(node.op_type)
- if process_fn is not None:
- process_fn(node)
- def _make_static_inputs_use_default_weight_type(self, node: onnx.NodeProto):
- """
- Overrides initializer input(s) to use the default weight type if:
- - The default weight type is 8-bit
- - One of the inputs is a 16-bit activation
- - The other input is an initializer (per-tensor quantized)
- This is necessary because the quantization tool does not assign MatMul or LayerNorm initializer
- inputs the default weight type. Instead, it assigns the default activation type.
- """
- if self.default_weight_qtype not in Q8_TYPES:
- return
- input_16bit_act_name = None
- input_weight_name = None
- # Loop through first 2 inputs to find a 16-bit activation and a (per-tensor) weight.
- for i in range(2):
- input_name = node.input[i]
- if not input_name:
- continue
- is_weight = input_name in self.initializers
- qtype_info = self.overrides.get_node_input_qtype_info(
- input_name,
- node.name,
- default_qtype=None if is_weight else self.default_activation_qtype,
- )
- if qtype_info.axis is not None:
- return # Don't process MatMul with a per-channel quantized input.
- if (
- is_weight
- and qtype_info.quant_type == self.default_weight_qtype
- and qtype_info.symmetric == self.weight_symmetric
- ):
- return # Return. Weight is already overridden to use the desired weight type.
- if is_weight:
- input_weight_name = input_name
- elif qtype_info.quant_type in Q16_TYPES:
- input_16bit_act_name = input_name
- # Override initializer input to use the default weight type.
- if input_16bit_act_name and input_weight_name:
- did_update = self.overrides.update_tensor_overrides(
- input_weight_name,
- {"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
- overwrite=False,
- )
- if not did_update:
- warn_unable_to_override(node, "quant_type/symmetric", input_weight_name, "input weight")
- def _process_matmul(self, node: onnx.NodeProto):
- assert node.op_type == "MatMul", f"Expected MatMul, but got {node.op_type}"
- if not self.per_channel:
- self._make_static_inputs_use_default_weight_type(node)
- return
- # QNN does not support per-channel MatMul. However, the ORT quantization tool attempts to use per-channel
- # quantization for MatMul by default *if* the global per_channel setting is enabled. So, we need to
- # provide explicit per-tensor quantization overrides for MatMul if per_channel is enabled and
- # the user did not provide any other overrides.
- for input_name in node.input:
- is_weight_no_overrides = input_name in self.initializers and input_name not in self.overrides
- if is_weight_no_overrides:
- self.overrides.update_tensor_overrides(
- input_name,
- {"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
- )
- def _process_layernorm(self, node: onnx.NodeProto):
- assert node.op_type == "LayerNormalization", f"Expected LayerNormalization, but got {node.op_type}"
- if not self.per_channel:
- self._make_static_inputs_use_default_weight_type(node)
- return
- has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
- has_bias_no_overrides = (
- len(node.input) > 2
- and node.input[2]
- and node.input[2] in self.initializers
- and node.input[2] not in self.overrides
- )
- if has_weight_no_overrides or has_bias_no_overrides:
- # TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
- # tries to makes it per-channel if the weight is also per-channel.
- raise ValueError(
- "get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
- " Please try using custom overrides that make bias per-tensor quantized."
- )
- def _process_sigmoid(self, node: onnx.NodeProto):
- """
- Overrides 16-bit Sigmoid's output scale and zero-point as per QNN requirements.
- """
- assert node.op_type == "Sigmoid", f"Expected Sigmoid, but got {node.op_type}"
- output_type = self.overrides.get_node_output_qtype_info(
- node.output[0], self.default_activation_qtype
- ).quant_type
- if output_type == QuantType.QUInt16:
- self.overrides.update_tensor_overrides(
- node.output[0],
- {
- "quant_type": output_type,
- "scale": np.array(1.0 / 65536.0, dtype=np.float32),
- "zero_point": np.array(0, dtype=np.uint16),
- },
- )
- elif output_type == QuantType.QInt16:
- self.overrides.update_tensor_overrides(
- node.output[0],
- {
- "quant_type": output_type,
- "scale": np.array(1.0 / 32768.0, dtype=np.float32),
- "zero_point": np.array(0, dtype=np.int16),
- },
- )
- def _process_tanh(self, node: onnx.NodeProto):
- """
- Overrides 16-bit Tanh's output scale and zero-point as per QNN requirements.
- """
- assert node.op_type == "Tanh", f"Expected Tanh, but got {node.op_type}"
- output_type = self.overrides.get_node_output_qtype_info(
- node.output[0], self.default_activation_qtype
- ).quant_type
- if output_type == QuantType.QUInt16:
- self.overrides.update_tensor_overrides(
- node.output[0],
- {
- "quant_type": output_type,
- "scale": np.array(1.0 / 32768.0, dtype=np.float32),
- "zero_point": np.array(32768, dtype=np.uint16),
- },
- )
- elif output_type == QuantType.QInt16:
- self.overrides.update_tensor_overrides(
- node.output[0],
- {
- "quant_type": output_type,
- "scale": np.array(1.0 / 32768.0, dtype=np.float32),
- "zero_point": np.array(0, dtype=np.int16),
- },
- )
|