| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551 |
- # Copyright 2020 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import warnings
- from argparse import ArgumentParser
- from os import listdir, makedirs
- from pathlib import Path
- from typing import Optional
- from packaging.version import Version, parse
- from transformers.pipelines import Pipeline, pipeline
- from transformers.tokenization_utils import BatchEncoding
- from transformers.utils import ModelOutput, is_tf_available, is_torch_available
# Oldest onnxruntime release accepted by the quantization path; the script
# entry point hands this value to check_onnxruntime_requirements().
ORT_QUANTIZE_MINIMUM_VERSION = Version("1.4.0")
# Pipeline tasks accepted by the `--pipeline` command-line option.
SUPPORTED_PIPELINES = [
    "feature-extraction",
    "ner",
    "sentiment-analysis",
    "fill-mask",
    "question-answering",
    "text-generation",
    # English -> {French, German, Romanian} translation tasks
    *(f"translation_en_to_{target}" for target in ("fr", "de", "ro")),
]
class OnnxConverterArgumentParser(ArgumentParser):
    """
    Command-line parser gathering every option of the transformers -> ONNX export script.

    Options: `--pipeline`, `--model` (required), `--tokenizer`, `--framework`, `--opset`, the boolean switches
    `--check-loading`, `--use-external-format`, `--quantize`, and the positional `output` path.
    """

    def __init__(self):
        super().__init__("ONNX Converter")

        # What to load: pipeline task, checkpoint, tokenizer and backing framework
        self.add_argument("--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction")
        self.add_argument(
            "--model", type=str, required=True, help="Model's id or path (ex: google-bert/bert-base-cased)"
        )
        self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: google-bert/bert-base-cased)")
        self.add_argument("--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model")
        self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")

        # Boolean switches, all False unless present on the command line
        switches = (
            ("--check-loading", "Check ONNX is able to load the model"),
            ("--use-external-format", "Allow exporting model >= than 2Gb"),
            ("--quantize", "Quantize the neural network to be run with int8"),
        )
        for flag, description in switches:
            self.add_argument(flag, action="store_true", help=description)

        # Positional: where the exported graph is written
        self.add_argument("output")
def generate_identified_filename(filename: Path, identifier: str) -> Path:
    """
    Append a string-identifier at the end (before the extension, if any) to the provided filepath

    Args:
        filename: pathlib.Path The actual path object we would like to add an identifier suffix
        identifier: The suffix to add

    Returns: Path with the identifier inserted between the file stem and its (last) extension
    """
    # Build the new name explicitly rather than through `with_suffix`: the latter treats everything after the
    # last dot of `stem + identifier` as a suffix, which drops the identifier for names such as "model.v1.onnx"
    # or for identifiers that contain a dot.
    return filename.parent / f"{filename.stem}{identifier}{filename.suffix}"
def check_onnxruntime_requirements(minimum_version: Version):
    """
    Check onnxruntime is installed and if the installed version is recent enough

    Args:
        minimum_version: The oldest onnxruntime version that is accepted

    Raises:
        ImportError: If onnxruntime is not installed or too old version is found
    """
    # Only the import itself is guarded: the "too old" ImportError raised below must not be caught here,
    # otherwise it would be reported as "onnxruntime is not installed".
    try:
        import onnxruntime
    except ImportError as err:
        raise ImportError(
            "onnxruntime doesn't seem to be currently installed. "
            "Please install the onnxruntime by running `pip install onnxruntime`"
            " and relaunch the conversion."
        ) from err

    # Parse the version of the installed onnxruntime
    ort_version = parse(onnxruntime.__version__)

    # Compare against the version requested by the caller
    if ort_version < minimum_version:
        raise ImportError(
            f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
            f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
            "Please update onnxruntime by running `pip install --upgrade onnxruntime`"
        )
def ensure_valid_input(model, tokens, input_names):
    """
    Reorder the generated inputs so they follow the order of the variable names of `model.forward`

    The names are read from `model.forward.__code__.co_varnames` (first entry, "self", skipped) and the walk
    stops at the first name that is not part of `input_names`.

    Args:
        model: The model used to forward the input data
        tokens: BatchEncoding holding the input data
        input_names: The name of the inputs

    Returns: Tuple (ordered input names, tuple of the matching values taken from `tokens`)
    """
    print("Ensuring inputs are in correct order")

    # Drop the leading "self" entry
    forward_names = model.forward.__code__.co_varnames[1:]

    ordered_input_names = []
    model_args = []
    for candidate in forward_names:
        if candidate not in input_names:
            # First gap ends the walk: only a contiguous leading run of names is kept
            print(f"{candidate} is not present in the generated input list.")
            break
        ordered_input_names.append(candidate)
        model_args.append(tokens[candidate])

    print(f"Generated inputs order: {ordered_input_names}")
    return ordered_input_names, tuple(model_args)
def infer_shapes(nlp: Pipeline, framework: str) -> tuple[list[str], list[str], dict, BatchEncoding]:
    """
    Run the pipeline's model on one sample sentence and guess which axes of every input/output are dynamic

    Args:
        nlp: The pipeline object holding the model to be exported
        framework: The framework identifier to dispatch to the correct inference scheme (pt/tf)

    Returns:

        - List of the inferred input variable names
        - List of the inferred output variable names
        - Dictionary with input/output variables names as key and the inferred {axis: label} mapping as value
        - a BatchEncoding reference which was used to infer all the above information
    """

    def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
        # Grouped tensors are handled element by element
        if isinstance(tensor, (tuple, list)):
            return [build_shape_dict(name, item, is_input, seq_len) for item in tensor]

        shape = tensor.shape

        # Heuristic: the first axis holding a single element is taken as the batch axis
        unit_axes = [axis for axis, numel in enumerate(shape) if numel == 1]
        axes = {unit_axes[0]: "batch"}

        if is_input:
            # Inputs are expected to be 2-dimensional, with axis 1 labelled "sequence"
            if len(shape) != 2:
                raise ValueError(f"Unable to infer tensor axes ({len(shape)})")
            axes[1] = "sequence"
        else:
            # Any output axis whose size equals the sample's sequence length is labelled "sequence"
            for dim, size in enumerate(shape):
                if size == seq_len:
                    axes[dim] = "sequence"

        print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
        return axes

    tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
    seq_len = tokens.input_ids.shape[-1]

    # "pt" models receive the encoding as keyword arguments, otherwise it is passed as a single argument
    if framework == "pt":
        outputs = nlp.model(**tokens)
    else:
        outputs = nlp.model(tokens)

    if isinstance(outputs, ModelOutput):
        outputs = outputs.to_tuple()
    if not isinstance(outputs, (list, tuple)):
        outputs = (outputs,)

    # Generate input names & axes
    input_vars = list(tokens.keys())
    input_dynamic_axes = {}
    for input_name, input_tensor in tokens.items():
        input_dynamic_axes[input_name] = build_shape_dict(input_name, input_tensor, True, seq_len)

    # Flatten potentially grouped outputs (past for gpt2, attentions), one nesting level only
    outputs_flat = []
    for output in outputs:
        grouped = isinstance(output, (tuple, list))
        outputs_flat.extend(output if grouped else [output])

    # Generate output names & axes
    output_names = [f"output_{index}" for index, _ in enumerate(outputs_flat)]
    output_dynamic_axes = {}
    for output_name, output_tensor in zip(output_names, outputs_flat):
        output_dynamic_axes[output_name] = build_shape_dict(output_name, output_tensor, False, seq_len)

    # Create the aggregated axes representation (output entries win on a name clash)
    dynamic_axes = {**input_dynamic_axes, **output_dynamic_axes}
    return input_vars, output_names, dynamic_axes, tokens
def load_graph_from_args(
    pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs
) -> Pipeline:
    """
    Build the pipeline (tokenizer + model) described by the arguments provided through the CLI

    Args:
        pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
        framework: The framework the pipeline is loaded with ("pt" or "tf")
        model: The model name which will be loaded by the pipeline
        tokenizer: The tokenizer name which will be loaded by the pipeline, default to the model's value
        models_kwargs: Extra keyword arguments, forwarded to `pipeline` as `model_kwargs`

    Returns: Pipeline object

    Raises:
        Exception: If the requested framework is not installed
    """
    # The tokenizer defaults to the model's own identifier
    tokenizer = model if tokenizer is None else tokenizer

    # Check the wanted framework is available
    if framework == "pt":
        if not is_torch_available():
            raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
    elif framework == "tf":
        if not is_tf_available():
            raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")

    # Allocate tokenizer and model
    return pipeline(
        pipeline_name,
        model=model,
        tokenizer=tokenizer,
        framework=framework,
        model_kwargs=models_kwargs,
    )
def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
    """
    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where will be stored the generated ONNX model
        use_external_format: Split the model definition from its parameters to allow model bigger than 2GB
            (accepted for interface compatibility; this function does not forward it to the exporter)

    Returns:

    Raises:
        Exception: If PyTorch is not installed
    """
    if not is_torch_available():
        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")

    import torch
    from torch.onnx import export

    print(f"Using framework PyTorch: {torch.__version__}")

    # Shape inference and tracing both run the model: keep autograd disabled throughout
    with torch.no_grad():
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt")
        ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names)

        export_options = {
            "f": output.as_posix(),
            "input_names": ordered_input_names,
            "output_names": output_names,
            "dynamic_axes": dynamic_axes,
            "do_constant_folding": True,
            "opset_version": opset,
        }
        export(nlp.model, model_args, **export_options)
def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where will be stored the generated ONNX model

    Raises:
        Exception: If TensorFlow is not installed, or if an ImportError occurs during the conversion

    Notes: TensorFlow cannot export model bigger than 2GB due to internal constraint from TensorFlow
    """
    if not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\")

    try:
        import tensorflow as tf
        import tf2onnx
        from tf2onnx import __version__ as t2ov

        print(f"Using framework TensorFlow: {tf.version.VERSION}, tf2onnx: {t2ov}")

        # Build: only the sample encoding is needed from the inferred shapes
        _input_names, _output_names, _dynamic_axes, tokens = infer_shapes(nlp, "tf")

        # Forward
        nlp.model.predict(tokens.data)

        # One named TensorSpec per tokenizer output
        input_signature = []
        for key, tensor in tokens.items():
            input_signature.append(tf.TensorSpec.from_tensor(tensor, name=key))

        _model_proto, _ = tf2onnx.convert.from_keras(
            nlp.model,
            input_signature,
            opset=opset,
            output_path=output.as_posix(),
        )
    except ImportError as e:
        missing = e.name
        raise Exception(
            f"Cannot import {missing} required to convert TF model to ONNX. Please install {missing} first. {e}"
        )
def convert(
    framework: str,
    model: str,
    output: Path,
    opset: int,
    tokenizer: Optional[str] = None,
    use_external_format: bool = False,
    pipeline_name: str = "feature-extraction",
    **model_kwargs,
):
    """
    Convert the pipeline object to the ONNX Intermediate Representation (IR) format

    Args:
        framework: The framework the pipeline is backed by ("pt" or "tf"). When `None` (e.g. `--framework` was
            not given on the command line), the framework resolved by the loaded pipeline is used
        model: The name of the model to load for the pipeline
        output: The path where the ONNX graph will be stored
        opset: The actual version of the ONNX operator set to use
        tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided
        use_external_format:
            Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only)
        pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
        model_kwargs: Keyword arguments to be forwarded to the model constructor

    Returns:

    Raises:
        Exception: If the folder receiving `output` already exists and is not empty
    """
    warnings.warn(
        "The `transformers.convert_graph_to_onnx` package is deprecated and will be removed in version 5 of"
        " Transformers",
        FutureWarning,
    )
    print(f"ONNX opset version set to: {opset}")

    # Load the pipeline
    nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs)

    # Without an explicit framework, a plain `framework == "pt"` test would always fall through to the
    # TensorFlow exporter, even for a PyTorch model: rely on the framework the pipeline ended up with.
    if framework is None:
        framework = nlp.framework

    output_dir = output.parent
    if not output_dir.exists():
        print(f"Creating folder {output_dir}")
        makedirs(output_dir.as_posix())
    elif len(listdir(output_dir.as_posix())) > 0:
        # Refuse to write into a folder that already holds files
        raise Exception(f"Folder {output_dir.as_posix()} is not empty, aborting conversion")

    # Export the graph
    if framework == "pt":
        convert_pytorch(nlp, opset, output, use_external_format)
    else:
        convert_tensorflow(nlp, opset, output)
def optimize(onnx_model_path: Path) -> Path:
    """
    Let onnxruntime apply its graph optimizations to the model stored at the given path and save the result
    next to it, with "-optimized" appended to the file name

    Args:
        onnx_model_path: filepath where the model binary description is stored

    Returns: Path where the optimized model binary description has been saved
    """
    from onnxruntime import InferenceSession, SessionOptions

    session_options = SessionOptions()

    # Generate model name with suffix "optimized"
    optimized_path = generate_identified_filename(onnx_model_path, "-optimized")
    session_options.optimized_model_filepath = optimized_path.as_posix()

    # The session object itself is not used afterwards; it is created with the output path configured above
    _ = InferenceSession(onnx_model_path.as_posix(), session_options)

    print(f"Optimized model has been written at {optimized_path}: \N{HEAVY CHECK MARK}")
    print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\")

    return optimized_path
def quantize(onnx_model_path: Path) -> Path:
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU

    Args:
        onnx_model_path: Path to location the exported ONNX model is stored

    Returns: The Path generated for the quantized model ("-quantized" appended to the original file name)
    """
    import onnx
    import onnxruntime
    from onnx.onnx_pb import ModelProto
    from onnxruntime.quantization import QuantizationMode
    from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
    from onnxruntime.quantization.registry import IntegerOpsRegistry

    # Load the ONNX model
    source_model = onnx.load(onnx_model_path.as_posix())

    if parse(onnx.__version__) < parse("1.5.0"):
        print(
            "Models larger than 2GB will fail to quantize due to protobuf constraint.\n"
            "Please upgrade to onnxruntime >= 1.5.0."
        )

    # Work on a copy of the loaded model
    working_model = ModelProto()
    working_model.CopyFrom(source_model)

    # onnxruntime renamed input_qType to activation_qType in v1.13.1, so we
    # pick the keyword matching the installed version to stay backward compatible.
    # See also: https://github.com/microsoft/onnxruntime/pull/12873
    if parse(onnxruntime.__version__) < parse("1.13.1"):
        activation_keyword = "input_qType"
    else:
        activation_keyword = "activation_qType"

    # Construct quantizer
    quantizer = ONNXQuantizer(
        model=working_model,
        per_channel=False,
        reduce_range=False,
        mode=QuantizationMode.IntegerOps,
        static=False,
        weight_qType=True,
        tensors_range=None,
        nodes_to_quantize=None,
        nodes_to_exclude=None,
        op_types_to_quantize=list(IntegerOpsRegistry),
        **{activation_keyword: False},
    )

    # Quantize and export
    quantizer.quantize_model()

    # Append "-quantized" at the end of the model's name
    quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")

    # Save model
    print(f"Quantized model has been written at {quantized_model_path}: \N{HEAVY CHECK MARK}")
    onnx.save_model(quantizer.model.model, quantized_model_path.as_posix())

    return quantized_model_path
def verify(path: Path):
    """
    Check that onnxruntime is able to load the model stored at `path` on the CPU execution provider

    The outcome is only printed: an onnxruntime `RuntimeException` is reported, not re-raised.

    Args:
        path: Location of the ONNX model to load
    """
    from onnxruntime import InferenceSession, SessionOptions
    from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException

    print(f"Checking ONNX model loading from: {path} ...")
    try:
        InferenceSession(path.as_posix(), SessionOptions(), providers=["CPUExecutionProvider"])
    except RuntimeException as load_error:
        print(f"Error while loading the model {load_error}: \N{HEAVY BALLOT X}")
    else:
        print(f"Model {path} correctly loaded: \N{HEAVY CHECK MARK}")
if __name__ == "__main__":
    # Script entry point: convert, optionally optimize + quantize, optionally check the exported files load.
    args = OnnxConverterArgumentParser().parse_args()

    # Make sure output is absolute path
    args.output = Path(args.output).absolute()

    try:
        print("\n====== Converting model to ONNX ======")
        # Convert
        convert(
            framework=args.framework,
            model=args.model,
            output=args.output,
            opset=args.opset,
            tokenizer=args.tokenizer,
            use_external_format=args.use_external_format,
            pipeline_name=args.pipeline,
        )

        if args.quantize:
            # Ensure requirements for quantization on onnxruntime is met
            check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION)

            # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch
            if args.framework == "tf":
                print(
                    "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
                    "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
                    "\t For more information, please refer to the onnxruntime documentation:\n"
                    "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
                )

            print("\n====== Optimizing ONNX model ======")

            # Quantization works best when using the optimized version of the model
            args.optimized_output = optimize(args.output)

            # Do the quantization on the right graph
            args.quantized_output = quantize(args.optimized_output)

        # And verify
        if args.check_loading:
            print("\n====== Check exported ONNX model(s) ======")
            verify(args.output)

            # These attributes only exist when the quantization branch above ran
            for extra_output in ("optimized_output", "quantized_output"):
                if hasattr(args, extra_output):
                    verify(getattr(args, extra_output))

    except Exception as e:
        # Top-level boundary: report the failure and exit with a non-zero status
        print(f"Error while converting the model: {e}")
        exit(1)
|