# yichael/AutoAndroidController


			
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from types import MethodType

import torch.nn as nn

from .imports import is_hpu_available, is_transformer_engine_available
from .operations import GatheredParameters


# Do not import `transformer_engine` at package level to avoid potential issues


def convert_model(model, to_transformer_engine=True, _convert_linear=True, _convert_ln=True):
    """
    Recursively converts the linear and layernorm layers of a model to their `transformer_engine` counterpart, or
    back to plain `torch.nn` layers when `to_transformer_engine=False`.

    Direct children of `model` are replaced in place with `setattr`. Any child that is not converted by one of the
    branches below is recursed into with the same flags.

    Args:
        model (`torch.nn.Module`):
            Module whose children are converted in place.
        to_transformer_engine (`bool`, defaults to `True`):
            If `True`, convert `nn.Linear` / `nn.LayerNorm` to `te.Linear` / `te.LayerNorm`. If `False`, convert
            `te.Linear` / `te.LayerNorm` back to `nn.Linear` / `nn.LayerNorm`.
        _convert_linear (`bool`, defaults to `True`):
            Whether linear layers take part in the conversion.
        _convert_ln (`bool`, defaults to `True`):
            Whether layernorm layers take part in the conversion.

    Raises:
        ImportError: If `is_transformer_engine_available()` returns `False`.

    Note:
        Weights are copied with the in-place `copy_`. This function does not disable autograd itself.
        NOTE(review): callers presumably invoke it under `torch.no_grad()` -- confirm against the call sites.
    """
    if not is_transformer_engine_available():
        raise ImportError("Using `convert_model` requires transformer_engine to be installed.")

    if is_hpu_available():
        import intel_transformer_engine as te

        if not hasattr(te, "LayerNorm"):
            # HPU does not have a LayerNorm implementation in TE
            te.LayerNorm = nn.LayerNorm
    else:
        import transformer_engine.pytorch as te

    for name, module in model.named_children():
        if isinstance(module, nn.Linear) and to_transformer_engine and _convert_linear:
            has_bias = module.bias is not None
            params_to_gather = [module.weight]
            if has_bias:
                params_to_gather.append(module.bias)

            # The shape check stays inside the gather context so it sees the gathered weight.
            with GatheredParameters(params_to_gather, modifier_rank=0):
                if any(p % 16 != 0 for p in module.weight.shape):
                    # Only this layer is skipped (both dimensions must be multiples of 16). Returning here
                    # would silently leave every remaining sibling unconverted.
                    continue
                te_module = te.Linear(
                    module.in_features, module.out_features, bias=has_bias, params_dtype=module.weight.dtype
                )
                te_module.weight.copy_(module.weight)
                if has_bias:
                    te_module.bias.copy_(module.bias)

                setattr(model, name, te_module)
        # Note: @xrsrke (Phuc) found that te.LayerNorm doesn't have any real memory savings or speedups over nn.LayerNorm
        elif isinstance(module, nn.LayerNorm) and to_transformer_engine and _convert_ln:
            has_bias = module.bias is not None
            # Same as the Linear branch: only hand real parameters to the gather context.
            params_to_gather = [module.weight]
            if has_bias:
                params_to_gather.append(module.bias)

            with GatheredParameters(params_to_gather, modifier_rank=0):
                te_module = te.LayerNorm(module.normalized_shape[0], eps=module.eps, params_dtype=module.weight.dtype)
                te_module.weight.copy_(module.weight)
                if has_bias:
                    te_module.bias.copy_(module.bias)

            setattr(model, name, te_module)
        elif isinstance(module, te.Linear) and not to_transformer_engine and _convert_linear:
            has_bias = module.bias is not None
            # `torch.nn` layers take `dtype`, not transformer_engine's `params_dtype`.
            new_module = nn.Linear(module.in_features, module.out_features, bias=has_bias, dtype=module.weight.dtype)
            new_module.weight.copy_(module.weight)
            if has_bias:
                new_module.bias.copy_(module.bias)

            setattr(model, name, new_module)
        elif isinstance(module, te.LayerNorm) and not to_transformer_engine and _convert_ln:
            # `torch.nn` layers take `dtype`, not transformer_engine's `params_dtype`.
            new_module = nn.LayerNorm(module.normalized_shape[0], eps=module.eps, dtype=module.weight.dtype)
            new_module.weight.copy_(module.weight)
            new_module.bias.copy_(module.bias)

            setattr(model, name, new_module)
        else:
            convert_model(
                module,
                to_transformer_engine=to_transformer_engine,
                _convert_linear=_convert_linear,
                _convert_ln=_convert_ln,
            )


def has_transformer_engine_layers(model):
    """
    Tells whether `model` contains at least one `transformer_engine` layer.

    Every module yielded by `model.modules()` is checked. On HPU only `te.Linear` counts as a transformer-engine
    layer; otherwise `te.LayerNorm`, `te.Linear` and `te.TransformerLayer` all count.

    Raises:
        ImportError: If `is_transformer_engine_available()` returns `False`.
    """
    if not is_transformer_engine_available():
        raise ImportError("Using `has_transformer_engine_layers` requires transformer_engine to be installed.")

    if is_hpu_available():
        import intel_transformer_engine as te

        te_layer_types = te.Linear
    else:
        import transformer_engine.pytorch as te

        te_layer_types = (te.LayerNorm, te.Linear, te.TransformerLayer)

    # Stops at the first matching submodule.
    return any(isinstance(submodule, te_layer_types) for submodule in model.modules())


def contextual_fp8_autocast(model_forward, fp8_recipe, use_during_eval=False):
    """
    Builds a replacement `forward` that runs `model_forward` inside an `fp8_autocast` context.

    The context is enabled when `use_during_eval` is truthy or when the module received as `self` is in training
    mode. With the default `use_during_eval=False`, FP8 autocast is therefore off in eval mode, which is generally
    better for more accurate metrics.

    Args:
        model_forward (`Callable`):
            The original forward. It is called with the wrapper's `*args, **kwargs` only (without `self`).
        fp8_recipe:
            Recipe object passed to `fp8_autocast` as `fp8_recipe`.
        use_during_eval (`bool`, defaults to `False`):
            Whether to keep FP8 autocast enabled outside of training mode.

    Returns:
        The wrapper function, with `__wrapped__` set to `model_forward`.

    Raises:
        ImportError: If `is_transformer_engine_available()` returns `False`.
    """
    if not is_transformer_engine_available():
        raise ImportError("Using `contextual_fp8_autocast` requires transformer_engine to be installed.")

    if is_hpu_available():
        from intel_transformer_engine import fp8_autocast
    else:
        from transformer_engine.pytorch import fp8_autocast

    def forward(self, *args, **kwargs):
        # `self.training` is only consulted when `use_during_eval` is falsy.
        autocast_context = fp8_autocast(enabled=use_during_eval or self.training, fp8_recipe=fp8_recipe)
        with autocast_context:
            return model_forward(*args, **kwargs)

    # To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
    forward.__wrapped__ = model_forward

    return forward


def apply_fp8_autowrap(model, fp8_recipe_handler):
    """
    Replaces `model.forward` with a version that runs under the FP8 autocast context manager.

    Args:
        model:
            Object whose `forward` attribute is wrapped and reassigned.
        fp8_recipe_handler:
            Object exposing `to_kwargs()`, or `None` to build the recipe with no keyword arguments. The keys
            `use_autocast_during_eval` and `use_mxfp8_block_scaling` are removed from the kwargs before the recipe
            is built, and `fp8_format` (if present) is resolved to the matching attribute of `te_recipe.Format`.

    Returns:
        The same `model`, with `forward` replaced.

    Raises:
        ImportError: If `is_transformer_engine_available()` returns `False`.
        ValueError: If MXFP8 block scaling is requested but unavailable, or is requested together with
            `amax_compute_algo` or `amax_history_len`.
    """
    if not is_transformer_engine_available():
        raise ImportError("Using `apply_fp8_autowrap` requires transformer_engine to be installed.")

    if is_hpu_available():
        import intel_transformer_engine.recipe as te_recipe

        mxfp8_supported, unsupported_reason = False, "MXFP8 block scaling is not available on HPU."
    else:
        import transformer_engine.common.recipe as te_recipe
        import transformer_engine.pytorch as te

        mxfp8_supported, unsupported_reason = te.fp8.check_mxfp8_support()

    recipe_kwargs = {} if fp8_recipe_handler is None else fp8_recipe_handler.to_kwargs()
    if "fp8_format" in recipe_kwargs:
        # The handler supplies the format by name; the recipe expects the `Format` member.
        recipe_kwargs["fp8_format"] = getattr(te_recipe.Format, recipe_kwargs["fp8_format"])
    use_during_eval = recipe_kwargs.pop("use_autocast_during_eval", False)
    wants_mxfp8 = recipe_kwargs.pop("use_mxfp8_block_scaling", False)

    if not wants_mxfp8:
        fp8_recipe = te_recipe.DelayedScaling(**recipe_kwargs)
    else:
        if not mxfp8_supported:
            raise ValueError(f"MXFP8 block scaling is not available: {unsupported_reason}")
        if "amax_compute_algo" in recipe_kwargs:
            raise ValueError("`amax_compute_algo` is not supported for MXFP8 block scaling.")
        if "amax_history_len" in recipe_kwargs:
            raise ValueError("`amax_history_len` is not supported for MXFP8 block scaling.")
        fp8_recipe = te_recipe.MXFP8BlockScaling(**recipe_kwargs)

    wrapped_forward = contextual_fp8_autocast(model.forward, fp8_recipe, use_during_eval)

    # A bound method is re-bound to `model` so the wrapper receives it as `self`; anything else is assigned as-is.
    is_bound_method = hasattr(model.forward, "__func__")
    model.forward = MethodType(wrapped_forward, model) if is_bound_method else wrapped_forward

    return model