| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373 |
- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import math
- from paddle import _C_ops
- from ...base import core, framework, unique_name
- from ...base.data_feeder import check_variable_and_dtype
- from ...base.framework import (
- _current_expected_place,
- in_dygraph_mode,
- in_pir_mode,
- )
- from .initializer import Initializer
- __all__ = []
class XavierInitializer(Initializer):
    r"""
    This class implements the Xavier weight initializer from the paper
    `Understanding the difficulty of training deep feedforward neural
    networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
    by Xavier Glorot and Yoshua Bengio.

    This initializer is designed to keep the scale of the gradients
    approximately same in all the layers. In case of Uniform distribution,
    the range is [-x, x], where

    .. math::

        x = gain \times \sqrt{\frac{6.0}{fan\_in + fan\_out}}

    In case of Normal distribution, the mean is 0 and the standard deviation
    is

    .. math::

        gain \times \sqrt{\frac{2.0}{fan\_in + fan\_out}}

    Args:
        uniform (bool, optional): Whether to sample from a uniform
            distribution; if False, a normal distribution is used.
            Default is True.
        fan_in (float, optional): fan_in for Xavier initialization. If None, it is
            inferred from the variable. Default is None.
        fan_out (float, optional): fan_out for Xavier initialization. If None, it is
            inferred from the variable. Default is None.
        seed (int, optional): Random seed. When it is 0, the ``random_seed``
            of the program owning the target block is used instead.
            Default is 0.
        gain (float, optional): Scaling factor multiplied into the uniform
            limit / normal standard deviation. Default is 1.0.

    Note:
        It is recommended to set fan_in and fan_out to None for most cases.
    """

    def __init__(
        self, uniform=True, fan_in=None, fan_out=None, seed=0, gain=1.0
    ):
        assert uniform is not None
        assert seed is not None
        super().__init__()
        self._uniform = uniform
        self._fan_in = fan_in
        self._fan_out = fan_out
        self._seed = seed
        self._gain = gain

    def forward(self, var, block=None):
        """Initialize the input tensor with Xavier initialization.

        Args:
            var(Tensor): Tensor that needs to be initialized.
            block(Block, optional): The block in which initialization ops
                should be added. Used in static graph only, default None.

        Returns:
            Depends on the execution mode: ``None`` in dynamic graph mode
            (the sampled values are shared into ``var``), the sampled value
            (cast back to ``var.dtype`` when required) in PIR mode, and the
            appended random op in legacy static graph mode.
        """
        import paddle

        block = self._check_block(block)
        assert isinstance(block, (framework.Block, paddle.pir.Block))

        if not isinstance(var, paddle.pir.core.ParameterMeta):
            check_variable_and_dtype(
                var,
                "Out",
                ["uint16", "float16", "float32", "float64"],
                "xavier_init",
            )

        f_in, f_out = self._compute_fans(var)

        # If fan_in and fan_out are passed, use them
        fan_in = f_in if self._fan_in is None else self._fan_in
        fan_out = f_out if self._fan_out is None else self._fan_out

        # Resolve the effective seed locally. Writing it back to self._seed
        # would freeze the first program's seed into this instance and leak
        # it into every later program the same initializer is applied to.
        seed = block.program.random_seed if self._seed == 0 else self._seed

        # Distributed eager parameters are initialized shard-locally.
        out_var_shape = (
            var._local_shape
            if (isinstance(var, framework.EagerParamBase) and var.is_dist())
            else var.shape
        )

        # Legacy (VarType) low-precision targets are sampled in FP32 and
        # cast back afterwards.
        needs_legacy_cast = var.dtype == core.VarDesc.VarType.FP16 or (
            var.dtype == core.VarDesc.VarType.BF16 and not self._uniform
        )

        # to be compatible of fp16 initializers
        if needs_legacy_cast:
            out_dtype = core.VarDesc.VarType.FP32
            out_var = block.create_var(
                name=unique_name.generate(
                    ".".join(['xavier_init', var.name, 'tmp'])
                ),
                shape=out_var_shape,
                dtype=out_dtype,
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
            )
        elif (
            var.dtype in (core.DataType.FLOAT16, core.DataType.BFLOAT16)
            and not self._uniform
        ):
            out_dtype = core.DataType.FLOAT32
            out_var = var
        else:
            out_dtype = var.dtype
            out_var = var

        # Uniform limit (numerator 6.0) or normal std (numerator 2.0);
        # the formula is the same for every execution mode.
        numerator = 6.0 if self._uniform else 2.0
        scale = self._gain * math.sqrt(numerator / float(fan_in + fan_out))

        if in_dygraph_mode():
            if self._uniform:
                out_var = _C_ops.uniform(
                    out_var_shape,
                    out_dtype,
                    -scale,
                    scale,
                    seed,
                    _current_expected_place(),
                )
            else:
                out_var = _C_ops.gaussian(
                    out_var_shape,
                    0.0,
                    scale,
                    seed,
                    out_dtype,
                    _current_expected_place(),
                )

            if needs_legacy_cast:
                out_var = _C_ops.cast(out_var, var.dtype)

            if isinstance(var, framework.EagerParamBase) and var.is_dist():
                # lazy init for dist tensor
                out_var = (
                    paddle.distributed.auto_parallel.api.dtensor_from_local(
                        out_var, var.process_mesh, var.placements
                    )
                )
            out_var._share_underline_tensor_to(var)
            return None
        elif in_pir_mode():
            if self._uniform:
                out_var = paddle._pir_ops.uniform(
                    out_var.shape,
                    out_dtype,
                    -scale,
                    scale,
                    seed,
                    _current_expected_place(),
                )
            else:
                out_var = _C_ops.gaussian(
                    out_var.shape,
                    0.0,
                    scale,
                    seed,
                    out_dtype,
                    _current_expected_place(),
                )

            if (
                var.dtype in (core.DataType.FLOAT16, core.DataType.BFLOAT16)
                and not self._uniform
            ):
                return _C_ops.cast(out_var, var.dtype)

            return out_var
        else:
            if self._uniform:
                op = block.append_op(
                    type="uniform_random",
                    inputs={},
                    outputs={"Out": out_var},
                    attrs={
                        "shape": out_var.shape,
                        "dtype": out_dtype,
                        "min": -scale,
                        "max": scale,
                        "seed": seed,
                    },
                    stop_gradient=True,
                )
            else:
                op = block.append_op(
                    type="gaussian_random",
                    outputs={"Out": out_var},
                    attrs={
                        "shape": out_var.shape,
                        "dtype": out_var.dtype,
                        "mean": 0.0,
                        "std": scale,
                        "seed": seed,
                    },
                    stop_gradient=True,
                )

            if needs_legacy_cast:
                # Copy the FP32 temporary into the low-precision target.
                block.append_op(
                    type="cast",
                    inputs={"X": out_var},
                    outputs={"Out": var},
                    attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype},
                )

            var.op = op
            return op
class XavierNormal(XavierInitializer):
    r"""
    Xavier weight initializer drawing from a normal distribution.

    Implements the scheme from the paper
    `Understanding the difficulty of training deep feedforward neural
    networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
    by Xavier Glorot and Yoshua Bengio. Values are sampled from a normal
    distribution whose mean is :math:`0` and whose standard deviation is

    .. math::

        gain \times \sqrt{\frac{2.0}{fan\_in + fan\_out}}.

    Args:
        fan_in (float, optional): fan_in for Xavier initialization. If None,
            it is inferred from the Tensor. Default is None.
        fan_out (float, optional): fan_out for Xavier initialization. If None,
            it is inferred from the Tensor. Default is None.
        gain (float, optional): Scaling factor applied to the standard
            deviation. Default is 1.0.
        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.

    Returns:
        A parameter initialized by Xavier weight, using a normal distribution.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> paddle.seed(1)
            >>> data = paddle.ones(shape=[3, 1, 2], dtype='float32')
            >>> weight_attr = paddle.framework.ParamAttr(
            ...     name="linear_weight",
            ...     initializer=paddle.nn.initializer.XavierNormal())
            >>> bias_attr = paddle.framework.ParamAttr(
            ...     name="linear_bias",
            ...     initializer=paddle.nn.initializer.XavierNormal())
            >>> linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr)
            >>> print(linear.weight)
            Parameter containing:
            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
            [[-0.21607460,  0.08382989],
             [ 0.29147008, -0.07049121]])
            >>> print(linear.bias)
            Parameter containing:
            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False,
            [1.06076419, 0.87684733])
            >>> res = linear(data)
            >>> print(res)
            Tensor(shape=[3, 1, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
            [[[1.13615966, 0.89018601]],
             [[1.13615966, 0.89018601]],
             [[1.13615966, 0.89018601]]])
    """

    def __init__(self, fan_in=None, fan_out=None, gain=1.0, name=None):
        # ``name`` is accepted by the signature but is not forwarded to the
        # base initializer; the seed is pinned to 0 and the normal
        # distribution is selected via uniform=False.
        super().__init__(
            uniform=False,
            seed=0,
            gain=gain,
            fan_in=fan_in,
            fan_out=fan_out,
        )
class XavierUniform(XavierInitializer):
    r"""
    Xavier weight initializer drawing from a uniform distribution.

    Implements the scheme from the paper
    `Understanding the difficulty of training deep feedforward neural
    networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
    by Xavier Glorot and Yoshua Bengio.

    This initializer is designed to keep the scale of the gradients
    approximately same in all the layers. Values are sampled uniformly from
    the range :math:`[-x,x]`, where

    .. math::

        x = gain \times \sqrt{\frac{6.0}{fan\_in + fan\_out}}.

    Args:
        fan_in (float, optional): fan_in for Xavier initialization. If None,
            it is inferred from the Tensor. Default is None.
        fan_out (float, optional): fan_out for Xavier initialization. If None,
            it is inferred from the Tensor. Default is None.
        gain (float, optional): Scaling factor applied to the sampling
            limit. Default is 1.0.
        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.

    Returns:
        A parameter initialized by Xavier weight, using a uniform distribution.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> paddle.seed(1)
            >>> data = paddle.ones(shape=[3, 1, 2], dtype='float32')
            >>> weight_attr = paddle.framework.ParamAttr(
            ...     name="linear_weight",
            ...     initializer=paddle.nn.initializer.XavierUniform())
            >>> bias_attr = paddle.framework.ParamAttr(
            ...     name="linear_bias",
            ...     initializer=paddle.nn.initializer.XavierUniform())
            >>> linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr)
            >>> print(linear.weight)
            Parameter containing:
            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
            [[-1.18095720,  0.64892638],
             [ 0.43125069, -1.11156428]])
            >>> print(linear.bias)
            Parameter containing:
            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False,
            [-0.27524316,  1.13808715])
            >>> res = linear(data)
            >>> print(res)
            Tensor(shape=[3, 1, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
            [[[-1.02494967,  0.67544925]],
             [[-1.02494967,  0.67544925]],
             [[-1.02494967,  0.67544925]]])
    """

    def __init__(self, fan_in=None, fan_out=None, gain=1.0, name=None):
        # ``name`` is accepted by the signature but is not forwarded to the
        # base initializer; the seed is pinned to 0 and the uniform
        # distribution is selected via uniform=True.
        super().__init__(
            uniform=True,
            seed=0,
            gain=gain,
            fan_in=fan_in,
            fan_out=fan_out,
        )
|