xavier.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. import math
  15. from paddle import _C_ops
  16. from ...base import core, framework, unique_name
  17. from ...base.data_feeder import check_variable_and_dtype
  18. from ...base.framework import (
  19. _current_expected_place,
  20. in_dygraph_mode,
  21. in_pir_mode,
  22. )
  23. from .initializer import Initializer
  24. __all__ = []
  25. class XavierInitializer(Initializer):
  26. r"""
  27. This class implements the Xavier weight initializer from the paper
  28. `Understanding the difficulty of training deep feedforward neural
  29. networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
  30. by Xavier Glorot and Yoshua Bengio.
  31. This initializer is designed to keep the scale of the gradients
  32. approximately same in all the layers. In case of Uniform distribution,
  33. the range is [-x, x], where
  34. .. math::
  35. x = gain \times \sqrt{\\frac{6.0}{fan\_in + fan\_out}}
  36. In case of Normal distribution, the mean is 0 and the standard deviation
  37. is
  38. .. math::
  39. gain \times \sqrt{\\frac{2.0}{fan\_in + fan\_out}}
  40. Args:
  41. uniform (bool, optional): whether to use uniform ,if False use normal distribution. Default is True.
  42. fan_in (float, optional): fan_in for Xavier initialization. If None, it is
  43. inferred from the variable. Default is None.
  44. fan_out (float, optional): fan_out for Xavier initialization. If None, it is
  45. inferred from the variable. Default is None.
  46. gain (float, optional): Scaling Tensor. Default is 1.0.
  47. seed (int, optional): Random seed. Default is 0.
  48. Note:
  49. It is recommended to set fan_in and fan_out to None for most cases.
  50. """
  51. def __init__(
  52. self, uniform=True, fan_in=None, fan_out=None, seed=0, gain=1.0
  53. ):
  54. assert uniform is not None
  55. assert seed is not None
  56. super().__init__()
  57. self._uniform = uniform
  58. self._fan_in = fan_in
  59. self._fan_out = fan_out
  60. self._seed = seed
  61. self._gain = gain
  62. def forward(self, var, block=None):
  63. """Initialize the input tensor with Xavier initialization.
  64. Args:
  65. var(Tensor): Tensor that needs to be initialized.
  66. block(Block, optional): The block in which initialization ops
  67. should be added. Used in static graph only, default None.
  68. Returns:
  69. The initialization op
  70. """
  71. import paddle
  72. block = self._check_block(block)
  73. assert isinstance(block, (framework.Block, paddle.pir.Block))
  74. if not isinstance(var, paddle.pir.core.ParameterMeta):
  75. check_variable_and_dtype(
  76. var,
  77. "Out",
  78. ["uint16", "float16", "float32", "float64"],
  79. "xavier_init",
  80. )
  81. f_in, f_out = self._compute_fans(var)
  82. # If fan_in and fan_out are passed, use them
  83. fan_in = f_in if self._fan_in is None else self._fan_in
  84. fan_out = f_out if self._fan_out is None else self._fan_out
  85. if self._seed == 0:
  86. self._seed = block.program.random_seed
  87. out_var_shape = (
  88. var._local_shape
  89. if (isinstance(var, framework.EagerParamBase) and var.is_dist())
  90. else var.shape
  91. )
  92. # to be compatible of fp16 initializers
  93. if var.dtype == core.VarDesc.VarType.FP16 or (
  94. var.dtype == core.VarDesc.VarType.BF16 and not self._uniform
  95. ):
  96. out_dtype = core.VarDesc.VarType.FP32
  97. out_var = block.create_var(
  98. name=unique_name.generate(
  99. ".".join(['xavier_init', var.name, 'tmp'])
  100. ),
  101. shape=out_var_shape,
  102. dtype=out_dtype,
  103. type=core.VarDesc.VarType.LOD_TENSOR,
  104. persistable=False,
  105. )
  106. elif (
  107. var.dtype in (core.DataType.FLOAT16, core.DataType.BFLOAT16)
  108. and not self._uniform
  109. ):
  110. out_dtype = core.DataType.FLOAT32
  111. out_var = var
  112. else:
  113. out_dtype = var.dtype
  114. out_var = var
  115. if in_dygraph_mode():
  116. if self._uniform:
  117. limit = self._gain * math.sqrt(6.0 / float(fan_in + fan_out))
  118. out_var = _C_ops.uniform(
  119. out_var_shape,
  120. out_dtype,
  121. -limit,
  122. limit,
  123. self._seed,
  124. _current_expected_place(),
  125. )
  126. else:
  127. std = self._gain * math.sqrt(2.0 / float(fan_in + fan_out))
  128. place = _current_expected_place()
  129. out_var = _C_ops.gaussian(
  130. out_var_shape,
  131. 0.0,
  132. std,
  133. self._seed,
  134. out_dtype,
  135. place,
  136. )
  137. if var.dtype == core.VarDesc.VarType.FP16 or (
  138. var.dtype == core.VarDesc.VarType.BF16 and not self._uniform
  139. ):
  140. out_var = _C_ops.cast(out_var, var.dtype)
  141. if isinstance(var, framework.EagerParamBase) and var.is_dist():
  142. # lazy init for dist tensor
  143. out_var = (
  144. paddle.distributed.auto_parallel.api.dtensor_from_local(
  145. out_var, var.process_mesh, var.placements
  146. )
  147. )
  148. out_var._share_underline_tensor_to(var)
  149. return None
  150. elif in_pir_mode():
  151. if self._uniform:
  152. limit = self._gain * math.sqrt(6.0 / float(fan_in + fan_out))
  153. out_var = paddle._pir_ops.uniform(
  154. out_var.shape,
  155. out_dtype,
  156. -limit,
  157. limit,
  158. self._seed,
  159. _current_expected_place(),
  160. )
  161. else:
  162. std = self._gain * math.sqrt(2.0 / float(fan_in + fan_out))
  163. out_var = _C_ops.gaussian(
  164. out_var.shape,
  165. 0.0,
  166. std,
  167. self._seed,
  168. out_dtype,
  169. _current_expected_place(),
  170. )
  171. if (
  172. var.dtype in (core.DataType.FLOAT16, core.DataType.BFLOAT16)
  173. and not self._uniform
  174. ):
  175. return _C_ops.cast(out_var, var.dtype)
  176. return out_var
  177. else:
  178. if self._uniform:
  179. limit = self._gain * math.sqrt(6.0 / float(fan_in + fan_out))
  180. op = block.append_op(
  181. type="uniform_random",
  182. inputs={},
  183. outputs={"Out": out_var},
  184. attrs={
  185. "shape": out_var.shape,
  186. "dtype": out_dtype,
  187. "min": -limit,
  188. "max": limit,
  189. "seed": self._seed,
  190. },
  191. stop_gradient=True,
  192. )
  193. else:
  194. std = self._gain * math.sqrt(2.0 / float(fan_in + fan_out))
  195. op = block.append_op(
  196. type="gaussian_random",
  197. outputs={"Out": out_var},
  198. attrs={
  199. "shape": out_var.shape,
  200. "dtype": out_var.dtype,
  201. "mean": 0.0,
  202. "std": std,
  203. "seed": self._seed,
  204. },
  205. stop_gradient=True,
  206. )
  207. if var.dtype == core.VarDesc.VarType.FP16 or (
  208. var.dtype == core.VarDesc.VarType.BF16 and not self._uniform
  209. ):
  210. block.append_op(
  211. type="cast",
  212. inputs={"X": out_var},
  213. outputs={"Out": var},
  214. attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype},
  215. )
  216. var.op = op
  217. return op
  218. class XavierNormal(XavierInitializer):
  219. r"""
  220. This class implements the Xavier weight initializer from the paper
  221. `Understanding the difficulty of training deep feedforward neural
  222. networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
  223. by Xavier Glorot and Yoshua Bengio, using a normal distribution whose mean is :math:`0` and standard deviation is
  224. .. math::
  225. gain \times \sqrt{\frac{2.0}{fan\_in + fan\_out}}.
  226. Args:
  227. fan_in (float, optional): fan_in for Xavier initialization, which is
  228. inferred from the Tensor. Default is None.
  229. fan_out (float, optional): fan_out for Xavier initialization, which is
  230. inferred from the Tensor. Default is None.
  231. gain (float, optional): Scaling Tensor. Default is 1.0.
  232. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
  233. Returns:
  234. A parameter initialized by Xavier weight, using a normal distribution.
  235. Examples:
  236. .. code-block:: python
  237. >>> import paddle
  238. >>> paddle.seed(1)
  239. >>> data = paddle.ones(shape=[3, 1, 2], dtype='float32')
  240. >>> weight_attr = paddle.framework.ParamAttr(
  241. ... name="linear_weight",
  242. ... initializer=paddle.nn.initializer.XavierNormal())
  243. >>> bias_attr = paddle.framework.ParamAttr(
  244. ... name="linear_bias",
  245. ... initializer=paddle.nn.initializer.XavierNormal())
  246. >>> linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr)
  247. >>> print(linear.weight)
  248. Parameter containing:
  249. Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
  250. [[-0.21607460, 0.08382989],
  251. [ 0.29147008, -0.07049121]])
  252. >>> print(linear.bias)
  253. Parameter containing:
  254. Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False,
  255. [1.06076419, 0.87684733])
  256. >>> res = linear(data)
  257. >>> print(res)
  258. Tensor(shape=[3, 1, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
  259. [[[1.13615966, 0.89018601]],
  260. [[1.13615966, 0.89018601]],
  261. [[1.13615966, 0.89018601]]])
  262. """
  263. def __init__(self, fan_in=None, fan_out=None, gain=1.0, name=None):
  264. super().__init__(
  265. uniform=False, fan_in=fan_in, fan_out=fan_out, seed=0, gain=gain
  266. )
  267. class XavierUniform(XavierInitializer):
  268. r"""
  269. This class implements the Xavier weight initializer from the paper
  270. `Understanding the difficulty of training deep feedforward neural
  271. networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
  272. by Xavier Glorot and Yoshua Bengio.
  273. This initializer is designed to keep the scale of the gradients
  274. approximately same in all the layers. In case of Uniform distribution,
  275. the range is :math:`[-x,x]`, where
  276. .. math::
  277. x = gain \times \sqrt{\frac{6.0}{fan\_in + fan\_out}}.
  278. Args:
  279. fan_in (float, optional): fan_in for Xavier initialization, which is
  280. inferred from the Tensor. Default is None.
  281. fan_out (float, optional): fan_out for Xavier initialization, which is
  282. inferred from the Tensor. Default is None.
  283. gain (float, optional): Scaling Tensor. Default is 1.0.
  284. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
  285. Returns:
  286. A parameter initialized by Xavier weight, using a uniform distribution.
  287. Examples:
  288. .. code-block:: python
  289. >>> import paddle
  290. >>> paddle.seed(1)
  291. >>> data = paddle.ones(shape=[3, 1, 2], dtype='float32')
  292. >>> weight_attr = paddle.framework.ParamAttr(
  293. ... name="linear_weight",
  294. ... initializer=paddle.nn.initializer.XavierUniform())
  295. >>> bias_attr = paddle.framework.ParamAttr(
  296. ... name="linear_bias",
  297. ... initializer=paddle.nn.initializer.XavierUniform())
  298. >>> linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr)
  299. >>> print(linear.weight)
  300. Parameter containing:
  301. Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
  302. [[-1.18095720, 0.64892638],
  303. [ 0.43125069, -1.11156428]])
  304. >>> print(linear.bias)
  305. Parameter containing:
  306. Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False,
  307. [-0.27524316, 1.13808715])
  308. >>> res = linear(data)
  309. >>> print(res)
  310. Tensor(shape=[3, 1, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
  311. [[[-1.02494967, 0.67544925]],
  312. [[-1.02494967, 0.67544925]],
  313. [[-1.02494967, 0.67544925]]])
  314. """
  315. def __init__(self, fan_in=None, fan_out=None, gain=1.0, name=None):
  316. super().__init__(
  317. uniform=True, fan_in=fan_in, fan_out=fan_out, seed=0, gain=gain
  318. )