kaiming.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # Kaiming (MSRA) initializers for neural network parameters.
  15. import math
  16. import paddle
  17. from paddle import _C_ops
  18. from ...base import core, framework, unique_name
  19. from ...base.framework import (
  20. _current_expected_place,
  21. in_dygraph_mode,
  22. in_pir_mode,
  23. )
  24. from .initializer import Initializer, calculate_gain
  25. __all__ = []
  26. class MSRAInitializer(Initializer):
  27. r"""Implements the MSRA initializer a.k.a. Kaiming Initializer
  28. This class implements the weight initialization from the paper
  29. `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
  30. ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
  31. by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
  32. robust initialization method that particularly considers the rectifier
  33. nonlinearities. In case of Uniform distribution, the range is [-x, x], where
  34. .. math::
  35. x = gain \times \sqrt{\frac{3}{fan\_in}}
  36. In case of Normal distribution, the mean is 0 and the standard deviation
  37. is
  38. .. math::
  39. \frac{gain}{\sqrt{{fan\_in}}}
  40. Args:
  41. uniform (bool, optional): whether to use uniform or normal distribution. Default is True.
  42. fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automatically. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None.
  43. seed (int32, optional): random seed. Default is 0.
  44. negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0.
  45. nonlinearity(str, optional): the non-linear function. Default is relu.
  46. Note:
  47. It is recommended to set fan_in to None for most cases.
  48. """
  49. def __init__(
  50. self,
  51. uniform=True,
  52. fan_in=None,
  53. seed=0,
  54. negative_slope=0,
  55. nonlinearity='relu',
  56. ):
  57. """Constructor for MSRAInitializer"""
  58. assert uniform is not None
  59. assert seed is not None
  60. super().__init__()
  61. self._uniform = uniform
  62. self._fan_in = fan_in
  63. self._seed = seed
  64. self._negative_slope = negative_slope
  65. self._nonlinearity = nonlinearity
  66. def forward(self, var, block=None):
  67. """Initialize the input tensor with MSRA initialization.
  68. Args:
  69. var(Tensor): Tensor that needs to be initialized.
  70. block(Block, optional): The block in which initialization ops
  71. should be added. Used in static graph only, default None.
  72. Returns:
  73. The initialization op.
  74. """
  75. assert not (
  76. isinstance(var, framework.EagerParamBase) and var.is_dist()
  77. ), "Currently, kaiming initializer not support lazy init for dist param."
  78. block = self._check_block(block)
  79. assert isinstance(
  80. var, (framework.Variable, paddle.pir.core.ParameterMeta)
  81. )
  82. assert isinstance(block, (framework.Block, paddle.pir.Block))
  83. f_in, f_out = self._compute_fans(var)
  84. # If fan_in is passed, use it
  85. fan_in = f_in if self._fan_in is None else self._fan_in
  86. if self._seed == 0:
  87. self._seed = block.program.random_seed
  88. # to be compatible of fp16 initializers
  89. if var.dtype == core.VarDesc.VarType.FP16 or (
  90. var.dtype == core.VarDesc.VarType.BF16 and not self._uniform
  91. ):
  92. out_dtype = core.VarDesc.VarType.FP32
  93. out_var = block.create_var(
  94. name=unique_name.generate(
  95. ".".join(['masra_init', var.name, 'tmp'])
  96. ),
  97. shape=var.shape,
  98. dtype=out_dtype,
  99. type=core.VarDesc.VarType.LOD_TENSOR,
  100. persistable=False,
  101. )
  102. elif (
  103. var.dtype in (core.DataType.FLOAT16, core.DataType.BFLOAT16)
  104. and not self._uniform
  105. ):
  106. out_dtype = core.DataType.FLOAT32
  107. out_var = var
  108. else:
  109. out_dtype = var.dtype
  110. out_var = var
  111. if in_dygraph_mode():
  112. if self._uniform:
  113. gain = calculate_gain(self._nonlinearity, self._negative_slope)
  114. limit = gain * math.sqrt(3.0 / float(fan_in))
  115. out_var = _C_ops.uniform(
  116. var.shape,
  117. out_dtype,
  118. -limit,
  119. limit,
  120. self._seed,
  121. _current_expected_place(),
  122. )
  123. else:
  124. gain = calculate_gain(self._nonlinearity, self._negative_slope)
  125. std = gain / math.sqrt(float(fan_in))
  126. place = _current_expected_place()
  127. out_var = _C_ops.gaussian(
  128. out_var.shape, 0.0, std, self._seed, out_dtype, place
  129. )
  130. if var.dtype == core.VarDesc.VarType.FP16 or (
  131. var.dtype == core.VarDesc.VarType.BF16 and not self._uniform
  132. ):
  133. var_tmp = _C_ops.cast(out_var, var.dtype)
  134. var_tmp._share_underline_tensor_to(var)
  135. else:
  136. out_var._share_underline_tensor_to(var)
  137. return None
  138. elif in_pir_mode():
  139. if self._uniform:
  140. gain = calculate_gain(self._nonlinearity, self._negative_slope)
  141. limit = gain * math.sqrt(3.0 / float(fan_in))
  142. out_var = _C_ops.uniform(
  143. var.shape,
  144. out_dtype,
  145. -limit,
  146. limit,
  147. self._seed,
  148. _current_expected_place(),
  149. )
  150. else:
  151. gain = calculate_gain(self._nonlinearity, self._negative_slope)
  152. std = gain / math.sqrt(float(fan_in))
  153. place = _current_expected_place()
  154. out_var = _C_ops.gaussian(
  155. out_var.shape, 0.0, std, self._seed, out_dtype, place
  156. )
  157. if (
  158. var.dtype in (core.DataType.FLOAT16, core.DataType.BFLOAT16)
  159. and not self._uniform
  160. ):
  161. return _C_ops.cast(out_var, var.dtype)
  162. return out_var
  163. else:
  164. if self._uniform:
  165. gain = calculate_gain(self._nonlinearity, self._negative_slope)
  166. limit = gain * math.sqrt(3.0 / float(fan_in))
  167. op = block.append_op(
  168. type="uniform_random",
  169. inputs={},
  170. outputs={"Out": out_var},
  171. attrs={
  172. "shape": out_var.shape,
  173. "dtype": int(out_dtype),
  174. "min": -limit,
  175. "max": limit,
  176. "seed": self._seed,
  177. },
  178. stop_gradient=True,
  179. )
  180. else:
  181. gain = calculate_gain(self._nonlinearity, self._negative_slope)
  182. std = gain / math.sqrt(float(fan_in))
  183. op = block.append_op(
  184. type="gaussian_random",
  185. outputs={"Out": out_var},
  186. attrs={
  187. "shape": out_var.shape,
  188. "dtype": int(out_dtype),
  189. "mean": 0.0,
  190. "std": std,
  191. "seed": self._seed,
  192. },
  193. stop_gradient=True,
  194. )
  195. if var.dtype == core.VarDesc.VarType.FP16 or (
  196. var.dtype == core.VarDesc.VarType.BF16 and not self._uniform
  197. ):
  198. block.append_op(
  199. type="cast",
  200. inputs={"X": out_var},
  201. outputs={"Out": var},
  202. attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype},
  203. )
  204. var.op = op
  205. return op
  206. class KaimingNormal(MSRAInitializer):
  207. r"""Implements the Kaiming Normal initializer
  208. This class implements the weight initialization from the paper
  209. `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
  210. ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
  211. by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
  212. robust initialization method that particularly considers the rectifier
  213. nonlinearities.
  214. In case of Normal distribution, the mean is 0 and the standard deviation
  215. is
  216. .. math::
  217. \frac{gain}{\sqrt{{fan\_in}}}
  218. Args:
  219. fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automatically. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None.
  220. negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0.
  221. nonlinearity(str, optional): the non-linear function. Default is relu.
  222. Note:
  223. It is recommended to set fan_in to None for most cases.
  224. Examples:
  225. .. code-block:: python
  226. >>> import paddle
  227. >>> import paddle.nn as nn
  228. >>> linear = nn.Linear(2, 4, weight_attr=nn.initializer.KaimingNormal())
  229. >>> data = paddle.rand([30, 10, 2], dtype='float32')
  230. >>> res = linear(data)
  231. """
  232. def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
  233. super().__init__(
  234. uniform=False,
  235. fan_in=fan_in,
  236. seed=0,
  237. negative_slope=negative_slope,
  238. nonlinearity=nonlinearity,
  239. )
  240. class KaimingUniform(MSRAInitializer):
  241. r"""Implements the Kaiming Uniform initializer
  242. This class implements the weight initialization from the paper
  243. `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
  244. ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
  245. by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
  246. robust initialization method that particularly considers the rectifier
  247. nonlinearities.
  248. In case of Uniform distribution, the range is [-x, x], where
  249. .. math::
  250. x = gain \times \sqrt{\frac{3}{fan\_in}}
  251. Args:
  252. fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None.
  253. negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0.
  254. nonlinearity(str, optional): the non-linear function. Default is relu.
  255. Note:
  256. It is recommended to set fan_in to None for most cases.
  257. Examples:
  258. .. code-block:: python
  259. >>> import paddle
  260. >>> import paddle.nn as nn
  261. >>> linear = nn.Linear(2, 4, weight_attr=nn.initializer.KaimingUniform())
  262. >>> data = paddle.rand([30, 10, 2], dtype='float32')
  263. >>> res = linear(data)
  264. """
  265. def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
  266. super().__init__(
  267. uniform=True,
  268. fan_in=fan_in,
  269. seed=0,
  270. negative_slope=negative_slope,
  271. nonlinearity=nonlinearity,
  272. )