adadelta.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import warnings
  15. from paddle import _C_ops
  16. from paddle.base.framework import in_dynamic_or_pir_mode
  17. from ..base import framework
  18. from ..base.dygraph import no_grad
  19. from .optimizer import Optimizer
  20. __all__ = []
  21. class Adadelta(Optimizer):
  22. r"""
  23. **Notes: This API does not support sparse parameter optimization.**
  24. Adadelta Optimizer. Please refer to this for details:
  25. `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.
  26. The update is done as follows:
  27. .. math::
  28. E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2
  29. learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }
  30. E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2
  31. Args:
  32. learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
  33. It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
  34. epsilon (float): a small float number for numeric stability. Default 1.0e-6.
  35. rho (float): a floating point value indicating the decay rate. Default 0.95.
  36. parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
  37. This parameter is required in dygraph mode. And you can specify different options for \
  38. different parameter groups such as the learning rate, weight decay, etc, \
  39. then the parameters are list of dict. Note that the learning_rate in paramter groups \
  40. represents the scale of base learning_rate. \
  41. The default value is None in static graph mode, at this time all parameters will be updated.
  42. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
  43. It canbe a float value as coeff of L2 regularization or \
  44. :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
  45. If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
  46. the regularization setting here in optimizer will be ignored for this parameter. \
  47. Otherwise, the regularization setting here in optimizer will take effect. \
  48. Default None, meaning there is no regularization.
  49. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
  50. some derived class of ``GradientClipBase`` . There are three cliping strategies
  51. ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
  52. :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
  53. name (str, optional): The default value is None. Normally there is no need for user
  54. to set this property. For more information, please refer to
  55. :ref:`api_guide_Name` .
  56. Examples:
  57. .. code-block:: python
  58. >>> import paddle
  59. >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
  60. >>> linear = paddle.nn.Linear(10, 10)
  61. >>> out = linear(inp)
  62. >>> loss = paddle.mean(out)
  63. >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
  64. >>> beta2 = paddle.to_tensor([0.99], dtype="float32")
  65. >>> adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
  66. >>> back = out.backward()
  67. >>> adadelta.step()
  68. >>> adadelta.clear_grad()
  69. >>> # Note that the learning_rate of linear_2 is 0.01.
  70. >>> linear_1 = paddle.nn.Linear(10, 10)
  71. >>> linear_2 = paddle.nn.Linear(10, 10)
  72. >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
  73. >>> out = linear_1(inp)
  74. >>> out = linear_2(out)
  75. >>> loss = paddle.mean(out)
  76. >>> adadelta = paddle.optimizer.Adadelta(
  77. ... learning_rate=0.1,
  78. ... parameters=[{
  79. ... 'params': linear_1.parameters()
  80. ... }, {
  81. ... 'params': linear_2.parameters(),
  82. ... 'weight_decay': 0.001,
  83. ... 'learning_rate': 0.1,
  84. ... }],
  85. ... weight_decay=0.01)
  86. >>> out.backward()
  87. >>> adadelta.step()
  88. >>> adadelta.clear_grad()
  89. """
  90. _avg_squared_grad_acc_str = "_avg_squared_grad"
  91. _avg_squared_update_acc_str = "_avg_squared_update"
  92. def __init__(
  93. self,
  94. learning_rate=0.001,
  95. epsilon=1.0e-6,
  96. rho=0.95,
  97. parameters=None,
  98. weight_decay=None,
  99. grad_clip=None,
  100. name=None,
  101. ):
  102. if learning_rate is None:
  103. raise ValueError("learning_rate is not set.")
  104. if epsilon is None:
  105. raise ValueError("epsilon is not set.")
  106. if rho is None:
  107. raise ValueError("rho is not set.")
  108. super().__init__(
  109. learning_rate=learning_rate,
  110. parameters=parameters,
  111. weight_decay=weight_decay,
  112. grad_clip=grad_clip,
  113. name=name,
  114. )
  115. self._multi_precision = False
  116. self._master_weights = {}
  117. self.type = "adadelta"
  118. self._epsilon = epsilon
  119. self._rho = rho
  120. self._default_dict = {
  121. 'epsilon': epsilon,
  122. 'rho': rho,
  123. }
  124. def _create_accumulators(self, block, parameters):
  125. if not isinstance(block, framework.Block):
  126. raise TypeError("block is not instance of framework.Block.")
  127. if isinstance(parameters, dict):
  128. parameters = parameters.get('params')
  129. for p in parameters:
  130. if p.name in self._already_create_accumulator:
  131. continue
  132. if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
  133. master_p = self._create_master_weight(p)
  134. self._add_accumulator(self._avg_squared_grad_acc_str, master_p)
  135. self._add_accumulator(
  136. self._avg_squared_update_acc_str, master_p
  137. )
  138. self._already_create_accumulator.add(p.name)
  139. continue
  140. if (
  141. self._is_dtype_fp16_or_bf16(p.dtype)
  142. and not self._multi_precision
  143. ):
  144. warnings.warn(
  145. "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
  146. "Consider using multi_precision=True option of the Lars optimizer."
  147. )
  148. self._add_accumulator(self._avg_squared_grad_acc_str, p)
  149. self._add_accumulator(self._avg_squared_update_acc_str, p)
  150. self._already_create_accumulator.add(p.name)
  151. def _append_optimize_op(self, block, param_and_grad):
  152. if isinstance(param_and_grad, dict):
  153. param_and_grad = self._update_param_group(param_and_grad)
  154. avg_squared_grad_acc = self._get_accumulator_master(
  155. self._avg_squared_grad_acc_str, param_and_grad[0]
  156. )
  157. avg_squared_update_acc = self._get_accumulator_master(
  158. self._avg_squared_update_acc_str, param_and_grad[0]
  159. )
  160. find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
  161. param_and_grad[0].dtype
  162. )
  163. master_weight = (
  164. self._master_weights[param_and_grad[0].name]
  165. if find_master
  166. else None
  167. )
  168. if in_dynamic_or_pir_mode():
  169. with no_grad():
  170. _C_ops.adadelta_(
  171. param_and_grad[0],
  172. param_and_grad[1],
  173. avg_squared_grad_acc,
  174. avg_squared_update_acc,
  175. self._create_param_lr(param_and_grad),
  176. master_weight,
  177. self._rho,
  178. self._epsilon,
  179. find_master,
  180. )
  181. return None
  182. else:
  183. if not isinstance(block, framework.Block):
  184. raise TypeError("block is not instance of framework.Block.")
  185. # Create the adadelta optimizer op
  186. inputs = {
  187. "Param": param_and_grad[0],
  188. "Grad": param_and_grad[1],
  189. "AvgSquaredGrad": avg_squared_grad_acc,
  190. "AvgSquaredUpdate": avg_squared_update_acc,
  191. "LearningRate": self._create_param_lr(param_and_grad),
  192. }
  193. outputs = {
  194. "ParamOut": param_and_grad[0],
  195. "AvgSquaredGradOut": avg_squared_grad_acc,
  196. "AvgSquaredUpdateOut": avg_squared_update_acc,
  197. }
  198. if find_master:
  199. inputs["MasterParam"] = master_weight
  200. outputs["MasterParamOut"] = master_weight
  201. adadelta_op = block.append_op(
  202. type=self.type,
  203. inputs=inputs,
  204. outputs=outputs,
  205. attrs={
  206. "epsilon": self._epsilon,
  207. "rho": self._rho,
  208. "multi_precision": find_master,
  209. },
  210. stop_gradient=True,
  211. )
  212. return adadelta_op
  213. def _update_param_group(self, parameters):
  214. self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
  215. self._rho = parameters.get('rho', self._default_dict['rho'])
  216. parameters = parameters.get('params')
  217. return parameters