sgd.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import warnings
  15. from paddle import _C_ops, pir
  16. from ..base import framework
  17. from ..base.dygraph import no_grad
  18. from ..base.framework import in_dynamic_or_pir_mode
  19. from .optimizer import Optimizer
  20. __all__ = []
  21. class SGD(Optimizer):
  22. r"""
  23. Optimizer of the stochastic gradient descent algorithm.
  24. .. math::
  25. param\_out = param - learning\_rate * grad
  26. Parameters:
  27. learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
  28. It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
  29. parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
  30. This parameter is required in dygraph mode. \
  31. The default value is None in static graph mode, at this time all parameters will be updated.
  32. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
  33. It can be a float value as coeff of L2 regularization or \
  34. :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
  35. If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
  36. the regularization setting here in optimizer will be ignored for this parameter. \
  37. Otherwise, the regularization setting here in optimizer will take effect. \
  38. Default None, meaning there is no regularization.
  39. grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
  40. some derived class of ``GradientClipBase`` . There are three clipping strategies
  41. ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
  42. :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
  43. name (str, optional): The default value is None. Normally there is no need for user
  44. to set this property. For more information, please refer to
  45. :ref:`api_guide_Name` .
  46. Examples:
  47. .. code-block:: python
  48. >>> import paddle
  49. >>> inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
  50. >>> linear = paddle.nn.Linear(10, 10)
  51. >>> inp = paddle.to_tensor(inp)
  52. >>> out = linear(inp)
  53. >>> loss = paddle.mean(out)
  54. >>> sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
  55. >>> out.backward()
  56. >>> sgd.step()
  57. >>> sgd.clear_grad()
  58. """
  59. def __init__(
  60. self,
  61. learning_rate=0.001,
  62. parameters=None,
  63. weight_decay=None,
  64. grad_clip=None,
  65. multi_precision=False,
  66. name=None,
  67. ):
  68. if learning_rate is None:
  69. raise ValueError("learning_rate is not set")
  70. super().__init__(
  71. learning_rate=learning_rate,
  72. parameters=parameters,
  73. weight_decay=weight_decay,
  74. grad_clip=grad_clip,
  75. name=name,
  76. )
  77. self.type = "sgd"
  78. self._multi_precision = multi_precision
  79. self._master_weights = {}
  80. def _create_accumulators(self, block, parameters):
  81. assert isinstance(block, (framework.Block, pir.Block))
  82. if isinstance(parameters, dict):
  83. parameters = self._update_param_group(parameters)
  84. # Create accumulator tensors for first and second moments
  85. for p in parameters:
  86. if p.name in self._already_create_accumulator:
  87. continue
  88. if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
  89. master_p = self._create_master_weight(p)
  90. self._already_create_accumulator.add(p.name)
  91. continue
  92. if (
  93. self._is_dtype_fp16_or_bf16(p.dtype)
  94. and not self._multi_precision
  95. ):
  96. warnings.warn(
  97. "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
  98. "Consider using multi_precision=True option of the Adam optimizer."
  99. )
  100. @no_grad
  101. def _append_optimize_op(self, block, param_and_grad):
  102. if isinstance(param_and_grad, dict):
  103. param_and_grad = self._update_param_group(param_and_grad)
  104. find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
  105. param_and_grad[0].dtype
  106. )
  107. master_weight = (
  108. self._master_weights[param_and_grad[0].name]
  109. if find_master
  110. else None
  111. )
  112. lr = self._create_param_lr(param_and_grad)
  113. if in_dynamic_or_pir_mode():
  114. _C_ops.sgd_(
  115. param_and_grad[0],
  116. lr,
  117. param_and_grad[1],
  118. master_weight,
  119. find_master,
  120. )
  121. return None
  122. else:
  123. assert isinstance(block, framework.Block)
  124. # create the optimize op
  125. inputs = {
  126. "Param": param_and_grad[0],
  127. "Grad": param_and_grad[1],
  128. "LearningRate": lr,
  129. }
  130. outputs = {"ParamOut": param_and_grad[0]}
  131. attrs = {"multi_precision": find_master}
  132. if find_master:
  133. inputs["MasterParam"] = master_weight
  134. outputs["MasterParamOut"] = master_weight
  135. sgd_op = block.append_op(
  136. type=self.type,
  137. inputs=inputs,
  138. outputs=outputs,
  139. attrs=attrs,
  140. stop_gradient=True,
  141. )
  142. return sgd_op
  143. def _update_param_group(self, parameters):
  144. parameters = parameters.get('params')
  145. return parameters