| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316 |
- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import warnings
- from paddle import _C_ops
- from ..base import framework
- from ..base.framework import in_dynamic_or_pir_mode
- from .optimizer import Optimizer
- __all__ = []
class RMSProp(Optimizer):
    r"""
    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
    rate method. The original slides proposed RMSProp: Slide 29 of
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .

    The original equation is as follows:

    .. math::

        r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2

        w & = w - \frac{\eta} {\sqrt{r(w,t) + \epsilon}} \nabla Q_{i}(w)

    The first equation calculates moving average of the squared gradient for
    each weight. Then dividing the gradient by :math:`\sqrt{r(w,t)}`.

    In some cases, adding a momentum term :math:`\beta` is beneficial.
    In our implementation, Nesterov momentum is used:

    .. math::

        r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2

        v(w, t) & = \beta v(w, t-1) + \frac{\eta} {\sqrt{r(w,t) +
            \epsilon}} \nabla Q_{i}(w)

        w & = w - v(w, t)

    if centered is True:

    .. math::

        r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2

        g(w, t) & = \rho g(w, t-1) + (1 - \rho)\nabla Q_{i}(w)

        v(w, t) & = \beta v(w, t-1) + \frac{\eta} {\sqrt{r(w,t) - (g(w, t))^2 +
            \epsilon}} \nabla Q_{i}(w)

        w & = w - v(w, t)

    where, :math:`\rho` is a hyperparameter and typical values are 0.9, 0.95
    and so on. :math:`\beta` is the momentum term. :math:`\epsilon` is a
    smoothing term to avoid division by zero, usually set somewhere in range
    from 1e-4 to 1e-8.

    Parameters:
        learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
            It can be a float value or a LRScheduler.
        rho(float, optional): rho is :math:`\rho` in equation, default is 0.95.
        epsilon(float, optional): :math:`\epsilon` in equation is smoothing term to
            avoid division by zero, default is 1e-6.
        momentum(float, optional): :math:`\beta` in equation is the momentum term,
            default is 0.0.
        centered(bool, optional): If True, gradients are normalized by the estimated variance of
            the gradient; if False, by the uncentered second moment. Setting this to
            True may help with training, but is slightly more expensive in terms of
            computation and memory. Defaults to False.
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
            This parameter is required in dygraph mode. And you can specify different options for
            different parameter groups such as the learning rate, weight decay, etc,
            then the parameters are list of dict. Note that the learning_rate in parameter groups
            represents the scale of base learning_rate.
            The default value is None in static graph mode, at this time all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization.
            It can be a float value as coeff of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already,
            the regularization setting here in optimizer will be ignored for this parameter.
            Otherwise, the regularization setting here in optimizer will take effect.
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information.
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.rand([10,10], dtype="float32")
            >>> linear = paddle.nn.Linear(10, 10)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)

            >>> rmsprop = paddle.optimizer.RMSProp(learning_rate=0.1,
            ...                 parameters=linear.parameters(),
            ...                            weight_decay=0.01)
            >>> out.backward()
            >>> rmsprop.step()
            >>> rmsprop.clear_grad()

            >>> # Note that the learning_rate of linear_2 is 0.01.
            >>> linear_1 = paddle.nn.Linear(10, 10)
            >>> linear_2 = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear_1(inp)
            >>> out = linear_2(out)
            >>> loss = paddle.mean(out)
            >>> rmsprop = paddle.optimizer.RMSProp(
            ...     learning_rate=0.1,
            ...     parameters=[{
            ...         'params': linear_1.parameters()
            ...     }, {
            ...         'params': linear_2.parameters(),
            ...         'weight_decay': 0.001,
            ...         'learning_rate': 0.1
            ...     }],
            ...     weight_decay=0.01
            ... )
            >>> out.backward()
            >>> rmsprop.step()
            >>> rmsprop.clear_grad()
    """

    # Names under which the per-parameter accumulators are registered.
    _momentum_acc_str = "momentum"
    _mean_square_acc_str = "mean_square"
    _mean_grad_acc_str = "mean_grad"

    def __init__(
        self,
        learning_rate,
        rho=0.95,
        epsilon=1.0e-6,
        momentum=0.0,
        centered=False,
        parameters=None,
        weight_decay=None,
        grad_clip=None,
        name=None,
    ):
        """Validate the hyper-parameters and initialise the optimizer state.

        Raises:
            ValueError: If ``learning_rate``, ``rho``, ``epsilon`` or
                ``momentum`` is None, or if ``epsilon``, ``momentum`` or
                ``rho`` is not ``>= 0``.
        """
        # All "is None" checks run before any range check, in this order.
        required = (
            ("learning_rate", learning_rate),
            ("rho", rho),
            ("epsilon", epsilon),
            ("momentum", momentum),
        )
        for arg_name, arg_value in required:
            if arg_value is None:
                raise ValueError(f"{arg_name} is not set.")

        # ``not 0.0 <= x`` (rather than ``x < 0.0``) also rejects NaN.
        non_negative = (
            ("epsilon", epsilon),
            ("momentum", momentum),
            ("rho", rho),
        )
        for arg_name, arg_value in non_negative:
            if not 0.0 <= arg_value:
                raise ValueError(
                    f"Invalid value of {arg_name}, expect {arg_name} >= 0."
                )

        super().__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name,
        )

        self.type = "rmsprop"
        self._rho = rho
        self._epsilon = epsilon
        self._momentum = momentum
        self._centered = centered
        # Multi-precision is off by default; nothing in this class turns it on.
        self._multi_precision = False
        self._master_weights = {}
        # Fallback values used by ``_update_param_group`` when a parameter
        # group does not override a hyper-parameter.
        self._default_dict = {
            'rho': rho,
            'epsilon': epsilon,
            'momentum': momentum,
            'centered': centered,
        }

    def _create_accumulators(self, block, parameters):
        """Create the RMSProp accumulators for every not-yet-seen parameter.

        For each parameter the momentum, mean-square and mean-gradient
        accumulators are added (all three, regardless of ``centered``). When
        multi-precision is enabled and the parameter is fp16/bf16, they are
        attached to the master weight returned by ``_create_master_weight``
        instead of the parameter itself.

        Args:
            block: Must be a ``framework.Block``.
            parameters: Iterable of parameters, or a parameter-group dict
                whose ``'params'`` entry holds them.

        Raises:
            TypeError: If ``block`` is not a ``framework.Block``.
        """
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        if isinstance(parameters, dict):
            parameters = parameters.get('params')

        accumulator_names = (
            self._momentum_acc_str,
            self._mean_square_acc_str,
            self._mean_grad_acc_str,
        )

        for p in parameters:
            if p.name in self._already_create_accumulator:
                continue

            is_low_precision = self._is_dtype_fp16_or_bf16(p.dtype)
            if self._multi_precision and is_low_precision:
                # Keep the optimizer state on the master copy of the weight.
                acc_owner = self._create_master_weight(p)
            else:
                if is_low_precision:
                    warnings.warn(
                        "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                        "Consider enabling multi_precision for the RMSProp optimizer."
                    )
                acc_owner = p

            for acc_name in accumulator_names:
                self._add_accumulator(acc_name, acc_owner)
            self._already_create_accumulator.add(p.name)

    def _append_optimize_op(self, block, param_and_grad):
        """Apply one RMSProp update to a ``(parameter, gradient)`` pair.

        Args:
            block: Must be a ``framework.Block``.
            param_and_grad: An indexable pair ``[param, grad]``, or a
                parameter-group dict, which is first resolved through
                ``_update_param_group`` (this also refreshes the
                hyper-parameters stored on ``self``).

        Returns:
            ``None`` in dynamic or PIR mode, where the update is executed
            through ``_C_ops.rmsprop_``; otherwise the op appended to
            ``block``.

        Raises:
            TypeError: If ``block`` is not a ``framework.Block``.
        """
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        if isinstance(param_and_grad, dict):
            param_and_grad = self._update_param_group(param_and_grad)

        param = param_and_grad[0]

        momentum_acc = self._get_accumulator_master(
            self._momentum_acc_str, param
        )
        mean_square_acc = self._get_accumulator_master(
            self._mean_square_acc_str, param
        )
        mean_grad_acc = self._get_accumulator_master(
            self._mean_grad_acc_str, param
        )

        # A master weight is only looked up for fp16/bf16 parameters when
        # multi-precision is enabled.
        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
            param.dtype
        )
        master_weight = (
            self._master_weights[param.name] if find_master else None
        )

        if in_dynamic_or_pir_mode():
            _C_ops.rmsprop_(
                param,
                mean_square_acc,
                param_and_grad[1],
                momentum_acc,
                self._create_param_lr(param_and_grad),
                mean_grad_acc,
                master_weight,
                self._epsilon,
                self._rho,
                self._momentum,
                self._centered,
                find_master,
            )
            return None

        inputs = {
            "Param": param,
            "Grad": param_and_grad[1],
            "Moment": momentum_acc,
            "MeanSquare": mean_square_acc,
            "MeanGrad": mean_grad_acc,
            "LearningRate": self._create_param_lr(param_and_grad),
        }
        # Outputs alias the inputs: the op updates its state in place.
        outputs = {
            "ParamOut": param,
            "MomentOut": momentum_acc,
            "MeanSquareOut": mean_square_acc,
            "MeanGradOut": mean_grad_acc,
        }

        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight

        rmsprop_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs={
                "epsilon": self._epsilon,
                "decay": self._rho,
                "momentum": self._momentum,
                "centered": self._centered,
            },
            stop_gradient=True,
        )

        return rmsprop_op

    def _update_param_group(self, parameters):
        """Load a parameter group's hyper-parameters and return its params.

        ``epsilon``, ``rho``, ``momentum`` and ``centered`` are read from the
        group dict, falling back to the constructor values, and are written
        onto ``self``; they stay in effect until the next call.

        Args:
            parameters: Parameter-group dict.

        Returns:
            The value stored under ``'params'`` (``None`` if absent).
        """
        defaults = self._default_dict
        self._epsilon = parameters.get('epsilon', defaults['epsilon'])
        self._rho = parameters.get('rho', defaults['rho'])
        self._momentum = parameters.get('momentum', defaults['momentum'])
        self._centered = parameters.get('centered', defaults['centered'])
        return parameters.get('params')
|