rprop.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import warnings
  15. from paddle import _C_ops
  16. from paddle.tensor.creation import to_tensor
  17. from ..base import framework
  18. from ..base.dygraph import no_grad
  19. from ..base.framework import in_dynamic_or_pir_mode
  20. from .optimizer import Optimizer
  21. __all__ = []
  22. class Rprop(Optimizer):
  23. r"""
  24. **Notes: This optimizer is only applicable to full-batch training.**
  25. Optimizer of the Rprop algorithm.Please refer to this for details:
  26. `A direct adaptive method for faster backpropagation learning : The RPROP algorithm <https://ieeexplore.ieee.org/document/298623>`_.
  27. .. math::
  28. \begin{aligned}
  29. &\hspace{0mm} For\ all\ weights\ and\ biases\{ \\
  30. &\hspace{5mm} \textbf{if} \: (\frac{\partial E}{\partial w_{ij}}(t-1)*\frac{\partial E}{\partial w_{ij}}(t)> 0)\ \textbf{then} \: \{ \\
  31. &\hspace{10mm} learning\_rate_{ij}(t)=\mathrm{minimum}(learning\_rate_{ij}(t-1)*\eta^{+},learning\_rate_{max}) \\
  32. &\hspace{10mm} \Delta w_{ij}(t)=-sign(\frac{\partial E}{\partial w_{ij}}(t))*learning\_rate_{ij}(t) \\
  33. &\hspace{10mm} w_{ij}(t+1)=w_{ij}(t)+\Delta w_{ij}(t) \\
  34. &\hspace{5mm} \} \\
  35. &\hspace{5mm} \textbf{else if} \: (\frac{\partial E}{\partial w_{ij}}(t-1)*\frac{\partial E}{\partial w_{ij}}(t)< 0)\ \textbf{then} \: \{ \\
  36. &\hspace{10mm} learning\_rate_{ij}(t)=\mathrm{maximum}(learning\_rate_{ij}(t-1)*\eta^{-},learning\_rate_{min}) \\
  37. &\hspace{10mm} w_{ij}(t+1)=w_{ij}(t) \\
  38. &\hspace{10mm} \frac{\partial E}{\partial w_{ij}}(t)=0 \\
  39. &\hspace{5mm} \} \\
  40. &\hspace{5mm} \textbf{else if} \: (\frac{\partial E}{\partial w_{ij}}(t-1)*\frac{\partial E}{\partial w_{ij}}(t)= 0)\ \textbf{then} \: \{ \\
  41. &\hspace{10mm} \Delta w_{ij}(t)=-sign(\frac{\partial E}{\partial w_{ij}}(t))*learning\_rate_{ij}(t) \\
  42. &\hspace{10mm} w_{ij}(t+1)=w_{ij}(t)+\Delta w_{ij}(t) \\
  43. &\hspace{5mm} \} \\
  44. &\hspace{0mm} \} \\
  45. \end{aligned}
  46. Parameters:
  47. learning_rate (float|Tensor|LearningRateDecay, optional): The initial learning rate used to update ``Parameter``.
  48. It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
  49. learning_rate_range (tuple, optional): The range of learning rate.
  50. Learning rate cannot be smaller than the first element of the tuple;
  51. learning rate cannot be larger than the second element of the tuple.
  52. The default value is (1e-5, 50).
  53. parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
  54. This parameter is required in dygraph mode.
  55. The default value is None in static graph mode, at this time all parameters will be updated.
  56. etas (tuple, optional): Tuple used to update learning rate.
  57. The first element of the tuple is the multiplicative decrease factor;
  58. the second element of the tuple is the multiplicative increase factor.
  59. The default value is (0.5, 1.2).
  60. grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of some derived class of ``GradientClipBase`` .
  61. There are three clipping strategies ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` ).
  62. Default None, meaning there is no gradient clipping.
  63. multi_precision (bool, optional): In mixed precision training scenarios based on GPU,
  64. this parameter is mainly used to ensure the numerical stability of gradient updates.
  65. When it is set to True, the optimizer will save a backup of FP32 type parameters with an equal value for FP16 type parameters.
  66. When updating gradients, first increase the gradient type to FP32, and then assign it to the FP32 type parameter backup.
  67. Finally, the updated FP32 type value will be converted to FP16 type first,
  68. and then assigned to the actual FP16 type parameters participating in the calculation.
  69. The default value is False.
  70. name (str, optional): The default value is None. Normally there is no need for user to set this property.
  71. For more information, please refer to :ref:`api_guide_Name` .
  72. Examples:
  73. .. code-block:: python
  74. >>> import paddle
  75. >>> inp = paddle.uniform(min=-0.1, max=0.1, shape=[1, 100], dtype='float32')
  76. >>> linear = paddle.nn.Linear(100, 10)
  77. >>> inp = paddle.to_tensor(inp)
  78. >>> out = linear(inp)
  79. >>> loss = paddle.mean(out)
  80. >>> rprop = paddle.optimizer.Rprop(learning_rate=0.001, learning_rate_range=(0.0001,0.1), parameters=linear.parameters(), etas=(0.5,1.2))
  81. >>> out.backward()
  82. >>> rprop.step()
  83. >>> rprop.clear_grad()
  84. """
  85. _prevs_acc_str = "prevs"
  86. _learning_rates_acc_str = "learning_rates"
  87. def __init__(
  88. self,
  89. learning_rate=0.001,
  90. learning_rate_range=(1e-5, 50),
  91. parameters=None,
  92. etas=(0.5, 1.2),
  93. grad_clip=None,
  94. multi_precision=False,
  95. name=None,
  96. ):
  97. if learning_rate is None:
  98. raise ValueError("learning_rate is not set")
  99. if (
  100. not 0.0
  101. < learning_rate_range[0]
  102. <= learning_rate
  103. <= learning_rate_range[1]
  104. ):
  105. raise ValueError(
  106. "'0.0 < learning_rate_range[0] <= learning_rate <= learning_rate_range[1]' must be true"
  107. )
  108. if not 0.0 < etas[0] < 1.0 < etas[1]:
  109. raise ValueError("'0.0 < etas[0] < 1.0 < etas[1]' must be true")
  110. super().__init__(
  111. learning_rate=learning_rate,
  112. parameters=parameters,
  113. weight_decay=0.0,
  114. grad_clip=grad_clip,
  115. name=name,
  116. )
  117. self.type = "rprop"
  118. self._initial_learning_rate = learning_rate
  119. self._multi_precision = multi_precision
  120. self._master_weights = {}
  121. self._learning_rate_range = [learning_rate_range]
  122. self._etas = [etas]
  123. self._sign = True
  124. def _to_tensor(self, block, dtype):
  125. assert isinstance(block, framework.Block)
  126. self._learning_rate_range = to_tensor(
  127. self._learning_rate_range, dtype=dtype
  128. )
  129. self._etas = to_tensor(self._etas, dtype=dtype)
  130. def _create_accumulators(self, block, parameters):
  131. assert isinstance(block, framework.Block)
  132. if isinstance(parameters, dict):
  133. parameters = self._update_param_group(parameters)
  134. # Create accumulator tensors for first and second moments
  135. for p in parameters:
  136. if p.name in self._already_create_accumulator:
  137. continue
  138. if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
  139. master_p = self._create_master_weight(p)
  140. self._add_accumulator(
  141. self._prevs_acc_str,
  142. master_p,
  143. p.dtype,
  144. 0,
  145. )
  146. self._add_accumulator(
  147. self._learning_rates_acc_str,
  148. master_p,
  149. p.dtype,
  150. self._initial_learning_rate,
  151. )
  152. self._already_create_accumulator.add(p.name)
  153. continue
  154. if (
  155. self._is_dtype_fp16_or_bf16(p.dtype)
  156. and not self._multi_precision
  157. ):
  158. warnings.warn(
  159. "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
  160. "Consider using multi_precision=True option of the Adam optimizer."
  161. )
  162. self._add_accumulator(
  163. self._prevs_acc_str,
  164. p,
  165. p.dtype,
  166. 0,
  167. )
  168. self._add_accumulator(
  169. self._learning_rates_acc_str,
  170. p,
  171. p.dtype,
  172. fill_value=self._initial_learning_rate,
  173. )
  174. self._already_create_accumulator.add(p.name)
  175. @no_grad
  176. def _append_optimize_op(self, block, param_and_grad):
  177. if isinstance(param_and_grad, dict):
  178. param_and_grad = self._update_param_group(param_and_grad)
  179. if self._sign:
  180. self._to_tensor(block, param_and_grad[0][0].dtype)
  181. self._sign = False
  182. prevs = self._get_accumulator_master(
  183. self._prevs_acc_str, param_and_grad[0]
  184. )
  185. learning_rates = self._get_accumulator_master(
  186. self._learning_rates_acc_str, param_and_grad[0]
  187. )
  188. find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
  189. param_and_grad[0].dtype
  190. )
  191. master_weight = (
  192. self._master_weights[param_and_grad[0].name]
  193. if find_master
  194. else None
  195. )
  196. if in_dynamic_or_pir_mode():
  197. _C_ops.rprop_(
  198. param_and_grad[0],
  199. param_and_grad[1],
  200. prevs,
  201. learning_rates,
  202. master_weight,
  203. self._learning_rate_range,
  204. self._etas,
  205. find_master,
  206. )
  207. return None
  208. else:
  209. assert isinstance(block, framework.Block)
  210. # create the optimize op
  211. inputs = {
  212. "param": param_and_grad[0],
  213. "grad": param_and_grad[1],
  214. "prev": prevs,
  215. "learning_rate": learning_rates,
  216. "learning_rate_range": self._learning_rate_range,
  217. "etas": self._etas,
  218. }
  219. outputs = {
  220. "param_out": param_and_grad[0],
  221. "prev_out": prevs,
  222. "learning_rate_out": learning_rates,
  223. }
  224. attrs = {"multi_precision": find_master}
  225. if find_master:
  226. inputs["master_param"] = master_weight
  227. outputs["master_param_out"] = master_weight
  228. rprop_op = block.append_op(
  229. type=self.type,
  230. inputs=inputs,
  231. outputs=outputs,
  232. attrs=attrs,
  233. stop_gradient=True,
  234. )
  235. return rprop_op
  236. def _update_param_group(self, parameters):
  237. parameters = parameters.get('params')
  238. return parameters