adam.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import warnings
  15. from collections import defaultdict
  16. import paddle
  17. from paddle import _C_ops, pir
  18. from paddle.base.libpaddle import DataType
  19. from paddle.pir import Value
  20. from ..base import core, framework
  21. from ..base.dygraph import base as imperative_base
  22. from ..base.framework import (
  23. Variable,
  24. in_dygraph_mode,
  25. in_dynamic_or_pir_mode,
  26. in_pir_mode,
  27. )
  28. from .optimizer import Optimizer
  29. __all__ = []
  30. class Adam(Optimizer):
  31. r"""
  32. The Adam optimizer uses an optimization described at the end
  33. of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
  34. it can dynamically adjust the learning rate of each parameter using
  35. the 1st moment estimates and the 2nd moment estimates of the gradient.
  36. The parameter ``param_out`` update rule with gradient ``grad``:
  37. .. math::
  38. t & = t + 1
  39. moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad
  40. moment\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
  41. learning\_rate & = learning\_rate * \
  42. \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t}
  43. param\_out & = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
  44. Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
  45. Args:
  46. learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
  47. It can be a float value or a LRScheduler. The default value is 0.001.
  48. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
  49. It should be a float number or a 0-D Tensor with shape [] and data type as float32.
  50. The default value is 0.9.
  51. beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
  52. It should be a float number or a 0-D Tensor with shape [] and data type as float32.
  53. The default value is 0.999.
  54. epsilon (float|Tensor, optional): A small float value for numerical stability.
  55. It should be a float number or a 0-D Tensor with shape [] and data type as float32.
  56. The default value is 1e-08.
  57. parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
  58. This parameter is required in dygraph mode. And you can specify different options for
  59. different parameter groups such as the learning rate, weight decay, etc,
  60. then the parameters are list of dict. Note that the learning_rate in parameter groups
  61. represents the scale of base learning_rate.
  62. The default value is None in static graph mode, at this time all parameters will be updated.
  63. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization.
  64. It can be a float value as coeff of L2 regularization or
  65. :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
  66. If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already,
  67. the regularization setting here in optimizer will be ignored for this parameter.
  68. Otherwise, the regularization setting here in optimizer will take effect.
  69. Default None, meaning there is no regularization.
  70. grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
  71. some derived class of ``GradientClipBase`` . There are three clipping strategies
  72. ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
  73. :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
  74. lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
  75. The accumulators are updated at every step. Every element of the two moving-average
  76. is updated in both dense mode and sparse mode. If the size of parameter is very large,
  77. then the update may be very slow. The lazy mode only updates the elements that have
  78. a gradient in the current mini-batch, so it will be much faster. But this mode has
  79. different semantics from the original Adam algorithm and may lead to different results.
  80. The default value is False.
  81. multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
  82. use_multi_tensor (bool, optional): Whether to use multi-tensor strategy to update all parameters at once . Default is false.
  83. name (str, optional): Normally there is no need for user to set this property.
  84. For more information, please refer to :ref:`api_guide_Name`.
  85. The default value is None.
  86. Examples:
  87. .. code-block:: python
  88. :name: code-example1
  89. >>> import paddle
  90. >>> linear = paddle.nn.Linear(10, 10)
  91. >>> inp = paddle.rand([10,10], dtype="float32")
  92. >>> out = linear(inp)
  93. >>> loss = paddle.mean(out)
  94. >>> adam = paddle.optimizer.Adam(learning_rate=0.1,
  95. ... parameters=linear.parameters())
  96. >>> loss.backward()
  97. >>> adam.step()
  98. >>> adam.clear_grad()
  99. .. code-block:: python
  100. :name: code-example2
  101. >>> # Adam with beta1/beta2 as Tensor and weight_decay as float
  102. >>> import paddle
  103. >>> linear = paddle.nn.Linear(10, 10)
  104. >>> inp = paddle.rand([10,10], dtype="float32")
  105. >>> out = linear(inp)
  106. >>> loss = paddle.mean(out)
  107. >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
  108. >>> beta2 = paddle.to_tensor([0.99], dtype="float32")
  109. >>> adam = paddle.optimizer.Adam(learning_rate=0.1,
  110. ... parameters=linear.parameters(),
  111. ... beta1=beta1,
  112. ... beta2=beta2,
  113. ... weight_decay=0.01)
  114. >>> loss.backward()
  115. >>> adam.step()
  116. >>> adam.clear_grad()
  117. >>> # Note that the learning_rate of linear_2 is 0.01.
  118. >>> linear_1 = paddle.nn.Linear(10, 10)
  119. >>> linear_2 = paddle.nn.Linear(10, 10)
  120. >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
  121. >>> out = linear_1(inp)
  122. >>> out = linear_2(out)
  123. >>> loss = paddle.mean(out)
  124. >>> adam = paddle.optimizer.Adam(
  125. ... learning_rate=0.1,
  126. ... parameters=[{
  127. ... 'params': linear_1.parameters()
  128. ... }, {
  129. ... 'params': linear_2.parameters(),
  130. ... 'weight_decay': 0.001,
  131. ... 'learning_rate': 0.1,
  132. ... 'beta1': 0.8
  133. ... }],
  134. ... weight_decay=0.01,
  135. ... beta1=0.9)
  136. >>> loss.backward()
  137. >>> adam.step()
  138. >>> adam.clear_grad()
  139. """
  140. _moment1_acc_str = "moment1"
  141. _moment2_acc_str = "moment2"
  142. _beta1_pow_acc_str = "beta1_pow_acc"
  143. _beta2_pow_acc_str = "beta2_pow_acc"
  def __init__(
      self,
      learning_rate=0.001,
      beta1=0.9,
      beta2=0.999,
      epsilon=1e-8,
      parameters=None,
      weight_decay=None,
      grad_clip=None,
      lazy_mode=False,
      multi_precision=False,
      use_multi_tensor=False,
      name=None,
  ):
      """Validate the hyper-parameters and initialise the optimizer state.

      ``beta1`` and ``beta2`` must lie in ``[0, 1)`` and ``epsilon`` must be
      non-negative. The range checks are skipped for values that are
      ``Variable`` or ``Value`` instances.

      Raises:
          AssertionError: If ``learning_rate``, ``beta1``, ``beta2`` or
              ``epsilon`` is ``None``.
          ValueError: If a non-tensor ``beta1``/``beta2``/``epsilon`` is out
              of range.
      """
      for required in (learning_rate, beta1, beta2, epsilon):
          assert required is not None

      # Range checks only make sense for plain Python numbers.
      tensor_like = (Variable, Value)
      if not isinstance(beta1, tensor_like) and not 0 <= beta1 < 1:
          raise ValueError("Invalid value of beta1, expect beta1 in [0,1).")
      if not isinstance(beta2, tensor_like) and not 0 <= beta2 < 1:
          raise ValueError("Invalid value of beta2, expect beta2 in [0,1).")
      if not isinstance(epsilon, tensor_like) and not 0 <= epsilon:
          raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")

      super().__init__(
          learning_rate=learning_rate,
          parameters=parameters,
          weight_decay=weight_decay,
          grad_clip=grad_clip,
          name=name,
      )

      self.type = "adam"
      self._beta1 = beta1
      self._beta2 = beta2
      self._epsilon = epsilon
      self._lazy_mode = lazy_mode
      self._multi_precision = multi_precision
      self._master_weights = {}
      # Fallback values used by _update_param_group when a parameter group
      # does not override an option.
      self._default_dict = {
          'beta1': beta1,
          'beta2': beta2,
          'epsilon': epsilon,
          'lazy_mode': lazy_mode,
      }

      self._use_multi_tensor = use_multi_tensor
      if self._use_multi_tensor:
          # One container per kind of tensor handed to the merged op.
          for container in (
              '_param_dict',
              '_moment1_dict',
              '_moment2_dict',
              '_beta1_pow_acc_dict',
              '_beta2_pow_acc_dict',
              '_master_weight_dict',
          ):
              setattr(self, container, self._create_multi_tensor_dict())
          self._master_weight_dict['FP32_LODTensor'] = None
  def _add_moments_pows(self, p):
      """Register the four Adam accumulators for parameter ``p``.

      Adds the first and second moment accumulators plus the ``beta1`` and
      ``beta2`` power accumulators. When ``p`` has an fp16/bf16 dtype the
      accumulators are created with a float32 dtype instead.
      """
      acc_dtype = p.dtype
      if self._is_dtype_fp16_or_bf16(acc_dtype):
          acc_dtype = (
              DataType.FLOAT32
              if in_pir_mode()
              else core.VarDesc.VarType.FP32
          )

      for moment_name in (self._moment1_acc_str, self._moment2_acc_str):
          self._add_accumulator(moment_name, p, dtype=acc_dtype)

      # A tensor-valued beta cannot be used as a fill value, so the
      # documented defaults are used for the initial power instead.
      tensor_like = (Variable, Value)
      beta1_init = 0.9 if isinstance(self._beta1, tensor_like) else self._beta1
      beta2_init = (
          0.999 if isinstance(self._beta2, tensor_like) else self._beta2
      )

      for pow_name, initial in (
          (self._beta1_pow_acc_str, beta1_init),
          (self._beta2_pow_acc_str, beta2_init),
      ):
          self._add_accumulator(
              name=pow_name,
              param=p,
              dtype=acc_dtype,
              fill_value=initial,
              shape=[1],
              type=core.VarDesc.VarType.LOD_TENSOR,
              device='cpu',
          )
  def _create_accumulators(self, block, parameters):
      """Create the Adam accumulators for every parameter not handled yet.

      Args:
          block: Must be a ``framework.Block`` or ``paddle.pir.Block``.
          parameters: An iterable of parameters, or a parameter-group dict
              that is first resolved through ``_update_param_group``.

      Parameters whose name is already recorded in
      ``self._already_create_accumulator`` are skipped. For fp16/bf16
      parameters with ``multi_precision`` enabled, the accumulators are
      attached to a master weight created for the parameter. Without
      ``multi_precision`` a warning is issued and the accumulators are
      attached to the parameter itself.
      """
      assert isinstance(block, (framework.Block, paddle.pir.Block))
      if isinstance(parameters, dict):
          parameters = self._update_param_group(parameters)

      # Create accumulator tensors for first and second moments
      for p in parameters:
          if p.name in self._already_create_accumulator:
              continue

          low_precision = self._is_dtype_fp16_or_bf16(p.dtype)
          if low_precision and self._multi_precision:
              master_p = self._create_master_weight(p)
              self._add_moments_pows(master_p)
          else:
              if low_precision:
                  # The trailing space keeps the two sentences from being
                  # glued together by implicit string concatenation.
                  warnings.warn(
                      "Accumulating with FP16 or BF16 in optimizer can lead to poor accuracy or slow convergence. "
                      "Consider using multi_precision=True option of the Adam optimizer."
                  )
              self._add_moments_pows(p)
          self._already_create_accumulator.add(p.name)
  def _append_optimize_op(self, block, param_and_grad):
      """Apply one Adam update to a single ``(param, grad)`` pair.

      Args:
          block: Must be a ``framework.Block`` or ``paddle.pir.Block``.
          param_and_grad: A ``(param, grad)`` pair, or a parameter-group dict
              that is first resolved through ``_update_param_group``.

      Returns:
          ``None`` in dynamic or PIR mode, where ``_C_ops.adam_`` is called
          directly; otherwise the op returned by ``block.append_op``.
      """
      assert isinstance(block, (framework.Block, paddle.pir.Block))
      if isinstance(param_and_grad, dict):
          param_and_grad = self._update_param_group(param_and_grad)

      param = param_and_grad[0]
      moment1, moment2, beta1_pow_acc, beta2_pow_acc = (
          self._get_accumulator_master(acc_name, param)
          for acc_name in (
              self._moment1_acc_str,
              self._moment2_acc_str,
              self._beta1_pow_acc_str,
              self._beta2_pow_acc_str,
          )
      )

      find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
          param.dtype
      )
      master_weight = self._master_weights[param.name] if find_master else None
      lr = self._create_param_lr(param_and_grad)

      # create the adam optimize op
      if in_dynamic_or_pir_mode():
          # Variable-valued betas are read out via item(0) before the call.
          beta1_value = (
              self._beta1.item(0)
              if isinstance(self._beta1, Variable)
              else self._beta1
          )
          beta2_value = (
              self._beta2.item(0)
              if isinstance(self._beta2, Variable)
              else self._beta2
          )
          found_inf = None
          if in_pir_mode():
              found_inf = self._get_auxiliary_var('found_inf')

          _, _, _, _, _, _ = _C_ops.adam_(
              param,
              param_and_grad[1],
              lr,
              moment1,
              moment2,
              beta1_pow_acc,
              beta2_pow_acc,
              master_weight,
              found_inf,
              beta1_value,
              beta2_value,
              self._epsilon,
              self._lazy_mode,
              1000,
              find_master,
              False,
          )
          return None

      # Otherwise: build the op description and append it to the block.
      inputs = {
          "Param": [param],
          "Grad": [param_and_grad[1]],
          "LearningRate": [lr],
          "Moment1": [moment1],
          "Moment2": [moment2],
          "Beta1Pow": [beta1_pow_acc],
          "Beta2Pow": [beta2_pow_acc],
      }

      # Pass found_inf to adam, to skip update for not only param, but also momentum and beta_pow
      found_inf = self._get_auxiliary_var('found_inf')
      if found_inf:
          inputs['SkipUpdate'] = found_inf

      outputs = {
          "ParamOut": [param],
          "Moment1Out": [moment1],
          "Moment2Out": [moment2],
          "Beta1PowOut": [beta1_pow_acc],
          "Beta2PowOut": [beta2_pow_acc],
      }
      attrs = {
          "lazy_mode": self._lazy_mode,
          "min_row_size_to_use_multithread": 1000,
          "multi_precision": find_master,
      }

      # Each hyper-parameter goes in as a tensor input when it is a
      # Variable, and as a plain attribute otherwise.
      for attr_name, input_name, value in (
          ('beta1', 'Beta1Tensor', self._beta1),
          ('beta2', 'Beta2Tensor', self._beta2),
          ('epsilon', 'EpsilonTensor', self._epsilon),
      ):
          if isinstance(value, Variable):
              inputs[input_name] = value
          else:
              attrs[attr_name] = value

      if find_master:
          inputs["MasterParam"] = master_weight
          outputs["MasterParamOut"] = master_weight

      return block.append_op(
          type=self.type,
          inputs=inputs,
          outputs=outputs,
          attrs=attrs,
          stop_gradient=True,
      )
  @imperative_base.no_grad
  @framework.non_static_only
  def step(self):
      """
      Execute the optimizer and update parameters once.

      When the parameter list holds plain parameters, all ``(param, grad)``
      pairs are applied in a single call. When it holds parameter-group
      dicts, each group in ``self._param_groups`` is applied separately
      together with its group-level options.

      Returns:
          None

      Raises:
          RuntimeError: In the non-grouped path, if a gradient is sparse
              while ``self.regularization`` is set.

      Examples:
          .. code-block:: python

              >>> import paddle

              >>> a = paddle.rand([2,13], dtype="float32")
              >>> linear = paddle.nn.Linear(13, 5)
              >>> # This can be any optimizer supported by dygraph.
              >>> adam = paddle.optimizer.Adam(learning_rate = 0.01,
              ...                             parameters = linear.parameters())
              >>> out = linear(a)
              >>> out.backward()
              >>> adam.step()
              >>> adam.clear_grad()
      """
      if paddle.base.dygraph.base.in_to_static_mode():
          self._declarative_step()
          return

      if not isinstance(self._parameter_list[0], dict):
          params_grads = []
          for param in self._parameter_list:
              if param.stop_gradient:
                  continue
              grad_var = param._grad_ivar()
              if grad_var is None:
                  continue
              # The sparse-gradient probe has a different name depending
              # on the execution mode; the guard itself is the same.
              sparse_probe = (
                  "is_selected_rows" if in_dygraph_mode() else "_is_sparse"
              )
              if (
                  hasattr(grad_var, sparse_probe)
                  and getattr(grad_var, sparse_probe)()
                  and self.regularization is not None
              ):
                  raise RuntimeError(
                      "Adam don't support weight_decay with sparse parameters, please set it to None."
                  )
              params_grads.append((param, grad_var))

          self._apply_optimize(
              loss=None,
              startup_program=None,
              params_grads=params_grads,
              param_group_idx=0,
          )
          return

      # optimize parameters in groups
      for idx, param_group in enumerate(self._param_groups):
          params_grads = defaultdict(list)
          for param in param_group['params']:
              if param.stop_gradient:
                  continue
              grad_var = param._grad_ivar()
              if grad_var is not None:
                  params_grads['params'].append((param, grad_var))
          # Carry the group-level options along with the pairs.
          params_grads.update(
              {k: v for k, v in param_group.items() if k != 'params'}
          )
          self._apply_optimize(
              loss=None,
              startup_program=None,
              params_grads=params_grads,
              param_group_idx=idx,
          )
  def _multi_tensor_init(self, target_block, parameters, param_group_idx):
      """
      Group the tensors used by the multi-tensor Adam update by data type.

      After the accumulators are created, each parameter and its moment1,
      moment2, beta1_pow and beta2_pow accumulators (and its master weight
      when ``multi_precision`` is on) are appended to the per-group lists
      under ``'FP32_LODTensor'`` (float32) or ``'FP16_LODTensor'``
      (float16 / bfloat16).

      Args:
          target_block: the block in which the loss tensor is present
          parameters: list of parameter tensors for the optimizer
          param_group_idx: index of the parameter group whose lists are filled

      Raises:
          ValueError: If a parameter is neither float32 nor fp16/bf16.
      """
      self._create_accumulators(target_block, parameters)
      for param in parameters:
          moment1 = self._get_accumulator_master(self._moment1_acc_str, param)
          moment2 = self._get_accumulator_master(self._moment2_acc_str, param)
          beta1_pow_acc = self._get_accumulator_master(
              self._beta1_pow_acc_str, param
          )
          beta2_pow_acc = self._get_accumulator_master(
              self._beta2_pow_acc_str, param
          )

          if param.dtype == paddle.float32:
              key = 'FP32_LODTensor'
          elif self._is_dtype_fp16_or_bf16(param.dtype):
              key = 'FP16_LODTensor'
          else:
              raise ValueError(
                  "Now multi_tensor_adam only support fp32, fp16 or bf16 parameters and grad is LOD_TENSOR."
              )

          self._param_dict[key][param_group_idx].append(param)
          self._moment1_dict[key][param_group_idx].append(moment1)
          self._moment2_dict[key][param_group_idx].append(moment2)
          self._beta1_pow_acc_dict[key][param_group_idx].append(beta1_pow_acc)
          self._beta2_pow_acc_dict[key][param_group_idx].append(beta2_pow_acc)

          if key == 'FP16_LODTensor':
              if self._multi_precision:
                  self._master_weight_dict[key][param_group_idx].append(
                      self._master_weights[param.name]
                  )
              else:
                  # Without multi_precision there are no master weights.
                  self._master_weight_dict[key] = None
  def _append_optimize_multi_tensor_op(
      self,
      target_block,
      parameters_and_grads,
      param_group_idx,
  ):
      """
      For Multi Tensor, append optimize merged_operator to block.

      Gradients and their learning rates are first sorted into the
      'FP32_LODTensor' and 'FP16_LODTensor' buckets. Then, for every bucket
      that has parameters registered for ``param_group_idx`` in
      ``self._param_dict``, the merged Adam update is issued: through
      ``_C_ops.merged_adam_`` in dynamic or PIR mode, otherwise by appending
      a ``merged_adam`` op to ``target_block``.

      Args:
          target_block: must be a ``framework.Block`` or ``pir.Block``.
          parameters_and_grads: either a list of ``(param, grad)`` pairs, or
              a dict whose ``'params'`` entry holds such pairs and whose
              other entries are group-level options.
          param_group_idx: index of the parameter group being updated.
      """
      assert isinstance(target_block, (framework.Block, pir.Block))

      # Per-dtype buckets of gradients and matching learning rates.
      grad_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
      lr_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}

      if isinstance(parameters_and_grads, list):
          if framework.in_dygraph_mode():
              # Dygraph: bucket by the type reported for each param's grad.
              params = [pair[0] for pair in parameters_and_grads]
              grads_types = core.eager.get_grads_types(params)
              for index, tp in enumerate(grads_types):
                  if tp == core.DataType.FLOAT32:
                      grad_dict['FP32_LODTensor'].append(
                          parameters_and_grads[index][1]
                      )
                      lr = self._create_param_lr(parameters_and_grads[index])
                      lr_dict['FP32_LODTensor'].append(lr)
                  elif (
                      tp == core.DataType.FLOAT16
                      or tp == core.DataType.BFLOAT16
                  ):
                      grad_dict['FP16_LODTensor'].append(
                          parameters_and_grads[index][1]
                      )
                      lr = self._create_param_lr(parameters_and_grads[index])
                      lr_dict['FP16_LODTensor'].append(lr)
          elif in_pir_mode():
              # PIR: bucket by param dtype; only dense-tensor grads qualify.
              for param_and_grad in parameters_and_grads:
                  if param_and_grad[1] is None:
                      continue
                  if param_and_grad[0].stop_gradient is False:
                      if (
                          param_and_grad[0].dtype == DataType.FLOAT32
                          and param_and_grad[1].is_dense_tensor_type()
                      ):
                          grad_dict['FP32_LODTensor'].append(
                              param_and_grad[1]
                          )
                          lr = self._create_param_lr(param_and_grad)
                          lr_dict['FP32_LODTensor'].append(lr)
                      elif (
                          self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
                          and param_and_grad[1].is_dense_tensor_type()
                      ):
                          grad_dict['FP16_LODTensor'].append(
                              param_and_grad[1]
                          )
                          lr = self._create_param_lr(param_and_grad)
                          lr_dict['FP16_LODTensor'].append(lr)
          else:
              # Remaining mode: bucket by param dtype; only LOD_TENSOR grads
              # qualify.
              for param_and_grad in parameters_and_grads:
                  if param_and_grad[1] is None:
                      continue
                  if param_and_grad[0].stop_gradient is False:
                      if (
                          param_and_grad[0].dtype == paddle.float32
                          and param_and_grad[1].type
                          == core.VarDesc.VarType.LOD_TENSOR
                      ):
                          grad_dict['FP32_LODTensor'].append(
                              param_and_grad[1]
                          )
                          lr = self._create_param_lr(param_and_grad)
                          lr_dict['FP32_LODTensor'].append(lr)
                      elif (
                          self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
                          and param_and_grad[1].type
                          == core.VarDesc.VarType.LOD_TENSOR
                      ):
                          grad_dict['FP16_LODTensor'].append(
                              param_and_grad[1]
                          )
                          lr = self._create_param_lr(param_and_grad)
                          lr_dict['FP16_LODTensor'].append(lr)
      else:
          # Parameter-group dict: each pair is re-wrapped with the group's
          # options and passed through _update_param_group, which also
          # refreshes self._beta1/_beta2/_epsilon/_lazy_mode for the group.
          for param_and_grad in parameters_and_grads['params']:
              if param_and_grad[1] is None:
                  continue
              if param_and_grad[0].stop_gradient is False:
                  param_grad_dict = {}
                  param_grad_dict['params'] = param_and_grad
                  param_grad_dict.update(
                      {
                          k: v
                          for k, v in parameters_and_grads.items()
                          if k != 'params'
                      }
                  )
                  param_and_grad = self._update_param_group(param_grad_dict)
                  if in_pir_mode():
                      if (
                          param_and_grad[0].dtype == DataType.FLOAT32
                          and param_and_grad[1].is_dense_tensor_type()
                      ):
                          grad_dict['FP32_LODTensor'].append(
                              param_and_grad[1]
                          )
                          lr = self._create_param_lr(param_and_grad)
                          lr_dict['FP32_LODTensor'].append(lr)
                      elif (
                          self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
                          and param_and_grad[1].is_dense_tensor_type()
                      ):
                          grad_dict['FP16_LODTensor'].append(
                              param_and_grad[1]
                          )
                          lr = self._create_param_lr(param_and_grad)
                          lr_dict['FP16_LODTensor'].append(lr)
                  else:
                      if (
                          param_and_grad[0].dtype == paddle.float32
                          and param_and_grad[1].type
                          == core.VarDesc.VarType.LOD_TENSOR
                      ):
                          grad_dict['FP32_LODTensor'].append(
                              param_and_grad[1]
                          )
                          lr = self._create_param_lr(param_and_grad)
                          lr_dict['FP32_LODTensor'].append(lr)
                      elif (
                          self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
                          and param_and_grad[1].type
                          == core.VarDesc.VarType.LOD_TENSOR
                      ):
                          grad_dict['FP16_LODTensor'].append(
                              param_and_grad[1]
                          )
                          lr = self._create_param_lr(param_and_grad)
                          lr_dict['FP16_LODTensor'].append(lr)

      multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
      for key in multi_tensor_list:
          # Skip buckets with no parameters registered for this group.
          if len(self._param_dict[key][param_group_idx]) > 0:
              # Master weights only apply to the FP16/BF16 bucket.
              find_master = self._multi_precision and key == 'FP16_LODTensor'

              # Variable-valued betas are read out via item(0).
              _beta1 = (
                  self._beta1
                  if not isinstance(self._beta1, Variable)
                  else self._beta1.item(0)
              )
              _beta2 = (
                  self._beta2
                  if not isinstance(self._beta2, Variable)
                  else self._beta2.item(0)
              )

              if in_dynamic_or_pir_mode():
                  # The bucket entry is None when no master weights exist.
                  master_weight = self._master_weight_dict[key]
                  master_weight = (
                      master_weight[param_group_idx]
                      if master_weight is not None
                      else None
                  )
                  found_inf = self._get_auxiliary_var('found_inf')
                  # NOTE(review): indentation was lost in this copy of the
                  # file. The merged_adam_ call is placed under the `else`
                  # branch, so the update is skipped when found_inf is
                  # truthy -- confirm against the upstream source.
                  if found_inf:
                      if isinstance(
                          found_inf, (core.eager.Tensor, pir.Value)
                      ):
                          self._set_auxiliary_var('found_inf', True)
                  else:
                      if isinstance(
                          found_inf, (core.eager.Tensor, pir.Value)
                      ):
                          self._set_auxiliary_var('found_inf', False)
                      _, _, _, _, _, _ = _C_ops.merged_adam_(
                          self._param_dict[key][param_group_idx],
                          grad_dict[key],
                          lr_dict[key],
                          self._moment1_dict[key][param_group_idx],
                          self._moment2_dict[key][param_group_idx],
                          self._beta1_pow_acc_dict[key][param_group_idx],
                          self._beta2_pow_acc_dict[key][param_group_idx],
                          master_weight,
                          _beta1,
                          _beta2,
                          self._epsilon,
                          find_master,
                          False,
                      )
              else:
                  # Otherwise describe the merged op and append it.
                  inputs = {
                      "Param": self._param_dict[key][param_group_idx],
                      "Grad": grad_dict[key],
                      "LearningRate": lr_dict[key],
                      "Moment1": self._moment1_dict[key][param_group_idx],
                      "Moment2": self._moment2_dict[key][param_group_idx],
                      "Beta1Pow": self._beta1_pow_acc_dict[key][
                          param_group_idx
                      ],
                      "Beta2Pow": self._beta2_pow_acc_dict[key][
                          param_group_idx
                      ],
                  }
                  # Outputs alias the inputs: the op writes back into the
                  # same tensors it reads.
                  outputs = {
                      "ParamOut": self._param_dict[key][param_group_idx],
                      "Moment1Out": self._moment1_dict[key][param_group_idx],
                      "Moment2Out": self._moment2_dict[key][param_group_idx],
                      "Beta1PowOut": self._beta1_pow_acc_dict[key][
                          param_group_idx
                      ],
                      "Beta2PowOut": self._beta2_pow_acc_dict[key][
                          param_group_idx
                      ],
                  }
                  attrs = {
                      "epsilon": self._epsilon,
                      "beta1": _beta1,
                      "beta2": _beta2,
                  }
                  if find_master:
                      inputs["MasterParam"] = self._master_weight_dict[key][
                          param_group_idx
                      ]
                      outputs["MasterParamOut"] = self._master_weight_dict[
                          key
                      ][param_group_idx]
                      attrs["multi_precision"] = find_master
                  target_block.append_op(
                      type="merged_adam",
                      inputs=inputs,
                      outputs=outputs,
                      attrs=attrs,
                      stop_gradient=True,
                  )
  729. def _update_param_group(self, parameters):
  730. self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
  731. self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
  732. self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
  733. self._lazy_mode = parameters.get(
  734. 'lazy_mode', self._default_dict['lazy_mode']
  735. )
  736. parameters = parameters.get('params')
  737. return parameters