clip.py 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import copy
  15. import warnings
  16. from sqlite3 import NotSupportedError
  17. import paddle
  18. import paddle.autograd as imperative_base
  19. import paddle.distributed as dist
  20. from paddle import _C_ops
  21. from paddle.base import core, framework, unique_name
  22. from paddle.base.data_feeder import check_variable_and_dtype
  23. from paddle.base.libpaddle import DataType
  24. from paddle.common_ops_import import Variable, check_type, default_main_program
  25. from paddle.framework import (
  26. LayerHelper,
  27. in_dynamic_mode,
  28. in_dynamic_or_pir_mode,
  29. in_pir_mode,
  30. )
  31. __all__ = []
  32. def clip_by_norm(x, max_norm, name=None):
  33. r"""
  34. Limits the L2 norm of the input :math:`x` within :math:`max\_norm`.
  35. If the L2 norm of :math:`x` is less than or equal to :math:`max\_norm`, :math:`out` will be
  36. the same as :math:`x`. If the L2 norm of :math:`x` is greater than :math:`max\_norm`, :math:`x` will
  37. be linearly scaled to make the L2 norm of :math:`out` equal to :math:`max\_norm`, as
  38. shown in the following formula:
  39. .. math::
  40. out = \frac{max\_norm * x}{norm(x)}
  41. where :math:`norm(x)` represents the L2 norm of :math:`x`.
  42. Args:
  43. x(Tensor): The input of clip_by_norm and data type is float32.
  44. The number of dimensions must be between [1, 9].
  45. max_norm(float): The maximum norm value.
  46. name(str, optional): For detailed information, please refer
  47. to :ref:`api_guide_Name`. Usually name is no need to set and
  48. None by default.
  49. Returns:
  50. Tensor: The output of clip_by_norm with shape as input.
  51. The data type is float32.
  52. Examples:
  53. .. code-block:: python
  54. >>> import paddle
  55. >>> from paddle.nn import clip
  56. >>> input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
  57. >>> reward = clip.clip_by_norm(x=input, max_norm=1.0)
  58. >>> print(reward)
  59. Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
  60. [[0.50000000, 0.50000000],
  61. [0.50000000, 0.50000000]])
  62. """
  63. if in_dynamic_or_pir_mode():
  64. return _C_ops.clip_by_norm(x, max_norm)
  65. helper = LayerHelper("clip_by_norm", **locals())
  66. check_variable_and_dtype(
  67. x, 'X', ['float16', 'float32', 'uint16'], 'clip_by_norm'
  68. )
  69. check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
  70. if name is None:
  71. name = unique_name.generate_with_ignorable_key(
  72. ".".join([helper.name, 'tmp'])
  73. )
  74. out = helper.create_variable(
  75. type=x.type, name=name, dtype=x.dtype, persistable=False
  76. )
  77. helper.append_op(
  78. type="clip_by_norm",
  79. inputs={"X": x},
  80. attrs={"max_norm": max_norm},
  81. outputs={"Out": out},
  82. )
  83. return out
  84. def merge_selected_rows(x, name=None):
  85. """
  86. Merge by adding duplicated rows in the input SelectedRows object.
  87. Args:
  88. x(Tensor): The input selected rows to be merge.
  89. name(basestring|None): Name of the output.
  90. Returns:
  91. Tensor, merged output.
  92. Examples:
  93. .. code-block:: python
  94. >>> import paddle
  95. >>> import paddle.base as base
  96. >>> b = paddle.static.default_main_program().global_block()
  97. >>> var = b.create_var(
  98. ... name="X", dtype="float32", persistable=True,
  99. ... type=base.core.VarDesc.VarType.SELECTED_ROWS)
  100. >>> y = paddle.nn.clip.merge_selected_rows(var)
  101. """
  102. if in_dynamic_or_pir_mode():
  103. return _C_ops.merge_selected_rows(x)
  104. helper = LayerHelper("merge_selected_rows", **locals())
  105. out = helper.create_variable_for_type_inference(dtype=x.dtype)
  106. helper.append_op(
  107. type="merge_selected_rows",
  108. inputs={"X": x},
  109. attrs={},
  110. outputs={"Out": out},
  111. )
  112. return out
  113. def get_tensor_from_selected_rows(x, name=None):
  114. """
  115. Get tensor data from input with SelectedRows type, and outputs a Tensor.
  116. .. code-block:: text
  117. input x is SelectedRows:
  118. x.rows = [0, 5, 5, 4, 19]
  119. x.height = 20
  120. x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]
  121. Output is LoDTensor:
  122. out.shape = [5, 2]
  123. out.data = [[1, 1],
  124. [2, 2],
  125. [2, 2],
  126. [3, 3],
  127. [6, 6]]
  128. Args:
  129. x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
  130. name(str, optional): The default value is None. Normally there is no need for user to set this property.
  131. For more information, please refer to :ref:`api_guide_Name` .
  132. Returns:
  133. Variable: LoDTensor transformed from SelectedRows. The data type is same with input.
  134. Examples:
  135. .. code-block:: python
  136. >>> import paddle
  137. >>> import paddle.base as base
  138. >>> from paddle.base import core
  139. >>> paddle.enable_static()
  140. >>> scope = core.Scope()
  141. >>> block = paddle.static.default_main_program().global_block()
  142. >>> x_rows = [0, 5, 5, 4, 19]
  143. >>> height = 20
  144. >>> x = scope.var('X').get_selected_rows()
  145. >>> x.set_rows(x_rows)
  146. >>> x.set_height(height)
  147. >>> x = block.create_var(name="X", dtype="float32", persistable=True, type=base.core.VarDesc.VarType.SELECTED_ROWS)
  148. >>> z = paddle.nn.clip.get_tensor_from_selected_rows(x)
  149. """
  150. if in_pir_mode():
  151. return _C_ops.get_tensor_from_selected_rows(x)
  152. check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
  153. if x.type != core.VarDesc.VarType.SELECTED_ROWS:
  154. raise TypeError(
  155. "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
  156. )
  157. helper = LayerHelper('get_tensor_from_selected_rows', **locals())
  158. out = helper.create_variable_for_type_inference(dtype=x.dtype)
  159. helper.append_op(
  160. type='get_tensor_from_selected_rows',
  161. inputs={'X': x},
  162. outputs={'Out': out},
  163. attrs={},
  164. )
  165. return out
  166. _clip_by_global_norm_using_mp_type_flag = False
  167. def _clip_by_global_norm_using_mp_type(*args):
  168. global _clip_by_global_norm_using_mp_type_flag
  169. assert len(args) <= 1
  170. if len(args) == 1:
  171. assert isinstance(args[0], bool)
  172. old_value = _clip_by_global_norm_using_mp_type_flag
  173. _clip_by_global_norm_using_mp_type_flag = args[0]
  174. return old_value
  175. else:
  176. return _clip_by_global_norm_using_mp_type_flag
  177. def _cast_to_mp_type_if_enabled(x):
  178. if (
  179. x.dtype == core.VarDesc.VarType.FP16
  180. or x.dtype == core.VarDesc.VarType.BF16
  181. ) and _clip_by_global_norm_using_mp_type():
  182. return x.astype(core.VarDesc.VarType.FP32)
  183. elif (
  184. x.dtype == DataType.FLOAT16 or x.dtype == DataType.BFLOAT16
  185. ) and _clip_by_global_norm_using_mp_type():
  186. return x.astype(DataType.FP32)
  187. else:
  188. return x
  189. def _squared_l2_norm(x):
  190. r"""
  191. Return the squared L2 norm of a tensor.
  192. """
  193. x = _cast_to_mp_type_if_enabled(x)
  194. if in_dynamic_or_pir_mode():
  195. return _C_ops.squared_l2_norm(x)
  196. op_type = 'squared_l2_norm'
  197. check_variable_and_dtype(
  198. x, 'x', ['float32', 'float64', 'float16', 'uint16'], op_type
  199. )
  200. helper = LayerHelper(op_type, **locals())
  201. out = helper.create_variable_for_type_inference(x.dtype)
  202. inputs = {"X": x}
  203. outputs = {'Out': out}
  204. helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
  205. return out
  206. class BaseErrorClipAttr:
  207. def __str__(self):
  208. raise NotImplementedError()
  209. def _append_clip_op(self, block, grad_name):
  210. raise NotImplementedError()
  211. class ErrorClipByValue(BaseErrorClipAttr):
  212. r"""
  213. Clip tensor values to the range [min, max].
  214. Given a tensor ``t`` (see Examples below), this operation clips its value \
  215. to ``min`` and ``max`` inplace.
  216. - Any values less than min are set to min.
  217. - Any values greater than max are set to max.
  218. Args:
  219. max (float): The maximum value to clip by.
  220. min (float, optional): The minimum value to clip by. if not set by user, \
  221. will be set to ``-max`` by framework.
  222. Examples:
  223. .. code-block:: python
  224. >>> import paddle
  225. >>> paddle.enable_static()
  226. >>> BATCH_SIZE = 128
  227. >>> CLIP_MAX = 2e-6
  228. >>> CLIP_MIN = -1e-6
  229. >>> prog = paddle.static.Program()
  230. >>> with paddle.static.program_guard(main_program=prog):
  231. ... image = paddle.static.data(name='x', shape=[None, 784], dtype='float32')
  232. ... hidden1 = paddle.static.nn.fc(image, size=128, activation='relu')
  233. ... hidden2 = paddle.static.nn.fc(hidden1, size=64, activation='relu')
  234. ... predict = paddle.static.nn.fc(hidden2, size=10, activation='softmax')
  235. ... label = paddle.static.data(name='y', shape=[1], dtype='int64')
  236. ... cost = paddle.nn.functional.cross_entropy(input=predict, label=label)
  237. ... avg_cost = paddle.mean(cost)
  238. >>> prog_clip = prog.clone()
  239. >>> prog_clip.block(0).var(hidden1.name)._set_error_clip(
  240. ... paddle.nn.clip.ErrorClipByValue(
  241. ... max=CLIP_MAX, min=CLIP_MIN))
  242. """
  243. def __init__(self, max, min=None):
  244. max = float(max)
  245. if min is None:
  246. min = -max
  247. else:
  248. min = float(min)
  249. self.max = max
  250. self.min = min
  251. def __str__(self):
  252. return f"ByValue, min={self.min:f}, max={self.max:f}"
  253. def _append_clip_op(self, block, grad_name):
  254. clip_op_desc = block.desc.append_op()
  255. clip_op_desc.set_type("clip")
  256. clip_op_desc.set_input("X", [grad_name])
  257. clip_op_desc.set_output("Out", [grad_name])
  258. clip_op_desc._set_attr("min", self.min)
  259. clip_op_desc._set_attr("max", self.max)
  260. def error_clip_callback(block, context):
  261. # the context is a grad_to_var map
  262. grad_to_var = context
  263. op_desc = block.desc.op(block.desc.op_size() - 1)
  264. for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
  265. fwd_var = block._var_recursive(grad_to_var[grad_n])
  266. error_clip = getattr(fwd_var, "error_clip", None)
  267. if not (
  268. error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
  269. ):
  270. raise TypeError(
  271. "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
  272. )
  273. if error_clip is not None:
  274. error_clip._append_clip_op(block, grad_n)
  275. class ClipGradBase:
  276. def __init__(self):
  277. super().__init__()
  278. def __str__(self):
  279. raise NotImplementedError()
  280. @imperative_base.no_grad()
  281. def _dygraph_clip(self, params_grads):
  282. raise NotImplementedError
  283. def _pir_clip(self, params_grads):
  284. raise NotImplementedError
  285. def _static_clip(self, params_grads):
  286. raise NotImplementedError
  287. def __call__(self, params_grads):
  288. if in_dynamic_mode():
  289. return self._dygraph_clip(params_grads)
  290. elif in_pir_mode():
  291. return self._pir_clip(params_grads)
  292. else:
  293. for p, g in params_grads:
  294. if getattr(p, 'gradient_clip_attr', None) is not None:
  295. warnings.warn(
  296. "'set_gradient_clip' will be ineffective, because you have "
  297. "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
  298. "is redundant and you can remove it."
  299. )
  300. break
  301. return self._static_clip(params_grads)
  302. def _process_context(self, context, param, grad):
  303. raise NotImplementedError()
  304. def _create_operators(self, param, grad):
  305. raise NotImplementedError()
  306. class ClipGradByValue(ClipGradBase):
  307. """
  308. Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
  309. - Any values less than min are set to ``min``.
  310. - Any values greater than max are set to ``max``.
  311. The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
  312. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
  313. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
  314. (for example: :ref:`api_paddle_optimizer_SGD`).
  315. Note:
  316. ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
  317. Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
  318. Args:
  319. max (float): The maximum value to clip by.
  320. min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
  321. automatically. In this case, ``max`` must be greater than :math:`0`.
  322. Examples:
  323. .. code-block:: python
  324. >>> import paddle
  325. >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
  326. >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
  327. ... weight_attr=paddle.ParamAttr(need_clip=True),
  328. ... bias_attr=paddle.ParamAttr(need_clip=False))
  329. >>> out = linear(x)
  330. >>> loss = paddle.mean(out)
  331. >>> loss.backward()
  332. >>> clip = paddle.nn.ClipGradByValue(min=-1, max=1)
  333. >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
  334. >>> sdg.step()
  335. """
  336. def __init__(self, max, min=None):
  337. super().__init__()
  338. if min is None:
  339. assert max > 0.0
  340. min = -max
  341. self.max = float(max)
  342. self.min = float(min)
  343. def __str__(self):
  344. return f"Clip Gradient By Value, min = {self.min:f}, max={self.max:f}"
  345. @imperative_base.no_grad()
  346. def _dygraph_clip(self, params_grads):
  347. params_and_grads = []
  348. for p, g in params_grads:
  349. if g is None:
  350. continue
  351. if getattr(p, 'need_clip', True) is False:
  352. params_and_grads.append((p, g))
  353. continue
  354. new_grad = paddle.clip(x=g, min=self.min, max=self.max)
  355. params_and_grads.append((p, new_grad))
  356. return params_and_grads
  357. def _static_clip(self, params_grads):
  358. params_and_grads = []
  359. param_new_grad_name_dict = {}
  360. with framework.name_scope('gradient_clip'):
  361. for p, g in params_grads:
  362. if g is None:
  363. continue
  364. if getattr(p, 'need_clip', True) is False:
  365. params_and_grads.append((p, g))
  366. continue
  367. with p.block.program._optimized_guard([p, g]):
  368. new_grad = paddle.clip(x=g, min=self.min, max=self.max)
  369. params_and_grads.append((p, new_grad))
  370. param_new_grad_name_dict[p.name] = new_grad.name
  371. _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
  372. return params_and_grads
  373. def _process_context(self, context, param, grad):
  374. pass
  375. def _create_operators(self, param, grad):
  376. new_grad = paddle.clip(x=grad, min=self.min, max=self.max)
  377. return param, new_grad
  378. class ClipGradByNorm(ClipGradBase):
  379. r"""
  380. Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
  381. - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.
  382. - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
  383. The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
  384. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
  385. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
  386. (for example: :ref:`api_paddle_optimizer_SGD`).
  387. The clipping formula is:
  388. .. math::
  389. Out =
  390. \left\{
  391. \begin{array}{ccl}
  392. X & & if (norm(X) \leq clip\_norm) \\
  393. \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
  394. \end{array}
  395. \right.
  396. where :math:`norm(X)` represents the L2 norm of :math:`X`.
  397. .. math::
  398. norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}
  399. Note:
  400. ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
  401. Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
  402. Args:
  403. clip_norm(float): The maximum norm value.
  404. Examples:
  405. .. code-block:: python
  406. >>> import paddle
  407. >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
  408. >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
  409. ... weight_attr=paddle.ParamAttr(need_clip=True),
  410. ... bias_attr=paddle.ParamAttr(need_clip=False))
  411. >>> out = linear(x)
  412. >>> loss = paddle.mean(out)
  413. >>> loss.backward()
  414. >>> clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
  415. >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
  416. >>> sdg.step()
  417. """
  418. def __init__(self, clip_norm):
  419. super().__init__()
  420. self.clip_norm = float(clip_norm)
  421. def __str__(self):
  422. return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
  423. def _clip_gradients(self, params_grads):
  424. params_and_grads = []
  425. for p, g in params_grads:
  426. if g is None:
  427. continue
  428. if getattr(p, 'need_clip', True) is False:
  429. params_and_grads.append((p, g))
  430. continue
  431. new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
  432. params_and_grads.append((p, new_grad))
  433. return params_and_grads
  434. @imperative_base.no_grad()
  435. def _dygraph_clip(self, params_grads):
  436. return self._clip_gradients(params_grads)
  437. def _pir_clip(self, params_grads):
  438. return self._clip_gradients(params_grads)
  439. def _static_clip(self, params_grads):
  440. params_and_grads = []
  441. with framework.name_scope('gradient_clip'):
  442. param_new_grad_name_dict = {}
  443. for p, g in params_grads:
  444. if g is None:
  445. continue
  446. if getattr(p, 'need_clip', True) is False:
  447. params_and_grads.append((p, g))
  448. continue
  449. with p.block.program._optimized_guard([p, g]):
  450. new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
  451. param_new_grad_name_dict[p.name] = new_grad.name
  452. params_and_grads.append((p, new_grad))
  453. _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
  454. return params_and_grads
  455. def _process_context(self, context, param, grad):
  456. pass
  457. def _create_operators(self, param, grad):
  458. new_grad = clip_by_norm(x=grad, max_norm=self.clip_norm)
  459. return param, new_grad
  460. _allow_pure_fp16_global_norm_clip_flag = False
  461. def _allow_pure_fp16_global_norm_clip(*args):
  462. global _allow_pure_fp16_global_norm_clip_flag
  463. if len(args) == 0:
  464. return _allow_pure_fp16_global_norm_clip_flag
  465. else:
  466. assert len(args) == 1 and isinstance(args[0], bool)
  467. old_value = _allow_pure_fp16_global_norm_clip_flag
  468. _allow_pure_fp16_global_norm_clip_flag = args[0]
  469. return old_value
  470. _allow_pure_bf16_global_norm_clip_flag = False
  471. def _allow_pure_bf16_global_norm_clip(*args):
  472. global _allow_pure_bf16_global_norm_clip_flag
  473. if len(args) == 0:
  474. return _allow_pure_bf16_global_norm_clip_flag
  475. else:
  476. assert len(args) == 1 and isinstance(args[0], bool)
  477. old_value = _allow_pure_bf16_global_norm_clip_flag
  478. _allow_pure_bf16_global_norm_clip_flag = args[0]
  479. return old_value
  480. class ClipGradByGlobalNorm(ClipGradBase):
  481. r"""
  482. Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
  483. :math:`t\_list` , and limit it to ``clip_norm`` .
  484. - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.
  485. - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
  486. The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
  487. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
  488. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
  489. (for example: :ref:`api_paddle_optimizer_SGD`).
  490. The clipping formula is:
  491. .. math::
  492. t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}
  493. where:
  494. .. math::
  495. global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
  496. Note:
  497. ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
  498. Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
  499. Args:
  500. clip_norm (float): The maximum norm value.
  501. group_name (str, optional): The group name for this clip. Default value is ``default_group``.
  502. auto_skip_clip (bool, optional): skip clipping gradient. Default value is ``False``.
  503. Examples:
  504. .. code-block:: python
  505. >>> import paddle
  506. >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
  507. >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
  508. ... weight_attr=paddle.ParamAttr(need_clip=True),
  509. ... bias_attr=paddle.ParamAttr(need_clip=False))
  510. >>> out = linear(x)
  511. >>> loss = paddle.mean(out)
  512. >>> loss.backward()
  513. >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
  514. >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
  515. >>> sdg.step()
  516. """
  517. def __init__(
  518. self, clip_norm, group_name="default_group", auto_skip_clip=False
  519. ):
  520. super().__init__()
  521. self.clip_norm = float(clip_norm)
  522. self.group_name = group_name
  523. assert isinstance(auto_skip_clip, bool)
  524. self.auto_skip_clip = auto_skip_clip
  525. # TODO(zhiqiu): Now, in dygraph mode async_add_n is always used.
  526. # However, in static mode, it is only used in auto_parallel mode
  527. # by setting self._async_add_n to True. The reason is that there
  528. # are so many hard code depends on `add_n` in the legacy static
  529. # manual hybrid-parallel.
  530. self._async_add_n = None
  531. def __str__(self):
  532. return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        """Clip gradients by their global L2 norm in dynamic-graph mode.

        The method works in two passes over ``params_grads``:

        1. Collect the squared L2 norm of every gradient that takes part in
           clipping, bucketed by dtype (fp16/bf16, fp32, everything else),
           and reduce them to a single global norm.
        2. Multiply every participating gradient by
           ``clip_norm / max(global_norm, clip_norm)`` (or by
           ``clip_norm / global_norm`` when ``auto_skip_clip`` is set and the
           global norm exceeds ``clip_norm``).

        Args:
            params_grads: Sequence of ``(param, grad)`` pairs.

        Returns:
            A new list of ``(param, grad)`` pairs. Pairs whose gradient is
            ``None`` are dropped. If no gradient takes part in clipping, the
            input ``params_grads`` is returned unchanged.

        Raises:
            NotImplementedError: When the clip factor would have to be moved
                to a gradient whose process ids are a strict subset of the
                factor's, and the factor is not replicated on every placement.
        """
        params_and_grads = []
        # Squared norms whose dtype is neither fp16/bf16 nor fp32.
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        # The process mesh of the first parameter is the reference mesh onto
        # which squared norms from other meshes are resharded.
        if len(params_grads) > 0 and len(params_grads[0]) > 0:
            src_mesh = params_grads[0][0].process_mesh
        else:
            src_mesh = None

        # Pass 1: gather per-gradient squared norms.
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            # SelectedRows gradients are merged and converted to a dense
            # tensor before the norm is taken.
            if in_dynamic_mode() and g.is_selected_rows():
                merge_grad = merge_selected_rows(g)
                merge_grad = merge_grad._get_tensor_from_selected_rows()

            elif g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = merge_selected_rows(g)
                merge_grad = get_tensor_from_selected_rows(merge_grad)

            sum_square = _squared_l2_norm(merge_grad)

            # if the gradient mesh is not equal to src mesh
            # do reshard to get the result of squared_l2 from other pp stage mesh
            if src_mesh is not None and g.process_mesh != src_mesh:
                sum_square = dist.reshard(
                    sum_square, src_mesh, sum_square.placements
                )

            if (
                sum_square.dtype == paddle.float16
                or sum_square.dtype == paddle.bfloat16
            ):
                sum_square_list_fp16.append(sum_square)
            elif sum_square.dtype == paddle.float32:
                sum_square_list_fp32.append(sum_square)
            else:
                sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if (
            len(sum_square_list)
            + len(sum_square_list_fp16)
            + len(sum_square_list_fp32)
            == 0
        ):
            return params_grads

        def async_add_n(var_list):
            # Sum a list of tensors by stacking them and reducing.
            return paddle.stack(var_list).sum()

        # Accumulate in float64 as soon as any norm fell into the generic
        # bucket; otherwise float32 is used.
        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
        global_norm_var = []
        if len(sum_square_list_fp16) > 0:
            global_norm_var_fp16 = async_add_n(sum_square_list_fp16)
            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
        if len(sum_square_list_fp32) > 0:
            global_norm_var_fp32 = async_add_n(sum_square_list_fp32)
            if sum_dtype == 'float32':
                global_norm_var.append(global_norm_var_fp32)
            else:
                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
        if len(sum_square_list) > 0:
            global_norm_var_fp64 = async_add_n(sum_square_list)
            global_norm_var.append(global_norm_var_fp64)
        # Combine the per-dtype partial sums and take the square root.
        global_norm_var = async_add_n(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)
        max_global_norm = paddle.full(
            shape=[], dtype=sum_dtype, fill_value=self.clip_norm
        )

        need_clip = False
        if not self.auto_skip_clip:  # always apply clip
            need_clip = True
            clip_var = paddle.divide(
                x=max_global_norm,
                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
            )
        elif global_norm_var > max_global_norm:
            # only when global_norm_var > max_global_norm, grad need clip
            need_clip = True
            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)
        # NOTE: when neither branch runs, clip_var stays undefined; it is only
        # read below under `if need_clip`.

        # Pass 2: scale the gradients.
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            # TODO(wangxi): use inplace elementwise_mul
            if need_clip:
                # Match the clip factor's dtype to the gradient's.
                clip_input = (
                    clip_var.astype(g.dtype)
                    if clip_var.dtype != g.dtype
                    else clip_var
                )
                if clip_input.process_mesh != g.process_mesh:
                    # TODO(pkuzyc): refine the reshard function between local
                    # and global mesh to avoid the following "_local_tensor()"
                    # operation.
                    if set(g.process_mesh.process_ids) < set(
                        clip_input.process_mesh.process_ids
                    ):
                        # The gradient's process ids are a strict subset of
                        # the factor's: use the local value, which is only
                        # done when every placement is replicated.
                        placements = clip_input.placements
                        is_replicate = True
                        for placement in placements:
                            if not placement.is_replicated():
                                is_replicate = False
                                break
                        if is_replicate:
                            clip_input = clip_input._local_value()
                        else:
                            raise NotImplementedError(
                                "Reshard a sharded tensor from a local mesh to a global mesh is not supported"
                            )
                    else:
                        clip_input = paddle.distributed.reshard(
                            clip_input, g.process_mesh, clip_input.placements
                        )
                new_grad = paddle.multiply(g, clip_input)
                params_and_grads.append((p, new_grad))
            else:
                params_and_grads.append((p, g))

        return params_and_grads
  def _pir_clip(self, params_grads):
      """Scale gradients by their global L2 norm while building a PIR program.

      The squared L2 norm of every clippable gradient is collected into one
      of three buckets according to its dtype, the buckets are summed into a
      single global norm, and each clippable gradient is multiplied by a
      scale derived from ``self.clip_norm`` and that global norm.

      Args:
          params_grads: sequence of ``(param, grad)`` pairs. It is iterated
              twice. ``grad`` may be ``None``.

      Returns:
          ``params_grads`` itself when no gradient contributes to the norm
          (every pair has a ``None`` grad or a param whose ``need_clip``
          attribute is ``False``). Otherwise a new list of ``(param, grad)``
          tuples in which pairs with a ``None`` grad are dropped and params
          with ``need_clip`` set to ``False`` keep their original grad.
      """
      half_sq_norms = []  # FLOAT16 / BFLOAT16 squared norms
      single_sq_norms = []  # FLOAT32 squared norms
      other_sq_norms = []  # any other dtype; forces a float64 accumulator

      for param, grad in params_grads:
          if grad is None or getattr(param, 'need_clip', True) is False:
              continue
          dense_grad = grad
          if in_pir_mode() and grad.is_selected_row_type():
              dense_grad = get_tensor_from_selected_rows(
                  merge_selected_rows(grad)
              )
          sq_norm = _squared_l2_norm(dense_grad)
          norm_dtype = sq_norm.dtype
          if norm_dtype == DataType.FLOAT16 or norm_dtype == DataType.BFLOAT16:
              half_sq_norms.append(sq_norm)
          elif norm_dtype == DataType.FLOAT32:
              single_sq_norms.append(sq_norm)
          else:
              other_sq_norms.append(sq_norm)

      # All parameters have been filtered out: nothing to clip.
      if not (half_sq_norms or single_sq_norms or other_sq_norms):
          return params_grads

      accum_dtype = 'float64' if other_sq_norms else "float32"

      # Reduce each bucket on its own, then bring the partial sums to the
      # accumulator dtype before the final reduction.
      partial_sums = []
      if half_sq_norms:
          half_total = paddle.stack(half_sq_norms).sum()
          partial_sums.append(half_total.astype(accum_dtype))
      if single_sq_norms:
          single_total = paddle.stack(single_sq_norms).sum()
          if accum_dtype != 'float32':
              single_total = single_total.astype(accum_dtype)
          partial_sums.append(single_total)
      if other_sq_norms:
          partial_sums.append(paddle.stack(other_sq_norms).sum())

      global_norm = paddle.sqrt(paddle.stack(partial_sums).sum())
      norm_limit = paddle.full(
          shape=[], dtype=global_norm.dtype, fill_value=self.clip_norm
      )

      # ``scale`` stays None when clipping is skipped altogether.
      scale = None
      if not self.auto_skip_clip:
          # Always apply clip; the maximum keeps the factor at 1 whenever
          # the global norm does not exceed the limit.
          scale = paddle.divide(
              x=norm_limit,
              y=paddle.maximum(x=global_norm, y=norm_limit),
          )
      elif global_norm > norm_limit:
          # Only when the global norm exceeds the limit do grads need clip.
          scale = paddle.divide(x=norm_limit, y=global_norm)

      clipped = []
      for param, grad in params_grads:
          if grad is None:
              continue
          if getattr(param, 'need_clip', True) is False or scale is None:
              clipped.append((param, grad))
              continue
          # TODO(wangxi): use inplace elementwise_mul
          factor = scale
          if scale.dtype != grad.dtype:
              factor = scale.astype(grad.dtype)
          clipped.append((param, paddle.multiply(grad, factor)))

      return clipped
  def _static_clip(self, params_grads):
      """Append global-norm gradient clipping ops to a static-graph program.

      Squared L2 norms of all clippable gradients are summed per dtype
      bucket and combined into one global norm. Every clippable gradient is
      then scaled in place by ``clip_norm / max(clip_norm, global_norm)``.

      Args:
          params_grads: sequence of ``(param, grad)`` pairs. It is iterated
              twice. ``grad`` may be ``None``.

      Returns:
          ``params_grads`` itself when no gradient contributes to the norm.
          Otherwise a new list of ``(param, grad)`` tuples: pairs with a
          ``None`` grad are dropped, and params whose ``need_clip``
          attribute is ``False`` are passed through unscaled.

      Raises:
          NotSupportedError: if both FP16 and BF16 squared norms are present.
      """
      fp16_sq_norms = []
      bf16_sq_norms = []
      fp32_sq_norms = []
      other_sq_norms = []  # any other dtype; forces a float64 accumulator

      def _reduce_sum(tensors):
          # Sum a list of tensors with the strategy picked by _async_add_n.
          if self._async_add_n:
              return paddle.stack(tensors).sum()
          return paddle.add_n(tensors)

      with framework.name_scope('gradient_clip'):
          for p, g in params_grads:
              if g is None or getattr(p, 'need_clip', True) is False:
                  continue
              with p.block.program._optimized_guard([p, g]):
                  dense_grad = g
                  if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                      dense_grad = get_tensor_from_selected_rows(
                          merge_selected_rows(g)
                      )
                  sq_norm = _squared_l2_norm(dense_grad)
                  norm_dtype = sq_norm.dtype
                  if norm_dtype == core.VarDesc.VarType.FP16:
                      fp16_sq_norms.append(sq_norm)
                  elif norm_dtype == core.VarDesc.VarType.BF16:
                      bf16_sq_norms.append(sq_norm)
                  elif norm_dtype == core.VarDesc.VarType.FP32:
                      fp32_sq_norms.append(sq_norm)
                  else:
                      other_sq_norms.append(sq_norm)

          if fp16_sq_norms and bf16_sq_norms:
              raise NotSupportedError(
                  'FP16 and BF16 are not supported at the same time.'
              )

          # All parameters have been filtered out: nothing to clip.
          if not (
              other_sq_norms
              or fp16_sq_norms
              or bf16_sq_norms
              or fp32_sq_norms
          ):
              return params_grads

          # The guard below deliberately reuses ``p`` and ``g`` exactly as
          # the loop above left them (the last pair it iterated over).
          with p.block.program._optimized_guard([p, g]):
              accum_dtype = 'float64' if other_sq_norms else "float32"

              partial_sums = []
              if fp16_sq_norms:
                  fp16_total = _reduce_sum(fp16_sq_norms)
                  # Stay in fp16 only for a pure-fp16 norm that is
                  # explicitly allowed.
                  if (
                      fp32_sq_norms
                      or other_sq_norms
                      or not _allow_pure_fp16_global_norm_clip()
                  ):
                      fp16_total = fp16_total.astype(accum_dtype)
                  partial_sums.append(fp16_total)
              if bf16_sq_norms:
                  bf16_total = _reduce_sum(bf16_sq_norms)
                  # Same rule as above, for bf16.
                  if (
                      fp32_sq_norms
                      or other_sq_norms
                      or not _allow_pure_bf16_global_norm_clip()
                  ):
                      bf16_total = bf16_total.astype(accum_dtype)
                  partial_sums.append(bf16_total)
              if fp32_sq_norms:
                  fp32_total = _reduce_sum(fp32_sq_norms)
                  if accum_dtype != 'float32':
                      fp32_total = fp32_total.astype(accum_dtype)
                  partial_sums.append(fp32_total)
              if other_sq_norms:
                  # fp64
                  partial_sums.append(_reduce_sum(other_sq_norms))

              if len(partial_sums) > 1:
                  squared_norm = _reduce_sum(partial_sums)
              else:
                  squared_norm = partial_sums[0]
              global_norm = paddle.sqrt(x=squared_norm)
              norm_limit = paddle.full(
                  shape=[1],
                  dtype=global_norm.dtype,
                  fill_value=self.clip_norm,
              )
              scale_var = paddle.divide(
                  x=norm_limit,
                  y=paddle.maximum(x=norm_limit, y=global_norm),
              )

          grad_name_by_param = {}
          clipped = []
          for p, g in params_grads:
              if g is None:
                  continue
              if getattr(p, 'need_clip', True) is False:
                  clipped.append((p, g))
                  continue

              with p.block.program._optimized_guard([p, g]):
                  work_grad = _cast_to_mp_type_if_enabled(g)
                  # Match the scale's dtype to a half-precision gradient.
                  scale_input = scale_var
                  if (
                      work_grad.dtype == core.VarDesc.VarType.FP16
                      and scale_var.dtype != core.VarDesc.VarType.FP16
                  ):
                      scale_input = scale_var.astype('float16')
                  elif (
                      work_grad.dtype == core.VarDesc.VarType.BF16
                      and scale_var.dtype != core.VarDesc.VarType.BF16
                  ):
                      scale_input = scale_var.astype('bfloat16')

                  # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g
                  # will be in different blocks with the gradient clip related ops.
                  # We need to handle the correct block, otherwise will encounter
                  # a 'NotFoundError' during compile time.
                  block = default_main_program().current_block()
                  # In-place multiply: the op writes back into ``work_grad``.
                  block.append_op(
                      type='elementwise_mul',
                      inputs={'X': work_grad, 'Y': scale_input},
                      outputs={'Out': work_grad},
                  )
                  if work_grad is not g:
                      # The helper returned a different variable, so cast
                      # the scaled result back into ``g``.
                      block.append_op(
                          type='cast',
                          inputs={'X': work_grad},
                          outputs={'Out': g},
                          attrs={
                              'in_dtype': work_grad.dtype,
                              'out_dtype': g.dtype,
                          },
                      )

              grad_name_by_param[p.name] = g.name
              clipped.append((p, g))

      _correct_clip_op_role_var(clipped, grad_name_by_param)
      return clipped
  def _process_context(self, context, param, grad):
      """Record ``grad``'s squared L2 norm under this clip group in ``context``.

      The first gradient seen for a group creates three entries in
      ``context``: the list of squared norms (key ``group_name``), the plain
      ``clip_norm`` value (``group_name + "_clip_value"``) and a ``[1]``-shaped
      tensor filled with ``clip_norm`` (``group_name + "_clip"``). The dict is
      also stored on ``self.context``.

      Args:
          context: dict shared by all parameters being processed; mutated.
          param: unused by this method.
          grad: gradient whose squared L2 norm is appended to the group.

      Raises:
          ValueError: if the group already exists with a different
              ``clip_norm``.
      """
      group = self.group_name
      clip_value_key = group + "_clip_value"

      if group in context:
          # Every member of a group has to agree on clip_norm.
          if not self.clip_norm == context[clip_value_key]:
              raise ValueError(
                  "All parameters' 'clip_norm' of a same group should be the same"
              )
      else:
          context[group] = []
          context[clip_value_key] = self.clip_norm
          context[group + "_clip"] = paddle.full(
              shape=[1], dtype=grad.dtype, fill_value=self.clip_norm
          )

      # Selected-rows gradients are merged and converted to a dense tensor
      # before the norm is taken.
      is_selected_rows = grad.type == core.VarDesc.VarType.SELECTED_ROWS or (
          in_pir_mode() and grad.is_selected_row_type()
      )
      dense_grad = grad
      if is_selected_rows:
          dense_grad = get_tensor_from_selected_rows(merge_selected_rows(grad))

      context[group].append(_squared_l2_norm(dense_grad))
      self.context = context
  def _create_operators(self, param, grad):
      """Scale ``grad`` by the shared scale factor of this clip group.

      On the first call for a group (no ``group_name + "_scale"`` entry in
      ``self.context`` yet) the scale is computed as
      ``clip / max(clip, sqrt(sum of the recorded squared norms))`` and
      cached in ``self.context``.

      Args:
          param: parameter owning ``grad``; outside PIR mode its block
              receives the multiply op.
          grad: gradient to scale.

      Returns:
          ``(param, grad)``. In PIR mode ``grad`` is the result of
          ``paddle.multiply``; otherwise it is the input gradient, which an
          appended ``elementwise_mul`` op overwrites in place.
      """
      ctx = self.context
      scale_key = self.group_name + "_scale"

      if scale_key not in ctx:
          squared_total = paddle.stack(ctx[self.group_name]).sum()
          group_norm = paddle.sqrt(x=squared_total)
          limit = ctx[self.group_name + "_clip"]
          group_scale = paddle.divide(
              x=limit,
              y=paddle.maximum(x=limit, y=group_norm),
          )
          assert group_scale.shape == (1,)
          ctx[scale_key] = group_scale

      if in_pir_mode():
          return param, paddle.multiply(grad, ctx[scale_key])

      # inplace
      param.block.append_op(
          type='elementwise_mul',
          inputs={'X': grad, 'Y': ctx[scale_key]},
          outputs={'Out': grad},
      )
      return param, grad
  @framework.dygraph_not_support
  def set_gradient_clip(clip, param_list=None, program=None):
      """
      Warning:

          This API must be used after building network, and before ``minimize`` ,
          and it may be removed in future releases, so it is not recommended.
          It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
          this is a better method to clip gradient. There are three clipping strategies:
          :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
          :ref:`api_paddle_nn_ClipGradByValue` .

      Attach a gradient clipping strategy to the selected parameters by storing
      a deep copy of ``clip`` on each of them as ``gradient_clip_attr``.

      Args:
          clip (GradientClipBase): Gradient clipping strategy, an instance of
              some derived class of ``GradientClipBase`` . There are three clipping strategies
              ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
              :ref:`api_paddle_nn_ClipGradByValue` ). This argument is required.
          param_list (list(Variable), optional): Parameters that require gradient clip.
              It can be a list of parameter or a list of parameter's name.
              Default None, meaning that all parameters in the program will be included.
          program (Program, optional): The program where parameters are located.
              Default None, meaning that using :ref:`api_paddle_static_default_main_program` .

      Returns:
          None

      Raises:
          TypeError: If ``clip`` is not an instance of a ``ClipGradBase`` subclass,
              or if ``param_list`` is neither all names nor all parameters.

      Examples:
          .. code-block:: python

              >>> import paddle

              >>> paddle.enable_static()

              >>> def network():
              ...     image = paddle.static.data(name='image', shape=[
              ...                        None, 28], dtype='float32')
              ...     param_attr1 = paddle.ParamAttr("fc1_param")
              ...     fc1 = paddle.static.nn.fc(image, size=10, weight_attr=param_attr1)
              ...     param_attr2 = paddle.ParamAttr("fc2_param")
              ...     fc2 = paddle.static.nn.fc(fc1, size=10, weight_attr=param_attr2)
              ...     loss = paddle.mean(fc2)
              ...     return loss

              >>> # network 1: clip all parameter gradient
              >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
              ...     loss = network()
              ...     paddle.nn.clip.set_gradient_clip(
              ...         paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0))
              ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
              ...     sgd.minimize(loss)

              >>> # network 2: clip parameter gradient by name
              >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
              ...     loss = network()
              ...     paddle.nn.clip.set_gradient_clip(
              ...         paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
              ...         param_list=["fc1_param", "fc2_param"])
              ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
              ...     sgd.minimize(loss)

              >>> # network 3: clip parameter gradient by value
              >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
              ...     loss = network()
              ...     param_var1 = paddle.static.default_main_program().global_block().var("fc1_param")
              ...     param_var2 = paddle.static.default_main_program().global_block().var("fc2_param")
              ...     paddle.nn.clip.set_gradient_clip(
              ...         paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
              ...         param_list=[param_var1, param_var2])
              ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
              ...     sgd.minimize(loss)

              >>> # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
              >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
              ...     loss = network()
              ...     clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0)
              ...     clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0)
              ...     # Set the gradient clipping strategy: clip1
              ...     paddle.nn.clip.set_gradient_clip(clip1)
              ...     # Set the gradient clipping strategy: clip2
              ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
              ...     sgd.minimize(loss)
              ...     # 'set_gradient_clip' will not take effect when setting has a conflict,
              ...     # and the gradient clipping strategy will be 'clip2'

      """
      warnings.warn(
          "Caution! 'set_gradient_clip' is not recommended "
          "and may be deprecated in future! "
          "We recommend a new strategy: set 'grad_clip' "
          "when initializing the 'optimizer'. "
          "This method can reduce the mistakes, please "
          "refer to documention of 'optimizer'."
      )

      if not isinstance(clip, ClipGradBase):
          raise TypeError(
              "'clip' should be an instance of ClipGradBase's derived class"
          )

      if program is None:
          program = framework.default_main_program()

      # An op created under an "optimizer" name scope means ``minimize`` has
      # already run, in which case the strategy set here comes too late.
      minimize_already_ran = any(
          'op_namescope' in op.all_attrs()
          and "optimizer" in op.attr("op_namescope")
          for op in program.block(0).ops
      )
      if minimize_already_ran:
          warnings.warn(
              "'minimize' has been invoked before, this will make 'set_gradient_clip' "
              "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
          )

      if param_list is None:
          param_list = program.block(0).all_parameters()
      # A list made only of names is resolved to the variables of block 0.
      if all(isinstance(item, str) for item in param_list):
          param_list = [program.block(0).var(name) for name in param_list]
      for item in param_list:
          if not isinstance(item, framework.Parameter):
              raise TypeError(
                  "'param_list' should be a list of Parameter or basestring(parameter's name)."
              )

      # Each parameter receives its own independent copy of the strategy.
      for param in param_list:
          param.gradient_clip_attr = copy.deepcopy(clip)
  def append_gradient_clip_ops(param_grads):
      """Apply each parameter's ``gradient_clip_attr`` strategy to its gradient.

      Works in two passes over ``param_grads``. The first pass lets every
      strategy accumulate what it needs into a shared ``context`` dict via
      ``_process_context``. The second pass calls ``_create_operators`` to
      produce the clipped gradient of every parameter.

      Args:
          param_grads: sequence of ``(param, grad)`` pairs. It is iterated
              twice. Pairs whose grad is ``None`` are skipped.

      Returns:
          ``param_grads`` unchanged as soon as a parameter with a non-``None``
          grad has no ``gradient_clip_attr`` (or it is ``None``). Otherwise a
          new list of ``[param, new_grad]`` lists for the pairs with a grad.

      Raises:
          TypeError: if a ``gradient_clip_attr`` is not a ``ClipGradBase``
              instance.
      """
      context = {}
      for p, g in param_grads:
          if g is None:
              continue
          with p.block.program._optimized_guard([p, g]), framework.name_scope(
              'gradient_clip'
          ):
              clip_attr = getattr(p, 'gradient_clip_attr', None)
              if clip_attr is None:
                  return param_grads
              if not isinstance(clip_attr, ClipGradBase):
                  raise TypeError(
                      "clip attribute should be an instance of GradientClipBase"
                  )

              clip_attr._process_context(context=context, param=p, grad=g)

      res = []
      param_new_grad_name_dict = {}
      for p, g in param_grads:
          if g is None:
              continue
          with p.block.program._optimized_guard([p, g]), framework.name_scope(
              'gradient_clip'
          ):
              # Look the strategy up again for this parameter. Reusing the
              # variable left over from the first loop would clip every
              # gradient with the last parameter's strategy, which is wrong
              # when parameters carry different strategies. The first pass
              # has already validated the attribute on every such parameter.
              clip_attr = p.gradient_clip_attr
              param, new_grad = clip_attr._create_operators(param=p, grad=g)
              param_new_grad_name_dict[param.name] = new_grad.name
              res.append([param, new_grad])

      _correct_clip_op_role_var(res, param_new_grad_name_dict)
      return res
  # Fix the wrong param/grad mapping stored on gradient-clip ops.
  # Note: the time cost of networks that use gradient clipping is sensitive to
  # this function, so it should not be changed lightly. If you must change it,
  # measure the time cost afterwards.
  def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
      """Rewrite ``op_role_var`` on clip ops to point at the clipped gradient.

      For each distinct ``param.block.idx`` met in ``params_grads``, the ops of
      the program's global block are scanned once. An op is updated when it
      has an ``op_namescope`` attribute containing ``"gradient_clip"`` , a
      non-empty ``op_role_var`` attribute, and the first entry of that
      attribute is a key of ``param_new_grad_name_dict``.

      Args:
          params_grads: iterable of ``(param, grad)`` pairs; pairs with a
              ``None`` grad are ignored.
          param_new_grad_name_dict: maps a parameter name to the name of its
              gradient after clipping. Nothing is done when it is empty.

      Returns:
          None. Matching ops are modified through ``_set_attr``.
      """
      if not param_new_grad_name_dict:
          return

      visited_block_ids = set()
      for param, grad in params_grads:
          if grad is None:
              continue
          block_idx = param.block.idx
          if block_idx in visited_block_ids:
              continue
          visited_block_ids.add(block_idx)

          for op in param.block.program.global_block().ops:
              if not op.has_attr("op_namescope"):
                  continue
              if "gradient_clip" not in op.attr("op_namescope"):
                  continue
              role_var = op.attr('op_role_var')
              if not role_var:
                  continue
              param_name = role_var[0]
              if param_name in param_new_grad_name_dict:
                  op._set_attr(
                      'op_role_var',
                      [param_name, param_new_grad_name_dict[param_name]],
                  )
  # Aliases that expose the ClipGrad* classes under GradientClip* names as well.
  GradientClipByGlobalNorm = ClipGradByGlobalNorm
  GradientClipByNorm = ClipGradByNorm
  GradientClipByValue = ClipGradByValue
  GradientClipBase = ClipGradBase