# lr.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. import math
  15. import warnings
  16. import numpy
  17. import paddle
  18. from paddle import Tensor
  19. from paddle.base import core
  20. from paddle.base.data_feeder import check_type
  21. from paddle.base.framework import (
  22. Variable,
  23. default_main_program,
  24. in_dygraph_mode,
  25. )
  26. from paddle.base.layer_helper import LayerHelper
  27. __all__ = [
  28. 'LRScheduler',
  29. 'NoamDecay',
  30. 'PiecewiseDecay',
  31. 'NaturalExpDecay',
  32. 'InverseTimeDecay',
  33. 'PolynomialDecay',
  34. 'LinearWarmup',
  35. 'ExponentialDecay',
  36. 'MultiStepDecay',
  37. 'StepDecay',
  38. 'LambdaDecay',
  39. 'ReduceOnPlateau',
  40. 'CosineAnnealingDecay',
  41. 'MultiplicativeDecay',
  42. 'OneCycleLR',
  43. 'CyclicLR',
  44. 'LinearLR',
  45. 'CosineAnnealingWarmRestarts',
  46. ]
  47. class LRScheduler:
  48. """
  49. LRScheduler Base class. Define the common interface of a learning rate scheduler.
  50. There are currently 17 strategies implemented in paddle based on this base class, which are:
  51. - ``NoamDecay``: Related algorithms are derived from `*Attention Is All You Need* <http://blog.inkypy.com>`_ . Please refer to :ref:`api_paddle_optimizer_lr_NoamDecay`.
  52. - ``ExponentialDecay``: The next learning rate is obtained by multiplying the current learning rate by a given decay rate. Please refer to :ref:`api_paddle_optimizer_lr_ExponentialDecay`.
  53. - ``NaturalExpDecay``: Each time the current learning rate is multiplied by the natural index of the given decay rate to obtain the next learning rate. Please refer to :ref:`api_paddle_optimizer_lr_NaturalExpDecay`.
  54. - ``InverseTimeDecay``: The resulting learning rate is inversely proportional to the current number of decays. Please refer to :ref:`api_paddle_optimizer_lr_InverseTimeDecay`.
  55. - ``PolynomialDecay``: The resulting learning rate is the interpolation of the score points between the initial learning rate and the given final learning determined by polynomial computation weights. Please refer to :ref:`api_paddle_optimizer_lr_PolynomialDecay`.
  56. - ``PiecewiseDecay``: Segments decay in a step-like fashion by a given number of steps, and each segment has the same learning rate. Please refer to :ref:`api_paddle_optimizer_lr_PiecewiseDecay`.
  57. - ``CosineAnnealingDecay``: The learning rate varies periodically with the number of steps as a cosine function. Please refer to :ref:`api_paddle_optimizer_lr_CosineAnnealingDecay`.
  58. - ``LinearWarmup``: The learning rate increases linearly with the number of steps to the specified learning rate. Please refer to :ref:`api_paddle_optimizer_lr_LinearWarmup`.
  59. - ``StepDecay``: The learning rate decays every fixed interval number of steps, and the number of step intervals needs to be specified. Please refer to :ref:`api_paddle_optimizer_lr_StepDecay`.
  60. - ``MultiStepDecay``: The learning rate decays at a specific number of steps, and the node location at which the decay occurs needs to be specified. Please refer to :ref:`api_paddle_optimizer_lr_MultiStepDecay`.
  61. - ``LambdaDecay``: The learning rate decays according to a custom lambda function. Please refer to :ref:`api_paddle_optimizer_lr_LambdaDecay`.
  62. - ``ReduceOnPlateau``: The learning rate is adaptively adjusted according to the current metric (typically loss), and the learning rate is attenuated when the loss becomes stable. Please refer to :ref:`api_paddle_optimizer_lr_ReduceOnPlateau`.
  63. - ``MultiplicativeDecay``: The resulting learning rate is obtained by multiplying the current learning rate each time by a lambda function. Please refer to :ref:`api_paddle_optimizer_lr_MultiplicativeDecay`.
  64. - ``OneCycleLR``: The learning rate goes up to the maximum and then down to the minimum. Please refer to :ref:`api_paddle_optimizer_lr_OneCycleLR`.
  65. - ``CyclicLR``: Think of the process of learning rate change as a cycle, with the learning rate changing between the minimum and maximum learning rates according to a fixed frequency. Please refer to :ref:`api_paddle_optimizer_lr_CyclicLR`.
  66. - ``LinearLR``: The learning rate increases linearly with the number of steps to the specified learning rate. Please refer to :ref:`api_paddle_optimizer_lr_LinearLR`.
  67. - ``CosineAnnealingWarmRestarts``: The learning rate varies periodically with the number of steps as a cosine function. Please refer to :ref:`api_paddle_optimizer_lr_CosineAnnealingWarmRestarts`.
  68. User can import it by ``from paddle.optimizer.lr import LRScheduler`` ,
  69. then overload it for your subclass and have a custom implementation of ``get_lr()`` .
  70. Otherwise, an ``NotImplementedError`` exception will be thrown.
  71. Args:
  72. learning_rate (float): The initial learning rate. It is a python float number.
  73. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  74. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  75. Returns:
  76. instance to schedule learning rate.
  77. Examples:
  78. Here is an example of a simple ``StepDecay`` implementation.
  79. .. code-block:: python
  80. >>> import paddle
  81. >>> from paddle.optimizer.lr import LRScheduler
  82. >>> class StepDecay(LRScheduler):
  83. ... def __init__(self,
  84. ... learning_rate,
  85. ... step_size,
  86. ... gamma=0.1,
  87. ... last_epoch=-1,
  88. ... verbose=False):
  89. ... if not isinstance(step_size, int):
  90. ... raise TypeError(
  91. ... "The type of 'step_size' must be 'int', but received %s." %
  92. ... type(step_size))
  93. ... if gamma >= 1.0:
  94. ... raise ValueError('gamma should be < 1.0.')
  95. ...
  96. ... self.step_size = step_size
  97. ... self.gamma = gamma
  98. ... super().__init__(learning_rate, last_epoch, verbose)
  99. ...
  100. ... def get_lr(self):
  101. ... i = self.last_epoch // self.step_size
  102. ... return self.base_lr * (self.gamma**i)
  103. ...
  104. """
  105. def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
  106. if not isinstance(learning_rate, (float, int)):
  107. raise TypeError(
  108. f"The type of learning rate must be float, but received {type(learning_rate)}"
  109. )
  110. if learning_rate < 0:
  111. raise ValueError(f"Invalid learning rate: {learning_rate}")
  112. self.base_lr = float(learning_rate)
  113. self.last_lr = float(learning_rate)
  114. self.last_epoch = last_epoch
  115. self.verbose = verbose
  116. self._var_name = None
  117. self.step()
  118. def __call__(self):
  119. """
  120. Return latest computed learning rate on current epoch.
  121. """
  122. return self.last_lr
  123. def step(self, epoch=None):
  124. """
  125. ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .
  126. The new learning rate will take effect on next ``optimizer.step`` .
  127. Args:
  128. epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
  129. Returns:
  130. None
  131. Examples:
  132. .. code-block:: python
  133. >>> import paddle
  134. >>> value = paddle.arange(26, dtype='float32')
  135. >>> a = paddle.reshape(value, [2, 13])
  136. >>> linear = paddle.nn.Linear(13, 5)
  137. >>> adadelta = paddle.optimizer.Adadelta(learning_rate=0.0003, epsilon=1e-06, rho=0.95,
  138. ... parameters = linear.parameters())
  139. >>> out = linear(a)
  140. >>> out.backward()
  141. >>> adadelta.step()
  142. >>> adadelta.clear_grad()
  143. .. code-block:: python
  144. >>> import paddle
  145. >>> value = paddle.arange(26, dtype='float32')
  146. >>> a = paddle.reshape(value, [2, 13])
  147. >>> linear = paddle.nn.Linear(13, 5)
  148. >>> adadelta = paddle.optimizer.Adadelta(learning_rate=0.0003, epsilon=1e-06, rho=0.95,
  149. ... parameters = linear.parameters())
  150. >>> out = linear(a)
  151. >>> out.backward()
  152. >>> adadelta.step()
  153. >>> adadelta.clear_grad()
  154. """
  155. if epoch is None:
  156. self.last_epoch += 1
  157. self.last_lr = self.get_lr()
  158. else:
  159. self.last_epoch = epoch
  160. if hasattr(self, "_get_closed_form_lr"):
  161. self.last_lr = self._get_closed_form_lr()
  162. else:
  163. self.last_lr = self.get_lr()
  164. if self.verbose:
  165. print(
  166. f'Epoch {self.last_epoch}: {self.__class__.__name__} set learning rate to {self.last_lr}.'
  167. )
  168. def state_dict(self):
  169. """
  170. Returns the state of the scheduler as a :class:`dict`.
  171. It is a subset of ``self.__dict__`` .
  172. """
  173. self.state_keys()
  174. state_dict = {}
  175. for key in self.keys:
  176. if key not in self.__dict__:
  177. continue
  178. value = self.__dict__[key]
  179. if isinstance(value, Tensor):
  180. assert (
  181. value.size == 1
  182. ), "numel of Tensor in state_dict must be 1"
  183. value = float(value)
  184. state_dict[key] = value
  185. return state_dict
  186. # For those subclass who overload LRScheduler, "last_epoch, last_lr" will be saved by default.
  187. # (Note): you can change it for your subclass.
  188. def state_keys(self):
  189. """
  190. For those subclass who overload ``LRScheduler`` (Base Class). Acquiescently, "last_epoch, last_lr" will be saved by ``self.keys = ['last_epoch', 'last_lr']`` .
  191. ``last_epoch`` is the current epoch num, and ``last_lr`` is the current learning rate.
  192. If you want to change the default behavior, you should have a custom implementation of ``_state_keys()`` to redefine ``self.keys`` .
  193. """
  194. self.keys = ['last_epoch', 'last_lr']
  195. def set_state_dict(self, state_dict):
  196. """
  197. Loads the schedulers state.
  198. """
  199. self.state_keys()
  200. for key in self.keys:
  201. if key in state_dict:
  202. self.__dict__[key] = state_dict[key]
  203. else:
  204. raise RuntimeError(
  205. f"Please check whether state_dict is correct for optimizer. Can't find [ {key} ] in state_dict"
  206. )
  207. if len(state_dict) > len(self.keys):
  208. warnings.warn(
  209. "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
  210. )
  211. # alias for set_state_dict
  212. set_dict = set_state_dict
  213. def get_lr(self):
  214. """
  215. For those subclass who overload ``LRScheduler`` (Base Class), User should have a custom implementation of ``get_lr()`` .
  216. Otherwise, an ``NotImplementedError`` exception will be thrown.
  217. """
  218. # calculate by python float
  219. raise NotImplementedError
  220. class NoamDecay(LRScheduler):
  221. r"""
  222. Applies Noam Decay to the initial learning rate.
  223. The algorithm can be described as following.
  224. .. math::
  225. new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5})
  226. Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_
  227. Args:
  228. d$_{model}$(int): The dimensionality of input and output feature vector of model. It is a python int number.
  229. warmup_steps(int): The number of warmup steps. A super parameter. It is a python int number
  230. learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0.
  231. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  232. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  233. Returns:
  234. ``NoamDecay`` instance to schedule learning rate.
  235. Examples:
  236. .. code-block:: python
  237. :name: code-example1
  238. >>> # Example1: train on default dynamic graph mode
  239. >>> import paddle
  240. >>> import numpy as np
  241. >>> # train on default dynamic graph mode
  242. >>> linear = paddle.nn.Linear(10, 10)
  243. >>> scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
  244. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  245. >>> for epoch in range(20):
  246. ... for batch_id in range(5):
  247. ... x = paddle.uniform([10, 10])
  248. ... out = linear(x)
  249. ... loss = paddle.mean(out)
  250. ... loss.backward()
  251. ... sgd.step()
  252. ... sgd.clear_gradients()
  253. ... scheduler.step() # If you update learning rate each step
  254. ... # scheduler.step() # If you update learning rate each epoch
  255. .. code-block:: python
  256. :name: code-example2
  257. >>> # Example2: train on static graph mode
  258. >>> import paddle
  259. >>> import numpy as np
  260. >>> paddle.enable_static()
  261. >>> main_prog = paddle.static.Program()
  262. >>> start_prog = paddle.static.Program()
  263. >>> with paddle.static.program_guard(main_prog, start_prog):
  264. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  265. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  266. ... z = paddle.static.nn.fc(x, 100)
  267. ... loss = paddle.mean(z)
  268. ... scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
  269. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  270. ... sgd.minimize(loss)
  271. ...
  272. >>> exe = paddle.static.Executor()
  273. >>> exe.run(start_prog)
  274. >>> for epoch in range(20):
  275. ... for batch_id in range(5):
  276. ... out = exe.run(
  277. ... main_prog,
  278. ... feed={
  279. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  280. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  281. ... },
  282. ... fetch_list=loss.name)
  283. ... scheduler.step() # If you update learning rate each step
  284. ... # scheduler.step() # If you update learning rate each epoch
  285. ...
  286. """
  287. def __init__(
  288. self,
  289. d_model,
  290. warmup_steps,
  291. learning_rate=1.0,
  292. last_epoch=-1,
  293. verbose=False,
  294. ):
  295. if d_model <= 0:
  296. raise ValueError("d_model should be grater than 0")
  297. self.d_model = d_model
  298. self.warmup_steps = warmup_steps
  299. super().__init__(learning_rate, last_epoch, verbose)
  300. def get_lr(self):
  301. if self.last_epoch == 0:
  302. a = 1
  303. else:
  304. a = self.last_epoch**-0.5
  305. b = self.warmup_steps**-1.5 * self.last_epoch
  306. return self.base_lr * (self.d_model**-0.5) * min(a, b)
  307. class PiecewiseDecay(LRScheduler):
  308. """
  309. Piecewise learning rate scheduler.
  310. The algorithm can be described as the code below:
  311. .. code-block:: text
  312. boundaries = [100, 200]
  313. values = [1.0, 0.5, 0.1]
  314. if epoch < 100:
  315. learning_rate = 1.0
  316. elif 100 <= global_step < 200:
  317. learning_rate = 0.5
  318. else:
  319. learning_rate = 0.1
  320. Args:
  321. boundaries(list|tuple): A list/tuple of steps numbers. The type of element in the list is python int.
  322. values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries.
  323. The type of element in the list is python float. The ``values`` have one more element than ``boundaries``.
  324. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  325. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  326. Returns:
  327. ``PiecewiseDecay`` instance to schedule learning rate.
  328. Examples:
  329. .. code-block:: python
  330. :name: code-example1
  331. >>> # Example1: train on default dynamic graph mode
  332. >>> import paddle
  333. >>> import numpy as np
  334. >>> # train on default dynamic graph mode
  335. >>> linear = paddle.nn.Linear(10, 10)
  336. >>> scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
  337. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  338. >>> for epoch in range(20):
  339. ... for batch_id in range(5):
  340. ... x = paddle.uniform([10, 10])
  341. ... out = linear(x)
  342. ... loss = paddle.mean(out)
  343. ... loss.backward()
  344. ... sgd.step()
  345. ... sgd.clear_gradients()
  346. ... scheduler.step() # If you update learning rate each step
  347. ... # scheduler.step() # If you update learning rate each epoch
  348. .. code-block:: python
  349. :name: code-example2
  350. >>> # Example2: train on static graph mode
  351. >>> import paddle
  352. >>> import numpy as np
  353. >>> paddle.enable_static()
  354. >>> main_prog = paddle.static.Program()
  355. >>> start_prog = paddle.static.Program()
  356. >>> with paddle.static.program_guard(main_prog, start_prog):
  357. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  358. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  359. ... z = paddle.static.nn.fc(x, 100)
  360. ... loss = paddle.mean(z)
  361. ... scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
  362. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  363. ... sgd.minimize(loss)
  364. ...
  365. >>> exe = paddle.static.Executor()
  366. >>> exe.run(start_prog)
  367. >>> for epoch in range(20):
  368. ... for batch_id in range(5):
  369. ... out = exe.run(
  370. ... main_prog,
  371. ... feed={
  372. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  373. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  374. ... },
  375. ... fetch_list=loss.name)
  376. ... scheduler.step() # If you update learning rate each step
  377. ... # scheduler.step() # If you update learning rate each epoch
  378. """
  379. def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
  380. if len(boundaries) == 0:
  381. raise ValueError('The boundaries cannot be empty.')
  382. if len(values) <= len(boundaries):
  383. raise ValueError(
  384. f'The values have one more element than boundaries, but received len(values) [{len(values)}] < len(boundaries) + 1 [{len(boundaries) + 1}].'
  385. )
  386. self.boundaries = boundaries
  387. self.values = values
  388. super().__init__(last_epoch=last_epoch, verbose=verbose)
  389. def get_lr(self):
  390. for i in range(len(self.boundaries)):
  391. if self.last_epoch < self.boundaries[i]:
  392. return self.values[i]
  393. return self.values[len(self.values) - 1]
  394. class NaturalExpDecay(LRScheduler):
  395. r"""
  396. Applies natural exponential decay to the initial learning rate.
  397. The algorithm can be described as following:
  398. .. math::
  399. new\_learning\_rate = learning\_rate * e^{- gamma * epoch}
  400. Args:
  401. learning_rate (float): The initial learning rate. It is a python float number.
  402. gamma (float, optional): A Ratio to update the learning rate, should greater than 0.0 to make learning rate decay. Default: 0.1.
  403. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  404. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  405. Returns:
  406. ``NaturalExpDecay`` instance to schedule learning rate.
  407. Examples:
  408. .. code-block:: python
  409. :name: code-example1
  410. >>> # Example1: train on default dynamic graph mode
  411. >>> import paddle
  412. >>> import numpy as np
  413. >>> linear = paddle.nn.Linear(10, 10)
  414. >>> scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
  415. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  416. >>> for epoch in range(20):
  417. ... for batch_id in range(5):
  418. ... x = paddle.uniform([10, 10])
  419. ... out = linear(x)
  420. ... loss = paddle.mean(out)
  421. ... loss.backward()
  422. ... sgd.step()
  423. ... sgd.clear_gradients()
  424. ... scheduler.step() # If you update learning rate each step
  425. ... # scheduler.step() # If you update learning rate each epoch
  426. .. code-block:: python
  427. :name: code-example2
  428. >>> # Example2: train on static graph mode
  429. >>> import paddle
  430. >>> import numpy as np
  431. >>> paddle.enable_static()
  432. >>> main_prog = paddle.static.Program()
  433. >>> start_prog = paddle.static.Program()
  434. >>> with paddle.static.program_guard(main_prog, start_prog):
  435. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  436. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  437. ... z = paddle.static.nn.fc(x, 100)
  438. ... loss = paddle.mean(z)
  439. ... scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
  440. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  441. ... sgd.minimize(loss)
  442. ...
  443. >>> exe = paddle.static.Executor()
  444. >>> exe.run(start_prog)
  445. >>> for epoch in range(20):
  446. ... for batch_id in range(5):
  447. ... out = exe.run(
  448. ... main_prog,
  449. ... feed={
  450. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  451. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  452. ... },
  453. ... fetch_list=loss.name)
  454. ... scheduler.step() # If you update learning rate each step
  455. ... # scheduler.step() # If you update learning rate each epoch
  456. """
  457. def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
  458. assert (
  459. gamma > 0.0
  460. ), " 'gamma' must be a positive number so that the learning rate will decay."
  461. self.gamma = gamma
  462. super().__init__(learning_rate, last_epoch, verbose)
  463. def get_lr(self):
  464. return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)
  465. class InverseTimeDecay(LRScheduler):
  466. r"""
  467. Applies inverse time decay to the initial learning rate.
  468. The algorithm can be described as following:
  469. .. math::
  470. new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch}
  471. Args:
  472. learning_rate (float): The initial learning rate. It is a python float number.
  473. gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
  474. It should be less than 1.0. Default: 0.1.
  475. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  476. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  477. Returns:
  478. ``InverseTimeDecay`` instance to schedule learning rate.
  479. Examples:
  480. .. code-block:: python
  481. :name: code-example1
  482. >>> # Example1: train on default dynamic graph mode
  483. >>> import paddle
  484. >>> import numpy as np
  485. >>> # train on default dynamic graph mode
  486. >>> linear = paddle.nn.Linear(10, 10)
  487. >>> scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
  488. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  489. >>> for epoch in range(20):
  490. ... for batch_id in range(5):
  491. ... x = paddle.uniform([10, 10])
  492. ... out = linear(x)
  493. ... loss = paddle.mean(out)
  494. ... loss.backward()
  495. ... sgd.step()
  496. ... sgd.clear_gradients()
  497. ... scheduler.step() # If you update learning rate each step
  498. ... # scheduler.step() # If you update learning rate each epoch
  499. .. code-block:: python
  500. :name: code-example2
  501. >>> # Example2: train on static graph mode
  502. >>> import paddle
  503. >>> import numpy as np
  504. >>> paddle.enable_static()
  505. >>> main_prog = paddle.static.Program()
  506. >>> start_prog = paddle.static.Program()
  507. >>> with paddle.static.program_guard(main_prog, start_prog):
  508. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  509. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  510. ... z = paddle.static.nn.fc(x, 100)
  511. ... loss = paddle.mean(z)
  512. ... scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
  513. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  514. ... sgd.minimize(loss)
  515. ...
  516. >>> exe = paddle.static.Executor()
  517. >>> exe.run(start_prog)
  518. >>> for epoch in range(20):
  519. ... for batch_id in range(5):
  520. ... out = exe.run(
  521. ... main_prog,
  522. ... feed={
  523. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  524. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  525. ... },
  526. ... fetch_list=loss.name)
  527. ... scheduler.step() # If you update learning rate each step
  528. ... # scheduler.step() # If you update learning rate each epoch
  529. ...
  530. """
  531. def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
  532. self.gamma = gamma
  533. super().__init__(learning_rate, last_epoch, verbose)
  534. def get_lr(self):
  535. return self.base_lr / (1 + self.gamma * self.last_epoch)
  536. class PolynomialDecay(LRScheduler):
  537. r"""
  538. Applies polynomial decay to the initial learning rate.
  539. The algorithm can be described as following.
  540. If cycle is set to True, then:
  541. .. math::
  542. decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps})
  543. new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr
  544. If cycle is set to False, then:
  545. .. math::
  546. epoch & = min(epoch, decay\_steps)
  547. new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr
  548. Args:
  549. learning_rate (float): The initial learning rate. It is a python float number.
  550. decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer.
  551. end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
  552. power(float, optional): Power of polynomial, should greater than 0.0 to get learning rate decay. Default: 1.0.
  553. cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease
  554. to ``end_lr`` . If False, the learning rate is monotone decreasing. Default: False.
  555. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  556. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  557. Returns:
  558. ``PolynomialDecay`` instance to schedule learning rate.
  559. Examples:
  560. .. code-block:: python
  561. :name: code-example1
  562. >>> # Example1: train on default dynamic graph mode
  563. >>> import paddle
  564. >>> import numpy as np
  565. >>> # train on default dynamic graph mode
  566. >>> linear = paddle.nn.Linear(10, 10)
  567. >>> scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
  568. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  569. >>> for epoch in range(20):
  570. ... for batch_id in range(5):
  571. ... x = paddle.uniform([10, 10])
  572. ... out = linear(x)
  573. ... loss = paddle.mean(out)
  574. ... loss.backward()
  575. ... sgd.step()
  576. ... sgd.clear_gradients()
  577. ... scheduler.step() # If you update learning rate each step
  578. ... # scheduler.step() # If you update learning rate each epoch
  579. .. code-block:: python
  580. :name: code-example2
  581. >>> # Example2: train on static graph mode
  582. >>> import paddle
  583. >>> import numpy as np
  584. >>> paddle.enable_static()
  585. >>> main_prog = paddle.static.Program()
  586. >>> start_prog = paddle.static.Program()
  587. >>> with paddle.static.program_guard(main_prog, start_prog):
  588. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  589. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  590. ... z = paddle.static.nn.fc(x, 100)
  591. ... loss = paddle.mean(z)
  592. ... scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
  593. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  594. ... sgd.minimize(loss)
  595. ...
  596. >>> exe = paddle.static.Executor()
  597. >>> exe.run(start_prog)
  598. >>> for epoch in range(20):
  599. ... for batch_id in range(5):
  600. ... out = exe.run(
  601. ... main_prog,
  602. ... feed={
  603. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  604. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  605. ... },
  606. ... fetch_list=loss.name)
  607. ... scheduler.step() # If you update learning rate each step
  608. ... # scheduler.step() # If you update learning rate each epoch
  609. """
  610. def __init__(
  611. self,
  612. learning_rate,
  613. decay_steps,
  614. end_lr=0.0001,
  615. power=1.0,
  616. cycle=False,
  617. last_epoch=-1,
  618. verbose=False,
  619. ):
  620. assert decay_steps > 0 and isinstance(
  621. decay_steps, int
  622. ), " 'decay_steps' must be a positive integer."
  623. self.decay_steps = decay_steps
  624. self.end_lr = end_lr
  625. assert (
  626. power > 0.0
  627. ), " 'power' must be greater than 0.0 so that the learning rate will decay."
  628. self.power = power
  629. self.cycle = cycle
  630. super().__init__(learning_rate, last_epoch, verbose)
  631. def get_lr(self):
  632. tmp_epoch_num = self.last_epoch
  633. tmp_decay_steps = self.decay_steps
  634. if self.cycle:
  635. div_res = math.ceil(
  636. float(self.last_epoch) / float(self.decay_steps)
  637. )
  638. if self.last_epoch == 0:
  639. div_res = 1
  640. tmp_decay_steps = self.decay_steps * div_res
  641. else:
  642. tmp_epoch_num = min(self.last_epoch, self.decay_steps)
  643. return (self.base_lr - self.end_lr) * (
  644. (1 - float(tmp_epoch_num) / float(tmp_decay_steps)) ** self.power
  645. ) + self.end_lr
  646. class LinearWarmup(LRScheduler):
  647. r"""
  648. Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
  649. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_
  650. When epoch < warmup_steps, learning rate is updated as:
  651. .. math::
  652. lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps}
  653. where start_lr is the initial learning rate, and end_lr is the final learning rate;
  654. When epoch >= warmup_steps, learning rate is updated as:
  655. .. math::
  656. lr = learning_rate
  657. where ``learning_rate`` is float or any subclass of ``LRScheduler`` .
  658. Args:
  659. learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` .
  660. warmup_steps (int): total steps of warm up. It must be a positive integer.
  661. start_lr (float): Initial learning rate of warm up.
  662. end_lr (float): Final learning rate of warm up.
  663. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  664. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  665. Returns:
  666. ``LinearWarmup`` instance to schedule learning rate.
  667. Examples:
  668. .. code-block:: python
  669. :name: code-example1
  670. >>> # Example1: train on default dynamic graph mode
  671. >>> import paddle
  672. >>> import numpy as np
  673. >>> # train on default dynamic graph mode
  674. >>> linear = paddle.nn.Linear(10, 10)
  675. >>> scheduler = paddle.optimizer.lr.LinearWarmup(
  676. ... learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
  677. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  678. >>> for epoch in range(20):
  679. ... for batch_id in range(5):
  680. ... x = paddle.uniform([10, 10])
  681. ... out = linear(x)
  682. ... loss = paddle.mean(out)
  683. ... loss.backward()
  684. ... sgd.step()
  685. ... sgd.clear_gradients()
  686. ... scheduler.step() # If you update learning rate each step
  687. ... # scheduler.step() # If you update learning rate each epoch
  688. .. code-block:: python
  689. :name: code-example2
  690. >>> # Example2: train on static graph mode
  691. >>> import paddle
  692. >>> import numpy as np
  693. >>> paddle.enable_static()
  694. >>> main_prog = paddle.static.Program()
  695. >>> start_prog = paddle.static.Program()
  696. >>> with paddle.static.program_guard(main_prog, start_prog):
  697. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  698. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  699. ... z = paddle.static.nn.fc(x, 100)
  700. ... loss = paddle.mean(z)
  701. ... scheduler = paddle.optimizer.lr.LinearWarmup(
  702. ... learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
  703. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  704. ... sgd.minimize(loss)
  705. ...
  706. >>> exe = paddle.static.Executor()
  707. >>> exe.run(start_prog)
  708. >>> for epoch in range(20):
  709. ... for batch_id in range(5):
  710. ... out = exe.run(
  711. ... main_prog,
  712. ... feed={
  713. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  714. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  715. ... },
  716. ... fetch_list=loss.name)
  717. ... scheduler.step() # If you update learning rate each step
  718. ... # scheduler.step() # If you update learning rate each epoch
  719. """
  720. def __init__(
  721. self,
  722. learning_rate,
  723. warmup_steps,
  724. start_lr,
  725. end_lr,
  726. last_epoch=-1,
  727. verbose=False,
  728. ):
  729. type_check = isinstance(learning_rate, (float, int, LRScheduler))
  730. if not type_check:
  731. raise TypeError(
  732. f"the type of learning_rate should be [int, float or LRScheduler], the current type is {learning_rate}"
  733. )
  734. self.learning_rate = learning_rate
  735. assert warmup_steps > 0 and isinstance(
  736. warmup_steps, int
  737. ), " 'warmup_steps' must be a positive integer."
  738. self.warmup_steps = warmup_steps
  739. self.start_lr = start_lr
  740. self.end_lr = end_lr
  741. assert (
  742. end_lr > start_lr
  743. ), f"end_lr {end_lr} must be greater than start_lr {start_lr}"
  744. super().__init__(start_lr, last_epoch, verbose)
  745. def state_dict(self):
  746. """
  747. Returns the state of the LinearWarmup scheduler as a :class:`dict`.
  748. It is a subset of ``self.__dict__`` .
  749. """
  750. state_dict = super().state_dict()
  751. if isinstance(self.learning_rate, LRScheduler):
  752. state_dict["LinearWarmup_LR"] = self.learning_rate.state_dict()
  753. return state_dict
  754. def set_state_dict(self, state_dict):
  755. """
  756. Loads state_dict for LinearWarmup scheduler.
  757. """
  758. super().set_state_dict(state_dict)
  759. if isinstance(self.learning_rate, LRScheduler):
  760. self.learning_rate.set_state_dict(state_dict["LinearWarmup_LR"])
  761. def get_lr(self):
  762. if self.last_epoch < self.warmup_steps:
  763. return (self.end_lr - self.start_lr) * float(
  764. self.last_epoch
  765. ) / float(self.warmup_steps) + self.start_lr
  766. else:
  767. if isinstance(self.learning_rate, LRScheduler):
  768. self.learning_rate.step(self.last_epoch - self.warmup_steps)
  769. return self.learning_rate()
  770. return self.learning_rate
  771. class ExponentialDecay(LRScheduler):
  772. r"""
  773. Update learning rate by `gamma` each epoch.
  774. The algorithm can be described as following.
  775. .. math::
  776. new\_learning\_rate = last\_learning\_rate * gamma
  777. Args:
  778. learning_rate (float): The initial learning rate. It is a python float number.
  779. gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
  780. It should be in interval (0.0, 1.0).
  781. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  782. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  783. Returns:
  784. ``ExponentialDecay`` instance to schedule learning rate.
  785. Examples:
  786. .. code-block:: python
  787. :name: code-example1
  788. >>> # Example1: train on default dynamic graph mode
  789. >>> import paddle
  790. >>> import numpy as np
  791. >>> # train on default dynamic graph mode
  792. >>> linear = paddle.nn.Linear(10, 10)
  793. >>> scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
  794. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  795. >>> for epoch in range(20):
  796. ... for batch_id in range(5):
  797. ... x = paddle.uniform([10, 10])
  798. ... out = linear(x)
  799. ... loss = paddle.mean(out)
  800. ... loss.backward()
  801. ... sgd.step()
  802. ... sgd.clear_gradients()
  803. ... scheduler.step() # If you update learning rate each step
  804. ... # scheduler.step() # If you update learning rate each epoch
  805. .. code-block:: python
  806. :name: code-example2
  807. >>> # Example2: train on static graph mode
  808. >>> import paddle
  809. >>> import numpy as np
  810. >>> paddle.enable_static()
  811. >>> main_prog = paddle.static.Program()
  812. >>> start_prog = paddle.static.Program()
  813. >>> with paddle.static.program_guard(main_prog, start_prog):
  814. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  815. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  816. ... z = paddle.static.nn.fc(x, 100)
  817. ... loss = paddle.mean(z)
  818. ... scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
  819. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  820. ... sgd.minimize(loss)
  821. ...
  822. >>> exe = paddle.static.Executor()
  823. >>> exe.run(start_prog)
  824. >>> for epoch in range(20):
  825. ... for batch_id in range(5):
  826. ... out = exe.run(
  827. ... main_prog,
  828. ... feed={
  829. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  830. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  831. ... },
  832. ... fetch_list=loss.name)
  833. ... scheduler.step() # If you update learning rate each step
  834. ... # scheduler.step() # If you update learning rate each epoch
  835. """
  836. def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
  837. assert (
  838. gamma > 0.0 and gamma < 1.0
  839. ), " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay."
  840. self.gamma = gamma
  841. super().__init__(learning_rate, last_epoch, verbose)
  842. def get_lr(self):
  843. return self.base_lr * (self.gamma**self.last_epoch)
  844. class MultiStepDecay(LRScheduler):
  845. """
  846. Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.
  847. The algorithm can be described as the code below.
  848. .. code-block:: text
  849. learning_rate = 0.5
  850. milestones = [30, 50]
  851. gamma = 0.1
  852. if epoch < 30:
  853. learning_rate = 0.5
  854. elif epoch < 50:
  855. learning_rate = 0.05
  856. else:
  857. learning_rate = 0.005
  858. Args:
  859. learning_rate (float): The initial learning rate. It is a python float number.
  860. milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
  861. gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
  862. It should be less than 1.0. Default: 0.1.
  863. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  864. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  865. Returns:
  866. ``MultiStepDecay`` instance to schedule learning rate.
  867. Examples:
  868. .. code-block:: python
  869. :name: code-example1
  870. >>> # Example1: train on default dynamic graph mode
  871. >>> import paddle
  872. >>> import numpy as np
  873. >>> # train on default dynamic graph mode
  874. >>> linear = paddle.nn.Linear(10, 10)
  875. >>> scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
  876. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  877. >>> for epoch in range(20):
  878. ... for batch_id in range(5):
  879. ... x = paddle.uniform([10, 10])
  880. ... out = linear(x)
  881. ... loss = paddle.mean(out)
  882. ... loss.backward()
  883. ... sgd.step()
  884. ... sgd.clear_gradients()
  885. ... scheduler.step() # If you update learning rate each step
  886. ... # scheduler.step() # If you update learning rate each epoch
  887. .. code-block:: python
  888. :name: code-example2
  889. >>> # Example2: train on static graph mode
  890. >>> import paddle
  891. >>> import numpy as np
  892. >>> paddle.enable_static()
  893. >>> main_prog = paddle.static.Program()
  894. >>> start_prog = paddle.static.Program()
  895. >>> with paddle.static.program_guard(main_prog, start_prog):
  896. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  897. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  898. ... z = paddle.static.nn.fc(x, 100)
  899. ... loss = paddle.mean(z)
  900. ... scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
  901. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  902. ... sgd.minimize(loss)
  903. ...
  904. >>> exe = paddle.static.Executor()
  905. >>> exe.run(start_prog)
  906. >>> for epoch in range(20):
  907. ... for batch_id in range(5):
  908. ... out = exe.run(
  909. ... main_prog,
  910. ... feed={
  911. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  912. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  913. ... },
  914. ... fetch_list=loss.name)
  915. ... scheduler.step() # If you update learning rate each step
  916. ... # scheduler.step() # If you update learning rate each epoch
  917. """
  918. def __init__(
  919. self, learning_rate, milestones, gamma=0.1, last_epoch=-1, verbose=False
  920. ):
  921. if not isinstance(milestones, (tuple, list)):
  922. raise TypeError(
  923. "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
  924. % type(milestones)
  925. )
  926. if not all(
  927. milestones[i] < milestones[i + 1]
  928. for i in range(len(milestones) - 1)
  929. ):
  930. raise ValueError('The elements of milestones must be incremented')
  931. if gamma >= 1.0:
  932. raise ValueError('gamma should be < 1.0.')
  933. self.milestones = milestones
  934. self.gamma = gamma
  935. super().__init__(learning_rate, last_epoch, verbose)
  936. def get_lr(self):
  937. for i in range(len(self.milestones)):
  938. if self.last_epoch < self.milestones[i]:
  939. return self.base_lr * (self.gamma**i)
  940. return self.base_lr * (self.gamma ** len(self.milestones))
  941. class StepDecay(LRScheduler):
  942. """
  943. Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch.
  944. The algorithm can be described as the code below.
  945. .. code-block:: text
  946. learning_rate = 0.5
  947. step_size = 30
  948. gamma = 0.1
  949. learning_rate = 0.5 if epoch < 30
  950. learning_rate = 0.05 if 30 <= epoch < 60
  951. learning_rate = 0.005 if 60 <= epoch < 90
  952. ...
  953. Args:
  954. learning_rate (float): The initial learning rate. It is a python float number.
  955. step_size (int): the interval to update. It must be a positive integer.
  956. gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
  957. It should be less than 1.0. Default: 0.1.
  958. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  959. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  960. Returns:
  961. ``StepDecay`` instance to schedule learning rate.
  962. Examples:
  963. .. code-block:: python
  964. :name: code-example1
  965. >>> # Example1: train on default dynamic graph mode
  966. >>> import paddle
  967. >>> import numpy as np
  968. >>> # train on default dynamic graph mode
  969. >>> linear = paddle.nn.Linear(10, 10)
  970. >>> scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
  971. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  972. >>> for epoch in range(20):
  973. ... for batch_id in range(5):
  974. ... x = paddle.uniform([10, 10])
  975. ... out = linear(x)
  976. ... loss = paddle.mean(out)
  977. ... loss.backward()
  978. ... sgd.step()
  979. ... sgd.clear_gradients()
  980. ... scheduler.step() # If you update learning rate each step
  981. ... # scheduler.step() # If you update learning rate each epoch
  982. .. code-block:: python
  983. :name: code-example2
  984. >>> # Example2: train on static graph mode
  985. >>> import paddle
  986. >>> import numpy as np
  987. >>> paddle.enable_static()
  988. >>> main_prog = paddle.static.Program()
  989. >>> start_prog = paddle.static.Program()
  990. >>> with paddle.static.program_guard(main_prog, start_prog):
  991. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  992. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  993. ... z = paddle.static.nn.fc(x, 100)
  994. ... loss = paddle.mean(z)
  995. ... scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
  996. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  997. ... sgd.minimize(loss)
  998. ...
  999. >>> exe = paddle.static.Executor()
  1000. >>> exe.run(start_prog)
  1001. >>> for epoch in range(20):
  1002. ... for batch_id in range(5):
  1003. ... out = exe.run(
  1004. ... main_prog,
  1005. ... feed={
  1006. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  1007. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  1008. ... },
  1009. ... fetch_list=loss.name)
  1010. ... scheduler.step() # If you update learning rate each step
  1011. ... # scheduler.step() # If you update learning rate each epoch
  1012. """
  1013. def __init__(
  1014. self, learning_rate, step_size, gamma=0.1, last_epoch=-1, verbose=False
  1015. ):
  1016. if not isinstance(step_size, int):
  1017. raise TypeError(
  1018. "The type of 'step_size' must be 'int', but received %s."
  1019. % type(step_size)
  1020. )
  1021. if gamma >= 1.0:
  1022. raise ValueError('gamma should be < 1.0.')
  1023. assert step_size > 0 and isinstance(
  1024. step_size, int
  1025. ), " 'step_size' must be a positive integer."
  1026. self.step_size = step_size
  1027. self.gamma = gamma
  1028. super().__init__(learning_rate, last_epoch, verbose)
  1029. def get_lr(self):
  1030. i = self.last_epoch // self.step_size
  1031. return self.base_lr * (self.gamma**i)
  1032. class LambdaDecay(LRScheduler):
  1033. """
  1034. Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is function which receives ``epoch`` .
  1035. The algorithm can be described as the code below.
  1036. .. code-block:: text
  1037. learning_rate = 0.5 # init learning_rate
  1038. lr_lambda = lambda epoch: 0.95 ** epoch
  1039. learning_rate = 0.5 # epoch 0, 0.5*0.95**0
  1040. learning_rate = 0.475 # epoch 1, 0.5*0.95**1
  1041. learning_rate = 0.45125 # epoch 2, 0.5*0.95**2
  1042. Args:
  1043. learning_rate (float): The initial learning rate. It is a python float number.
  1044. lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor.
  1045. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
  1046. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  1047. Returns:
  1048. ``LambdaDecay`` instance to schedule learning rate.
  1049. Examples:
  1050. .. code-block:: python
  1051. :name: code-example1
  1052. >>> # Example1: train on default dynamic graph mode
  1053. >>> import paddle
  1054. >>> import numpy as np
  1055. >>> # train on default dynamic graph mode
  1056. >>> linear = paddle.nn.Linear(10, 10)
  1057. >>> scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
  1058. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  1059. >>> for epoch in range(20):
  1060. ... for batch_id in range(5):
  1061. ... x = paddle.uniform([10, 10])
  1062. ... out = linear(x)
  1063. ... loss = paddle.mean(out)
  1064. ... loss.backward()
  1065. ... sgd.step()
  1066. ... sgd.clear_gradients()
  1067. ... scheduler.step() # If you update learning rate each step
  1068. ... # scheduler.step() # If you update learning rate each epoch
  1069. .. code-block:: python
  1070. :name: code-example2
  1071. >>> # Example2: train on static graph mode
  1072. >>> import paddle
  1073. >>> import numpy as np
  1074. >>> paddle.enable_static()
  1075. >>> main_prog = paddle.static.Program()
  1076. >>> start_prog = paddle.static.Program()
  1077. >>> with paddle.static.program_guard(main_prog, start_prog):
  1078. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  1079. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  1080. ... z = paddle.static.nn.fc(x, 100)
  1081. ... loss = paddle.mean(z)
  1082. ... scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
  1083. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  1084. ... sgd.minimize(loss)
  1085. ...
  1086. >>> exe = paddle.static.Executor()
  1087. >>> exe.run(start_prog)
  1088. >>> for epoch in range(20):
  1089. ... for batch_id in range(5):
  1090. ... out = exe.run(
  1091. ... main_prog,
  1092. ... feed={
  1093. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  1094. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  1095. ... },
  1096. ... fetch_list=loss.name)
  1097. ... scheduler.step() # If you update learning rate each step
  1098. ... # scheduler.step() # If you update learning rate each epoch
  1099. ...
  1100. """
  1101. def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
  1102. if not callable(lr_lambda):
  1103. raise TypeError(
  1104. "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
  1105. % type(lr_lambda)
  1106. )
  1107. self.lr_lambda = lr_lambda
  1108. super().__init__(learning_rate, last_epoch, verbose)
  1109. def get_lr(self):
  1110. return self.base_lr * self.lr_lambda(self.last_epoch)
  class ReduceOnPlateau(LRScheduler):
      """
      Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate
      by 2 to 10 times once model performance no longer improves.

      The ``metrics`` is the value passed into ``step`` ; its shape must be [] or [1]. When ``metrics``
      stops descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` .
      (Specially, ``mode`` can also be set to ``'max'`` , in this case, when ``metrics`` stops ascending for a ``patience``
      number of epochs, the learning rate will be reduced.)

      In addition, after each reduction, it will wait a ``cooldown`` number of epochs before resuming above operation.

      Args:
          learning_rate (float): The initial learning rate. It is a python float number.
          mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the
              learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning
              rate will reduce when ``loss`` stops ascending. Default: ``'min'`` .
          factor (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * factor`` .
              It should be less than 1.0. Default: 0.1.
          patience (int, optional): The learning rate is reduced once the number of consecutive epochs without
              improvement of ``loss`` exceeds this value. Default: 10.
          threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` .
              This makes tiny changes of ``loss`` be ignored. Default: 1e-4.
          threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
              is ``best_loss * threshold`` , where ``best_loss`` is the best ``loss`` recorded so far. In ``'abs'`` mode, the minimum
              change of ``loss`` is ``threshold`` . Default: ``'rel'`` .
          cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0.
          min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0.
          epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon,
              the update is ignored. Default: 1e-8.
          verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.

      Returns:
          ``ReduceOnPlateau`` instance to schedule learning rate.

      Raises:
          ValueError: If ``mode`` or ``threshold_mode`` is unknown, or if ``factor`` is not less than 1.0.
          TypeError: If ``learning_rate`` is neither ``float`` nor ``int``.

      Examples:
          .. code-block:: python
              :name: code-example1

              >>> # Example1: train on default dynamic graph mode
              >>> import paddle
              >>> import numpy as np

              >>> # train on default dynamic graph mode
              >>> linear = paddle.nn.Linear(10, 10)
              >>> scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
              >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
              >>> for epoch in range(20):
              ...     for batch_id in range(5):
              ...         x = paddle.uniform([10, 10])
              ...         out = linear(x)
              ...         loss = paddle.mean(out)
              ...         loss.backward()
              ...         sgd.step()
              ...         sgd.clear_gradients()
              ...         scheduler.step(loss)    # If you update learning rate each step
              ...     # scheduler.step(loss)        # If you update learning rate each epoch

          .. code-block:: python
              :name: code-example2

              >>> # Example2: train on static graph mode
              >>> import paddle
              >>> import numpy as np
              >>> paddle.enable_static()
              >>> main_prog = paddle.static.Program()
              >>> start_prog = paddle.static.Program()
              >>> with paddle.static.program_guard(main_prog, start_prog):
              ...     x = paddle.static.data(name='x', shape=[None, 4, 5])
              ...     y = paddle.static.data(name='y', shape=[None, 4, 5])
              ...     z = paddle.static.nn.fc(x, 100)
              ...     loss = paddle.mean(z)
              ...     scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
              ...     sgd = paddle.optimizer.SGD(learning_rate=scheduler)
              ...     sgd.minimize(loss)
              ...
              >>> exe = paddle.static.Executor()
              >>> exe.run(start_prog)
              >>> for epoch in range(20):
              ...     for batch_id in range(5):
              ...         out = exe.run(
              ...             main_prog,
              ...             feed={
              ...                 'x': np.random.randn(3, 4, 5).astype('float32'),
              ...                 'y': np.random.randn(3, 4, 5).astype('float32')
              ...             },
              ...             fetch_list=loss.name)
              ...         scheduler.step(out[0])    # If you update learning rate each step
              ...     # scheduler.step(out[0])        # If you update learning rate each epoch
              ...
      """

      def __init__(
          self,
          learning_rate,
          mode='min',
          factor=0.1,
          patience=10,
          threshold=1e-4,
          threshold_mode='rel',
          cooldown=0,
          min_lr=0,
          epsilon=1e-8,
          verbose=False,
      ):
          # Both mode strings are matched case-insensitively.
          mode = mode.lower()
          if mode not in ['min', 'max']:
              raise ValueError('mode: ' + mode + ' is unknown!')
          self.mode = mode

          if factor >= 1.0:
              # The message names the real parameter (``factor``); it used to
              # mention a non-existent ``gamma`` argument.
              raise ValueError(
                  'new_lr = origin_lr * factor and factor should be < 1.0.'
              )
          self.factor = factor

          threshold_mode = threshold_mode.lower()
          if threshold_mode not in ['rel', 'abs']:
              raise ValueError(
                  'threshold mode: ' + threshold_mode + ' is unknown!'
              )
          self.threshold_mode = threshold_mode

          if not isinstance(learning_rate, (float, int)):
              raise TypeError(
                  "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s."
                  % type(learning_rate)
              )

          self.patience = patience
          self.threshold = threshold
          self.cooldown = cooldown
          self.min_lr = min_lr
          self.epsilon = epsilon

          # Plateau-tracking state, updated by ``step``.
          self.cooldown_counter = 0
          self.best = None
          self.num_bad_epochs = 0

          # Can not call Parent __init__, so implement here.
          self.base_lr = float(learning_rate)
          self.last_lr = float(learning_rate)
          self.last_epoch = 0
          self.verbose = verbose
          self._var_name = None

      def state_keys(self):
          """Record the attribute names that make up the scheduler state.

          "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
          """
          self.keys = [
              'cooldown_counter',
              'best',
              'num_bad_epochs',
              'last_epoch',
              'last_lr',
          ]

      def step(self, metrics, epoch=None):
          """
          step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` .
          The new learning rate will take effect on next epoch.

          Args:
              metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce.
                  If it stops descending for a ``patience`` number of epochs, the learning rate will reduce. If it's 'Tensor' or
                  'numpy.ndarray', its numel must be 1.
              epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=0.

          Returns:
              None

          Raises:
              AssertionError: If ``metrics`` is a Tensor or ndarray whose size is not 1.
              TypeError: If ``metrics`` has an unsupported type.

          Examples:
              Please refer to the example of current LRScheduler.
          """
          if epoch is None:
              self.last_epoch = self.last_epoch + 1
          else:
              self.last_epoch = epoch

          # loss must be float, numpy.ndarray or 1-D Tensor with numel 1
          if isinstance(metrics, (core.eager.Tensor, numpy.ndarray)):
              assert metrics.size == 1, (
                  f"the size of metrics must be 1, but the current metrics.size is {metrics.size}. Maybe that "
                  "you should call paddle.mean to process it first."
              )
          elif not isinstance(
              metrics, (int, float, numpy.float32, numpy.float64)
          ):
              raise TypeError(
                  f"metrics must be 'int', 'float', 'np.float64', 'numpy.ndarray' or 'paddle.Tensor', but receive {type(metrics)}"
              )

          if self.cooldown_counter > 0:
              # Still cooling down after a reduction: only count down, the
              # metric is not compared against ``best`` on this call.
              self.cooldown_counter -= 1
          else:
              if self.best is None or self._is_better(metrics, self.best):
                  self.best = metrics
                  self.num_bad_epochs = 0
              else:
                  self.num_bad_epochs += 1

              if self.num_bad_epochs > self.patience:
                  self.cooldown_counter = self.cooldown
                  self.num_bad_epochs = 0
                  new_lr = max(self.last_lr * self.factor, self.min_lr)
                  # Reductions smaller than ``epsilon`` are ignored.
                  if self.last_lr - new_lr > self.epsilon:
                      self.last_lr = new_lr
                      if self.verbose:
                          print(
                              f'Epoch {self.last_epoch}: {self.__class__.__name__} set learning rate to {self.last_lr}.'
                          )

      def _is_better(self, current, best):
          """Return whether ``current`` beats ``best`` by more than the configured threshold."""
          if self.mode == 'min' and self.threshold_mode == 'rel':
              return current < best - best * self.threshold
          elif self.mode == 'min' and self.threshold_mode == 'abs':
              return current < best - self.threshold
          elif self.mode == 'max' and self.threshold_mode == 'rel':
              return current > best + best * self.threshold
          else:
              # mode == 'max' and threshold_mode == 'abs'
              return current > best + self.threshold
  class CosineAnnealingDecay(LRScheduler):
      r"""
      Schedule the learning rate along a cosine annealing curve. :math:`\eta_{max}` is the
      initial learning_rate and :math:`T_{cur}` is the number of epochs since the last restart in
      SGDR.

      The algorithm can be described as following.

      .. math::

          \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
          + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
          & T_{cur} \neq (2k+1)T_{max};

          \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
          \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
          & T_{cur} = (2k+1)T_{max}.

      It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_.
      Note that this only implements the cosine annealing part of SGDR, and not the restarts.

      Args:
          learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to python float or int number.
          T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. It must be a positive integer.
          eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0.
          last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
          verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

      Returns:
          ``CosineAnnealingDecay`` instance to schedule learning rate.

      Examples:
          .. code-block:: python
              :name: code-example1

              >>> # Example1: train on default dynamic graph mode
              >>> import paddle
              >>> import numpy as np

              >>> # train on default dynamic graph mode
              >>> linear = paddle.nn.Linear(10, 10)
              >>> scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
              >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
              >>> for epoch in range(20):
              ...     for batch_id in range(5):
              ...         x = paddle.uniform([10, 10])
              ...         out = linear(x)
              ...         loss = paddle.mean(out)
              ...         loss.backward()
              ...         sgd.step()
              ...         sgd.clear_gradients()
              ...         scheduler.step()    # If you update learning rate each step
              ...     # scheduler.step()        # If you update learning rate each epoch

          .. code-block:: python
              :name: code-example2

              >>> # Example2: train on static graph mode
              >>> import paddle
              >>> import numpy as np
              >>> paddle.enable_static()
              >>> main_prog = paddle.static.Program()
              >>> start_prog = paddle.static.Program()
              >>> with paddle.static.program_guard(main_prog, start_prog):
              ...     x = paddle.static.data(name='x', shape=[None, 4, 5])
              ...     y = paddle.static.data(name='y', shape=[None, 4, 5])
              ...     z = paddle.static.nn.fc(x, 100)
              ...     loss = paddle.mean(z)
              ...     scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
              ...     sgd = paddle.optimizer.SGD(learning_rate=scheduler)
              ...     sgd.minimize(loss)
              ...
              >>> exe = paddle.static.Executor()
              >>> exe.run(start_prog)
              >>> for epoch in range(20):
              ...     for batch_id in range(5):
              ...         out = exe.run(
              ...             main_prog,
              ...             feed={
              ...                 'x': np.random.randn(3, 4, 5).astype('float32'),
              ...                 'y': np.random.randn(3, 4, 5).astype('float32')
              ...             },
              ...             fetch_list=loss.name)
              ...         scheduler.step()    # If you update learning rate each step
              ...     # scheduler.step()        # If you update learning rate each epoch
      """

      def __init__(
          self, learning_rate, T_max, eta_min=0, last_epoch=-1, verbose=False
      ):
          t_max_is_int = isinstance(T_max, int)
          if not t_max_is_int:
              t_max_message = (
                  "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s."
                  % type(T_max)
              )
              raise TypeError(t_max_message)

          eta_min_is_number = isinstance(eta_min, (float, int))
          if not eta_min_is_number:
              eta_min_message = (
                  "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s."
                  % type(eta_min)
              )
              raise TypeError(eta_min_message)

          # T_max is already known to be an int here, so only its sign is checked.
          assert T_max > 0, " 'T_max' must be a positive integer."

          self.T_max = T_max
          self.eta_min = float(eta_min)
          super().__init__(learning_rate, last_epoch, verbose)

      def get_lr(self):
          """Return the learning rate for ``last_epoch``, derived from ``last_lr``."""
          step = self.last_epoch
          if step == 0:
              return self.base_lr

          period = self.T_max
          if (step - 1 - period) % (2 * period) == 0:
              # The previous step sat at an odd multiple of T_max, where the
              # ratio form below would divide by zero; use the additive form.
              increment = (
                  (self.base_lr - self.eta_min)
                  * (1 - math.cos(math.pi / period))
                  / 2
              )
              return self.last_lr + increment

          numerator = 1 + math.cos(math.pi * step / period)
          denominator = 1 + math.cos(math.pi * (step - 1) / period)
          return (
              numerator / denominator * (self.last_lr - self.eta_min)
              + self.eta_min
          )

      def _get_closed_form_lr(self):
          """Return the learning rate for ``last_epoch`` from the closed-form cosine expression."""
          cosine_term = 1 + math.cos(math.pi * self.last_epoch / self.T_max)
          amplitude = self.base_lr - self.eta_min
          return self.eta_min + amplitude * cosine_term / 2
  class MultiplicativeDecay(LRScheduler):
      """
      Multiply the learning rate of ``optimizer`` by the factor given in function ``lr_lambda`` .

      The algorithm can be described as the code below.

      .. code-block:: text

          learning_rate = 0.5        # init learning_rate
          lr_lambda = lambda epoch: 0.95

          learning_rate = 0.5        # epoch 0,
          learning_rate = 0.475      # epoch 1, 0.5*0.95
          learning_rate = 0.45125    # epoch 2, 0.475*0.95

      Args:
          learning_rate (float): The initial learning rate. It is a python float number.
          lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the last learning rate by this factor.
          last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
          verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

      Returns:
          ``MultiplicativeDecay`` instance to schedule learning rate.

      Examples:
          .. code-block:: python

              >>> import paddle

              >>> # train on default dynamic graph mode
              >>> linear = paddle.nn.Linear(10, 10)
              >>> scheduler = paddle.optimizer.lr.MultiplicativeDecay(learning_rate=0.5, lr_lambda=lambda x:0.95, verbose=True)
              >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
              >>> for epoch in range(20):
              ...     for batch_id in range(5):
              ...         x = paddle.uniform([10, 10])
              ...         out = linear(x)
              ...         loss = paddle.mean(out)
              ...         loss.backward()
              ...         sgd.step()
              ...         sgd.clear_gradients()
              ...         scheduler.step()    # If you update learning rate each step
              ...     # scheduler.step()        # If you update learning rate each epoch
              ...
      """

      def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
          if callable(lr_lambda):
              # Attach the factor function first, then let the parent
              # constructor finish the setup (same order as before).
              self.lr_lambda = lr_lambda
              super().__init__(learning_rate, last_epoch, verbose)
              return

          received_type = type(lr_lambda)
          message = (
              "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s."
              % received_type
          )
          raise TypeError(message)

      def get_lr(self):
          """Return ``base_lr`` multiplied by ``lr_lambda(e)`` for every ``e`` in ``1..last_epoch``."""
          rate = self.base_lr
          # The product is rebuilt from base_lr on every call, one factor per
          # epoch index from 1 up to and including last_epoch.
          for completed in range(self.last_epoch):
              factor = self.lr_lambda(completed + 1)
              rate = rate * factor
          return rate
  class OneCycleLR(LRScheduler):
      r"""
      Sets the learning rate according to the one cycle learning rate scheduler.
      The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
      from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.

      It has been proposed in `Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates <https://arxiv.org/abs/1708.07120>`_.

      Please note that the default behaviour of this scheduler follows the fastai implementation of one cycle,
      which claims that “unpublished work has shown even better results by using only two phases”.
      If you want the behaviour of this scheduler to be consistent with the paper, please set ``three_phase=True`` .

      Also note that you should update learning rate each step.

      Args:
          max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` .
          total_steps (int): Number of total training steps.
          divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
          end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. Default: 0.0001.
          phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3.
          anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'.
          three_phase (bool, optional): Whether to use three phase. Default: ``False`` .

              If ``True``:

                  1. The learning rate will first increase from initial learning rate to maximum learning rate.
                  2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase.
                  3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate.

              If ``False``:

                  1. The learning rate will increase to maximum learning rate.
                  2. Then it will directly decrease to minimum learning rate.

          last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
          verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

      Returns:
          ``OneCycleLR`` instance to schedule learning rate.

      Raises:
          TypeError: If ``max_learning_rate``, ``end_learning_rate``, ``total_steps``, ``phase_pct`` or
              ``divide_factor`` has an unsupported type.
          ValueError: If a numeric argument is out of range, if ``phase_pct`` is not below 0.5 while
              ``three_phase`` is ``True``, or if ``anneal_strategy`` is unknown.

      Examples:
          .. code-block:: python
              :name: code-example1

              >>> # Example1: train on default dynamic graph mode
              >>> import paddle
              >>> import numpy as np

              >>> # train on default dynamic graph mode
              >>> linear = paddle.nn.Linear(10, 10)
              >>> scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True)
              >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
              >>> for epoch in range(5):
              ...     for batch_id in range(20):
              ...         x = paddle.uniform([10, 10])
              ...         out = linear(x)
              ...         loss = paddle.mean(out)
              ...         loss.backward()
              ...         sgd.step()
              ...         sgd.clear_gradients()
              ...         scheduler.step()        # You should update learning rate each step

          .. code-block:: python
              :name: code-example2

              >>> # Example2: train on static graph mode
              >>> import paddle
              >>> import numpy as np
              >>> paddle.enable_static()
              >>> main_prog = paddle.static.Program()
              >>> start_prog = paddle.static.Program()
              >>> with paddle.static.program_guard(main_prog, start_prog):
              ...     x = paddle.static.data(name='x', shape=[None, 4, 5])
              ...     y = paddle.static.data(name='y', shape=[None, 4, 5])
              ...     z = paddle.static.nn.fc(x, 100)
              ...     loss = paddle.mean(z)
              ...     scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True)
              ...     sgd = paddle.optimizer.SGD(learning_rate=scheduler)
              ...     sgd.minimize(loss)
              ...
              >>> exe = paddle.static.Executor()
              >>> exe.run(start_prog)
              >>> for epoch in range(5):
              ...     for batch_id in range(20):
              ...         out = exe.run(
              ...             main_prog,
              ...             feed={
              ...                 'x': np.random.randn(3, 4, 5).astype('float32'),
              ...                 'y': np.random.randn(3, 4, 5).astype('float32')
              ...             },
              ...             fetch_list=loss.name)
              ...         scheduler.step()    # You should update learning rate each step
              ...
      """

      def __init__(
          self,
          max_learning_rate,
          total_steps,
          divide_factor=25.0,
          end_learning_rate=0.0001,
          phase_pct=0.3,
          anneal_strategy='cos',
          three_phase=False,
          last_epoch=-1,
          verbose=False,
      ):
          # Check type and value of max_learning_rate
          if not isinstance(max_learning_rate, (float, int)):
              raise TypeError(
                  f"'max_learning_rate' must be 'float' or 'int', but received {type(max_learning_rate)}"
              )
          if max_learning_rate < 0:
              raise ValueError(
                  "'max_learning_rate' must be a non-negative number."
              )

          # Check type and value of end_learning_rate
          if not isinstance(end_learning_rate, (float, int)):
              raise TypeError(
                  f"'end_learning_rate' must be 'float' or 'int', but received {type(end_learning_rate)}"
              )
          if end_learning_rate < 0:
              raise ValueError(
                  "'end_learning_rate' must be a non-negative number."
              )

          # Check type and value of total_steps
          if not isinstance(total_steps, int):
              raise TypeError(
                  f"'total_steps' must be 'int', but received {type(total_steps)}"
              )
          if total_steps <= 0:
              raise ValueError("'total_steps' must be a positive integer.")
          self.total_steps = total_steps

          # Check type and value of phase_pct
          if not isinstance(phase_pct, float):
              raise TypeError(
                  f"'phase_pct' must be 'float', but received {type(phase_pct)}"
              )
          if phase_pct < 0 or phase_pct > 1:
              raise ValueError(
                  f"'phase_pct' must be between 0 and 1, but received {phase_pct}"
              )

          # Check type and value of divide_factor
          if not isinstance(divide_factor, (float, int)):
              raise TypeError(
                  f"'divide_factor' must be 'float' or 'int', but received {type(divide_factor)}"
              )

          initial_lr = max_learning_rate / float(divide_factor)
          min_lr = float(end_learning_rate)

          if three_phase:
              if phase_pct >= 0.5:
                  raise ValueError(
                      "When three_phase is True, 'phase_pct' must be less than 0.5"
                  )
              # start step and end step of each phase.
              self._step_config = [
                  0,
                  phase_pct * self.total_steps - 1,
                  2 * phase_pct * self.total_steps - 2,
                  self.total_steps - 1,
                  self.total_steps - 1,  # for the last step.
              ]
              # step size of each phase.
              self._steps_size = [
                  self._step_config[1] - self._step_config[0],
                  self._step_config[2] - self._step_config[1],
                  self._step_config[3] - self._step_config[2],
                  self._step_config[3]
                  - self._step_config[2],  # for the last step.
              ]
              # start lr and end lr of each phase.
              self._lr_config = [
                  initial_lr,
                  max_learning_rate,
                  initial_lr,
                  min_lr,
              ]
          else:
              # start step and end step of each phase.
              self._step_config = [
                  0,
                  phase_pct * self.total_steps - 1,
                  self.total_steps - 1,
                  self.total_steps - 1,  # for the last step.
              ]
              # step size of each phase.
              self._steps_size = [
                  self._step_config[1] - self._step_config[0],
                  self._step_config[2] - self._step_config[1],
                  self._step_config[2]
                  - self._step_config[1],  # for the last step.
              ]
              # start lr and end lr of each phase.
              self._lr_config = [initial_lr, max_learning_rate, min_lr]

          # Check anneal_strategy
          if anneal_strategy == 'cos':
              self.anneal_func = self._cos_annealing
          elif anneal_strategy == 'linear':
              self.anneal_func = self._linear_annealing
          else:
              raise ValueError(
                  f"'anneal_strategy' must be one of 'cos' or 'linear', but received {anneal_strategy}"
              )
          super().__init__(initial_lr, last_epoch, verbose)

      def _cos_annealing(self, start_lr, end_lr, pct):
          """Interpolate from ``start_lr`` (pct=0) to ``end_lr`` (pct=1) along a half cosine."""
          cos_out = math.cos(math.pi * pct) + 1
          return end_lr + (start_lr - end_lr) / 2.0 * cos_out

      def _linear_annealing(self, start_lr, end_lr, pct):
          """Interpolate linearly from ``start_lr`` (pct=0) to ``end_lr`` (pct=1)."""
          return (end_lr - start_lr) * pct + start_lr

      def get_lr(self):
          """Return the learning rate for the current step (``last_epoch``).

          Raises:
              ValueError: If the scheduler has been stepped more than ``total_steps`` times.
          """
          current_step = self.last_epoch

          if current_step > self.total_steps:
              raise ValueError(
                  f"Tried to step {current_step} times. However the number of total steps is {self.total_steps}"
              )

          # Index of the final phase; it also catches the last step, which
          # would otherwise fall through the loop and return None.
          last_phase = len(self._lr_config) - 2
          for i, (end_step, step_size) in enumerate(
              zip(self._step_config[1:], self._steps_size)
          ):
              if current_step <= end_step or i == last_phase:
                  if step_size == 0:
                      # A zero-length phase (e.g. phase_pct * total_steps == 1)
                      # starts and ends on the same step; treat it as already
                      # complete instead of dividing by zero.
                      percentage = 1.0
                  else:
                      # self._step_config[i] means start step of a phase.
                      percentage = (
                          current_step - self._step_config[i]
                      ) / step_size
                  return self.anneal_func(
                      self._lr_config[i], self._lr_config[i + 1], percentage
                  )
  1670. class CyclicLR(LRScheduler):
  1671. r"""
  1672. Set the learning rate according to the cyclic learning rate (CLR) scheduler.
  1673. The scheduler regards the process of learning rate adjustment as one cycle after another.
  1674. It cycles the learning rate between two boundaries with a constant frequency.
  1675. The distance between the two boundaries can be scaled on a per-iteration or per-cycle basis.
  1676. It has been proposed in `Cyclic Learning Rates for Training Neural Networks <https://arxiv.org/abs/1506.01186>`_.
  1677. According to the paper, the cyclic learning rate schedule has three build-in scale methods:
  1678. * "triangular": A basic triangular cycle without any amplitude scaling.
  1679. * "triangular2": A basic triangular cycle that reduce initial amplitude by half each cycle.
  1680. * "exp_range": A cycle that scales initial amplitude by scale function which is defined as :math:`gamma^{iterations}` .
  1681. The initial amplitude is defined as max_learning_rate - base_learning_rate.
  1682. Also note that you should update learning rate each step.
  1683. Args:
  1684. base_learning_rate (float): Initial learning rate, which is the lower boundary in the cycle. The paper recommends
  1685. that set the base_learning_rate to 1/3 or 1/4 of max_learning_rate.
  1686. max_learning_rate (float): Maximum learning rate in the cycle. It defines the cycle amplitude as above.
  1687. Since there is some scaling operation during process of learning rate adjustment,
  1688. max_learning_rate may not actually be reached.
  1689. step_size_up (int): Number of training steps, which is used to increase learning rate in a cycle.
  1690. The step size of one cycle will be defined by step_size_up + step_size_down. According to the paper, step
  1691. size should be set as at least 3 or 4 times steps in one epoch.
  1692. step_size_down (int, optional): Number of training steps, which is used to decrease learning rate in a cycle.
  1693. If not specified, it's value will initialize to `` step_size_up `` . Default: None
  1694. mode (str, optional): one of 'triangular', 'triangular2' or 'exp_range'.
  1695. If scale_fn is specified, this argument will be ignored. Default: 'triangular'
  1696. exp_gamma (float): Constant in 'exp_range' scaling function: exp_gamma**iterations. Used only when mode = 'exp_range'. Default: 1.0
  1697. scale_fn (function, optional): A custom scaling function, which is used to replace three build-in methods.
  1698. It should only have one argument. For all x >= 0, 0 <= scale_fn(x) <= 1.
  1699. If specified, then 'mode' will be ignored. Default: None
  1700. scale_mode (str, optional): One of 'cycle' or 'iterations'. Defines whether scale_fn is evaluated on cycle
  1701. number or cycle iterations (total iterations since start of training). Default: 'cycle'
  1702. last_epoch (int, optional): The index of last epoch. Can be set to restart training.Default: -1, means initial learning rate.
  1703. verbose: (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  1704. Returns:
  1705. ``CyclicLR`` instance to schedule learning rate.
  1706. Examples:
  1707. .. code-block:: python
  1708. :name: code-example1
  1709. >>> # Example1: train on default dynamic graph mode
  1710. >>> import paddle
  1711. >>> import numpy as np
  1712. >>> # train on default dynamic graph mode
  1713. >>> linear = paddle.nn.Linear(10, 10)
  1714. >>> scheduler = paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, max_learning_rate=1.0, step_size_up=15, step_size_down=5, verbose=True)
  1715. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  1716. >>> for epoch in range(5):
  1717. ... for batch_id in range(20):
  1718. ... x = paddle.uniform([10, 10])
  1719. ... out = linear(x)
  1720. ... loss = paddle.mean(out)
  1721. ... loss.backward()
  1722. ... sgd.step()
  1723. ... sgd.clear_gradients()
  1724. ... scheduler.step() # You should update learning rate each step
  1725. .. code-block:: python
  1726. :name: code-example2
  1727. >>> # Example2: train on static graph mode
  1728. >>> import paddle
  1729. >>> import numpy as np
  1730. >>> paddle.enable_static()
  1731. >>> main_prog = paddle.static.Program()
  1732. >>> start_prog = paddle.static.Program()
  1733. >>> with paddle.static.program_guard(main_prog, start_prog):
  1734. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  1735. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  1736. ... z = paddle.static.nn.fc(x, 100)
  1737. ... loss = paddle.mean(z)
  1738. ... scheduler = paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
  1739. ... max_learning_rate=1.0, step_size_up=15, step_size_down=5, verbose=True)
  1740. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  1741. ... sgd.minimize(loss)
  1742. ...
  1743. >>> exe = paddle.static.Executor()
  1744. >>> exe.run(start_prog)
  1745. >>> for epoch in range(5):
  1746. ... for batch_id in range(20):
  1747. ... out = exe.run(
  1748. ... main_prog,
  1749. ... feed={
  1750. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  1751. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  1752. ... },
  1753. ... fetch_list=loss.name)
  1754. ... scheduler.step() # You should update learning rate each step
  1755. """
  1756. def __init__(
  1757. self,
  1758. base_learning_rate,
  1759. max_learning_rate,
  1760. step_size_up,
  1761. step_size_down=None,
  1762. mode='triangular',
  1763. exp_gamma=1.0,
  1764. scale_fn=None,
  1765. scale_mode='cycle',
  1766. last_epoch=-1,
  1767. verbose=False,
  1768. ):
  1769. # check type and value of max_learning_rate
  1770. if not isinstance(max_learning_rate, (float, int)):
  1771. raise TypeError(
  1772. f"'max_learning_rate' must be 'float' or 'int', but received {type(max_learning_rate)}"
  1773. )
  1774. if max_learning_rate < 0:
  1775. raise ValueError(
  1776. f"'max_learning_rate' must be a positive integer, but received {max_learning_rate}"
  1777. )
  1778. # check type and value of step_size_up
  1779. if not isinstance(step_size_up, int):
  1780. raise TypeError(
  1781. f"The type of 'step_size_up' must be int, but received {type(step_size_up)}"
  1782. )
  1783. if step_size_up <= 0:
  1784. raise ValueError(
  1785. f"'step_size_up' must be a positive integer, but received {step_size_up}"
  1786. )
  1787. # check type and value of step_size_down
  1788. if step_size_down is not None:
  1789. if not isinstance(step_size_down, int):
  1790. raise TypeError(
  1791. f"The type of 'step_size_down' must be int, but received {type(step_size_down)}"
  1792. )
  1793. if step_size_down <= 0:
  1794. raise ValueError(
  1795. f"'step_size_down' must be a positive integer, but received {step_size_down}"
  1796. )
  1797. # check type of exp_gamma
  1798. if not isinstance(exp_gamma, float):
  1799. raise TypeError(
  1800. f"The type of 'exp_gamma' must be float, but received {type(exp_gamma)}"
  1801. )
  1802. step_size_up = float(step_size_up)
  1803. step_size_down = (
  1804. float(step_size_down)
  1805. if step_size_down is not None
  1806. else step_size_up
  1807. )
  1808. self.cycle_size = step_size_up + step_size_down
  1809. self.step_up_pct = step_size_up / self.cycle_size
  1810. self.max_lr = float(max_learning_rate)
  1811. self.amplitude = self.max_lr - base_learning_rate
  1812. if (
  1813. mode not in ['triangular', 'triangular2', 'exp_range']
  1814. and scale_fn is None
  1815. ):
  1816. raise ValueError(
  1817. "'mode' is invalid and 'scale_fn' is not specified, make sure one of 'mode' or 'scale_fn' is valid"
  1818. )
  1819. if scale_mode not in ['cycle', 'iterations']:
  1820. raise ValueError(
  1821. "'scale_mode' must be one of 'cycle' or 'iterations"
  1822. )
  1823. self.mode = mode
  1824. self.gamma = exp_gamma # only for exp_range mode
  1825. if scale_fn is None:
  1826. if self.mode == 'triangular':
  1827. self.scale_fn = self._triangular_scale_fn
  1828. self.scale_mode = 'cycle'
  1829. elif self.mode == 'triangular2':
  1830. self.scale_fn = self._triangular2_scale_fn
  1831. self.scale_mode = 'cycle'
  1832. elif self.mode == 'exp_range':
  1833. self.scale_fn = self._exp_range_scale_fn
  1834. self.scale_mode = 'iterations'
  1835. else:
  1836. self.scale_fn = scale_fn
  1837. self.scale_mode = scale_mode
  1838. super().__init__(base_learning_rate, last_epoch, verbose)
  1839. def _triangular_scale_fn(self, x):
  1840. return 1.0
  1841. def _triangular2_scale_fn(self, x):
  1842. return 1 / (2.0 ** (x - 1))
  1843. def _exp_range_scale_fn(self, x):
  1844. return self.gamma**x
  1845. def get_lr(self):
  1846. iterations = self.last_epoch
  1847. cycle = 1 + iterations // self.cycle_size
  1848. pct_per_cycle = 1.0 + iterations / self.cycle_size - cycle
  1849. if pct_per_cycle <= self.step_up_pct:
  1850. scale_factor = pct_per_cycle / self.step_up_pct
  1851. else:
  1852. scale_factor = (1 - pct_per_cycle) / (1 - self.step_up_pct)
  1853. base_height = self.amplitude * scale_factor
  1854. lr = self.base_lr + base_height * self.scale_fn(eval(self.scale_mode))
  1855. return lr
  1856. class LinearLR(LRScheduler):
  1857. r"""
  1858. Set the learning rate according to linear scheduler.
  1859. The learning rate will be firstly multiplied by start_factor and linearly increase to end learning rate.
  1860. Args:
  1861. learning_rate (float): The initial learning rate. It is a python float number.
  1862. total_steps (int): Number of iterations that the learning_rate reaches end learning_rate.
  1863. start_factor (float): Start learning rate is defined by `start_factor * learning_rate` . Default: 1./3.
  1864. end_factor (float) End learning rate is defined by `end_factor * learning_rate`. Default: 1.0.
  1865. last_epoch (int, optional): The index of last epoch. Can be set to restart training.Default: -1, means initial learning rate.
  1866. verbose: (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
  1867. Returns:
  1868. ``LinearLR`` instance to schedule learning rate.
  1869. Examples:
  1870. .. code-block:: python
  1871. :name: code-dynamic
  1872. >>> # Example1: train on default dynamic graph mode
  1873. >>> import paddle
  1874. >>> import numpy as np
  1875. >>> # train on default dynamic graph mode
  1876. >>> linear = paddle.nn.Linear(10, 10)
  1877. >>> scheduler = paddle.optimizer.lr.LinearLR(learning_rate=0.5, total_steps=5, verbose=True)
  1878. >>> sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
  1879. >>> for epoch in range(5):
  1880. ... for batch_id in range(20):
  1881. ... x = paddle.uniform([10, 10])
  1882. ... out = linear(x)
  1883. ... loss = paddle.mean(out)
  1884. ... loss.backward()
  1885. ... sgd.step()
  1886. ... sgd.clear_gradients()
  1887. ... scheduler.step()
  1888. .. code-block:: python
  1889. :name: code-static
  1890. >>> # Example2: train on static graph mode
  1891. >>> import paddle
  1892. >>> import numpy as np
  1893. >>> paddle.enable_static()
  1894. >>> main_prog = paddle.static.Program()
  1895. >>> start_prog = paddle.static.Program()
  1896. >>> with paddle.static.program_guard(main_prog, start_prog):
  1897. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  1898. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  1899. ... z = paddle.static.nn.fc(x, 100)
  1900. ... loss = paddle.mean(z)
  1901. ... scheduler = paddle.optimizer.lr.LinearLR(learning_rate=0.5,
  1902. ... total_steps=5, verbose=True)
  1903. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  1904. ... sgd.minimize(loss)
  1905. ...
  1906. >>> exe = paddle.static.Executor()
  1907. >>> exe.run(start_prog)
  1908. >>> for epoch in range(5):
  1909. ... for batch_id in range(20):
  1910. ... out = exe.run(
  1911. ... main_prog,
  1912. ... feed={
  1913. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  1914. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  1915. ... },
  1916. ... fetch_list=loss.name)
  1917. ... scheduler.step()
  1918. """
  1919. def __init__(
  1920. self,
  1921. learning_rate,
  1922. total_steps,
  1923. start_factor=1.0 / 3,
  1924. end_factor=1.0,
  1925. last_epoch=-1,
  1926. verbose=False,
  1927. ):
  1928. if start_factor > 1.0 or start_factor <= 0:
  1929. raise ValueError(
  1930. f"`start_factor` must be greater than 0 and less or equal to 1, but got {start_factor}"
  1931. )
  1932. if end_factor > 1.0 or end_factor < 0:
  1933. raise ValueError(
  1934. f"`end_factor` must be greater than 0 and less than 1, but got {end_factor}"
  1935. )
  1936. if total_steps <= 0:
  1937. raise ValueError(
  1938. f"`total_steps` must be greater than 0, but got {total_steps}"
  1939. )
  1940. self.start_factor = start_factor
  1941. self.end_factor = end_factor
  1942. self.total_steps = total_steps
  1943. super().__init__(learning_rate, last_epoch, verbose)
  1944. def get_lr(self):
  1945. if self.last_epoch == 0:
  1946. return self.base_lr * self.start_factor
  1947. elif self.last_epoch > self.total_steps:
  1948. return self.last_lr
  1949. else:
  1950. base_lr = self.total_steps * self.start_factor
  1951. cur_factor = self.end_factor - self.start_factor
  1952. factor = 1.0 + cur_factor / (
  1953. base_lr + (self.last_epoch - 1) * cur_factor
  1954. )
  1955. return self.last_lr * factor
  1956. class CosineAnnealingWarmRestarts(LRScheduler):
  1957. r"""
  1958. Set the learning rate of each parameter group using a cosine annealing
  1959. schedule, where :math:`\eta_{max}` is set to the initial lr, :math:`T_{cur}`
  1960. is the number of epochs since the last restart and :math:`T_{i}` is the number
  1961. of epochs between two warm restarts in SGDR:
  1962. .. math::
  1963. \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 +
  1964. \cos\left(\frac{T_{cur}}{T_{i}}\pi\right)\right)
  1965. When :math:`T_{cur}=T_{i}`, set :math:`\eta_t = \eta_{min}`.
  1966. When :math:`T_{cur}=0` after restart, set :math:`\eta_t=\eta_{max}`.
  1967. It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_.
  1968. Args:
  1969. learning_rate (float): Initial learning rate.
  1970. T_0 (int): Number of iterations for the first restart.
  1971. T_mult (int, optional): A factor increases :math:`T_{i}` after a restart. Default: 1.
  1972. eta_min (float, optional): Minimum learning rate. Default: 0.
  1973. last_epoch (int, optional): The index of last epoch. Default: -1, means initial learning rate.
  1974. verbose (bool, optional): If ``True``, prints a message to stdout for
  1975. each update. Default: ``False``.
  1976. Returns:
  1977. ``CosineAnnealingWarmRestarts`` instance to schedule learning rate.
  1978. Examples:
  1979. .. code-block:: python
  1980. :name: code-example1
  1981. >>> import paddle
  1982. >>> import numpy as np
  1983. >>> # train on default dynamic graph mode
  1984. >>> linear = paddle.nn.Linear(10, 10)
  1985. >>> scheduler = paddle.optimizer.lr.CosineAnnealingWarmRestarts(learning_rate=0.5, T_0=1, T_mult=2, verbose=True)
  1986. >>> adam = paddle.optimizer.Adam(learning_rate=scheduler, parameters=linear.parameters())
  1987. >>> for epoch in range(10):
  1988. ... for batch_id in range(10):
  1989. ... x = paddle.uniform([10, 10])
  1990. ... out = linear(x)
  1991. ... loss = paddle.mean(out)
  1992. ... loss.backward()
  1993. ... adam.step()
  1994. ... adam.clear_grad()
  1995. ... scheduler.step(epoch) # You should update learning rate each step
  1996. .. code-block:: python
  1997. :name: code-example2
  1998. >>> import paddle
  1999. >>> import numpy as np
  2000. >>> paddle.enable_static()
  2001. >>> main_prog = paddle.static.Program()
  2002. >>> start_prog = paddle.static.Program()
  2003. >>> with paddle.static.program_guard(main_prog, start_prog):
  2004. ... x = paddle.static.data(name='x', shape=[None, 4, 5])
  2005. ... y = paddle.static.data(name='y', shape=[None, 4, 5])
  2006. ... z = paddle.static.nn.fc(x, 100)
  2007. ... loss = paddle.mean(z)
  2008. ... scheduler = paddle.optimizer.lr.CosineAnnealingWarmRestarts(learning_rate=0.5, T_0=1, T_mult=2,verbose=True)
  2009. ... sgd = paddle.optimizer.SGD(learning_rate=scheduler)
  2010. ... sgd.minimize(loss)
  2011. >>> exe = paddle.static.Executor()
  2012. >>> exe.run(start_prog)
  2013. >>> for epoch in range(10):
  2014. ... for batch_id in range(10):
  2015. ... out = exe.run(
  2016. ... main_prog,
  2017. ... feed={
  2018. ... 'x': np.random.randn(3, 4, 5).astype('float32'),
  2019. ... 'y': np.random.randn(3, 4, 5).astype('float32')
  2020. ... },
  2021. ... fetch_list=loss.name)
  2022. ... scheduler.step(epoch) # You should update learning rate each step
  2023. """
  2024. def __init__(
  2025. self,
  2026. learning_rate,
  2027. T_0,
  2028. T_mult=1,
  2029. eta_min=0,
  2030. last_epoch=-1,
  2031. verbose=False,
  2032. ):
  2033. if T_0 <= 0 or not isinstance(T_0, int):
  2034. raise ValueError(f"Expected positive integer T_0, but got {T_0}")
  2035. if T_mult < 1 or not isinstance(T_mult, int):
  2036. raise ValueError(f"Expected integer T_mult >= 1, but got {T_mult}")
  2037. self.T_0 = T_0
  2038. self.T_i = T_0
  2039. self.T_mult = T_mult
  2040. self.eta_min = eta_min
  2041. self.T_cur = last_epoch
  2042. super().__init__(learning_rate, last_epoch, verbose)
  2043. def get_lr(self):
  2044. return (
  2045. self.eta_min
  2046. + (self.base_lr - self.eta_min)
  2047. * (1 + math.cos(math.pi * self.T_cur / self.T_i))
  2048. / 2
  2049. )
  2050. def step(self, epoch=None):
  2051. """
  2052. step should be called after `optimizer.step()` . It will update the learning rate in optimizer.
  2053. The new learning rate will take effect on next epoch.
  2054. Args:
  2055. epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
  2056. Returns:
  2057. None
  2058. Examples:
  2059. Please refer to the example of current LRScheduler.
  2060. """
  2061. if epoch is None and self.last_epoch < 0:
  2062. epoch = 0
  2063. if epoch is None:
  2064. epoch = self.last_epoch + 1
  2065. self.T_cur = self.T_cur + 1
  2066. if self.T_cur >= self.T_i:
  2067. self.T_cur = self.T_cur - self.T_i
  2068. self.T_i = self.T_i * self.T_mult
  2069. else:
  2070. if epoch < 0:
  2071. raise ValueError(
  2072. f"Expected non-negative epoch, but got {epoch}"
  2073. )
  2074. if epoch >= self.T_0:
  2075. if self.T_mult == 1:
  2076. self.T_cur = epoch % self.T_0
  2077. else:
  2078. n = int(
  2079. math.log(
  2080. (epoch / self.T_0 * (self.T_mult - 1) + 1),
  2081. self.T_mult,
  2082. )
  2083. )
  2084. self.T_cur = epoch - self.T_0 * (self.T_mult**n - 1) / (
  2085. self.T_mult - 1
  2086. )
  2087. self.T_i = self.T_0 * self.T_mult ** (n)
  2088. else:
  2089. self.T_i = self.T_0
  2090. self.T_cur = epoch
  2091. self.last_epoch = math.floor(epoch)
  2092. self.last_lr = self.get_lr()
  2093. if self.verbose:
  2094. print(
  2095. f'Epoch {self.last_epoch}: {self.__class__.__name__} set learning rate to {self.last_lr}.'
  2096. )
  2097. def autoincreased_step_counter(counter_name=None, begin=1, step=1):
  2098. """
  2099. :api_attr: Static Graph
  2100. Create an auto-increase variable. which will be automatically increased
  2101. by 1 in every iteration. By default, the first return of this counter is 1,
  2102. and the step size is 1.
  2103. Args:
  2104. counter_name(str, optional): The counter name. Default '@STEP_COUNTER@'.
  2105. begin(int, optional): The first return value of this counter. Default 1.
  2106. step(int, optional): The step size. Default 1.
  2107. Returns:
  2108. Variable: The auto-increased Variable with data type int64.
  2109. Examples:
  2110. .. code-block:: python
  2111. >>> import paddle
  2112. >>> paddle.enable_static()
  2113. >>> global_step = paddle.optimizer.lr.autoincreased_step_counter(
  2114. ... counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
  2115. """
  2116. helper = LayerHelper('global_step_counter')
  2117. if counter_name is None:
  2118. counter_name = '@STEP_COUNTER@'
  2119. counter, is_new_var = helper.create_or_get_global_variable(
  2120. name=counter_name,
  2121. dtype='int64',
  2122. shape=[1],
  2123. persistable=True,
  2124. belong_to_optimizer=True,
  2125. )
  2126. if is_new_var:
  2127. helper.set_variable_initializer(
  2128. counter,
  2129. initializer=paddle.nn.initializer.ConstantInitializer(
  2130. value=begin - 1, force_cpu=True
  2131. ),
  2132. )
  2133. helper.main_program.global_block()._prepend_op(
  2134. type='increment',
  2135. inputs={'X': [counter]},
  2136. outputs={'Out': [counter]},
  2137. attrs={'step': float(step)},
  2138. )
  2139. counter.stop_gradient = True
  2140. return counter
  2141. def _decay_step_counter(begin=0):
  2142. # the first global step is zero in learning rate decay
  2143. global_step = autoincreased_step_counter(
  2144. counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1
  2145. )
  2146. global_step = paddle.cast(global_step, 'float32')
  2147. return global_step
  2148. def noam_decay(d_model, warmup_steps, learning_rate=1.0):
  2149. """
  2150. Noam decay method. The numpy implementation of noam decay as follows.
  2151. .. code-block:: python
  2152. >>> import numpy as np
  2153. >>> # set hyper parameters
  2154. >>> base_lr = 0.01
  2155. >>> d_model = 2
  2156. >>> current_steps = 20
  2157. >>> warmup_steps = 200
  2158. >>> # compute
  2159. >>> lr_value = base_lr * np.power(d_model, -0.5) * np.min([
  2160. ... np.power(current_steps, -0.5),
  2161. ... np.power(warmup_steps, -1.5) * current_steps])
  2162. Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_.
  2163. Args:
  2164. d_model(Variable): The dimensionality of input and output of model.
  2165. warmup_steps(Variable): A super parameter.
  2166. learning_rate(Variable|float|int): The initial learning rate. If the type
  2167. is Variable, it's a 0-D Tensor with shape [], the data type can be
  2168. float32 or float64. It also can be set to python int number. Default 1.0
  2169. Returns:
  2170. The decayed learning rate.
  2171. Examples:
  2172. .. code-block:: python
  2173. >>> import paddle
  2174. >>> warmup_steps = 100
  2175. >>> learning_rate = 0.01
  2176. >>> lr = paddle.optimizer.lr.noam_decay(
  2177. ... 1/(warmup_steps *(learning_rate ** 2)),
  2178. ... warmup_steps,
  2179. ... learning_rate)
  2180. """
  2181. with default_main_program()._lr_schedule_guard():
  2182. if in_dygraph_mode():
  2183. decay = paddle.optimizer.lr.NoamDecay(
  2184. d_model, warmup_steps, learning_rate=learning_rate
  2185. )
  2186. return decay
  2187. else:
  2188. global_step = _decay_step_counter(1)
  2189. a = global_step**-0.5
  2190. b = (warmup_steps**-1.5) * global_step
  2191. lr_value = learning_rate * (d_model**-0.5) * paddle.minimum(a, b)
  2192. return lr_value
  2193. def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
  2194. """
  2195. Applies exponential decay to the learning rate.
  2196. When training a model, it is often recommended to lower the learning rate as the
  2197. training progresses. By using this function, the learning rate will be decayed by
  2198. 'decay_rate' every 'decay_steps' steps.
  2199. Decayed learning rate calculates as follows:
  2200. .. code-block:: text
  2201. >>> if staircase == True:
  2202. >>> decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
  2203. >>> else:
  2204. >>> decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
  2205. Args:
  2206. learning_rate(Variable|float): The initial learning rate. It should be a Variable
  2207. or a float
  2208. decay_steps(int): The learning rate decay steps. See the decay computation above.
  2209. decay_rate(float): The learning rate decay rate. See the decay computation above.
  2210. staircase(bool): If True, decay the learning rate at discrete intervals, which
  2211. means the learning rate will be decayed by `decay_rate` every
  2212. `decay_steps`. If False, learning rate will be decayed continuously
  2213. and following the formula above. Default: False
  2214. Returns:
  2215. Variable: The decayed learning rate. The data type is float32.
  2216. Examples:
  2217. .. code-block:: python
  2218. >>> import paddle
  2219. >>> paddle.enable_static()
  2220. >>> base_lr = 0.1
  2221. >>> lr = paddle.optimizer.lr.exponential_decay(
  2222. ... learning_rate=base_lr,
  2223. ... decay_steps=10000,
  2224. ... decay_rate=0.5,
  2225. ... staircase=True
  2226. ... )
  2227. """
  2228. with default_main_program()._lr_schedule_guard():
  2229. if in_dygraph_mode():
  2230. decay = ExponentialDecay(learning_rate, decay_rate)
  2231. return decay
  2232. else:
  2233. global_step = _decay_step_counter()
  2234. div_res = global_step / decay_steps
  2235. if staircase:
  2236. div_res = paddle.floor(div_res)
  2237. decayed_lr = learning_rate * (decay_rate**div_res)
  2238. return decayed_lr
  2239. def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
  2240. """
  2241. Applies natural exponential decay to the initial learning rate.
  2242. When training a model, it is often recommended to lower the learning rate as the
  2243. training progresses. By using this function, the learning rate will be decayed by
  2244. natural exponential power 'decay_rate' every 'decay_steps' steps.
  2245. Decayed learning rate calculates as follows:
  2246. .. code-block:: text
  2247. >>> if not staircase:
  2248. >>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
  2249. >>> else:
  2250. >>> decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))
  2251. Args:
  2252. learning_rate(Variable|float): The initial learning rate. It should be a Variable
  2253. or a float
  2254. decay_steps(int): The learning rate decay steps. See the decay computation above.
  2255. decay_rate(float): The learning rate decay rate. See the decay computation above.
  2256. staircase(bool): If True, decay the learning rate at discrete intervals, which
  2257. means the learning rate will be decayed by natural exponential power
  2258. `decay_rate` every `decay_steps`. If False, learning rate will be
  2259. decayed continuously and following the formula above. Default: False
  2260. Returns:
  2261. The decayed learning rate. The data type is float32.
  2262. Examples:
  2263. .. code-block:: python
  2264. >>> import paddle
  2265. >>> paddle.enable_static()
  2266. >>> base_lr = 0.1
  2267. >>> lr = paddle.optimizer.lr.natural_exp_decay(
  2268. ... learning_rate=base_lr,
  2269. ... decay_steps=10000,
  2270. ... decay_rate=0.5,
  2271. ... staircase=True
  2272. ... )
  2273. """
  2274. with default_main_program()._lr_schedule_guard():
  2275. if in_dygraph_mode():
  2276. decay = NaturalExpDecay(learning_rate, decay_rate)
  2277. return decay
  2278. else:
  2279. global_step = _decay_step_counter()
  2280. div_res = global_step / decay_steps
  2281. if staircase:
  2282. div_res = paddle.floor(div_res)
  2283. decayed_lr = learning_rate * paddle.exp(-1 * decay_rate * div_res)
  2284. return decayed_lr
  2285. def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
  2286. """
  2287. Applies inverse time decay to the initial learning rate.
  2288. When training a model, it is often recommended to lower the learning rate as the
  2289. training progresses. By using this function, an inverse decay function will be
  2290. applied to the initial learning rate.
  2291. Decayed learning rate calculates as follows:
  2292. .. code-block:: text
  2293. >>> if staircase == True:
  2294. >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
  2295. >>> else:
  2296. >>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
  2297. Args:
  2298. learning_rate(Variable|float): The initial learning rate. It should be a Variable
  2299. or a float
  2300. decay_steps(int): The learning rate decay steps. See the decay computation above.
  2301. decay_rate(float): The learning rate decay rate. See the decay computation above.
  2302. staircase(bool): If True, decay the learning rate at discrete intervals, which
  2303. means the learning rate will be decayed by `decay_rate` times
  2304. every `decay_steps`. If False, learning rate will be decayed
  2305. continuously and following the formula above. Default: False
  2306. Returns:
  2307. Variable: The decayed learning rate. The data type is float32.
  2308. Examples:
  2309. .. code-block:: python
  2310. >>> import paddle
  2311. >>> paddle.enable_static()
  2312. >>> base_lr = 0.1
  2313. >>> lr = paddle.optimizer.lr.inverse_time_decay(
  2314. ... learning_rate=base_lr,
  2315. ... decay_steps=10000,
  2316. ... decay_rate=0.5,
  2317. ... staircase=True
  2318. ... )
  2319. """
  2320. with default_main_program()._lr_schedule_guard():
  2321. if in_dygraph_mode():
  2322. decay = InverseTimeDecay(learning_rate, decay_rate)
  2323. return decay
  2324. else:
  2325. global_step = _decay_step_counter()
  2326. div_res = global_step / decay_steps
  2327. if staircase:
  2328. div_res = paddle.floor(div_res)
  2329. decayed_lr = learning_rate / (1 + decay_rate * div_res)
  2330. return decayed_lr
  2331. def polynomial_decay(
  2332. learning_rate, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False
  2333. ):
  2334. """
  2335. Applies polynomial decay to the initial learning rate.
  2336. .. code-block:: text
  2337. if cycle:
  2338. decay_steps = decay_steps * ceil(global_step / decay_steps)
  2339. else:
  2340. global_step = min(global_step, decay_steps)
  2341. decayed_learning_rate = (learning_rate - end_learning_rate) *
  2342. (1 - global_step / decay_steps) ^ power + end_learning_rate
  2343. Args:
  2344. learning_rate(Variable|float32): A scalar float32 value or a Variable. This
  2345. will be the initial learning rate during training.
  2346. decay_steps(int32): A Python `int32` number.
  2347. end_learning_rate(float): A Python `float` number.
  2348. power(float): A Python `float` number.
  2349. cycle(bool): If set true, decay the learning rate every decay_steps.
  2350. Returns:
  2351. Variable: The decayed learning rate
  2352. Examples:
  2353. .. code-block:: python
  2354. >>> import paddle
  2355. >>> start_lr = 0.01
  2356. >>> total_step = 5000
  2357. >>> end_lr = 0
  2358. >>> lr = paddle.optimizer.lr.polynomial_decay(
  2359. ... start_lr,
  2360. ... total_step,
  2361. ... end_lr,
  2362. ... power=1
  2363. ... )
  2364. """
  2365. with default_main_program()._lr_schedule_guard():
  2366. if in_dygraph_mode():
  2367. decay = PolynomialDecay(
  2368. learning_rate, decay_steps, end_learning_rate, power, cycle
  2369. )
  2370. return decay
  2371. else:
  2372. global_step = _decay_step_counter()
  2373. if cycle:
  2374. div_res = paddle.ceil(global_step / decay_steps)
  2375. zero_var = paddle.tensor.fill_constant(
  2376. shape=[1], dtype='float32', value=0.0
  2377. )
  2378. one_var = paddle.tensor.fill_constant(
  2379. shape=[1], dtype='float32', value=1.0
  2380. )
  2381. div_val = paddle.static.nn.cond(
  2382. global_step == zero_var, lambda: one_var, lambda: div_res
  2383. )
  2384. paddle.assign(div_val, output=div_res)
  2385. decay_steps = decay_steps * div_res
  2386. else:
  2387. decay_steps_var = paddle.tensor.fill_constant(
  2388. shape=[1], dtype='float32', value=float(decay_steps)
  2389. )
  2390. global_step = paddle.minimum(x=global_step, y=decay_steps_var)
  2391. decayed_lr = (learning_rate - end_learning_rate) * (
  2392. (1 - global_step / decay_steps) ** power
  2393. ) + end_learning_rate
  2394. return decayed_lr
  2395. def piecewise_decay(boundaries, values):
  2396. """
  2397. Applies piecewise decay to the initial learning rate.
  2398. The algorithm can be described as the code below.
  2399. .. code-block:: text
  2400. boundaries = [10000, 20000]
  2401. values = [1.0, 0.5, 0.1]
  2402. if step < 10000:
  2403. learning_rate = 1.0
  2404. elif 10000 <= step < 20000:
  2405. learning_rate = 0.5
  2406. else:
  2407. learning_rate = 0.1
  2408. Args:
  2409. boundaries: A list of steps numbers.
  2410. values: A list of learning rate values that will be picked during
  2411. different step boundaries.
  2412. Returns:
  2413. The decayed learning rate.
  2414. Examples:
  2415. .. code-block:: python
  2416. >>> import paddle
  2417. >>> paddle.enable_static()
  2418. >>> boundaries = [10000, 20000]
  2419. >>> values = [1.0, 0.5, 0.1]
  2420. >>> optimizer = paddle.optimizer.Momentum(
  2421. ... momentum=0.9,
  2422. ... learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries, values),
  2423. ... weight_decay=paddle.regularizer.L2Decay(1e-4)
  2424. ... )
  2425. """
  2426. with default_main_program()._lr_schedule_guard():
  2427. if len(values) - len(boundaries) != 1:
  2428. raise ValueError("len(values) - len(boundaries) should be 1")
  2429. if in_dygraph_mode():
  2430. decay = PiecewiseDecay(boundaries, values)
  2431. return decay
  2432. else:
  2433. global_step = _decay_step_counter()
  2434. lr = paddle.static.create_global_var(
  2435. shape=[1],
  2436. value=0.0,
  2437. dtype='float32',
  2438. persistable=True,
  2439. name="learning_rate",
  2440. )
  2441. with paddle.static.nn.control_flow.Switch() as switch:
  2442. for i in range(len(boundaries)):
  2443. boundary_val = paddle.tensor.fill_constant(
  2444. shape=[1],
  2445. dtype='float32',
  2446. value=float(boundaries[i]),
  2447. force_cpu=True,
  2448. )
  2449. with switch.case(global_step < boundary_val):
  2450. paddle.tensor.fill_constant(
  2451. shape=[1],
  2452. dtype="float32",
  2453. value=float(values[i]),
  2454. out=lr,
  2455. )
  2456. with switch.default():
  2457. paddle.tensor.fill_constant(
  2458. shape=[1],
  2459. dtype="float32",
  2460. value=float(values[len(values) - 1]),
  2461. out=lr,
  2462. )
  2463. return lr
  2464. def cosine_decay(learning_rate, step_each_epoch, epochs):
  2465. r"""
  2466. Applies cosine decay to the learning rate.
  2467. when training a model, it is often recommended to lower the learning rate as the
  2468. training progresses. By using this function, the learning rate will be decayed by
  2469. following cosine decay strategy.
  2470. .. math::
  2471. decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1)
  2472. Args:
  2473. learning_rate(Variable|float): The initial learning rate.
  2474. step_each_epoch(int): the number of steps in an epoch.
  2475. epochs(int): the number of epochs.
  2476. Returns:
  2477. Variable: The decayed learning rate.
  2478. Examples:
  2479. .. code-block:: python
  2480. >>> import paddle
  2481. >>> base_lr = 0.1
  2482. >>> lr = paddle.optimizer.lr.cosine_decay(
  2483. >>> learning_rate = base_lr, step_each_epoch=10000, epochs=120)
  2484. """
  2485. check_type(
  2486. learning_rate, 'learning_rate', (float, Variable), 'cosine_decay'
  2487. )
  2488. with default_main_program()._lr_schedule_guard():
  2489. if in_dygraph_mode():
  2490. decay = CosineAnnealingDecay(learning_rate, epochs)
  2491. return decay
  2492. else:
  2493. global_step = _decay_step_counter()
  2494. cur_epoch = paddle.floor(global_step / step_each_epoch)
  2495. decayed_lr = (
  2496. learning_rate
  2497. * 0.5
  2498. * (paddle.cos(cur_epoch * math.pi / epochs) + 1)
  2499. )
  2500. return decayed_lr
  2501. def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
  2502. """
  2503. This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling.
  2504. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_
  2505. When global_step < warmup_steps, learning rate is updated as:
  2506. .. code-block:: text
  2507. linear_step = end_lr - start_lr
  2508. lr = start_lr + linear_step * (global_step / warmup_steps)
  2509. where start_lr is the initial learning rate, and end_lr is the final learning rate;
  2510. When global_step >= warmup_steps, learning rate is updated as:
  2511. .. code-block:: text
  2512. lr = learning_rate
  2513. where lr is the learning_rate after warm-up.
  2514. Args:
  2515. learning_rate (Variable|float): Learning_rate after warm-up, it could be 1D-Tensor or single value with the data type of float32.
  2516. warmup_steps (int): Steps for warm up.
  2517. start_lr (float): Initial learning rate of warm up.
  2518. end_lr (float): Final learning rate of warm up.
  2519. Returns:
  2520. Variable: Warm-up learning rate with the same data type as learning_rate.
  2521. Examples:
  2522. .. code-block:: python
  2523. >>> import paddle
  2524. >>> paddle.enable_static()
  2525. >>> boundaries = [100, 200]
  2526. >>> lr_steps = [0.1, 0.01, 0.001]
  2527. >>> learning_rate = paddle.optimizer.lr.piecewise_decay(boundaries, lr_steps) # case1, 1D-Tensor
  2528. >>> # learning_rate = 0.1 # case2, single-value
  2529. >>> warmup_steps = 50
  2530. >>> start_lr = 0.1
  2531. >>> end_lr = 1. / 3.
  2532. >>> decayed_lr = paddle.optimizer.lr.linear_lr_warmup(
  2533. ... learning_rate,
  2534. ... warmup_steps,
  2535. ... start_lr,
  2536. ... end_lr
  2537. ... )
  2538. >>> place = paddle.CPUPlace()
  2539. >>> exe = paddle.static.Executor(place)
  2540. >>> exe.run(paddle.static.default_startup_program())
  2541. >>> out, = exe.run(fetch_list=[decayed_lr.name])
  2542. >>> print(out)
  2543. [0.1]
  2544. """
  2545. dtype = 'float32'
  2546. if isinstance(learning_rate, Variable):
  2547. dtype = learning_rate.dtype
  2548. linear_step = float(end_lr) - float(start_lr)
  2549. with default_main_program()._lr_schedule_guard():
  2550. if in_dygraph_mode():
  2551. lr = LinearWarmup(learning_rate, warmup_steps, start_lr, end_lr)
  2552. return lr
  2553. else:
  2554. lr = paddle.static.create_global_var(
  2555. shape=[1],
  2556. value=0.0,
  2557. dtype=dtype,
  2558. persistable=True,
  2559. name="learning_rate_warmup",
  2560. )
  2561. global_step = _decay_step_counter()
  2562. if not isinstance(learning_rate, Variable):
  2563. learning_rate = paddle.tensor.fill_constant(
  2564. shape=[1], dtype=dtype, value=float(learning_rate)
  2565. )
  2566. lr_val = paddle.static.nn.case(
  2567. pred_fn_pairs=[
  2568. (
  2569. global_step < warmup_steps,
  2570. lambda: start_lr
  2571. + linear_step * (global_step / float(warmup_steps)),
  2572. )
  2573. ],
  2574. default=lambda: learning_rate,
  2575. )
  2576. paddle.assign(lr_val, lr)
  2577. return lr