optimizer.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from __future__ import absolute_import
  15. from __future__ import division
  16. from __future__ import print_function
  17. from __future__ import unicode_literals
  18. from paddle import optimizer as optim
  19. class Momentum(object):
  20. """
  21. Simple Momentum optimizer with velocity state.
  22. Args:
  23. learning_rate (float|Variable) - The learning rate used to update parameters.
  24. Can be a float value or a Variable with one float value as data element.
  25. momentum (float) - Momentum factor.
  26. regularization (WeightDecayRegularizer, optional) - The strategy of regularization.
  27. """
  28. def __init__(
  29. self, learning_rate, momentum, weight_decay=None, grad_clip=None, **args
  30. ):
  31. super(Momentum, self).__init__()
  32. self.learning_rate = learning_rate
  33. self.momentum = momentum
  34. self.weight_decay = weight_decay
  35. self.grad_clip = grad_clip
  36. def __call__(self, model):
  37. train_params = [
  38. param for param in model.parameters() if param.trainable is True
  39. ]
  40. opt = optim.Momentum(
  41. learning_rate=self.learning_rate,
  42. momentum=self.momentum,
  43. weight_decay=self.weight_decay,
  44. grad_clip=self.grad_clip,
  45. parameters=train_params,
  46. )
  47. return opt
  48. class Adam(object):
  49. def __init__(
  50. self,
  51. learning_rate=0.001,
  52. beta1=0.9,
  53. beta2=0.999,
  54. epsilon=1e-08,
  55. parameter_list=None,
  56. weight_decay=None,
  57. grad_clip=None,
  58. name=None,
  59. lazy_mode=False,
  60. **kwargs,
  61. ):
  62. self.learning_rate = learning_rate
  63. self.beta1 = beta1
  64. self.beta2 = beta2
  65. self.epsilon = epsilon
  66. self.parameter_list = parameter_list
  67. self.learning_rate = learning_rate
  68. self.weight_decay = weight_decay
  69. self.grad_clip = grad_clip
  70. self.name = name
  71. self.lazy_mode = lazy_mode
  72. self.group_lr = kwargs.get("group_lr", False)
  73. self.training_step = kwargs.get("training_step", None)
  74. def __call__(self, model):
  75. if self.group_lr:
  76. if self.training_step == "LF_2":
  77. import paddle
  78. if isinstance(model, paddle.DataParallel): # multi gpu
  79. mlm = model._layers.head.MLM_VRM.MLM.parameters()
  80. pre_mlm_pp = (
  81. model._layers.head.MLM_VRM.Prediction.pp_share.parameters()
  82. )
  83. pre_mlm_w = (
  84. model._layers.head.MLM_VRM.Prediction.w_share.parameters()
  85. )
  86. else: # single gpu
  87. mlm = model.head.MLM_VRM.MLM.parameters()
  88. pre_mlm_pp = model.head.MLM_VRM.Prediction.pp_share.parameters()
  89. pre_mlm_w = model.head.MLM_VRM.Prediction.w_share.parameters()
  90. total = []
  91. for param in mlm:
  92. total.append(id(param))
  93. for param in pre_mlm_pp:
  94. total.append(id(param))
  95. for param in pre_mlm_w:
  96. total.append(id(param))
  97. group_base_params = [
  98. param for param in model.parameters() if id(param) in total
  99. ]
  100. group_small_params = [
  101. param for param in model.parameters() if id(param) not in total
  102. ]
  103. train_params = [
  104. {"params": group_base_params},
  105. {
  106. "params": group_small_params,
  107. "learning_rate": self.learning_rate.values[0] * 0.1,
  108. },
  109. ]
  110. else:
  111. print("group lr currently only support VisionLAN in LF_2 training step")
  112. train_params = [
  113. param for param in model.parameters() if param.trainable is True
  114. ]
  115. else:
  116. train_params = [
  117. param for param in model.parameters() if param.trainable is True
  118. ]
  119. opt = optim.Adam(
  120. learning_rate=self.learning_rate,
  121. beta1=self.beta1,
  122. beta2=self.beta2,
  123. epsilon=self.epsilon,
  124. weight_decay=self.weight_decay,
  125. grad_clip=self.grad_clip,
  126. name=self.name,
  127. lazy_mode=self.lazy_mode,
  128. parameters=train_params,
  129. )
  130. return opt
  131. class RMSProp(object):
  132. """
  133. Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method.
  134. Args:
  135. learning_rate (float|Variable) - The learning rate used to update parameters.
  136. Can be a float value or a Variable with one float value as data element.
  137. momentum (float) - Momentum factor.
  138. rho (float) - rho value in equation.
  139. epsilon (float) - avoid division by zero, default is 1e-6.
  140. regularization (WeightDecayRegularizer, optional) - The strategy of regularization.
  141. """
  142. def __init__(
  143. self,
  144. learning_rate,
  145. momentum=0.0,
  146. rho=0.95,
  147. epsilon=1e-6,
  148. weight_decay=None,
  149. grad_clip=None,
  150. **args,
  151. ):
  152. super(RMSProp, self).__init__()
  153. self.learning_rate = learning_rate
  154. self.momentum = momentum
  155. self.rho = rho
  156. self.epsilon = epsilon
  157. self.weight_decay = weight_decay
  158. self.grad_clip = grad_clip
  159. def __call__(self, model):
  160. train_params = [
  161. param for param in model.parameters() if param.trainable is True
  162. ]
  163. opt = optim.RMSProp(
  164. learning_rate=self.learning_rate,
  165. momentum=self.momentum,
  166. rho=self.rho,
  167. epsilon=self.epsilon,
  168. weight_decay=self.weight_decay,
  169. grad_clip=self.grad_clip,
  170. parameters=train_params,
  171. )
  172. return opt
  173. class Adadelta(object):
  174. def __init__(
  175. self,
  176. learning_rate=0.001,
  177. epsilon=1e-08,
  178. rho=0.95,
  179. parameter_list=None,
  180. weight_decay=None,
  181. grad_clip=None,
  182. name=None,
  183. **kwargs,
  184. ):
  185. self.learning_rate = learning_rate
  186. self.epsilon = epsilon
  187. self.rho = rho
  188. self.parameter_list = parameter_list
  189. self.learning_rate = learning_rate
  190. self.weight_decay = weight_decay
  191. self.grad_clip = grad_clip
  192. self.name = name
  193. def __call__(self, model):
  194. train_params = [
  195. param for param in model.parameters() if param.trainable is True
  196. ]
  197. opt = optim.Adadelta(
  198. learning_rate=self.learning_rate,
  199. epsilon=self.epsilon,
  200. rho=self.rho,
  201. weight_decay=self.weight_decay,
  202. grad_clip=self.grad_clip,
  203. name=self.name,
  204. parameters=train_params,
  205. )
  206. return opt
  207. class AdamW(object):
  208. def __init__(
  209. self,
  210. learning_rate=0.001,
  211. beta1=0.9,
  212. beta2=0.999,
  213. epsilon=1e-8,
  214. weight_decay=0.01,
  215. multi_precision=False,
  216. grad_clip=None,
  217. no_weight_decay_name=None,
  218. one_dim_param_no_weight_decay=False,
  219. name=None,
  220. lazy_mode=False,
  221. **args,
  222. ):
  223. super().__init__()
  224. self.learning_rate = learning_rate
  225. self.beta1 = beta1
  226. self.beta2 = beta2
  227. self.epsilon = epsilon
  228. self.grad_clip = grad_clip
  229. self.weight_decay = 0.01 if weight_decay is None else weight_decay
  230. self.grad_clip = grad_clip
  231. self.name = name
  232. self.lazy_mode = lazy_mode
  233. self.multi_precision = multi_precision
  234. self.no_weight_decay_name_list = (
  235. no_weight_decay_name.split() if no_weight_decay_name else []
  236. )
  237. self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay
  238. def __call__(self, model):
  239. parameters = [param for param in model.parameters() if param.trainable is True]
  240. self.no_weight_decay_param_name_list = [
  241. p.name
  242. for n, p in model.named_parameters()
  243. if any(nd in n for nd in self.no_weight_decay_name_list)
  244. ]
  245. if self.one_dim_param_no_weight_decay:
  246. self.no_weight_decay_param_name_list += [
  247. p.name for n, p in model.named_parameters() if len(p.shape) == 1
  248. ]
  249. opt = optim.AdamW(
  250. learning_rate=self.learning_rate,
  251. beta1=self.beta1,
  252. beta2=self.beta2,
  253. epsilon=self.epsilon,
  254. parameters=parameters,
  255. weight_decay=self.weight_decay,
  256. multi_precision=self.multi_precision,
  257. grad_clip=self.grad_clip,
  258. name=self.name,
  259. lazy_mode=self.lazy_mode,
  260. apply_decay_param_fun=self._apply_decay_param_fun,
  261. )
  262. return opt
  263. def _apply_decay_param_fun(self, name):
  264. return name not in self.no_weight_decay_param_name_list