modelaverage.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import paddle
  15. from paddle import _C_ops
  16. from paddle.base import framework
  17. from paddle.base.dygraph import base as imperative_base
  18. from paddle.base.layer_helper import LayerHelper
  19. from paddle.base.wrapped_decorator import signature_safe_contextmanager
  20. from paddle.framework import (
  21. in_dynamic_mode,
  22. in_dynamic_or_pir_mode,
  23. in_pir_mode,
  24. )
  25. from paddle.optimizer import Optimizer
  26. __all__ = []
  27. class ModelAverage(Optimizer):
  28. r"""
  29. The ModelAverage optimizer accumulates specific continuous historical
  30. parameters during training. The accumulated historical range can be controlled
  31. by the passed ``average_window_rate`` argument. The averaged ``Parameter`` are
  32. used in the prediction, which usually can improve the accuracy of the prediction.
  33. Accumulate the average of the ``Parameter`` in the sliding window, the result will be saved
  34. in a temporary variable, can be applied to the current model's ``Parameter`` by calling
  35. the ``apply()`` method, and the current model ``Parameter`` can be restored by calling
  36. the ``restore()`` method.
  37. The window size for calculating the average is determined by ``average_window_rate``,
  38. ``min_average_window``, ``max_average_window`` and the current ``Parameter`` update times (num_updates).
  39. When the cumulative times (num_accumulates) is greater than the specific window
  40. threshold (average_window), the accumulated ``Parameter`` temporary variable is set to 0.0.
  41. The following example will help to understand the role of these arguments:
  42. ::
  43. if num_accumulates >= min_average_window and num_accumulates >= min(max_average_window, num_updates * average_window_rate):
  44. num_accumulates = 0
  45. In the above conditional judgment statement, ``num_accumulates`` indicates the current
  46. accumulated number, which can be abstractly understood as the length of the cumulative window.
  47. The length of the window must be at least the length set by the ``min_average_window`` argument,
  48. and cannot exceed the length specified by the ``max_average_window`` argument or
  49. ``num_updates * average_window_rate``, where ``num_updates`` indicates the current ``Parameter``
  50. update times, ``average_window_rate`` is a coefficient that calculates the length of the window.
  51. Args:
  52. average_window_rate (float): The calculate ratio of the window length relative to ``Parameter`` update times.
  53. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
  54. This parameter is required in dygraph mode. \
  55. The default value is None in static graph mode, at this time all parameters will be updated.
  56. min_average_window (int, optional): the minimum size of average window length. The default value is 10000.
  57. max_average_window (int, optional): The maximum size of average window length. The default value is 10000.
  58. name (str, optional): Normally there is no need for user to set this property.
  59. For more information, please refer to :ref:`api_guide_Name`.
  60. The default value is None.
  61. Examples:
  62. .. code-block:: python
  63. >>> # doctest: +SKIP("Cannot get source code by to_static in REPL")
  64. >>> import numpy as np
  65. >>> import paddle
  66. >>> import paddle.nn as nn
  67. >>> import paddle.optimizer as opt
  68. >>> BATCH_SIZE = 16
  69. >>> BATCH_NUM = 4
  70. >>> EPOCH_NUM = 4
  71. >>> IMAGE_SIZE = 784
  72. >>> CLASS_NUM = 10
  73. >>> # define a random dataset
  74. >>> class RandomDataset(paddle.io.Dataset):
  75. ... def __init__(self, num_samples):
  76. ... self.num_samples = num_samples
  77. ... def __getitem__(self, idx):
  78. ... image = np.random.random([IMAGE_SIZE]).astype('float32')
  79. ... label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
  80. ... return image, label
  81. ... def __len__(self):
  82. ... return self.num_samples
  83. ...
  84. >>> class LinearNet(nn.Layer):
  85. ... def __init__(self):
  86. ... super().__init__()
  87. ... self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
  88. ... self.bias = self._linear.bias
  89. ...
  90. ... @paddle.jit.to_static
  91. ... def forward(self, x):
  92. ... return self._linear(x)
  93. ...
  94. >>> def train(layer, loader, loss_fn, opt, model_average):
  95. ... for epoch_id in range(EPOCH_NUM):
  96. ... for batch_id, (image, label) in enumerate(loader()):
  97. ... out = layer(image)
  98. ... loss = loss_fn(out, label)
  99. ... loss.backward()
  100. ... opt.step()
  101. ... model_average.step()
  102. ... opt.clear_grad()
  103. ... model_average.clear_grad()
  104. ... print("Train Epoch {} batch {}: loss = {}, bias = {}".format(
  105. ... epoch_id, batch_id, np.mean(loss.numpy()), layer.bias.numpy()))
  106. ...
  107. >>> def evaluate(layer, loader, loss_fn):
  108. ... for batch_id, (image, label) in enumerate(loader()):
  109. ... out = layer(image)
  110. ... loss = loss_fn(out, label)
  111. ... loss.backward()
  112. ... print("Evaluate batch {}: loss = {}, bias = {}".format(
  113. ... batch_id, np.mean(loss.numpy()), layer.bias.numpy()))
  114. ...
  115. >>> # create network
  116. >>> layer = LinearNet()
  117. >>> loss_fn = nn.CrossEntropyLoss()
  118. >>> optimizer = opt.Momentum(learning_rate=0.2, momentum=0.1, parameters=layer.parameters())
  119. >>> model_average = paddle.incubate.ModelAverage(
  120. ... 0.15,
  121. ... parameters=layer.parameters(),
  122. ... min_average_window=2,
  123. ... max_average_window=10
  124. ... )
  125. ...
  126. >>> # create data loader
  127. >>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
  128. >>> loader = paddle.io.DataLoader(dataset,
  129. ... batch_size=BATCH_SIZE,
  130. ... shuffle=True,
  131. ... drop_last=True,
  132. ... num_workers=2)
  133. ...
  134. >>> # create data loader
  135. >>> eval_loader = paddle.io.DataLoader(dataset,
  136. ... batch_size=BATCH_SIZE,
  137. ... shuffle=True,
  138. ... drop_last=True,
  139. ... num_workers=1
  140. ... )
  141. ...
  142. >>> # train
  143. >>> train(layer, loader, loss_fn, optimizer, model_average)
  144. >>> print("\nEvaluate With ModelAverage")
  145. >>> with model_average.apply(need_restore=False):
  146. ... evaluate(layer, eval_loader, loss_fn)
  147. >>> print("\nEvaluate With Restored Parameters")
  148. >>> model_average.restore()
  149. >>> evaluate(layer, eval_loader, loss_fn)
  150. """
  151. def __init__(
  152. self,
  153. average_window_rate,
  154. parameters=None,
  155. min_average_window=10000,
  156. max_average_window=10000,
  157. name=None,
  158. ):
  159. super().__init__(
  160. learning_rate=0.0,
  161. parameters=parameters,
  162. weight_decay=None,
  163. grad_clip=None,
  164. name=name,
  165. )
  166. self.helper = LayerHelper(self.__class__.__name__)
  167. self.average_window = average_window_rate
  168. self.min_average_window = min_average_window
  169. self.max_average_window = max_average_window
  170. self.type = "average_accumulates"
  171. if not in_dynamic_mode():
  172. global_block = paddle.static.default_main_program().global_block()
  173. all_parameters = (
  174. parameters if parameters else global_block.all_parameters()
  175. )
  176. self._create_accumulators(global_block, all_parameters)
  177. for param in all_parameters:
  178. self._append_optimize_op(global_block, [param, None])
  179. self.apply_program = paddle.static.Program()
  180. block = self.apply_program.global_block()
  181. with paddle.static.program_guard(main_program=self.apply_program):
  182. for param in all_parameters:
  183. self._add_average_apply_op(block, param)
  184. self.restore_program = paddle.static.Program()
  185. block = self.restore_program.global_block()
  186. with paddle.static.program_guard(main_program=self.restore_program):
  187. for param in all_parameters:
  188. self._add_average_restore_op(block, param)
  189. def _create_accumulators(self, block, parameters):
  190. assert isinstance(block, (framework.Block, paddle.pir.Block))
  191. for param in parameters:
  192. self._add_accumulator('sum_1', param)
  193. self._add_accumulator('sum_2', param)
  194. self._add_accumulator('sum_3', param)
  195. self._add_accumulator('restore', param)
  196. self._add_accumulator(
  197. 'num_accumulates', param, dtype='int64', shape=[1]
  198. )
  199. self._add_accumulator(
  200. 'old_num_accumulates', param, dtype='int64', shape=[1]
  201. )
  202. self._add_accumulator(
  203. 'num_updates', param, dtype='int64', shape=[1]
  204. )
  205. def _append_optimize_op(self, block, param_and_grad):
  206. assert isinstance(block, (framework.Block, paddle.pir.Block))
  207. sum_1 = self._get_accumulator('sum_1', param_and_grad[0])
  208. sum_2 = self._get_accumulator('sum_2', param_and_grad[0])
  209. sum_3 = self._get_accumulator('sum_3', param_and_grad[0])
  210. num_accumulates = self._get_accumulator(
  211. 'num_accumulates', param_and_grad[0]
  212. )
  213. old_num_accumulates = self._get_accumulator(
  214. 'old_num_accumulates', param_and_grad[0]
  215. )
  216. num_updates = self._get_accumulator('num_updates', param_and_grad[0])
  217. if in_dynamic_or_pir_mode():
  218. _, _, _, _, _, _ = _C_ops.average_accumulates_(
  219. param_and_grad[0],
  220. sum_1,
  221. sum_2,
  222. sum_3,
  223. num_accumulates,
  224. old_num_accumulates,
  225. num_updates,
  226. self.average_window,
  227. self.max_average_window,
  228. self.min_average_window,
  229. )
  230. return None
  231. block = framework.default_main_program().global_block()
  232. attrs = {
  233. "average_window": self.average_window,
  234. "min_average_window": self.min_average_window,
  235. "max_average_window": self.max_average_window,
  236. }
  237. inputs = {
  238. "param": param_and_grad[0],
  239. "in_sum_1": sum_1,
  240. "in_sum_2": sum_2,
  241. "in_sum_3": sum_3,
  242. "in_num_accumulates": num_accumulates,
  243. "in_old_num_accumulates": old_num_accumulates,
  244. "in_num_updates": num_updates,
  245. }
  246. outputs = {
  247. "out_sum_1": sum_1,
  248. "out_sum_2": sum_2,
  249. "out_sum_3": sum_3,
  250. "out_num_accumulates": num_accumulates,
  251. "out_old_num_accumulates": old_num_accumulates,
  252. "out_num_updates": num_updates,
  253. }
  254. average_accumulates_op = block.append_op(
  255. type=self.type,
  256. inputs=inputs,
  257. outputs=outputs,
  258. attrs=attrs,
  259. stop_gradient=True,
  260. )
  261. return average_accumulates_op
  262. @imperative_base.no_grad
  263. def minimize(
  264. self, loss, startup_program=None, parameters=None, no_grad_set=None
  265. ):
  266. """
  267. Add operations to minimize ``loss`` by updating ``parameters``.
  268. Args:
  269. loss (Tensor): A ``Tensor`` containing the value to minimize.
  270. startup_program (Program, optional): :ref:`api_paddle_static_Program` for
  271. initializing parameters in ``parameters``. The default value
  272. is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
  273. parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
  274. to minimize ``loss``. The default value is None, at this time all parameters
  275. will be updated.
  276. no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need
  277. to be updated. The default value is None.
  278. Returns:
  279. tuple: tuple (optimize_ops, params_grads), A list of operators appended
  280. by minimize and a list of (param, grad) tensor pairs, param is
  281. ``Parameter``, grad is the gradient value corresponding to the parameter.
  282. In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
  283. indicate program pruning. If so, the program will be pruned by ``feed`` and
  284. ``fetch_list`` before run, see details in ``Executor``.
  285. Examples:
  286. .. code-block:: python
  287. >>> import paddle
  288. >>> inp = paddle.rand([1, 10], dtype="float32")
  289. >>> linear = paddle.nn.Linear(10, 1)
  290. >>> out = linear(inp)
  291. >>> loss = paddle.mean(out)
  292. >>> loss.backward()
  293. >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters())
  294. >>> sgd.minimize(loss)
  295. >>> modelaverage = paddle.incubate.ModelAverage(
  296. ... 0.15,
  297. ... parameters=linear.parameters(),
  298. ... min_average_window=2,
  299. ... max_average_window=4
  300. ... )
  301. >>> modelaverage.minimize(loss)
  302. >>> sgd.clear_grad()
  303. >>> modelaverage.clear_grad()
  304. """
  305. if in_dynamic_mode():
  306. self.step()
  307. @framework.dygraph_only
  308. @imperative_base.no_grad
  309. def step(self):
  310. """
  311. Execute the optimizer and update parameters once.
  312. Returns:
  313. None
  314. Examples:
  315. .. code-block:: python
  316. >>> import paddle
  317. >>> inp = paddle.rand([1, 10], dtype="float32")
  318. >>> linear = paddle.nn.Linear(10, 1)
  319. >>> out = linear(inp)
  320. >>> loss = paddle.mean(out)
  321. >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters())
  322. >>> modelaverage = paddle.incubate.ModelAverage(
  323. ... 0.15,
  324. ... parameters=linear.parameters(),
  325. ... min_average_window=2,
  326. ... max_average_window=4
  327. ... )
  328. >>> loss.backward()
  329. >>> sgd.step()
  330. >>> modelaverage.step()
  331. >>> sgd.clear_grad()
  332. >>> modelaverage.clear_grad()
  333. """
  334. params_grads = []
  335. for param in self._parameter_list:
  336. if not param.trainable:
  337. continue
  338. if param._grad_ivar() is not None:
  339. grad_var = param._grad_ivar()
  340. params_grads.append((param, grad_var))
  341. block = framework.default_main_program().global_block()
  342. self._create_accumulators(block, self._parameter_list)
  343. for param_and_grad in params_grads:
  344. self._append_optimize_op(block, param_and_grad)
  345. @signature_safe_contextmanager
  346. @imperative_base.no_grad
  347. def apply(self, executor=None, need_restore=True):
  348. """
  349. Apply the average of the cumulative ``Parameter`` to the parameters of the current model.
  350. Args:
  351. executor(Executor): The network executor in static-graph mode. The default value is None in dygraph mode.
  352. need_restore(bool): Restore flag variable, if set to True, the network will restore
  353. the parameters of the network to the default value, if set to False,
  354. it will not be restored. The default value is True.
  355. Examples:
  356. .. code-block:: python
  357. >>> import paddle
  358. >>> inp = paddle.rand([1, 10], dtype="float32")
  359. >>> linear = paddle.nn.Linear(10, 1)
  360. >>> out = linear(inp)
  361. >>> loss = paddle.mean(out)
  362. >>> loss.backward()
  363. >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters())
  364. >>> modelaverage = paddle.incubate.ModelAverage(
  365. ... 0.15,
  366. ... parameters=linear.parameters(),
  367. ... min_average_window=2,
  368. ... max_average_window=4
  369. ... )
  370. >>> sgd.step()
  371. >>> modelaverage.step()
  372. >>> with modelaverage.apply():
  373. ... for param in linear.parameters():
  374. ... print(param)
  375. >>> for param in linear.parameters():
  376. ... print(param)
  377. """
  378. if in_dynamic_mode():
  379. for param in self._parameter_list:
  380. num_accumulates = self._get_accumulator(
  381. 'num_accumulates', param
  382. )
  383. old_num_accumulates = self._get_accumulator(
  384. 'old_num_accumulates', param
  385. )
  386. sum_1 = self._get_accumulator('sum_1', param)
  387. sum_2 = self._get_accumulator('sum_2', param)
  388. sum_3 = self._get_accumulator('sum_3', param)
  389. param_restore = self._get_accumulator('restore', param)
  390. paddle.assign(param, param_restore)
  391. total_param = sum_1 + sum_2 + sum_3
  392. total_accumulates = num_accumulates + old_num_accumulates
  393. total_param = paddle.cast(total_param, dtype='float32')
  394. total_accumulates = paddle.cast(
  395. total_accumulates, dtype='float32'
  396. )
  397. average_param = total_param / total_accumulates
  398. paddle.assign(average_param, param)
  399. try:
  400. yield
  401. finally:
  402. if need_restore:
  403. self.restore()
  404. return
  405. if executor is None:
  406. raise RuntimeError(
  407. "Executor should not be None in static graph mode."
  408. )
  409. executor.run(self.apply_program)
  410. try:
  411. yield
  412. finally:
  413. if need_restore:
  414. self.restore(executor)
  415. @imperative_base.no_grad
  416. def restore(self, executor=None):
  417. """
  418. Restore ``Parameter`` values of current model.
  419. Args:
  420. executor(Executor): The network executor in static-graph mode. The default value is None in dygraph mode
  421. Examples:
  422. .. code-block:: python
  423. >>> import paddle
  424. >>> inp = paddle.rand([1, 10], dtype="float32")
  425. >>> linear = paddle.nn.Linear(10, 1)
  426. >>> out = linear(inp)
  427. >>> loss = paddle.mean(out)
  428. >>> loss.backward()
  429. >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters())
  430. >>> modelaverage = paddle.incubate.ModelAverage(
  431. ... 0.15,
  432. ... parameters=linear.parameters(),
  433. ... min_average_window=2,
  434. ... max_average_window=4
  435. ... )
  436. >>> sgd.step()
  437. >>> modelaverage.step()
  438. >>> with modelaverage.apply(need_restore=False):
  439. ... for param in linear.parameters():
  440. ... print(param)
  441. >>> for param in linear.parameters():
  442. ... print(param)
  443. >>> modelaverage.restore()
  444. >>> for param in linear.parameters():
  445. ... print(param)
  446. """
  447. if in_dynamic_mode():
  448. for param in self._parameter_list:
  449. param_restore = self._get_accumulator('restore', param)
  450. paddle.assign(param_restore, param)
  451. return
  452. if executor is None:
  453. raise RuntimeError(
  454. "Executor should not be None in static graph mode."
  455. )
  456. executor.run(self.restore_program)
  457. def _add_average_apply_op(self, block, param):
  458. if in_pir_mode():
  459. target_program = paddle.static.default_main_program()
  460. param = paddle.pir.core._get_parameter(target_program, param)
  461. restore_value = self._get_accumulator('restore', param)
  462. grad = paddle.pir.core._get_persistable_value(
  463. target_program, restore_value
  464. )
  465. sum_1 = self._get_accumulator('sum_1', param)
  466. sum_1 = paddle.pir.core._get_persistable_value(
  467. target_program, sum_1
  468. )
  469. sum_2 = self._get_accumulator('sum_2', param)
  470. sum_2 = paddle.pir.core._get_persistable_value(
  471. target_program, sum_2
  472. )
  473. sum_3 = self._get_accumulator('sum_3', param)
  474. sum_3 = paddle.pir.core._get_persistable_value(
  475. target_program, sum_3
  476. )
  477. num_accumulates = self._get_accumulator('num_accumulates', param)
  478. num_accumulates = paddle.pir.core._get_persistable_value(
  479. target_program, num_accumulates
  480. )
  481. old_num_accumulates = self._get_accumulator(
  482. 'old_num_accumulates', param
  483. )
  484. old_num_accumulates = paddle.pir.core._get_persistable_value(
  485. target_program, old_num_accumulates
  486. )
  487. else:
  488. param = block._clone_variable(param)
  489. grad = block._clone_variable(
  490. self._get_accumulator('restore', param)
  491. )
  492. sum_1 = block._clone_variable(self._get_accumulator('sum_1', param))
  493. sum_2 = block._clone_variable(self._get_accumulator('sum_2', param))
  494. sum_3 = block._clone_variable(self._get_accumulator('sum_3', param))
  495. num_accumulates = block._clone_variable(
  496. self._get_accumulator('num_accumulates', param)
  497. )
  498. old_num_accumulates = block._clone_variable(
  499. self._get_accumulator('old_num_accumulates', param)
  500. )
  501. # backup param value to grad
  502. paddle.assign(param, output=grad)
  503. # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
  504. tmp = paddle.add_n([num_accumulates, old_num_accumulates])
  505. sum = paddle.add_n([sum_1, sum_2, sum_3])
  506. tmp = paddle.cast(
  507. x=tmp, dtype='float32' if self._dtype is None else self._dtype
  508. )
  509. sum = paddle.cast(
  510. x=sum, dtype='float32' if self._dtype is None else self._dtype
  511. )
  512. divide_out = paddle.divide(x=sum, y=tmp)
  513. paddle.assign(divide_out, output=param)
  514. def _add_average_restore_op(self, block, param):
  515. if in_pir_mode():
  516. target_program = paddle.static.default_main_program()
  517. param = paddle.pir.core._get_parameter(target_program, param)
  518. restore_value = self._get_accumulator('restore', param)
  519. grad = paddle.pir.core._get_persistable_value(
  520. target_program, restore_value
  521. )
  522. else:
  523. param = block._clone_variable(param)
  524. grad = block._clone_variable(
  525. self._get_accumulator('restore', param)
  526. )
  527. paddle.assign(grad, output=param)