rnn.py 85 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import math
  15. from collections.abc import Sequence
  16. from functools import partial, reduce
  17. import numpy as np
  18. import paddle
  19. from paddle import _C_ops, _legacy_C_ops, framework, in_dynamic_mode
  20. from paddle.base.data_feeder import check_type, check_variable_and_dtype
  21. from paddle.base.dygraph.base import NON_PERSISTABLE_VAR_NAME_SUFFIX
  22. from paddle.base.framework import (
  23. default_startup_program,
  24. in_dynamic_or_pir_mode,
  25. program_guard,
  26. )
  27. from paddle.common_ops_import import Variable
  28. from paddle.framework import core
  29. from paddle.nn import (
  30. functional as F,
  31. initializer as I,
  32. )
  33. from paddle.tensor.manipulation import tensor_array_to_tensor
  34. from .container import LayerList
  35. from .layers import Layer
  36. __all__ = []
  37. def rnn(
  38. cell,
  39. inputs,
  40. initial_states=None,
  41. sequence_length=None,
  42. time_major=False,
  43. is_reverse=False,
  44. **kwargs,
  45. ):
  46. r"""
  47. rnn creates a recurrent neural network specified by RNNCell `cell`,
  48. which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`)
  49. repeatedly until reaches to the maximum length of `inputs`.
  50. Parameters:
  51. cell(RNNCellBase): An instance of `RNNCellBase`.
  52. inputs(Tensor): the input sequences.
  53. If time_major is True, the shape is
  54. `[time_steps, batch_size, input_size]`
  55. else the shape is `[batch_size, time_steps, input_size]`.
  56. initial_states(Tensor|tuple|list, optional): the initial state of the
  57. rnn cell. Tensor or a possibly nested structure of tensors. If not
  58. provided, `cell.get_initial_states` would be called to produce
  59. the initial state. Defaults to None.
  60. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
  61. or int32. The valid lengths of input sequences. Defaults to None.
  62. If `sequence_length` is not None, the inputs are treated as
  63. padded sequences. In each input sequence, elements whose time step
  64. index are not less than the valid length are treated as paddings.
  65. time_major (bool, optional): Whether the first dimension of the input means the
  66. time steps. Defaults to False.
  67. is_reverse (bool, optional): Indicate whether to calculate in the reverse
  68. order of input sequences. Defaults to False.
  69. **kwargs: Additional keyword arguments to pass to `forward` of the cell.
  70. Returns:
  71. outputs (Tensor|list|tuple): the output sequence. Tensor or nested
  72. structure of Tensors.
  73. If `time_major` is True, the shape of each tensor in outputs is
  74. `[time_steps, batch_size, hidden_size]`, else
  75. `[batch_size, time_steps, hidden_size]`.
  76. final_states (Tensor|list|tuple): final states. A (possibly nested structure of)
  77. tensor[s], representing the final state for RNN. It has the same
  78. structure of initial state. Each tensor in final states has the same
  79. shape and dtype as the corresponding tensor in initial states.
  80. Examples:
  81. .. code-block:: python
  82. >>> import paddle
  83. >>> inputs = paddle.rand((4, 23, 16))
  84. >>> prev_h = paddle.randn((4, 32))
  85. >>> cell = paddle.nn.SimpleRNNCell(16, 32)
  86. >>> rnn = paddle.nn.RNN(cell)
  87. >>> outputs, final_states = rnn(inputs, prev_h)
  88. >>> print(outputs.shape)
  89. [4, 23, 32]
  90. >>> print(final_states.shape)
  91. [4, 32]
  92. """
  93. if in_dynamic_or_pir_mode():
  94. return _rnn_dynamic_graph(
  95. cell,
  96. inputs,
  97. initial_states,
  98. sequence_length,
  99. time_major,
  100. is_reverse,
  101. **kwargs,
  102. )
  103. else:
  104. return _rnn_static_graph(
  105. cell,
  106. inputs,
  107. initial_states,
  108. sequence_length,
  109. time_major,
  110. is_reverse,
  111. **kwargs,
  112. )
  113. class ArrayWrapper:
  114. def __init__(self, x):
  115. self.array = [x]
  116. def append(self, x):
  117. self.array.append(x)
  118. return self
  119. def __getitem__(self, item):
  120. return self.array.__getitem__(item)
  121. def _maybe_copy(state, new_state, step_mask):
  122. """update rnn state or just pass the old state through"""
  123. new_state = paddle.tensor.math._multiply_with_axis(
  124. new_state, step_mask, axis=0
  125. ) + paddle.tensor.math._multiply_with_axis(state, (1 - step_mask), axis=0)
  126. return new_state
  127. def _transpose_batch_time(x):
  128. perm = [1, 0] + list(range(2, len(x.shape)))
  129. return paddle.transpose(x, perm)
  130. def _rnn_dynamic_graph(
  131. cell,
  132. inputs,
  133. initial_states=None,
  134. sequence_length=None,
  135. time_major=False,
  136. is_reverse=False,
  137. **kwargs,
  138. ):
  139. time_step_index = 0 if time_major else 1
  140. flat_inputs = paddle.utils.flatten(inputs)
  141. time_steps = flat_inputs[0].shape[time_step_index]
  142. if initial_states is None:
  143. initial_states = cell.get_initial_states(
  144. batch_ref=inputs, batch_dim_idx=1 if time_major else 0
  145. )
  146. if not time_major:
  147. inputs = paddle.utils.map_structure(_transpose_batch_time, inputs)
  148. if sequence_length is not None:
  149. mask = paddle.static.nn.sequence_lod.sequence_mask(
  150. sequence_length, maxlen=time_steps, dtype=inputs.dtype
  151. )
  152. mask = paddle.transpose(mask, [1, 0])
  153. if is_reverse:
  154. inputs = paddle.utils.map_structure(
  155. lambda x: paddle.reverse(x, axis=[0]), inputs
  156. )
  157. mask = (
  158. paddle.reverse(mask, axis=[0])
  159. if sequence_length is not None
  160. else None
  161. )
  162. states = initial_states
  163. outputs = []
  164. for i in range(time_steps):
  165. step_inputs = paddle.utils.map_structure(lambda x: x[i], inputs)
  166. step_outputs, new_states = cell(step_inputs, states, **kwargs)
  167. if sequence_length is not None:
  168. new_states = paddle.utils.map_structure(
  169. partial(_maybe_copy, step_mask=mask[i]), states, new_states
  170. )
  171. states = new_states
  172. outputs = (
  173. paddle.utils.map_structure(lambda x: ArrayWrapper(x), step_outputs)
  174. if i == 0
  175. else paddle.utils.map_structure(
  176. lambda x, x_array: x_array.append(x), step_outputs, outputs
  177. )
  178. )
  179. final_outputs = paddle.utils.map_structure(
  180. lambda x: paddle.stack(x.array, axis=time_step_index), outputs
  181. )
  182. if is_reverse:
  183. final_outputs = paddle.utils.map_structure(
  184. lambda x: paddle.reverse(x, axis=time_step_index), final_outputs
  185. )
  186. final_states = new_states
  187. return final_outputs, final_states
  188. def _rnn_static_graph(
  189. cell,
  190. inputs,
  191. initial_states=None,
  192. sequence_length=None,
  193. time_major=False,
  194. is_reverse=False,
  195. **kwargs,
  196. ):
  197. check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn')
  198. if isinstance(inputs, (list, tuple)):
  199. for i, input_x in enumerate(inputs):
  200. check_variable_and_dtype(
  201. input_x, 'inputs[' + str(i) + ']', ['float32', 'float64'], 'rnn'
  202. )
  203. check_type(
  204. initial_states,
  205. 'initial_states',
  206. (Variable, list, tuple, type(None)),
  207. 'rnn',
  208. )
  209. check_type(
  210. sequence_length, 'sequence_length', (Variable, type(None)), 'rnn'
  211. )
  212. def _switch_grad(x, stop=False):
  213. x.stop_gradient = stop
  214. return x
  215. if initial_states is None:
  216. initial_states = cell.get_initial_states(
  217. batch_ref=inputs, batch_dim_idx=1 if time_major else 0
  218. )
  219. initial_states = paddle.utils.map_structure(_switch_grad, initial_states)
  220. if not time_major:
  221. inputs = paddle.utils.map_structure(_transpose_batch_time, inputs)
  222. max_seq_len = paddle.shape(paddle.utils.flatten(inputs)[0])[0]
  223. if sequence_length:
  224. mask = paddle.static.nn.sequence_lod.sequence_mask(
  225. sequence_length,
  226. maxlen=max_seq_len,
  227. dtype=paddle.utils.flatten(initial_states)[0].dtype,
  228. )
  229. mask = paddle.transpose(mask, [1, 0])
  230. if is_reverse:
  231. inputs = paddle.utils.map_structure(
  232. lambda x: paddle.reverse(x, axis=[0]), inputs
  233. )
  234. mask = paddle.reverse(mask, axis=[0]) if sequence_length else None
  235. with paddle.base.framework.device_guard("cpu"):
  236. start_i = paddle.zeros([], dtype="int64")
  237. end = max_seq_len
  238. end = paddle.cast(end, "int64")
  239. cond = start_i < end
  240. while_op = paddle.static.nn.control_flow.While(cond)
  241. out_array = paddle.tensor.create_array(
  242. dtype=paddle.utils.flatten(inputs)[0].dtype
  243. )
  244. init_array = paddle.utils.map_structure(
  245. lambda x: paddle.tensor.create_array(dtype=x.dtype), initial_states
  246. )
  247. paddle.utils.map_structure(
  248. lambda x, y: paddle.tensor.array_write(x, start_i, y),
  249. initial_states,
  250. init_array,
  251. )
  252. with while_op.block():
  253. step_in = inputs[start_i]
  254. # step_in = paddle.base.layers.Print( step_in, message="step in")
  255. pre_state = paddle.utils.map_structure(
  256. lambda x: paddle.tensor.array_read(x, start_i), init_array
  257. )
  258. outputs, new_states = cell(step_in, pre_state, **kwargs)
  259. assert isinstance(outputs, paddle.base.framework.Variable)
  260. paddle.utils.assert_same_structure(new_states, pre_state)
  261. if sequence_length:
  262. step_mask = paddle.unsqueeze(mask[start_i], 1)
  263. # new_states = map_structure(
  264. # partial(_maybe_copy, step_mask=step_mask),
  265. # pre_state, new_states
  266. # )
  267. new_states = paddle.utils.map_structure(
  268. lambda x, y: (x * step_mask + y * (1.0 - step_mask)),
  269. new_states,
  270. pre_state,
  271. )
  272. paddle.tensor.array_write(outputs, start_i, out_array)
  273. with paddle.base.framework.device_guard("cpu"):
  274. start_i = paddle.tensor.increment(x=start_i, value=1)
  275. paddle.utils.map_structure(
  276. lambda x, y: paddle.tensor.array_write(x, start_i, y),
  277. new_states,
  278. init_array,
  279. )
  280. with paddle.base.framework.device_guard("cpu"):
  281. new_cond = paddle.tensor.less_than(start_i, end)
  282. paddle.assign(new_cond, cond)
  283. out, _ = tensor_array_to_tensor(out_array, axis=0, use_stack=True)
  284. all_state = paddle.utils.map_structure(
  285. lambda x: tensor_array_to_tensor(x, axis=0, use_stack=True)[0],
  286. init_array,
  287. )
  288. final_outputs = out
  289. final_states = paddle.utils.map_structure(lambda x: x[-1], all_state)
  290. if is_reverse:
  291. final_outputs = paddle.utils.map_structure(
  292. lambda x: paddle.reverse(x, axis=[0]), final_outputs
  293. )
  294. if not time_major:
  295. final_outputs = paddle.utils.map_structure(
  296. _transpose_batch_time, final_outputs
  297. )
  298. return (final_outputs, final_states)
  299. def birnn(
  300. cell_fw,
  301. cell_bw,
  302. inputs,
  303. initial_states=None,
  304. sequence_length=None,
  305. time_major=False,
  306. **kwargs,
  307. ):
  308. r"""
  309. birnn creates a bidirectional recurrent neural network specified by
  310. RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()`
  311. (for dygraph mode :code:`cell.forward`) repeatedly until reaches to
  312. the maximum length of `inputs` and then concat the outputs for both RNNs
  313. along the last axis.
  314. Parameters:
  315. cell_fw(RNNCellBase): An instance of `RNNCellBase`.
  316. cell_bw(RNNCellBase): An instance of `RNNCellBase`.
  317. inputs(Tensor): the input sequences.
  318. If time_major is True, the shape is
  319. `[time_steps, batch_size, input_size]`
  320. else the shape is `[batch_size, time_steps, input_size]`.
  321. initial_states(tuple, optional): A tuple of initial states of
  322. `cell_fw` and `cell_bw`.
  323. If not provided, `cell.get_initial_states` would be called to
  324. produce initial state for each cell. Defaults to None.
  325. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
  326. or int32. The valid lengths of input sequences. Defaults to None.
  327. If `sequence_length` is not None, the inputs are treated as
  328. padded sequences. In each input sequence, elements whose time step
  329. index are not less than the valid length are treated as paddings.
  330. time_major (bool): Whether the first dimension of the input means the
  331. time steps. Defaults to False.
  332. **kwargs: Additional keyword arguments to pass to `forward` of each cell.
  333. Returns:
  334. outputs (Tensor): the outputs of the bidirectional RNN. It is the
  335. concatenation of the outputs from the forward RNN and backward
  336. RNN along the last axis.
  337. If time_major is True, the shape is `[time_steps, batch_size, size]`,
  338. else the shape is `[batch_size, time_steps, size]`, where size is
  339. `cell_fw.hidden_size + cell_bw.hidden_size`.
  340. final_states (tuple): A tuple of the final states of the forward
  341. cell and backward cell.
  342. Examples:
  343. .. code-block:: python
  344. >>> import paddle
  345. >>> cell_fw = paddle.nn.LSTMCell(16, 32)
  346. >>> cell_bw = paddle.nn.LSTMCell(16, 32)
  347. >>> rnn = paddle.nn.BiRNN(cell_fw, cell_bw)
  348. >>> inputs = paddle.rand((2, 23, 16))
  349. >>> outputs, final_states = rnn(inputs)
  350. >>> print(outputs.shape)
  351. [2, 23, 64]
  352. >>> print(final_states[0][0].shape)
  353. [2, 32]
  354. """
  355. if initial_states is None:
  356. states_fw = cell_fw.get_initial_states(
  357. batch_ref=inputs, batch_dim_idx=1 if time_major else 0
  358. )
  359. states_bw = cell_fw.get_initial_states(
  360. batch_ref=inputs, batch_dim_idx=1 if time_major else 0
  361. )
  362. else:
  363. states_fw, states_bw = initial_states
  364. outputs_fw, states_fw = rnn(
  365. cell_fw,
  366. inputs,
  367. states_fw,
  368. sequence_length,
  369. time_major=time_major,
  370. **kwargs,
  371. )
  372. outputs_bw, states_bw = rnn(
  373. cell_bw,
  374. inputs,
  375. states_bw,
  376. sequence_length,
  377. time_major=time_major,
  378. is_reverse=True,
  379. **kwargs,
  380. )
  381. outputs = paddle.utils.map_structure(
  382. lambda x, y: paddle.concat([x, y], -1), outputs_fw, outputs_bw
  383. )
  384. final_states = (states_fw, states_bw)
  385. return outputs, final_states
  386. def split_states(states, bidirectional=False, state_components=1):
  387. r"""
  388. Split states of RNN network into possibly nested list or tuple of
  389. states of each RNN cells of the RNN network.
  390. Parameters:
  391. states (Tensor|tuple|list): the concatenated states for RNN network.
  392. When `state_components` is 1, states in a Tensor with shape
  393. `(L*D, N, C)` where `L` is the number of layers of the RNN
  394. network, `D` is the number of directions of the RNN network(1
  395. for unidirectional RNNs and 2 for bidirectional RNNs), `N` is
  396. the batch size of the input to the RNN network, `C` is the
  397. hidden size of the RNN network.
  398. When `state_components` is larger than 1, `states` is a tuple of
  399. `state_components` Tensors that meet the requirements described
  400. above.
  401. For SimpleRNNs and GRUs, `state_components` is 1, and for LSTMs,
  402. `state_components` is 2.
  403. bidirectional (bool): whether the state is of a bidirectional RNN
  404. network. Defaults to False.
  405. state_components (int): the number of the components of the states. see
  406. `states` above. Defaults to 1.
  407. Returns:
  408. A nested list or tuple of RNN cell states.
  409. If `bidirectional` is True, it can be indexed twice to get an RNN
  410. cell state. The first index indicates the layer, the second index
  411. indicates the direction.
  412. If `bidirectional` is False, it can be indexed once to get an RNN
  413. cell state. The index indicates the layer.
  414. Note that if `state_components` is larger than 1, an RNN cell state
  415. can be indexed one more time to get a tensor of shape(N, C), where
  416. `N` is the batch size of the input to the RNN cell, and `C` is the
  417. hidden size of the RNN cell.
  418. """
  419. if state_components == 1:
  420. states = paddle.unstack(states)
  421. if not bidirectional:
  422. return states
  423. else:
  424. return list(zip(states[::2], states[1::2]))
  425. else:
  426. assert len(states) == state_components
  427. states = tuple([paddle.unstack(item) for item in states])
  428. if not bidirectional:
  429. return list(zip(*states))
  430. else:
  431. states = list(zip(*states))
  432. return list(zip(states[::2], states[1::2]))
  433. def concat_states(states, bidirectional=False, state_components=1):
  434. r"""
  435. Concatenate a possibly nested list or tuple of RNN cell states into a
  436. compact form.
  437. Parameters:
  438. states (list|tuple): a possibly nested list or tuple of RNN cell
  439. states.
  440. If `bidirectional` is True, it can be indexed twice to get an
  441. RNN cell state. The first index indicates the layer, the second
  442. index indicates the direction.
  443. If `bidirectional` is False, it can be indexed once to get an RNN
  444. cell state. The index indicates the layer.
  445. Note that if `state_components` is larger than 1, an RNN cell
  446. state can be indexed one more time to get a tensor of shape(N, C),
  447. where `N` is the batch size of the input to the RNN cell, and
  448. `C` is the hidden size of the RNN cell.
  449. bidirectional (bool): whether the state is of a bidirectional RNN
  450. network. Defaults to False.
  451. state_components (int): the number of the components of the states. see
  452. `states` above. Defaults to 1.
  453. Returns:
  454. Concatenated states for RNN network.
  455. When `state_components` is 1, states in a Tensor with shape
  456. `(L\*D, N, C)` where `L` is the number of layers of the RNN
  457. network, `D` is the number of directions of the RNN network(1 for
  458. unidirectional RNNs and 2 for bidirectional RNNs), `N` is the batch
  459. size of the input to the RNN network, `C` is the hidden size of the
  460. RNN network.
  461. """
  462. if state_components == 1:
  463. return paddle.stack(paddle.utils.flatten(states))
  464. else:
  465. states = paddle.utils.flatten(states)
  466. components = []
  467. for i in range(state_components):
  468. components.append(states[i::state_components])
  469. return tuple([paddle.stack(item) for item in components])
  470. class RNNCellBase(Layer):
  471. r"""
  472. RNNCellBase is the base class for abstraction representing the calculations
  473. mapping the input and state to the output and new state. It is suitable to
  474. and mostly used in RNN.
  475. """
  476. def get_initial_states(
  477. self, batch_ref, shape=None, dtype=None, init_value=0.0, batch_dim_idx=0
  478. ):
  479. r"""
  480. Generate initialized states according to provided shape, data type and
  481. value.
  482. Parameters:
  483. batch_ref (Tensor): A tensor, which shape would be used to
  484. determine the batch size, which is used to generate initial
  485. states. For `batch_ref`'s shape d, `d[batch_dim_idx]` is
  486. treated as batch size.
  487. shape (list|tuple, optional): A (possibly nested structure of) shape[s],
  488. where a shape is a list/tuple of integer. `-1` (for batch size)
  489. will be automatically prepended if a shape does not starts with
  490. it. If None, property `state_shape` will be used. Defaults to
  491. None.
  492. dtype (str|list|tuple, optional): A (possibly nested structure of)
  493. data type[s]. The structure must be same as that of `shape`,
  494. except when all tensors' in states has the same data type, a
  495. single data type can be used. If None and property `cell.state_shape`
  496. is not available, current default floating type of paddle is
  497. used. Defaults to None.
  498. init_value (float, optional): A float value used to initialize states.
  499. Defaults to 0.
  500. batch_dim_idx (int, optional): An integer indicating which
  501. dimension of the of `batch_ref` represents batch. Defaults to 0.
  502. Returns:
  503. init_states (Tensor|tuple|list): tensor of the provided shape and
  504. dtype, or list of tensors that each satisfies the requirements,
  505. packed in the same structure as `shape` and `type` does.
  506. """
  507. # TODO: use inputs and batch_size
  508. batch_ref = paddle.utils.flatten(batch_ref)[0]
  509. def _is_shape_sequence(seq):
  510. """For shape, list/tuple of integer is the finest-grained objection"""
  511. if isinstance(seq, (list, tuple)):
  512. if reduce(
  513. lambda flag, x: isinstance(x, int) and flag, seq, True
  514. ):
  515. return False
  516. # TODO: Add check for the illegal
  517. if isinstance(seq, dict):
  518. return True
  519. return isinstance(seq, Sequence) and not isinstance(seq, str)
  520. class Shape:
  521. def __init__(self, shape):
  522. self.shape = (
  523. list(shape) if shape[0] == -1 else ([-1] + list(shape))
  524. )
  525. # nested structure of shapes
  526. states_shapes = self.state_shape if shape is None else shape
  527. is_sequence_ori = paddle.utils.layers_utils.is_sequence
  528. paddle.utils.layers_utils.is_sequence = _is_shape_sequence
  529. states_shapes = paddle.utils.map_structure(
  530. lambda shape: Shape(shape), states_shapes
  531. )
  532. paddle.utils.layers_utils.is_sequence = is_sequence_ori
  533. # nested structure of dtypes
  534. try:
  535. states_dtypes = self.state_dtype if dtype is None else dtype
  536. except NotImplementedError:
  537. states_dtypes = framework.get_default_dtype()
  538. if len(paddle.utils.flatten(states_dtypes)) == 1:
  539. dtype = paddle.utils.flatten(states_dtypes)[0]
  540. states_dtypes = paddle.utils.map_structure(
  541. lambda shape: dtype, states_shapes
  542. )
  543. fill_shapes = states_shapes
  544. if batch_ref.shape[batch_dim_idx] > 0:
  545. if isinstance(fill_shapes, list):
  546. for s in fill_shapes[0]:
  547. s.shape[0] = batch_ref.shape[batch_dim_idx]
  548. elif isinstance(fill_shapes, tuple):
  549. for s in fill_shapes:
  550. s.shape[0] = batch_ref.shape[batch_dim_idx]
  551. else:
  552. fill_shapes.shape[0] = batch_ref.shape[batch_dim_idx]
  553. else:
  554. if isinstance(fill_shapes, list):
  555. for s in fill_shapes[0]:
  556. s.shape[0] = paddle.shape(batch_ref)[batch_dim_idx].item()
  557. elif isinstance(fill_shapes, tuple):
  558. for s in fill_shapes:
  559. s.shape[0] = paddle.shape(batch_ref)[batch_dim_idx].item()
  560. else:
  561. fill_shapes.shape[0] = paddle.shape(batch_ref)[
  562. batch_dim_idx
  563. ].item()
  564. init_states = paddle.utils.map_structure(
  565. lambda shape, dtype: paddle.full(
  566. shape=shape.shape,
  567. fill_value=init_value,
  568. dtype=dtype,
  569. ),
  570. fill_shapes,
  571. states_dtypes,
  572. )
  573. return init_states
  574. @property
  575. def state_shape(self):
  576. r"""
  577. Abstract method (property).
  578. Used to initialize states.
  579. A (possibly nested structure of) shape[s], where a shape is a
  580. list/tuple of integers (-1 for batch size would be automatically
  581. inserted into a shape if shape is not started with it).
  582. Not necessary to be implemented if states are not initialized by
  583. `get_initial_states` or the `shape` argument is provided when using
  584. `get_initial_states`.
  585. """
  586. raise NotImplementedError(
  587. "Please add implementation for `state_shape` in the used cell."
  588. )
  589. @property
  590. def state_dtype(self):
  591. r"""
  592. Abstract method (property).
  593. Used to initialize states.
  594. A (possibly nested structure of) data types[s]. The structure must be
  595. same as that of `shape`, except when all tensors' in states has the same
  596. data type, a single data type can be used.
  597. Not necessary to be implemented if states are not initialized
  598. by `get_initial_states` or the `dtype` argument is provided when using
  599. `get_initial_states`.
  600. """
  601. raise NotImplementedError(
  602. "Please add implementation for `state_dtype` in the used cell."
  603. )
  604. class SimpleRNNCell(RNNCellBase):
  605. r"""
  606. Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
  607. computes the outputs and updates states.
  608. The formula used is as follows:
  609. .. math::
  610. h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
  611. y_{t} & = h_{t}
  612. where :math:`act` is for :attr:`activation`.
  613. Please refer to `Finding Structure in Time
  614. <https://crl.ucsd.edu/~elman/Papers/fsit.pdf>`_ for more details.
  615. Parameters:
  616. input_size (int): The input size.
  617. hidden_size (int): The hidden size.
  618. activation (str, optional): The activation in the SimpleRNN cell.
  619. It can be `tanh` or `relu`. Defaults to `tanh`.
  620. weight_ih_attr (ParamAttr, optional): The parameter attribute for
  621. :math:`weight_ih`. Default: None.
  622. weight_hh_attr(ParamAttr, optional): The parameter attribute for
  623. :math:`weight_hh`. Default: None.
  624. bias_ih_attr (ParamAttr, optional): The parameter attribute for the
  625. :math:`bias_ih`. Default: None.
  626. bias_hh_attr (ParamAttr, optional): The parameter attribute for the
  627. :math:`bias_hh`. Default: None.
  628. name (str, optional): Name for the operation (optional, default is
  629. None). For more information, please refer to :ref:`api_guide_Name`.
  630. Variables:
  631. - **weight_ih** (Parameter): shape (hidden_size, input_size), input to hidden weight, corresponding to :math:`W_{ih}` in the formula.
  632. - **weight_hh** (Parameter): shape (hidden_size, hidden_size), hidden to hidden weight, corresponding to :math:`W_{hh}` in the formula.
  633. - **bias_ih** (Parameter): shape (hidden_size, ), input to hidden bias, corresponding to :math:`b_{ih}` in the formula.
  634. - **bias_hh** (Parameter): shape (hidden_size, ), hidden to hidden bias, corresponding to :math:`b_{hh}` in the formula.
  635. Inputs:
  636. - **inputs** (Tensor): shape `[batch_size, input_size]`, the input, corresponding to :math:`x_{t}` in the formula.
  637. - **states** (Tensor, optional): shape `[batch_size, hidden_size]`, the previous hidden state, corresponding to :math:`h_{t-1}` in the formula. When states is None, zero state is used. Defaults to None.
  638. Returns:
  639. - **outputs** (Tensor): shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula.
  640. - **states** (Tensor): shape `[batch_size, hidden_size]`, the new hidden state, corresponding to :math:`h_{t}` in the formula.
  641. Notes:
  642. All the weights and bias are initialized with `Uniform(-std, std)` by default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more information about parameter initialization, please refer to :ref:`api_paddle_ParamAttr`.
  643. Examples:
  644. .. code-block:: python
  645. >>> import paddle
  646. >>> x = paddle.randn((4, 16))
  647. >>> prev_h = paddle.randn((4, 32))
  648. >>> cell = paddle.nn.SimpleRNNCell(16, 32)
  649. >>> y, h = cell(x, prev_h)
  650. >>> print(y.shape)
  651. [4, 32]
  652. """
  653. def __init__(
  654. self,
  655. input_size,
  656. hidden_size,
  657. activation="tanh",
  658. weight_ih_attr=None,
  659. weight_hh_attr=None,
  660. bias_ih_attr=None,
  661. bias_hh_attr=None,
  662. name=None,
  663. ):
  664. super().__init__()
  665. if hidden_size <= 0:
  666. raise ValueError(
  667. f"hidden_size of {self.__class__.__name__} must be greater than 0, but now equals to {hidden_size}"
  668. )
  669. std = 1.0 / math.sqrt(hidden_size)
  670. if weight_ih_attr is not False:
  671. self.weight_ih = self.create_parameter(
  672. (hidden_size, input_size),
  673. weight_ih_attr,
  674. default_initializer=I.Uniform(-std, std),
  675. )
  676. else:
  677. self.weight_ih = self.create_parameter(
  678. (hidden_size, input_size),
  679. None,
  680. default_initializer=I.Constant(1.0),
  681. )
  682. self.weight_ih.stop_gradient = True
  683. if weight_hh_attr is not False:
  684. self.weight_hh = self.create_parameter(
  685. (hidden_size, hidden_size),
  686. weight_hh_attr,
  687. default_initializer=I.Uniform(-std, std),
  688. )
  689. else:
  690. self.weight_hh = self.create_parameter(
  691. (hidden_size, hidden_size),
  692. None,
  693. default_initializer=I.Constant(1.0),
  694. )
  695. self.weight_hh.stop_gradient = True
  696. if bias_ih_attr is not False:
  697. self.bias_ih = self.create_parameter(
  698. (hidden_size,),
  699. bias_ih_attr,
  700. is_bias=True,
  701. default_initializer=I.Uniform(-std, std),
  702. )
  703. else:
  704. self.bias_ih = self.create_parameter(
  705. (hidden_size,),
  706. None,
  707. is_bias=True,
  708. default_initializer=I.Constant(0.0),
  709. )
  710. self.bias_ih.stop_gradient = True
  711. if bias_hh_attr is not False:
  712. self.bias_hh = self.create_parameter(
  713. (hidden_size,),
  714. bias_hh_attr,
  715. is_bias=True,
  716. default_initializer=I.Uniform(-std, std),
  717. )
  718. else:
  719. self.bias_hh = self.create_parameter(
  720. (hidden_size,),
  721. None,
  722. is_bias=True,
  723. default_initializer=I.Constant(0.0),
  724. )
  725. self.bias_hh.stop_gradient = True
  726. self.input_size = input_size
  727. self.hidden_size = hidden_size
  728. if activation not in ["tanh", "relu"]:
  729. raise ValueError(
  730. "activation for SimpleRNNCell should be tanh or relu, "
  731. f"but get {activation}"
  732. )
  733. self.activation = activation
  734. self._activation_fn = paddle.tanh if activation == "tanh" else F.relu
  735. def forward(self, inputs, states=None):
  736. if states is None:
  737. states = self.get_initial_states(inputs, self.state_shape)
  738. pre_h = states
  739. i2h = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
  740. if self.bias_ih is not None:
  741. i2h += self.bias_ih
  742. h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
  743. if self.bias_hh is not None:
  744. h2h += self.bias_hh
  745. h = self._activation_fn(i2h + h2h)
  746. return h, h
  747. @property
  748. def state_shape(self):
  749. return (self.hidden_size,)
  750. def extra_repr(self):
  751. s = '{input_size}, {hidden_size}'
  752. if self.activation != "tanh":
  753. s += ', activation={activation}'
  754. return s.format(**self.__dict__)
  755. class LSTMCell(RNNCellBase):
  756. r"""
  757. Long-Short Term Memory(LSTM) RNN cell. Given the inputs and previous states,
  758. it computes the outputs and updates states.
  759. The formula used is as follows:
  760. .. math::
  761. i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})
  762. f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf})
  763. o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho})
  764. \widetilde{c}_{t} & = \tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})
  765. c_{t} & = f_{t} * c_{t-1} + i_{t} * \widetilde{c}_{t}
  766. h_{t} & = o_{t} * \tanh(c_{t})
  767. y_{t} & = h_{t}
  768. If `proj_size` is specified, the dimension of hidden state :math:`h_{t}` will be projected to `proj_size`:
  769. .. math::
  770. h_{t} = h_{t}W_{proj\_size}
  771. where :math:`\sigma` is the sigmoid function, and * is the elementwise
  772. multiplication operator.
  773. Please refer to `An Empirical Exploration of Recurrent Network Architectures
  774. <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
  775. Parameters:
  776. input_size (int): The input size.
  777. hidden_size (int): The hidden size.
  778. weight_ih_attr(ParamAttr, optional): The parameter attribute for
  779. `weight_ih`. Default: None.
  780. weight_hh_attr(ParamAttr, optional): The parameter attribute for
  781. `weight_hh`. Default: None.
  782. bias_ih_attr (ParamAttr, optional): The parameter attribute for the
  783. `bias_ih`. Default: None.
  784. bias_hh_attr (ParamAttr, optional): The parameter attribute for the
  785. `bias_hh`. Default: None.
  786. proj_size (int, optional): If specified, the output hidden state
  787. will be projected to `proj_size`. `proj_size` must be smaller than
  788. `hidden_size`. Default: None.
  789. name (str, optional): Name for the operation (optional, default is
  790. None). For more information, please refer to :ref:`api_guide_Name`.
  791. Variables:
  792. - **weight_ih** (Parameter): shape (4 * hidden_size, input_size), input to hidden weight, which corresponds to the concatenation of :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula.
  793. - **weight_hh** (Parameter): shape (4 * hidden_size, hidden_size), hidden to hidden weight, which corresponds to the concatenation of :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula. If proj_size was specified, the shape will be (4 * hidden_size, proj_size).
  794. - **weight_ho** (Parameter, optional): shape (hidden_size, proj_size), project the hidden state.
  795. - **bias_ih** (Parameter): shape (4 * hidden_size, ), input to hidden bias, which corresponds to the concatenation of :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula.
  796. - **bias_hh** (Parameter): shape (4 * hidden_size, ), hidden to hidden bias, which corresponds to the concatenation of :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula.
  797. Inputs:
  798. - **inputs** (Tensor): shape `[batch_size, input_size]`, the input, corresponding to :math:`x_t` in the formula.
  799. - **states** (list|tuple, optional): a list/tuple of two tensors, each of shape `[batch_size, hidden_size]`, the previous hidden state, corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. When states is None, zero state is used. Defaults to None.
  800. Returns:
  801. - **outputs** (Tensor). Shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula. If `proj_size` is specified, output shape will be `[batch_size, proj_size]`.
  802. - **states** (tuple). A tuple of two tensors, each of shape `[batch_size, hidden_size]`, the new hidden states, corresponding to :math:`h_{t}, c_{t}` in the formula.
  803. If `proj_size` is specified, shape of :math:`h_{t}` will be `[batch_size, proj_size]`.
  804. Notes:
  805. All the weights and bias are initialized with `Uniform(-std, std)` by
  806. default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
  807. information about parameter initialization, please refer to :ref:`api_paddle_ParamAttr`.
  808. Examples:
  809. .. code-block:: python
  810. >>> import paddle
  811. >>> x = paddle.randn((4, 16))
  812. >>> prev_h = paddle.randn((4, 32))
  813. >>> prev_c = paddle.randn((4, 32))
  814. >>> cell = paddle.nn.LSTMCell(16, 32)
  815. >>> y, (h, c) = cell(x, (prev_h, prev_c))
  816. >>> print(y.shape)
  817. [4, 32]
  818. >>> print(h.shape)
  819. [4, 32]
  820. >>> print(c.shape)
  821. [4, 32]
  822. """
  823. def __init__(
  824. self,
  825. input_size,
  826. hidden_size,
  827. weight_ih_attr=None,
  828. weight_hh_attr=None,
  829. bias_ih_attr=None,
  830. bias_hh_attr=None,
  831. proj_size=0,
  832. name=None,
  833. ):
  834. super().__init__()
  835. if hidden_size <= 0:
  836. raise ValueError(
  837. f"hidden_size of {self.__class__.__name__} must be greater than 0, but now equals to {hidden_size}"
  838. )
  839. if proj_size < 0:
  840. raise ValueError(
  841. f"proj_size of {self.__class__.__name__} must be greater than 0, but now equals to {hidden_size}"
  842. )
  843. if proj_size >= hidden_size:
  844. raise ValueError("proj_size must be smaller than hidden_size")
  845. std = 1.0 / math.sqrt(hidden_size)
  846. if weight_ih_attr is not False:
  847. self.weight_ih = self.create_parameter(
  848. (4 * hidden_size, input_size),
  849. weight_ih_attr,
  850. default_initializer=I.Uniform(-std, std),
  851. )
  852. else:
  853. self.weight_ih = self.create_parameter(
  854. (4 * hidden_size, input_size),
  855. None,
  856. default_initializer=I.Constant(1.0),
  857. )
  858. self.weight_ih.stop_gradient = True
  859. if weight_hh_attr is not False:
  860. self.weight_hh = self.create_parameter(
  861. (4 * hidden_size, proj_size or hidden_size),
  862. weight_hh_attr,
  863. default_initializer=I.Uniform(-std, std),
  864. )
  865. else:
  866. self.weight_hh = self.create_parameter(
  867. (4 * hidden_size, proj_size or hidden_size),
  868. None,
  869. default_initializer=I.Constant(1.0),
  870. )
  871. self.weight_hh.stop_gradient = True
  872. if bias_ih_attr is not False:
  873. self.bias_ih = self.create_parameter(
  874. (4 * hidden_size,),
  875. bias_ih_attr,
  876. is_bias=True,
  877. default_initializer=I.Uniform(-std, std),
  878. )
  879. else:
  880. self.bias_ih = self.create_parameter(
  881. (4 * hidden_size,),
  882. None,
  883. is_bias=True,
  884. default_initializer=I.Constant(0.0),
  885. )
  886. self.bias_ih.stop_gradient = True
  887. if bias_hh_attr is not False:
  888. self.bias_hh = self.create_parameter(
  889. (4 * hidden_size,),
  890. bias_hh_attr,
  891. is_bias=True,
  892. default_initializer=I.Uniform(-std, std),
  893. )
  894. else:
  895. self.bias_hh = self.create_parameter(
  896. (4 * hidden_size,),
  897. None,
  898. is_bias=True,
  899. default_initializer=I.Constant(0.0),
  900. )
  901. self.bias_hh.stop_gradient = True
  902. self.proj_size = proj_size
  903. if proj_size > 0:
  904. self.weight_ho = self.create_parameter(
  905. (hidden_size, proj_size),
  906. weight_hh_attr,
  907. default_initializer=I.Uniform(-std, std),
  908. )
  909. self.hidden_size = hidden_size
  910. self.input_size = input_size
  911. self._gate_activation = F.sigmoid
  912. self._activation = paddle.tanh
  913. def forward(self, inputs, states=None):
  914. if states is None:
  915. states = self.get_initial_states(inputs, self.state_shape)
  916. pre_hidden, pre_cell = states
  917. gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
  918. if self.bias_ih is not None:
  919. gates = gates + self.bias_ih
  920. gates += paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
  921. if self.bias_hh is not None:
  922. gates = gates + self.bias_hh
  923. chunked_gates = paddle.split(gates, num_or_sections=4, axis=-1)
  924. i = self._gate_activation(chunked_gates[0])
  925. f = self._gate_activation(chunked_gates[1])
  926. o = self._gate_activation(chunked_gates[3])
  927. c = f * pre_cell + i * self._activation(chunked_gates[2])
  928. h = o * self._activation(c)
  929. if self.proj_size > 0:
  930. h = paddle.matmul(h, self.weight_ho)
  931. return h, (h, c)
  932. @property
  933. def state_shape(self):
  934. r"""
  935. The `state_shape` of LSTMCell is a tuple with two shapes:
  936. `((hidden_size, ), (hidden_size,))`. (-1 for batch size would be
  937. automatically inserted into shape). These two shapes correspond
  938. to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
  939. """
  940. return ((self.hidden_size,), (self.proj_size or self.hidden_size,))
  941. def extra_repr(self):
  942. return '{input_size}, {hidden_size}'.format(**self.__dict__)
  943. class GRUCell(RNNCellBase):
  944. r"""
  945. Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
  946. it computes the outputs and updates states.
  947. The formula for GRU used is as follows:
  948. .. math::
  949. r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
  950. z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
  951. \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
  952. h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
  953. y_{t} & = h_{t}
  954. where :math:`\sigma` is the sigmoid function, and * is the elementwise
  955. multiplication operator.
  956. Please refer to `An Empirical Exploration of Recurrent Network Architectures
  957. <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
  958. Parameters:
  959. input_size (int): The input size.
  960. hidden_size (int): The hidden size.
  961. weight_ih_attr(ParamAttr, optional): The parameter attribute for
  962. `weight_ih`. Default: None.
  963. weight_hh_attr(ParamAttr, optional): The parameter attribute for
  964. `weight_hh`. Default: None.
  965. bias_ih_attr (ParamAttr, optional): The parameter attribute for the
  966. `bias_ih`. Default: None.
  967. bias_hh_attr (ParamAttr, optional): The parameter attribute for the
  968. `bias_hh`. Default: None.
  969. name (str, optional): Name for the operation (optional, default is
  970. None). For more information, please refer to :ref:`api_guide_Name`.
  971. Variables:
  972. - **weight_ih** (Parameter): shape (3 * hidden_size, input_size), input to hidden weight, which corresponds to the concatenation of :math:`W_{ir}, W_{iz}, W_{ic}` in the formula.
  973. - **weight_hh** (Parameter): shape (3 * hidden_size, hidden_size), hidden to hidden weight, which corresponds to the concatenation of :math:`W_{hr}, W_{hz}, W_{hc}` in the formula.
  974. - **bias_ih** (Parameter): shape (3 * hidden_size, ), input to hidden bias, which corresponds to the concatenation of :math:`b_{ir}, b_{iz}, b_{ic}` in the formula.
  975. - **bias_hh** (Parameter): shape (3 * hidden_size, ), hidden to hidden bias, which corresponds to the concatenation of :math:`b_{hr}, b_{hz}, b_{hc}` in the formula.
  976. Inputs:
  977. - **inputs** (Tensor): A tensor with shape `[batch_size, input_size]`, corresponding to :math:`x_t` in the formula.
  978. - **states** (Tensor): A tensor with shape `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}` in the formula.
  979. Returns:
  980. - **outputs** (Tensor): shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula.
  981. - **states** (Tensor): shape `[batch_size, hidden_size]`, the new hidden state, corresponding to :math:`h_{t}` in the formula.
  982. Notes:
  983. All the weights and bias are initialized with `Uniform(-std, std)` by
  984. default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
  985. information about parameter initialization, please refer to s:ref:`api_paddle_ParamAttr`.
  986. Examples:
  987. .. code-block:: python
  988. >>> import paddle
  989. >>> x = paddle.randn((4, 16))
  990. >>> prev_h = paddle.randn((4, 32))
  991. >>> cell = paddle.nn.GRUCell(16, 32)
  992. >>> y, h = cell(x, prev_h)
  993. >>> print(y.shape)
  994. [4, 32]
  995. >>> print(h.shape)
  996. [4, 32]
  997. """
  998. def __init__(
  999. self,
  1000. input_size,
  1001. hidden_size,
  1002. weight_ih_attr=None,
  1003. weight_hh_attr=None,
  1004. bias_ih_attr=None,
  1005. bias_hh_attr=None,
  1006. name=None,
  1007. ):
  1008. super().__init__()
  1009. if hidden_size <= 0:
  1010. raise ValueError(
  1011. f"hidden_size of {self.__class__.__name__} must be greater than 0, but now equals to {hidden_size}"
  1012. )
  1013. std = 1.0 / math.sqrt(hidden_size)
  1014. if weight_ih_attr is not False:
  1015. self.weight_ih = self.create_parameter(
  1016. (3 * hidden_size, input_size),
  1017. weight_ih_attr,
  1018. default_initializer=I.Uniform(-std, std),
  1019. )
  1020. else:
  1021. self.weight_ih = self.create_parameter(
  1022. (3 * hidden_size, input_size),
  1023. None,
  1024. default_initializer=I.Constant(1.0),
  1025. )
  1026. self.weight_ih.stop_gradient = True
  1027. if weight_hh_attr is not False:
  1028. self.weight_hh = self.create_parameter(
  1029. (3 * hidden_size, hidden_size),
  1030. weight_hh_attr,
  1031. default_initializer=I.Uniform(-std, std),
  1032. )
  1033. else:
  1034. self.weight_hh = self.create_parameter(
  1035. (3 * hidden_size, hidden_size),
  1036. None,
  1037. default_initializer=I.Constant(1.0),
  1038. )
  1039. self.weight_hh.stop_gradient = True
  1040. if bias_ih_attr is not False:
  1041. self.bias_ih = self.create_parameter(
  1042. (3 * hidden_size,),
  1043. bias_ih_attr,
  1044. is_bias=True,
  1045. default_initializer=I.Uniform(-std, std),
  1046. )
  1047. else:
  1048. self.bias_ih = self.create_parameter(
  1049. (3 * hidden_size,),
  1050. None,
  1051. is_bias=True,
  1052. default_initializer=I.Constant(0.0),
  1053. )
  1054. self.bias_ih.stop_gradient = True
  1055. if bias_hh_attr is not False:
  1056. self.bias_hh = self.create_parameter(
  1057. (3 * hidden_size,),
  1058. bias_hh_attr,
  1059. is_bias=True,
  1060. default_initializer=I.Uniform(-std, std),
  1061. )
  1062. else:
  1063. self.bias_hh = self.create_parameter(
  1064. (3 * hidden_size,),
  1065. None,
  1066. is_bias=True,
  1067. default_initializer=I.Constant(0.0),
  1068. )
  1069. self.bias_hh.stop_gradient = True
  1070. self.hidden_size = hidden_size
  1071. self.input_size = input_size
  1072. self._gate_activation = F.sigmoid
  1073. self._activation = paddle.tanh
  1074. def forward(self, inputs, states=None):
  1075. if states is None:
  1076. states = self.get_initial_states(inputs, self.state_shape)
  1077. pre_hidden = states
  1078. x_gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
  1079. if self.bias_ih is not None:
  1080. x_gates = x_gates + self.bias_ih
  1081. h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
  1082. if self.bias_hh is not None:
  1083. h_gates = h_gates + self.bias_hh
  1084. x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
  1085. h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)
  1086. r = self._gate_activation(x_r + h_r)
  1087. z = self._gate_activation(x_z + h_z)
  1088. c = self._activation(x_c + r * h_c) # apply reset gate after mm
  1089. h = (pre_hidden - c) * z + c
  1090. return h, h
  1091. @property
  1092. def state_shape(self):
  1093. r"""
  1094. The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
  1095. size would be automatically inserted into shape). The shape corresponds
  1096. to the shape of :math:`h_{t-1}`.
  1097. """
  1098. return (self.hidden_size,)
  1099. def extra_repr(self):
  1100. return '{input_size}, {hidden_size}'.format(**self.__dict__)
  1101. class RNN(Layer):
  1102. r"""
  1103. Wrapper for RNN, which creates a recurrent neural network with an RNN cell.
  1104. It performs :code:`cell.forward()` repeatedly until reaches to the maximum
  1105. length of `inputs`.
  1106. Parameters:
  1107. cell(RNNCellBase): An instance of `RNNCellBase`.
  1108. is_reverse (bool, optional): Indicate whether to calculate in the reverse
  1109. order of input sequences. Defaults to False.
  1110. time_major (bool): Whether the first dimension of the input means the
  1111. time steps. Defaults to False.
  1112. Inputs:
  1113. - **inputs** (Tensor): A (possibly nested structure of) tensor[s]. The input sequences. If time_major is False, the shape is `[batch_size, time_steps, input_size]`. If time_major is True, the shape is `[time_steps, batch_size, input_size]` where `input_size` is the input size of the cell.
  1114. - **initial_states** (Tensor|list|tuple, optional): Tensor of a possibly nested structure of tensors, representing the initial state for the rnn cell. If not provided, `cell.get_initial_states` would be called to produce the initial states. Defaults to None.
  1115. - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None.If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings.
  1116. - **kwargs**: Additional keyword arguments to pass to `forward` of the cell.
  1117. Outputs:
  1118. - **outputs** (Tensor|list|tuple): the output sequences. If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, else `[batch_size, time_steps, hidden_size]`.
  1119. - **final_states** (Tensor|list|tuple): final states of the cell. Tensor or a possibly nested structure of tensors which has the same structure with initial state. Each tensor in final states has the same shape and dtype as the corresponding tensor in initial states.
  1120. Notes:
  1121. This class is a low-level API for wrapping rnn cell into a RNN network.
  1122. Users should take care of the state of the cell. If `initial_states` is
  1123. passed to the `forward` method, make sure that it satisfies the
  1124. requirements of the cell.
  1125. Examples:
  1126. .. code-block:: python
  1127. >>> import paddle
  1128. >>> inputs = paddle.rand((4, 23, 16))
  1129. >>> prev_h = paddle.randn((4, 32))
  1130. >>> cell = paddle.nn.SimpleRNNCell(16, 32)
  1131. >>> rnn = paddle.nn.RNN(cell)
  1132. >>> outputs, final_states = rnn(inputs, prev_h)
  1133. >>> print(outputs.shape)
  1134. [4, 23, 32]
  1135. >>> print(final_states.shape)
  1136. [4, 32]
  1137. """
  1138. def __init__(self, cell, is_reverse=False, time_major=False):
  1139. super().__init__()
  1140. self.cell = cell
  1141. if not hasattr(self.cell, "call"):
  1142. # for non-dygraph mode, `rnn` api uses cell.call
  1143. self.cell.call = self.cell.forward
  1144. self.is_reverse = is_reverse
  1145. self.time_major = time_major
  1146. def forward(
  1147. self, inputs, initial_states=None, sequence_length=None, **kwargs
  1148. ):
  1149. final_outputs, final_states = rnn(
  1150. self.cell,
  1151. inputs,
  1152. initial_states=initial_states,
  1153. sequence_length=sequence_length,
  1154. time_major=self.time_major,
  1155. is_reverse=self.is_reverse,
  1156. **kwargs,
  1157. )
  1158. return final_outputs, final_states
  1159. class BiRNN(Layer):
  1160. r"""
  1161. Wrapper for bidirectional RNN, which builds a bidirectional RNN given the
  1162. forward rnn cell and backward rnn cell. A BiRNN applies forward RNN and
  1163. backward RNN with corresponding cells separately and concats the outputs
  1164. along the last axis.
  1165. Parameters:
  1166. cell_fw (RNNCellBase): A RNNCellBase instance used for forward RNN.
  1167. cell_bw (RNNCellBase): A RNNCellBase instance used for backward RNN.
  1168. time_major (bool, optional): Whether the first dimension of the input means the
  1169. time steps. Defaults to False.
  1170. Inputs:
  1171. - **inputs** (Tensor): the input sequences of both RNN. If time_major is True, the shape of is `[time_steps, batch_size, input_size]`, else the shape is `[batch_size, time_steps, input_size]`, where input_size is the input size of both cells.
  1172. - **initial_states** (list|tuple, optional): A tuple/list of the initial states of the forward cell and backward cell. Defaults to None. If not provided, `cell.get_initial_states` would be called to produce the initial states for each cell. Defaults to None.
  1173. - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings.
  1174. - **kwargs**: Additional keyword arguments. Arguments passed to `forward` for each cell.
  1175. Outputs:
  1176. - **outputs** (Tensor): the outputs of the bidirectional RNN. It is the concatenation of the outputs from the forward RNN and backward RNN along the last axis. If time_major is True, the shape is `[time_steps, batch_size, size]`, else the shape is `[batch_size, time_steps, size]`, where size is `cell_fw.hidden_size + cell_bw.hidden_size`.
  1177. - **final_states** (tuple): A tuple of the final states of the forward cell and backward cell.
  1178. Notes:
  1179. This class is a low level API for wrapping rnn cells into a BiRNN
  1180. network. Users should take care of the states of the cells.
  1181. If `initial_states` is passed to the `forward` method, make sure that
  1182. it satisfies the requirements of the cells.
  1183. Examples:
  1184. .. code-block:: python
  1185. >>> import paddle
  1186. >>> cell_fw = paddle.nn.LSTMCell(16, 32)
  1187. >>> cell_bw = paddle.nn.LSTMCell(16, 32)
  1188. >>> rnn = paddle.nn.BiRNN(cell_fw, cell_bw)
  1189. >>> inputs = paddle.rand((2, 23, 16))
  1190. >>> outputs, final_states = rnn(inputs)
  1191. >>> print(outputs.shape)
  1192. [2, 23, 64]
  1193. >>> print(final_states[0][0].shape,len(final_states),len(final_states[0]))
  1194. [2, 32] 2 2
  1195. """
  1196. def __init__(self, cell_fw, cell_bw, time_major=False):
  1197. super().__init__()
  1198. self.cell_fw = cell_fw
  1199. self.cell_bw = cell_bw
  1200. if cell_fw.input_size != cell_bw.input_size:
  1201. raise ValueError(
  1202. f"input size of forward cell({cell_fw.input_size}) does not equals"
  1203. f"that of backward cell({cell_bw.input_size})"
  1204. )
  1205. for cell in [self.cell_fw, self.cell_bw]:
  1206. if not hasattr(cell, "call"):
  1207. # for non-dygraph mode, `rnn` api uses cell.call
  1208. cell.call = cell.forward
  1209. self.time_major = time_major
  1210. def forward(
  1211. self, inputs, initial_states=None, sequence_length=None, **kwargs
  1212. ):
  1213. if isinstance(initial_states, (list, tuple)):
  1214. assert (
  1215. len(initial_states) == 2
  1216. ), "length of initial_states should be 2 when it is a list/tuple"
  1217. outputs, final_states = birnn(
  1218. self.cell_fw,
  1219. self.cell_bw,
  1220. inputs,
  1221. initial_states,
  1222. sequence_length,
  1223. self.time_major,
  1224. **kwargs,
  1225. )
  1226. return outputs, final_states
  1227. class RNNBase(LayerList):
  1228. r"""
  1229. RNNBase class for RNN networks. It provides `forward`, `flatten_parameters`
  1230. and other common methods for SimpleRNN, LSTM and GRU.
  1231. """
  1232. def __init__(
  1233. self,
  1234. mode,
  1235. input_size,
  1236. hidden_size,
  1237. num_layers=1,
  1238. direction="forward",
  1239. time_major=False,
  1240. dropout=0.0,
  1241. weight_ih_attr=None,
  1242. weight_hh_attr=None,
  1243. bias_ih_attr=None,
  1244. bias_hh_attr=None,
  1245. proj_size=0,
  1246. ):
  1247. super().__init__()
  1248. bidirectional_list = ["bidirectional", "bidirect"]
  1249. self.mode = mode
  1250. self.input_size = input_size
  1251. self.hidden_size = hidden_size
  1252. self.dropout = dropout
  1253. self.num_directions = 2 if direction in bidirectional_list else 1
  1254. self.time_major = time_major
  1255. self.num_layers = num_layers
  1256. self.state_components = 2 if mode == "LSTM" else 1
  1257. kwargs = {
  1258. "weight_ih_attr": weight_ih_attr,
  1259. "weight_hh_attr": weight_hh_attr,
  1260. "bias_ih_attr": bias_ih_attr,
  1261. "bias_hh_attr": bias_hh_attr,
  1262. }
  1263. self.proj_size = proj_size
  1264. if proj_size > 0:
  1265. assert mode == 'LSTM'
  1266. if mode == "LSTM":
  1267. rnn_cls = LSTMCell
  1268. kwargs["proj_size"] = proj_size
  1269. elif mode == "GRU":
  1270. rnn_cls = GRUCell
  1271. elif mode == "RNN_RELU":
  1272. rnn_cls = SimpleRNNCell
  1273. kwargs["activation"] = 'relu'
  1274. elif mode == "RNN_TANH":
  1275. rnn_cls = SimpleRNNCell
  1276. kwargs["activation"] = 'tanh'
  1277. else:
  1278. rnn_cls = SimpleRNNCell
  1279. kwargs["activation"] = self.activation
  1280. in_size = proj_size or hidden_size
  1281. if direction in ["forward"]:
  1282. is_reverse = False
  1283. cell = rnn_cls(input_size, hidden_size, **kwargs)
  1284. self.append(RNN(cell, is_reverse, time_major))
  1285. for _ in range(1, num_layers):
  1286. cell = rnn_cls(in_size, hidden_size, **kwargs)
  1287. self.append(RNN(cell, is_reverse, time_major))
  1288. elif direction in bidirectional_list:
  1289. cell_fw = rnn_cls(input_size, hidden_size, **kwargs)
  1290. cell_bw = rnn_cls(input_size, hidden_size, **kwargs)
  1291. self.append(BiRNN(cell_fw, cell_bw, time_major))
  1292. for _ in range(1, num_layers):
  1293. cell_fw = rnn_cls(2 * in_size, hidden_size, **kwargs)
  1294. cell_bw = rnn_cls(2 * in_size, hidden_size, **kwargs)
  1295. self.append(BiRNN(cell_fw, cell_bw, time_major))
  1296. else:
  1297. raise ValueError(
  1298. "direction should be forward or bidirect (or bidirectional), "
  1299. f"received direction = {direction}"
  1300. )
  1301. self.could_use_cudnn = True
  1302. self.could_use_cudnn &= len(self.parameters()) == num_layers * 4 * (
  1303. 2 if direction in bidirectional_list else 1
  1304. )
  1305. # Expose params as RNN's attribute, which can make it compatible when
  1306. # replacing small ops composed rnn with cpp rnn kernel.
  1307. # Moreover, `jit.to_static` assumes params are added by current layer
  1308. # and wouldn't include sublayer's params in current layer, which also
  1309. # requires these params are added to current layer for `jit.save`.
  1310. param_names = []
  1311. for layer in range(self.num_layers):
  1312. for direction in range(self.num_directions):
  1313. suffix = '_reverse' if direction == 1 else ''
  1314. param_names.extend(['weight_ih_l{}{}', 'weight_hh_l{}{}'])
  1315. if bias_ih_attr is not False:
  1316. param_names.append('bias_ih_l{}{}')
  1317. if bias_hh_attr is not False:
  1318. param_names.append('bias_hh_l{}{}')
  1319. param_names = [x.format(layer, suffix) for x in param_names]
  1320. for name, param in zip(param_names, self.parameters()):
  1321. setattr(self, name, param)
  1322. self.flatten_parameters()
  def flatten_parameters(self):
      """
      Pack this layer's parameters into one contiguous buffer for cudnn usage.

      Nothing happens unless ``self.could_use_cudnn`` is truthy. Otherwise:

      - the parameters registered directly on this layer are reordered so
        that every weight precedes every bias, and the reordered list is
        stored in ``self._all_weights``;
      - a flat 1-D parameter large enough to hold all of them is created and
        stored (wrapped in a list) in ``self._flat_weight``;
      - a ``UINT8`` dropout-state variable is created and stored in
        ``self._dropout_state``;
      - a ``coalesce_tensor`` op with ``copy_data=True`` is executed
        immediately in dynamic mode, or appended to the default startup
        program for static graph.
      """
      if not self.could_use_cudnn:
          return

      # parameters() is depth first and ordered: for every layer, for every
      # direction, it yields w_ih, w_hh, b_ih, b_hh. The cudnn layout wants
      # all weights first and all biases afterwards, so each group of four
      # is split into a weight pair and a bias pair.
      own_params = self.parameters(include_sublayers=False)
      numels = [np.prod(p.shape) for p in own_params]
      bias_start = 2 * self.num_layers * self.num_directions
      self._all_weights = [None] * len(own_params)
      for pos, tensor in enumerate(own_params):
          cell_idx, slot = divmod(pos, 4)
          # slot 0/1 are the weights, slot 2/3 are the biases of this cell
          base = 0 if slot < 2 else bias_start
          self._all_weights[base + 2 * cell_idx + slot % 2] = tensor

      # The flat buffer is wrapped in a list so that it is not registered as
      # a parameter of this layer (and therefore not saved); a better way to
      # handle this may be needed later. `create_parameter` adds it to both
      # main_program and startup_program for static-graph, and the Constant
      # initializer avoids touching the random generator.
      self._flat_weight = [
          self.create_parameter(
              shape=[np.sum(numels)],
              dtype=own_params[0].dtype,
              default_initializer=I.Constant(0.0),
          )
      ]

      # The dropout state could probably also be hidden and kept out of
      # saving; whether it should be persistable for static-graph is open.
      self._dropout_state = self.create_variable(
          dtype=core.VarDesc.VarType.UINT8,
          name=f"dropout_state{NON_PERSISTABLE_VAR_NAME_SUFFIX}",
      )

      if in_dynamic_mode():
          with paddle.no_grad():
              fused_dtype = own_params[0].dtype
              # The legacy op is handed the proto dtype, so a core.DataType
              # is translated through the framework's lookup table first.
              if isinstance(fused_dtype, core.DataType):
                  fused_dtype = (
                      paddle.base.framework.paddle_type_to_proto_type[
                          fused_dtype
                      ]
                  )
              _legacy_C_ops.coalesce_tensor(
                  self._all_weights,
                  self._all_weights,
                  self._flat_weight[0],
                  "copy_data",
                  True,
                  "use_align",
                  False,
                  "dtype",
                  fused_dtype,
              )
          return

      # Static graph: the coalesce op goes into the startup program.
      with program_guard(
          default_startup_program(), default_startup_program()
      ), paddle.no_grad():
          self._helper.append_op(
              type="coalesce_tensor",
              inputs={"Input": self._all_weights},
              outputs={
                  "Output": self._all_weights,
                  "FusedOutput": self._flat_weight,
              },
              attrs={
                  "copy_data": True,
                  "use_align": False,
                  "dtype": own_params[0].dtype,
              },
          )
  def _cudnn_impl(self, inputs, initial_states, sequence_length):
      """
      Run the whole network through the fused ``rnn`` operator.

      Parameters:
          inputs (Tensor): The input sequence. It is transposed with
              permutation ``[1, 0, 2]`` before the op when
              ``self.time_major`` is False.
          initial_states: Initial states, passed to the op as its previous
              state.
          sequence_length: Passed to the op unchanged; may be None.

      Returns:
          tuple: ``(out, state)``. ``out`` is transposed back with
          ``[1, 0, 2]`` when ``self.time_major`` is False. ``state`` is a
          tuple when the op produced more than one state component,
          otherwise the single component itself.
      """
      batch_major = not self.time_major
      bidirectional = self.num_directions == 2
      is_test = not self.training

      if batch_major:
          inputs = paddle.tensor.transpose(inputs, [1, 0, 2])

      if in_dynamic_or_pir_mode():
          out, _, state = _C_ops.rnn(
              inputs,
              initial_states,
              self._all_weights,
              sequence_length,
              self._dropout_state,
              self.dropout,
              bidirectional,
              self.input_size,
              self.hidden_size,
              self.num_layers,
              self.mode,
              0,  # NOTE(review): presumably the seed - confirm against the op signature
              is_test,
          )
      else:
          helper = self._helper
          # Output placeholders are created in the order: out, states, reserve.
          out = helper.create_variable_for_type_inference(inputs.dtype)
          state = [
              helper.create_variable_for_type_inference(inputs.dtype)
              for _ in range(self.state_components)
          ]
          reserve = helper.create_variable_for_type_inference(
              dtype=core.VarDesc.VarType.UINT8, stop_gradient=True
          )
          helper.append_op(
              type="rnn",
              inputs={
                  'Input': inputs,
                  'WeightList': self._all_weights,
                  'PreState': initial_states,
                  'SequenceLength': sequence_length,
              },
              outputs={
                  'Out': out,
                  'State': state,
                  'Reserve': reserve,
                  'DropoutState': self._dropout_state,
              },
              attrs={
                  'dropout_prob': self.dropout,
                  'is_bidirec': bidirectional,
                  'input_size': self.input_size,
                  'hidden_size': self.hidden_size,
                  'num_layers': self.num_layers,
                  'mode': self.mode,
                  'is_test': is_test,
              },
          )

      if batch_major:
          out = paddle.tensor.transpose(out, [1, 0, 2])

      final_state = tuple(state) if len(state) > 1 else state[0]
      return out, final_state
  def forward(self, inputs, initial_states=None, sequence_length=None):
      """
      Apply the recurrent network to ``inputs``.

      Parameters:
          inputs (Tensor): The input sequence. The batch axis is 1 when
              ``self.time_major`` is True, otherwise 0.
          initial_states (optional): Initial states. When None, zero states
              are created with leading shape
              ``[num_layers * num_directions, batch_size]``; the last
              dimension is ``proj_size or hidden_size`` for the first state
              component and ``hidden_size`` for the second. A single
              ``paddle.static.Variable`` is wrapped into a list.
          sequence_length (optional): Forwarded to the underlying
              implementation. Defaults to None.

      Returns:
          tuple: ``(outputs, final_states)``.
      """
      batch_axis = 1 if self.time_major else 0
      in_dtype = inputs.dtype

      if initial_states is not None:
          if isinstance(initial_states, paddle.static.Variable):
              initial_states = [initial_states]
      else:
          tail_dims = (
              [self.proj_size or self.hidden_size],
              [self.hidden_size],
          )
          batch_size = inputs.shape[batch_axis]
          if not batch_size > 0:
              # The static shape does not give a positive batch size, so it
              # is read from the runtime shape instead.
              batch_size = paddle.shape(inputs)[batch_axis].item()
          lead_dims = [self.num_layers * self.num_directions, batch_size]
          initial_states = tuple(
              paddle.full(
                  shape=lead_dims + tail_dims[k],
                  fill_value=0,
                  dtype=in_dtype,
              )
              for k in range(self.state_components)
          )

      if self.could_use_cudnn:
          if (
              not paddle.device.is_compiled_with_rocm()
              or sequence_length is None
          ):
              # Add CPU kernel and dispatch in backend later
              return self._cudnn_impl(
                  inputs, initial_states, sequence_length
              )

      # Fallback: run the stacked layers one after another.
      layer_states = split_states(
          initial_states, self.num_directions == 2, self.state_components
      )
      collected = []
      for depth, layer in enumerate(self):
          if depth > 0:
              # Dropout goes on the input of every layer except the first.
              inputs = F.dropout(
                  inputs,
                  self.dropout,
                  training=self.training,
                  mode="upscale_in_train",
              )
          outputs, last_state = layer(
              inputs, layer_states[depth], sequence_length
          )
          collected.append(last_state)
          inputs = outputs

      merged_states = concat_states(
          collected, self.num_directions == 2, self.state_components
      )
      return outputs, merged_states
  def extra_repr(self):
      """
      Build the extra representation string of this layer.

      ``input_size`` and ``hidden_size`` are always shown. ``num_layers``,
      ``time_major`` and ``dropout`` are appended only when they differ from
      1, False and 0 respectively. Values are taken from the instance
      ``__dict__``.
      """
      pieces = ['{input_size}, {hidden_size}']
      if self.num_layers != 1:
          pieces.append('num_layers={num_layers}')
      if self.time_major is not False:
          pieces.append('time_major={time_major}')
      if self.dropout != 0:
          pieces.append('dropout={dropout}')
      return ', '.join(pieces).format(**vars(self))
  class SimpleRNN(RNNBase):
      r"""
      Multilayer Elman network (SimpleRNN).

      The network consumes input sequences together with initial states and
      produces the output sequences and the final states.

      Every layer of the SimpleRNN transforms its input sequence and initial
      state in the following way: at each step it receives the step input
      (:math:`x_{t}`) and the previous state (:math:`h_{t-1}`), and it emits
      the step output (:math:`y_{t}`) and the new state (:math:`h_{t}`).

      .. math::

          h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})

          y_{t} & = h_{t}

      where :math:`act` is for :attr:`activation`.

      Using key word arguments to construct is recommended.

      Parameters:
          input_size (int): Size of the input :math:`x` of the first layer's cell.
          hidden_size (int): Size of the hidden state :math:`h` of each layer's cell.
          num_layers (int, optional): Number of recurrent layers. Defaults to 1.
          direction (str, optional): Direction of the network, either "forward"
              or "bidirect" (or "bidirectional"). With "bidirect", the forward
              and backward outputs are merged by concatenation.
              Defaults to "forward".
          time_major (bool, optional): Whether the first dimension of the input
              is the time axis. If True, the Tensor shape is
              [time_steps,batch_size,input_size], otherwise
              [batch_size, time_steps,input_size]. `time_steps` is the length
              of the input sequence. Defaults to False.
          dropout (float, optional): Dropout probability, in the range 0 to 1.
              Dropout is applied to the input of every layer except the first.
              Defaults to 0.
          activation (str, optional): Activation of each SimpleRNN cell, either
              `tanh` or `relu`. Defaults to `tanh`.
          weight_ih_attr (ParamAttr, optional): Parameter attribute for
              `weight_ih` of each cell. Defaults to None.
          weight_hh_attr (ParamAttr, optional): Parameter attribute for
              `weight_hh` of each cell. Defaults to None.
          bias_ih_attr (ParamAttr, optional): Parameter attribute for
              `bias_ih` of each cell. Defaults to None.
          bias_hh_attr (ParamAttr, optional): Parameter attribute for
              `bias_hh` of each cell. Defaults to None.
          name (str, optional): Name for the operation (optional, default is
              None). For more information, please refer to :ref:`api_guide_Name`.

      Raises:
          ValueError: If `activation` is neither "tanh" nor "relu".

      Inputs:
          - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`. `time_steps` means the length of the input sequence.
          - **initial_states** (Tensor, optional): the initial state. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
          - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index is not less than the valid length are treated as padding.

      Returns:

          - **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, else, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. `time_steps` means the length of the output sequence.

          - **final_states** (Tensor): final states. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1.

      Variables:
          - **weight_ih_l[k]**: the learnable input-hidden weights of the k-th layer. If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, the shape is `[hidden_size, num_directions * hidden_size]`.
          - **weight_hh_l[k]**: the learnable hidden-hidden weights of the k-th layer, with shape `[hidden_size, hidden_size]`.
          - **bias_ih_l[k]**: the learnable input-hidden bias of the k-th layer, with shape `[hidden_size]`.
          - **bias_hh_l[k]**: the learnable hidden-hidden bias of the k-th layer, with shape `[hidden_size]`.

      Examples:

          .. code-block:: python

              >>> import paddle

              >>> rnn = paddle.nn.SimpleRNN(16, 32, 2)

              >>> x = paddle.randn((4, 23, 16))
              >>> prev_h = paddle.randn((2, 4, 32))
              >>> y, h = rnn(x, prev_h)

              >>> print(y.shape)
              [4, 23, 32]
              >>> print(h.shape)
              [2, 4, 32]

      """

      def __init__(
          self,
          input_size,
          hidden_size,
          num_layers=1,
          direction="forward",
          time_major=False,
          dropout=0.0,
          activation="tanh",
          weight_ih_attr=None,
          weight_hh_attr=None,
          bias_ih_attr=None,
          bias_hh_attr=None,
          name=None,
      ):
          # Translate the activation name into the mode string of the base
          # class; anything other than "tanh"/"relu" is rejected up front.
          is_tanh = activation == "tanh"
          if not (is_tanh or activation == "relu"):
              raise ValueError(f"Unknown activation '{activation}'")
          mode = "RNN_TANH" if is_tanh else "RNN_RELU"

          # Stored before the base constructor runs, because
          # RNNBase.__init__ reads `self.activation` for modes it does not
          # recognise.
          self.activation = activation

          # `name` is accepted but not forwarded to the base class.
          super().__init__(
              mode,
              input_size,
              hidden_size,
              num_layers,
              direction,
              time_major,
              dropout,
              weight_ih_attr,
              weight_hh_attr,
              bias_ih_attr,
              bias_hh_attr,
              0,  # proj_size
          )
  class LSTM(RNNBase):
      r"""
      Multilayer LSTM.

      The network consumes a sequence together with an initial state and
      produces the output sequences and the final states.

      Every layer of the LSTM transforms its input sequence and initial
      states in the following way: at each step it receives the step input
      (:math:`x_{t}`) and the previous states (:math:`h_{t-1}, c_{t-1}`),
      and it emits the step output (:math:`y_{t}`) and the new states
      (:math:`h_{t}, c_{t}`).

      .. math::

          i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})

          f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf})

          o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho})

          \widetilde{c}_{t} & = \tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})

          c_{t} & = f_{t} * c_{t-1} + i_{t} * \widetilde{c}_{t}

          h_{t} & = o_{t} * \tanh(c_{t})

          y_{t} & = h_{t}

      If `proj_size` is specified, the dimension of hidden state :math:`h_{t}` will be projected to `proj_size`:

      .. math::

          h_{t} = h_{t}W_{proj\_size}

      where :math:`\sigma` is the sigmoid function, and * is the elementwise
      multiplication operator.

      Using key word arguments to construct is recommended.

      Parameters:
          input_size (int): Size of the input :math:`x` of the first layer's cell.
          hidden_size (int): Size of the hidden state :math:`h` of each layer's cell.
          num_layers (int, optional): Number of recurrent layers. Defaults to 1.
          direction (str, optional): Direction of the network, either "forward"
              or "bidirect" (or "bidirectional"). With "bidirect", the forward
              and backward outputs are merged by concatenation.
              Defaults to "forward".
          time_major (bool, optional): Whether the first dimension of the input
              is the time axis. If True, the Tensor shape is
              [time_steps,batch_size,input_size], otherwise
              [batch_size, time_steps,input_size]. `time_steps` is the length
              of the input sequence. Defaults to False.
          dropout (float, optional): Dropout probability, in the range 0 to 1.
              Dropout is applied to the input of every layer except the first.
              Defaults to 0.
          weight_ih_attr (ParamAttr, optional): Parameter attribute for
              `weight_ih` of each cell. Default: None.
          weight_hh_attr (ParamAttr, optional): Parameter attribute for
              `weight_hh` of each cell. Default: None.
          bias_ih_attr (ParamAttr, optional): Parameter attribute for
              `bias_ih` of each cell. Default: None.
          bias_hh_attr (ParamAttr, optional): Parameter attribute for
              `bias_hh` of each cell. Default: None.
          proj_size (int, optional): If specified, the output hidden state of each layer
              will be projected to `proj_size`. `proj_size` must be smaller than `hidden_size`.
              Default: 0.
          name (str, optional): Name for the operation (optional, default is
              None). For more information, please refer to :ref:`api_guide_Name`.

      Inputs:
          - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`. `time_steps` means the length of the input sequence.
          - **initial_states** (list|tuple, optional): the initial state, a list/tuple of (h, c), the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
          - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index is not less than the valid length are treated as padding.

      Returns:

          - **outputs** (Tensor). The output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`. If `proj_size` is specified, shape will be `[time_steps, batch_size, num_directions * proj_size]`. If `time_major` is False, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. `time_steps` means the length of the output sequence.

          - **final_states** (tuple). The final state, a tuple of two tensors, h and c. The shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If `proj_size` is specified, the last dimension of h will be proj_size.
            Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1.

      Variables:
          - **weight_ih_l[k]**: the learnable input-hidden weights of the k-th layer. If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, the shape is `[hidden_size, num_directions * hidden_size]`.
          - **weight_hh_l[k]**: the learnable hidden-hidden weights of the k-th layer, with shape `[hidden_size, hidden_size]`.
          - **bias_ih_l[k]**: the learnable input-hidden bias of the k-th layer, with shape `[hidden_size]`.
          - **bias_hh_l[k]**: the learnable hidden-hidden bias of the k-th layer, with shape `[hidden_size]`.

      Examples:

          .. code-block:: python

              >>> import paddle

              >>> rnn = paddle.nn.LSTM(16, 32, 2)

              >>> x = paddle.randn((4, 23, 16))
              >>> prev_h = paddle.randn((2, 4, 32))
              >>> prev_c = paddle.randn((2, 4, 32))
              >>> y, (h, c) = rnn(x, (prev_h, prev_c))

              >>> print(y.shape)
              [4, 23, 32]
              >>> print(h.shape)
              [2, 4, 32]
              >>> print(c.shape)
              [2, 4, 32]

      """

      def __init__(
          self,
          input_size,
          hidden_size,
          num_layers=1,
          direction="forward",
          time_major=False,
          dropout=0.0,
          weight_ih_attr=None,
          weight_hh_attr=None,
          bias_ih_attr=None,
          bias_hh_attr=None,
          proj_size=0,
          name=None,
      ):
          # The base class does all the construction work; the mode is
          # fixed to "LSTM". `name` is accepted but not forwarded.
          mode = "LSTM"
          super().__init__(
              mode,
              input_size,
              hidden_size,
              num_layers,
              direction,
              time_major,
              dropout,
              weight_ih_attr,
              weight_hh_attr,
              bias_ih_attr,
              bias_hh_attr,
              proj_size,
          )
  class GRU(RNNBase):
      r"""
      Multilayer GRU.

      The network consumes an input sequence together with initial states and
      produces the output sequences and the final states.

      Every layer of the GRU transforms its input sequence and initial state
      in the following way: at each step it receives the step input
      (:math:`x_{t}`) and the previous state (:math:`h_{t-1}`), and it emits
      the step output (:math:`y_{t}`) and the new state (:math:`h_{t}`).

      .. math::

          r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})

          z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})

          \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))

          h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}

          y_{t} & = h_{t}

      where :math:`\sigma` is the sigmoid function, and * is the elementwise
      multiplication operator.

      Using key word arguments to construct is recommended.

      Parameters:
          input_size (int): Size of the input :math:`x` of the first layer's cell.
          hidden_size (int): Size of the hidden state :math:`h` of each layer's cell.
          num_layers (int, optional): Number of recurrent layers. Defaults to 1.
          direction (str, optional): Direction of the network, either "forward"
              or "bidirect" (or "bidirectional"). With "bidirect", the forward
              and backward outputs are merged by concatenation.
              Defaults to "forward".
          time_major (bool, optional): Whether the first dimension of the input
              is the time axis. If True, the Tensor shape is
              [time_steps,batch_size,input_size], otherwise
              [batch_size, time_steps,input_size]. `time_steps` is the length
              of the input sequence. Defaults to False.
          dropout (float, optional): Dropout probability, in the range 0 to 1.
              Dropout is applied to the input of every layer except the first.
              Defaults to 0.
          weight_ih_attr (ParamAttr, optional): Parameter attribute for
              `weight_ih` of each cell. Default: None.
          weight_hh_attr (ParamAttr, optional): Parameter attribute for
              `weight_hh` of each cell. Default: None.
          bias_ih_attr (ParamAttr, optional): Parameter attribute for
              `bias_ih` of each cell. Default: None.
          bias_hh_attr (ParamAttr, optional): Parameter attribute for
              `bias_hh` of each cell. Default: None.
          name (str, optional): Name for the operation (optional, default is
              None). For more information, please refer to :ref:`api_guide_Name`.

      Inputs:
          - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`. `time_steps` means the length of the input sequence.
          - **initial_states** (Tensor, optional): the initial state. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. Defaults to None.
          - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index is not less than the valid length are treated as padding.

      Returns:

          - **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, else, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. `time_steps` means the length of the output sequence.

          - **final_states** (Tensor): final states. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1.

      Variables:
          - **weight_ih_l[k]**: the learnable input-hidden weights of the k-th layer. If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, the shape is `[hidden_size, num_directions * hidden_size]`.
          - **weight_hh_l[k]**: the learnable hidden-hidden weights of the k-th layer, with shape `[hidden_size, hidden_size]`.
          - **bias_ih_l[k]**: the learnable input-hidden bias of the k-th layer, with shape `[hidden_size]`.
          - **bias_hh_l[k]**: the learnable hidden-hidden bias of the k-th layer, with shape `[hidden_size]`.

      Examples:

          .. code-block:: python

              >>> import paddle

              >>> rnn = paddle.nn.GRU(16, 32, 2)

              >>> x = paddle.randn((4, 23, 16))
              >>> prev_h = paddle.randn((2, 4, 32))
              >>> y, h = rnn(x, prev_h)

              >>> print(y.shape)
              [4, 23, 32]
              >>> print(h.shape)
              [2, 4, 32]

      """

      def __init__(
          self,
          input_size,
          hidden_size,
          num_layers=1,
          direction="forward",
          time_major=False,
          dropout=0.0,
          weight_ih_attr=None,
          weight_hh_attr=None,
          bias_ih_attr=None,
          bias_hh_attr=None,
          name=None,
      ):
          # The base class does all the construction work; the mode is
          # fixed to "GRU" and no projection is used. `name` is accepted but
          # not forwarded.
          mode = "GRU"
          super().__init__(
              mode,
              input_size,
              hidden_size,
              num_layers,
              direction,
              time_major,
              dropout,
              weight_ih_attr,
              weight_hh_attr,
              bias_ih_attr,
              bias_hh_attr,
              0,  # proj_size
          )