modeling_tf_outputs.py 55 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990
  1. # Copyright 2020 The HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from __future__ import annotations
  15. import warnings
  16. from dataclasses import dataclass
  17. import tensorflow as tf
  18. from .utils import ModelOutput
  19. @dataclass
  20. class TFBaseModelOutput(ModelOutput):
  21. """
  22. Base class for model's outputs, with potential hidden states and attentions.
  23. Args:
  24. last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  25. Sequence of hidden-states at the output of the last layer of the model.
  26. hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  27. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  28. `(batch_size, sequence_length, hidden_size)`.
  29. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  30. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  31. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  32. sequence_length)`.
  33. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  34. heads.
  35. """
  36. last_hidden_state: tf.Tensor | None = None
  37. hidden_states: tuple[tf.Tensor] | None = None
  38. attentions: tuple[tf.Tensor] | None = None
  39. @dataclass
  40. class TFBaseModelOutputWithNoAttention(ModelOutput):
  41. """
  42. Base class for model's outputs, with potential hidden states.
  43. Args:
  44. last_hidden_state (`tf.Tensor` shape `(batch_size, num_channels, height, width)`):
  45. Sequence of hidden-states at the output of the last layer of the model.
  46. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  47. Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
  48. the output of each layer) of shape `(batch_size, num_channels, height, width)`.
  49. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  50. """
  51. last_hidden_state: tf.Tensor | None = None
  52. hidden_states: tuple[tf.Tensor, ...] | None = None
  53. @dataclass
  54. class TFBaseModelOutputWithPooling(ModelOutput):
  55. """
  56. Base class for model's outputs that also contains a pooling of the last hidden states.
  57. Args:
  58. last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  59. Sequence of hidden-states at the output of the last layer of the model.
  60. pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
  61. Last layer hidden-state of the first token of the sequence (classification token) further processed by a
  62. Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
  63. prediction (classification) objective during pretraining.
  64. This output is usually *not* a good summary of the semantic content of the input, you're often better with
  65. averaging or pooling the sequence of hidden-states for the whole input sequence.
  66. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  67. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  68. `(batch_size, sequence_length, hidden_size)`.
  69. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  70. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  71. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  72. sequence_length)`.
  73. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  74. heads.
  75. """
  76. last_hidden_state: tf.Tensor | None = None
  77. pooler_output: tf.Tensor | None = None
  78. hidden_states: tuple[tf.Tensor] | None = None
  79. attentions: tuple[tf.Tensor] | None = None
  80. @dataclass
  81. class TFBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
  82. """
  83. Base class for model's outputs that also contains a pooling of the last hidden states.
  84. Args:
  85. last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
  86. Sequence of hidden-states at the output of the last layer of the model.
  87. pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
  88. Last layer hidden-state after a pooling operation on the spatial dimensions.
  89. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  90. Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
  91. the output of each layer) of shape `(batch_size, num_channels, height, width)`.
  92. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  93. """
  94. last_hidden_state: tf.Tensor | None = None
  95. pooler_output: tf.Tensor | None = None
  96. hidden_states: tuple[tf.Tensor, ...] | None = None
  97. @dataclass
  98. class TFBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
  99. """
  100. Base class for model's outputs that also contains a pooling of the last hidden states.
  101. Args:
  102. last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  103. Sequence of hidden-states at the output of the last layer of the model.
  104. pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
  105. Last layer hidden-state of the first token of the sequence (classification token) further processed by a
  106. Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
  107. prediction (classification) objective during pretraining.
  108. This output is usually *not* a good summary of the semantic content of the input, you're often better with
  109. averaging or pooling the sequence of hidden-states for the whole input sequence.
  110. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  111. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  112. sequence_length, embed_size_per_head)`).
  113. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
  114. `past_key_values` input) to speed up sequential decoding.
  115. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  116. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  117. `(batch_size, sequence_length, hidden_size)`.
  118. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  119. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  120. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  121. sequence_length)`.
  122. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  123. heads.
  124. cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  125. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  126. sequence_length)`.
  127. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  128. weighted average in the cross-attention heads.
  129. """
  130. last_hidden_state: tf.Tensor | None = None
  131. pooler_output: tf.Tensor | None = None
  132. past_key_values: list[tf.Tensor] | None = None
  133. hidden_states: tuple[tf.Tensor] | None = None
  134. attentions: tuple[tf.Tensor] | None = None
  135. cross_attentions: tuple[tf.Tensor] | None = None
  136. @dataclass
  137. class TFBaseModelOutputWithPast(ModelOutput):
  138. """
  139. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  140. Args:
  141. last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  142. Sequence of hidden-states at the output of the last layer of the model.
  143. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  144. hidden_size)` is output.
  145. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  146. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  147. sequence_length, embed_size_per_head)`).
  148. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
  149. `past_key_values` input) to speed up sequential decoding.
  150. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  151. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  152. `(batch_size, sequence_length, hidden_size)`.
  153. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  154. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  155. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  156. sequence_length)`.
  157. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  158. heads.
  159. """
  160. last_hidden_state: tf.Tensor | None = None
  161. past_key_values: list[tf.Tensor] | None = None
  162. hidden_states: tuple[tf.Tensor] | None = None
  163. attentions: tuple[tf.Tensor] | None = None
  164. @dataclass
  165. class TFBaseModelOutputWithCrossAttentions(ModelOutput):
  166. """
  167. Base class for model's outputs, with potential hidden states and attentions.
  168. Args:
  169. last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  170. Sequence of hidden-states at the output of the last layer of the model.
  171. hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  172. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  173. `(batch_size, sequence_length, hidden_size)`.
  174. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  175. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  176. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  177. sequence_length)`.
  178. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  179. heads.
  180. cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  181. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  182. sequence_length)`.
  183. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  184. weighted average in the cross-attention heads.
  185. """
  186. last_hidden_state: tf.Tensor | None = None
  187. hidden_states: tuple[tf.Tensor] | None = None
  188. attentions: tuple[tf.Tensor] | None = None
  189. cross_attentions: tuple[tf.Tensor] | None = None
  190. @dataclass
  191. class TFBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
  192. """
  193. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  194. Args:
  195. last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  196. Sequence of hidden-states at the output of the last layer of the model.
  197. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  198. hidden_size)` is output.
  199. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  200. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  201. sequence_length, embed_size_per_head)`).
  202. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
  203. `past_key_values` input) to speed up sequential decoding.
  204. hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  205. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  206. `(batch_size, sequence_length, hidden_size)`.
  207. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  208. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  209. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  210. sequence_length)`.
  211. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  212. heads.
  213. cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  214. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  215. sequence_length)`.
  216. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  217. weighted average in the cross-attention heads.
  218. """
  219. last_hidden_state: tf.Tensor | None = None
  220. past_key_values: list[tf.Tensor] | None = None
  221. hidden_states: tuple[tf.Tensor] | None = None
  222. attentions: tuple[tf.Tensor] | None = None
  223. cross_attentions: tuple[tf.Tensor] | None = None
  224. @dataclass
  225. class TFSeq2SeqModelOutput(ModelOutput):
  226. """
  227. Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
  228. decoding.
  229. Args:
  230. last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  231. Sequence of hidden-states at the output of the last layer of the decoder of the model.
  232. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  233. hidden_size)` is output.
  234. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  235. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  236. sequence_length, embed_size_per_head)`).
  237. Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
  238. used (see `past_key_values` input) to speed up sequential decoding.
  239. decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  240. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  241. `(batch_size, sequence_length, hidden_size)`.
  242. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
  243. decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  244. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  245. sequence_length)`.
  246. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  247. self-attention heads.
  248. cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  249. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  250. sequence_length)`.
  251. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  252. weighted average in the cross-attention heads.
  253. encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  254. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  255. encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  256. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  257. `(batch_size, sequence_length, hidden_size)`.
  258. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
  259. encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  260. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  261. sequence_length)`.
  262. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  263. self-attention heads.
  264. """
  265. last_hidden_state: tf.Tensor | None = None
  266. past_key_values: list[tf.Tensor] | None = None
  267. decoder_hidden_states: tuple[tf.Tensor] | None = None
  268. decoder_attentions: tuple[tf.Tensor] | None = None
  269. cross_attentions: tuple[tf.Tensor] | None = None
  270. encoder_last_hidden_state: tf.Tensor | None = None
  271. encoder_hidden_states: tuple[tf.Tensor] | None = None
  272. encoder_attentions: tuple[tf.Tensor] | None = None
  273. @dataclass
  274. class TFCausalLMOutput(ModelOutput):
  275. """
  276. Base class for causal language model (or autoregressive) outputs.
  277. Args:
  278. loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
  279. Language modeling loss (for next-token prediction).
  280. logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  281. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  282. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  283. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  284. `(batch_size, sequence_length, hidden_size)`.
  285. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  286. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  287. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  288. sequence_length)`.
  289. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  290. heads.
  291. """
  292. loss: tf.Tensor | None = None
  293. logits: tf.Tensor | None = None
  294. hidden_states: tuple[tf.Tensor] | None = None
  295. attentions: tuple[tf.Tensor] | None = None
  296. @dataclass
  297. class TFCausalLMOutputWithPast(ModelOutput):
  298. """
  299. Base class for causal language model (or autoregressive) outputs.
  300. Args:
  301. loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
  302. Language modeling loss (for next-token prediction).
  303. logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  304. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  305. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  306. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  307. sequence_length, embed_size_per_head)`).
  308. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
  309. `past_key_values` input) to speed up sequential decoding.
  310. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  311. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  312. `(batch_size, sequence_length, hidden_size)`.
  313. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  314. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  315. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  316. sequence_length)`.
  317. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  318. heads.
  319. """
  320. loss: tf.Tensor | None = None
  321. logits: tf.Tensor | None = None
  322. past_key_values: list[tf.Tensor] | None = None
  323. hidden_states: tuple[tf.Tensor] | None = None
  324. attentions: tuple[tf.Tensor] | None = None
  325. @dataclass
  326. class TFCausalLMOutputWithCrossAttentions(ModelOutput):
  327. """
  328. Base class for causal language model (or autoregressive) outputs.
  329. Args:
  330. loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
  331. Language modeling loss (for next-token prediction).
  332. logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  333. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  334. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  335. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  336. `(batch_size, sequence_length, hidden_size)`.
  337. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  338. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  339. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  340. sequence_length)`.
  341. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  342. heads.
  343. cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  344. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  345. sequence_length)`.
  346. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  347. weighted average in the cross-attention heads.
  348. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  349. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  350. sequence_length, embed_size_per_head)`).
  351. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
  352. `past_key_values` input) to speed up sequential decoding.
  353. """
  354. loss: tf.Tensor | None = None
  355. logits: tf.Tensor | None = None
  356. past_key_values: list[tf.Tensor] | None = None
  357. hidden_states: tuple[tf.Tensor] | None = None
  358. attentions: tuple[tf.Tensor] | None = None
  359. cross_attentions: tuple[tf.Tensor] | None = None
  360. @dataclass
  361. class TFMaskedLMOutput(ModelOutput):
  362. """
  363. Base class for masked language models outputs.
  364. Args:
  365. loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
  366. Masked language modeling (MLM) loss.
  367. logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  368. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  369. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  370. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  371. `(batch_size, sequence_length, hidden_size)`.
  372. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  373. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  374. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  375. sequence_length)`.
  376. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  377. heads.
  378. """
  379. loss: tf.Tensor | None = None
  380. logits: tf.Tensor | None = None
  381. hidden_states: tuple[tf.Tensor] | None = None
  382. attentions: tuple[tf.Tensor] | None = None
  383. @dataclass
  384. class TFSeq2SeqLMOutput(ModelOutput):
  385. """
  386. Base class for sequence-to-sequence language models outputs.
  387. Args:
  388. loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
  389. Language modeling loss.
  390. logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  391. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  392. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  393. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  394. sequence_length, embed_size_per_head)`).
  395. Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
  396. used (see `past_key_values` input) to speed up sequential decoding.
  397. decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  398. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  399. `(batch_size, sequence_length, hidden_size)`.
  400. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
  401. decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  402. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  403. sequence_length)`.
  404. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  405. self-attention heads.
  406. cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  407. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  408. sequence_length)`.
  409. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  410. weighted average in the cross-attention heads.
  411. encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  412. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  413. encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  414. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  415. `(batch_size, sequence_length, hidden_size)`.
  416. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
  417. encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  418. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  419. sequence_length)`.
  420. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  421. self-attention heads.
  422. """
  423. loss: tf.Tensor | None = None
  424. logits: tf.Tensor | None = None
  425. past_key_values: list[tf.Tensor] | None = None
  426. decoder_hidden_states: tuple[tf.Tensor] | None = None
  427. decoder_attentions: tuple[tf.Tensor] | None = None
  428. cross_attentions: tuple[tf.Tensor] | None = None
  429. encoder_last_hidden_state: tf.Tensor | None = None
  430. encoder_hidden_states: tuple[tf.Tensor] | None = None
  431. encoder_attentions: tuple[tf.Tensor] | None = None
  432. @dataclass
  433. class TFNextSentencePredictorOutput(ModelOutput):
  434. """
  435. Base class for outputs of models predicting if two sentences are consecutive or not.
  436. Args:
  437. loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `next_sentence_label` is provided):
  438. Next sentence prediction loss.
  439. logits (`tf.Tensor` of shape `(batch_size, 2)`):
  440. Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
  441. before SoftMax).
  442. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  443. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  444. `(batch_size, sequence_length, hidden_size)`.
  445. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  446. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  447. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  448. sequence_length)`.
  449. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  450. heads.
  451. """
  452. loss: tf.Tensor | None = None
  453. logits: tf.Tensor | None = None
  454. hidden_states: tuple[tf.Tensor] | None = None
  455. attentions: tuple[tf.Tensor] | None = None
  456. @dataclass
  457. class TFSequenceClassifierOutput(ModelOutput):
  458. """
  459. Base class for outputs of sentence classification models.
  460. Args:
  461. loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
  462. Classification (or regression if config.num_labels==1) loss.
  463. logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
  464. Classification (or regression if config.num_labels==1) scores (before SoftMax).
  465. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  466. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  467. `(batch_size, sequence_length, hidden_size)`.
  468. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  469. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  470. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  471. sequence_length)`.
  472. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  473. heads.
  474. """
  475. loss: tf.Tensor | None = None
  476. logits: tf.Tensor | None = None
  477. hidden_states: tuple[tf.Tensor] | None = None
  478. attentions: tuple[tf.Tensor] | None = None
  479. @dataclass
  480. class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
  481. """
  482. Base class for outputs of sequence-to-sequence sentence classification models.
  483. Args:
  484. loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided):
  485. Classification (or regression if config.num_labels==1) loss.
  486. logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
  487. Classification (or regression if config.num_labels==1) scores (before SoftMax).
  488. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  489. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  490. sequence_length, embed_size_per_head)`).
  491. Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
  492. used (see `past_key_values` input) to speed up sequential decoding.
  493. decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  494. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  495. `(batch_size, sequence_length, hidden_size)`.
  496. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
  497. decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  498. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  499. sequence_length)`.
  500. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  501. self-attention heads.
  502. cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  503. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  504. sequence_length)`
  505. encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  506. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  507. encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  508. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  509. `(batch_size, sequence_length, hidden_size)`.
  510. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
  511. encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  512. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  513. sequence_length)`.
  514. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  515. self-attention heads.
  516. """
  517. loss: tf.Tensor | None = None
  518. logits: tf.Tensor | None = None
  519. past_key_values: list[tf.Tensor] | None = None
  520. decoder_hidden_states: tuple[tf.Tensor] | None = None
  521. decoder_attentions: tuple[tf.Tensor] | None = None
  522. cross_attentions: tuple[tf.Tensor] | None = None
  523. encoder_last_hidden_state: tf.Tensor | None = None
  524. encoder_hidden_states: tuple[tf.Tensor] | None = None
  525. encoder_attentions: tuple[tf.Tensor] | None = None
  526. @dataclass
  527. class TFSemanticSegmenterOutput(ModelOutput):
  528. """
  529. Base class for outputs of semantic segmentation models.
  530. Args:
  531. loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  532. Classification (or regression if config.num_labels==1) loss.
  533. logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
  534. Classification scores for each pixel.
  535. <Tip warning={true}>
  536. The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
  537. to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
  538. original image size as post-processing. You should always check your logits shape and resize as needed.
  539. </Tip>
  540. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  541. Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
  542. the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
  543. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  544. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  545. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
  546. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  547. heads.
  548. """
  549. loss: tf.Tensor | None = None
  550. logits: tf.Tensor | None = None
  551. hidden_states: tuple[tf.Tensor] | None = None
  552. attentions: tuple[tf.Tensor] | None = None
  553. @dataclass
  554. class TFSemanticSegmenterOutputWithNoAttention(ModelOutput):
  555. """
  556. Base class for outputs of semantic segmentation models that do not output attention scores.
  557. Args:
  558. loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  559. Classification (or regression if config.num_labels==1) loss.
  560. logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
  561. Classification scores for each pixel.
  562. <Tip warning={true}>
  563. The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
  564. to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
  565. original image size as post-processing. You should always check your logits shape and resize as needed.
  566. </Tip>
  567. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  568. Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
  569. the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
  570. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  571. """
  572. loss: tf.Tensor | None = None
  573. logits: tf.Tensor | None = None
  574. hidden_states: tuple[tf.Tensor] | None = None
  575. @dataclass
  576. class TFImageClassifierOutput(ModelOutput):
  577. """
  578. Base class for outputs of image classification models.
  579. Args:
  580. loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  581. Classification (or regression if config.num_labels==1) loss.
  582. logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
  583. Classification (or regression if config.num_labels==1) scores (before SoftMax).
  584. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  585. Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
  586. the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
  587. feature maps) of the model at the output of each stage.
  588. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  589. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
  590. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  591. heads.
  592. """
  593. loss: tf.Tensor | None = None
  594. logits: tf.Tensor | None = None
  595. hidden_states: tuple[tf.Tensor] | None = None
  596. attentions: tuple[tf.Tensor] | None = None
  597. @dataclass
  598. class TFMultipleChoiceModelOutput(ModelOutput):
  599. """
  600. Base class for outputs of multiple choice models.
  601. Args:
  602. loss (`tf.Tensor` of shape *(batch_size, )*, *optional*, returned when `labels` is provided):
  603. Classification loss.
  604. logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
  605. *num_choices* is the second dimension of the input tensors. (see *input_ids* above).
  606. Classification scores (before SoftMax).
  607. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  608. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  609. `(batch_size, sequence_length, hidden_size)`.
  610. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  611. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  612. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  613. sequence_length)`.
  614. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  615. heads.
  616. """
  617. loss: tf.Tensor | None = None
  618. logits: tf.Tensor | None = None
  619. hidden_states: tuple[tf.Tensor] | None = None
  620. attentions: tuple[tf.Tensor] | None = None
  621. @dataclass
  622. class TFTokenClassifierOutput(ModelOutput):
  623. """
  624. Base class for outputs of token classification models.
  625. Args:
  626. loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of unmasked labels, returned when `labels` is provided) :
  627. Classification loss.
  628. logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
  629. Classification scores (before SoftMax).
  630. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  631. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  632. `(batch_size, sequence_length, hidden_size)`.
  633. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  634. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  635. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  636. sequence_length)`.
  637. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  638. heads.
  639. """
  640. loss: tf.Tensor | None = None
  641. logits: tf.Tensor | None = None
  642. hidden_states: tuple[tf.Tensor] | None = None
  643. attentions: tuple[tf.Tensor] | None = None
  644. @dataclass
  645. class TFQuestionAnsweringModelOutput(ModelOutput):
  646. """
  647. Base class for outputs of question answering models.
  648. Args:
  649. loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `start_positions` and `end_positions` are provided):
  650. Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
  651. start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
  652. Span-start scores (before SoftMax).
  653. end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
  654. Span-end scores (before SoftMax).
  655. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  656. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  657. `(batch_size, sequence_length, hidden_size)`.
  658. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  659. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  660. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  661. sequence_length)`.
  662. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  663. heads.
  664. """
  665. loss: tf.Tensor | None = None
  666. start_logits: tf.Tensor | None = None
  667. end_logits: tf.Tensor | None = None
  668. hidden_states: tuple[tf.Tensor] | None = None
  669. attentions: tuple[tf.Tensor] | None = None
  670. @dataclass
  671. class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
  672. """
  673. Base class for outputs of sequence-to-sequence question answering models.
  674. Args:
  675. loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  676. Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
  677. start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
  678. Span-start scores (before SoftMax).
  679. end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
  680. Span-end scores (before SoftMax).
  681. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  682. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  683. sequence_length, embed_size_per_head)`).
  684. Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
  685. used (see `past_key_values` input) to speed up sequential decoding.
  686. decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  687. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  688. `(batch_size, sequence_length, hidden_size)`.
  689. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
  690. decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  691. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  692. sequence_length)`.
  693. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  694. self-attention heads.
  695. encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  696. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  697. encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  698. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  699. `(batch_size, sequence_length, hidden_size)`.
  700. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
  701. encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  702. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  703. sequence_length)`.
  704. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  705. self-attention heads.
  706. """
  707. loss: tf.Tensor | None = None
  708. start_logits: tf.Tensor | None = None
  709. end_logits: tf.Tensor | None = None
  710. past_key_values: list[tf.Tensor] | None = None
  711. decoder_hidden_states: tuple[tf.Tensor] | None = None
  712. decoder_attentions: tuple[tf.Tensor] | None = None
  713. encoder_last_hidden_state: tf.Tensor | None = None
  714. encoder_hidden_states: tuple[tf.Tensor] | None = None
  715. encoder_attentions: tuple[tf.Tensor] | None = None
  716. @dataclass
  717. class TFSequenceClassifierOutputWithPast(ModelOutput):
  718. """
  719. Base class for outputs of sentence classification models.
  720. Args:
  721. loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
  722. Classification (or regression if config.num_labels==1) loss.
  723. logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
  724. Classification (or regression if config.num_labels==1) scores (before SoftMax).
  725. past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  726. List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
  727. sequence_length, embed_size_per_head)`).
  728. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
  729. `past_key_values` input) to speed up sequential decoding.
  730. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  731. Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
  732. `(batch_size, sequence_length, hidden_size)`.
  733. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
  734. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  735. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  736. sequence_length)`.
  737. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  738. heads.
  739. """
  740. loss: tf.Tensor | None = None
  741. logits: tf.Tensor | None = None
  742. past_key_values: list[tf.Tensor] | None = None
  743. hidden_states: tuple[tf.Tensor] | None = None
  744. attentions: tuple[tf.Tensor] | None = None
  745. @dataclass
  746. class TFImageClassifierOutputWithNoAttention(ModelOutput):
  747. """
  748. Base class for outputs of image classification models.
  749. Args:
  750. loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  751. Classification (or regression if config.num_labels==1) loss.
  752. logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
  753. Classification (or regression if config.num_labels==1) scores (before SoftMax).
  754. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  755. Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
  756. the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called
  757. feature maps) of the model at the output of each stage.
  758. """
  759. loss: tf.Tensor | None = None
  760. logits: tf.Tensor | None = None
  761. hidden_states: tuple[tf.Tensor, ...] | None = None
  762. @dataclass
  763. class TFMaskedImageModelingOutput(ModelOutput):
  764. """
  765. Base class for outputs of masked image completion / in-painting models.
  766. Args:
  767. loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
  768. Reconstruction loss.
  769. reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
  770. Reconstructed / completed images.
  771. hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when
  772. `config.output_hidden_states=True`):
  773. Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
  774. the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
  775. feature maps) of the model at the output of each stage.
  776. attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when
  777. `config.output_attentions=True`):
  778. Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
  779. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  780. heads.
  781. """
  782. loss: tf.Tensor | None = None
  783. reconstruction: tf.Tensor | None = None
  784. hidden_states: tuple[tf.Tensor] | None = None
  785. attentions: tuple[tf.Tensor] | None = None
  786. @property
  787. def logits(self):
  788. warnings.warn(
  789. "logits attribute is deprecated and will be removed in version 5 of Transformers."
  790. " Please use the reconstruction attribute to retrieve the final output instead.",
  791. FutureWarning,
  792. )
  793. return self.reconstruction