configuration_funnel.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. # coding=utf-8
  2. # Copyright 2020, Hugging Face
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Funnel Transformer model configuration"""
  16. from ...configuration_utils import PretrainedConfig
  17. from ...utils import logging
  # Module-level logger, obtained from the package's `logging` utility and keyed by this module's name.
  # NOTE(review): it is not referenced anywhere else in the visible part of this file.
  18. logger = logging.get_logger(__name__)
  19. class FunnelConfig(PretrainedConfig):
  20. r"""
  21. This is the configuration class to store the configuration of a [`FunnelModel`] or a [`TFBertModel`]. It is used to
  22. instantiate a Funnel Transformer model according to the specified arguments, defining the model architecture.
  23. Instantiating a configuration with the defaults will yield a similar configuration to that of the Funnel
  24. Transformer [funnel-transformer/small](https://huggingface.co/funnel-transformer/small) architecture.
  25. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  26. documentation from [`PretrainedConfig`] for more information.
  27. Args:
  28. vocab_size (`int`, *optional*, defaults to 30522):
  29. Vocabulary size of the Funnel transformer. Defines the number of different tokens that can be represented
  30. by the `inputs_ids` passed when calling [`FunnelModel`] or [`TFFunnelModel`].
  31. block_sizes (`list[int]`, *optional*, defaults to `[4, 4, 4]`):
  32. The sizes of the blocks used in the model.
  33. block_repeats (`list[int]`, *optional*):
  34. If passed along, each layer of each block is repeated the number of times indicated.
  35. num_decoder_layers (`int`, *optional*, defaults to 2):
  36. The number of layers in the decoder (when not using the base model).
  37. d_model (`int`, *optional*, defaults to 768):
  38. Dimensionality of the model's hidden states.
  39. n_head (`int`, *optional*, defaults to 12):
  40. Number of attention heads for each attention layer in the Transformer encoder.
  41. d_head (`int`, *optional*, defaults to 64):
  42. Dimensionality of the model's heads.
  43. d_inner (`int`, *optional*, defaults to 3072):
  44. Inner dimension in the feed-forward blocks.
  45. hidden_act (`str` or `callable`, *optional*, defaults to `"gelu_new"`):
  46. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  47. `"relu"`, `"silu"` and `"gelu_new"` are supported.
  48. hidden_dropout (`float`, *optional*, defaults to 0.1):
  49. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  50. attention_dropout (`float`, *optional*, defaults to 0.1):
  51. The dropout probability for the attention probabilities.
  52. activation_dropout (`float`, *optional*, defaults to 0.0):
  53. The dropout probability used between the two layers of the feed-forward blocks.
  54. initializer_range (`float`, *optional*, defaults to 0.1):
  55. The upper bound of the *uniform initializer* for initializing all weight matrices in attention layers.
  56. initializer_std (`float`, *optional*):
  57. The standard deviation of the *normal initializer* for initializing the embedding matrix and the weight of
  58. linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for
  59. linear layers.
  60. layer_norm_eps (`float`, *optional*, defaults to 1e-09):
  61. The epsilon used by the layer normalization layers.
  62. pooling_type (`str`, *optional*, defaults to `"mean"`):
  63. Possible values are `"mean"` or `"max"`. The way pooling is performed at the beginning of each block.
  64. attention_type (`str`, *optional*, defaults to `"relative_shift"`):
  65. Possible values are `"relative_shift"` or `"factorized"`. The former is faster on CPU/GPU while the latter
  66. is faster on TPU.
  67. separate_cls (`bool`, *optional*, defaults to `True`):
  68. Whether or not to separate the cls token when applying pooling.
  69. truncate_seq (`bool`, *optional*, defaults to `True`):
  70. When using `separate_cls`, whether or not to truncate the last token when pooling, to avoid getting a
  71. sequence length that is not a multiple of 2.
  72. pool_q_only (`bool`, *optional*, defaults to `True`):
  73. Whether or not to apply the pooling only to the query or to query, key and values for the attention layers.
  74. """
  75. model_type = "funnel"
  76. attribute_map = {
  77. "hidden_size": "d_model",
  78. "num_attention_heads": "n_head",
  79. }
  80. def __init__(
  81. self,
  82. vocab_size=30522,
  83. block_sizes=[4, 4, 4],
  84. block_repeats=None,
  85. num_decoder_layers=2,
  86. d_model=768,
  87. n_head=12,
  88. d_head=64,
  89. d_inner=3072,
  90. hidden_act="gelu_new",
  91. hidden_dropout=0.1,
  92. attention_dropout=0.1,
  93. activation_dropout=0.0,
  94. initializer_range=0.1,
  95. initializer_std=None,
  96. layer_norm_eps=1e-9,
  97. pooling_type="mean",
  98. attention_type="relative_shift",
  99. separate_cls=True,
  100. truncate_seq=True,
  101. pool_q_only=True,
  102. **kwargs,
  103. ):
  104. self.vocab_size = vocab_size
  105. self.block_sizes = block_sizes
  106. self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
  107. assert len(block_sizes) == len(self.block_repeats), (
  108. "`block_sizes` and `block_repeats` should have the same length."
  109. )
  110. self.num_decoder_layers = num_decoder_layers
  111. self.d_model = d_model
  112. self.n_head = n_head
  113. self.d_head = d_head
  114. self.d_inner = d_inner
  115. self.hidden_act = hidden_act
  116. self.hidden_dropout = hidden_dropout
  117. self.attention_dropout = attention_dropout
  118. self.activation_dropout = activation_dropout
  119. self.initializer_range = initializer_range
  120. self.initializer_std = initializer_std
  121. self.layer_norm_eps = layer_norm_eps
  122. assert pooling_type in [
  123. "mean",
  124. "max",
  125. ], f"Got {pooling_type} for `pooling_type` but only 'mean' and 'max' are supported."
  126. self.pooling_type = pooling_type
  127. assert attention_type in [
  128. "relative_shift",
  129. "factorized",
  130. ], f"Got {attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported."
  131. self.attention_type = attention_type
  132. self.separate_cls = separate_cls
  133. self.truncate_seq = truncate_seq
  134. self.pool_q_only = pool_q_only
  135. super().__init__(**kwargs)
  136. @property
  137. def num_hidden_layers(self):
  138. return sum(self.block_sizes)
  139. @num_hidden_layers.setter
  140. def num_hidden_layers(self, value):
  141. raise NotImplementedError(
  142. "This model does not support the setting of `num_hidden_layers`. Please set `block_sizes`."
  143. )
  144. @property
  145. def num_blocks(self):
  146. return len(self.block_sizes)
  147. @num_blocks.setter
  148. def num_blocks(self, value):
  149. raise NotImplementedError("This model does not support the setting of `num_blocks`. Please set `block_sizes`.")
  # Names exported by a star-import of this module: only the configuration class.
  150. __all__ = ["FunnelConfig"]