configuration_falcon.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. # coding=utf-8
  2. # Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Falcon configuration"""
  16. from ...configuration_utils import PretrainedConfig
  17. from ...utils import logging
  18. logger = logging.get_logger(__name__)  # module-level logger keyed by this module's `__name__`
  19. class FalconConfig(PretrainedConfig):
  20. r"""
  21. This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon
  22. model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
  23. defaults will yield a similar configuration to that of the
  24. [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) architecture.
  25. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  26. documentation from [`PretrainedConfig`] for more information.
  27. Args:
  28. vocab_size (`int`, *optional*, defaults to 65024):
  29. Vocabulary size of the Falcon model. Defines the number of different tokens that can be represented by the
  30. `inputs_ids` passed when calling [`FalconModel`]
  31. hidden_size (`int`, *optional*, defaults to 4544):
  32. Dimension of the hidden representations.
  33. num_hidden_layers (`int`, *optional*, defaults to 32):
  34. Number of hidden layers in the Transformer decoder.
  35. num_attention_heads (`int`, *optional*, defaults to 71):
  36. Number of attention heads for each attention layer in the Transformer encoder.
  37. num_ln_in_parallel_attn (`int`, *optional*):
  38. Set to 2 if separate layer norms are to be used for the MLP and the attention output when using parallel
  39. attention, otherwise, 1.
  40. layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
  41. The epsilon used by the layer normalization layers.
  42. initializer_range (`float`, *optional*, defaults to 0.02):
  43. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  44. use_cache (`bool`, *optional*, defaults to `True`):
  45. Whether the model should return the last key/values attentions (not used by all models). Only relevant if
  46. `config.is_decoder=True`.
  47. hidden_dropout (`float`, *optional*, defaults to 0.0):
  48. The dropout probability for MLP layers.
  49. attention_dropout (`float`, *optional*, defaults to 0.0):
  50. The dropout probability for attention layers.
  51. num_kv_heads (`int`, *optional*):
  52. Number of key-value heads to use per attention layer. If unset, defaults to the same value as
  53. `num_attention_heads`.
  54. alibi (`bool`, *optional*, defaults to `False`):
  55. Whether to use ALiBi positional biases during self-attention.
  56. new_decoder_architecture (`bool`, *optional*, defaults to `False`):
  57. Whether to use the new (Falcon-40B) decoder architecture. If `True`, the `multi_query` and `parallel_attn`
  58. arguments are ignored, as the new decoder always uses parallel attention.
  59. multi_query (`bool`, *optional*, defaults to `True`):
  60. Whether to use multi-query attention in the decoder. Ignored when `new_decoder_architecture` is `True`.
  61. parallel_attn (`bool`, *optional*, defaults to `True`):
  62. Whether to compute attention in parallel with the feedforward layer. If False, they are consecutive
  63. instead, as in the original Transformer architecture. Ignored when `new_decoder_architecture` is `True`.
  64. bias (`bool`, *optional*, defaults to `False`):
  65. Whether to use bias on Linear layers.
  66. max_position_embeddings (`int`, *optional*, defaults to 2048):
  67. The maximum sequence length that this model might ever be used with, when `alibi` is `False`. Pretrained
  68. Falcon models with RoPE support up to 2048 tokens.
  69. rope_theta (`float`, *optional*, defaults to 10000.0):
  70. The base period of the RoPE embeddings.
  71. rope_scaling (`Dict`, *optional*):
  72. Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
  73. and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
  74. accordingly.
  75. Expected contents:
  76. `rope_type` (`str`):
  77. The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
  78. 'llama3'], with 'default' being the original RoPE implementation.
  79. `factor` (`float`, *optional*):
  80. Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
  81. most scaling types, a `factor` of x will enable the model to handle sequences of length x *
  82. original maximum pre-trained length.
  83. `original_max_position_embeddings` (`int`, *optional*):
  84. Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
  85. pretraining.
  86. `attention_factor` (`float`, *optional*):
  87. Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
  88. computation. If unspecified, it defaults to value recommended by the implementation, using the
  89. `factor` field to infer the suggested value.
  90. `beta_fast` (`float`, *optional*):
  91. Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
  92. ramp function. If unspecified, it defaults to 32.
  93. `beta_slow` (`float`, *optional*):
  94. Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
  95. ramp function. If unspecified, it defaults to 1.
  96. `short_factor` (`list[float]`, *optional*):
  97. Only used with 'longrope'. The scaling factor to be applied to short contexts (<
  98. `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
  99. size divided by the number of attention heads divided by 2
  100. `long_factor` (`list[float]`, *optional*):
  101. Only used with 'longrope'. The scaling factor to be applied to long contexts (<
  102. `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
  103. size divided by the number of attention heads divided by 2
  104. `low_freq_factor` (`float`, *optional*):
  105. Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
  106. `high_freq_factor` (`float`, *optional*):
  107. Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
  108. bos_token_id (`int`, *optional*, defaults to 11):
  109. The id of the "beginning-of-sequence" token.
  110. eos_token_id (`int`, *optional*, defaults to 11):
  111. The id of the "end-of-sequence" token.
  112. ffn_hidden_size (`int`, *optional*):
  113. The hidden size of the feedforward layer in the Transformer decoder.
  114. defaults to 4x hidden dim
  115. activation (`str`, *optional*, defaults to `"gelu"`):
  116. The activation function used in the feedforward layer.
  117. Example:
  118. ```python
  119. >>> from transformers import FalconModel, FalconConfig
  120. >>> # Initializing a small (2-layer) Falcon configuration
  121. >>> configuration = FalconConfig(num_hidden_layers=2)
  122. >>> # Initializing a model from the small configuration
  123. >>> model = FalconModel(configuration)
  124. >>> # Accessing the model configuration
  125. >>> configuration = model.config
  126. ```"""
  127. model_type = "falcon"
  128. keys_to_ignore_at_inference = ["past_key_values"]
  129. def __init__(
  130. self,
  131. vocab_size=65024,
  132. hidden_size=4544,
  133. num_hidden_layers=32,
  134. num_attention_heads=71,
  135. num_ln_in_parallel_attn=None,
  136. layer_norm_epsilon=1e-5,
  137. initializer_range=0.02,
  138. use_cache=True,
  139. hidden_dropout=0.0,
  140. attention_dropout=0.0,
  141. num_kv_heads=None,
  142. alibi=False,
  143. new_decoder_architecture=False,
  144. multi_query=True,
  145. parallel_attn=True,
  146. bias=False,
  147. max_position_embeddings=2048,
  148. rope_theta=10000.0,
  149. rope_scaling=None,
  150. bos_token_id=11,
  151. eos_token_id=11,
  152. ffn_hidden_size=None,
  153. activation="gelu",
  154. **kwargs,
  155. ):
  156. self.vocab_size = vocab_size
  157. # Backward compatibility with n_embed kwarg
  158. n_embed = kwargs.pop("n_embed", None)
  159. self.hidden_size = hidden_size if n_embed is None else n_embed
  160. self.num_hidden_layers = num_hidden_layers
  161. self.num_attention_heads = num_attention_heads
  162. self.layer_norm_epsilon = layer_norm_epsilon
  163. self.initializer_range = initializer_range
  164. self.use_cache = use_cache
  165. self.hidden_dropout = hidden_dropout
  166. self.attention_dropout = attention_dropout
  167. self.bos_token_id = bos_token_id
  168. self.eos_token_id = eos_token_id
  169. self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads
  170. self.alibi = alibi
  171. self.new_decoder_architecture = new_decoder_architecture
  172. self.multi_query = multi_query # Ignored when new_decoder_architecture is True
  173. self.parallel_attn = parallel_attn
  174. self.bias = bias
  175. self.num_ln_in_parallel_attn = num_ln_in_parallel_attn
  176. self.max_position_embeddings = max_position_embeddings
  177. self.rope_theta = rope_theta
  178. self.rope_scaling = rope_scaling
  179. self.activation = activation
  180. if ffn_hidden_size is None:
  181. self.ffn_hidden_size = hidden_size * 4
  182. else:
  183. self.ffn_hidden_size = ffn_hidden_size
  184. super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
  185. @property
  186. def head_dim(self):
  187. return self.hidden_size // self.num_attention_heads
  188. @property
  189. def rotary(self):
  190. return not self.alibi
  191. __all__ = ["FalconConfig"]  # public API: the only name exported by a star-import of this module