configuration_mamba.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. # coding=utf-8
  2. # Copyright 2024 The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """MAMBA configuration"""
  16. import math
  17. from ...configuration_utils import PretrainedConfig
  18. from ...utils import logging
  19. logger = logging.get_logger(__name__)
  20. class MambaConfig(PretrainedConfig):
  21. """
  22. This is the configuration class to store the configuration of a [`MambaModel`]. It is used to instantiate a MAMBA
  23. model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
  24. defaults will yield a similar configuration to that of the MAMBA
  25. [state-spaces/mamba-2.8b](https://huggingface.co/state-spaces/mamba-2.8b) architecture.
  26. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  27. documentation from [`PretrainedConfig`] for more information.
  28. Args:
  29. vocab_size (`int`, *optional*, defaults to 50280):
  30. Vocabulary size of the MAMBA model. Defines the number of different tokens that can be represented by the
  31. `inputs_ids` passed when calling [`MambaModel`].
  32. hidden_size (`int`, *optional*, defaults to 768):
  33. Dimensionality of the embeddings and hidden states.
  34. state_size (`int`, *optional*, defaults to 16): shape of the state space latents.
  35. num_hidden_layers (`int`, *optional*, defaults to 32):
  36. Number of hidden layers in the model.
  37. layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
  38. The epsilon to use in the layer normalization layers.
  39. pad_token_id (`int`, *optional*, defaults to 0):
  40. Padding token id.
  41. bos_token_id (`int`, *optional*, defaults to 0):
  42. The id of the beginning of sentence token in the vocabulary.
  43. eos_token_id (`int`, *optional*, defaults to 0):
  44. The id of the end of sentence token in the vocabulary.
  45. expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size.
  46. conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel.
  47. use_bias (`bool`, *optional*, defaults to `False`):
  48. Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block
  49. use_conv_bias (`bool`, *optional*, defaults to `True`):
  50. Whether or not to use bias in the convolution layer of the mixer block.
  51. hidden_act (`str`, *optional*, defaults to `"silu"`):
  52. The non-linear activation function (function or string) in the decoder.
  53. initializer_range (`float`, *optional*, defaults to 0.1):
  54. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  55. residual_in_fp32 (`bool`, *optional*, defaults to `True`):
  56. Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model
  57. time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
  58. Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
  59. time_step_scale (`float`, *optional*, defaults to 1.0):
  60. Scale used used to scale `dt_proj.bias`.
  61. time_step_min (`float`, *optional*, defaults to 0.001):
  62. Minimum `time_step` used to bound `dt_proj.bias`.
  63. time_step_max (`float`, *optional*, defaults to 0.1):
  64. Maximum `time_step` used to bound `dt_proj.bias`.
  65. time_step_init_scheme (`float`, *optional*, defaults to `"random"`):
  66. Init scheme used for `dt_proj.weight`. Should be one of `["random","uniform"]`
  67. time_step_floor (`float`, *optional*, defaults to 0.0001):
  68. Minimum clamping value of the `dt_proj.bias` layer initialization.
  69. rescale_prenorm_residual (`bool`, *optional*, defaults to `False`):
  70. Whether or not to rescale `out_proj` weights when initializing.
  71. use_cache (`bool`, *optional*, defaults to `True`):
  72. Whether or not the cache should be used.
  73. use_mambapy (`bool`, *optional*, defaults to `False`):
  74. Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
  75. Example:
  76. ```python
  77. >>> from transformers import MambaConfig, MambaModel
  78. >>> # Initializing a Mamba configuration
  79. >>> configuration = MambaConfig()
  80. >>> # Initializing a model (with random weights) from the configuration
  81. >>> model = MambaModel(configuration)
  82. >>> # Accessing the model configuration
  83. >>> configuration = model.config
  84. ```"""
  85. model_type = "mamba"
  86. def __init__(
  87. self,
  88. vocab_size=50280,
  89. hidden_size=768,
  90. state_size=16,
  91. num_hidden_layers=32,
  92. layer_norm_epsilon=1e-5,
  93. pad_token_id=0,
  94. bos_token_id=0,
  95. eos_token_id=0,
  96. expand=2,
  97. conv_kernel=4,
  98. use_bias=False,
  99. use_conv_bias=True,
  100. hidden_act="silu",
  101. initializer_range=0.1,
  102. residual_in_fp32=True,
  103. time_step_rank="auto",
  104. time_step_scale=1.0,
  105. time_step_min=0.001,
  106. time_step_max=0.1,
  107. time_step_init_scheme="random",
  108. time_step_floor=1e-4,
  109. rescale_prenorm_residual=False,
  110. use_cache=True,
  111. use_mambapy=False,
  112. **kwargs,
  113. ):
  114. self.vocab_size = vocab_size
  115. self.hidden_size = hidden_size
  116. self.state_size = state_size
  117. self.num_hidden_layers = num_hidden_layers
  118. self.layer_norm_epsilon = layer_norm_epsilon
  119. self.conv_kernel = conv_kernel
  120. self.expand = expand
  121. self.intermediate_size = int(expand * self.hidden_size)
  122. self.bos_token_id = bos_token_id
  123. self.eos_token_id = eos_token_id
  124. self.pad_token_id = pad_token_id
  125. self.use_bias = use_bias
  126. self.use_conv_bias = use_conv_bias
  127. self.hidden_act = hidden_act
  128. self.initializer_range = initializer_range
  129. self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
  130. self.time_step_scale = time_step_scale
  131. self.time_step_min = time_step_min
  132. self.time_step_max = time_step_max
  133. self.time_step_init_scheme = time_step_init_scheme
  134. self.time_step_floor = time_step_floor
  135. self.rescale_prenorm_residual = rescale_prenorm_residual
  136. self.residual_in_fp32 = residual_in_fp32
  137. self.use_cache = use_cache
  138. self.use_mambapy = use_mambapy
  139. super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
  140. __all__ = ["MambaConfig"]