configuration_doge.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  2. # This file was automatically generated from src/transformers/models/doge/modular_doge.py.
  3. # Do NOT edit this file manually as any edits will be overwritten by the generation of
  4. # the file from the modular. If any change should be done, please apply the change to the
  5. # modular_doge.py file directly. One of our CI enforces this.
  6. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  7. # coding=utf-8
  8. # Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
  9. #
  10. # The Doge family of small language models is trained by SmallDoge Team.
  11. #
  12. # Licensed under the Apache License, Version 2.0 (the "License");
  13. # you may not use this file except in compliance with the License.
  14. # You may obtain a copy of the License at
  15. #
  16. # http://www.apache.org/licenses/LICENSE-2.0
  17. #
  18. # Unless required by applicable law or agreed to in writing, software
  19. # distributed under the License is distributed on an "AS IS" BASIS,
  20. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  21. # See the License for the specific language governing permissions and
  22. # limitations under the License.
  23. from ...configuration_utils import PretrainedConfig
  24. from ...modeling_rope_utils import rope_config_validation
  25. class DogeConfig(PretrainedConfig):
  26. r"""
  27. This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
  28. model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).
  29. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  30. documentation from [`PretrainedConfig`] for more information.
  31. Args:
  32. vocab_size (`int`, *optional*, defaults to 32768):
  33. Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
  34. hidden_size (`int`, *optional*, defaults to 1024):
  35. Dimension of the hidden representations.
  36. intermediate_size (`int`, *optional*, defaults to 2048):
  37. Dimension of the MLP representations.
  38. num_hidden_layers (`int`, *optional*, defaults to 32):
  39. Number of hidden layers in the Transformer decoder.
  40. hidden_dropout (`float`, *optional*, defaults to 0.0):
  41. Dropout probability for each sequence transformation and state transformation module.
  42. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  43. The non-linear activation function (function or string) in the decoder.
  44. initializer_range (`float`, *optional*, defaults to 0.02):
  45. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  46. rms_norm_eps (`float`, *optional*, defaults to 1e-06):
  47. The epsilon used by the rms normalization layers.
  48. use_cache (`bool`, *optional*, defaults to `True`):
  49. Whether or not the model should return the last key/values attentions (not used by all models). Only
  50. relevant if `config.is_decoder=True`.
  51. tie_word_embeddings (`bool`, *optional*, defaults to `False`):
  52. Whether the model's input and output word embeddings should be tied.
  53. max_position_embeddings (`int`, *optional*, defaults to 2048):
  54. The maximum sequence length that this model might ever be used with.
  55. rope_theta (`float`, *optional*, defaults to 10000.0):
  56. The base period of the RoPE embeddings.
  57. rope_scaling (`Dict`, *optional*):
  58. Dictionary containing the scaling configuration for the RoPE embeddings.
  59. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
  60. Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value.
  61. Expected contents:
  62. `rope_type` (`str`):
  63. The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
  64. `factor` (`float`, *optional*):
  65. Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
  66. In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
  67. `original_max_position_embeddings` (`int`, *optional*):
  68. Used with 'dynamic', 'longrope' and 'llama3'.
  69. The original max position embeddings used during pretraining.
  70. `attention_factor` (`float`, *optional*):
  71. Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
  72. computation.
  73. If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
  74. `beta_fast` (`float`, *optional*):
  75. Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
  76. ramp function. If unspecified, it defaults to 32.
  77. `beta_slow` (`float`, *optional*):
  78. Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
  79. ramp function. If unspecified, it defaults to 1.
  80. `short_factor` (`List[float]`, *optional*):
  81. Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
  82. Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
  83. `long_factor` (`List[float]`, *optional*):
  84. Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`).
  85. Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
  86. `low_freq_factor` (`float`, *optional*):
  87. Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
  88. `high_freq_factor` (`float`, *optional*):
  89. Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
  90. num_attention_heads (`int`, *optional*, defaults to 8):
  91. Number of attention heads for each attention layer in the Transformer decoder.
  92. num_key_value_heads (`int`, *optional*):
  93. This is the number of key_value heads that should be used to implement Grouped Query Attention.
  94. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
  95. `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
  96. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
  97. For more details checkout [this paper](https://huggingface.co/papers/2305.13245).
  98. If it is not specified, will default to `num_attention_heads`.
  99. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
  100. Whether to use a bias in the query, key, value and output projection layers during self-attention.
  101. attention_dropout (`float`, *optional*, defaults to 0.0):
  102. The dropout ratio for the attention probabilities.
  103. mlp_bias (`bool`, *optional*, defaults to `False`):
  104. Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
  105. sliding_window (`int`, *optional*):
  106. Sliding window attention window size. If not specified, will default to `None`.
  107. keep_window_size (`int`, *optional*, defaults to 2048):
  108. The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
  109. is_moe (`bool`, *optional*, defaults to `False`):
  110. Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.
  111. num_experts (`int`, *optional*, defaults to 16384):
  112. Number of routed experts in the model. This is only used when `is_moe=True`.
  113. num_experts_per_tok (`int`, *optional*, defaults to 64):
  114. Number of selected experts to route per-token.
  115. norm_topk_prob (`bool`, *optional*, defaults to `False`):
  116. Whether to normalize the topk probabilities.
  117. output_router_logits (`bool`, *optional*, defaults to `False`):
  118. Whether or not the router logits should be returned by the model. Enabling this will also
  119. allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
  120. router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
  121. The aux loss factor for the total loss.
  122. ```python
  123. >>> from transformers import DogeConfig, DogeModel
  124. >>> # Initializing a Doge-320M style configuration
  125. >>> configuration = DogeConfig()
  126. >>> # Initializing a model from the Doge-320M style configuration
  127. >>> model = DogeModel(configuration)
  128. >>> # Accessing the model configuration
  129. >>> configuration = model.config
  130. ```"""
  131. model_type = "doge"
  132. keys_to_ignore_at_inference = ["past_key_values"]
  133. # Default tensor parallel plan for base model `DogeModel`
  134. base_model_tp_plan = {
  135. "layers.*.self_attn.q_proj": "colwise",
  136. "layers.*.self_attn.k_proj": "colwise",
  137. "layers.*.self_attn.v_proj": "colwise",
  138. "layers.*.self_attn.dt_proj": "rowwise",
  139. "layers.*.self_attn.o_proj": "rowwise",
  140. "layers.*.input_layernorm.weight": "sequence_parallel",
  141. "layers.*.input_residual.weight": "sequence_parallel",
  142. "layers.*.post_attention_layernorm.weight": "sequence_parallel",
  143. "layers.*.post_attention_residual.weight": "sequence_parallel",
  144. "norm.weight": "sequence_parallel",
  145. "layers.*.mlp.gate_proj": "colwise",
  146. "layers.*.mlp.up_proj": "colwise",
  147. "layers.*.mlp.down_proj": "rowwise",
  148. "layers.*.mlp.router_gate": "colwise_rep",
  149. "layers.*.mlp.down_embed": "rowwise_rep",
  150. "layers.*.mlp.up_embed": "rowwise_rep",
  151. }
  152. base_model_pp_plan = {
  153. "embed_tokens": (["input_ids"], ["inputs_embeds"]),
  154. "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
  155. "norm": (["hidden_states"], ["hidden_states"]),
  156. }
  157. def __init__(
  158. self,
  159. vocab_size=32768,
  160. hidden_size=1024,
  161. intermediate_size=2048,
  162. num_hidden_layers=32,
  163. hidden_dropout=0.0,
  164. hidden_act="silu",
  165. initializer_range=0.02,
  166. rms_norm_eps=1e-06,
  167. use_cache=True,
  168. tie_word_embeddings=False,
  169. max_position_embeddings=2048,
  170. rope_theta=10000.0,
  171. rope_scaling=None,
  172. num_attention_heads=8,
  173. num_key_value_heads=None,
  174. attention_bias=False,
  175. attention_dropout=0.0,
  176. mlp_bias=False,
  177. sliding_window=None,
  178. keep_window_size=2048,
  179. is_moe=False,
  180. num_experts=16384,
  181. num_experts_per_tok=64,
  182. norm_topk_prob=False,
  183. output_router_logits=False,
  184. router_aux_loss_coef=0.001,
  185. **kwargs,
  186. ):
  187. self.vocab_size = vocab_size
  188. self.hidden_size = hidden_size
  189. self.intermediate_size = intermediate_size
  190. self.num_hidden_layers = num_hidden_layers
  191. self.hidden_dropout = hidden_dropout
  192. self.hidden_act = hidden_act
  193. self.initializer_range = initializer_range
  194. self.rms_norm_eps = rms_norm_eps
  195. self.use_cache = use_cache
  196. self.max_position_embeddings = max_position_embeddings
  197. self.rope_theta = rope_theta
  198. self.rope_scaling = rope_scaling
  199. self.num_attention_heads = num_attention_heads
  200. self.num_key_value_heads = num_key_value_heads
  201. self.attention_bias = attention_bias
  202. self.attention_dropout = attention_dropout
  203. self.mlp_bias = mlp_bias
  204. self.sliding_window = sliding_window
  205. self.keep_window_size = keep_window_size
  206. self.is_moe = is_moe
  207. self.num_experts = num_experts
  208. self.num_experts_per_tok = num_experts_per_tok
  209. self.norm_topk_prob = norm_topk_prob
  210. self.output_router_logits = output_router_logits
  211. self.router_aux_loss_coef = router_aux_loss_coef
  212. # Validate the correctness of rotary position embeddings parameters
  213. # BC: if there is a 'type' field, copy it it to 'rope_type'.
  214. if self.rope_scaling is not None and "type" in self.rope_scaling:
  215. self.rope_scaling["rope_type"] = self.rope_scaling["type"]
  216. rope_config_validation(self)
  217. # for backward compatibility
  218. if num_key_value_heads is None:
  219. self.num_key_value_heads = num_attention_heads
  220. super().__init__(
  221. tie_word_embeddings=tie_word_embeddings,
  222. **kwargs,
  223. )
# Names exported by `from <this module> import *`: only the configuration class.
__all__ = ["DogeConfig"]