configuration_csm.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  1. # coding=utf-8
  2. # Copyright 2025 Sesame and The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. from ...configuration_utils import PretrainedConfig
  16. from ...modeling_rope_utils import rope_config_validation
  17. from ...utils import logging
  18. from ..auto.configuration_auto import AutoConfig
# Module-level logger for this file, obtained from the library's logging helper.
logger = logging.get_logger(__name__)
  20. class CsmDepthDecoderConfig(PretrainedConfig):
  21. r"""
  22. This is the configuration class to store the configuration of a [`CsmDepthDecoderModel`]. It is used to instantiate an CSM depth decoder
  23. model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
  24. a similar configuration to that of the csm-1b.
  25. e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)
  26. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  27. documentation from [`PretrainedConfig`] for more information.
  28. Args:
  29. num_codebooks (`int`, *optional*, defaults to 32):
  30. Number of codebooks used in the underlying codec model responsible for tokenizing the audio.
  31. backbone_hidden_size (`int`, *optional*, defaults to 2048):
  32. Dimension of the hidden representations of the backbone model used with this depth decoder.
  33. vocab_size (`int`, *optional*, defaults to 2051):
  34. Vocabulary size of the CsmDepthDecoder model. Defines the number of different audio tokens that can be represented by each codebook.
  35. hidden_size (`int`, *optional*, defaults to 1024):
  36. Dimension of the hidden representations.
  37. intermediate_size (`int`, *optional*, defaults to 8192):
  38. Dimension of the MLP representations.
  39. num_hidden_layers (`int`, *optional*, defaults to 4):
  40. Number of hidden layers in the Transformer decoder.
  41. num_attention_heads (`int`, *optional*, defaults to 8):
  42. Number of attention heads for each attention layer in the Transformer decoder.
  43. num_key_value_heads (`int`, *optional*, defaults to 2):
  44. This is the number of key_value heads that should be used to implement Grouped Query Attention. If
  45. `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
  46. `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
  47. converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
  48. by meanpooling all the original heads within that group. For more details, check out [this
  49. paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
  50. `num_attention_heads`.
  51. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  52. The non-linear activation function (function or string) in the decoder.
  53. max_position_embeddings (`int`, *optional*, defaults to 33):
  54. The maximum sequence length that this model might ever be used with.
  55. initializer_range (`float`, *optional*, defaults to 0.02):
  56. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  57. rms_norm_eps (`float`, *optional*, defaults to 1e-05):
  58. The epsilon used by the rms normalization layers.
  59. use_cache (`bool`, *optional*, defaults to `True`):
  60. Whether or not the model should return the last key/values attentions (not used by all models). Only
  61. relevant if `config.is_decoder=True`.
  62. pad_token_id (`int`, *optional*, defaults to 2050):
  63. Padding token id.
  64. bos_token_id (`int`, *optional*):
  65. Beginning of stream token id.
  66. eos_token_id (`int`, *optional*):
  67. End of stream token id.
  68. rope_theta (`float`, *optional*, defaults to 500000):
  69. The base period of the RoPE embeddings.
  70. rope_scaling (`Dict`, *optional*):
  71. Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
  72. and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
  73. accordingly.
  74. Expected contents:
  75. `rope_type` (`str`):
  76. The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
  77. 'llama3'], with 'default' being the original RoPE implementation.
  78. `factor` (`float`, *optional*):
  79. Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
  80. most scaling types, a `factor` of x will enable the model to handle sequences of length x *
  81. original maximum pre-trained length.
  82. `original_max_position_embeddings` (`int`, *optional*):
  83. Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
  84. pretraining.
  85. `attention_factor` (`float`, *optional*):
  86. Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
  87. computation. If unspecified, it defaults to value recommended by the implementation, using the
  88. `factor` field to infer the suggested value.
  89. `beta_fast` (`float`, *optional*):
  90. Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
  91. ramp function. If unspecified, it defaults to 32.
  92. `beta_slow` (`float`, *optional*):
  93. Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
  94. ramp function. If unspecified, it defaults to 1.
  95. `short_factor` (`list[float]`, *optional*):
  96. Only used with 'longrope'. The scaling factor to be applied to short contexts (<
  97. `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
  98. size divided by the number of attention heads divided by 2
  99. `long_factor` (`list[float]`, *optional*):
  100. Only used with 'longrope'. The scaling factor to be applied to long contexts (<
  101. `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
  102. size divided by the number of attention heads divided by 2
  103. `low_freq_factor` (`float`, *optional*):
  104. Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
  105. `high_freq_factor` (`float`, *optional*):
  106. Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
  107. attention_bias (`bool`, *optional*, defaults to `False`):
  108. Whether to use a bias in the query, key, value and output projection layers during self-attention.
  109. attention_dropout (`float`, *optional*, defaults to 0.0):
  110. The dropout ratio for the attention probabilities.
  111. mlp_bias (`bool`, *optional*, defaults to `False`):
  112. Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
  113. head_dim (`int`, *optional*):
  114. The attention head dimension. If None, it will default to hidden_size // num_attention_heads
  115. ```python
  116. >>> from transformers import CsmDepthDecoder, CsmDepthDecoderConfig
  117. >>> # Initializing a CsmDepthDecoder
  118. >>> configuration = CsmDepthDecoderConfig()
  119. >>> model = CsmDepthDecoderModel(configuration)
  120. >>> # Accessing the model configuration
  121. >>> configuration = model.config
  122. ```"""
  123. model_type = "csm_depth_decoder_model"
  124. base_config_key = "depth_decoder_config"
  125. keys_to_ignore_at_inference = ["past_key_values"]
  126. def __init__(
  127. self,
  128. num_codebooks=32,
  129. backbone_hidden_size=2048,
  130. vocab_size=2051,
  131. hidden_size=1024,
  132. intermediate_size=8192,
  133. num_hidden_layers=4,
  134. num_attention_heads=8,
  135. num_key_value_heads=2,
  136. hidden_act="silu",
  137. max_position_embeddings=33,
  138. initializer_range=0.02,
  139. rms_norm_eps=1e-5,
  140. use_cache=True,
  141. pad_token_id=None,
  142. bos_token_id=None,
  143. eos_token_id=None,
  144. rope_theta=500000,
  145. rope_scaling=None,
  146. attention_bias=False,
  147. attention_dropout=0.0,
  148. mlp_bias=False,
  149. head_dim=None,
  150. **kwargs,
  151. ):
  152. if kwargs.pop("tie_word_embeddings", False):
  153. raise ValueError("`tie_word_embeddings=True` is not supported for CsmDepthDecoderConfig")
  154. super().__init__(
  155. pad_token_id=pad_token_id,
  156. bos_token_id=bos_token_id,
  157. eos_token_id=eos_token_id,
  158. tie_word_embeddings=False,
  159. **kwargs,
  160. )
  161. self.num_codebooks = num_codebooks
  162. self.vocab_size = vocab_size
  163. self.backbone_hidden_size = backbone_hidden_size
  164. self.max_position_embeddings = max_position_embeddings
  165. self.hidden_size = hidden_size
  166. self.intermediate_size = intermediate_size
  167. self.num_hidden_layers = num_hidden_layers
  168. self.num_attention_heads = num_attention_heads
  169. # for backward compatibility
  170. if num_key_value_heads is None:
  171. num_key_value_heads = num_attention_heads
  172. self.num_key_value_heads = num_key_value_heads
  173. self.hidden_act = hidden_act
  174. self.initializer_range = initializer_range
  175. self.rms_norm_eps = rms_norm_eps
  176. self.use_cache = use_cache
  177. self.rope_theta = rope_theta
  178. self.rope_scaling = rope_scaling
  179. self.attention_bias = attention_bias
  180. self.attention_dropout = attention_dropout
  181. self.mlp_bias = mlp_bias
  182. self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
  183. # Validate the correctness of rotary position embeddings parameters
  184. # BC: if there is a 'type' field, copy it it to 'rope_type'.
  185. if self.rope_scaling is not None and "type" in self.rope_scaling:
  186. self.rope_scaling["rope_type"] = self.rope_scaling["type"]
  187. rope_config_validation(self)
  188. class CsmConfig(PretrainedConfig):
  189. r"""
  190. This is the configuration class to store the configuration of a [`CsmForConditionalGeneration`]. It is used to instantiate an CSM
  191. model according to the specified arguments, defining the model architecture. Instantiating a configuration
  192. with the defaults will yield a similar configuration to that of the csm-1b.
  193. e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)
  194. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  195. documentation from [`PretrainedConfig`] for more information.
  196. Args:
  197. num_codebooks (`int`, *optional*, defaults to 32):
  198. Number of codebooks used in the underlying codec model responsible for tokenizing the audio.
  199. vocab_size (`int`, *optional*, defaults to 2051):
  200. Vocabulary size of the Csm model. Defines the number of different audio tokens that can be represented by each codebook.
  201. text_vocab_size (`int`, *optional*, defaults to 128256):
  202. Vocabulary size of the text input for the Csm model. Defines the number of different text tokens that can be represented.
  203. hidden_size (`int`, *optional*, defaults to 2048):
  204. Dimension of the hidden representations of the backbone model.
  205. intermediate_size (`int`, *optional*, defaults to 8192):
  206. Dimension of the MLP representations of the backbone model.
  207. num_hidden_layers (`int`, *optional*, defaults to 16):
  208. Number of hidden layers in the backbone model Transformer decoder.
  209. num_attention_heads (`int`, *optional*, defaults to 32):
  210. Number of attention heads for each attention layer in the backbone model Transformer decoder.
  211. num_key_value_heads (`int`, *optional*, defaults to 8):
  212. This is the number of key_value heads that should be used to implement Grouped Query Attention. If
  213. `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
  214. `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
  215. converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
  216. by meanpooling all the original heads within that group. For more details, check out [this
  217. paper](https://huggingface.co/papers/2305.13245).
  218. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  219. The non-linear activation function (function or string) in the backbone model Transformer decoder.
  220. max_position_embeddings (`int`, *optional*, defaults to 2048):
  221. The maximum sequence length that this model might ever be used with.
  222. initializer_range (`float`, *optional*, defaults to 0.02):
  223. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  224. rms_norm_eps (`float`, *optional*, defaults to 1e-05):
  225. The epsilon used by the rms normalization layers.
  226. use_cache (`bool`, *optional*, defaults to `True`):
  227. Whether or not the model should return the last key/values attentions (not used by all models). Only
  228. relevant if `config.is_decoder=True`.
  229. pad_token_id (`int`, *optional*, defaults to 128002):
  230. Padding token id.
  231. codebook_pad_token_id (`int`, *optional*, defaults to 2050):
  232. Padding token id for codebook tokens.
  233. codebook_eos_token_id (`int`, *optional*, defaults to 0):
  234. End of stream token id for codebook tokens.
  235. bos_token_id (`int`, *optional*, defaults to 128000):
  236. Beginning of stream token id.
  237. eos_token_id (`int`, *optional*):
  238. End of stream token id.
  239. audio_token_id (`int`, *optional*, defaults to 128002):
  240. Audio token id in the text input.
  241. audio_eos_token_id (`int`, *optional*, defaults to 128003):
  242. End of stream token id for audio in the text input.
  243. rope_theta (`float`, *optional*, defaults to 500000):
  244. The base period of the RoPE embeddings.
  245. rope_scaling (`Dict`, *optional*, defaults to `{'factor': 32.0, 'high_freq_factor': 0.5, 'low_freq_factor': 0.125, 'original_max_position_embeddings': 1024, 'rope_type': 'llama3'}`):
  246. Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
  247. and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
  248. accordingly.
  249. Expected contents:
  250. `rope_type` (`str`):
  251. The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
  252. 'llama3'], with 'default' being the original RoPE implementation.
  253. `factor` (`float`, *optional*):
  254. Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
  255. most scaling types, a `factor` of x will enable the model to handle sequences of length x *
  256. original maximum pre-trained length.
  257. `original_max_position_embeddings` (`int`, *optional*):
  258. Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
  259. pretraining.
  260. `attention_factor` (`float`, *optional*):
  261. Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
  262. computation. If unspecified, it defaults to value recommended by the implementation, using the
  263. `factor` field to infer the suggested value.
  264. `beta_fast` (`float`, *optional*):
  265. Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
  266. ramp function. If unspecified, it defaults to 32.
  267. `beta_slow` (`float`, *optional*):
  268. Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
  269. ramp function. If unspecified, it defaults to 1.
  270. `short_factor` (`list[float]`, *optional*):
  271. Only used with 'longrope'. The scaling factor to be applied to short contexts (<
  272. `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
  273. size divided by the number of attention heads divided by 2
  274. `long_factor` (`list[float]`, *optional*):
  275. Only used with 'longrope'. The scaling factor to be applied to long contexts (<
  276. `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
  277. size divided by the number of attention heads divided by 2
  278. `low_freq_factor` (`float`, *optional*):
  279. Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
  280. `high_freq_factor` (`float`, *optional*):
  281. Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
  282. attention_bias (`bool`, *optional*, defaults to `False`):
  283. Whether to use a bias in the query, key, value and output projection layers during self-attention.
  284. attention_dropout (`float`, *optional*, defaults to 0.0):
  285. The dropout ratio for the attention probabilities.
  286. mlp_bias (`bool`, *optional*, defaults to `False`):
  287. Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
  288. head_dim (`int`, *optional*):
  289. The attention head dimension. If None, it will default to hidden_size // num_attention_heads
  290. tie_codebooks_embeddings (`bool`, *optional*, defaults to `True`):
  291. Whether to tie the codebook tokens embeddings of the backbone model to the codebook tokens embeddings of the depth decoder.
  292. depth_decoder_config (`CsmDepthDecoderConfig`, *optional*):
  293. Configuration for the depth decoder.
  294. codec_config (`PretrainedConfig`, *optional*):
  295. Configuration for the codec.
  296. ```python
  297. >>> from transformers import CsmForConditionalGeneration, CsmConfig
  298. >>> # Initializing a CsmConfig
  299. >>> configuration = CsmConfig()
  300. >>> # Initializing a model
  301. >>> model = CsmForConditionalGeneration(configuration)
  302. >>> # Accessing the model configuration
  303. >>> configuration = model.config
  304. ```"""
  305. model_type = "csm"
  306. base_config_key = "csm_config"
  307. keys_to_ignore_at_inference = ["past_key_values"]
  308. sub_configs = {
  309. "codec_config": AutoConfig,
  310. "depth_decoder_config": CsmDepthDecoderConfig,
  311. }
  312. def __init__(
  313. self,
  314. num_codebooks=32,
  315. vocab_size=2051,
  316. text_vocab_size=128256,
  317. hidden_size=2048,
  318. intermediate_size=8192,
  319. num_hidden_layers=16,
  320. num_attention_heads=32,
  321. num_key_value_heads=8,
  322. hidden_act="silu",
  323. max_position_embeddings=2048,
  324. initializer_range=0.02,
  325. rms_norm_eps=1e-5,
  326. use_cache=True,
  327. pad_token_id=128002,
  328. codebook_pad_token_id=2050,
  329. codebook_eos_token_id=0,
  330. bos_token_id=128000,
  331. eos_token_id=None,
  332. audio_token_id=128002,
  333. audio_eos_token_id=128003,
  334. rope_theta=500000,
  335. rope_scaling=None,
  336. attention_bias=False,
  337. attention_dropout=0.0,
  338. mlp_bias=False,
  339. head_dim=None,
  340. tie_codebooks_embeddings=True,
  341. depth_decoder_config=None,
  342. codec_config=None,
  343. **kwargs,
  344. ):
  345. if kwargs.pop("tie_word_embeddings", False):
  346. raise ValueError("`tie_word_embeddings=True` is not supported for CsmConfig")
  347. super().__init__(
  348. pad_token_id=pad_token_id,
  349. bos_token_id=bos_token_id,
  350. eos_token_id=eos_token_id,
  351. tie_word_embeddings=False,
  352. **kwargs,
  353. )
  354. if depth_decoder_config is None:
  355. self.depth_decoder_config = CsmDepthDecoderConfig()
  356. logger.info("depth_decoder_config is None, using default depth decoder config.")
  357. elif isinstance(depth_decoder_config, dict):
  358. self.depth_decoder_config = CsmDepthDecoderConfig(**depth_decoder_config)
  359. elif isinstance(depth_decoder_config, CsmDepthDecoderConfig):
  360. self.depth_decoder_config = depth_decoder_config
  361. if codec_config is None:
  362. self.codec_config = AutoConfig.for_model("mimi")
  363. logger.info("codec_config is None, using default audio encoder config.")
  364. elif isinstance(codec_config, dict):
  365. self.codec_config = AutoConfig.for_model(**codec_config)
  366. elif isinstance(codec_config, PretrainedConfig):
  367. self.codec_config = codec_config
  368. self.text_vocab_size = text_vocab_size
  369. self.num_codebooks = num_codebooks
  370. self.audio_token_id = audio_token_id
  371. self.audio_eos_token_id = audio_eos_token_id
  372. self.codebook_pad_token_id = codebook_pad_token_id
  373. self.codebook_eos_token_id = codebook_eos_token_id
  374. self.tie_codebooks_embeddings = tie_codebooks_embeddings
  375. self.vocab_size = vocab_size
  376. self.max_position_embeddings = max_position_embeddings
  377. self.hidden_size = hidden_size
  378. self.intermediate_size = intermediate_size
  379. self.num_hidden_layers = num_hidden_layers
  380. self.num_attention_heads = num_attention_heads
  381. # for backward compatibility
  382. if num_key_value_heads is None:
  383. num_key_value_heads = num_attention_heads
  384. self.num_key_value_heads = num_key_value_heads
  385. self.hidden_act = hidden_act
  386. self.initializer_range = initializer_range
  387. self.rms_norm_eps = rms_norm_eps
  388. self.use_cache = use_cache
  389. self.rope_theta = rope_theta
  390. self.rope_scaling = rope_scaling
  391. self.attention_bias = attention_bias
  392. self.attention_dropout = attention_dropout
  393. self.mlp_bias = mlp_bias
  394. self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
  395. # Validate the correctness of rotary position embeddings parameters
  396. # BC: if there is a 'type' field, copy it it to 'rope_type'.
  397. if self.rope_scaling is not None and "type" in self.rope_scaling:
  398. self.rope_scaling["rope_type"] = self.rope_scaling["type"]
  399. rope_config_validation(self)
# Names exported by this module on `from <module> import *`.
__all__ = [
    "CsmDepthDecoderConfig",
    "CsmConfig",
]