configuration_parakeet.py 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. # coding=utf-8
  2. # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Parakeet model configuration."""
  16. from typing import Union
  17. from ...configuration_utils import PretrainedConfig
  18. from ...utils import logging
  19. logger = logging.get_logger(__name__)
  20. class ParakeetEncoderConfig(PretrainedConfig):
  21. r"""
  22. This is the configuration class to store the configuration of a [`ParakeetEncoder`]. It is used to instantiate a
  23. `ParakeetEncoder` model according to the specified arguments, defining the model architecture.
  24. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  25. documentation from [`PretrainedConfig`] for more information.
  26. Args:
  27. hidden_size (`int`, *optional*, defaults to 1024):
  28. Dimension of the layers and the hidden states.
  29. num_hidden_layers (`int`, *optional*, defaults to 24):
  30. Number of hidden layers in the Transformer encoder.
  31. num_attention_heads (`int`, *optional*, defaults to 8):
  32. Number of attention heads for each attention layer in the Transformer encoder.
  33. intermediate_size (`int`, *optional*, defaults to 4096):
  34. Dimension of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
  35. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  36. The non-linear activation function (function or string) in the encoder and pooler.
  37. attention_bias (`bool`, *optional*, defaults to `True`):
  38. Whether to use bias in the attention layers.
  39. conv_kernel_size (`int`, *optional*, defaults to 9):
  40. The kernel size of the convolution layers in the Conformer block.
  41. subsampling_factor (`int`, *optional*, defaults to 8):
  42. The factor by which the input sequence is subsampled.
  43. subsampling_conv_channels (`int`, *optional*, defaults to 256):
  44. The number of channels in the subsampling convolution layers.
  45. num_mel_bins (`int`, *optional*, defaults to 80):
  46. Number of mel features.
  47. subsampling_conv_kernel_size (`int`, *optional*, defaults to 3):
  48. The kernel size of the subsampling convolution layers.
  49. subsampling_conv_stride (`int`, *optional*, defaults to 2):
  50. The stride of the subsampling convolution layers.
  51. dropout (`float`, *optional*, defaults to 0.1):
  52. The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler.
  53. dropout_positions (`float`, *optional*, defaults to 0.0):
  54. The dropout ratio for the positions in the input sequence.
  55. layerdrop (`float`, *optional*, defaults to 0.1):
  56. The dropout ratio for the layers in the encoder.
  57. activation_dropout (`float`, *optional*, defaults to 0.1):
  58. The dropout ratio for activations inside the fully connected layer.
  59. attention_dropout (`float`, *optional*, defaults to 0.1):
  60. The dropout ratio for the attention layers.
  61. max_position_embeddings (`int`, *optional*, defaults to 5000):
  62. The maximum sequence length that this model might ever be used with.
  63. scale_input (`bool`, *optional*, defaults to `True`):
  64. Whether to scale the input embeddings.
  65. initializer_range (`float`, *optional*, defaults to 0.02):
  66. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  67. Example:
  68. ```python
  69. >>> from transformers import ParakeetEncoderModel, ParakeetEncoderConfig
  70. >>> # Initializing a `ParakeetEncoder` configuration
  71. >>> configuration = ParakeetEncoderConfig()
  72. >>> # Initializing a model from the configuration
  73. >>> model = ParakeetEncoderModel(configuration)
  74. >>> # Accessing the model configuration
  75. >>> configuration = model.config
  76. ```
  77. This configuration class is based on the ParakeetEncoder architecture from NVIDIA NeMo. You can find more details
  78. and pre-trained models at [nvidia/parakeet-ctc-1.1b](https://huggingface.co/nvidia/parakeet-ctc-1.1b).
  79. """
  80. model_type = "parakeet_encoder"
  81. keys_to_ignore_at_inference = ["past_key_values"]
  82. def __init__(
  83. self,
  84. hidden_size=1024,
  85. num_hidden_layers=24,
  86. num_attention_heads=8,
  87. intermediate_size=4096,
  88. hidden_act="silu",
  89. attention_bias=True,
  90. conv_kernel_size=9,
  91. subsampling_factor=8,
  92. subsampling_conv_channels=256,
  93. num_mel_bins=80,
  94. subsampling_conv_kernel_size=3,
  95. subsampling_conv_stride=2,
  96. dropout=0.1,
  97. dropout_positions=0.0,
  98. layerdrop=0.1,
  99. activation_dropout=0.1,
  100. attention_dropout=0.1,
  101. max_position_embeddings=5000,
  102. scale_input=True,
  103. initializer_range=0.02,
  104. **kwargs,
  105. ):
  106. super().__init__(
  107. **kwargs,
  108. )
  109. self.hidden_size = hidden_size
  110. self.num_hidden_layers = num_hidden_layers
  111. self.num_attention_heads = num_attention_heads
  112. self.num_key_value_heads = num_attention_heads # LlamaAttention compatibility
  113. self.intermediate_size = intermediate_size
  114. self.hidden_act = hidden_act
  115. self.attention_bias = attention_bias
  116. if (conv_kernel_size - 1) % 2 != 0:
  117. raise ValueError(f"conv_kernel_size must be odd, got {conv_kernel_size}")
  118. self.conv_kernel_size = conv_kernel_size
  119. self.subsampling_conv_kernel_size = subsampling_conv_kernel_size
  120. self.subsampling_conv_stride = subsampling_conv_stride
  121. self.subsampling_factor = subsampling_factor
  122. self.subsampling_conv_channels = subsampling_conv_channels
  123. self.num_mel_bins = num_mel_bins
  124. self.dropout = dropout
  125. self.dropout_positions = dropout_positions
  126. self.layerdrop = layerdrop
  127. self.activation_dropout = activation_dropout
  128. self.attention_dropout = attention_dropout
  129. self.max_position_embeddings = max_position_embeddings
  130. self.scale_input = scale_input
  131. self.initializer_range = initializer_range
  132. class ParakeetCTCConfig(PretrainedConfig):
  133. r"""
  134. This is the configuration class to store the configuration of a [`ParakeetForCTC`]. It is used to instantiate a
  135. Parakeet CTC model according to the specified arguments, defining the model architecture.
  136. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  137. documentation from [`PretrainedConfig`] for more information.
  138. Args:
  139. vocab_size (`int`, *optional*, defaults to 1025):
  140. Vocabulary size of the model.
  141. ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
  142. Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
  143. instance of [`ParakeetForCTC`].
  144. ctc_zero_infinity (`bool`, *optional*, defaults to `True`):
  145. Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
  146. occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
  147. of [`ParakeetForCTC`].
  148. encoder_config (`Union[dict, ParakeetEncoderConfig]`, *optional*):
  149. The config object or dictionary of the encoder.
  150. pad_token_id (`int`, *optional*, defaults to 1024):
  151. Padding token id. Also used as blank token id.
  152. Example:
  153. ```python
  154. >>> from transformers import ParakeetForCTC, ParakeetCTCConfig
  155. >>> # Initializing a Parakeet configuration
  156. >>> configuration = ParakeetCTCConfig()
  157. >>> # Initializing a model from the configuration
  158. >>> model = ParakeetForCTC(configuration)
  159. >>> # Accessing the model configuration
  160. >>> configuration = model.config
  161. ```
  162. This configuration class is based on the Parakeet CTC architecture from NVIDIA NeMo. You can find more details
  163. and pre-trained models at [nvidia/parakeet-ctc-1.1b](https://huggingface.co/nvidia/parakeet-ctc-1.1b).
  164. """
  165. model_type = "parakeet_ctc"
  166. sub_configs = {"encoder_config": ParakeetEncoderConfig}
  167. def __init__(
  168. self,
  169. vocab_size=1025,
  170. ctc_loss_reduction="mean",
  171. ctc_zero_infinity=True,
  172. encoder_config: Union[dict, ParakeetEncoderConfig] = None,
  173. pad_token_id=1024,
  174. **kwargs,
  175. ):
  176. self.vocab_size = vocab_size
  177. self.ctc_loss_reduction = ctc_loss_reduction
  178. self.ctc_zero_infinity = ctc_zero_infinity
  179. if isinstance(encoder_config, dict):
  180. self.encoder_config = ParakeetEncoderConfig(**encoder_config)
  181. elif encoder_config is None:
  182. self.encoder_config = ParakeetEncoderConfig()
  183. self.encoder_config = self.encoder_config
  184. self.initializer_range = self.encoder_config.initializer_range
  185. super().__init__(
  186. pad_token_id=pad_token_id,
  187. **kwargs,
  188. )
  189. @classmethod
  190. def from_encoder_config(cls, encoder_config: ParakeetEncoderConfig, **kwargs):
  191. r"""
  192. Instantiate a [`ParakeetCTCConfig`] (or a derived class) from parakeet encoder model configuration.
  193. Returns:
  194. [`ParakeetCTCConfig`]: An instance of a configuration object
  195. """
  196. return cls(encoder_config=encoder_config.to_dict(), **kwargs)
  197. __all__ = ["ParakeetCTCConfig", "ParakeetEncoderConfig"]