configuration_xcodec.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. # coding=utf-8
  2. # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Xcodec model configuration"""
  16. import math
  17. from typing import Optional, Union
  18. import numpy as np
  19. from transformers import AutoConfig, DacConfig, HubertConfig, WavLMConfig
  20. from ...configuration_utils import PretrainedConfig
  21. from ...utils import logging
  22. logger = logging.get_logger(__name__)
  23. class XcodecConfig(PretrainedConfig):
  24. r"""
  25. This is the configuration class to store the configuration of an [`XcodecModel`]. It is used to instantiate a
  26. Xcodec model according to the specified arguments, defining the model architecture. Instantiating a configuration
  27. with the defaults will yield a similar configuration to that of the
  28. [Manel/X-Codec](https://huggingface.co/Manel/X-Codec) architecture.
  29. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  30. documentation from [`PretrainedConfig`] for more information.
  31. Args:
  32. target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2, 4]`):
  33. The range of different bandwidths (in kbps) the model can encode audio with.
  34. sample_rate (`int`, *optional*, defaults to 16000):
  35. The sampling rate at which the audio waveform should be digitalized, in hertz (Hz).
  36. kernel_size (`int`, *optional*, defaults to 3):
  37. Kernel size for the initial semantic convolution.
  38. channel_ratios (`List[float]`, *optional*, defaults to `[1, 1]`):
  39. Expansion factors for the number of output channels in each semantic block.
  40. strides (`List[int]`, *optional*, defaults to `[1, 1]`):
  41. Strides for each semantic encoder block.
  42. block_dilations (`List[int]`, *optional*, defaults to `[1, 1]`):
  43. Dilation factors for the residual units in semantic blocks.
  44. unit_kernel_size (`int`, *optional*, defaults to 3):
  45. Kernel size inside each ResidualUnit in semantic blocks.
  46. codebook_size (`int`, *optional*, defaults to 1024):
  47. Number of entries in each residual quantizer's codebook.
  48. codebook_dim (`int`, *optional*):
  49. Dimensionality of each codebook vector. Defaults to sum of hidden size of acoustic and semantic models.
  50. initializer_range (`float`, *optional*, defaults to 0.02):
  51. Standard deviation of the truncated normal initializer for all weight matrices.
  52. acoustic_model_config (`Union[Dict, DacConfig]`, *optional*):
  53. An instance of the configuration for the acoustic (DAC) model.
  54. semantic_model_config (`Union[Dict, HubertConfig, WavLMConfig]`, *optional*):
  55. An instance of the configuration object for the semantic (HuBERT) model.
  56. Example:
  57. ```python
  58. >>> from transformers import XcodecModel, XcodecConfig
  59. >>> # Initializing configuration
  60. >>> configuration = XcodecConfig()
  61. >>> # Initializing a model (with random weights) from the configuration
  62. >>> model = XcodecModel(configuration)
  63. >>> # Accessing the model configuration
  64. >>> configuration = model.config
  65. ```"""
  66. model_type = "xcodec"
  67. sub_configs = {
  68. "acoustic_model_config": DacConfig,
  69. "semantic_model_config": AutoConfig,
  70. }
  71. def __init__(
  72. self,
  73. target_bandwidths: Optional[list[float]] = None,
  74. sample_rate: int = 16000,
  75. kernel_size: int = 3,
  76. channel_ratios: list[float] = [1, 1],
  77. strides: list[int] = [1, 1],
  78. block_dilations: list[int] = [1, 1],
  79. unit_kernel_size: int = 3,
  80. codebook_size: int = 1024,
  81. codebook_dim: Optional[int] = None,
  82. initializer_range: float = 0.02,
  83. acoustic_model_config: Union[dict, DacConfig] = None,
  84. semantic_model_config: Union[dict, HubertConfig] = None,
  85. **kwargs,
  86. ):
  87. super().__init__(**kwargs)
  88. if acoustic_model_config is None:
  89. self.acoustic_model_config = DacConfig(
  90. encoder_hidden_size=64,
  91. # NOTE: original DAC uses [2, 4, 8, 8] `downsampling ratios`, namely reverse of `upsampling_ratios`
  92. # (not sure if intentional by Xcodec but we keep it)
  93. downsampling_ratios=[8, 5, 4, 2],
  94. decoder_hidden_size=1024,
  95. upsampling_ratios=[8, 5, 4, 2],
  96. hidden_size=256,
  97. )
  98. elif isinstance(acoustic_model_config, dict):
  99. self.acoustic_model_config = DacConfig(**acoustic_model_config)
  100. elif isinstance(acoustic_model_config, DacConfig):
  101. self.acoustic_model_config = acoustic_model_config
  102. else:
  103. raise ValueError(
  104. f"acoustic_model_config must be a dict or DacConfig instance, but got {type(acoustic_model_config)}"
  105. )
  106. if semantic_model_config is None:
  107. self.semantic_model_config = HubertConfig()
  108. elif isinstance(semantic_model_config, dict):
  109. if "_name_or_path" in semantic_model_config:
  110. # If the config is a path, load it using AutoConfig
  111. self.semantic_model_config = AutoConfig.from_pretrained(semantic_model_config["_name_or_path"])
  112. else:
  113. # assume HubertConfig as probably created from scratch
  114. logger.warning(
  115. "Could not determine semantic model type from config architecture. Defaulting to `HubertConfig`."
  116. )
  117. self.semantic_model_config = HubertConfig(**semantic_model_config)
  118. elif isinstance(semantic_model_config, WavLMConfig) or isinstance(semantic_model_config, HubertConfig):
  119. self.semantic_model_config = semantic_model_config
  120. else:
  121. raise ValueError(
  122. f"semantic_model_config must be a dict, HubertConfig, or WavLMConfig instance, but got {type(semantic_model_config)}"
  123. )
  124. if target_bandwidths is None:
  125. target_bandwidths = [0.5, 1, 1.5, 2, 4]
  126. self.target_bandwidths = target_bandwidths
  127. self.sample_rate = sample_rate
  128. self.kernel_size = kernel_size
  129. self.channel_ratios = channel_ratios
  130. self.strides = strides
  131. self.block_dilations = block_dilations
  132. self.unit_kernel_size = unit_kernel_size
  133. self.codebook_size = codebook_size
  134. self.initializer_range = initializer_range
  135. if codebook_dim is None:
  136. codebook_dim = self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size
  137. self.codebook_dim = codebook_dim
  138. @property
  139. def frame_rate(self) -> int:
  140. return math.ceil(self.sample_rate / self.hop_length)
  141. @property
  142. def semantic_hidden_size(self) -> int:
  143. return self.semantic_model_config.hidden_size
  144. @property
  145. def hop_length(self) -> int:
  146. return int(np.prod(self.acoustic_model_config.downsampling_ratios))
  147. @property
  148. def codebook_nbits(self) -> int:
  149. return math.ceil(math.log2(self.codebook_size))
  150. @property
  151. def hidden_size(self) -> int:
  152. return self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size
  153. @property
  154. def num_quantizers(self) -> int:
  155. return int(1000 * self.target_bandwidths[-1] // (self.frame_rate * self.codebook_nbits))
  156. __all__ = ["XcodecConfig"]