functional.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # Modified from librosa(https://github.com/librosa/librosa)
  15. import math
  16. from typing import Optional, Union
  17. import paddle
  18. from paddle import Tensor
  19. from paddle.base.framework import Variable
  20. from paddle.pir import Value
  21. def hz_to_mel(
  22. freq: Union[Tensor, Value, Variable, float], htk: bool = False
  23. ) -> Union[Tensor, Value, Variable, float]:
  24. """Convert Hz to Mels.
  25. Args:
  26. freq (Union[Tensor, float]): The input tensor with arbitrary shape.
  27. htk (bool, optional): Use htk scaling. Defaults to False.
  28. Returns:
  29. Union[Tensor, float]: Frequency in mels.
  30. Examples:
  31. .. code-block:: python
  32. >>> import paddle
  33. >>> val = 3.0
  34. >>> htk_flag = True
  35. >>> mel_paddle_tensor = paddle.audio.functional.hz_to_mel(
  36. ... paddle.to_tensor(val), htk_flag)
  37. """
  38. if htk:
  39. if isinstance(freq, (Tensor, Variable, Value)):
  40. return 2595.0 * paddle.log10(1.0 + freq / 700.0)
  41. else:
  42. return 2595.0 * math.log10(1.0 + freq / 700.0)
  43. # Fill in the linear part
  44. f_min = 0.0
  45. f_sp = 200.0 / 3
  46. mels = (freq - f_min) / f_sp
  47. # Fill in the log-scale part
  48. min_log_hz = 1000.0 # beginning of log region (Hz)
  49. min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
  50. logstep = math.log(6.4) / 27.0 # step size for log region
  51. if isinstance(freq, (Tensor, Variable, Value)):
  52. target = (
  53. min_log_mel + paddle.log(freq / min_log_hz + 1e-10) / logstep
  54. ) # prevent nan with 1e-10
  55. mask = (freq > min_log_hz).astype(freq.dtype)
  56. mels = target * mask + mels * (
  57. 1 - mask
  58. ) # will replace by masked_fill OP in future
  59. else:
  60. if freq >= min_log_hz:
  61. mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
  62. return mels
  63. def mel_to_hz(
  64. mel: Union[float, Tensor, Variable, Value], htk: bool = False
  65. ) -> Union[float, Tensor, Variable, Value]:
  66. """Convert mel bin numbers to frequencies.
  67. Args:
  68. mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape.
  69. htk (bool, optional): Use htk scaling. Defaults to False.
  70. Returns:
  71. Union[float, Tensor]: Frequencies in Hz.
  72. Examples:
  73. .. code-block:: python
  74. >>> import paddle
  75. >>> val = 3.0
  76. >>> htk_flag = True
  77. >>> mel_paddle_tensor = paddle.audio.functional.mel_to_hz(
  78. ... paddle.to_tensor(val), htk_flag)
  79. ...
  80. """
  81. if htk:
  82. return 700.0 * (10.0 ** (mel / 2595.0) - 1.0)
  83. f_min = 0.0
  84. f_sp = 200.0 / 3
  85. freqs = f_min + f_sp * mel
  86. # And now the nonlinear scale
  87. min_log_hz = 1000.0 # beginning of log region (Hz)
  88. min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
  89. logstep = math.log(6.4) / 27.0 # step size for log region
  90. if isinstance(mel, (Tensor, Variable, Value)):
  91. target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
  92. mask = (mel > min_log_mel).astype(mel.dtype)
  93. freqs = target * mask + freqs * (
  94. 1 - mask
  95. ) # will replace by masked_fill OP in future
  96. else:
  97. if mel >= min_log_mel:
  98. freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel))
  99. return freqs
  100. def mel_frequencies(
  101. n_mels: int = 64,
  102. f_min: float = 0.0,
  103. f_max: float = 11025.0,
  104. htk: bool = False,
  105. dtype: str = 'float32',
  106. ) -> Union[Tensor, Variable, Value]:
  107. """Compute mel frequencies.
  108. Args:
  109. n_mels (int, optional): Number of mel bins. Defaults to 64.
  110. f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
  111. fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
  112. htk (bool, optional): Use htk scaling. Defaults to False.
  113. dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
  114. Returns:
  115. Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
  116. Examples:
  117. .. code-block:: python
  118. >>> import paddle
  119. >>> n_mels = 64
  120. >>> f_min = 0.5
  121. >>> f_max = 10000
  122. >>> htk_flag = True
  123. >>> paddle_mel_freq = paddle.audio.functional.mel_frequencies(
  124. ... n_mels, f_min, f_max, htk_flag, 'float64')
  125. """
  126. # 'Center freqs' of mel bands - uniformly spaced between limits
  127. min_mel = hz_to_mel(f_min, htk=htk)
  128. max_mel = hz_to_mel(f_max, htk=htk)
  129. mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype)
  130. freqs = mel_to_hz(mels, htk=htk)
  131. return freqs
  132. def fft_frequencies(sr: int, n_fft: int, dtype: str = 'float32') -> Tensor:
  133. """Compute fourier frequencies.
  134. Args:
  135. sr (int): Sample rate.
  136. n_fft (int): Number of fft bins.
  137. dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
  138. Returns:
  139. Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
  140. Examples:
  141. .. code-block:: python
  142. >>> import paddle
  143. >>> sr = 16000
  144. >>> n_fft = 128
  145. >>> fft_freq = paddle.audio.functional.fft_frequencies(sr, n_fft)
  146. """
  147. return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
  148. def compute_fbank_matrix(
  149. sr: int,
  150. n_fft: int,
  151. n_mels: int = 64,
  152. f_min: float = 0.0,
  153. f_max: Optional[float] = None,
  154. htk: bool = False,
  155. norm: Union[str, float] = 'slaney',
  156. dtype: str = 'float32',
  157. ) -> Tensor:
  158. """Compute fbank matrix.
  159. Args:
  160. sr (int): Sample rate.
  161. n_fft (int): Number of fft bins.
  162. n_mels (int, optional): Number of mel bins. Defaults to 64.
  163. f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
  164. f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
  165. htk (bool, optional): Use htk scaling. Defaults to False.
  166. norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'.
  167. dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
  168. Returns:
  169. Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
  170. Examples:
  171. .. code-block:: python
  172. >>> import paddle
  173. >>> sr = 23
  174. >>> n_fft = 51
  175. >>> fbank = paddle.audio.functional.compute_fbank_matrix(sr, n_fft)
  176. """
  177. if f_max is None:
  178. f_max = float(sr) / 2
  179. # Initialize the weights
  180. weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
  181. # Center freqs of each FFT bin
  182. fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
  183. # 'Center freqs' of mel bands - uniformly spaced between limits
  184. mel_f = mel_frequencies(
  185. n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype
  186. )
  187. fdiff = mel_f[1:] - mel_f[:-1] # np.diff(mel_f)
  188. ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
  189. # ramps = np.subtract.outer(mel_f, fftfreqs)
  190. for i in range(n_mels):
  191. # lower and upper slopes for all bins
  192. lower = -ramps[i] / fdiff[i]
  193. upper = ramps[i + 2] / fdiff[i + 1]
  194. # .. then intersect them with each other and zero
  195. weights[i] = paddle.maximum(
  196. paddle.zeros_like(lower), paddle.minimum(lower, upper)
  197. )
  198. # Slaney-style mel is scaled to be approx constant energy per channel
  199. if norm == 'slaney':
  200. enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
  201. weights *= enorm.unsqueeze(1)
  202. elif isinstance(norm, (int, float)):
  203. weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
  204. return weights
  205. def power_to_db(
  206. spect: Union[Tensor, Variable, Value],
  207. ref_value: float = 1.0,
  208. amin: float = 1e-10,
  209. top_db: Optional[float] = 80.0,
  210. ) -> Union[Tensor, Variable, Value]:
  211. """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.
  212. Args:
  213. spect (Tensor): STFT power spectrogram.
  214. ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
  215. amin (float, optional): Minimum threshold. Defaults to 1e-10.
  216. top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
  217. Returns:
  218. Tensor: Power spectrogram in db scale.
  219. Examples:
  220. .. code-block:: python
  221. >>> import paddle
  222. >>> val = 3.0
  223. >>> decibel_paddle = paddle.audio.functional.power_to_db(
  224. ... paddle.to_tensor(val))
  225. """
  226. if amin <= 0:
  227. raise Exception("amin must be strictly positive")
  228. if ref_value <= 0:
  229. raise Exception("ref_value must be strictly positive")
  230. ones = paddle.ones_like(spect)
  231. log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect))
  232. log_spec -= 10.0 * math.log10(max(ref_value, amin))
  233. if top_db is not None:
  234. if top_db < 0:
  235. raise Exception("top_db must be non-negative")
  236. log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
  237. return log_spec
  238. def create_dct(
  239. n_mfcc: int,
  240. n_mels: int,
  241. norm: Optional[str] = 'ortho',
  242. dtype: str = 'float32',
  243. ) -> Tensor:
  244. """Create a discrete cosine transform(DCT) matrix.
  245. Args:
  246. n_mfcc (int): Number of mel frequency cepstral coefficients.
  247. n_mels (int): Number of mel filterbanks.
  248. norm (Optional[str], optional): Normalization type. Defaults to 'ortho'.
  249. dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
  250. Returns:
  251. Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
  252. Examples:
  253. .. code-block:: python
  254. >>> import paddle
  255. >>> n_mfcc = 23
  256. >>> n_mels = 257
  257. >>> dct = paddle.audio.functional.create_dct(n_mfcc, n_mels)
  258. """
  259. n = paddle.arange(n_mels, dtype=dtype)
  260. k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
  261. dct = paddle.cos(
  262. math.pi / float(n_mels) * (n + 0.5) * k
  263. ) # size (n_mfcc, n_mels)
  264. if norm is None:
  265. dct *= 2.0
  266. else:
  267. assert norm == "ortho"
  268. dct[0] *= 1.0 / math.sqrt(2.0)
  269. dct *= math.sqrt(2.0 / float(n_mels))
  270. return dct.T