feature_extraction_sequence_utils.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. # Copyright 2021 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. Sequence feature extraction class for common feature extractors to preprocess sequences.
  16. """
  17. from typing import Optional, Union
  18. import numpy as np
  19. from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
  20. from .utils import PaddingStrategy, TensorType, is_tf_tensor, is_torch_tensor, logging, to_numpy
  21. logger = logging.get_logger(__name__)
  22. class SequenceFeatureExtractor(FeatureExtractionMixin):
  23. """
  24. This is a general feature extraction class for speech recognition.
  25. Args:
  26. feature_size (`int`):
  27. The feature dimension of the extracted features.
  28. sampling_rate (`int`):
  29. The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
  30. padding_value (`float`):
  31. The value that is used to fill the padding values / vectors.
  32. """
  33. def __init__(self, feature_size: int, sampling_rate: int, padding_value: float, **kwargs):
  34. self.feature_size = feature_size
  35. self.sampling_rate = sampling_rate
  36. self.padding_value = padding_value
  37. self.padding_side = kwargs.pop("padding_side", "right")
  38. self.return_attention_mask = kwargs.pop("return_attention_mask", True)
  39. super().__init__(**kwargs)
  40. def pad(
  41. self,
  42. processed_features: Union[
  43. BatchFeature,
  44. list[BatchFeature],
  45. dict[str, BatchFeature],
  46. dict[str, list[BatchFeature]],
  47. list[dict[str, BatchFeature]],
  48. ],
  49. padding: Union[bool, str, PaddingStrategy] = True,
  50. max_length: Optional[int] = None,
  51. truncation: bool = False,
  52. pad_to_multiple_of: Optional[int] = None,
  53. return_attention_mask: Optional[bool] = None,
  54. return_tensors: Optional[Union[str, TensorType]] = None,
  55. ) -> BatchFeature:
  56. """
  57. Pad input values / input vectors or a batch of input values / input vectors up to predefined length or to the
  58. max sequence length in the batch.
  59. Padding side (left/right) padding values are defined at the feature extractor level (with `self.padding_side`,
  60. `self.padding_value`)
  61. <Tip>
  62. If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
  63. result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
  64. PyTorch tensors, you will lose the specific device of your tensors however.
  65. </Tip>
  66. Args:
  67. processed_features ([`BatchFeature`], list of [`BatchFeature`], `dict[str, list[float]]`, `dict[str, list[list[float]]` or `list[dict[str, list[float]]]`):
  68. Processed inputs. Can represent one input ([`BatchFeature`] or `dict[str, list[float]]`) or a batch of
  69. input values / vectors (list of [`BatchFeature`], *dict[str, list[list[float]]]* or *list[dict[str,
  70. list[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
  71. collate function.
  72. Instead of `list[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
  73. see the note above for the return type.
  74. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
  75. Select a strategy to pad the returned sequences (according to the model's padding side and padding
  76. index) among:
  77. - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  78. sequence if provided).
  79. - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  80. acceptable input length for the model if that argument is not provided.
  81. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
  82. lengths).
  83. max_length (`int`, *optional*):
  84. Maximum length of the returned list and optionally padding length (see above).
  85. truncation (`bool`):
  86. Activates truncation to cut input sequences longer than `max_length` to `max_length`.
  87. pad_to_multiple_of (`int`, *optional*):
  88. If set will pad the sequence to a multiple of the provided value.
  89. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
  90. `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
  91. return_attention_mask (`bool`, *optional*):
  92. Whether to return the attention mask. If left to the default, will return the attention mask according
  93. to the specific feature_extractor's default.
  94. [What are attention masks?](../glossary#attention-mask)
  95. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  96. If set, will return tensors instead of list of python integers. Acceptable values are:
  97. - `'tf'`: Return TensorFlow `tf.constant` objects.
  98. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  99. - `'np'`: Return Numpy `np.ndarray` objects.
  100. """
  101. # If we have a list of dicts, let's convert it in a dict of lists
  102. # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
  103. if isinstance(processed_features, (list, tuple)) and isinstance(processed_features[0], (dict, BatchFeature)):
  104. processed_features = {
  105. key: [example[key] for example in processed_features] for key in processed_features[0]
  106. }
  107. # The model's main input name, usually `input_values`, has be passed for padding
  108. if self.model_input_names[0] not in processed_features:
  109. raise ValueError(
  110. "You should supply an instance of `transformers.BatchFeature` or list of `transformers.BatchFeature`"
  111. f" to this method that includes {self.model_input_names[0]}, but you provided"
  112. f" {list(processed_features.keys())}"
  113. )
  114. required_input = processed_features[self.model_input_names[0]]
  115. return_attention_mask = (
  116. return_attention_mask if return_attention_mask is not None else self.return_attention_mask
  117. )
  118. if len(required_input) == 0:
  119. if return_attention_mask:
  120. processed_features["attention_mask"] = []
  121. return processed_features
  122. # If we have PyTorch/TF tensors or lists as inputs, we cast them as Numpy arrays
  123. # and rebuild them afterwards if no return_tensors is specified
  124. # Note that we lose the specific device the tensor may be on for PyTorch
  125. first_element = required_input[0]
  126. if isinstance(first_element, (list, tuple)):
  127. # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
  128. index = 0
  129. while len(required_input[index]) == 0:
  130. index += 1
  131. if index < len(required_input):
  132. first_element = required_input[index][0]
  133. if return_tensors is None:
  134. if is_tf_tensor(first_element):
  135. return_tensors = "tf"
  136. elif is_torch_tensor(first_element):
  137. return_tensors = "pt"
  138. elif isinstance(first_element, (int, float, list, tuple, np.ndarray)):
  139. return_tensors = "np"
  140. else:
  141. raise ValueError(
  142. f"type of {first_element} unknown: {type(first_element)}. "
  143. "Should be one of a python, numpy, pytorch or tensorflow object."
  144. )
  145. for key, value in processed_features.items():
  146. if isinstance(value[0], (int, float)):
  147. processed_features[key] = to_numpy(value)
  148. else:
  149. processed_features[key] = [to_numpy(v) for v in value]
  150. # Convert padding_strategy in PaddingStrategy
  151. padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length)
  152. required_input = processed_features[self.model_input_names[0]]
  153. batch_size = len(required_input)
  154. if not all(len(v) == batch_size for v in processed_features.values()):
  155. raise ValueError("Some items in the output dictionary have a different batch size than others.")
  156. truncated_inputs = []
  157. for i in range(batch_size):
  158. inputs = {k: v[i] for k, v in processed_features.items()}
  159. # truncation
  160. inputs_slice = self._truncate(
  161. inputs,
  162. max_length=max_length,
  163. pad_to_multiple_of=pad_to_multiple_of,
  164. truncation=truncation,
  165. )
  166. truncated_inputs.append(inputs_slice)
  167. if padding_strategy == PaddingStrategy.LONGEST:
  168. # make sure that `max_length` cannot be longer than the longest truncated length
  169. max_length = max(len(input_slice[self.model_input_names[0]]) for input_slice in truncated_inputs)
  170. padding_strategy = PaddingStrategy.MAX_LENGTH
  171. batch_outputs = {}
  172. for i in range(batch_size):
  173. # padding
  174. outputs = self._pad(
  175. truncated_inputs[i],
  176. max_length=max_length,
  177. padding_strategy=padding_strategy,
  178. pad_to_multiple_of=pad_to_multiple_of,
  179. return_attention_mask=return_attention_mask,
  180. )
  181. for key, value in outputs.items():
  182. if key not in batch_outputs:
  183. batch_outputs[key] = []
  184. if value.dtype is np.dtype(np.float64):
  185. value = value.astype(np.float32)
  186. batch_outputs[key].append(value)
  187. return BatchFeature(batch_outputs, tensor_type=return_tensors)
  188. def _pad(
  189. self,
  190. processed_features: Union[dict[str, np.ndarray], BatchFeature],
  191. max_length: Optional[int] = None,
  192. padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
  193. pad_to_multiple_of: Optional[int] = None,
  194. return_attention_mask: Optional[bool] = None,
  195. ) -> dict:
  196. """
  197. Pad inputs (on left/right and up to predefined length or max length in the batch)
  198. Args:
  199. processed_features (`Union[dict[str, np.ndarray], BatchFeature]`):
  200. Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
  201. of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
  202. max_length (`int`, *optional*):
  203. Maximum length of the returned list and optionally padding length (see below)
  204. padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`):
  205. PaddingStrategy to use for padding.
  206. - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
  207. - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
  208. - PaddingStrategy.DO_NOT_PAD: Do not pad
  209. The feature_extractor padding sides are defined in self.padding_side:
  210. - 'left': pads on the left of the sequences
  211. - 'right': pads on the right of the sequences
  212. pad_to_multiple_of (`int`, *optional*):
  213. Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
  214. enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
  215. which benefit from having sequence lengths be a multiple of 128.
  216. return_attention_mask (`bool`, *optional*):
  217. Set to False to avoid returning attention mask (default: set to model specifics)
  218. """
  219. required_input = processed_features[self.model_input_names[0]]
  220. if padding_strategy == PaddingStrategy.LONGEST:
  221. max_length = len(required_input)
  222. if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
  223. max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
  224. needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) < max_length
  225. if return_attention_mask and "attention_mask" not in processed_features:
  226. processed_features["attention_mask"] = np.ones(len(required_input), dtype=np.int32)
  227. if needs_to_be_padded:
  228. difference = max_length - len(required_input)
  229. if self.padding_side == "right":
  230. if return_attention_mask:
  231. processed_features["attention_mask"] = np.pad(
  232. processed_features["attention_mask"], (0, difference)
  233. )
  234. padding_shape = ((0, difference), (0, 0)) if self.feature_size > 1 else (0, difference)
  235. processed_features[self.model_input_names[0]] = np.pad(
  236. required_input, padding_shape, "constant", constant_values=self.padding_value
  237. )
  238. elif self.padding_side == "left":
  239. if return_attention_mask:
  240. processed_features["attention_mask"] = np.pad(
  241. processed_features["attention_mask"], (difference, 0)
  242. )
  243. padding_shape = ((difference, 0), (0, 0)) if self.feature_size > 1 else (difference, 0)
  244. processed_features[self.model_input_names[0]] = np.pad(
  245. required_input, padding_shape, "constant", constant_values=self.padding_value
  246. )
  247. else:
  248. raise ValueError("Invalid padding strategy:" + str(self.padding_side))
  249. return processed_features
  250. def _truncate(
  251. self,
  252. processed_features: Union[dict[str, np.ndarray], BatchFeature],
  253. max_length: Optional[int] = None,
  254. pad_to_multiple_of: Optional[int] = None,
  255. truncation: Optional[bool] = None,
  256. ):
  257. """
  258. Truncate inputs to predefined length or max length in the batch
  259. Args:
  260. processed_features(`Union[dict[str, np.ndarray], BatchFeature]`):
  261. Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
  262. of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
  263. max_length (`int`, *optional*):
  264. maximum length of the returned list and optionally padding length (see below)
  265. pad_to_multiple_of (`int`, *optional*) :
  266. Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
  267. enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
  268. which benefit from having sequence lengths be a multiple of 128.
  269. truncation (`bool`, *optional*):
  270. Activates truncation to cut input sequences longer than `max_length` to `max_length`.
  271. """
  272. if not truncation:
  273. return processed_features
  274. elif truncation and max_length is None:
  275. raise ValueError("When setting ``truncation=True``, make sure that ``max_length`` is defined.")
  276. required_input = processed_features[self.model_input_names[0]]
  277. # find `max_length` that fits `pad_to_multiple_of`
  278. if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
  279. max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
  280. needs_to_be_truncated = len(required_input) > max_length
  281. if needs_to_be_truncated:
  282. processed_features[self.model_input_names[0]] = processed_features[self.model_input_names[0]][:max_length]
  283. if "attention_mask" in processed_features:
  284. processed_features["attention_mask"] = processed_features["attention_mask"][:max_length]
  285. return processed_features
  286. def _get_padding_strategies(self, padding=False, max_length=None):
  287. """
  288. Find the correct padding strategy
  289. """
  290. # Get padding strategy
  291. if padding is not False:
  292. if padding is True:
  293. padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
  294. elif not isinstance(padding, PaddingStrategy):
  295. padding_strategy = PaddingStrategy(padding)
  296. elif isinstance(padding, PaddingStrategy):
  297. padding_strategy = padding
  298. else:
  299. padding_strategy = PaddingStrategy.DO_NOT_PAD
  300. # Set max length if needed
  301. if max_length is None:
  302. if padding_strategy == PaddingStrategy.MAX_LENGTH:
  303. raise ValueError(
  304. f"When setting ``padding={PaddingStrategy.MAX_LENGTH}``, make sure that max_length is defined"
  305. )
  306. # Test if we have a padding value
  307. if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None):
  308. raise ValueError(
  309. "Asking to pad but the feature_extractor does not have a padding value. Please select a value to use"
  310. " as `padding_value`. For example: `feature_extractor.padding_value = 0.0`."
  311. )
  312. return padding_strategy