processing_clap.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. # coding=utf-8
  2. # Copyright 2023 The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """
  16. Audio/Text processor class for CLAP
  17. """
  18. from typing import Optional, Union
  19. from ...audio_utils import AudioInput
  20. from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
  21. from ...tokenization_utils_base import PreTokenizedInput, TextInput
  22. from ...utils import logging
  23. from ...utils.deprecation import deprecate_kwarg
# Module-level logger; `ClapProcessor.__call__` uses it to emit the `audios` deprecation warning.
logger = logging.get_logger(__name__)
  25. class ClapProcessor(ProcessorMixin):
  26. r"""
  27. Constructs a CLAP processor which wraps a CLAP feature extractor and a RoBerta tokenizer into a single processor.
  28. [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. See the
  29. [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information.
  30. Args:
  31. feature_extractor ([`ClapFeatureExtractor`]):
  32. The audio processor is a required input.
  33. tokenizer ([`RobertaTokenizerFast`]):
  34. The tokenizer is a required input.
  35. """
  36. feature_extractor_class = "ClapFeatureExtractor"
  37. tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")
  38. def __init__(self, feature_extractor, tokenizer):
  39. super().__init__(feature_extractor, tokenizer)
  40. @deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
  41. def __call__(
  42. self,
  43. text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
  44. audios: Optional[AudioInput] = None,
  45. audio: Optional[AudioInput] = None,
  46. **kwargs: Unpack[ProcessingKwargs],
  47. ):
  48. """
  49. Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text`
  50. argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more
  51. information.
  52. """
  53. # The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check
  54. # again that the correct naming is used
  55. if audios is not None and audio is None:
  56. logger.warning(
  57. "Using `audios` keyword argument is deprecated when calling ClapProcessor, instead use `audio`."
  58. )
  59. audio = audios
  60. return super().__call__(text=text, audio=audio, **kwargs)
# Public API of this module: `ClapProcessor` is the only exported name.
__all__ = ["ClapProcessor"]