processing_aria.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  2. # This file was automatically generated from src/transformers/models/aria/modular_aria.py.
  3. # Do NOT edit this file manually as any edits will be overwritten by the generation of
  4. # the file from the modular. If any change should be done, please apply the change to the
  5. # modular_aria.py file directly. One of our CI enforces this.
  6. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  7. # coding=utf-8
  8. # Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved.
  9. #
  10. # Licensed under the Apache License, Version 2.0 (the "License");
  11. # you may not use this file except in compliance with the License.
  12. # You may obtain a copy of the License at
  13. #
  14. # http://www.apache.org/licenses/LICENSE-2.0
  15. #
  16. # Unless required by applicable law or agreed to in writing, software
  17. # distributed under the License is distributed on an "AS IS" BASIS,
  18. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19. # See the License for the specific language governing permissions and
  20. # limitations under the License.
  21. from typing import Optional, Union
  22. import numpy as np
  23. from ...image_processing_utils import BatchFeature
  24. from ...image_utils import ImageInput
  25. from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
  26. from ...tokenization_utils import PreTokenizedInput, TextInput
  27. from ...utils import TensorType
  28. from ..auto import AutoTokenizer
  29. class AriaProcessorKwargs(ProcessingKwargs, total=False):
  30. _defaults = {
  31. "text_kwargs": {
  32. "padding": False,
  33. "return_mm_token_type_ids": False,
  34. },
  35. "images_kwargs": {
  36. "max_image_size": 980,
  37. "split_image": False,
  38. },
  39. "return_tensors": TensorType.PYTORCH,
  40. }
  41. class AriaProcessor(ProcessorMixin):
  42. """
  43. AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer.
  44. Args:
  45. image_processor (`AriaImageProcessor`, *optional*):
  46. The AriaImageProcessor to use for image preprocessing.
  47. tokenizer (`PreTrainedTokenizerBase`, *optional*):
  48. An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
  49. chat_template (`str`, *optional*):
  50. A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
  51. size_conversion (`Dict`, *optional*):
  52. A dictionary indicating size conversions for images.
  53. """
  54. attributes = ["image_processor", "tokenizer"]
  55. image_processor_class = "AriaImageProcessor"
  56. tokenizer_class = "AutoTokenizer"
  57. def __init__(
  58. self,
  59. image_processor=None,
  60. tokenizer: Union[AutoTokenizer, str] = None,
  61. chat_template: Optional[str] = None,
  62. size_conversion: Optional[dict[Union[float, int], int]] = None,
  63. ):
  64. if size_conversion is None:
  65. size_conversion = {490: 128, 980: 256}
  66. self.size_conversion = {int(k): v for k, v in size_conversion.items()}
  67. self.image_token = tokenizer.image_token
  68. self.image_token_id = tokenizer.image_token_id
  69. if tokenizer is not None and tokenizer.pad_token is None:
  70. tokenizer.pad_token = tokenizer.unk_token
  71. super().__init__(image_processor, tokenizer, chat_template=chat_template)
  72. def __call__(
  73. self,
  74. text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
  75. images: Optional[ImageInput] = None,
  76. audio=None,
  77. videos=None,
  78. **kwargs: Unpack[AriaProcessorKwargs],
  79. ) -> BatchFeature:
  80. """
  81. Main method to prepare for the model one or several sequences(s) and image(s).
  82. Args:
  83. text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
  84. The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
  85. (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
  86. `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
  87. images (`ImageInput`):
  88. The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
  89. tensor. Both channels-first and channels-last formats are supported.
  90. Returns:
  91. [`BatchFeature`]: A [`BatchFeature`] with the following fields:
  92. - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
  93. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
  94. `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
  95. `None`).
  96. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
  97. - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
  98. """
  99. output_kwargs = self._merge_kwargs(
  100. AriaProcessorKwargs,
  101. tokenizer_init_kwargs=self.tokenizer.init_kwargs,
  102. **kwargs,
  103. )
  104. if isinstance(text, str):
  105. text = [text]
  106. elif not isinstance(text, list) and not isinstance(text[0], str):
  107. raise TypeError("Invalid input text. Please provide a string, or a list of strings")
  108. if images is not None:
  109. image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
  110. # expand the image_token according to the num_crops and tokens per image
  111. tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]]
  112. prompt_strings = []
  113. num_crops = image_inputs.pop("num_crops") * tokens_per_image
  114. for sample in text:
  115. sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops)
  116. prompt_strings.append(sample)
  117. else:
  118. image_inputs = {}
  119. prompt_strings = text
  120. return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
  121. return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
  122. text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
  123. self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
  124. if return_mm_token_type_ids:
  125. array_ids = np.array(text_inputs["input_ids"])
  126. mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
  127. mm_token_type_ids[array_ids == self.image_token_id] = 1
  128. text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
  129. return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
  130. def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
  131. """
  132. Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
  133. Args:
  134. image_sizes (`list[list[int]]`, *optional*):
  135. The input sizes formatted as (height, width) per each image.
  136. Returns:
  137. `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
  138. input modalities, along with other useful data.
  139. """
  140. vision_data = {}
  141. if image_sizes is not None:
  142. images_kwargs = AriaProcessorKwargs._defaults.get("images_kwargs", {})
  143. images_kwargs.update(kwargs)
  144. max_size = images_kwargs.get("max_image_size", None) or self.image_processor.max_image_size
  145. num_image_patches = [
  146. self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
  147. for image_size in image_sizes
  148. ]
  149. num_image_tokens = [self.size_conversion[max_size] * num_patches for num_patches in num_image_patches]
  150. vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
  151. return MultiModalData(**vision_data)
  152. @property
  153. def model_input_names(self):
  154. tokenizer_input_names = self.tokenizer.model_input_names
  155. image_processor_input_names = self.image_processor.model_input_names
  156. # Remove `num_crops`, it is popped and used only when processing. Make a copy of list when removing
  157. # otherwise `self.image_processor.model_input_names` is also modified
  158. image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
  159. return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
# Public API of this module: only the processor class is exported.
__all__ = ["AriaProcessor"]