processing_udop.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for UDOP.
"""
  18. from typing import Optional, Union
  19. from transformers import logging
  20. from ...image_processing_utils import BatchFeature
  21. from ...image_utils import ImageInput
  22. from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
  23. from ...tokenization_utils_base import PreTokenizedInput, TextInput
  24. logger = logging.get_logger(__name__)
  25. class UdopTextKwargs(TextKwargs, total=False):
  26. word_labels: Optional[Union[list[int], list[list[int]]]]
  27. boxes: Union[list[list[int]], list[list[list[int]]]]
  28. class UdopProcessorKwargs(ProcessingKwargs, total=False):
  29. text_kwargs: UdopTextKwargs
  30. _defaults = {
  31. "text_kwargs": {
  32. "add_special_tokens": True,
  33. "padding": False,
  34. "truncation": False,
  35. "stride": 0,
  36. "return_overflowing_tokens": False,
  37. "return_special_tokens_mask": False,
  38. "return_offsets_mapping": False,
  39. "return_length": False,
  40. "verbose": True,
  41. },
  42. "images_kwargs": {},
  43. }
  44. class UdopProcessor(ProcessorMixin):
  45. r"""
  46. Constructs a UDOP processor which combines a LayoutLMv3 image processor and a UDOP tokenizer into a single processor.
  47. [`UdopProcessor`] offers all the functionalities you need to prepare data for the model.
  48. It first uses [`LayoutLMv3ImageProcessor`] to resize, rescale and normalize document images, and optionally applies OCR
  49. to get words and normalized bounding boxes. These are then provided to [`UdopTokenizer`] or [`UdopTokenizerFast`],
  50. which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`.
  51. Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token
  52. classification tasks (such as FUNSD, CORD).
  53. Additionally, it also supports passing `text_target` and `text_pair_target` to the tokenizer, which can be used to
  54. prepare labels for language modeling tasks.
  55. Args:
  56. image_processor (`LayoutLMv3ImageProcessor`):
  57. An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
  58. tokenizer (`UdopTokenizer` or `UdopTokenizerFast`):
  59. An instance of [`UdopTokenizer`] or [`UdopTokenizerFast`]. The tokenizer is a required input.
  60. """
  61. attributes = ["image_processor", "tokenizer"]
  62. image_processor_class = "LayoutLMv3ImageProcessor"
  63. tokenizer_class = ("UdopTokenizer", "UdopTokenizerFast")
  64. def __init__(self, image_processor, tokenizer):
  65. super().__init__(image_processor, tokenizer)
  66. def __call__(
  67. self,
  68. images: Optional[ImageInput] = None,
  69. text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
  70. audio=None,
  71. videos=None,
  72. **kwargs: Unpack[UdopProcessorKwargs],
  73. ) -> BatchFeature:
  74. """
  75. This method first forwards the `images` argument to [`~UdopImageProcessor.__call__`]. In case
  76. [`UdopImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
  77. bounding boxes along with the additional arguments to [`~UdopTokenizer.__call__`] and returns the output,
  78. together with the prepared `pixel_values`. In case [`UdopImageProcessor`] was initialized with `apply_ocr` set
  79. to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the
  80. additional arguments to [`~UdopTokenizer.__call__`] and returns the output, together with the prepared
  81. `pixel_values`.
  82. Alternatively, one can pass `text_target` and `text_pair_target` to prepare the targets of UDOP.
  83. Please refer to the docstring of the above two methods for more information.
  84. """
  85. # verify input
  86. output_kwargs = self._merge_kwargs(
  87. UdopProcessorKwargs,
  88. tokenizer_init_kwargs=self.tokenizer.init_kwargs,
  89. **kwargs,
  90. )
  91. boxes = output_kwargs["text_kwargs"].pop("boxes", None)
  92. word_labels = output_kwargs["text_kwargs"].pop("word_labels", None)
  93. text_pair = output_kwargs["text_kwargs"].pop("text_pair", None)
  94. return_overflowing_tokens = output_kwargs["text_kwargs"].get("return_overflowing_tokens", False)
  95. return_offsets_mapping = output_kwargs["text_kwargs"].get("return_offsets_mapping", False)
  96. text_target = output_kwargs["text_kwargs"].get("text_target", None)
  97. if self.image_processor.apply_ocr and (boxes is not None):
  98. raise ValueError(
  99. "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
  100. )
  101. if self.image_processor.apply_ocr and (word_labels is not None):
  102. raise ValueError(
  103. "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
  104. )
  105. if return_overflowing_tokens and not return_offsets_mapping:
  106. raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
  107. if text_target is not None:
  108. # use the processor to prepare the targets of UDOP
  109. return self.tokenizer(
  110. **output_kwargs["text_kwargs"],
  111. )
  112. else:
  113. # use the processor to prepare the inputs of UDOP
  114. # first, apply the image processor
  115. features = self.image_processor(images=images, **output_kwargs["images_kwargs"])
  116. features_words = features.pop("words", None)
  117. features_boxes = features.pop("boxes", None)
  118. output_kwargs["text_kwargs"].pop("text_target", None)
  119. output_kwargs["text_kwargs"].pop("text_pair_target", None)
  120. output_kwargs["text_kwargs"]["text_pair"] = text_pair
  121. output_kwargs["text_kwargs"]["boxes"] = boxes if boxes is not None else features_boxes
  122. output_kwargs["text_kwargs"]["word_labels"] = word_labels
  123. # second, apply the tokenizer
  124. if text is not None and self.image_processor.apply_ocr and text_pair is None:
  125. if isinstance(text, str):
  126. text = [text] # add batch dimension (as the image processor always adds a batch dimension)
  127. output_kwargs["text_kwargs"]["text_pair"] = features_words
  128. encoded_inputs = self.tokenizer(
  129. text=text if text is not None else features_words,
  130. **output_kwargs["text_kwargs"],
  131. )
  132. # add pixel values
  133. if return_overflowing_tokens is True:
  134. features["pixel_values"] = self.get_overflowing_images(
  135. features["pixel_values"], encoded_inputs["overflow_to_sample_mapping"]
  136. )
  137. features.update(encoded_inputs)
  138. return features
  139. # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.get_overflowing_images
  140. def get_overflowing_images(self, images, overflow_to_sample_mapping):
  141. # in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image
  142. images_with_overflow = []
  143. for sample_idx in overflow_to_sample_mapping:
  144. images_with_overflow.append(images[sample_idx])
  145. if len(images_with_overflow) != len(overflow_to_sample_mapping):
  146. raise ValueError(
  147. "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
  148. f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
  149. )
  150. return images_with_overflow
  151. @property
  152. def model_input_names(self):
  153. tokenizer_input_names = self.tokenizer.model_input_names
  154. image_processor_input_names = self.image_processor.model_input_names
  155. return list(tokenizer_input_names + image_processor_input_names + ["bbox"])
  156. __all__ = ["UdopProcessor"]