visual_question_answering.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. from typing import Optional, Union
  2. from ..generation import GenerationConfig
  3. from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging
  4. from .base import Pipeline, build_pipeline_init_args
  5. if is_vision_available():
  6. from PIL import Image
  7. from ..image_utils import load_image
  8. if is_torch_available():
  9. from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
  10. from .pt_utils import KeyDataset
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
  12. @add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
  13. class VisualQuestionAnsweringPipeline(Pipeline):
  14. """
  15. Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only
  16. available in PyTorch.
  17. Unless the model you're using explicitly sets these generation parameters in its configuration files
  18. (`generation_config.json`), the following default values will be used:
  19. - max_new_tokens: 256
  20. Example:
  21. ```python
  22. >>> from transformers import pipeline
  23. >>> oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa")
  24. >>> image_url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png"
  25. >>> oracle(question="What is she wearing ?", image=image_url)
  26. [{'score': 0.948, 'answer': 'hat'}, {'score': 0.009, 'answer': 'fedora'}, {'score': 0.003, 'answer': 'clothes'}, {'score': 0.003, 'answer': 'sun hat'}, {'score': 0.002, 'answer': 'nothing'}]
  27. >>> oracle(question="What is she wearing ?", image=image_url, top_k=1)
  28. [{'score': 0.948, 'answer': 'hat'}]
  29. >>> oracle(question="Is this a person ?", image=image_url, top_k=1)
  30. [{'score': 0.993, 'answer': 'yes'}]
  31. >>> oracle(question="Is this a man ?", image=image_url, top_k=1)
  32. [{'score': 0.996, 'answer': 'no'}]
  33. ```
  34. Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
  35. This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task
  36. identifiers: `"visual-question-answering", "vqa"`.
  37. The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See
  38. the up-to-date list of available models on
  39. [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering).
  40. """
  41. _load_processor = False
  42. _load_image_processor = True
  43. _load_feature_extractor = False
  44. _load_tokenizer = True
  45. _pipeline_calls_generate = True
  46. # Make sure the docstring is updated when the default generation config is changed
  47. _default_generation_config = GenerationConfig(
  48. max_new_tokens=256,
  49. )
  50. def __init__(self, *args, **kwargs):
  51. super().__init__(*args, **kwargs)
  52. self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES)
  53. def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs):
  54. preprocess_params, postprocess_params = {}, {}
  55. if padding is not None:
  56. preprocess_params["padding"] = padding
  57. if truncation is not None:
  58. preprocess_params["truncation"] = truncation
  59. if timeout is not None:
  60. preprocess_params["timeout"] = timeout
  61. if top_k is not None:
  62. postprocess_params["top_k"] = top_k
  63. forward_params = {}
  64. if getattr(self, "assistant_model", None) is not None:
  65. forward_params["assistant_model"] = self.assistant_model
  66. if getattr(self, "assistant_tokenizer", None) is not None:
  67. forward_params["tokenizer"] = self.tokenizer
  68. forward_params["assistant_tokenizer"] = self.assistant_tokenizer
  69. return preprocess_params, forward_params, postprocess_params
  70. def __call__(
  71. self,
  72. image: Union["Image.Image", str, list["Image.Image"], list[str], "KeyDataset"],
  73. question: Optional[Union[str, list[str]]] = None,
  74. **kwargs,
  75. ):
  76. r"""
  77. Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed
  78. below:
  79. - `pipeline(image=image, question=question)`
  80. - `pipeline({"image": image, "question": question})`
  81. - `pipeline([{"image": image, "question": question}])`
  82. - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])`
  83. Args:
  84. image (`str`, `list[str]`, `PIL.Image`, `list[PIL.Image]` or `KeyDataset`):
  85. The pipeline handles three types of images:
  86. - A string containing a http link pointing to an image
  87. - A string containing a local path to an image
  88. - An image loaded in PIL directly
  89. The pipeline accepts either a single image or a batch of images. If given a single image, it can be
  90. broadcasted to multiple questions.
  91. For dataset: the passed in dataset must be of type `transformers.pipelines.pt_utils.KeyDataset`
  92. Example:
  93. ```python
  94. >>> from transformers.pipelines.pt_utils import KeyDataset
  95. >>> from datasets import load_dataset
  96. >>> dataset = load_dataset("detection-datasets/coco")
  97. >>> oracle(image=KeyDataset(dataset, "image"), question="What's in this image?")
  98. ```
  99. question (`str`, `list[str]`):
  100. The question(s) asked. If given a single question, it can be broadcasted to multiple images.
  101. If multiple images and questions are given, each and every question will be broadcasted to all images
  102. (same effect as a Cartesian product)
  103. top_k (`int`, *optional*, defaults to 5):
  104. The number of top labels that will be returned by the pipeline. If the provided number is higher than
  105. the number of labels available in the model configuration, it will default to the number of labels.
  106. timeout (`float`, *optional*, defaults to None):
  107. The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
  108. the call may block forever.
  109. Return:
  110. A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys:
  111. - **label** (`str`) -- The label identified by the model.
  112. - **score** (`int`) -- The score attributed by the model for that label.
  113. """
  114. is_dataset = isinstance(image, KeyDataset)
  115. is_image_batch = isinstance(image, list) and all(isinstance(item, (Image.Image, str)) for item in image)
  116. is_question_batch = isinstance(question, list) and all(isinstance(item, str) for item in question)
  117. if isinstance(image, (Image.Image, str)) and isinstance(question, str):
  118. inputs = {"image": image, "question": question}
  119. elif (is_image_batch or is_dataset) and isinstance(question, str):
  120. inputs = [{"image": im, "question": question} for im in image]
  121. elif isinstance(image, (Image.Image, str)) and is_question_batch:
  122. inputs = [{"image": image, "question": q} for q in question]
  123. elif (is_image_batch or is_dataset) and is_question_batch:
  124. question_image_pairs = []
  125. for q in question:
  126. for im in image:
  127. question_image_pairs.append({"image": im, "question": q})
  128. inputs = question_image_pairs
  129. else:
  130. """
  131. Supports the following format
  132. - {"image": image, "question": question}
  133. - [{"image": image, "question": question}]
  134. - Generator and datasets
  135. """
  136. inputs = image
  137. results = super().__call__(inputs, **kwargs)
  138. return results
  139. def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
  140. image = load_image(inputs["image"], timeout=timeout)
  141. model_inputs = self.tokenizer(
  142. inputs["question"],
  143. return_tensors=self.framework,
  144. padding=padding,
  145. truncation=truncation,
  146. )
  147. image_features = self.image_processor(images=image, return_tensors=self.framework)
  148. if self.framework == "pt":
  149. image_features = image_features.to(self.dtype)
  150. model_inputs.update(image_features)
  151. return model_inputs
  152. def _forward(self, model_inputs, **generate_kwargs):
  153. if self.model.can_generate():
  154. # User-defined `generation_config` passed to the pipeline call take precedence
  155. if "generation_config" not in generate_kwargs:
  156. generate_kwargs["generation_config"] = self.generation_config
  157. model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
  158. else:
  159. model_outputs = self.model(**model_inputs)
  160. return model_outputs
  161. def postprocess(self, model_outputs, top_k=5):
  162. if self.model.can_generate():
  163. return [
  164. {"answer": self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()}
  165. for output_ids in model_outputs
  166. ]
  167. else:
  168. if top_k > self.model.config.num_labels:
  169. top_k = self.model.config.num_labels
  170. if self.framework == "pt":
  171. probs = model_outputs.logits.sigmoid()[0]
  172. scores, ids = probs.topk(top_k)
  173. else:
  174. raise ValueError(f"Unsupported framework: {self.framework}")
  175. scores = scores.tolist()
  176. ids = ids.tolist()
  177. return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]