fill_mask.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. from typing import Any, Union, overload
  2. import numpy as np
  3. from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
  4. from .base import GenericTensor, Pipeline, PipelineException, build_pipeline_init_args
  5. if is_tf_available():
  6. import tensorflow as tf
  7. from ..tf_utils import stable_softmax
  8. if is_torch_available():
  9. import torch
  10. logger = logging.get_logger(__name__)
  @add_end_docstrings(
      build_pipeline_init_args(has_tokenizer=True),
      r"""
          top_k (`int`, *optional*, defaults to 5):
              The number of predictions to return.
          targets (`str` or `list[str]`, *optional*):
              When passed, the model will limit the scores to the passed targets instead of looking up in the whole
              vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting
              token will be used (with a warning, and that might be slower).
          tokenizer_kwargs (`dict`, *optional*):
              Additional dictionary of keyword arguments passed along to the tokenizer.""",
  )
  class FillMaskPipeline(Pipeline):
      """
      Masked language modeling prediction pipeline using any `ModelWithLMHead`. See the [masked language modeling
      examples](../task_summary#masked-language-modeling) for more information.

      Example:

      ```python
      >>> from transformers import pipeline

      >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
      >>> fill_masker("This is a simple [MASK].")
      [{'score': 0.042, 'token': 3291, 'token_str': 'problem', 'sequence': 'this is a simple problem.'}, {'score': 0.031, 'token': 3160, 'token_str': 'question', 'sequence': 'this is a simple question.'}, {'score': 0.03, 'token': 8522, 'token_str': 'equation', 'sequence': 'this is a simple equation.'}, {'score': 0.027, 'token': 2028, 'token_str': 'one', 'sequence': 'this is a simple one.'}, {'score': 0.024, 'token': 3627, 'token_str': 'rule', 'sequence': 'this is a simple rule.'}]
      ```

      Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

      This mask filling pipeline can currently be loaded from [`pipeline`] using the following task identifier:
      `"fill-mask"`.

      The models that this pipeline can use are models that have been trained with a masked language modeling objective,
      which includes the bi-directional models in the library. See the up-to-date list of available models on
      [huggingface.co/models](https://huggingface.co/models?filter=fill-mask).

      <Tip>

      This pipeline only works for inputs with exactly one token masked. Experimental: We added support for multiple
      masks. The returned values are raw model output, and correspond to disjoint probabilities where one might expect
      joint probabilities (See [discussion](https://github.com/huggingface/transformers/pull/10222)).

      </Tip>

      <Tip>

      This pipeline now supports tokenizer_kwargs. For example try:

      ```python
      >>> from transformers import pipeline

      >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
      >>> tokenizer_kwargs = {"truncation": True}
      >>> fill_masker(
      ...     "This is a simple [MASK]. " + "...with a large amount of repeated text appended. " * 100,
      ...     tokenizer_kwargs=tokenizer_kwargs,
      ... )
      ```

      </Tip>
      """

      # NOTE: the docstring above must stay the FIRST statement of the class body; a string literal placed
      # after these assignments is a discarded expression and never becomes `FillMaskPipeline.__doc__`.
      #
      # Component-loading flags: only the tokenizer is requested.
      # NOTE(review): presumably consumed by the base `Pipeline` loading logic -- confirm in `.base`.
      _load_processor = False
      _load_image_processor = False
      _load_feature_extractor = False
      _load_tokenizer = True

      def get_masked_index(self, input_ids: GenericTensor) -> Union[np.ndarray, "torch.Tensor"]:
          """
          Locate every position in `input_ids` that holds the tokenizer's mask token id.

          Args:
              input_ids: Token ids to scan, as a tensor of the active framework.

          Returns:
              The match indices: the `.numpy()` result of `tf.where` when the framework is `"tf"`, or the tensor
              returned by `torch.nonzero(..., as_tuple=False)` when it is `"pt"`. An empty result means that no
              mask token is present.

          Raises:
              ValueError: If `self.framework` is neither `"tf"` nor `"pt"`.
          """
          is_mask = input_ids == self.tokenizer.mask_token_id
          if self.framework == "tf":
              return tf.where(is_mask).numpy()
          if self.framework == "pt":
              return torch.nonzero(is_mask, as_tuple=False)
          raise ValueError("Unsupported framework")

      def _ensure_exactly_one_mask_token(self, input_ids: GenericTensor) -> None:
          """
          Raise if `input_ids` contains no mask token.

          Despite its name, this helper only rejects inputs with *zero* mask tokens; inputs with several masks
          pass the check.

          Raises:
              PipelineException: If no mask token is found in `input_ids`.
          """
          masked_index = self.get_masked_index(input_ids)
          # The product of the shape is the total element count; it is 0 when nothing matched.
          numel = np.prod(masked_index.shape)
          if numel < 1:
              raise PipelineException(
                  "fill-mask",
                  self.model.base_model_prefix,
                  f"No mask_token ({self.tokenizer.mask_token}) found on the input",
              )

      def ensure_exactly_one_mask_token(self, model_inputs: GenericTensor):
          """
          Validate that every encoded sequence in `model_inputs` contains at least one mask token.

          Args:
              model_inputs: Either a list of encodings (the first row of each entry's `"input_ids"` is checked)
                  or a single encoding whose `"input_ids"` rows are checked one by one.

          Raises:
              PipelineException: If any checked sequence has no mask token.
          """
          if isinstance(model_inputs, list):
              for model_input in model_inputs:
                  self._ensure_exactly_one_mask_token(model_input["input_ids"][0])
          else:
              for input_ids in model_inputs["input_ids"]:
                  self._ensure_exactly_one_mask_token(input_ids)

      def preprocess(
          self, inputs, return_tensors=None, tokenizer_kwargs=None, **preprocess_parameters
      ) -> dict[str, GenericTensor]:
          """
          Tokenize `inputs` and check that a mask token is present.

          Args:
              inputs: Text passed straight to the tokenizer.
              return_tensors: Tensor format requested from the tokenizer; defaults to `self.framework`.
              tokenizer_kwargs: Extra keyword arguments forwarded to the tokenizer call.
              **preprocess_parameters: Accepted for interface compatibility; not used here.

          Returns:
              The tokenizer output.

          Raises:
              PipelineException: If the tokenized input contains no mask token.
          """
          if return_tensors is None:
              return_tensors = self.framework
          if tokenizer_kwargs is None:
              tokenizer_kwargs = {}

          model_inputs = self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
          self.ensure_exactly_one_mask_token(model_inputs)
          return model_inputs

      def _forward(self, model_inputs):
          """Run the model and attach the `input_ids` to its outputs so `postprocess` can find the masks."""
          model_outputs = self.model(**model_inputs)
          model_outputs["input_ids"] = model_inputs["input_ids"]
          return model_outputs

      def postprocess(self, model_outputs, top_k=5, target_ids=None):
          """
          Turn the model logits into ranked fill-in candidates for each masked position.

          Only the first sequence of `model_outputs` (index 0) is processed.

          Args:
              model_outputs: Mapping holding at least `"input_ids"` and `"logits"`.
              top_k: Maximum number of candidates per mask; lowered to the number of targets when fewer
                  targets than `top_k` are given.
              target_ids: Optional array of token ids that restricts the candidates.

          Returns:
              When the input has a single mask: a list of dicts with keys `"score"`, `"token"`, `"token_str"`
              and `"sequence"`. With several masks: a list of such lists, one per mask.
          """
          # Cap top_k if there are targets
          if target_ids is not None and target_ids.shape[0] < top_k:
              top_k = target_ids.shape[0]
          input_ids = model_outputs["input_ids"][0]
          outputs = model_outputs["logits"]

          if self.framework == "tf":
              masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()[:, 0]

              outputs = outputs.numpy()

              logits = outputs[0, masked_index, :]
              probs = stable_softmax(logits, axis=-1)
              if target_ids is not None:
                  probs = tf.gather_nd(tf.squeeze(probs, 0), target_ids.reshape(-1, 1))
                  probs = tf.expand_dims(probs, 0)

              topk = tf.math.top_k(probs, k=top_k)
              values, predictions = topk.values.numpy(), topk.indices.numpy()
          else:
              masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
              # One row of logits is selected per mask token found in the sample.
              logits = outputs[0, masked_index, :]
              probs = logits.softmax(dim=-1)
              if target_ids is not None:
                  probs = probs[..., target_ids]

              values, predictions = probs.topk(top_k)

          # Loop invariants: convert the ids and look up the pad id once instead of once per candidate.
          base_tokens = input_ids.numpy()
          pad_token_id = self.tokenizer.pad_token_id

          result = []
          single_mask = values.shape[0] == 1
          for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
              row = []
              for v, p in zip(_values, _predictions):
                  # Copy is important since we're going to modify this array in place
                  tokens = base_tokens.copy()
                  if target_ids is not None:
                      # `p` indexes into the restricted target list; map it back to a vocabulary id.
                      p = target_ids[p].tolist()

                  tokens[masked_index[i]] = p
                  # Filter padding out:
                  tokens = tokens[np.where(tokens != pad_token_id)]
                  # Originally we skip special tokens to give readable output.
                  # For multi masks though, the other [MASK] would be removed otherwise
                  # making the output look odd, so we add them back
                  sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                  proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
                  row.append(proposition)
              result.append(row)
          if single_mask:
              return result[0]
          return result

      def get_target_ids(self, targets):
          """
          Resolve user-supplied target strings to unique vocabulary ids.

          A target found verbatim in the tokenizer vocabulary maps to its id. Otherwise the target is tokenized
          (truncated to one token) and the first resulting id is used, with a warning; targets that tokenize to
          nothing are skipped with a warning.

          Args:
              targets: A single target string or a list of target strings.

          Returns:
              A `np.ndarray` holding the de-duplicated target ids.

          Raises:
              ValueError: If no target could be resolved to an id.
          """
          if isinstance(targets, str):
              targets = [targets]
          try:
              vocab = self.tokenizer.get_vocab()
          except Exception:
              # Best effort: without a vocabulary every target goes through the tokenization fallback below.
              vocab = {}
          target_ids = []
          for target in targets:
              id_ = vocab.get(target)
              if id_ is None:
                  input_ids = self.tokenizer(
                      target,
                      add_special_tokens=False,
                      return_attention_mask=False,
                      return_token_type_ids=False,
                      max_length=1,
                      truncation=True,
                  )["input_ids"]
                  if len(input_ids) == 0:
                      logger.warning(
                          f"The specified target token `{target}` does not exist in the model vocabulary. "
                          "We cannot replace it with anything meaningful, ignoring it"
                      )
                      continue
                  id_ = input_ids[0]
                  # XXX: If users encounter this pass
                  # it becomes pretty slow, so let's make sure
                  # The warning enables them to fix the input to
                  # get faster performance.
                  logger.warning(
                      f"The specified target token `{target}` does not exist in the model vocabulary. "
                      f"Replacing with `{self.tokenizer.convert_ids_to_tokens(id_)}`."
                  )
              target_ids.append(id_)
          # Several targets may resolve to the same id; keep each id once.
          target_ids = list(set(target_ids))
          if not target_ids:
              raise ValueError("At least one target must be provided when passed.")
          return np.array(target_ids)

      def _sanitize_parameters(self, top_k=None, targets=None, tokenizer_kwargs=None):
          """
          Split call-time keyword arguments into per-stage parameter dicts.

          Args:
              top_k: Forwarded to `postprocess` when not `None`.
              targets: Resolved through `get_target_ids` and forwarded to `postprocess` as `target_ids`.
              tokenizer_kwargs: Forwarded to `preprocess` when not `None`.

          Returns:
              A tuple `(preprocess_params, {}, postprocess_params)`; the middle dict is always empty.

          Raises:
              PipelineException: If the tokenizer has no mask token id.
          """
          preprocess_params = {}

          if tokenizer_kwargs is not None:
              preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs

          postprocess_params = {}

          if targets is not None:
              target_ids = self.get_target_ids(targets)
              postprocess_params["target_ids"] = target_ids

          if top_k is not None:
              postprocess_params["top_k"] = top_k

          if self.tokenizer.mask_token_id is None:
              raise PipelineException(
                  "fill-mask", self.model.base_model_prefix, "The tokenizer does not define a `mask_token`."
              )
          return preprocess_params, {}, postprocess_params

      @overload
      def __call__(self, inputs: str, **kwargs: Any) -> list[dict[str, Any]]: ...

      @overload
      def __call__(self, inputs: list[str], **kwargs: Any) -> list[list[dict[str, Any]]]: ...

      def __call__(
          self, inputs: Union[str, list[str]], **kwargs: Any
      ) -> Union[list[dict[str, Any]], list[list[dict[str, Any]]]]:
          """
          Fill the masked token in the text(s) given as inputs.

          Args:
              inputs (`str` or `list[str]`):
                  One or several texts (or one list of prompts) with masked tokens.
              targets (`str` or `list[str]`, *optional*):
                  When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                  vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
                  resulting token will be used (with a warning, and that might be slower).
              top_k (`int`, *optional*):
                  When passed, overrides the number of predictions to return.
              tokenizer_kwargs (`dict`, *optional*):
                  Additional dictionary of keyword arguments passed along to the tokenizer.

          Return:
              A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

              - **sequence** (`str`) -- The corresponding input with the mask token prediction.
              - **score** (`float`) -- The corresponding probability.
              - **token** (`int`) -- The predicted token id (to replace the masked one).
              - **token_str** (`str`) -- The predicted token (to replace the masked one).
          """
          outputs = super().__call__(inputs, **kwargs)
          # A one-element list input is unwrapped so that it returns the same shape as a bare string input.
          if isinstance(inputs, list) and len(inputs) == 1:
              return outputs[0]
          return outputs
  229. return outputs