text_classification.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. import inspect
  2. import warnings
  3. from typing import Any, Union
  4. import numpy as np
  5. from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
  6. from .base import GenericTensor, Pipeline, build_pipeline_init_args
  7. if is_tf_available():
  8. from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
  9. if is_torch_available():
  10. from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
  def sigmoid(_outputs):
      """
      Apply the logistic sigmoid `1 / (1 + exp(-x))` element-wise without overflowing.

      The textbook formula evaluates `exp(-x)`, which overflows (and makes NumPy emit a `RuntimeWarning`) for
      large negative inputs. This implementation only ever exponentiates non-positive values:

      - for `x >= 0` it computes `1 / (1 + exp(-x))` (the numerator below is `exp(0) == 1`),
      - for `x < 0` it computes the algebraically identical `exp(x) / (1 + exp(x))`.

      Args:
          _outputs (`np.ndarray` or scalar):
              Raw scores (logits).

      Returns:
          Scores in `[0, 1]` with the same shape as `_outputs`.
      """
      # exp(min(x, 0)) and exp(-|x|) both lie in [0, 1], so neither call can overflow.
      numerator = np.exp(np.minimum(_outputs, 0))
      denominator = 1.0 + np.exp(-np.abs(_outputs))
      return numerator / denominator
  def softmax(_outputs):
      """
      Compute a softmax over the last axis of `_outputs`.

      The maximum of each row is subtracted before exponentiating; this leaves the result mathematically unchanged
      while keeping `np.exp` from overflowing on large scores.

      Args:
          _outputs (`np.ndarray`):
              Raw scores (logits); the last axis is the one being normalized.

      Returns:
          `np.ndarray` of the same shape whose entries along the last axis sum to 1.
      """
      exponentials = np.exp(_outputs - np.max(_outputs, axis=-1, keepdims=True))
      normalizer = exponentials.sum(axis=-1, keepdims=True)
      return exponentials / normalizer
  class ClassificationFunction(ExplicitEnum):
      """Names of the functions that can be applied to the model logits to turn them into scores."""

      # Element-wise logistic function.
      SIGMOID = "sigmoid"
      # Normalization over the label axis.
      SOFTMAX = "softmax"
      # Leave the logits untouched.
      NONE = "none"
  @add_end_docstrings(
      build_pipeline_init_args(has_tokenizer=True),
      r"""
          return_all_scores (`bool`, *optional*, defaults to `False`):
              Whether to return all prediction scores or just the one of the predicted class.
          function_to_apply (`str`, *optional*, defaults to `"default"`):
              The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:

              - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
                has several labels, will apply the softmax function on the output. In case of regression tasks, will not
                apply any function on the output.
              - `"sigmoid"`: Applies the sigmoid function on the output.
              - `"softmax"`: Applies the softmax function on the output.
              - `"none"`: Does not apply any function on the output.""",
  )
  class TextClassificationPipeline(Pipeline):
      """
      Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
      examples](../task_summary#sequence-classification) for more information.

      Example:

      ```python
      >>> from transformers import pipeline

      >>> classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
      >>> classifier("This movie is disgustingly good !")
      [{'label': 'POSITIVE', 'score': 1.0}]

      >>> classifier("Director tried too much.")
      [{'label': 'NEGATIVE', 'score': 0.996}]
      ```

      Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

      This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
      `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments).

      If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax
      over the results. If there is a single label, or if `model.config.problem_type == "multi_label_classification"`,
      the pipeline will run a sigmoid over the result. In case of regression tasks
      (`model.config.problem_type == "regression"`), will not apply any function on the output.

      The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
      the up-to-date list of available models on
      [huggingface.co/models](https://huggingface.co/models?filter=text-classification).
      """

      # Component-loading flags: only the tokenizer is requested.
      # NOTE(review): presumably consumed by the base `Pipeline` machinery, which is not visible here.
      _load_processor = False
      _load_image_processor = False
      _load_feature_extractor = False
      _load_tokenizer = True

      # Class-level defaults; no method of this class reads them, the effective values are resolved in
      # `_sanitize_parameters` and `postprocess`.
      return_all_scores = False
      function_to_apply = ClassificationFunction.NONE

      def __init__(self, **kwargs):
          """Build the pipeline and check the model against the sequence classification mapping of the framework."""
          super().__init__(**kwargs)

          self.check_model_type(
              TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
              if self.framework == "tf"
              else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
          )

      def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs):
          """
          Split user keyword arguments into preprocess / forward / postprocess parameter dictionaries.

          Args:
              return_all_scores (`bool`, *optional*):
                  Deprecated. When `top_k` is not given, `True` maps to `top_k=None` and `False` to `top_k=1`. Falls
                  back to `model.config.return_all_scores` when the config defines it.
              function_to_apply (`str` or `ClassificationFunction`, *optional*):
                  Case-insensitive name (`"default"`, `"sigmoid"`, `"softmax"`, `"none"`) or enum member.
                  `"default"` explicitly requests the config-driven choice made in `postprocess`.
              top_k (`int` or `None`, *optional*):
                  Number of labels to return; `None` means "all labels". The `""` default is a sentinel for "not
                  provided", because `None` is a meaningful user value.
              **tokenizer_kwargs:
                  Forwarded untouched as the preprocess parameters.

          Returns:
              `tuple(dict, dict, dict)`: preprocess parameters, forward parameters (always empty) and postprocess
              parameters.

          Raises:
              KeyError: If `function_to_apply` is a string that is neither `"default"` nor the name of a
                  `ClassificationFunction` member.
          """
          # Using "" as default argument because we're going to use `top_k=None` in user code to declare
          # "No top_k"
          preprocess_params = tokenizer_kwargs

          postprocess_params = {}
          if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
              return_all_scores = self.model.config.return_all_scores

          if isinstance(top_k, int) or top_k is None:
              # An explicit `top_k` switches `postprocess` to the non-legacy (sorted list) output.
              postprocess_params["top_k"] = top_k
              postprocess_params["_legacy"] = False
          elif return_all_scores is not None:
              warnings.warn(
                  "`return_all_scores` is now deprecated, if want a similar functionality use `top_k=None` instead of"
                  " `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.",
                  UserWarning,
              )
              postprocess_params["top_k"] = None if return_all_scores else 1

          if isinstance(function_to_apply, str):
              if function_to_apply.upper() == "DEFAULT":
                  # `"default"` is a documented value but not an enum member, so the enum lookup below would raise
                  # `KeyError`. `postprocess` picks the config-driven function when it receives `None`; the key is
                  # set explicitly so that this request is recorded in the returned parameters.
                  postprocess_params["function_to_apply"] = None
                  function_to_apply = None
              else:
                  function_to_apply = ClassificationFunction[function_to_apply.upper()]

          if function_to_apply is not None:
              postprocess_params["function_to_apply"] = function_to_apply
          return preprocess_params, {}, postprocess_params

      def __call__(
          self,
          inputs: Union[str, list[str], dict[str, str], list[dict[str, str]]],
          **kwargs: Any,
      ) -> list[dict[str, Any]]:
          """
          Classify the text(s) given as inputs.

          Args:
              inputs (`str` or `list[str]` or `dict[str]`, or `list[dict[str]]`):
                  One or several texts to classify. In order to use text pairs for your classification, you can send a
                  dictionary containing `{"text", "text_pair"}` keys, or a list of those.
              top_k (`int`, *optional*, defaults to `1`):
                  How many results to return. Pass `None` to return the scores of all labels.
              function_to_apply (`str`, *optional*, defaults to `"default"`):
                  The function to apply to the model outputs in order to retrieve the scores.

                  If this argument is not specified (or is `"default"`), the function is chosen from the model
                  configuration:

                  - If problem type is regression, will not apply any function on the output.
                  - If the problem type is multi-label classification or the model has a single label, will apply the
                    sigmoid function on the output.
                  - If the model has several labels, will apply the softmax function on the output.

                  Other possible values are:

                  - `"sigmoid"`: Applies the sigmoid function on the output.
                  - `"softmax"`: Applies the softmax function on the output.
                  - `"none"`: Does not apply any function on the output.

          Return:
              A list of `dict`: Each result comes as list of dictionaries with the following keys:

              - **label** (`str`) -- The label predicted.
              - **score** (`float`) -- The corresponding probability.

              If `top_k` is used, one such dictionary is returned per label, sorted by decreasing score.
          """
          inputs = (inputs,)
          result = super().__call__(*inputs, **kwargs)
          # TODO try and retrieve it in a nicer way from _sanitize_parameters.
          # Only `top_k` passed to this very call is detected here.
          _legacy = "top_k" not in kwargs
          if isinstance(inputs[0], str) and _legacy:
              # This pipeline is odd, and return a list when single item is run
              return [result]
          else:
              return result

      def preprocess(self, inputs, **tokenizer_kwargs) -> dict[str, GenericTensor]:
          """
          Tokenize one pipeline input.

          Args:
              inputs (`str`, `dict` or `list`):
                  A text, a dictionary of tokenizer keyword arguments (e.g. `{"text": ..., "text_pair": ...}`), or the
                  legacy `[[text, text_pair]]` form.
              **tokenizer_kwargs:
                  Extra keyword arguments forwarded to the tokenizer.

          Returns:
              The tokenizer output, with tensors requested for the pipeline framework.

          Raises:
              ValueError: If `inputs` is a list that is not the legacy `[[text, text_pair]]` form.
          """
          return_tensors = self.framework
          if isinstance(inputs, dict):
              return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs)
          elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2:
              # It used to be valid to use a list of list of list for text pairs, keeping this path for BC
              return self.tokenizer(
                  text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs
              )
          elif isinstance(inputs, list):
              # This is likely an invalid usage of the pipeline attempting to pass text pairs.
              raise ValueError(
                  "The pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a"
                  ' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.'
              )
          return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)

      def _forward(self, model_inputs):
          """Run the model on the tokenized inputs, forcing `use_cache=False` when the model accepts that argument."""
          # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
          model_forward = self.model.forward if self.framework == "pt" else self.model.call
          if "use_cache" in inspect.signature(model_forward).parameters:
              model_inputs["use_cache"] = False
          return self.model(**model_inputs)

      def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
          """
          Convert the model logits into labelled scores.

          Args:
              model_outputs:
                  Model output exposing a `"logits"` entry; only its first item is used.
              function_to_apply (`ClassificationFunction`, *optional*):
                  Function applied to the logits. When `None`, it is chosen from `model.config` (see below).
              top_k (`int` or `None`, *optional*, defaults to `1`):
                  Number of entries kept in non-legacy mode; `None` keeps all of them.
              _legacy (`bool`, *optional*, defaults to `True`):
                  `True` when the user did not pass `top_k`: the backward-compatible output shape is produced.

          Returns:
              A single `{"label", "score"}` dictionary when `top_k == 1` in legacy mode; otherwise a list of such
              dictionaries (in label-id order in legacy mode, sorted by decreasing score and truncated to `top_k`
              otherwise).

          Raises:
              ValueError: If `function_to_apply` is not a recognized function.
          """
          # `_legacy` is used to determine if we're running the naked pipeline and in backward
          # compatibility mode, or if running the pipeline with `pipeline(..., top_k=1)` we're running
          # the more natural result containing the list.
          config = self.model.config

          # Default value before `set_parameters`
          if function_to_apply is None:
              if config.problem_type == "regression":
                  function_to_apply = ClassificationFunction.NONE
              elif config.problem_type == "multi_label_classification" or config.num_labels == 1:
                  function_to_apply = ClassificationFunction.SIGMOID
              elif config.problem_type == "single_label_classification" or config.num_labels > 1:
                  function_to_apply = ClassificationFunction.SOFTMAX
              elif hasattr(config, "function_to_apply"):
                  # Only consulted when neither the problem type nor the label count decided above.
                  function_to_apply = config.function_to_apply
              else:
                  function_to_apply = ClassificationFunction.NONE

          # NOTE(review): assumes the leading dimension of the logits is a batch of size 1 — confirm against the
          # base pipeline.
          outputs = model_outputs["logits"][0]

          if self.framework == "pt":
              # To enable using fp16 and bf16
              outputs = outputs.float().numpy()
          else:
              outputs = outputs.numpy()

          if function_to_apply == ClassificationFunction.SIGMOID:
              scores = sigmoid(outputs)
          elif function_to_apply == ClassificationFunction.SOFTMAX:
              scores = softmax(outputs)
          elif function_to_apply == ClassificationFunction.NONE:
              scores = outputs
          else:
              raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")

          if top_k == 1 and _legacy:
              # Legacy single-result form: a bare dictionary for the best label.
              return {"label": config.id2label[scores.argmax().item()], "score": scores.max().item()}

          dict_scores = [{"label": config.id2label[i], "score": score.item()} for i, score in enumerate(scores)]
          if not _legacy:
              dict_scores.sort(key=lambda x: x["score"], reverse=True)
              if top_k is not None:
                  dict_scores = dict_scores[:top_k]
          return dict_scores
  195. if top_k is not None:
  196. dict_scores = dict_scores[:top_k]
  197. return dict_scores