| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
- from collections.abc import Mapping
- from typing import Any, Dict, List, Tuple, Union
- import json
- import numpy as np
- from transformers import AutoTokenizer
- from modelscope.metainfo import Models
- from modelscope.outputs import OutputKeys
- from modelscope.preprocessors.base import Preprocessor
- from modelscope.utils.constant import ModeKeys
- from modelscope.utils.hub import get_model_type, parse_label_mapping
- from modelscope.utils.logger import get_logger
# Module-wide logger shared by the helpers below.
logger = get_logger()

# Names exported by ``from <module> import *``.
__all__ = [
    'parse_text_and_label',
    'labels_to_id',
]
def parse_text_and_label(data,
                         mode,
                         first_sequence=None,
                         second_sequence=None,
                         label=None):
    """Split an input sample into first sentence, second sentence and labels.

    Supported input shapes:
        * ``str``: used as the first sentence.
        * ``tuple`` / ``list`` of size 3: unpacked as
          (first sentence, second sentence, labels).
        * ``tuple`` / ``list`` of size 2: unpacked as
          (first sentence, second sentence) when ``mode`` equals
          ``ModeKeys.INFERENCE``, otherwise as (first sentence, labels).
        * ``Mapping``: values are looked up with the given keys.

    Any other input (including sequences of another size) yields
    ``(None, None, None)``.

    Args:
        data: The input data.
        mode: The mode of the preprocessor; only consulted for sequences
            of size 2.
        first_sequence: The key of the first sequence in a mapping input.
        second_sequence: The key of the second sequence in a mapping input.
        label: The key of the label in a mapping input. May be a single
            key (``str`` or ``None``) or an iterable of keys, in which case
            a list with one looked-up value per key is produced.

    Returns:
        A ``(text_a, text_b, labels)`` tuple; missing parts are ``None``.
    """
    if isinstance(data, str):
        return data, None, None

    if isinstance(data, (tuple, list)):
        size = len(data)
        if size == 3:
            first, second, target = data
            return first, second, target
        if size == 2:
            # A pair is ambiguous: its second item is a sentence at
            # inference time and a label otherwise.
            if mode == ModeKeys.INFERENCE:
                first, second = data
                return first, second, None
            first, target = data
            return first, None, target
        return None, None, None

    if isinstance(data, Mapping):
        first = data.get(first_sequence)
        second = data.get(second_sequence)
        if label is not None and not isinstance(label, str):
            # Several label keys: collect one value per key.
            target = [data.get(key) for key in label]
        else:
            target = data.get(label)
        return first, second, target

    return None, None, None
def labels_to_id(labels, output, label2id=None):
    """Store the labels, translated to ids where possible, in ``output``.

    The result is written to ``output[OutputKeys.LABELS]``; the function
    itself returns nothing.

    A label of type ``str`` or ``int`` is translated through ``label2id``:
    the label is looked up as-is first and, if absent, by its ``str()``
    form. A tuple or list is translated element by element, but only when
    every element is a ``str`` or ``int``. In all other cases (for example
    float labels, or no ``label2id`` mapping) the labels are stored
    unchanged. When ``labels`` is ``None``, ``output`` is left untouched.

    Args:
        labels: The input labels.
        output: The dict-like object that receives the final labels.
        label2id: An extra label2id mapping. If not provided, the labels
            are not translated to ids.

    Raises:
        KeyError: If a label is found in ``label2id`` neither as-is nor
            in its string form. The error is logged before re-raising.
    """

    def _is_mappable(candidate):
        return isinstance(candidate, (str, int))

    def _translate(candidate):
        # Try the raw label first, then its string form.
        if candidate in label2id:
            return label2id[candidate]
        return label2id[str(candidate)]

    has_mapping = label2id is not None
    try:
        is_sequence = isinstance(labels, (tuple, list))
        if is_sequence and has_mapping and all(
                _is_mappable(item) for item in labels):
            output[OutputKeys.LABELS] = [_translate(item) for item in labels]
        elif has_mapping and _is_mappable(labels):
            output[OutputKeys.LABELS] = _translate(labels)
        elif labels is not None:
            # Nothing to translate: keep the labels as given.
            output[OutputKeys.LABELS] = labels
    except KeyError as err:
        logger.error(
            f'Label {labels} cannot be found in the label mapping {label2id},'
            f'which comes from the user input or the configuration files. '
            f'Please consider matching your labels with this mapping.')
        raise err
|