# Copyright 2021-2022 The Alibaba DAMO Team Authors. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for PoNet """ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from transformers.file_utils import PaddingStrategy from transformers.models.bert.tokenization_bert import BertTokenizer from transformers.tokenization_utils import BatchEncoding, EncodedInput from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger logger = get_logger() VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE} PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'nlp_ponet_fill-mask_chinese-base': 512, 'nlp_ponet_fill-mask_english-base': 512, } PRETRAINED_INIT_CONFIGURATION = { 'nlp_ponet_fill-mask_chinese-base': { 'do_lower_case': True }, 'nlp_ponet_fill-mask_english-base': { 'do_lower_case': True }, } class PoNetTokenizer(BertTokenizer): r""" Construct an PoNet tokenizer. Based on BertTokenizer. This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION def _pad( self, encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ Pad encoded inputs (on left/right and up to predefined length or max length in the batch) Args: encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). max_length: maximum length of the returned list and optionally padding length (see below). Will truncate by taking into account the special tokens. padding_strategy: PaddingStrategy to use for padding. - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.DO_NOT_PAD: Do not pad The tokenizer padding sides are defined in self.padding_side: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ # Load from model defaults if return_attention_mask is None: return_attention_mask = 'attention_mask' in self.model_input_names required_input = encoded_inputs[self.model_input_names[0]] if padding_strategy == PaddingStrategy.LONGEST: max_length = len(required_input) if max_length is not None and pad_to_multiple_of is not None and ( max_length % pad_to_multiple_of != 0): max_length = ( (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( required_input) != max_length if needs_to_be_padded: difference = max_length - len(required_input) if self.padding_side == 'right': if return_attention_mask: encoded_inputs['attention_mask'] = [1] * len( required_input) + [0] * difference if 'token_type_ids' in encoded_inputs: encoded_inputs['token_type_ids'] = ( encoded_inputs['token_type_ids'] + [self.pad_token_type_id] * difference) if 'special_tokens_mask' in encoded_inputs: encoded_inputs['special_tokens_mask'] = encoded_inputs[ 'special_tokens_mask'] + [1] * difference if 'segment_ids' in encoded_inputs: encoded_inputs[ 'segment_ids'] = encoded_inputs['segment_ids'] + [ encoded_inputs['segment_ids'][-1] + 1 ] * difference # noqa * encoded_inputs[self.model_input_names[ 0]] = required_input + [self.pad_token_id] * difference elif self.padding_side == 'left': if return_attention_mask: encoded_inputs['attention_mask'] = [0] * difference + [ 1 ] * len(required_input) if 'token_type_ids' in encoded_inputs: encoded_inputs['token_type_ids'] = [ self.pad_token_type_id ] * difference + encoded_inputs['token_type_ids'] if 'segment_ids' in encoded_inputs: encoded_inputs['segment_ids'] = [encoded_inputs['segment_ids'][-1] + 1] * difference + \ encoded_inputs['segment_ids'] # noqa * if 'special_tokens_mask' in encoded_inputs: encoded_inputs['special_tokens_mask'] = [ 1 ] * difference + encoded_inputs['special_tokens_mask'] encoded_inputs[self.model_input_names[ 0]] = [self.pad_token_id] * difference + required_input else: raise ValueError('Invalid padding strategy:' + str(self.padding_side)) elif return_attention_mask and 'attention_mask' not in encoded_inputs: encoded_inputs['attention_mask'] = [1] * len(required_input) return encoded_inputs