tokenization.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. # Copyright 2021-2022 The Alibaba DAMO Team Authors.
  2. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3. # All rights reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """Tokenization classes for PoNet """
  17. from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
  18. from transformers.file_utils import PaddingStrategy
  19. from transformers.models.bert.tokenization_bert import BertTokenizer
  20. from transformers.tokenization_utils import BatchEncoding, EncodedInput
  21. from modelscope.utils.constant import ModelFile
  22. from modelscope.utils.logger import get_logger
  # Module-level logger obtained from the modelscope logging helper.
  logger = get_logger()

  # Maps the tokenizer's vocabulary argument name to the vocabulary file name.
  VOCAB_FILES_NAMES = dict(vocab_file=ModelFile.VOCAB_FILE)

  # No remote vocabulary locations are registered for PoNet checkpoints.
  PRETRAINED_VOCAB_FILES_MAP = dict(vocab_file={})

  # Per-checkpoint size table; PoNetTokenizer exposes it as
  # ``max_model_input_sizes``.
  PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(
      (
          'nlp_ponet_fill-mask_chinese-base',
          'nlp_ponet_fill-mask_english-base',
      ),
      512,
  )

  # Per-checkpoint tokenizer init arguments. A fresh dict is built for every
  # checkpoint so that mutating one entry never affects another.
  PRETRAINED_INIT_CONFIGURATION = {
      model_name: {'do_lower_case': True}
      for model_name in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
  }
  class PoNetTokenizer(BertTokenizer):
      r"""
      Construct a PoNet tokenizer. Based on BertTokenizer.

      This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the main methods.
      Users should refer to this superclass for more information regarding those methods.

      The only behaviour overridden here is :meth:`_pad`, which additionally pads the ``segment_ids`` entry
      when it is present in the encoded inputs.

      Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
      parameters.
      """

      vocab_files_names = VOCAB_FILES_NAMES
      pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
      max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
      pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION

      def _pad(
          self,
          encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
          max_length: Optional[int] = None,
          padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
          pad_to_multiple_of: Optional[int] = None,
          return_attention_mask: Optional[bool] = None,
          padding_side: Optional[str] = None,
      ) -> dict:
          """
          Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

          Args:
              encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or
                  batch of tokenized inputs (`List[List[int]]`). It is updated in place and returned.
              max_length: target length used for padding (see below). Ignored and replaced by the
                  length of the input when `padding_strategy` is `PaddingStrategy.LONGEST`.
              padding_strategy: PaddingStrategy to use for padding.

                  - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                  - PaddingStrategy.MAX_LENGTH: Pad to the max length
                  - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
              pad_to_multiple_of: (optional) Integer if set will round `max_length` up to a multiple of
                  the provided value.
              return_attention_mask: (optional) Set to False to avoid returning
                  attention mask (default: returned when 'attention_mask' is in `self.model_input_names`)
              padding_side: (optional) 'left' or 'right'. Overrides `self.padding_side` for this call.
                  Accepted so that callers which forward a `padding_side` keyword (as recent
                  `transformers` releases appear to do - confirm against the installed version)
                  do not fail with a `TypeError`.

          Returns:
              The same `encoded_inputs` mapping with the model input, and any of 'attention_mask',
              'token_type_ids', 'special_tokens_mask' and 'segment_ids' it contains, padded.
              Padded positions of 'segment_ids' receive the last segment id plus one.

          Raises:
              ValueError: if the padding side is neither 'left' nor 'right'.
          """
          # Load from model defaults
          if return_attention_mask is None:
              return_attention_mask = 'attention_mask' in self.model_input_names
          if padding_side is None:
              padding_side = self.padding_side

          input_key = self.model_input_names[0]
          required_input = encoded_inputs[input_key]

          if padding_strategy == PaddingStrategy.LONGEST:
              max_length = len(required_input)

          # Round the target length up to the next multiple when requested.
          if (max_length is not None and pad_to_multiple_of is not None
                  and max_length % pad_to_multiple_of != 0):
              max_length = (
                  (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

          needs_to_be_padded = (
              padding_strategy != PaddingStrategy.DO_NOT_PAD
              and len(required_input) != max_length)

          # Create the mask only when the caller did not provide one, so that an
          # existing mask is extended below instead of being overwritten by ones.
          if return_attention_mask and 'attention_mask' not in encoded_inputs:
              encoded_inputs['attention_mask'] = [1] * len(required_input)

          if not needs_to_be_padded:
              return encoded_inputs

          # A negative difference (input longer than max_length) yields empty
          # padding lists below, leaving the sequences unchanged.
          difference = max_length - len(required_input)

          if padding_side == 'right':
              if return_attention_mask:
                  encoded_inputs['attention_mask'] = (
                      encoded_inputs['attention_mask'] + [0] * difference)
              if 'token_type_ids' in encoded_inputs:
                  encoded_inputs['token_type_ids'] = (
                      encoded_inputs['token_type_ids']
                      + [self.pad_token_type_id] * difference)
              if 'special_tokens_mask' in encoded_inputs:
                  encoded_inputs['special_tokens_mask'] = (
                      encoded_inputs['special_tokens_mask'] + [1] * difference)
              if 'segment_ids' in encoded_inputs:
                  # Padding forms its own segment: last segment id + 1.
                  pad_segment_id = encoded_inputs['segment_ids'][-1] + 1
                  encoded_inputs['segment_ids'] = (
                      encoded_inputs['segment_ids']
                      + [pad_segment_id] * difference)
              encoded_inputs[input_key] = (
                  required_input + [self.pad_token_id] * difference)
          elif padding_side == 'left':
              if return_attention_mask:
                  encoded_inputs['attention_mask'] = (
                      [0] * difference + encoded_inputs['attention_mask'])
              if 'token_type_ids' in encoded_inputs:
                  encoded_inputs['token_type_ids'] = (
                      [self.pad_token_type_id] * difference
                      + encoded_inputs['token_type_ids'])
              if 'segment_ids' in encoded_inputs:
                  # Same convention as right padding: last segment id + 1.
                  pad_segment_id = encoded_inputs['segment_ids'][-1] + 1
                  encoded_inputs['segment_ids'] = (
                      [pad_segment_id] * difference
                      + encoded_inputs['segment_ids'])
              if 'special_tokens_mask' in encoded_inputs:
                  encoded_inputs['special_tokens_mask'] = (
                      [1] * difference + encoded_inputs['special_tokens_mask'])
              encoded_inputs[input_key] = (
                  [self.pad_token_id] * difference + required_input)
          else:
              raise ValueError('Invalid padding strategy:'
                               + str(padding_side))

          return encoded_inputs