tokenization_cpmant.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. # coding=utf-8
  2. # Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Tokenization classes for CPMAnt."""
  16. import collections
  17. import os
  18. from typing import Optional
  19. from transformers.utils import is_rjieba_available, requires_backends
  20. if is_rjieba_available():
  21. import rjieba
  22. from ...tokenization_utils import PreTrainedTokenizer
  23. from ...utils import logging
# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)

# Maps the vocabulary-file argument name to the default file name used when saving.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
  26. def load_vocab(vocab_file):
  27. """Loads a vocabulary file into a dictionary."""
  28. vocab = collections.OrderedDict()
  29. with open(vocab_file, "r", encoding="utf-8") as reader:
  30. tokens = reader.readlines()
  31. for index, token in enumerate(tokens):
  32. token = token.rstrip("\n")
  33. vocab[token] = index
  34. return vocab
  35. class WordpieceTokenizer:
  36. def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
  37. self.vocab = vocab
  38. self.unk_token = unk_token
  39. self.max_input_chars_per_word = max_input_chars_per_word
  40. def tokenize(self, token):
  41. chars = list(token)
  42. if len(chars) > self.max_input_chars_per_word:
  43. return [self.unk_token]
  44. start = 0
  45. sub_tokens = []
  46. while start < len(chars):
  47. end = len(chars)
  48. cur_substr = None
  49. while start < end:
  50. substr = "".join(chars[start:end])
  51. if substr in self.vocab:
  52. cur_substr = substr
  53. break
  54. end -= 1
  55. if cur_substr is None:
  56. sub_tokens.append(self.unk_token)
  57. start += 1
  58. else:
  59. sub_tokens.append(cur_substr)
  60. start = end
  61. return sub_tokens
  62. class CpmAntTokenizer(PreTrainedTokenizer):
  63. """
  64. Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.
  65. Args:
  66. vocab_file (`str`):
  67. Path to the vocabulary file.
  68. bod_token (`str`, *optional*, defaults to `"<d>"`):
  69. The beginning of document token.
  70. eod_token (`str`, *optional*, defaults to `"</d>"`):
  71. The end of document token.
  72. bos_token (`str`, *optional*, defaults to `"<s>"`):
  73. The beginning of sequence token.
  74. eos_token (`str`, *optional*, defaults to `"</s>"`):
  75. The end of sequence token.
  76. pad_token (`str`, *optional*, defaults to `"<pad>"`):
  77. The token used for padding.
  78. unk_token (`str`, *optional*, defaults to `"<unk>"`):
  79. The unknown token.
  80. line_token (`str`, *optional*, defaults to `"</n>"`):
  81. The line token.
  82. space_token (`str`, *optional*, defaults to `"</_>"`):
  83. The space token.
  84. """
  85. vocab_files_names = VOCAB_FILES_NAMES
  86. model_input_names = ["input_ids", "attention_mask"]
  87. add_prefix_space = False
  88. def __init__(
  89. self,
  90. vocab_file,
  91. bod_token="<d>",
  92. eod_token="</d>",
  93. bos_token="<s>",
  94. eos_token="</s>",
  95. pad_token="<pad>",
  96. unk_token="<unk>",
  97. line_token="</n>",
  98. space_token="</_>",
  99. padding_side="left",
  100. **kwargs,
  101. ):
  102. requires_backends(self, ["rjieba"])
  103. self.bod_token = bod_token
  104. self.eod_token = eod_token
  105. self.encoder = load_vocab(vocab_file)
  106. self.encoder[" "] = self.encoder[space_token]
  107. self.encoder["\n"] = self.encoder[line_token]
  108. del self.encoder[space_token]
  109. del self.encoder[line_token]
  110. self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
  111. self.decoder = {v: k for k, v in self.encoder.items()}
  112. self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)
  113. super().__init__(
  114. bod_token=bod_token,
  115. eod_token=eod_token,
  116. bos_token=bos_token,
  117. eos_token=eos_token,
  118. pad_token=pad_token,
  119. unk_token=unk_token,
  120. line_token=line_token,
  121. space_token=space_token,
  122. padding_side=padding_side,
  123. **kwargs,
  124. )
  125. @property
  126. def bod_token_id(self):
  127. return self.encoder[self.bod_token]
  128. @property
  129. def eod_token_id(self):
  130. return self.encoder[self.eod_token]
  131. @property
  132. def newline_id(self):
  133. return self.encoder["\n"]
  134. @property
  135. def vocab_size(self) -> int:
  136. return len(self.encoder)
  137. def get_vocab(self):
  138. return dict(self.encoder, **self.added_tokens_encoder)
  139. def _tokenize(self, text):
  140. """Tokenize a string."""
  141. output_tokens = []
  142. for x in rjieba.cut(text, False):
  143. output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
  144. return output_tokens
  145. def _decode(self, token_ids, **kwargs):
  146. """Decode ids into a string."""
  147. token_ids = [i for i in token_ids if i >= 0]
  148. token_ids = [
  149. x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
  150. ]
  151. return super()._decode(token_ids, **kwargs)
  152. def check(self, token):
  153. return token in self.encoder
  154. def convert_tokens_to_string(self, tokens: list[str]) -> str:
  155. return "".join(tokens)
  156. def _convert_token_to_id(self, token):
  157. """Converts a token (str) in an id using the vocab."""
  158. return self.encoder.get(token, self.encoder.get(self.unk_token))
  159. def _convert_id_to_token(self, index):
  160. """Converts an index (integer) in a token (str) using the vocab."""
  161. return self.decoder.get(index, self.unk_token)
  162. def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
  163. if os.path.isdir(save_directory):
  164. vocab_file = os.path.join(
  165. save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
  166. )
  167. else:
  168. vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
  169. index = 0
  170. if " " in self.encoder:
  171. self.encoder["</_>"] = self.encoder[" "]
  172. del self.encoder[" "]
  173. if "\n" in self.encoder:
  174. self.encoder["</n>"] = self.encoder["\n"]
  175. del self.encoder["\n"]
  176. self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
  177. with open(vocab_file, "w", encoding="utf-8") as writer:
  178. for token, token_index in self.encoder.items():
  179. if index != token_index:
  180. logger.warning(
  181. f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
  182. " Please check that the vocabulary is not corrupted!"
  183. )
  184. index = token_index
  185. writer.write(token + "\n")
  186. index += 1
  187. return (vocab_file,)
  188. def build_inputs_with_special_tokens(
  189. self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
  190. ) -> list[int]:
  191. """
  192. Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
  193. adding special tokens. A CPMAnt sequence has the following format:
  194. - single sequence: `[BOS] Sequence`.
  195. Args:
  196. token_ids_0 (`list[int]`): The first tokenized sequence that special tokens will be added.
  197. token_ids_1 (`list[int]`): The optional second tokenized sequence that special tokens will be added.
  198. Returns:
  199. `list[int]`: The model input with special tokens.
  200. """
  201. if token_ids_1 is None:
  202. return [self.bos_token_id] + token_ids_0
  203. return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1
  204. def get_special_tokens_mask(
  205. self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
  206. ) -> list[int]:
  207. """
  208. Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
  209. special tokens using the tokenizer `prepare_for_model` method.
  210. Args:
  211. token_ids_0 (`list[int]`): List of IDs.
  212. token_ids_1 (`list[int]`, *optional*): Optional second list of IDs for sequence pairs.
  213. already_has_special_tokens (`bool`, *optional*, defaults to `False`):
  214. Whether or not the token list is already formatted with special tokens for the model.
  215. Returns:
  216. `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
  217. """
  218. if already_has_special_tokens:
  219. return super().get_special_tokens_mask(
  220. token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
  221. )
  222. if token_ids_1 is not None:
  223. return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
  224. return [1] + ([0] * len(token_ids_0))
# Names exported by `from <module> import *`.
__all__ = ["CpmAntTokenizer"]