tokenization.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. # Copyright (c) Alibaba Cloud.
  2. #
  3. # This source code is licensed under the license found in the
  4. # LICENSE file in the root directory of this source tree.
  5. """Tokenization classes for QWen."""
  6. from __future__ import (absolute_import, division, print_function,
  7. unicode_literals)
  8. import base64
  9. import logging
  10. import os
  11. import unicodedata
  12. from io import open
  13. from typing import List, Optional, Tuple, Union
  14. import json
  15. import tiktoken
  16. from transformers import AddedToken, PreTrainedTokenizer
  17. from modelscope.utils.logger import get_logger
  18. logger = get_logger()
  19. VOCAB_FILES_NAMES = {'vocab_file': 'qwen.tiktoken'}
  20. class QWenTokenizer(PreTrainedTokenizer):
  21. """QWen tokenizer."""
  22. """NOTE: This tokenizer will not handle special tokens to avoid injection attacks"""
  23. vocab_files_names = VOCAB_FILES_NAMES
  24. def __init__(
  25. self,
  26. vocab_file,
  27. errors='replace',
  28. max_len=None,
  29. unk_token='<|endoftext|>',
  30. bos_token='<|endoftext|>',
  31. eos_token='<|endoftext|>',
  32. pad_token=None,
  33. add_prefix_space=False,
  34. add_bos_token=False,
  35. add_more_sp_tokens=True,
  36. **kwargs,
  37. ):
  38. bos_token = (
  39. AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(
  40. bos_token, str) else bos_token)
  41. eos_token = (
  42. AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(
  43. eos_token, str) else eos_token)
  44. unk_token = (
  45. AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(
  46. unk_token, str) else unk_token)
  47. pad_token = (
  48. AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(
  49. pad_token, str) else pad_token)
  50. super().__init__(
  51. errors=errors,
  52. unk_token=unk_token,
  53. bos_token=bos_token,
  54. eos_token=eos_token,
  55. pad_token=pad_token,
  56. add_prefix_space=add_prefix_space,
  57. add_bos_token=add_bos_token,
  58. )
  59. self.add_bos_token = add_bos_token
  60. self.max_len = max_len if max_len is not None else int(1e12)
  61. self.errors = errors # how to handle errors in decoding
  62. name = 'Qwen'
  63. ENDOFTEXT = '<|endoftext|>'
  64. IMSTART = '<|im_start|>'
  65. IMEND = '<|im_end|>'
  66. if add_more_sp_tokens:
  67. special_tokens = (
  68. ENDOFTEXT,
  69. IMSTART,
  70. IMEND,
  71. '<R>',
  72. '<S>',
  73. '<X>',
  74. '<mask>',
  75. '<sep>',
  76. ) + tuple([f'<extra_{i}>' for i in range(200)])
  77. else:
  78. special_tokens = (ENDOFTEXT, IMSTART, IMEND)
  79. PAT_STR = (
  80. r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}|"""
  81. r""" ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""")
  82. def load_tiktoken_bpe(tiktoken_bpe_file: str) -> 'dict[bytes, int]':
  83. contents = open(tiktoken_bpe_file, 'rb').read()
  84. return {
  85. base64.b64decode(token): int(rank)
  86. for token, rank in (line.split()
  87. for line in contents.splitlines() if line)
  88. }
  89. mergeable_ranks = load_tiktoken_bpe(vocab_file)
  90. special_tokens = {
  91. token: index
  92. for index, token in enumerate(
  93. special_tokens, start=len(mergeable_ranks))
  94. }
  95. self.special_tokens = special_tokens
  96. enc = tiktoken.Encoding(
  97. name,
  98. pat_str=PAT_STR,
  99. mergeable_ranks=mergeable_ranks,
  100. special_tokens=special_tokens,
  101. )
  102. assert (
  103. len(mergeable_ranks) + len(special_tokens) == enc.n_vocab
  104. ), f'{len(mergeable_ranks) + len(special_tokens)} != {enc.n_vocab} in encoding'
  105. self.mergeable_ranks = mergeable_ranks
  106. self.encoder = self.mergeable_ranks
  107. self.decoder = {v: k for k, v in self.encoder.items()}
  108. self.tokenizer = enc # type: tiktoken.Encoding
  109. self.eod_id = self.tokenizer.eot_token
  110. self.im_start_id = special_tokens[IMSTART]
  111. self.im_end_id = special_tokens[IMEND]
  112. def __len__(self):
  113. return self.tokenizer.n_vocab
  114. def get_vocab(self):
  115. return self.mergeable_ranks
  116. def convert_tokens_to_ids(self, tokens):
  117. ids = []
  118. # Remove support for py2
  119. if isinstance(tokens, str):
  120. if tokens in self.special_tokens:
  121. return self.special_tokens[tokens]
  122. else:
  123. return self.encoder.get(tokens)
  124. for token in tokens:
  125. if token in self.special_tokens:
  126. ids.append(self.special_tokens[token])
  127. else:
  128. ids.append(self.encoder.get(token))
  129. if len(ids) > self.max_len:
  130. logger.warning(
  131. 'Token indices sequence length is longer than the specified maximum '
  132. ' sequence length for this model ({} > {}). Running this'
  133. ' sequence through the model will result in indexing errors'.
  134. format(len(ids), self.max_len))
  135. return ids
  136. def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
  137. """
  138. Save only the vocabulary of the tokenizer (vocabulary + added tokens).
  139. Returns:
  140. `Tuple(str)`: Paths to the files saved.
  141. """
  142. file_path = os.path.join(save_directory, 'qwen.tiktoken')
  143. with open(file_path, 'w', encoding='utf8') as w:
  144. for k, v in self.mergeable_ranks.items():
  145. line = base64.b64encode(k).decode('utf8') + ' ' + str(v) + '\n'
  146. w.write(line)
  147. return (file_path, )
  148. def tokenize(self, text: str, **kwargs) -> List[str]:
  149. """
  150. Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.
  151. Args:
  152. text (`str`):
  153. The sequence to be encoded.
  154. kwargs (additional keyword arguments, *optional*):
  155. Will be passed to the underlying model specific encode method. See details in
  156. [`~PreTrainedTokenizerBase.__call__`]
  157. Returns:
  158. `List[str]`: The list of tokens.
  159. """
  160. tokens = []
  161. text = unicodedata.normalize('NFC', text)
  162. for t in self.tokenizer.encode_ordinary(text):
  163. tokens.append(self.decoder[t])
  164. return tokens
  165. def convert_tokens_to_string(self, tokens: List[str]) -> str:
  166. """
  167. Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
  168. often want to remove sub-word tokenization artifacts at the same time.
  169. """
  170. text = ''.join(tokens)
  171. text = bytearray([self.byte_decoder[c] for c in text]).decode(
  172. 'utf-8', errors=self.errors)
  173. return text
  174. @property
  175. def vocab_size(self):
  176. return self.tokenizer.n_vocab
  177. def _convert_id_to_token(self, index: int) -> str:
  178. if index >= self.tokenizer.n_vocab:
  179. return self.unk_token
  180. return self.tokenizer.decode([index])
  181. def _convert_token_to_id(self, token: str) -> int:
  182. """Converts a token to an id using the vocab."""
  183. return self.encoder.get(
  184. token.encode('UTF-8'),
  185. self.tokenizer.encode(self.unk_token, allowed_special='all')[0],
  186. )
  187. @property
  188. def all_special_tokens(self) -> List[str]:
  189. """
  190. `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
  191. Convert tokens of `tokenizers.AddedToken` type to string.
  192. """
  193. all_toks = [str(s) for s in self.special_tokens.keys()]
  194. return all_toks
  195. @property
  196. def all_special_ids(self) -> List[int]:
  197. """
  198. `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
  199. """
  200. all_ids = [v for v in self.special_tokens.values()]
  201. return all_ids
  202. def _tokenize(self, text, **kwargs):
  203. """
  204. Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
  205. vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
  206. Do NOT take care of added tokens.
  207. """
  208. raise NotImplementedError
  209. def _decode(
  210. self,
  211. token_ids: Union[int, List[int]],
  212. skip_special_tokens: bool = False,
  213. **kwargs,
  214. ) -> str:
  215. if isinstance(token_ids, int):
  216. token_ids = [token_ids]
  217. if skip_special_tokens:
  218. token_ids = [i for i in token_ids if i not in self.all_special_ids]
  219. return self.tokenizer.decode(token_ids)