tokenization.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. """Tokenization classes for ChatGLM."""
  2. import os
  3. from typing import Dict, List, Optional, Union
  4. import numpy as np
  5. import sentencepiece as spm
  6. from transformers.tokenization_utils import PreTrainedTokenizer
  7. from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
  8. from transformers.utils import PaddingStrategy
  9. from modelscope.utils import logger as logging
# Module-level logger, obtained from the `modelscope.utils.logger` helper
# that this file imports under the name `logging`.
logger = logging.get_logger()
  11. PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
  12. 'THUDM/chatglm-6b': 2048,
  13. }
  14. class TextTokenizer:
  15. def __init__(self, model_path):
  16. self.sp = spm.SentencePieceProcessor()
  17. self.sp.Load(model_path)
  18. self.num_tokens = self.sp.vocab_size()
  19. def encode(self, text):
  20. return self.sp.EncodeAsIds(text)
  21. def decode(self, ids: List[int]):
  22. return self.sp.DecodeIds(ids)
  23. def tokenize(self, text):
  24. return self.sp.EncodeAsPieces(text)
  25. def convert_tokens_to_ids(self, tokens):
  26. return [self.sp.PieceToId(token) for token in tokens]
  27. def convert_token_to_id(self, token):
  28. return self.sp.PieceToId(token)
  29. def convert_id_to_token(self, idx):
  30. return self.sp.IdToPiece(idx)
  31. def __len__(self):
  32. return self.num_tokens
  33. class SPTokenizer:
  34. def __init__(
  35. self,
  36. vocab_file,
  37. num_image_tokens=20000,
  38. max_blank_length=80,
  39. byte_fallback=True,
  40. ):
  41. assert vocab_file is not None
  42. self.vocab_file = vocab_file
  43. self.num_image_tokens = num_image_tokens
  44. self.special_tokens = [
  45. '[MASK]', '[gMASK]', '[sMASK]', '<unused_0>', '<sop>', '<eop>',
  46. '<ENC>', '<dBLOCK>'
  47. ]
  48. self.max_blank_length = max_blank_length
  49. self.byte_fallback = byte_fallback
  50. self.text_tokenizer = TextTokenizer(vocab_file)
  51. def _get_text_tokenizer(self):
  52. return self.text_tokenizer
  53. @staticmethod
  54. def get_blank_token(length: int):
  55. assert length >= 2
  56. return f'<|blank_{length}|>'
  57. @staticmethod
  58. def get_tab_token():
  59. return '<|tab|>'
  60. @property
  61. def num_text_tokens(self):
  62. return self.text_tokenizer.num_tokens
  63. @property
  64. def num_tokens(self):
  65. return self.num_image_tokens + self.num_text_tokens
  66. @staticmethod
  67. def _encode_whitespaces(text: str, max_len: int = 80):
  68. text = text.replace('\t', SPTokenizer.get_tab_token())
  69. for i in range(max_len, 1, -1):
  70. text = text.replace(' ' * i, SPTokenizer.get_blank_token(i))
  71. return text
  72. def _preprocess(self, text: str, linebreak=True, whitespaces=True):
  73. if linebreak:
  74. text = text.replace('\n', '<n>')
  75. if whitespaces:
  76. text = self._encode_whitespaces(
  77. text, max_len=self.max_blank_length)
  78. return text
  79. def encode(self,
  80. text: str,
  81. linebreak=True,
  82. whitespaces=True,
  83. add_dummy_prefix=True) -> List[int]:
  84. """
  85. @param text: Text to encode.
  86. @param linebreak: Whether to encode newline (\n) in text.
  87. @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
  88. @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
  89. @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
  90. """
  91. text = self._preprocess(text, linebreak, whitespaces)
  92. if not add_dummy_prefix:
  93. text = '<n>' + text
  94. tmp = self._get_text_tokenizer().encode(text)
  95. tokens = [x + self.num_image_tokens for x in tmp]
  96. return tokens if add_dummy_prefix else tokens[2:]
  97. def decode(self, text_ids: List[int]) -> str:
  98. ids = [int(_id) - self.num_image_tokens for _id in text_ids]
  99. ids = [_id for _id in ids if _id >= 0]
  100. text = self._get_text_tokenizer().decode(ids)
  101. text = text.replace('<n>', '\n')
  102. text = text.replace(SPTokenizer.get_tab_token(), '\t')
  103. for i in range(2, self.max_blank_length + 1):
  104. text = text.replace(self.get_blank_token(i), ' ' * i)
  105. return text
  106. def tokenize(self,
  107. text: str,
  108. linebreak=True,
  109. whitespaces=True,
  110. add_dummy_prefix=True) -> List[str]:
  111. """
  112. @param text: Text to encode.
  113. @param linebreak: Whether to encode newline (\n) in text.
  114. @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
  115. @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
  116. @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
  117. """
  118. text = self._preprocess(text, linebreak, whitespaces)
  119. if not add_dummy_prefix:
  120. text = '<n>' + text
  121. tokens = self._get_text_tokenizer().tokenize(text)
  122. return tokens if add_dummy_prefix else tokens[2:]
  123. def __getitem__(self, x: Union[int, str]):
  124. if isinstance(x, int):
  125. if x < self.num_image_tokens:
  126. return '<image_{}>'.format(x)
  127. else:
  128. return self.text_tokenizer.convert_id_to_token(
  129. x - self.num_image_tokens)
  130. elif isinstance(x, str):
  131. if x.startswith('<image_') and x.endswith(
  132. '>') and x[7:-1].isdigit():
  133. return int(x[7:-1])
  134. else:
  135. return self.text_tokenizer.convert_token_to_id(
  136. x) + self.num_image_tokens
  137. else:
  138. raise ValueError('The key should be str or int.')
  139. class ChatGLMTokenizer(PreTrainedTokenizer):
  140. """
  141. Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.
  142. Args:
  143. vocab_file: Path to the vocabulary file.
  144. do_lower_case: Use lower case letters.
  145. remove_space: Remove spaces.
  146. bos_token: The bos token
  147. eos_token: The Eos Token
  148. end_token: The end token
  149. mask_token: The mask token
  150. gmask_token: The gmask token
  151. padding_side: The padding side
  152. num_image_tokens: The `num_image_tokens` in `SPTokenizer`
  153. """
  154. vocab_files_names = {'vocab_file': 'ice_text.model'}
  155. max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
  156. model_input_names = ['input_ids', 'attention_mask', 'position_ids']
  157. def __init__(self,
  158. vocab_file,
  159. do_lower_case=False,
  160. remove_space=False,
  161. bos_token='<sop>',
  162. eos_token='<eop>',
  163. end_token='</s>',
  164. mask_token='[MASK]',
  165. gmask_token='[gMASK]',
  166. padding_side='left',
  167. num_image_tokens=20000,
  168. **kwargs) -> None:
  169. self.sp_tokenizer = SPTokenizer(
  170. vocab_file, num_image_tokens=num_image_tokens)
  171. super().__init__(
  172. do_lower_case=do_lower_case,
  173. remove_space=remove_space,
  174. padding_side=padding_side,
  175. bos_token=bos_token,
  176. eos_token=eos_token,
  177. end_token=end_token,
  178. mask_token=mask_token,
  179. gmask_token=gmask_token,
  180. num_image_tokens=num_image_tokens,
  181. **kwargs)
  182. self.do_lower_case = do_lower_case
  183. self.remove_space = remove_space
  184. self.vocab_file = vocab_file
  185. self.bos_token = bos_token
  186. self.eos_token = eos_token
  187. self.end_token = end_token
  188. self.mask_token = mask_token
  189. self.gmask_token = gmask_token
  190. """ Initialisation """
  191. @property
  192. def gmask_token_id(self) -> Optional[int]:
  193. if self.gmask_token is None:
  194. return None
  195. return self.convert_tokens_to_ids(self.gmask_token)
  196. @property
  197. def end_token_id(self) -> Optional[int]:
  198. """
  199. `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
  200. set.
  201. """
  202. if self.end_token is None:
  203. return None
  204. return self.convert_tokens_to_ids(self.end_token)
  205. @property
  206. def vocab_size(self):
  207. """ Returns vocab size """
  208. return self.sp_tokenizer.num_tokens
  209. def get_vocab(self):
  210. """ Returns vocab as a dict """
  211. vocab = {
  212. self._convert_id_to_token(i): i
  213. for i in range(self.vocab_size)
  214. }
  215. vocab.update(self.added_tokens_encoder)
  216. return vocab
  217. def preprocess_text(self, inputs):
  218. if self.remove_space:
  219. outputs = ' '.join(inputs.strip().split())
  220. else:
  221. outputs = inputs
  222. if self.do_lower_case:
  223. outputs = outputs.lower()
  224. return outputs
  225. def _tokenize(self, text, **kwargs):
  226. """ Returns a tokenized string. """
  227. text = self.preprocess_text(text)
  228. seq = self.sp_tokenizer.tokenize(text)
  229. return seq
  230. def _decode(self,
  231. token_ids: Union[int, List[int]],
  232. skip_special_tokens: bool = False,
  233. clean_up_tokenization_spaces: bool = True,
  234. **kwargs) -> str:
  235. if isinstance(token_ids, int):
  236. token_ids = [token_ids]
  237. if len(token_ids) == 0:
  238. return ''
  239. if self.pad_token_id in token_ids: # remove pad
  240. token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
  241. return self.sp_tokenizer.decode(token_ids)
  242. def _convert_token_to_id(self, token):
  243. """ Converts a token (str) in an id using the vocab. """
  244. return self.sp_tokenizer[token]
  245. def _convert_id_to_token(self, index):
  246. """Converts an index (integer) in a token (str) using the vocab."""
  247. return self.sp_tokenizer[index]
  248. def save_vocabulary(self, save_directory, filename_prefix=None):
  249. """
  250. Save the vocabulary and special tokens file to a directory.
  251. Args:
  252. save_directory (`str`):
  253. The directory in which to save the vocabulary.
  254. filename_prefix (`str`, *optional*):
  255. An optional prefix to add to the named of the saved files.
  256. Returns:
  257. `Tuple(str)`: Paths to the files saved.
  258. """
  259. if os.path.isdir(save_directory):
  260. vocab_file = os.path.join(save_directory,
  261. self.vocab_files_names['vocab_file'])
  262. else:
  263. vocab_file = save_directory
  264. with open(self.vocab_file, 'rb') as fin:
  265. proto_str = fin.read()
  266. with open(vocab_file, 'wb') as writer:
  267. writer.write(proto_str)
  268. return (vocab_file, )
  269. def build_inputs_with_special_tokens(
  270. self,
  271. token_ids_0: List[int],
  272. token_ids_1: Optional[List[int]] = None) -> List[int]:
  273. """
  274. Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
  275. adding special tokens. A BERT sequence has the following format:
  276. - single sequence: `[CLS] X [SEP]`
  277. - pair of sequences: `[CLS] A [SEP] B [SEP]`
  278. Args:
  279. token_ids_0 (`List[int]`):
  280. List of IDs to which the special tokens will be added.
  281. token_ids_1 (`List[int]`, *optional*):
  282. Optional second list of IDs for sequence pairs.
  283. Returns:
  284. `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
  285. """
  286. mask_ids = self.sp_tokenizer[self.mask_token]
  287. gmask_ids = self.sp_tokenizer[self.gmask_token]
  288. eos_id = self.sp_tokenizer[self.eos_token]
  289. if mask_ids not in token_ids_0 and gmask_ids not in token_ids_0:
  290. token_ids_0 += [gmask_ids]
  291. if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
  292. token_ids_0 += [self.sp_tokenizer[self.end_token]]
  293. token_ids_0 += [self.sp_tokenizer[self.bos_token]]
  294. if token_ids_1 is not None:
  295. if not token_ids_1 or token_ids_1[-1] != eos_id:
  296. token_ids_1 += [eos_id]
  297. token_ids_0 += token_ids_1
  298. return token_ids_0
  299. def _pad(
  300. self,
  301. encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
  302. max_length: Optional[int] = None,
  303. padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
  304. pad_to_multiple_of: Optional[int] = None,
  305. return_attention_mask: Optional[bool] = None,
  306. ) -> dict:
  307. """
  308. Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
  309. Args:
  310. encoded_inputs:
  311. Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
  312. max_length: maximum length of the returned list and optionally padding length (see below).
  313. Will truncate by taking into account the special tokens.
  314. padding_strategy: PaddingStrategy to use for padding.
  315. - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
  316. - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
  317. - PaddingStrategy.DO_NOT_PAD: Do not pad
  318. The tokenizer padding sides are defined in self.padding_side:
  319. - 'left': pads on the left of the sequences
  320. - 'right': pads on the right of the sequences
  321. pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
  322. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
  323. `>= 7.5` (Volta).
  324. return_attention_mask:
  325. (optional) Set to False to avoid returning attention mask (default: set to model specifics)
  326. """
  327. # Load from model defaults
  328. bos_token_id = self.sp_tokenizer[self.bos_token]
  329. mask_token_id = self.sp_tokenizer[self.mask_token]
  330. gmask_token_id = self.sp_tokenizer[self.gmask_token]
  331. assert self.padding_side == 'left'
  332. required_input = encoded_inputs[self.model_input_names[0]]
  333. seq_length = len(required_input)
  334. if padding_strategy == PaddingStrategy.LONGEST:
  335. max_length = len(required_input)
  336. if max_length is not None and pad_to_multiple_of is not None and (
  337. max_length % pad_to_multiple_of != 0):
  338. max_length = (
  339. (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
  340. needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
  341. required_input) != max_length
  342. # Initialize attention mask if not present.
  343. if max_length is not None:
  344. if 'attention_mask' not in encoded_inputs:
  345. if bos_token_id in required_input:
  346. context_length = required_input.index(bos_token_id)
  347. else:
  348. context_length = seq_length
  349. attention_mask = np.ones((1, seq_length, seq_length))
  350. attention_mask = np.tril(attention_mask)
  351. attention_mask[:, :, :context_length] = 1
  352. attention_mask = np.bool_(attention_mask < 0.5)
  353. encoded_inputs['attention_mask'] = attention_mask
  354. if 'position_ids' not in encoded_inputs:
  355. position_ids = np.arange(seq_length, dtype=np.int64)
  356. mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
  357. if mask_token in required_input:
  358. mask_position = required_input.index(mask_token)
  359. position_ids[context_length:] = mask_position
  360. block_position_ids = np.concatenate([
  361. np.zeros(context_length, dtype=np.int64),
  362. np.arange(
  363. 1, seq_length - context_length + 1, dtype=np.int64)
  364. ])
  365. encoded_inputs['position_ids'] = np.stack(
  366. [position_ids, block_position_ids], axis=0)
  367. if needs_to_be_padded:
  368. difference = max_length - len(required_input)
  369. if 'attention_mask' in encoded_inputs:
  370. encoded_inputs['attention_mask'] = np.pad(
  371. encoded_inputs['attention_mask'],
  372. pad_width=[(0, 0), (difference, 0), (difference, 0)],
  373. mode='constant',
  374. constant_values=True)
  375. if 'token_type_ids' in encoded_inputs:
  376. encoded_inputs['token_type_ids'] = [
  377. self.pad_token_type_id
  378. ] * difference + encoded_inputs['token_type_ids']
  379. if 'special_tokens_mask' in encoded_inputs:
  380. encoded_inputs['special_tokens_mask'] = [
  381. 1
  382. ] * difference + encoded_inputs['special_tokens_mask']
  383. if 'position_ids' in encoded_inputs:
  384. encoded_inputs['position_ids'] = np.pad(
  385. encoded_inputs['position_ids'],
  386. pad_width=[(0, 0), (difference, 0)])
  387. encoded_inputs[self.model_input_names[
  388. 0]] = [self.pad_token_id] * difference + required_input
  389. return encoded_inputs