ggml.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780
  1. # coding=utf-8
  2. # Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991)
  3. # https://github.com/99991/pygguf
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """
  17. Integration with GGML / The file is copied and adapted from https://github.com/99991/pygguf
  18. with extra methods being exposed
  19. """
  20. from array import array
  21. import numpy as np
  22. from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
  23. from tokenizers.models import BPE, Unigram
  24. from .. import AddedToken
  25. from ..convert_slow_tokenizer import GemmaConverter, GPT2Converter, LlamaConverter, Qwen2Converter, T5Converter
  26. from ..utils import logging
  27. from ..utils.logging import tqdm
  28. logger = logging.get_logger(__name__)
  29. GGUF_CONFIG_MAPPING = {
  30. "general": {
  31. "architecture": "model_type",
  32. "name": "_model_name_or_path",
  33. },
  34. "llama": {
  35. "context_length": "max_position_embeddings",
  36. "block_count": "num_hidden_layers",
  37. "feed_forward_length": "intermediate_size",
  38. "embedding_length": "hidden_size",
  39. # NOTE: rope.dimension_count==head_dim only suitable for llama/mistral
  40. "rope.dimension_count": "head_dim",
  41. "rope.freq_base": "rope_theta",
  42. "attention.head_count": "num_attention_heads",
  43. "attention.head_count_kv": "num_key_value_heads",
  44. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  45. "vocab_size": "vocab_size",
  46. },
  47. "mistral": {
  48. "context_length": "max_position_embeddings",
  49. "block_count": "num_hidden_layers",
  50. "feed_forward_length": "intermediate_size",
  51. "embedding_length": "hidden_size",
  52. # NOTE: rope.dimension_count==head_dim only suitable for llama/mistral
  53. "rope.dimension_count": "head_dim",
  54. "rope.freq_base": "rope_theta",
  55. "attention.head_count": "num_attention_heads",
  56. "attention.head_count_kv": "num_key_value_heads",
  57. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  58. "vocab_size": "vocab_size",
  59. },
  60. "qwen2": {
  61. "context_length": "max_position_embeddings",
  62. "block_count": "num_hidden_layers",
  63. "feed_forward_length": "intermediate_size",
  64. "embedding_length": "hidden_size",
  65. "rope.dimension_count": None,
  66. "rope.freq_base": "rope_theta",
  67. "attention.head_count": "num_attention_heads",
  68. "attention.head_count_kv": "num_key_value_heads",
  69. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  70. "vocab_size": "vocab_size",
  71. },
  72. "qwen2moe": {
  73. "context_length": "max_position_embeddings",
  74. "block_count": "num_hidden_layers",
  75. "feed_forward_length": "intermediate_size",
  76. "embedding_length": "hidden_size",
  77. "rope.dimension_count": None,
  78. "rope.freq_base": "rope_theta",
  79. "attention.head_count": "num_attention_heads",
  80. "attention.head_count_kv": "num_key_value_heads",
  81. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  82. "vocab_size": "vocab_size",
  83. "expert_count": "num_experts",
  84. "expert_used_count": "num_experts_per_tok",
  85. },
  86. "lfm2": {
  87. "context_length": "max_position_embeddings",
  88. "block_count": "num_hidden_layers",
  89. "feed_forward_length": "intermediate_size",
  90. "embedding_length": "hidden_size",
  91. "rope.dimension_count": None,
  92. "rope.freq_base": "rope_theta",
  93. "attention.head_count": "num_attention_heads",
  94. "attention.head_count_kv": "num_key_value_heads",
  95. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  96. "vocab_size": "vocab_size",
  97. "shortconv.l_cache": "conv_L_cache",
  98. },
  99. "qwen3": {
  100. "context_length": "max_position_embeddings",
  101. "block_count": "num_hidden_layers",
  102. "feed_forward_length": "intermediate_size",
  103. "embedding_length": "hidden_size",
  104. "rope.dimension_count": None,
  105. "rope.freq_base": "rope_theta",
  106. "attention.head_count": "num_attention_heads",
  107. "attention.head_count_kv": "num_key_value_heads",
  108. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  109. "vocab_size": "vocab_size",
  110. },
  111. "qwen3_moe": {
  112. "context_length": "max_position_embeddings",
  113. "block_count": "num_hidden_layers",
  114. "feed_forward_length": "intermediate_size",
  115. "embedding_length": "hidden_size",
  116. "rope.dimension_count": None,
  117. "rope.freq_base": "rope_theta",
  118. "attention.key_length": "head_dim",
  119. "attention.head_count": "num_attention_heads",
  120. "attention.head_count_kv": "num_key_value_heads",
  121. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  122. "vocab_size": "vocab_size",
  123. "expert_count": "num_experts",
  124. "expert_used_count": "num_experts_per_tok",
  125. },
  126. "falcon": {
  127. "context_length": "max_position_embeddings",
  128. "block_count": "num_hidden_layers",
  129. "feed_forward_length": "intermediate_size",
  130. "embedding_length": "hidden_size",
  131. "rope.dimension_count": None,
  132. "rope.freq_base": "rope_theta",
  133. "attention.head_count": "num_attention_heads",
  134. "attention.head_count_kv": "num_key_value_heads",
  135. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  136. "vocab_size": "vocab_size",
  137. },
  138. "tokenizer": {
  139. "ggml.bos_token_id": "bos_token_id",
  140. "ggml.eos_token_id": "eos_token_id",
  141. "ggml.unknown_token_id": "unk_token_id",
  142. "ggml.padding_token_id": "pad_token_id",
  143. },
  144. "phi3": {
  145. "context_length": "max_position_embeddings",
  146. "block_count": "num_hidden_layers",
  147. "feed_forward_length": "intermediate_size",
  148. "embedding_length": "hidden_size",
  149. "rope.dimension_count": None,
  150. "rope.freq_base": "rope_theta",
  151. "attention.head_count": "num_attention_heads",
  152. "attention.head_count_kv": "num_key_value_heads",
  153. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  154. "vocab_size": "vocab_size",
  155. },
  156. "bloom": {
  157. "block_count": "n_layer",
  158. "embedding_length": "hidden_size",
  159. "attention.head_count": "n_head",
  160. "vocab_size": "vocab_size",
  161. "attention.layer_norm_epsilon": "layer_norm_epsilon",
  162. },
  163. "t5": {
  164. "context_length": "n_positions",
  165. "block_count": "num_layers",
  166. "feed_forward_length": "d_ff",
  167. "embedding_length": "d_model",
  168. "attention.key_length": "d_kv",
  169. "attention.head_count": "num_heads",
  170. "attention.head_count_kv": "num_key_value_heads",
  171. "attention.layer_norm_epsilon": "layer_norm_epsilon",
  172. "attention.relative_buckets_count": "relative_attention_num_buckets",
  173. "decoder_start_token_id": "decoder_start_token_id",
  174. "vocab_size": "vocab_size",
  175. },
  176. "stablelm": {
  177. "context_length": "max_position_embeddings",
  178. "block_count": "num_hidden_layers",
  179. "feed_forward_length": "intermediate_size",
  180. "embedding_length": "hidden_size",
  181. "rope.dimension_count": None,
  182. "attention.head_count": "num_attention_heads",
  183. "attention.head_count_kv": "num_key_value_heads",
  184. "attention.layer_norm_epsilon": "layer_norm_eps",
  185. "vocab_size": "vocab_size",
  186. },
  187. "gpt2": {
  188. "block_count": "n_layer",
  189. "context_length": "n_ctx",
  190. "embedding_length": "n_embd",
  191. "feed_forward_length": "feed_forward_length",
  192. "attention.head_count": "n_head",
  193. "attention.layer_norm_epsilon": "layer_norm_epsilon",
  194. },
  195. "starcoder2": {
  196. "block_count": "num_hidden_layers",
  197. "context_length": "max_position_embeddings",
  198. "embedding_length": "hidden_size",
  199. "feed_forward_length": "intermediate_size",
  200. "attention.head_count": "num_attention_heads",
  201. "attention.head_count_kv": "num_key_value_heads",
  202. "attention.layer_norm_epsilon": "norm_epsilon",
  203. },
  204. "mamba": {
  205. "vocab_size": "vocab_size",
  206. "context_length": "max_position_embeddings",
  207. "embedding_length": "hidden_size",
  208. "attention.layer_norm_rms_epsilon": "layer_norm_epsilon",
  209. "block_count": "num_hidden_layers",
  210. "ssm.conv_kernel": "conv_kernel",
  211. "ssm.state_size": "state_size",
  212. "ssm.time_step_rank": "time_step_rank",
  213. "ssm.inner_size": "intermediate_size",
  214. },
  215. "nemotron": {
  216. "context_length": "max_position_embeddings",
  217. "block_count": "num_hidden_layers",
  218. "feed_forward_length": "intermediate_size",
  219. "embedding_length": "hidden_size",
  220. "rope.dimension_count": None,
  221. "rope.freq_base": "rope_theta",
  222. "attention.head_count": "num_attention_heads",
  223. "attention.head_count_kv": "num_key_value_heads",
  224. "attention.layer_norm_rms_epsilon": "norm_eps",
  225. "vocab_size": "vocab_size",
  226. },
  227. "gemma2": {
  228. "context_length": "max_position_embeddings",
  229. "block_count": "num_hidden_layers",
  230. "feed_forward_length": "intermediate_size",
  231. "embedding_length": "hidden_size",
  232. "rope.dimension_count": None,
  233. "rope.freq_base": "rope_theta",
  234. # NOTE: Gemma2 has key_length==value_length==head_dim
  235. # See: https://github.com/ggerganov/llama.cpp/blob/2e2f8f093cd4fb6bbb87ba84f6b9684fa082f3fa/convert_hf_to_gguf.py#L3293-L3294
  236. "attention.key_length": "head_dim",
  237. "attention.head_count": "num_attention_heads",
  238. "attention.head_count_kv": "num_key_value_heads",
  239. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  240. "attention.sliding_window": "sliding_window",
  241. "vocab_size": "vocab_size",
  242. },
  243. "gemma3": {
  244. "context_length": "max_position_embeddings",
  245. "block_count": "num_hidden_layers",
  246. "feed_forward_length": "intermediate_size",
  247. "embedding_length": "hidden_size",
  248. "rope.dimension_count": None,
  249. "rope.freq_base": "rope_theta",
  250. # NOTE: Gemma3 has key_length==value_length==head_dim
  251. # See: https://github.com/ggml-org/llama.cpp/blob/fe5b78c89670b2f37ecb216306bed3e677b49d9f/convert_hf_to_gguf.py#L3495-L3496
  252. "attention.key_length": "head_dim",
  253. "attention.head_count": "num_attention_heads",
  254. "attention.head_count_kv": "num_key_value_heads",
  255. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  256. "attention.sliding_window": "sliding_window",
  257. "vocab_size": "vocab_size",
  258. },
  259. "umt5": {
  260. "context_length": "n_positions",
  261. "block_count": "num_layers",
  262. "feed_forward_length": "d_ff",
  263. "embedding_length": "d_model",
  264. "attention.key_length": "d_kv",
  265. "attention.head_count": "num_heads",
  266. "attention.head_count_kv": "num_key_value_heads",
  267. "attention.layer_norm_epsilon": "layer_norm_epsilon",
  268. "attention.relative_buckets_count": "relative_attention_num_buckets",
  269. "decoder_start_token_id": "decoder_start_token_id",
  270. "vocab_size": "vocab_size",
  271. },
  272. "deci": {
  273. "context_length": "max_position_embeddings",
  274. "block_count": "num_hidden_layers",
  275. "feed_forward_length": "intermediate_size",
  276. "embedding_length": "hidden_size",
  277. "rope.dimension_count": None,
  278. "rope.freq_base": "rope_theta",
  279. "attention.head_count": "num_attention_heads",
  280. "attention.head_count_kv": "num_key_value_heads",
  281. "attention.layer_norm_rms_epsilon": "rms_norm_eps",
  282. "vocab_size": "vocab_size",
  283. },
  284. }
  285. GGUF_TOKENIZER_MAPPING = {
  286. "tokenizer": {
  287. "ggml.model": "tokenizer_type",
  288. "ggml.tokens": "tokens",
  289. "ggml.scores": "scores",
  290. "ggml.token_type": "token_type",
  291. "ggml.merges": "merges",
  292. "ggml.bos_token_id": "bos_token_id",
  293. "ggml.eos_token_id": "eos_token_id",
  294. "ggml.unknown_token_id": "unk_token_id",
  295. "ggml.padding_token_id": "pad_token_id",
  296. "ggml.add_space_prefix": "add_prefix_space",
  297. },
  298. "tokenizer_config": {
  299. "chat_template": "chat_template",
  300. "ggml.model": "model_type",
  301. "ggml.bos_token_id": "bos_token_id",
  302. "ggml.eos_token_id": "eos_token_id",
  303. "ggml.unknown_token_id": "unk_token_id",
  304. "ggml.padding_token_id": "pad_token_id",
  305. },
  306. }
  307. def _gguf_parse_value(_value, data_type):
  308. if not isinstance(data_type, list):
  309. data_type = [data_type]
  310. if len(data_type) == 1:
  311. data_type = data_type[0]
  312. array_data_type = None
  313. else:
  314. if data_type[0] != 9:
  315. raise ValueError("Received multiple types, therefore expected the first type to indicate an array.")
  316. data_type, array_data_type = data_type
  317. if data_type in [0, 1, 2, 3, 4, 5, 10, 11]:
  318. _value = int(_value[0])
  319. elif data_type in [6, 12]:
  320. _value = float(_value[0])
  321. elif data_type == 7:
  322. _value = bool(_value[0])
  323. elif data_type == 8:
  324. _value = array("B", list(_value)).tobytes().decode()
  325. elif data_type == 9:
  326. _value = _gguf_parse_value(_value, array_data_type)
  327. return _value
  328. class GGUFTokenizerSkeleton:
  329. def __init__(self, dict_):
  330. for k, v in dict_.items():
  331. setattr(self, k, v)
  332. if not hasattr(self, "merges"):
  333. if not hasattr(self, "tokens") or not hasattr(self, "scores"):
  334. raise ValueError(
  335. "tokens and scores need to be passed for a LLaMa tokenizer without merges to be instantiated."
  336. )
  337. tokens = self.tokens
  338. scores = self.scores
  339. vocab = {t: scores[i] for i, t in enumerate(tokens)}
  340. logger.warning("Merges were not in checkpoint, building merges on the fly.")
  341. merges = []
  342. for merge, piece_score in tqdm(vocab.items()):
  343. local = []
  344. for index in range(1, len(merge)):
  345. piece_l, piece_r = merge[:index], merge[index:]
  346. if piece_l in tokens and piece_r in tokens:
  347. local.append((piece_l, piece_r, piece_score))
  348. local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]), reverse=True)
  349. merges.extend(local)
  350. merges = sorted(merges, key=lambda val: val[2], reverse=True)
  351. merges = [(val[0], val[1]) for val in merges]
  352. self.merges = merges
  353. else:
  354. self.merges = [tuple(merge.split(" ")) for merge in self.merges]
  355. if not hasattr(self, "scores"):
  356. self.scores = [None for _ in range(len(self.tokens))]
  357. if not hasattr(self, "added_tokens"):
  358. self.added_tokens = []
  359. if not hasattr(self, "unk_token_id"):
  360. self.unk_token_id = None
  361. # Llama2 uses the field `unknown_token_id`
  362. if hasattr(self, "unknown_token_id") and self.unk_token_id is None:
  363. self.unk_token_id = self.unknown_token_id
  364. class GGUFLlamaConverter(LlamaConverter):
  365. def __init__(self, tokenizer_dict):
  366. self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
  367. self.original_tokenizer = self.proto
  368. self.additional_kwargs = {}
  369. self.is_llama_3_tokenizer = getattr(self.proto, "tokenizer_type", "llama") != "llama"
  370. def vocab(self, proto):
  371. return list(zip(proto.tokens, proto.scores))
  372. def merges(self, proto):
  373. return proto.merges
  374. def tokenizer(self, proto):
  375. vocab_scores = self.vocab(self.proto)
  376. merges = self.merges(self.proto)
  377. bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
  378. unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
  379. bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None
  380. eos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "eos_token_id", None) is not None else None
  381. tokenizer = Tokenizer(
  382. BPE(
  383. bpe_vocab,
  384. merges,
  385. unk_token=unk_token,
  386. fuse_unk=True,
  387. byte_fallback=True,
  388. )
  389. )
  390. special_tokens = []
  391. if not hasattr(self.proto, "token_type"):
  392. if unk_token is not None:
  393. special_tokens.append(AddedToken(unk_token, normalized=False, special=True))
  394. if bos_token is not None:
  395. special_tokens.append(AddedToken(bos_token, normalized=False, special=True))
  396. if eos_token is not None:
  397. special_tokens.append(AddedToken(eos_token, normalized=False, special=True))
  398. else:
  399. # 3 stands for special tokens
  400. special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0]
  401. for idx in special_tokens_idx:
  402. special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True))
  403. if len(special_tokens) != 0:
  404. tokenizer.add_special_tokens(special_tokens)
  405. if len(self.proto.added_tokens) != 0:
  406. tokenizer.add_tokens(
  407. [AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens]
  408. )
  409. self.additional_kwargs["unk_token"] = unk_token
  410. self.additional_kwargs["eos_token"] = bos_token
  411. self.additional_kwargs["bos_token"] = eos_token
  412. if self.is_llama_3_tokenizer:
  413. self.additional_kwargs["add_prefix_space"] = None
  414. self.additional_kwargs["clean_up_tokenization_spaces"] = True
  415. self.additional_kwargs["legacy"] = False
  416. self.original_tokenizer.legacy = False
  417. return tokenizer
  418. def decoder(self, replacement, add_prefix_space):
  419. sequence = [
  420. decoders.ByteFallback(),
  421. decoders.Fuse(),
  422. decoders.Replace("▁", " "),
  423. ]
  424. if self.is_llama_3_tokenizer:
  425. sequence += [decoders.ByteLevel(add_prefix_space=False, trim_offsets=False, use_regex=True)]
  426. if add_prefix_space:
  427. sequence += [decoders.Strip(content=" ", left=1)]
  428. return decoders.Sequence(sequence)
  429. def converted(self):
  430. # Copied partly from converted method in SpmConverter class
  431. tokenizer = self.tokenizer(self.proto)
  432. # Tokenizer assemble
  433. normalizer = self.normalizer(self.proto)
  434. if normalizer is not None:
  435. tokenizer.normalizer = normalizer
  436. replacement = "▁"
  437. add_prefix_space = True
  438. if hasattr(self.original_tokenizer, "add_prefix_space"):
  439. add_prefix_space = self.original_tokenizer.add_prefix_space
  440. pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
  441. if pre_tokenizer is not None:
  442. tokenizer.pre_tokenizer = pre_tokenizer
  443. tokenizer.decoder = self.decoder(replacement, add_prefix_space)
  444. post_processor = self.post_processor()
  445. if post_processor:
  446. tokenizer.post_processor = post_processor
  447. # HACK: patch the llama-3 tokenizer to use the corresponding pre-tokenizer
  448. # and normalizer
  449. if self.is_llama_3_tokenizer:
  450. tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
  451. add_prefix_space=False, trim_offsets=False, use_regex=True
  452. )
  453. # This is tricky as the additional kwargs are passed after legacy is force-set in LlamaTokenizer's
  454. # init.
  455. tokenizer.normalizer = normalizers.Sequence([])
  456. return tokenizer
  457. class GGUFQwen2Converter(Qwen2Converter):
  458. def __init__(self, tokenizer_dict):
  459. self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
  460. self.additional_kwargs = {}
  461. def converted(self) -> Tokenizer:
  462. vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
  463. merges = self.original_tokenizer.merges
  464. tokenizer = super().converted(vocab, merges)
  465. tokenizer.add_special_tokens(
  466. [
  467. AddedToken("<|endoftext|>", normalized=False, special=True),
  468. AddedToken("<|im_start|>", normalized=False, special=True),
  469. AddedToken("<|im_end|>", normalized=False, special=True),
  470. ]
  471. )
  472. return tokenizer
  473. class GGUFPhi3Converter(LlamaConverter):
  474. def __init__(self, tokenizer_dict):
  475. self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
  476. self.original_tokenizer = self.proto
  477. self.additional_kwargs = {}
  478. def vocab(self, proto):
  479. return list(zip(proto.tokens, proto.scores))
  480. def merges(self, proto):
  481. return proto.merges
  482. def tokenizer(self, proto):
  483. vocab_scores = self.vocab(self.proto)
  484. merges = self.merges(self.proto)
  485. bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
  486. tokenizer = Tokenizer(BPE(bpe_vocab, merges))
  487. # add the special tokens from phi3 tokenizer config
  488. tokenizer.add_special_tokens(
  489. [
  490. AddedToken("</s>", rstrip=True, lstrip=False, normalized=False, special=True),
  491. AddedToken("<|endoftext|>", normalized=False, special=True),
  492. AddedToken("<|assistant|>", rstrip=True, normalized=False, special=True),
  493. AddedToken("<|placeholder1|>", rstrip=True, normalized=False, special=True),
  494. AddedToken("<|placeholder2|>", rstrip=True, normalized=False, special=True),
  495. AddedToken("<|placeholder3|>", rstrip=True, normalized=False, special=True),
  496. AddedToken("<|placeholder4|>", rstrip=True, normalized=False, special=True),
  497. AddedToken("<|system|>", rstrip=True, normalized=False, special=True),
  498. AddedToken("<|end|>", rstrip=True, normalized=False, special=True),
  499. AddedToken("<|placeholder5|>", rstrip=True, normalized=False, special=True),
  500. AddedToken("<|placeholder6|>", rstrip=True, normalized=False, special=True),
  501. AddedToken("<|user|>", rstrip=True, normalized=False, special=True),
  502. ]
  503. )
  504. self.additional_kwargs["unk_token"] = (
  505. proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
  506. )
  507. self.additional_kwargs["eos_token"] = (
  508. proto.tokens[proto.eos_token_id] if proto.eos_token_id is not None else None
  509. )
  510. self.additional_kwargs["bos_token"] = (
  511. proto.tokens[proto.bos_token_id] if proto.bos_token_id is not None else None
  512. )
  513. self.additional_kwargs["pad_token"] = (
  514. proto.tokens[proto.pad_token_id] if proto.pad_token_id is not None else None
  515. )
  516. return tokenizer
  517. def decoder(self, replacement, add_prefix_space):
  518. sequence = [
  519. decoders.ByteFallback(),
  520. decoders.Fuse(),
  521. decoders.Replace(replacement, " "),
  522. ]
  523. if add_prefix_space:
  524. sequence += [decoders.Strip(content=" ", left=1)]
  525. return decoders.Sequence(sequence)
  526. def converted(self) -> Tokenizer:
  527. tokenizer = self.tokenizer(self.proto)
  528. replacement = "▁"
  529. add_prefix_space = True
  530. if hasattr(self.original_tokenizer, "add_prefix_space"):
  531. add_prefix_space = self.original_tokenizer.add_prefix_space
  532. tokenizer.decoder = self.decoder(replacement, add_prefix_space)
  533. return tokenizer
  534. class GGUFGPTConverter(GPT2Converter):
  535. def __init__(self, tokenizer_dict):
  536. self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
  537. self.additional_kwargs = {}
  538. def converted(self) -> Tokenizer:
  539. vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
  540. merges = self.original_tokenizer.merges
  541. tokenizer = super().converted(vocab, merges)
  542. return tokenizer
  543. class GGUFT5Converter(T5Converter):
  544. def __init__(self, tokenizer_dict):
  545. # set dummy data to avoid unnecessary merges calculation
  546. tokenizer_dict["merges"] = ["dummy text"]
  547. self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
  548. self.token2id = {k: v for v, k in enumerate(self.proto.tokens)}
  549. self.original_tokenizer = self.proto
  550. self.additional_kwargs = {}
  551. def vocab(self, proto):
  552. return list(zip(proto.tokens, proto.scores))
  553. def normalizer(self, proto):
  554. if getattr(self.original_tokenizer, "legacy", True):
  555. sequence = []
  556. if getattr(self.original_tokenizer, "add_prefix_space", True):
  557. sequence += [normalizers.Prepend(prepend="▁")]
  558. sequence += [normalizers.Replace(pattern=" ", content="▁")]
  559. return normalizers.Sequence(sequence)
  560. return None # non-legacy, no normalizer
  561. def post_processor(self):
  562. return processors.TemplateProcessing(
  563. single=["$A", "</s>"],
  564. pair=["$A", "</s>", "$B", "</s>"],
  565. special_tokens=[
  566. ("</s>", self.token2id["</s>"]),
  567. ],
  568. )
  569. def converted(self) -> Tokenizer:
  570. vocab_scores = self.vocab(self.proto)
  571. tokenizer = Tokenizer(
  572. Unigram(
  573. vocab_scores,
  574. unk_id=self.proto.unk_token_id,
  575. byte_fallback=False,
  576. )
  577. )
  578. # Tokenizer assemble
  579. normalizer = self.normalizer(self.proto)
  580. if normalizer is not None:
  581. tokenizer.normalizer = normalizer
  582. replacement = "▁"
  583. add_prefix_space = True
  584. if hasattr(self.original_tokenizer, "add_prefix_space"):
  585. add_prefix_space = self.original_tokenizer.add_prefix_space
  586. pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
  587. if pre_tokenizer is not None:
  588. tokenizer.pre_tokenizer = pre_tokenizer
  589. tokenizer.decoder = self.decoder(replacement, add_prefix_space)
  590. post_processor = self.post_processor()
  591. if post_processor:
  592. tokenizer.post_processor = post_processor
  593. return tokenizer
  594. class GGUFGemmaConverter(GemmaConverter):
  595. def __init__(self, tokenizer_dict):
  596. # set dummy data to avoid unnecessary merges calculation
  597. tokenizer_dict["merges"] = ["dummy text"]
  598. self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
  599. self.original_tokenizer = self.proto
  600. self.additional_kwargs = {}
  601. def vocab(self, proto):
  602. original_vocab = list(zip(proto.tokens, proto.scores))
  603. updated_vocab = []
  604. for token, score in original_vocab:
  605. if token == "<0x09>":
  606. updated_vocab.append(("\t", score))
  607. elif " " in token and len(token.strip()) == 0:
  608. underscores = "▁" * len(token)
  609. updated_vocab.append((underscores, score))
  610. else:
  611. updated_vocab.append((token, score))
  612. return updated_vocab
  613. def normalizer(self, proto):
  614. return normalizers.Replace(" ", "▁")
  615. def decoder(self, replacement, add_prefix_space):
  616. sequence = [
  617. decoders.Replace("▁", " "),
  618. decoders.ByteFallback(),
  619. decoders.Fuse(),
  620. ]
  621. if add_prefix_space:
  622. sequence += [decoders.Strip(content=" ", left=1)]
  623. return decoders.Sequence(sequence)
  624. def converted(self) -> Tokenizer:
  625. vocab_scores = self.vocab(self.proto)
  626. tokenizer = Tokenizer(
  627. Unigram(
  628. vocab_scores,
  629. unk_id=self.proto.unk_token_id,
  630. byte_fallback=self.handle_byte_fallback,
  631. )
  632. )
  633. normalizer = self.normalizer(self.proto)
  634. if normalizer is not None:
  635. tokenizer.normalizer = normalizer
  636. replacement = "▁"
  637. add_prefix_space = True
  638. if hasattr(self.original_tokenizer, "add_prefix_space"):
  639. add_prefix_space = self.original_tokenizer.add_prefix_space
  640. tokenizer.decoder = self.decoder(replacement, add_prefix_space)
  641. pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
  642. if pre_tokenizer is not None:
  643. tokenizer.pre_tokenizer = pre_tokenizer
  644. return tokenizer
  645. GGUF_TO_FAST_CONVERTERS = {
  646. "llama": GGUFLlamaConverter,
  647. "qwen2": GGUFQwen2Converter,
  648. "qwen2_moe": GGUFQwen2Converter,
  649. "qwen3": GGUFQwen2Converter,
  650. "qwen3_moe": GGUFQwen2Converter,
  651. "phi3": GGUFPhi3Converter,
  652. "bloom": GGUFGPTConverter,
  653. "falcon": GGUFGPTConverter,
  654. "stablelm": GGUFGPTConverter,
  655. "gpt2": GGUFGPTConverter,
  656. "starcoder2": GGUFGPTConverter,
  657. "t5": GGUFT5Converter,
  658. "mamba": GGUFGPTConverter,
  659. "nemotron": GGUFGPTConverter,
  660. "gemma2": GGUFGemmaConverter,
  661. "gemma3_text": GGUFGemmaConverter,
  662. "umt5": GGUFT5Converter,
  663. "deci": GGUFLlamaConverter,
  664. "decilm": GGUFLlamaConverter,
  665. }
  666. def convert_gguf_tokenizer(architecture: str, tokenizer_dict) -> tuple[Tokenizer, dict]:
  667. """
  668. Utilities to convert a slow tokenizer instance in a fast tokenizer instance.
  669. Args:
  670. architecture (`str`): The model architecture derived from gguf file.
  671. transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
  672. Instance of a slow tokenizer to convert in the backend tokenizer for
  673. [`~tokenization_utils_base.PreTrainedTokenizerFast`].
  674. Return:
  675. A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
  676. [`~tokenization_utils_base.PreTrainedTokenizerFast`]
  677. """
  678. tokenizer_class_name = architecture
  679. converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict)
  680. fast_tokenizer = converter.converted()
  681. return fast_tokenizer, converter.additional_kwargs