tokenization_mistral_common.py 89 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883
  1. # Copyright 2025 Mistral AI and The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import shutil
  16. import warnings
  17. from collections.abc import Mapping, Sized
  18. from enum import Enum
  19. from pathlib import Path
  20. from typing import Any, Callable, Optional, Union, overload
  21. import numpy as np
  22. from transformers.audio_utils import load_audio_as
  23. from transformers.tokenization_utils_base import (
  24. LARGE_INTEGER,
  25. VERY_LARGE_INTEGER,
  26. BatchEncoding,
  27. EncodedInput,
  28. PreTokenizedInput,
  29. PreTrainedTokenizerBase,
  30. TextInput,
  31. TruncationStrategy,
  32. )
  33. from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj
  34. from transformers.utils.generic import is_torch_tensor
  35. from transformers.utils.hub import PushToHubMixin
  36. from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires
  37. if is_mistral_common_available():
  38. from mistral_common.protocol.instruct.request import ChatCompletionRequest
  39. from mistral_common.protocol.instruct.validator import ValidationMode
  40. from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, TokenizerVersion
  41. from mistral_common.tokens.tokenizers.image import MultiModalVersion
  42. from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
  43. from mistral_common.tokens.tokenizers.tekken import Tekkenizer
  44. from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub
  45. if is_torch_available():
  46. import torch
  47. logger = logging.get_logger(__name__)
  48. ENCODE_KWARGS_DOCSTRING = r"""
  49. add_special_tokens (`bool`, *optional*, defaults to `True`):
  50. Whether or not to add special tokens when encoding the sequences. This will use the underlying
  51. `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
  52. automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens
  53. automatically.
  54. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
  55. Activates and controls padding. Accepts the following values:
  56. - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  57. sequence is provided).
  58. - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  59. acceptable input length for the model if that argument is not provided.
  60. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
  61. lengths).
  62. truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
  63. Activates and controls truncation. Accepts the following values:
  64. - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
  65. to the maximum acceptable input length for the model if that argument is not provided.
  66. - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
  67. greater than the model maximum admissible input size).
  68. max_length (`int`, *optional*):
  69. Controls the maximum length to use by one of the truncation/padding parameters.
  70. If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
  71. is required by one of the truncation/padding parameters. If the model has no specific maximum input
  72. length (like XLNet) truncation/padding to a maximum length will be deactivated.
  73. stride (`int`, *optional*, defaults to 0):
  74. If set to a number along with `max_length`, the overflowing tokens returned when
  75. `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
  76. returned to provide some overlap between truncated and overflowing sequences. The value of this
  77. argument defines the number of overlapping tokens.
  78. pad_to_multiple_of (`int`, *optional*):
  79. If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
  80. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
  81. `>= 7.5` (Volta).
  82. padding_side (`str`, *optional*):
  83. The side on which the model should have padding applied. Should be selected between ['right', 'left'].
  84. Default value is picked from the class attribute of the same name.
  85. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  86. If set, will return tensors instead of list of python integers. Acceptable values are:
  87. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  88. """
  89. ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
  90. return_attention_mask (`bool`, *optional*):
  91. Whether to return the attention mask. If left to the default, will return the attention mask according
  92. to the specific tokenizer's default, defined by the `return_outputs` attribute.
  93. [What are attention masks?](../glossary#attention-mask)
  94. return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
  95. Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
  96. of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
  97. of returning overflowing tokens.
  98. return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
  99. Whether or not to return special tokens mask information.
  100. return_offsets_mapping (`bool`, *optional*, defaults to `False`):
  101. Whether or not to return `(char_start, char_end)` for each token.
  102. This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
  103. Python's tokenizer, this method will raise `NotImplementedError`.
  104. return_length (`bool`, *optional*, defaults to `False`):
  105. Whether or not to return the lengths of the encoded inputs.
  106. verbose (`bool`, *optional*, defaults to `True`):
  107. Whether or not to print more information and warnings.
  108. **kwargs: passed to the `self.tokenize()` method
  109. Return:
  110. [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
  111. - **input_ids** -- List of token ids to be fed to a model.
  112. [What are input IDs?](../glossary#input-ids)
  113. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
  114. `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
  115. [What are attention masks?](../glossary#attention-mask)
  116. - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
  117. `return_overflowing_tokens=True`).
  118. - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
  119. `return_overflowing_tokens=True`).
  120. - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
  121. regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
  122. - **length** -- The length of the inputs (when `return_length=True`)
  123. """
  class MistralTokenizerType(str, Enum):
      """Kinds of underlying tokenizer a `MistralCommonTokenizer` can wrap."""

      # Used when the wrapped tokenizer is not a `Tekkenizer`.
      spm = "spm"
      # Used when the wrapped tokenizer is a `Tekkenizer`.
      tekken = "tekken"
  128. @requires(backends=("mistral-common",))
  129. class MistralCommonTokenizer(PushToHubMixin):
  130. """
  131. Class to wrap `mistral-common` tokenizers.
  132. `mistral-common` is the official tokenizer library for Mistral AI models. To use it, you need to install it with:
  133. ```bash
  134. pip install transformers[mistral-common]
  135. ```
  136. Otherwise the tokenizer falls back to the Transformers implementation of the tokenizer.
  137. For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common).
  138. This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`.
  139. It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer.
  140. Supports the following methods from the `PreTrainedTokenizerBase` class:
  141. - [`~MistralCommonTokenizer.get_vocab`]: Returns the vocabulary as a dictionary of token to index.
  142. - [`~MistralCommonTokenizer.encode`]: Encode a string to a list of integers.
  143. - [`~MistralCommonTokenizer.decode`]: Decode a list of integers to a string.
  144. - [`~MistralCommonTokenizer.batch_decode`]: Decode a batch of list of integers to a list of strings.
  145. - [`~MistralCommonTokenizer.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
  146. - [`~MistralCommonTokenizer.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
  147. - [`~MistralCommonTokenizer.tokenize`]: Tokenize a string.
  148. - [`~MistralCommonTokenizer.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
  149. - [`~MistralCommonTokenizer.prepare_for_model`]: Prepare a list of inputs for the model.
  150. - [`~MistralCommonTokenizer.pad`]: Pad a list of inputs to the same length.
  151. - [`~MistralCommonTokenizer.truncate_sequences`]: Truncate a list of sequences to the same length.
  152. - [`~MistralCommonTokenizer.apply_chat_template`]: Apply a chat template to a list of messages.
  153. - [`~MistralCommonTokenizer.__call__`]: Tokenize a string or a list of strings.
  154. - [`~MistralCommonTokenizer.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory.
  155. - [`~MistralCommonTokenizer.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method.
  156. - [`~MistralCommonTokenizer.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.
  157. Here are the key differences with the `PreTrainedTokenizerBase` class:
  158. - Pairs of sequences are not supported. The signatures have been kept for compatibility, but all arguments related to pairs of sequences are ignored. The return values for pairs are returned as `None`.
  159. - The `is_split_into_words` argument is not supported.
  160. - The `return_token_type_ids` argument is not supported.
  161. - It is not possible to add new tokens to the tokenizer. Also the special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
  162. If you have suggestions to improve this class, please open an issue on the [mistral-common GitHub repository](https://github.com/mistralai/mistral-common/issues) if it is related to the tokenizer or on the [Transformers GitHub repository](https://github.com/huggingface/transformers/issues) if it is related to the Hugging Face interface.
  163. """
  164. model_input_names: list[str] = ["input_ids", "attention_mask"]
  165. padding_side: str = "left"
  166. truncation_side: str = "right"
  def __init__(
      self,
      tokenizer_path: Union[str, os.PathLike, Path],
      mode: ValidationMode = ValidationMode.test,
      model_max_length: int = VERY_LARGE_INTEGER,
      padding_side: str = "left",
      truncation_side: str = "right",
      model_input_names: Optional[list[str]] = None,
      clean_up_tokenization_spaces: bool = False,
      **kwargs,
  ):
      """
      Constructs a `MistralCommonTokenizer` around the `MistralTokenizer` loaded from `tokenizer_path`.

      Class attributes that can be overridden per instance through the arguments below:

      - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
      - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
        Should be `'right'` or `'left'`.
      - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
        applied. Should be `'right'` or `'left'`.

      Args:
          tokenizer_path (`str` or `os.PathLike` or `Path`):
              Path to the tokenizer file to load the `MistralTokenizer`.
          mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
              The mode to use for the tokenizer. This will be passed to the `MistralTokenizer` constructor.
          model_max_length (`int`, *optional*):
              The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
              loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this will be set to the
              value stored for the associated model in `max_model_input_sizes`. If no value is provided, will
              default to VERY_LARGE_INTEGER (`int(1e30)`).
          padding_side (`str`, *optional*, defaults to `"left"`):
              The side on which the model should have padding applied. Should be selected between ['right', 'left'].
          truncation_side (`str`, *optional*, defaults to `"right"`):
              The side on which the model should have truncation applied. Should be selected between
              ['right', 'left'].
          model_input_names (`List[string]`, *optional*):
              The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
              `"attention_mask"`). When `None`, the class attribute of the same name is left in effect.
          clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
              Whether or not the model should cleanup the spaces that were added when splitting the input text during
              the tokenization process. Stored on the instance as `cleanup_tokenization_spaces`.

      Raises:
          ValueError: If any extra keyword argument is passed, or if `model_input_names` is given but is not a
              non-empty list or tuple of `str`.
      """
      if kwargs:
          raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonTokenizer`.")

      self._tokenizer_path = Path(tokenizer_path)
      self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=mode)
      self._tokenizer_type = (
          MistralTokenizerType.tekken
          if isinstance(self.tokenizer.instruct_tokenizer.tokenizer, Tekkenizer)
          else MistralTokenizerType.spm
      )
      self.truncation_side = truncation_side
      self.padding_side = padding_side
      self.model_max_length = model_max_length
      self.cleanup_tokenization_spaces = clean_up_tokenization_spaces
      # Use to store when we have already noticed a deprecation warning (avoid overlogging).
      self.deprecation_warnings = {}

      if model_input_names is not None:
          # Any single failing check makes the value invalid, so the checks are OR-ed. The type check comes
          # first so that `len()` and iteration are only attempted on a list or tuple.
          if (
              not isinstance(model_input_names, (list, tuple))
              or len(model_input_names) == 0
              or not all(isinstance(name, str) for name in model_input_names)
          ):
              raise ValueError(
                  "`model_input_names` should be a non-empty list or tuple of str but got "
                  f"{model_input_names!r}."
              )
          self.model_input_names = model_input_names

      # Filled lazily by `get_vocab`.
      self._cache_get_vocab: Optional[dict[str, int]] = None
  @property
  def bos_token_id(self) -> int:
      """Return the vocabulary id of the beginning-of-sentence token."""
      base_tokenizer = self.tokenizer.instruct_tokenizer.tokenizer
      return base_tokenizer.bos_id
  @property
  def eos_token_id(self) -> int:
      """Return the vocabulary id of the end-of-sentence token."""
      base_tokenizer = self.tokenizer.instruct_tokenizer.tokenizer
      return base_tokenizer.eos_id
  @property
  def unk_token_id(self) -> int:
      """Return the vocabulary id of the unknown token."""
      base_tokenizer = self.tokenizer.instruct_tokenizer.tokenizer
      return base_tokenizer.unk_id
  @property
  def pad_token_id(self) -> int:
      """Return the vocabulary id of the padding token."""
      base_tokenizer = self.tokenizer.instruct_tokenizer.tokenizer
      return base_tokenizer.pad_id
  @property
  def bos_token(self) -> str:
      """Return the string form of the beginning-of-sentence token."""
      token_id = self.bos_token_id
      return self.convert_ids_to_tokens(token_id)
  @property
  def eos_token(self) -> str:
      """Return the string form of the end-of-sentence token."""
      token_id = self.eos_token_id
      return self.convert_ids_to_tokens(token_id)
  @property
  def unk_token(self) -> str:
      """Return the string form of the unknown token."""
      token_id = self.unk_token_id
      return self.convert_ids_to_tokens(token_id)
  @property
  def pad_token(self) -> str:
      """Return the string form of the padding token."""
      token_id = self.pad_token_id
      return self.convert_ids_to_tokens(token_id)
  @property
  def vocab_size(self) -> int:
      """
      Size of the vocabulary, as reported by the underlying tokenizer's `n_words`.

      Returns:
          `int`: Size of the vocabulary.
      """
      base_tokenizer = self.tokenizer.instruct_tokenizer.tokenizer
      return base_tokenizer.n_words
  def get_vocab(self) -> dict[str, int]:
      """
      Build (once) and return the token-to-index mapping of the vocabulary.

      The mapping is lossy: several token ids may decode to the same string because partial UTF-8 byte
      sequences are converted to �, in which case the highest id is the one kept for that string.
      The result is cached on the instance and the same dictionary is returned on later calls.

      Returns:
          `Dict[str, int]`: The vocabulary.
      """
      mapping = self._cache_get_vocab
      if mapping is None:
          mapping = {}
          # Later ids overwrite earlier ones when two ids share the same string.
          for index, piece in enumerate(self.tokenizer.instruct_tokenizer.tokenizer.vocab()):
              mapping[piece] = index
          self._cache_get_vocab = mapping
      return mapping
  301. def __len__(self):
  302. """
  303. Size of the full vocabulary with the added tokens.
  304. """
  305. return self.vocab_size
  306. @add_end_docstrings(
  307. ENCODE_KWARGS_DOCSTRING,
  308. """
  309. **kwargs: Not supported by `MistralCommonTokenizer.encode`.
  310. Will raise an error if used.
  311. """,
  312. """
  313. Returns:
  314. `List[int]`, `torch.Tensor`: The tokenized ids of the text.
  315. """,
  316. )
  def encode(
      self,
      text: Union[TextInput, EncodedInput],
      text_pair: None = None,
      add_special_tokens: bool = True,
      padding: Union[bool, str, PaddingStrategy] = False,
      truncation: Union[bool, str, TruncationStrategy, None] = None,
      max_length: Optional[int] = None,
      stride: int = 0,
      pad_to_multiple_of: Optional[int] = None,
      padding_side: Optional[str] = None,
      return_tensors: Optional[Union[str, TensorType]] = None,
      verbose: bool = True,
      **kwargs,
  ) -> list[int]:
      """
      Encode `text` into a sequence of token ids using the tokenizer and vocabulary.

      Args:
          text (`str` or `List[int]`):
              The sequence to be encoded. This can be a string or a list of integers (tokenized string ids).
          text_pair (`None`, *optional*):
              Not supported by `MistralCommonTokenizer.encode`. Kept to match `PreTrainedTokenizerBase.encode`
              signature.
      """
      if kwargs:
          raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.encode`.")
      if text_pair:
          raise ValueError("`MistralCommonTokenizer.encode` does not support `text_pair`.")

      # Resolve the user-facing padding/truncation arguments; the fourth returned value is not needed here.
      resolved_padding, resolved_truncation, resolved_max_length, _ = self._get_padding_truncation_strategies(
          padding=padding,
          truncation=truncation,
          max_length=max_length,
          pad_to_multiple_of=pad_to_multiple_of,
          verbose=verbose,
      )

      # Only the ids are returned, so every optional output of `_encode_plus` is switched off.
      encode_plus_options = {
          "add_special_tokens": add_special_tokens,
          "padding_strategy": resolved_padding,
          "truncation_strategy": resolved_truncation,
          "max_length": resolved_max_length,
          "stride": stride,
          "pad_to_multiple_of": pad_to_multiple_of,
          "padding_side": padding_side,
          "return_tensors": return_tensors,
          "return_attention_mask": False,
          "return_overflowing_tokens": False,
          "return_special_tokens_mask": False,
          "return_length": False,
          "verbose": verbose,
      }
      encoding = self._encode_plus(text, **encode_plus_options)
      return encoding["input_ids"]
  def decode(
      self,
      token_ids: Union[int, list[int], np.ndarray, "torch.Tensor"],
      skip_special_tokens: bool = False,
      clean_up_tokenization_spaces: Optional[bool] = None,
      **kwargs,
  ) -> str:
      """
      Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
      tokens and clean up tokenization spaces.

      Args:
          token_ids (`Union[int, List[int], np.ndarray, torch.Tensor]`):
              List of tokenized input ids. Can be obtained using the `__call__` method.
          skip_special_tokens (`bool`, *optional*, defaults to `False`):
              Whether or not to remove special tokens in the decoding.
          clean_up_tokenization_spaces (`bool`, *optional*):
              Whether or not to clean up the tokenization spaces. If `None`, will default to
              `self.cleanup_tokenization_spaces`. An explicit `False` is honoured.
          kwargs (additional keyword arguments, *optional*):
              Not supported by `MistralCommonTokenizer.decode`.
              Will raise an error if used.

      Returns:
          `str`: The decoded sentence.

      Raises:
          ValueError: If any extra keyword argument is passed.
      """
      if kwargs:
          raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.decode`.")

      # Fall back to the instance default only when the caller made no choice. Using `or` here would
      # replace an explicit `False` with the instance default.
      if clean_up_tokenization_spaces is None:
          clean_up_tokenization_spaces = self.cleanup_tokenization_spaces

      # Convert inputs to python lists
      token_ids = to_py_obj(token_ids)
      # The signature also accepts a single id; hand the decoder a list in every case.
      if isinstance(token_ids, int):
          token_ids = [token_ids]

      special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
      decoded_string = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
      if clean_up_tokenization_spaces:
          decoded_string = PreTrainedTokenizerBase.clean_up_tokenization(decoded_string)
      return decoded_string
  def batch_decode(
      self,
      sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
      skip_special_tokens: bool = False,
      clean_up_tokenization_spaces: Optional[bool] = None,
      **kwargs,
  ) -> list[str]:
      """
      Decode every element of `sequences` with `decode` and collect the resulting strings.

      Args:
          sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor]`):
              List of tokenized input ids. Can be obtained using the `__call__` method.
          skip_special_tokens (`bool`, *optional*, defaults to `False`):
              Whether or not to remove special tokens in the decoding.
          clean_up_tokenization_spaces (`bool`, *optional*):
              Whether or not to clean up the tokenization spaces. Forwarded unchanged to `decode`.
          kwargs (additional keyword arguments, *optional*):
              Not supported by `MistralCommonTokenizer.batch_decode`. They are forwarded to `decode`.

      Returns:
          `List[str]`: The list of decoded sentences.
      """
      decoded_sentences: list[str] = []
      for token_ids in sequences:
          sentence = self.decode(
              token_ids,
              skip_special_tokens=skip_special_tokens,
              clean_up_tokenization_spaces=clean_up_tokenization_spaces,
              **kwargs,
          )
          decoded_sentences.append(sentence)
      return decoded_sentences
  def _is_control_token(self, token_id: int) -> bool:
      """
      Tell whether `token_id` is a control token, using the rule that matches the wrapped tokenizer type.

      Raises:
          ValueError: If the tokenizer type is neither `spm` nor `tekken`.
      """
      tokenizer_type = self._tokenizer_type
      if tokenizer_type == MistralTokenizerType.spm:
          # spm: membership in the tokenizer's own collection of control tokens.
          control_tokens = self.tokenizer.instruct_tokenizer.tokenizer._control_tokens()
          return token_id in control_tokens
      if tokenizer_type == MistralTokenizerType.tekken:
          # tekken: ids below `num_special_tokens` are treated as control tokens.
          special_count = self.tokenizer.instruct_tokenizer.tokenizer.num_special_tokens
          return token_id < special_count
      raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
  @overload
  def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...

  @overload
  def convert_ids_to_tokens(self, ids: list[int], skip_special_tokens: bool = False) -> list[str]: ...

  def convert_ids_to_tokens(
      self, ids: Union[int, list[int]], skip_special_tokens: bool = False
  ) -> Union[str, list[str]]:
      """
      Map one token id, or a sequence of token ids, to the corresponding token string(s).

      Args:
          ids (`int` or `List[int]`):
              The token id (or token ids) to convert to tokens.
          skip_special_tokens (`bool`, *optional*, defaults to `False`):
              Whether or not to drop control tokens from the result.

      Returns:
          `str` or `List[str]`: The decoded token(s).

      Raises:
          ValueError: If a single id is given and it is dropped because it is a control token while
              `skip_special_tokens` is set.
      """
      single = isinstance(ids, int)
      id_list = [ids] if single else ids

      # `_is_control_token` is consulted for every id, even when nothing is skipped.
      pieces: list[str] = [
          self.tokenizer.instruct_tokenizer.tokenizer.id_to_piece(token_id)
          for token_id in id_list
          if not self._is_control_token(token_id) or not skip_special_tokens
      ]

      if not single:
          return pieces
      if not pieces:
          raise ValueError(f"Invalid token id {id_list}.")
      return pieces[0]
  def _piece_to_id(self, piece: str) -> int:
      """
      Look up the id of a single token piece in the underlying tokenizer model.

      For the `spm` type this delegates to the model's `piece_to_id`. For the `tekken` type the piece is
      encoded with all special tokens allowed and the encoding is required to contain exactly one id.

      Args:
          piece (`str`):
              The token piece to convert.

      Returns:
          `int`: The id of the piece.

      Raises:
          `AssertionError`: For `tekken`, if the piece does not encode to exactly one id.
          `ValueError`: If the tokenizer type is neither `spm` nor `tekken`.
      """
      tokenizer_type = self._tokenizer_type

      if tokenizer_type == MistralTokenizerType.spm:
          return self.tokenizer.instruct_tokenizer.tokenizer._model.piece_to_id(piece)

      if tokenizer_type == MistralTokenizerType.tekken:
          model = self.tokenizer.instruct_tokenizer.tokenizer._model
          encoded = model.encode(piece, allowed_special="all", disallowed_special=set())
          assert len(encoded) == 1, f"Expected to decode 1 token, got {len(encoded)}"
          return encoded[0]

      raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
  def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]:
      """
      Map one token string, or a sequence of token strings, to the matching id(s) via `_piece_to_id`.

      Args:
          tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

      Returns:
          `int` or `List[int]`: A single id when `tokens` is a `str`, otherwise the list of ids.
      """
      single_token = isinstance(tokens, str)
      token_list = [tokens] if single_token else tokens

      token_ids: list[int] = [self._piece_to_id(token) for token in token_list]

      return token_ids[0] if single_token else token_ids
  def _text_to_ids(self, text: TextInput, add_special_tokens: bool) -> list[int]:
      """
      Encode `text` into token ids with the underlying tokenizer.

      `add_special_tokens` is forwarded as both the `bos` and the `eos` flag of the underlying `encode` call.
      """
      base_tokenizer = self.tokenizer.instruct_tokenizer.tokenizer
      return base_tokenizer.encode(text, bos=add_special_tokens, eos=add_special_tokens)
  def tokenize(self, text: TextInput, **kwargs) -> list[str]:
      """
      Split a string into its token pieces.

      The text is first encoded to ids without special tokens, then every id is mapped back to its piece.

      Args:
          text (`str`):
              The sequence to be encoded.
          **kwargs (additional keyword arguments):
              Not supported by `MistralCommonTokenizer.tokenize`.
              Will raise an error if used.

      Returns:
          `List[str]`: The list of tokens.

      Raises:
          `ValueError`: If any keyword argument is passed.
      """
      if kwargs:
          raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.tokenize`.")

      token_ids = self._text_to_ids(text, add_special_tokens=False)
      return self.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
  def _encode_plus(
      self,
      text: Union[TextInput, EncodedInput],
      add_special_tokens: bool = True,
      padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
      truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
      max_length: Optional[int] = None,
      stride: int = 0,
      pad_to_multiple_of: Optional[int] = None,
      padding_side: Optional[str] = None,
      return_tensors: Optional[Union[str, TensorType]] = None,
      return_attention_mask: Optional[bool] = None,
      return_overflowing_tokens: bool = False,
      return_special_tokens_mask: bool = False,
      return_length: bool = False,
      verbose: bool = True,
      **kwargs,
  ) -> BatchEncoding:
      """
      Encode a single input and hand the ids to `prepare_for_model`.

      `text` is either a string, which is encoded with `_text_to_ids`, or a non-empty list/tuple whose first
      element is an `int`, which is used as-is. The result is built with a prepended batch axis.

      Raises:
          `ValueError`: If extra keyword arguments are passed or `text` has neither of the accepted forms.
      """
      if kwargs:
          raise ValueError(
              f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer._encode_plus`."
          )

      # Resolve the input to a sequence of ids.
      if isinstance(text, str):
          ids = self._text_to_ids(text, add_special_tokens)
      elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
          ids = text
      else:
          raise ValueError(f"Input {text} is not valid. Should be a string, or a list/tuple of integers.")

      # `prepare_for_model` takes the strategies by value.
      return self.prepare_for_model(
          ids,
          add_special_tokens=add_special_tokens,
          padding=padding_strategy.value,
          truncation=truncation_strategy.value,
          max_length=max_length,
          stride=stride,
          pad_to_multiple_of=pad_to_multiple_of,
          padding_side=padding_side,
          return_tensors=return_tensors,
          prepend_batch_axis=True,
          return_attention_mask=return_attention_mask,
          return_overflowing_tokens=return_overflowing_tokens,
          return_special_tokens_mask=return_special_tokens_mask,
          return_length=return_length,
          verbose=verbose,
      )
  def _batch_encode_plus(
      self,
      batch_text: Union[
          list[TextInput],
          list[EncodedInput],
      ],
      add_special_tokens: bool = True,
      padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
      truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
      max_length: Optional[int] = None,
      stride: int = 0,
      pad_to_multiple_of: Optional[int] = None,
      padding_side: Optional[str] = None,
      return_tensors: Optional[Union[str, TensorType]] = None,
      return_attention_mask: Optional[bool] = None,
      return_overflowing_tokens: bool = False,
      return_special_tokens_mask: bool = False,
      return_offsets_mapping: bool = False,
      return_length: bool = False,
      verbose: bool = True,
      **kwargs,
  ) -> BatchEncoding:
      """
      Encode a batch of inputs and hand the ids to `_batch_prepare_for_model`.

      Every entry of `batch_text` is either a string, which is encoded with `_text_to_ids`, or a non-empty
      list/tuple whose first element is an `int`, which is used as-is. Extra keyword arguments are accepted
      but not used by this method.

      Raises:
          `NotImplementedError`: If `return_offsets_mapping` is set.
          `ValueError`: If an entry has neither of the accepted forms.
      """
      # Reject the unsupported option before any encoding work is done.
      if return_offsets_mapping:
          raise NotImplementedError(
              "return_offset_mapping is not available when using Python tokenizers. "
              "To use this feature, change your tokenizer to one deriving from "
              "transformers.PreTrainedTokenizerFast."
          )

      def _as_ids(entry):
          if isinstance(entry, str):
              return self._text_to_ids(entry, add_special_tokens)
          if isinstance(entry, (list, tuple)) and len(entry) > 0 and isinstance(entry[0], int):
              return entry
          raise ValueError("Input is not valid. Should be a string or a list/tuple of integers.")

      encoded_batch = [_as_ids(entry) for entry in batch_text]

      prepared = self._batch_prepare_for_model(
          encoded_batch,
          add_special_tokens=add_special_tokens,
          padding_strategy=padding_strategy,
          truncation_strategy=truncation_strategy,
          max_length=max_length,
          stride=stride,
          pad_to_multiple_of=pad_to_multiple_of,
          padding_side=padding_side,
          return_attention_mask=return_attention_mask,
          return_overflowing_tokens=return_overflowing_tokens,
          return_special_tokens_mask=return_special_tokens_mask,
          return_length=return_length,
          return_tensors=return_tensors,
          verbose=verbose,
      )

      return BatchEncoding(prepared)
  def _all_special_ids(self) -> set[int]:
      """
      Collect the ids of the special tokens of the underlying tokenizer.

      For the `tekken` type the ids are the `"rank"` entries of the tokenizer's `_all_special_tokens`; for
      the `spm` type they are whatever the tokenizer's `_control_tokens()` returns.

      Raises:
          `ValueError`: If the tokenizer type is neither `tekken` nor `spm`.
      """
      tokenizer_type = self._tokenizer_type

      if tokenizer_type == MistralTokenizerType.tekken:
          special_tokens = self.tokenizer.instruct_tokenizer.tokenizer._all_special_tokens
          return {token["rank"] for token in special_tokens}

      if tokenizer_type == MistralTokenizerType.spm:
          return self.tokenizer.instruct_tokenizer.tokenizer._control_tokens()

      raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
  def get_special_tokens_mask(
      self, token_ids_0: list, token_ids_1: None = None, already_has_special_tokens: bool = False
  ) -> list[int]:
      """
      Build a mask flagging which ids of `token_ids_0` are special tokens.

      Args:
          token_ids_0 (`List[int]`):
              List of ids of the sequence.
          token_ids_1 (`List[int]`, *optional*):
              Not supported by `MistralCommonTokenizer`. Kept to match the interface of `PreTrainedTokenizerBase`.
          already_has_special_tokens (`bool`, *optional*, defaults to `False`):
              Not supported by `MistralCommonTokenizer`, must stay `False`.

      Returns:
          A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

      Raises:
          `ValueError`: If `token_ids_1` is not `None` or `already_has_special_tokens` is truthy.
      """
      if token_ids_1 is not None:
          raise ValueError(
              "`token_ids_1` is not supported by `MistralCommonTokenizer` and should be `None`, kept for compatibility."
          )
      if already_has_special_tokens:
          raise ValueError(
              "`already_has_special_tokens` is not supported by `MistralCommonTokenizer` and should be `False`."
          )

      # Fetch the special ids once so the membership test below does not recompute them per token.
      special_ids = self._all_special_ids()
      return [int(token_id in special_ids) for token_id in token_ids_0]
  def _batch_prepare_for_model(
      self,
      batch_ids: list[Union[PreTokenizedInput, list[int]]],
      add_special_tokens: bool = True,
      padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
      truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
      max_length: Optional[int] = None,
      stride: int = 0,
      pad_to_multiple_of: Optional[int] = None,
      padding_side: Optional[str] = None,
      return_tensors: Optional[str] = None,
      return_attention_mask: Optional[bool] = None,
      return_overflowing_tokens: bool = False,
      return_special_tokens_mask: bool = False,
      return_length: bool = False,
      verbose: bool = True,
  ) -> BatchEncoding:
      """
      Prepare every sequence of a batch with `prepare_for_model`, then pad the batch as a whole.

      Each sequence is prepared without padding, attention mask or tensor conversion. The per-key results are
      gathered into lists, padded together with `pad`, and only then wrapped into a `BatchEncoding` using
      `return_tensors`.

      Args:
          batch_ids: list of tokenized input ids
      """
      gathered = {}
      for ids in batch_ids:
          prepared = self.prepare_for_model(
              ids,
              add_special_tokens=add_special_tokens,
              padding=PaddingStrategy.DO_NOT_PAD.value,  # padding is applied on the whole batch below
              truncation=truncation_strategy.value,
              max_length=max_length,
              stride=stride,
              pad_to_multiple_of=None,  # padding is applied on the whole batch below
              padding_side=None,  # padding is applied on the whole batch below
              return_attention_mask=False,  # the mask is created when the batch is padded
              return_overflowing_tokens=return_overflowing_tokens,
              return_special_tokens_mask=return_special_tokens_mask,
              return_length=return_length,
              return_tensors=None,  # tensor conversion happens once, on the whole batch
              prepend_batch_axis=False,
              verbose=verbose,
          )

          for name, value in prepared.items():
              gathered.setdefault(name, []).append(value)

      padded = self.pad(
          gathered,
          padding=padding_strategy.value,
          max_length=max_length,
          pad_to_multiple_of=pad_to_multiple_of,
          padding_side=padding_side,
          return_attention_mask=return_attention_mask,
      )

      return BatchEncoding(padded, tensor_type=return_tensors)
  @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
  def prepare_for_model(
      self,
      ids: list[int],
      pair_ids: None = None,
      add_special_tokens: bool = True,
      padding: Union[bool, str, PaddingStrategy] = False,
      truncation: Union[bool, str, TruncationStrategy, None] = None,
      max_length: Optional[int] = None,
      stride: int = 0,
      pad_to_multiple_of: Optional[int] = None,
      padding_side: Optional[str] = None,
      return_tensors: Optional[Union[str, TensorType]] = None,
      return_attention_mask: Optional[bool] = None,
      return_overflowing_tokens: bool = False,
      return_special_tokens_mask: bool = False,
      return_length: bool = False,
      verbose: bool = True,
      prepend_batch_axis: bool = False,
      **kwargs,
  ) -> BatchEncoding:
      """
      Prepares a sequence of input ids so that it can be used by the model. It truncates the sequence when it
      overflows `max_length` (optionally exposing the overflowing tokens, with a user defined stride), builds the
      requested masks and pads the result.

      Args:
          ids (`List[int]`):
              Tokenized input ids of the first sequence.
          pair_ids (`None`, *optional*):
              Not supported by `MistralCommonTokenizer`. Kept to match the interface of `PreTrainedTokenizerBase`.
      """
      if pair_ids is not None:
          raise ValueError(
              "`pair_ids` is not supported by `MistralCommonTokenizer` and should be `None`, kept for compatibility."
          )
      if kwargs:
          raise ValueError(
              f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.prepare_for_model`."
          )

      padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
          padding=padding,
          truncation=truncation,
          max_length=max_length,
          pad_to_multiple_of=pad_to_multiple_of,
          verbose=verbose,
      )

      len_ids = len(ids)
      main_input_name = self.model_input_names[0]

      # Load from model defaults
      if return_attention_mask is None:
          return_attention_mask = "attention_mask" in self.model_input_names

      encoded_inputs = {}

      # Truncation: Handle max sequence length
      overflowing_tokens = []
      if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and len_ids > max_length:
          ids, _, overflowing_tokens = self.truncate_sequences(
              ids,
              num_tokens_to_remove=len_ids - max_length,
              truncation_strategy=truncation_strategy,
              stride=stride,
          )

      if return_overflowing_tokens:
          encoded_inputs["overflowing_tokens"] = overflowing_tokens
          # `max_length` is still `None` when the caller gave none and none was resolved from the model;
          # nothing was truncated in that case, so report 0 instead of failing on `int - None`.
          encoded_inputs["num_truncated_tokens"] = len_ids - max_length if max_length is not None else 0

      # Build output dictionary
      encoded_inputs[main_input_name] = ids
      if return_special_tokens_mask:
          if add_special_tokens:
              encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, None)
          else:
              encoded_inputs["special_tokens_mask"] = [0] * len(ids)

      # Padding (`pad` is also the step that creates the attention mask, hence the second condition)
      if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
          encoded_inputs = self.pad(
              encoded_inputs,
              max_length=max_length,
              padding=padding_strategy.value,
              pad_to_multiple_of=pad_to_multiple_of,
              padding_side=padding_side,
              return_attention_mask=return_attention_mask,
          )

      if return_length:
          # Read the ids under the same key they were stored with above.
          encoded_inputs["length"] = len(encoded_inputs[main_input_name])

      batch_outputs = BatchEncoding(
          encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
      )

      return batch_outputs
  def _get_padding_truncation_strategies(
      self,
      padding: Union[str, PaddingStrategy, bool] = False,
      truncation: Optional[Union[str, TruncationStrategy, bool]] = None,
      max_length: Optional[int] = None,
      pad_to_multiple_of: Optional[int] = None,
      verbose: bool = True,
      **kwargs,
  ):
      """
      Resolve the user-facing `padding` / `truncation` / `max_length` arguments into concrete strategies.

      Returns:
          A tuple `(padding_strategy, truncation_strategy, max_length, kwargs)` where `max_length` may have
          been filled in from `self.model_max_length` and `kwargs` is returned untouched.

      Raises:
          `ValueError`: If an `only_first` / `only_second` truncation is requested, if padding is requested
          without a usable pad token, or if truncation and padding are both active while `max_length` is not
          a multiple of `pad_to_multiple_of`.
      """
      # Legacy behavior: passing only `max_length` switches truncation on.
      if max_length is not None and padding is False and truncation is None:
          if verbose:
              already_warned = self.deprecation_warnings.get("Truncation-not-explicitly-activated", False)
              if not already_warned:
                  logger.warning(
                      "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
                      " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
                      " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
                      " tokenizer you can select this strategy more precisely by providing a specific strategy to"
                      " `truncation`."
                  )
              self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
          truncation = "longest_first"

      # Padding strategy
      if padding is False:
          padding_strategy = PaddingStrategy.DO_NOT_PAD
      elif padding is True:
          if (
              verbose
              and max_length is not None
              and (truncation is None or truncation is False or truncation == "do_not_truncate")
          ):
              warnings.warn(
                  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                  "To pad to max length, use `padding='max_length'`."
              )
          # `True` means: pad to the longest sequence in the batch
          padding_strategy = PaddingStrategy.LONGEST
      elif isinstance(padding, PaddingStrategy):
          padding_strategy = padding
      else:
          padding_strategy = PaddingStrategy(padding)

      # Truncation strategy
      if truncation is False or truncation is None:
          truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
      else:
          if truncation is True:
              # `True` means: truncate the longest sequence first
              truncation_strategy = TruncationStrategy.LONGEST_FIRST
          elif isinstance(truncation, TruncationStrategy):
              truncation_strategy = truncation
          else:
              truncation_strategy = TruncationStrategy(truncation)

          if truncation in (TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND):
              raise ValueError(
                  "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonTokenizer`."
              )

      # Fall back on the model maximum length when none was given
      if max_length is None:
          if padding_strategy == PaddingStrategy.MAX_LENGTH:
              if self.model_max_length > LARGE_INTEGER:
                  if verbose:
                      already_warned = self.deprecation_warnings.get("Asking-to-pad-to-max_length", False)
                      if not already_warned:
                          logger.warning(
                              "Asking to pad to max_length but no maximum length is provided and the model has no"
                              " predefined maximum length. Default to no padding."
                          )
                      self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                  padding_strategy = PaddingStrategy.DO_NOT_PAD
              else:
                  max_length = self.model_max_length

          if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
              if self.model_max_length > LARGE_INTEGER:
                  if verbose:
                      already_warned = self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False)
                      if not already_warned:
                          logger.warning(
                              "Asking to truncate to max_length but no maximum length is provided and the model has"
                              " no predefined maximum length. Default to no truncation."
                          )
                      self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
                  truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
              else:
                  max_length = self.model_max_length

      will_pad = padding_strategy != PaddingStrategy.DO_NOT_PAD
      will_truncate = truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE

      # Padding needs a usable pad token
      if will_pad and (self.pad_token is None or self.pad_token_id < 0):
          raise ValueError(
              "Asking to pad but the tokenizer does not have a padding token. "
              "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
              "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
          )

      # When both are active, the truncation length has to be a multiple of `pad_to_multiple_of`
      if (
          will_truncate
          and will_pad
          and pad_to_multiple_of is not None
          and max_length is not None
          and (max_length % pad_to_multiple_of != 0)
      ):
          raise ValueError(
              "Truncation and padding are both activated but "
              f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
          )

      return padding_strategy, truncation_strategy, max_length, kwargs
  def _pad(
      self,
      encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
      max_length: Optional[int] = None,
      padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
      pad_to_multiple_of: Optional[int] = None,
      padding_side: Optional[str] = None,
      return_attention_mask: Optional[bool] = None,
  ) -> dict:
      """
      Pad one encoded input (on the left or on the right) and return the updated `encoded_inputs`.

      Args:
          encoded_inputs:
              Dictionary of tokenized inputs; the entry named by `self.model_input_names[0]` is the one whose
              length drives the padding.
          max_length: target length. Replaced by the length of the input itself when `padding_strategy` is
              `PaddingStrategy.LONGEST`.
          padding_strategy: PaddingStrategy to use for padding.

              - PaddingStrategy.LONGEST: target length is the length of this input
              - PaddingStrategy.MAX_LENGTH: pad to `max_length`
              - PaddingStrategy.DO_NOT_PAD: do not pad (default)
          pad_to_multiple_of: (optional) Integer; if set, a target length that is not a multiple of it is
              rounded up to the next multiple.
          padding_side:
              The side on which the padding is applied, `'right'` or `'left'`. Falls back on
              `self.padding_side` when `None`.
          return_attention_mask:
              (optional) Whether to create/extend the attention mask; when `None` it is decided by the presence
              of `"attention_mask"` in `self.model_input_names`.

      Raises:
          `ValueError`: If padding is needed and the resolved padding side is neither `'right'` nor `'left'`.
      """
      # Load from model defaults
      if return_attention_mask is None:
          return_attention_mask = "attention_mask" in self.model_input_names

      main_input_name = self.model_input_names[0]
      required_input = encoded_inputs[main_input_name]
      current_length = len(required_input)

      if padding_strategy == PaddingStrategy.LONGEST:
          max_length = current_length

      # Round the target length up to the next multiple when requested
      if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
          max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

      # The attention mask is created even when no padding happens
      if return_attention_mask and "attention_mask" not in encoded_inputs:
          encoded_inputs["attention_mask"] = [1] * current_length

      if padding_strategy == PaddingStrategy.DO_NOT_PAD or current_length == max_length:
          return encoded_inputs

      difference = max_length - current_length
      side = self.padding_side if padding_side is None else padding_side
      if side not in ("right", "left"):
          raise ValueError(f"Invalid padding strategy:{side}")
      pad_on_right = side == "right"

      def _extend(values, fill_value):
          # Append or prepend `difference` copies of `fill_value`, depending on the padding side.
          filler = [fill_value] * difference
          return values + filler if pad_on_right else filler + values

      if return_attention_mask:
          encoded_inputs["attention_mask"] = _extend(encoded_inputs["attention_mask"], 0)
      if "special_tokens_mask" in encoded_inputs:
          # Padded positions are flagged as special tokens
          encoded_inputs["special_tokens_mask"] = _extend(encoded_inputs["special_tokens_mask"], 1)
      encoded_inputs[main_input_name] = _extend(required_input, self.pad_token_id)

      return encoded_inputs
  def pad(
      self,
      encoded_inputs: Union[
          BatchEncoding,
          list[BatchEncoding],
          dict[str, EncodedInput],
          dict[str, list[EncodedInput]],
          list[dict[str, EncodedInput]],
      ],
      padding: Union[bool, str, PaddingStrategy] = True,
      max_length: Optional[int] = None,
      pad_to_multiple_of: Optional[int] = None,
      padding_side: Optional[str] = None,
      return_attention_mask: Optional[bool] = None,
      return_tensors: Optional[Union[str, TensorType]] = None,
      verbose: bool = True,
  ) -> BatchEncoding:
      """
      Pad a single encoded input or a batch of encoded inputs, either up to a given length or up to the longest
      sequence of the batch.

      The padding side and the padding token id come from the tokenizer (`self.padding_side`,
      `self.pad_token_id`) unless `padding_side` is given.

      <Tip>

      If the `encoded_inputs` passed hold numpy arrays or PyTorch tensors, they are converted to python objects
      for padding and the result uses the same tensor type, unless a different one is requested with
      `return_tensors`.

      </Tip>

      Args:
          encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):
              Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
              tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
              List[int]]]*). A list of dictionaries is first regrouped into a dictionary of lists, which makes
              the method usable as a collate function.

              Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors), see
              the note above for the return type.
          padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
              Select a strategy to pad the returned sequences among:

              - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a
                single sequence is provided).
              - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                acceptable input length for the model if that argument is not provided.
              - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                lengths).
          max_length (`int`, *optional*):
              Maximum length of the returned list and optionally padding length (see above).
          pad_to_multiple_of (`int`, *optional*):
              If set will pad the sequence to a multiple of the provided value.
          padding_side (`str`, *optional*):
              The side on which the model should have padding applied. Should be selected between ['right', 'left'].
              Default value is picked from the class attribute of the same name.
          return_attention_mask (`bool`, *optional*):
              Whether to return the attention mask. If left to the default, the attention mask is returned when
              `"attention_mask"` is listed in `self.model_input_names`.

              [What are attention masks?](../glossary#attention-mask)
          return_tensors (`str` or [`~utils.TensorType`], *optional*):
              If set, will return tensors instead of list of python integers. Acceptable values are:

              - `'pt'`: Return PyTorch `torch.Tensor` objects.
              - `'np'`: Return Numpy `np.ndarray` objects.
          verbose (`bool`, *optional*, defaults to `True`):
              Whether or not to print more information and warnings.
      """
      # A list of dicts is regrouped into a dict of lists, so the method can serve as a `collate_fn`
      if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
          first_example = encoded_inputs[0]
          encoded_inputs = {name: [example[name] for example in encoded_inputs] for name in first_example}

      main_input_name = self.model_input_names[0]

      # The model's main input name, usually `input_ids`, must be part of what is padded
      if main_input_name not in encoded_inputs:
          raise ValueError(
              "You should supply an encoding or a list of encodings to this method "
              f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
          )

      required_input = encoded_inputs[main_input_name]

      # Nothing to pad: the inputs are handed back as they came in (not wrapped again)
      if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
          if return_attention_mask:
              encoded_inputs["attention_mask"] = []
          return encoded_inputs

      # Tensors/arrays are cast to python objects here and rebuilt at the end when no `return_tensors` is given.
      # Peek at the first scalar to find out what kind of data was passed.
      first_element = required_input[0]
      if isinstance(first_element, (list, tuple)):
          # The first sequence may be empty, so take the first element of the first non-empty one.
          first_element = next((item[0] for item in required_input if len(item) != 0), first_element)

      # If `first_element` is still a list/tuple at this point, every sequence is empty.
      if not isinstance(first_element, (int, list, tuple)):
          if is_torch_tensor(first_element):
              if return_tensors is None:
                  return_tensors = "pt"
          elif isinstance(first_element, np.ndarray):
              if return_tensors is None:
                  return_tensors = "np"
          else:
              raise ValueError(
                  f"type of {first_element} unknown: {type(first_element)}. "
                  "Should be one of a python, numpy, pytorch or tensorflow object."
              )

          for name, value in encoded_inputs.items():
              encoded_inputs[name] = to_py_obj(value)

      # Turn `padding` into a `PaddingStrategy`
      padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
          padding=padding, max_length=max_length, verbose=verbose
      )

      required_input = encoded_inputs[main_input_name]

      # Single (non-batched) input
      if required_input and not isinstance(required_input[0], (list, tuple)):
          padded_single = self._pad(
              encoded_inputs,
              max_length=max_length,
              padding_strategy=padding_strategy,
              pad_to_multiple_of=pad_to_multiple_of,
              padding_side=padding_side,
              return_attention_mask=return_attention_mask,
          )
          return BatchEncoding(padded_single, tensor_type=return_tensors)

      # Batched input
      batch_size = len(required_input)
      assert all(len(values) == batch_size for values in encoded_inputs.values()), (
          "Some items in the output dictionary have a different batch size than others."
      )

      if padding_strategy == PaddingStrategy.LONGEST:
          # Resolve "longest" once for the whole batch, then pad every row to that length
          max_length = max(len(sequence) for sequence in required_input)
          padding_strategy = PaddingStrategy.MAX_LENGTH

      batch_outputs = {}
      for row in range(batch_size):
          example = {name: values[row] for name, values in encoded_inputs.items()}
          padded_example = self._pad(
              example,
              max_length=max_length,
              padding_strategy=padding_strategy,
              pad_to_multiple_of=pad_to_multiple_of,
              padding_side=padding_side,
              return_attention_mask=return_attention_mask,
          )

          for name, value in padded_example.items():
              batch_outputs.setdefault(name, []).append(value)

      return BatchEncoding(batch_outputs, tensor_type=return_tensors)
  def truncate_sequences(
      self,
      ids: list[int],
      pair_ids: None = None,
      num_tokens_to_remove: int = 0,
      truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
      stride: int = 0,
      **kwargs,
  ) -> tuple[list[int], None, list[int]]:
      """
      Truncate a single sequence of token ids following the given strategy.

      The input list is never modified: when tokens are removed, the truncated ids and the overflowing
      tokens are returned as new lists (slices of `ids`); otherwise `ids` itself is returned.

      Args:
          ids (`List[int]`):
              Tokenized input ids.
          pair_ids (`None`, *optional*):
              Not supported by `MistralCommonTokenizer`. Kept to match the signature of
              `PreTrainedTokenizerBase.truncate_sequences`.
          num_tokens_to_remove (`int`, *optional*, defaults to 0):
              Number of tokens to remove using the truncation strategy. Nothing is removed when it is
              lower than or equal to 0.
          truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `'longest_first'`):
              The strategy to follow for truncation. Can be:

              - `'longest_first'`: Remove `num_tokens_to_remove` tokens from the side selected by
                `self.truncation_side`.
              - `'do_not_truncate'`: No truncation, `ids` is returned unchanged.

              `'only_first'` and `'only_second'` are rejected.
          stride (`int`, *optional*, defaults to 0):
              If set to a positive number, the overflowing tokens returned will contain some tokens from the
              main sequence returned. The value of this argument defines the number of additional tokens.

      Returns:
          `Tuple[List[int], None, List[int]]`: The truncated `ids` and the list of overflowing tokens. `None`
          is returned in second position to match the Transformers signature.

      Raises:
          ValueError: If extra `kwargs` or `pair_ids` are given, if the strategy is `'only_first'` or
              `'only_second'`, or if `self.truncation_side` is neither `'left'` nor `'right'`.
      """
      if kwargs:
          raise ValueError(
              f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.truncate_sequences`."
          )
      if pair_ids:
          raise ValueError("`pair_ids` is not supported by `MistralCommonTokenizer.truncate_sequences`.")

      if num_tokens_to_remove <= 0:
          return (ids, None, [])

      if not isinstance(truncation_strategy, TruncationStrategy):
          truncation_strategy = TruncationStrategy(truncation_strategy)
      if truncation_strategy in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
          raise ValueError(
              f"Only {TruncationStrategy.LONGEST_FIRST} and {TruncationStrategy.DO_NOT_TRUNCATE} are supported."
          )

      overflowing_tokens = []
      if truncation_strategy != TruncationStrategy.LONGEST_FIRST:
          # `do_not_truncate`: nothing to remove.
          return (ids, None, overflowing_tokens)

      if len(ids) <= num_tokens_to_remove:
          # Removing that many tokens would leave nothing: report it and return the input untouched.
          error_msg = (
              f"We need to remove {num_tokens_to_remove} to truncate the input "
              f"but the first sequence has a length {len(ids)}. "
          )
          logger.error(error_msg)
          return (ids, None, overflowing_tokens)

      # The overflow window holds the removed tokens plus `stride` tokens shared with the kept part.
      window_len = min(len(ids), stride + num_tokens_to_remove)
      if self.truncation_side == "left":
          overflowing_tokens = ids[:window_len]
          ids = ids[num_tokens_to_remove:]
      elif self.truncation_side == "right":
          overflowing_tokens = ids[-window_len:]
          ids = ids[:-num_tokens_to_remove]
      else:
          # The offending value is the truncation *side*, not the strategy.
          raise ValueError(f"invalid truncation side: {self.truncation_side}, use 'left' or 'right'.")

      return (ids, None, overflowing_tokens)
  def apply_chat_template(
      self,
      conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
      tools: Optional[list[Union[dict, Callable]]] = None,
      continue_final_message: bool = False,
      tokenize: bool = True,
      padding: Union[bool, str, PaddingStrategy] = False,
      truncation: bool = False,
      max_length: Optional[int] = None,
      return_tensors: Optional[Union[str, TensorType]] = None,
      return_dict: bool = False,
      **kwargs,
  ) -> Union[str, list[int], list[str], list[list[int]], BatchEncoding]:
      """
      Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
      ids.

      The conversation passed by the caller is not modified: messages whose content has to be adapted to the
      `mistral-common` format are shallow-copied first.

      Args:
          conversation (Union[List[Dict[str, str]], List[List[Dict[str, str]]]]): A list of dicts
              with "role" and "content" keys, representing the chat history so far. A list of such lists (or
              of objects exposing a `messages` attribute) is treated as a batch of conversations.
          tools (`List[Union[Dict, Callable]]`, *optional*):
              A list of tools (callable functions) that will be accessible to the model. If the template does not
              support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
              giving the name, description and argument types for the tool. See our
              [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
              for more information.
          continue_final_message (bool, *optional*):
              If this is set, the chat will be formatted so that the final
              message in the chat is open-ended, without any EOS tokens. The model will continue this message
              rather than starting a new one. This allows you to "prefill" part of
              the model's response for it.
          tokenize (`bool`, defaults to `True`):
              Whether to tokenize the output. If `False`, the output will be a string (or a list of strings for
              a batch of conversations).
          padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
              Select a strategy to pad the returned sequences (according to the model's padding side and padding
              index) among:

              - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                sequence if provided).
              - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                acceptable input length for the model if that argument is not provided.
              - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                lengths).
          truncation (`bool`, defaults to `False`):
              Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
          max_length (`int`, *optional*):
              Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`.
              Forwarded as is to `__call__`.
          return_tensors (`str` or [`~utils.TensorType`], *optional*):
              If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Image
              pixel values are converted for:

              - `'pt'`: Return PyTorch `torch.Tensor` objects.
              - `'np'`: Return NumPy `np.ndarray` objects.

              Must be `None` when the conversation contains audio.
          return_dict (`bool`, defaults to `False`):
              Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
              If at least one conversation contains an image, its pixel values will be returned in the `pixel_values` key.
              If at least one conversation contains audio, the audio arrays will be returned in the `audio` key.
          kwargs (additional keyword arguments, *optional*):
              Not supported by `MistralCommonTokenizer.apply_chat_template`.
              Will raise an error if used.

      Returns:
          `Union[str, List[int], List[str], List[List[int]], BatchEncoding]`: A list of token ids representing the tokenized chat so far, including control
          tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.

      Raises:
          ValueError: If extra `kwargs` are given, if `truncation` is not a boolean, if an image or audio
              content has no source, or if `return_tensors` is not supported for pixel values.
          ImportError: If `return_tensors="pt"` is requested for images while PyTorch is not installed.
          NotImplementedError: If audio is returned while `return_tensors` is not `None`.
      """
      if kwargs:
          raise ValueError(
              f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.apply_chat_template`."
          )
      if not isinstance(truncation, bool):
          raise ValueError("`truncation` must be a boolean for `apply_chat_template` method.")

      # The emptiness check avoids an `IndexError` on `conversation[0]`: an empty conversation is handled
      # as a single (non-batched) conversation and its validation is left to `mistral-common`.
      if (
          isinstance(conversation, (list, tuple))
          and len(conversation) > 0
          and (isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages"))
      ):
          conversations = conversation
          is_batched = True
      else:
          conversations = [conversation]
          is_batched = False

      def _adapt_message(message: Any) -> Any:
          """Return `message` in `mistral-common` format and leave validation to `mistral-common`."""
          if not isinstance(message, dict):
              return message
          maybe_list_content: Optional[Union[str, list[dict[str, Union[str, dict[str, Any]]]]]] = message.get(
              "content"
          )
          if not maybe_list_content or isinstance(maybe_list_content, str):
              return message

          normalized_content: list[dict[str, Union[str, dict[str, Any]]]] = []
          for content in maybe_list_content:
              content_type = content.get("type", None)
              if not content_type:
                  # Chunks without a type are dropped.
                  continue

              if content_type == "image":
                  maybe_url: Optional[str] = content.get("url")
                  maybe_path: Optional[str] = content.get("path")
                  maybe_base64: Optional[str] = content.get("base64")
                  if maybe_url:
                      image_content = maybe_url
                  elif maybe_path:
                      if not maybe_path.startswith("file://"):
                          maybe_path = Path(maybe_path).resolve().as_uri()
                      image_content = maybe_path
                  elif maybe_base64:
                      if not maybe_base64.startswith("data:image"):
                          maybe_base64 = "data:image/unk;base64," + maybe_base64
                      image_content = maybe_base64
                  else:
                      raise ValueError("Image content must be specified.")
                  normalized_content.append({"type": "image_url", "image_url": {"url": image_content}})
              elif content_type == "audio":
                  maybe_url = content.get("url")
                  maybe_path = content.get("path")
                  maybe_base64 = content.get("base64")
                  if maybe_url or maybe_path:
                      audio_data = load_audio_as(maybe_url or maybe_path, return_format="dict", force_mono=True)
                      normalized_content.append({"type": "input_audio", "input_audio": audio_data})
                      continue
                  if not maybe_base64:
                      raise ValueError("Audio content must be specified.")
                  normalized_content.append({"type": "audio_url", "audio_url": {"url": maybe_base64}})
              else:
                  normalized_content.append(content)

          # Shallow copy so that the dictionary owned by the caller is left untouched.
          return {**message, "content": normalized_content}

      outputs = []
      images: list[np.ndarray] = []
      audios: list[np.ndarray] = []

      for single_conversation in conversations:
          messages = [_adapt_message(message) for message in single_conversation]

          chat_request = ChatCompletionRequest.from_openai(
              messages=messages,
              tools=tools,
              continue_final_message=continue_final_message,
          )
          tokenized_request = self.tokenizer.encode_chat_completion(chat_request)

          if tokenize:
              outputs.append(tokenized_request.tokens)
          else:
              outputs.append(tokenized_request.text)
          images.extend(tokenized_request.images)
          audios.extend([el.audio_array for el in tokenized_request.audios])

      if not is_batched:
          outputs = outputs[0]

      if not tokenize:
          logger.warning(
              "`MistralCommonTokenizer.apply_chat_template(..., tokenize=False)` is unsafe and may lead to unexpected behavior."
              " Please consider using `tokenize=True` instead and don't encode the output manually."
          )
          return outputs

      # The ids already contain the control tokens, hence `add_special_tokens=False`.
      out = self(
          outputs,
          padding=padding,
          truncation=truncation,
          max_length=max_length,
          add_special_tokens=False,
          return_tensors=return_tensors,
      )
      if not return_dict:
          return out["input_ids"]

      if images:
          pixel_values: Union[list[np.ndarray], np.ndarray, torch.Tensor]
          if return_tensors == "pt":
              if not is_torch_available():
                  raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
              pixel_values = torch.tensor(images)
          elif return_tensors == "np":
              pixel_values = np.array(images)
          elif return_tensors is None:
              pixel_values = images
          else:
              raise ValueError(f"Unsupported return_tensors type: {return_tensors}")
          out.data["pixel_values"] = pixel_values

      if audios:
          if return_tensors is not None:
              raise NotImplementedError(
                  "When passing audio content in apply_chat_template, `return_tensors` must be None since we cannot batch the audio inputs. The returned audio will be a list of numpy arrays."
              )
          # Transformers convention is audio for plural audio (audio does not take a "s")
          out.data["audio"] = audios

      return out
  @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
  def __call__(
      self,
      text: Union[TextInput, EncodedInput, list[TextInput], list[EncodedInput], None] = None,
      text_pair: None = None,
      text_target: None = None,
      text_pair_target: None = None,
      add_special_tokens: bool = True,
      padding: Union[bool, str, PaddingStrategy] = False,
      truncation: Union[bool, str, TruncationStrategy, None] = None,
      max_length: Optional[int] = None,
      stride: int = 0,
      pad_to_multiple_of: Optional[int] = None,
      padding_side: Optional[str] = None,
      return_tensors: Optional[Union[str, TensorType]] = None,
      return_attention_mask: Optional[bool] = None,
      return_overflowing_tokens: bool = False,
      return_special_tokens_mask: bool = False,
      return_length: bool = False,
      verbose: bool = True,
      **kwargs,
  ) -> BatchEncoding:
      """
      Main method to tokenize and prepare for the model one or several sequence(s).

      A list whose first element is a string, a list or a tuple is encoded as a batch through
      `_batch_encode_plus`; any other valid input (a string, a list of ints or an empty list) is encoded as a
      single example through `_encode_plus`. A `ValueError` is raised for unsupported arguments
      (`text_pair`, `text_target`, `text_pair_target`, extra `kwargs`, `return_tensors` set to `'tf'` or
      `'jax'`) and for inputs of an invalid type.

      Args:
          text (`str`, `List[int]`, `List[str]`, `List[List[int]]`, *optional*):
              The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of int
              (encoded strings).
          text_pair (`None`, *optional*):
              Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
          text_target (`None`, *optional*):
              Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
          text_pair_target (`None`, *optional*):
              Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
      """
      if kwargs:
          raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.__call__`.")

      if text_pair or text_target or text_pair_target:
          raise ValueError(
              "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonTokenizer`."
          )

      if return_tensors in ("tf", "jax"):
          raise ValueError(
              "`MistralCommonTokenizer` does not support `return_tensors='tf'` or `return_tensors='jax'`."
          )

      def _is_valid_text_input(t):
          # Strings are fine.
          if isinstance(t, str):
              return True
          if not isinstance(t, (list, tuple)):
              return False
          # Lists are fine as long as they are empty...
          if len(t) == 0:
              return True
          first = t[0]
          # ... or a list of strings or ints...
          if isinstance(first, (str, int)):
              return True
          # ... or a list starting with an empty list, a list of strings or a list of ints.
          if isinstance(first, (list, tuple)):
              return len(first) == 0 or isinstance(first[0], (str, int))
          return False

      if not _is_valid_text_input(text):
          raise ValueError(
              "text input must be of type `str` (single example), `List[str]` (batch or single encoded example) "
              "or `List[List[int]]` (batch of encoded examples)."
          )

      # An empty list is a valid input, so guard the access to `text[0]` to avoid an `IndexError`; the empty
      # list is then handed to `_encode_plus` like any other single example.
      is_batched = (
          isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (str, list, tuple))
      )

      padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
          padding=padding,
          truncation=truncation,
          max_length=max_length,
          pad_to_multiple_of=pad_to_multiple_of,
          verbose=verbose,
          **kwargs,
      )

      # Both encoding paths take the same keyword arguments; only the name of the text argument differs.
      encode_kwargs = {
          "add_special_tokens": add_special_tokens,
          "padding_strategy": padding_strategy,
          "truncation_strategy": truncation_strategy,
          "max_length": max_length,
          "stride": stride,
          "pad_to_multiple_of": pad_to_multiple_of,
          "padding_side": padding_side,
          "return_tensors": return_tensors,
          "return_attention_mask": return_attention_mask,
          "return_overflowing_tokens": return_overflowing_tokens,
          "return_special_tokens_mask": return_special_tokens_mask,
          "return_length": return_length,
          "verbose": verbose,
      }
      if is_batched:
          return self._batch_encode_plus(batch_text=text, **encode_kwargs, **kwargs)
      return self._encode_plus(text=text, **encode_kwargs, **kwargs)
  @classmethod
  def from_pretrained(
      cls,
      pretrained_model_name_or_path: Union[str, os.PathLike],
      *init_inputs,
      mode: ValidationMode = ValidationMode.test,
      cache_dir: Optional[Union[str, os.PathLike]] = None,
      force_download: bool = False,
      local_files_only: bool = False,
      token: Optional[Union[str, bool]] = None,
      revision: str = "main",
      model_max_length: int = VERY_LARGE_INTEGER,
      padding_side: str = "left",
      truncation_side: str = "right",
      model_input_names: Optional[list[str]] = None,
      clean_up_tokenization_spaces: bool = False,
      **kwargs,
  ):
      r"""
      Instantiate a `MistralCommonTokenizer` from a predefined tokenizer.

      When `pretrained_model_name_or_path` is not an existing directory, the tokenizer file is fetched with
      `download_tokenizer_from_hf_hub`. Otherwise the directory is scanned for `tekken.json` or a file with
      a sentencepiece suffix (`.model`, optionally followed by an instruct version and a multimodal
      version). If several candidates are found, `tekken.json` is preferred, then the greatest file name,
      and a warning is logged.

      Args:
          pretrained_model_name_or_path (`str` or `os.PathLike`):
              Can be either:

              - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
              - A path to a *directory* containing the tokenizer file, for instance saved
                using the [`MistralCommonTokenizer.tokenization_mistral_common.save_pretrained`] method, e.g.,
                `./my_model_directory/`.
          mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
              Validation mode for the `MistralTokenizer` tokenizer.
          cache_dir (`str` or `os.PathLike`, *optional*):
              Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
              standard cache should not be used.
          force_download (`bool`, *optional*, defaults to `False`):
              Whether or not to force the (re-)download the vocabulary files and override the cached versions if they
              exist.
          local_files_only (`bool`, *optional*, defaults to `False`):
              Whether or not to only rely on local files and not to attempt to download any files.
          token (`str` or *bool*, *optional*):
              The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
              when running `hf auth login` (stored in `~/.huggingface`).
          revision (`str`, *optional*, defaults to `"main"`):
              The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
              git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
              identifier allowed by git.
          model_max_length (`int`, *optional*, defaults to `VERY_LARGE_INTEGER`):
              Maximum length forwarded to the tokenizer constructor.
          padding_side (`str`, *optional*, defaults to `"left"`):
              The side on which the model should have padding applied. Should be selected between ['right', 'left'].
          truncation_side (`str`, *optional*, defaults to `"right"`):
              The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
          model_input_names (`List[string]`, *optional*):
              The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
              `"attention_mask"`). Forwarded to the tokenizer constructor.
          clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
              Whether or not the model should cleanup the spaces that were added when splitting the input text during the
              tokenization process.
          kwargs (additional keyword arguments, *optional*):
              Not supported by `MistralCommonTokenizer.from_pretrained`, except `_from_auto` and
              `trust_remote_code` which are accepted and ignored. Will raise an error otherwise.
      """
      if init_inputs:
          raise ValueError("`init_inputs` are not supported by `MistralCommonTokenizer.from_pretrained`.")

      # Only the keys injected by the AutoTokenizer code path are tolerated.
      unexpected_keys = set(kwargs.keys()) - {"_from_auto", "trust_remote_code"}
      if unexpected_keys:
          raise ValueError(
              f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.from_pretrained`."
          )

      if os.path.isdir(pretrained_model_name_or_path):
          version_names = list(TokenizerVersion.__members__)
          multimodal_names = list(MultiModalVersion.__members__) + [""]  # allow no mm version
          sentencepiece_suffixes = [
              f".model.{v}{m}" for v in version_names for m in multimodal_names
          ] + [".model"]

          # Keep every entry that looks like a tokenizer file.
          valid_tokenizer_files = [
              entry.name
              for entry in map(Path, os.listdir(pretrained_model_name_or_path))
              if entry.name == "tekken.json" or "".join(entry.suffixes) in sentencepiece_suffixes
          ]
          if not valid_tokenizer_files:
              raise ValueError(f"No tokenizer file found in directory: {pretrained_model_name_or_path}")

          tokenizer_file: str
          if len(valid_tokenizer_files) == 1:
              tokenizer_file = valid_tokenizer_files[0]
          else:
              # Several candidates: tekken.json wins if present, otherwise the versioned one.
              tokenizer_file = (
                  "tekken.json" if "tekken.json" in valid_tokenizer_files else max(valid_tokenizer_files)
              )
              logger.warning(
                  f"Multiple tokenizer files found in directory: {pretrained_model_name_or_path}. Using {tokenizer_file}."
              )

          tokenizer_path = os.path.join(pretrained_model_name_or_path, tokenizer_file)
      else:
          tokenizer_path = download_tokenizer_from_hf_hub(
              repo_id=pretrained_model_name_or_path,
              cache_dir=cache_dir,
              token=token,
              revision=revision,
              force_download=force_download,
              local_files_only=local_files_only,
          )

      return cls(
          tokenizer_path=tokenizer_path,
          mode=mode,
          model_max_length=model_max_length,
          padding_side=padding_side,
          truncation_side=truncation_side,
          model_input_names=model_input_names,
          clean_up_tokenization_spaces=clean_up_tokenization_spaces,
      )
  def save_pretrained(
      self,
      save_directory: Union[str, os.PathLike, Path],
      push_to_hub: bool = False,
      token: Optional[Union[str, bool]] = None,
      commit_message: Optional[str] = None,
      repo_id: Optional[str] = None,
      private: Optional[bool] = None,
      repo_url: Optional[str] = None,
      organization: Optional[str] = None,
      **kwargs,
  ) -> tuple[str, ...]:
      """
      Save the full tokenizer state.

      This method makes sure the full tokenizer can then be re-loaded using the
      [`~MistralCommonTokenizer.tokenization_mistral_common.from_pretrained`] class method. The tokenizer
      file (`self._tokenizer_path`) is copied into `save_directory`, which is created if needed.

      Args:
          save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
          push_to_hub (`bool`, *optional*, defaults to `False`):
              Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
              repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
              namespace).
          token (`str` or *bool*, *optional*, defaults to `None`):
              The token to use to push to the model hub. If `True`, will use the token in the `HF_TOKEN` environment
              variable.
          commit_message (`str`, *optional*): The commit message to use when pushing to the hub.
          repo_id (`str`, *optional*): The name of the repository to which push to the Hub.
          private (`bool`, *optional*): Whether the model repository is private or not.
          repo_url (`str`, *optional*): The URL to the Git repository to which push to the Hub.
          organization (`str`, *optional*): The name of the organization in which you would like to push your model.
          kwargs (`Dict[str, Any]`, *optional*):
              Not supported by `MistralCommonTokenizer.save_pretrained`, except `save_jinja_files` which is
              accepted and ignored. Will raise an error otherwise.

      Returns:
          A tuple of `str`: The files saved.

      Raises:
          ValueError: If unsupported `kwargs` are given.
      """
      # `save_jinja_files` must be skipped to be able to save from a processor
      kwargs.pop("save_jinja_files", None)
      if kwargs:
          raise ValueError(
              f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.save_pretrained`."
          )

      save_directory = Path(save_directory)
      save_directory.mkdir(parents=True, exist_ok=True)

      # Snapshot the directory BEFORE writing the tokenizer file: the upload step compares the files
      # against this snapshot, so a snapshot taken after the copy would hide the file just saved.
      # NOTE(review): relies on `_upload_modified_files` only uploading files that are new or more recent
      # than the snapshot (`PushToHubMixin` convention) - confirm against its definition.
      files_timestamps = self._get_files_timestamps(save_directory) if push_to_hub else None

      shutil.copy(self._tokenizer_path, save_directory)

      if push_to_hub:
          repo_id = repo_id or str(save_directory).split(os.path.sep)[-1]
          repo_id = self._create_repo(
              repo_id, token=token, private=private, repo_url=repo_url, organization=organization
          )
          self._upload_modified_files(
              save_directory,
              repo_id,
              files_timestamps,
              commit_message=commit_message,
              token=token,
          )

      return (str(save_directory / self._tokenizer_path.name),)