processing_utils.py 85 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782
  1. # Copyright 2022 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. Processing saving/loading class for common processors.
  16. """
  17. import bisect
  18. import copy
  19. import inspect
  20. import json
  21. import os
  22. import sys
  23. import typing
  24. import warnings
  25. from dataclasses import dataclass
  26. from pathlib import Path
  27. from typing import Any, Optional, TypedDict, TypeVar, Union
  28. import numpy as np
  29. import typing_extensions
  30. from huggingface_hub.errors import EntryNotFoundError
  31. from .audio_utils import AudioInput, load_audio
  32. from .dynamic_module_utils import custom_object_save
  33. from .feature_extraction_utils import BatchFeature
  34. from .image_utils import ChannelDimension, ImageInput, is_vision_available
  35. from .utils.chat_template_utils import render_jinja_template
  36. from .video_utils import VideoInput, VideoMetadata
  37. if is_vision_available():
  38. from .image_utils import PILImageResampling
  39. from .tokenization_utils_base import (
  40. PaddingStrategy,
  41. PreTokenizedInput,
  42. PreTrainedTokenizerBase,
  43. TextInput,
  44. TruncationStrategy,
  45. )
  46. from .utils import (
  47. AUDIO_TOKENIZER_NAME,
  48. CHAT_TEMPLATE_DIR,
  49. CHAT_TEMPLATE_FILE,
  50. LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE,
  51. PROCESSOR_NAME,
  52. PushToHubMixin,
  53. TensorType,
  54. cached_file,
  55. copy_func,
  56. direct_transformers_import,
  57. download_url,
  58. is_offline_mode,
  59. is_remote_url,
  60. is_torch_available,
  61. list_repo_templates,
  62. logging,
  63. )
  64. from .utils.deprecation import deprecate_kwarg
  65. if is_torch_available():
  66. from .modeling_utils import PreTrainedAudioTokenizerBase
  67. logger = logging.get_logger(__name__)
  68. # type hinting: specifying the type of processor class that inherits from ProcessorMixin
  69. SpecificProcessorType = TypeVar("SpecificProcessorType", bound="ProcessorMixin")
  70. # Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
  71. transformers_module = direct_transformers_import(Path(__file__).parent)
  72. AUTO_TO_BASE_CLASS_MAPPING = {
  73. "AutoTokenizer": "PreTrainedTokenizerBase",
  74. "AutoFeatureExtractor": "FeatureExtractionMixin",
  75. "AutoImageProcessor": "ImageProcessingMixin",
  76. "AutoVideoProcessor": "BaseVideoProcessor",
  77. }
  78. if sys.version_info >= (3, 11):
  79. Unpack = typing.Unpack
  80. else:
  81. Unpack = typing_extensions.Unpack
  82. class TextKwargs(TypedDict, total=False):
  83. """
  84. Keyword arguments for text processing. For extended documentation, check out tokenization_utils_base methods and
  85. docstrings associated.
  86. Attributes:
  87. add_special_tokens (`bool`, *optional*)
  88. Whether or not to add special tokens when encoding the sequences.
  89. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*)
  90. Activates and controls padding.
  91. truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*):
  92. Activates and controls truncation.
  93. max_length (`int`, *optional*):
  94. Controls the maximum length to use by one of the truncation/padding parameters.
  95. stride (`int`, *optional*):
  96. If set, the overflowing tokens will contain some tokens from the end of the truncated sequence.
  97. is_split_into_words (`bool`, *optional*):
  98. Whether or not the input is already pre-tokenized.
  99. pad_to_multiple_of (`int`, *optional*):
  100. If set, will pad the sequence to a multiple of the provided value.
  101. return_token_type_ids (`bool`, *optional*):
  102. Whether to return token type IDs.
  103. return_attention_mask (`bool`, *optional*):
  104. Whether to return the attention mask.
  105. return_overflowing_tokens (`bool`, *optional*):
  106. Whether or not to return overflowing token sequences.
  107. return_special_tokens_mask (`bool`, *optional*):
  108. Whether or not to return special tokens mask information.
  109. return_offsets_mapping (`bool`, *optional*):
  110. Whether or not to return `(char_start, char_end)` for each token.
  111. return_length (`bool`, *optional*):
  112. Whether or not to return the lengths of the encoded inputs.
  113. verbose (`bool`, *optional*):
  114. Whether or not to print more information and warnings.
  115. padding_side (`str`, *optional*):
  116. The side on which padding will be applied.
  117. return_mm_token_type_ids (`bool`, *optional*):
  118. Whether to return multimodal token type ids indicating mm placeholder token positions.
  119. """
  120. text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
  121. text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
  122. text_pair_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
  123. add_special_tokens: Optional[bool]
  124. padding: Union[bool, str, PaddingStrategy]
  125. truncation: Union[bool, str, TruncationStrategy]
  126. max_length: Optional[int]
  127. stride: Optional[int]
  128. is_split_into_words: Optional[bool]
  129. pad_to_multiple_of: Optional[int]
  130. return_token_type_ids: Optional[bool]
  131. return_attention_mask: Optional[bool]
  132. return_overflowing_tokens: Optional[bool]
  133. return_special_tokens_mask: Optional[bool]
  134. return_offsets_mapping: Optional[bool]
  135. return_length: Optional[bool]
  136. verbose: Optional[bool]
  137. padding_side: Optional[str]
  138. return_mm_token_type_ids: Optional[bool]
  139. class ImagesKwargs(TypedDict, total=False):
  140. """
  141. Keyword arguments for image processing. For extended documentation, check the appropriate ImageProcessor
  142. class methods and docstrings.
  143. Attributes:
  144. do_resize (`bool`, *optional*):
  145. Whether to resize the image.
  146. size (`dict[str, int]`, *optional*):
  147. Resize the shorter side of the input to `size["shortest_edge"]`.
  148. crop_size (`dict[str, int]`, *optional*):
  149. Desired output size when applying center-cropping.
  150. resample (`PILImageResampling`, *optional*):
  151. Resampling filter to use if resizing the image.
  152. do_rescale (`bool`, *optional*):
  153. Whether to rescale the image by the specified scale `rescale_factor`.
  154. rescale_factor (`int` or `float`, *optional*):
  155. Scale factor to use if rescaling the image.
  156. do_normalize (`bool`, *optional*):
  157. Whether to normalize the image.
  158. image_mean (`float` or `list[float]`, *optional*):
  159. Mean to use if normalizing the image.
  160. image_std (`float` or `list[float]`, *optional*):
  161. Standard deviation to use if normalizing the image.
  162. do_pad (`bool`, *optional*):
  163. Whether to pad the image to the `(max_height, max_width)` of the images in the batch.
  164. pad_size (`dict[str, int]`, *optional*):
  165. The size `{"height": int, "width" int}` to pad the images to.
  166. do_center_crop (`bool`, *optional*):
  167. Whether to center crop the image.
  168. data_format (`ChannelDimension` or `str`, *optional*):
  169. The channel dimension format for the output image.
  170. input_data_format (`ChannelDimension` or `str`, *optional*):
  171. The channel dimension format for the input image.
  172. device (`str`, *optional*):
  173. The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
  174. """
  175. do_resize: Optional[bool]
  176. size: Optional[dict[str, int]]
  177. crop_size: Optional[dict[str, int]]
  178. resample: Optional[Union["PILImageResampling", int]]
  179. do_rescale: Optional[bool]
  180. rescale_factor: Optional[float]
  181. do_normalize: Optional[bool]
  182. image_mean: Optional[Union[float, list[float]]]
  183. image_std: Optional[Union[float, list[float]]]
  184. do_pad: Optional[bool]
  185. pad_size: Optional[dict[str, int]]
  186. do_center_crop: Optional[bool]
  187. data_format: Optional[ChannelDimension]
  188. input_data_format: Optional[Union[str, ChannelDimension]]
  189. device: Optional[str]
  190. class VideosKwargs(TypedDict, total=False):
  191. """
  192. Keyword arguments for video processing.
  193. Attributes:
  194. do_convert_rgb (`bool`):
  195. Whether to convert the video to RGB format.
  196. do_resize (`bool`):
  197. Whether to resize the video.
  198. size (`dict[str, int]`, *optional*):
  199. Resize the shorter side of the input to `size["shortest_edge"]`.
  200. default_to_square (`bool`, *optional*, defaults to `self.default_to_square`):
  201. Whether to default to a square when resizing, if size is an int.
  202. resample (`PILImageResampling`, *optional*):
  203. Resampling filter to use if resizing the video.
  204. do_rescale (`bool`, *optional*):
  205. Whether to rescale the video by the specified scale `rescale_factor`.
  206. rescale_factor (`int` or `float`, *optional*):
  207. Scale factor to use if rescaling the video.
  208. do_normalize (`bool`, *optional*):
  209. Whether to normalize the video.
  210. image_mean (`float` or `list[float]`, *optional*):
  211. Mean to use if normalizing the video.
  212. image_std (`float` or `list[float]`, *optional*):
  213. Standard deviation to use if normalizing the video.
  214. do_center_crop (`bool`, *optional*):
  215. Whether to center crop the video.
  216. do_sample_frames (`bool`, *optional*):
  217. Whether to sample frames from the video before processing or to process the whole video.
  218. video_metadata (`Union[VideoMetadata, dict]`, *optional*):
  219. Metadata of the video containing information about total duration, fps and total number of frames.
  220. num_frames (`int`, *optional*):
  221. Maximum number of frames to sample when `do_sample_frames=True`.
  222. fps (`int` or `float`, *optional*):
  223. Target frames to sample per second when `do_sample_frames=True`.
  224. crop_size (`dict[str, int]`, *optional*):
  225. Desired output size when applying center-cropping.
  226. data_format (`ChannelDimension` or `str`, *optional*):
  227. The channel dimension format for the output video.
  228. input_data_format (`ChannelDimension` or `str`, *optional*):
  229. The channel dimension format for the input video.
  230. return_metadata (`ChannelDimension` or `str`, *optional*):
  231. Whether to return video metadata or not.
  232. """
  233. do_convert_rgb: Optional[bool]
  234. do_resize: Optional[bool]
  235. size: Optional[dict[str, int]]
  236. default_to_square: Optional[bool]
  237. resample: Optional["PILImageResampling"]
  238. do_rescale: Optional[bool]
  239. rescale_factor: Optional[float]
  240. do_normalize: Optional[bool]
  241. image_mean: Optional[Union[float, list[float]]]
  242. image_std: Optional[Union[float, list[float]]]
  243. do_center_crop: Optional[bool]
  244. crop_size: Optional[dict[str, int]]
  245. data_format: Optional[ChannelDimension]
  246. input_data_format: Optional[Union[str, ChannelDimension]]
  247. device: Optional[str]
  248. do_sample_frames: Optional[bool]
  249. video_metadata: Optional[Union[VideoMetadata, dict]]
  250. fps: Optional[Union[int, float]]
  251. num_frames: Optional[int]
  252. return_metadata: Optional[bool]
  253. class AudioKwargs(TypedDict, total=False):
  254. """
  255. Keyword arguments for audio processing.
  256. Attributes:
  257. sampling_rate (`int`, *optional*):
  258. The sampling rate at which the `raw_speech` input was sampled.
  259. raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
  260. The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
  261. values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
  262. stereo, i.e. single float per timestep.
  263. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*):
  264. Select a strategy to pad the returned sequences (according to the model's padding side and padding
  265. index) among:
  266. - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  267. sequence if provided).
  268. - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  269. acceptable input length for the model if that argument is not provided.
  270. - `False` or `'do_not_pad'`
  271. max_length (`int`, *optional*):
  272. Maximum length of the returned list and optionally padding length (see above).
  273. truncation (`bool`, *optional*):
  274. Activates truncation to cut input sequences longer than *max_length* to *max_length*.
  275. pad_to_multiple_of (`int`, *optional*):
  276. If set, will pad the sequence to a multiple of the provided value.
  277. return_attention_mask (`bool`, *optional*):
  278. Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
  279. """
  280. sampling_rate: Optional[int]
  281. raw_speech: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]]
  282. padding: Optional[Union[bool, str, PaddingStrategy]]
  283. max_length: Optional[int]
  284. truncation: Optional[bool]
  285. pad_to_multiple_of: Optional[int]
  286. return_attention_mask: Optional[bool]
  287. class CommonKwargs(TypedDict, total=False):
  288. return_tensors: Optional[Union[str, TensorType]]
  289. class ProcessingKwargs(TypedDict, total=False):
  290. """
  291. Base class for kwargs passing to processors.
  292. In case a model has specific kwargs that are not present in the base class or default values for existing keys,
  293. it should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
  294. 1) Additional typed keys and that this model requires to process inputs.
  295. 2) Default values for existing keys under a `_defaults` attribute.
  296. New keys have to be defined as follows to ensure type hinting is done correctly.
  297. ```python
  298. # adding a new image kwarg for this model
  299. class ModelImagesKwargs(ImagesKwargs, total=False):
  300. new_image_kwarg: Optional[bool]
  301. class ModelProcessorKwargs(ProcessingKwargs, total=False):
  302. images_kwargs: ModelImagesKwargs
  303. _defaults = {
  304. "images_kwargs: {
  305. "new_image_kwarg": False,
  306. }
  307. "text_kwargs": {
  308. "padding": "max_length",
  309. },
  310. }
  311. ```
  312. For Python 3.8 compatibility, when inheriting from this class and overriding one of the kwargs,
  313. you need to manually update the __annotations__ dictionary. This can be done as follows:
  314. ```python
  315. class CustomProcessorKwargs(ProcessingKwargs, total=False):
  316. images_kwargs: CustomImagesKwargs
  317. CustomProcessorKwargs.__annotations__["images_kwargs"] = CustomImagesKwargs # python 3.8 compatibility
  318. ```python
  319. """
  320. _defaults = {}
  321. common_kwargs: CommonKwargs = {
  322. **CommonKwargs.__annotations__,
  323. }
  324. text_kwargs: TextKwargs = {
  325. **TextKwargs.__annotations__,
  326. }
  327. images_kwargs: ImagesKwargs = {
  328. **ImagesKwargs.__annotations__,
  329. }
  330. videos_kwargs: VideosKwargs = {
  331. **VideosKwargs.__annotations__,
  332. }
  333. audio_kwargs: AudioKwargs = {
  334. **AudioKwargs.__annotations__,
  335. }
  336. class TokenizerChatTemplateKwargs(TypedDict, total=False):
  337. """
  338. Keyword arguments for tokenizer's `apply_chat_template`, when it is called from within a processor.
  339. tools (`list[Dict]`, *optional*):
  340. A list of tools (callable functions) that will be accessible to the model. If the template does not
  341. support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
  342. giving the name, description and argument types for the tool. See our
  343. [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
  344. for more information.
  345. documents (`list[dict[str, str]]`, *optional*):
  346. A list of dicts representing documents that will be accessible to the model if it is performing RAG
  347. (retrieval-augmented generation). If the template does not support RAG, this argument will have no
  348. effect. We recommend that each document should be a dict containing "title" and "text" keys. Please
  349. see the RAG section of the [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#arguments-for-RAG)
  350. for examples of passing documents with chat templates.
  351. add_generation_prompt (bool, *optional*):
  352. If this is set, a prompt with the token(s) that indicate
  353. the start of an assistant message will be appended to the formatted output. This is useful when you want to generate a response from the model.
  354. Note that this argument will be passed to the chat template, and so it must be supported in the
  355. template for this argument to have any effect.
  356. continue_final_message (bool, *optional*):
  357. If this is set, the chat will be formatted so that the final
  358. message in the chat is open-ended, without any EOS tokens. The model will continue this message
  359. rather than starting a new one. This allows you to "prefill" part of
  360. the model's response for it. Cannot be used at the same time as `add_generation_prompt`.
  361. return_assistant_tokens_mask (`bool`, defaults to `False`):
  362. Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant,
  363. the mask will contain 1. For user and system tokens, the mask will contain 0.
  364. This functionality is only available for chat templates that support it via the `{% generation %}` keyword.
  365. """
  366. tools: Optional[list[dict]] = None
  367. documents: Optional[list[dict[str, str]]] = None
  368. add_generation_prompt: Optional[bool] = False
  369. continue_final_message: Optional[bool] = False
  370. return_assistant_tokens_mask: Optional[bool] = False
  371. class ChatTemplateLoadKwargs(TypedDict, total=False):
  372. """
  373. Keyword arguments used to load multimodal data in processor chat templates.
  374. num_frames (`int`, *optional*):
  375. Number of frames to sample uniformly. If not passed, the whole video is loaded.
  376. load_audio_from_video (`bool`, *optional*):
  377. Whether to use the audio track of input video. If `True` the audio track will be loaded and passed to the
  378. processor. This flag has no effect if the model doesn't support audio modality.
  379. """
  380. sampling_rate: Optional[int] = 16_000
  381. load_audio_from_video: Optional[bool] = False
  382. class ProcessorChatTemplateKwargs(ChatTemplateLoadKwargs, TokenizerChatTemplateKwargs, total=False):
  383. """
  384. Keyword arguments for processor's `apply_chat_template`.
  385. tokenize (`bool`, *optional*, defaults to `False`):
  386. Whether to tokenize the output or not.
  387. return_dict (`bool`, defaults to `False`):
  388. Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
  389. """
  390. tokenize: Optional[bool] = False
  391. return_dict: Optional[bool] = False
  392. class AllKwargsForChatTemplate(TypedDict, total=False):
  393. processor_kwargs: ProcessingKwargs
  394. mm_load_kwargs: ChatTemplateLoadKwargs
  395. template_kwargs: ProcessorChatTemplateKwargs
  396. @dataclass
  397. class MultiModalData:
  398. """
  399. Dataclass that holds extra useful data for processing
  400. multimodal data. Processors currently cannot return keys,
  401. unless it is used in model's forward. Thus we have helper
  402. methods that calculate and return useful data from processing
  403. input multimodals (images/videos).
  404. Note that this dataclass is aimed to be used only in vLLM
  405. and we might change its API in the future.
  406. """
  407. num_image_tokens: Optional[list[int]] = None
  408. num_video_tokens: Optional[list[int]] = None
  409. num_audio_tokens: Optional[list[int]] = None
  410. num_image_patches: Optional[list[int]] = None
  411. def __contains__(self, key):
  412. return hasattr(self, key) and getattr(self, key) is not None
  413. def __getitem__(self, key):
  414. if hasattr(self, key):
  415. return getattr(self, key)
  416. raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
  417. class ProcessorMixin(PushToHubMixin):
  418. """
  419. This is a mixin used to provide saving/loading functionality for all processor classes.
  420. """
  421. attributes = ["feature_extractor", "tokenizer"]
  422. optional_attributes = ["chat_template", "audio_tokenizer"]
  423. optional_call_args: list[str] = []
  424. # Names need to be attr_class for attr in attributes
  425. feature_extractor_class = None
  426. tokenizer_class = None
  427. _auto_class = None
  428. valid_processor_kwargs = ProcessingKwargs
  429. # args have to match the attributes class attribute
  430. def __init__(self, *args, **kwargs):
  431. # First, extract optional attributes from kwargs if present
  432. # Optional attributes can never be positional arguments
  433. for optional_attribute in self.optional_attributes:
  434. optional_attribute_value = kwargs.pop(optional_attribute, None)
  435. setattr(self, optional_attribute, optional_attribute_value)
  436. # Check audio tokenizer for its class but do not treat it as attr to avoid saving weights
  437. if optional_attribute == "audio_tokenizer" and optional_attribute_value is not None:
  438. proper_class = self.check_argument_for_proper_class(optional_attribute, optional_attribute_value)
  439. if not (is_torch_available() and isinstance(optional_attribute_value, PreTrainedAudioTokenizerBase)):
  440. raise ValueError(
  441. f"Tried to use `{proper_class}` for audio tokenization. However, this class is not"
  442. " registered for audio tokenization."
  443. )
  444. # Sanitize args and kwargs
  445. for key in kwargs:
  446. if key not in self.attributes:
  447. raise TypeError(f"Unexpected keyword argument {key}.")
  448. for arg, attribute_name in zip(args, self.attributes):
  449. if attribute_name in kwargs:
  450. raise TypeError(f"Got multiple values for argument {attribute_name}.")
  451. else:
  452. kwargs[attribute_name] = arg
  453. if len(kwargs) != len(self.attributes):
  454. raise ValueError(
  455. f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
  456. f"{len(args)} arguments instead."
  457. )
  458. # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
  459. for attribute_name, arg in kwargs.items():
  460. self.check_argument_for_proper_class(attribute_name, arg)
  461. setattr(self, attribute_name, arg)
  462. def __call__(
  463. self,
  464. images: Optional[ImageInput] = None,
  465. text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
  466. videos: Optional[VideoInput] = None,
  467. audio: Optional[AudioInput] = None,
  468. **kwargs: Unpack[ProcessingKwargs],
  469. ):
  470. """
  471. Main method to prepare for model inputs. This method forwards the each modality argument to its own processor
  472. along with `kwargs`. Please refer to the docstring of the each processor attributes for more information.
  473. Args:
  474. images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
  475. The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
  476. tensor. Both channels-first and channels-last formats are supported.
  477. text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
  478. The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
  479. (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
  480. `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
  481. videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
  482. The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
  483. tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
  484. audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
  485. The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
  486. tensor.
  487. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  488. If set, will return tensors of a particular framework. Acceptable values are:
  489. - `'tf'`: Return TensorFlow `tf.constant` objects.
  490. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  491. - `'np'`: Return NumPy `np.ndarray` objects.
  492. - `'jax'`: Return JAX `jnp.ndarray` objects.
  493. Returns:
  494. [`BatchFeature`]: A [`BatchFeature`] object with processed inputs in a dict format.
  495. """
  496. if images is None and text is None and videos is None and audio is None:
  497. raise ValueError(f"You need to provide at least one input to call {self.__class__.__name__}")
  498. kwargs = self._merge_kwargs(
  499. self.valid_processor_kwargs,
  500. tokenizer_init_kwargs=self.tokenizer.init_kwargs if hasattr(self, "tokenizer") else {},
  501. **kwargs,
  502. )
  503. attribute_to_kwargs = {
  504. "tokenizer": (text, "text_kwargs"),
  505. "image_processor": (images, "images_kwargs"),
  506. "video_processor": (videos, "videos_kwargs"),
  507. "feature_extractor": (audio, "audio_kwargs"),
  508. }
  509. outputs = {}
  510. for attribute_name in self.attributes:
  511. attribute = getattr(self, attribute_name, None)
  512. input_data, input_kwargs = attribute_to_kwargs[attribute_name]
  513. if input_data is not None and attribute is not None:
  514. attribute_output = attribute(input_data, **kwargs[input_kwargs])
  515. outputs.update(attribute_output)
  516. return BatchFeature(outputs)
  517. def check_argument_for_proper_class(self, argument_name, argument):
  518. """
  519. Checks the passed argument's class against the expected transformers class. In case of an unexpected
  520. mismatch between expected and actual class, an error is raise. Otherwise, the proper retrieved class
  521. is returned.
  522. """
  523. class_name = getattr(self, f"{argument_name}_class")
  524. # Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class.
  525. class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
  526. if isinstance(class_name, tuple):
  527. proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None)
  528. else:
  529. proper_class = self.get_possibly_dynamic_module(class_name)
  530. if not isinstance(argument, proper_class):
  531. raise TypeError(
  532. f"Received a {type(argument).__name__} for argument {argument_name}, but a {class_name} was expected."
  533. )
  534. return proper_class
  535. def to_dict(self, legacy_serialization=True) -> dict[str, Any]:
  536. """
  537. Serializes this instance to a Python dictionary.
  538. Returns:
  539. `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
  540. """
  541. output = copy.deepcopy(self.__dict__)
  542. # Get the kwargs in `__init__`.
  543. sig = inspect.signature(self.__init__)
  544. # Only save the attributes that are presented in the kwargs of `__init__`.
  545. attrs_to_save = list(sig.parameters)
  546. # extra attributes to be kept
  547. attrs_to_save += ["auto_map"]
  548. if legacy_serialization:
  549. # Don't save attributes like `tokenizer`, `image processor` etc. in processor config if `legacy=True`
  550. attrs_to_save = [x for x in attrs_to_save if x not in self.__class__.attributes]
  551. if "tokenizer" in output:
  552. del output["tokenizer"]
  553. if "qformer_tokenizer" in output:
  554. del output["qformer_tokenizer"]
  555. if "protein_tokenizer" in output:
  556. del output["protein_tokenizer"]
  557. if "chat_template" in output:
  558. del output["chat_template"]
  559. def cast_array_to_list(dictionary):
  560. """
  561. Numpy arrays are not serialiazable but can be in pre-processing dicts.
  562. This function casts arrays to list, recusring through the nested configs as well.
  563. """
  564. for key, value in dictionary.items():
  565. if isinstance(value, np.ndarray):
  566. dictionary[key] = value.tolist()
  567. elif isinstance(value, dict):
  568. dictionary[key] = cast_array_to_list(value)
  569. return dictionary
  570. # Serialize attributes as a dict
  571. output = {
  572. k: v.to_dict() if isinstance(v, PushToHubMixin) else v
  573. for k, v in output.items()
  574. if (
  575. k in attrs_to_save # keep all attributes that have to be serialized
  576. and v.__class__.__name__ != "BeamSearchDecoderCTC" # remove attributes with that are objects
  577. and (
  578. (legacy_serialization and not isinstance(v, PushToHubMixin)) or not legacy_serialization
  579. ) # remove `PushToHubMixin` objects
  580. )
  581. }
  582. output = cast_array_to_list(output)
  583. # Special case, add `audio_tokenizer` dict which points to model weights and path
  584. if not legacy_serialization and "audio_tokenizer" in output:
  585. audio_tokenizer_dict = {
  586. "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
  587. "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
  588. }
  589. # Update or overwrite, what do audio tokenizers expect when loading?
  590. output["audio_tokenizer"] = audio_tokenizer_dict
  591. output["processor_class"] = self.__class__.__name__
  592. return output
  593. def to_json_string(self, legacy_serialization=True) -> str:
  594. """
  595. Serializes this instance to a JSON string.
  596. Returns:
  597. `str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
  598. """
  599. dictionary = self.to_dict(legacy_serialization=legacy_serialization)
  600. return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
  601. def to_json_file(self, json_file_path: Union[str, os.PathLike], legacy_serialization=True):
  602. """
  603. Save this instance to a JSON file.
  604. Args:
  605. json_file_path (`str` or `os.PathLike`):
  606. Path to the JSON file in which this processor instance's parameters will be saved.
  607. """
  608. with open(json_file_path, "w", encoding="utf-8") as writer:
  609. writer.write(self.to_json_string(legacy_serialization=legacy_serialization))
  610. def __repr__(self):
  611. attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
  612. attributes_repr = "\n".join(attributes_repr)
  613. return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"
  614. def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_serialization: bool = True, **kwargs):
  615. """
  616. Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
  617. can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
  618. <Tip>
  619. This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
  620. [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]. Please refer to the docstrings of the
  621. methods above for more information.
  622. </Tip>
  623. Args:
  624. save_directory (`str` or `os.PathLike`):
  625. Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
  626. be created if it does not exist).
  627. push_to_hub (`bool`, *optional*, defaults to `False`):
  628. Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
  629. repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
  630. namespace).
  631. legacy_serialization (`bool`, *optional*, defaults to `True`):
  632. Whether or not to save processor attributes in separate config files (legacy) or in processor's config
  633. file as a nested dict. Saving all attributes in a single dict will become the default in future versions.
  634. Set to `legacy_serialization=True` until then.
  635. kwargs (`dict[str, Any]`, *optional*):
  636. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
  637. """
  638. use_auth_token = kwargs.pop("use_auth_token", None)
  639. if use_auth_token is not None:
  640. warnings.warn(
  641. "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
  642. FutureWarning,
  643. )
  644. if kwargs.get("token") is not None:
  645. raise ValueError(
  646. "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
  647. )
  648. kwargs["token"] = use_auth_token
  649. os.makedirs(save_directory, exist_ok=True)
  650. if push_to_hub:
  651. commit_message = kwargs.pop("commit_message", None)
  652. repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
  653. repo_id = self._create_repo(repo_id, **kwargs)
  654. files_timestamps = self._get_files_timestamps(save_directory)
  655. # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
  656. # loaded from the Hub.
  657. if self._auto_class is not None:
  658. attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
  659. configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
  660. configs.append(self)
  661. custom_object_save(self, save_directory, config=configs)
  662. save_jinja_files = kwargs.get("save_jinja_files", True)
  663. for attribute_name in self.attributes:
  664. # Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
  665. if attribute_name == "tokenizer":
  666. attribute = getattr(self, attribute_name)
  667. if hasattr(attribute, "_set_processor_class"):
  668. attribute._set_processor_class(self.__class__.__name__)
  669. # Propagate save_jinja_files to tokenizer to ensure we don't get conflicts
  670. attribute.save_pretrained(save_directory, save_jinja_files=save_jinja_files)
  671. elif legacy_serialization:
  672. attribute = getattr(self, attribute_name)
  673. # Include the processor class in attribute config so this processor can then be reloaded with `AutoProcessor` API.
  674. if hasattr(attribute, "_set_processor_class"):
  675. attribute._set_processor_class(self.__class__.__name__)
  676. attribute.save_pretrained(save_directory)
  677. if self._auto_class is not None:
  678. # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
  679. for attribute_name in self.attributes:
  680. attribute = getattr(self, attribute_name)
  681. if isinstance(attribute, PreTrainedTokenizerBase):
  682. del attribute.init_kwargs["auto_map"]
  683. # If we save using the predefined names, we can load using `from_pretrained`
  684. # plus we save chat_template in its own file
  685. output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
  686. output_chat_template_file_jinja = os.path.join(save_directory, CHAT_TEMPLATE_FILE)
  687. output_chat_template_file_legacy = os.path.join(
  688. save_directory, LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE
  689. ) # Legacy filename
  690. chat_template_dir = os.path.join(save_directory, CHAT_TEMPLATE_DIR)
  691. # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
  692. # to avoid serializing chat template in json config file. So let's get it from `self` directly
  693. if self.chat_template is not None:
  694. save_jinja_files = kwargs.get("save_jinja_files", True)
  695. is_single_template = isinstance(self.chat_template, str)
  696. if save_jinja_files and is_single_template:
  697. # New format for single templates is to save them as chat_template.jinja
  698. with open(output_chat_template_file_jinja, "w", encoding="utf-8") as f:
  699. f.write(self.chat_template)
  700. logger.info(f"chat template saved in {output_chat_template_file_jinja}")
  701. elif save_jinja_files and not is_single_template:
  702. # New format for multiple templates is to save the default as chat_template.jinja
  703. # and the other templates in the chat_templates/ directory
  704. for template_name, template in self.chat_template.items():
  705. if template_name == "default":
  706. with open(output_chat_template_file_jinja, "w", encoding="utf-8") as f:
  707. f.write(self.chat_template["default"])
  708. logger.info(f"chat template saved in {output_chat_template_file_jinja}")
  709. else:
  710. os.makedirs(chat_template_dir, exist_ok=True)
  711. template_filepath = os.path.join(chat_template_dir, f"{template_name}.jinja")
  712. with open(template_filepath, "w", encoding="utf-8") as f:
  713. f.write(template)
  714. logger.info(f"chat template saved in {template_filepath}")
  715. elif is_single_template:
  716. # Legacy format for single templates: Put them in chat_template.json
  717. chat_template_json_string = (
  718. json.dumps({"chat_template": self.chat_template}, indent=2, sort_keys=True) + "\n"
  719. )
  720. with open(output_chat_template_file_legacy, "w", encoding="utf-8") as writer:
  721. writer.write(chat_template_json_string)
  722. logger.info(f"chat template saved in {output_chat_template_file_legacy}")
  723. elif self.chat_template is not None:
  724. # At this point we have multiple templates in the legacy format, which is not supported
  725. # chat template dicts are saved to chat_template.json as lists of dicts with fixed key names.
  726. raise ValueError(
  727. "Multiple chat templates are not supported in the legacy format. Please save them as "
  728. "separate files using the `save_jinja_files` argument."
  729. )
  730. if legacy_serialization:
  731. output_audio_tokenizer_file = os.path.join(save_directory, AUDIO_TOKENIZER_NAME)
  732. processor_dict = self.to_dict()
  733. # For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and
  734. # `auto_map` is not specified.
  735. if set(processor_dict.keys()) != {"processor_class"}:
  736. self.to_json_file(output_processor_file)
  737. logger.info(f"processor saved in {output_processor_file}")
  738. if set(processor_dict.keys()) == {"processor_class"}:
  739. return_files = []
  740. else:
  741. return_files = [output_processor_file]
  742. if self.audio_tokenizer is not None:
  743. audio_tokenizer_class = self.audio_tokenizer.__class__.__name__
  744. audio_tokenizer_name_or_path = self.audio_tokenizer.name_or_path
  745. audio_tokenizer_dict = {
  746. "audio_tokenizer_class": audio_tokenizer_class,
  747. "audio_tokenizer_name_or_path": audio_tokenizer_name_or_path,
  748. }
  749. audio_tokenizer_json = json.dumps(audio_tokenizer_dict, indent=2, sort_keys=True) + "\n"
  750. with open(output_audio_tokenizer_file, "w", encoding="utf-8") as writer:
  751. writer.write(audio_tokenizer_json)
  752. # Create a unified `preprocessor_config.json` and save all attributes as a composite config, except for tokenizers
  753. # NOTE: this will become the default way to save all processor attrbiutes in future versions. Toggled off for now to give
  754. # us time for smoother transition
  755. else:
  756. self.to_json_file(output_processor_file, legacy_serialization=False)
  757. logger.info(f"processor saved in {output_processor_file}")
  758. return_files = [output_processor_file]
  759. if push_to_hub:
  760. self._upload_modified_files(
  761. save_directory,
  762. repo_id,
  763. files_timestamps,
  764. commit_message=commit_message,
  765. token=kwargs.get("token"),
  766. )
  767. return return_files
  768. @classmethod
  769. def get_processor_dict(
  770. cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
  771. ) -> tuple[dict[str, Any], dict[str, Any]]:
  772. """
  773. From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
  774. processor of type [`~processing_utils.ProcessingMixin`] using `from_args_and_dict`.
  775. Parameters:
  776. pretrained_model_name_or_path (`str` or `os.PathLike`):
  777. The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
  778. subfolder (`str`, *optional*, defaults to `""`):
  779. In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
  780. specify the folder name here.
  781. Returns:
  782. `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object.
  783. """
  784. # holding a copy for optionally loading the audio tokenizer (if available)
  785. audio_tokenizer_kwargs = copy.deepcopy(kwargs)
  786. cache_dir = kwargs.pop("cache_dir", None)
  787. force_download = kwargs.pop("force_download", False)
  788. resume_download = kwargs.pop("resume_download", None)
  789. proxies = kwargs.pop("proxies", None)
  790. token = kwargs.pop("token", None)
  791. local_files_only = kwargs.pop("local_files_only", False)
  792. revision = kwargs.pop("revision", None)
  793. subfolder = kwargs.pop("subfolder", "")
  794. from_pipeline = kwargs.pop("_from_pipeline", None)
  795. from_auto_class = kwargs.pop("_from_auto", False)
  796. user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
  797. if from_pipeline is not None:
  798. user_agent["using_pipeline"] = from_pipeline
  799. if is_offline_mode() and not local_files_only:
  800. logger.info("Offline mode: forcing local_files_only=True")
  801. local_files_only = True
  802. pretrained_model_name_or_path = str(pretrained_model_name_or_path)
  803. is_local = os.path.isdir(pretrained_model_name_or_path)
  804. if os.path.isdir(pretrained_model_name_or_path):
  805. processor_file = os.path.join(pretrained_model_name_or_path, PROCESSOR_NAME)
  806. additional_chat_template_files = {}
  807. resolved_additional_chat_template_files = {}
  808. if os.path.isfile(pretrained_model_name_or_path):
  809. resolved_processor_file = pretrained_model_name_or_path
  810. # can't load chat-template and audio tokenizer when given a file as pretrained_model_name_or_path
  811. resolved_chat_template_file = None
  812. resolved_raw_chat_template_file = None
  813. resolved_audio_tokenizer_file = None
  814. is_local = True
  815. elif is_remote_url(pretrained_model_name_or_path):
  816. processor_file = pretrained_model_name_or_path
  817. resolved_processor_file = download_url(pretrained_model_name_or_path)
  818. # can't load chat-template and audio tokenizer when given a file url as pretrained_model_name_or_path
  819. resolved_chat_template_file = None
  820. resolved_raw_chat_template_file = None
  821. resolved_audio_tokenizer_file = None
  822. else:
  823. if is_local:
  824. template_dir = Path(pretrained_model_name_or_path, CHAT_TEMPLATE_DIR)
  825. if template_dir.is_dir():
  826. for template_file in template_dir.glob("*.jinja"):
  827. template_name = template_file.stem
  828. additional_chat_template_files[template_name] = f"{CHAT_TEMPLATE_DIR}/{template_file.name}"
  829. else:
  830. try:
  831. for template in list_repo_templates(
  832. pretrained_model_name_or_path,
  833. local_files_only=local_files_only,
  834. revision=revision,
  835. cache_dir=cache_dir,
  836. token=token,
  837. ):
  838. additional_chat_template_files[template] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"
  839. except EntryNotFoundError:
  840. pass # No template dir means no template files
  841. processor_file = PROCESSOR_NAME
  842. try:
  843. # Load from local folder or from cache or download from model Hub and cache
  844. resolved_processor_file = cached_file(
  845. pretrained_model_name_or_path,
  846. processor_file,
  847. cache_dir=cache_dir,
  848. force_download=force_download,
  849. proxies=proxies,
  850. resume_download=resume_download,
  851. local_files_only=local_files_only,
  852. token=token,
  853. user_agent=user_agent,
  854. revision=revision,
  855. subfolder=subfolder,
  856. _raise_exceptions_for_missing_entries=False,
  857. )
  858. # chat_template.json is a legacy file used by the processor class
  859. # a raw chat_template.jinja is preferred in future
  860. resolved_chat_template_file = cached_file(
  861. pretrained_model_name_or_path,
  862. LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE,
  863. cache_dir=cache_dir,
  864. force_download=force_download,
  865. proxies=proxies,
  866. resume_download=resume_download,
  867. local_files_only=local_files_only,
  868. token=token,
  869. user_agent=user_agent,
  870. revision=revision,
  871. subfolder=subfolder,
  872. _raise_exceptions_for_missing_entries=False,
  873. )
  874. resolved_raw_chat_template_file = cached_file(
  875. pretrained_model_name_or_path,
  876. CHAT_TEMPLATE_FILE,
  877. cache_dir=cache_dir,
  878. force_download=force_download,
  879. proxies=proxies,
  880. resume_download=resume_download,
  881. local_files_only=local_files_only,
  882. token=token,
  883. user_agent=user_agent,
  884. revision=revision,
  885. subfolder=subfolder,
  886. _raise_exceptions_for_missing_entries=False,
  887. )
  888. resolved_additional_chat_template_files = {
  889. template_name: cached_file(
  890. pretrained_model_name_or_path,
  891. template_file,
  892. cache_dir=cache_dir,
  893. force_download=force_download,
  894. proxies=proxies,
  895. resume_download=resume_download,
  896. local_files_only=local_files_only,
  897. token=token,
  898. user_agent=user_agent,
  899. revision=revision,
  900. subfolder=subfolder,
  901. _raise_exceptions_for_missing_entries=False,
  902. )
  903. for template_name, template_file in additional_chat_template_files.items()
  904. }
  905. resolved_audio_tokenizer_file = cached_file(
  906. pretrained_model_name_or_path,
  907. AUDIO_TOKENIZER_NAME,
  908. cache_dir=cache_dir,
  909. force_download=force_download,
  910. proxies=proxies,
  911. resume_download=resume_download,
  912. local_files_only=local_files_only,
  913. token=token,
  914. user_agent=user_agent,
  915. revision=revision,
  916. subfolder=subfolder,
  917. _raise_exceptions_for_missing_entries=False,
  918. )
  919. except OSError:
  920. # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
  921. # the original exception.
  922. raise
  923. except Exception:
  924. # For any other exception, we throw a generic error.
  925. raise OSError(
  926. f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
  927. " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
  928. f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
  929. f" directory containing a {PROCESSOR_NAME} file"
  930. )
  931. # Add chat template as kwarg before returning because most models don't have processor config
  932. if resolved_chat_template_file is not None:
  933. # This is the legacy path
  934. with open(resolved_chat_template_file, encoding="utf-8") as reader:
  935. chat_template_json = json.loads(reader.read())
  936. chat_templates = {"default": chat_template_json["chat_template"]}
  937. if resolved_additional_chat_template_files:
  938. raise ValueError(
  939. "Cannot load chat template due to conflicting files - this checkpoint combines "
  940. "a legacy chat_template.json file with separate template files, which is not "
  941. "supported. To resolve this error, replace the legacy chat_template.json file "
  942. "with a modern chat_template.jinja file."
  943. )
  944. else:
  945. chat_templates = {
  946. template_name: open(template_file, "r", encoding="utf-8").read()
  947. for template_name, template_file in resolved_additional_chat_template_files.items()
  948. }
  949. if resolved_raw_chat_template_file is not None:
  950. with open(resolved_raw_chat_template_file, "r", encoding="utf-8") as reader:
  951. chat_templates["default"] = reader.read()
  952. if isinstance(chat_templates, dict) and "default" in chat_templates and len(chat_templates) == 1:
  953. chat_templates = chat_templates["default"] # Flatten when we just have a single template/file
  954. if chat_templates:
  955. kwargs["chat_template"] = chat_templates
  956. # Existing processors on the Hub created before #27761 being merged don't have `processor_config.json` (if not
  957. # updated afterward), and we need to keep `from_pretrained` work. So here it fallbacks to the empty dict.
  958. # (`cached_file` called using `_raise_exceptions_for_missing_entries=False` to avoid exception)
  959. # However, for models added in the future, we won't get the expected error if this file is missing.
  960. if resolved_processor_file is None:
  961. # In any case we need to pass `chat_template` if it is available
  962. processor_dict = {}
  963. else:
  964. try:
  965. # Load processor dict
  966. with open(resolved_processor_file, encoding="utf-8") as reader:
  967. text = reader.read()
  968. processor_dict = json.loads(text)
  969. except json.JSONDecodeError:
  970. raise OSError(
  971. f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
  972. )
  973. if is_local:
  974. logger.info(f"loading configuration file {resolved_processor_file}")
  975. else:
  976. logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}")
  977. if "chat_template" in processor_dict and processor_dict["chat_template"] is not None:
  978. logger.warning_once(
  979. "Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' "
  980. "in the processor's config. Make sure to move your template to its own file."
  981. )
  982. if "chat_template" in kwargs:
  983. processor_dict["chat_template"] = kwargs.pop("chat_template")
  984. # Audio tokenizer needs to load the model checkpoint first, because the saved
  985. # json file contains only references to the model path and repo id
  986. if resolved_audio_tokenizer_file is not None or "audio_tokenizer" in processor_dict:
  987. if resolved_audio_tokenizer_file is not None:
  988. reader = open(resolved_audio_tokenizer_file, "r", encoding="utf-8")
  989. audio_tokenizer_dict = reader.read()
  990. audio_tokenizer_dict = json.loads(audio_tokenizer_dict)
  991. else:
  992. audio_tokenizer_dict = processor_dict["audio_tokenizer"]
  993. audio_tokenizer_class = cls.get_possibly_dynamic_module(audio_tokenizer_dict["audio_tokenizer_class"])
  994. audio_tokenizer_path = audio_tokenizer_dict["audio_tokenizer_name_or_path"]
  995. processor_dict["audio_tokenizer"] = audio_tokenizer_class.from_pretrained(
  996. audio_tokenizer_path, **audio_tokenizer_kwargs
  997. )
  998. # Pop attributes if saved in a single processor dict, they are loaded in `_get_arguments_from_pretrained`
  999. for attribute in cls.attributes:
  1000. processor_dict.pop(attribute, None)
  1001. return processor_dict, kwargs
  1002. @classmethod
  1003. def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kwargs):
  1004. """
  1005. Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.
  1006. Args:
  1007. processor_dict (`dict[str, Any]`):
  1008. Dictionary that will be used to instantiate the processor object. Such a dictionary can be
  1009. retrieved from a pretrained checkpoint by leveraging the
  1010. [`~processing_utils.ProcessingMixin.to_dict`] method.
  1011. kwargs (`dict[str, Any]`):
  1012. Additional parameters from which to initialize the processor object.
  1013. Returns:
  1014. [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
  1015. parameters.
  1016. """
  1017. processor_dict = processor_dict.copy()
  1018. return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
  1019. # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
  1020. # If we don't pop, some specific kwargs will raise a warning
  1021. if "processor_class" in processor_dict:
  1022. del processor_dict["processor_class"]
  1023. if "auto_map" in processor_dict:
  1024. del processor_dict["auto_map"]
  1025. # override processor_dict with given kwargs
  1026. processor_dict.update(kwargs)
  1027. # check if there is an overlap between args and processor_dict
  1028. accepted_args_and_kwargs = cls.__init__.__code__.co_varnames[: cls.__init__.__code__.co_argcount][1:]
  1029. # validate both processor_dict and given kwargs
  1030. unused_kwargs, valid_kwargs = cls.validate_init_kwargs(
  1031. processor_config=processor_dict, valid_kwargs=accepted_args_and_kwargs
  1032. )
  1033. # update args that are already in processor_dict to avoid duplicate arguments
  1034. args_to_update = {
  1035. i: valid_kwargs.pop(arg)
  1036. for i, arg in enumerate(accepted_args_and_kwargs)
  1037. if (arg in valid_kwargs and i < len(args))
  1038. }
  1039. args = [args_to_update.get(i, arg) for i, arg in enumerate(args)]
  1040. # instantiate processor with used (and valid) kwargs only
  1041. processor = cls(*args, **valid_kwargs)
  1042. logger.info(f"Processor {processor}")
  1043. if return_unused_kwargs:
  1044. return processor, unused_kwargs
  1045. else:
  1046. return processor
  1047. def _merge_kwargs(
  1048. self,
  1049. ModelProcessorKwargs: ProcessingKwargs,
  1050. tokenizer_init_kwargs: Optional[dict] = None,
  1051. **kwargs,
  1052. ) -> dict[str, dict]:
  1053. """
  1054. Method to merge dictionaries of kwargs cleanly separated by modality within a Processor instance.
  1055. The order of operations is as follows:
  1056. 1) kwargs passed as before have highest priority to preserve BC.
  1057. ```python
  1058. high_priority_kwargs = {"crop_size" = {"height": 222, "width": 222}, "padding" = "max_length"}
  1059. processor(..., **high_priority_kwargs)
  1060. ```
  1061. 2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API.
  1062. ```python
  1063. processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": {"height": 222, "width": 222}}})
  1064. ```
  1065. 3) kwargs passed during instantiation of a modality processor have fourth priority.
  1066. ```python
  1067. tokenizer = tokenizer_class(..., {"padding": "max_length"})
  1068. image_processor = image_processor_class(...)
  1069. processor(tokenizer, image_processor) # will pass max_length unless overridden by kwargs at call
  1070. ```
  1071. 4) defaults kwargs specified at processor level have lowest priority.
  1072. ```python
  1073. class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False):
  1074. _defaults = {
  1075. "text_kwargs": {
  1076. "padding": "max_length",
  1077. "max_length": 64,
  1078. },
  1079. }
  1080. ```
  1081. Args:
  1082. ModelProcessorKwargs (`ProcessingKwargs`):
  1083. Typed dictionary of kwargs specifically required by the model passed.
  1084. tokenizer_init_kwargs (`Dict`, *optional*):
  1085. Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults.
  1086. Returns:
  1087. output_kwargs (`Dict`):
  1088. Dictionary of per-modality kwargs to be passed to each modality-specific processor.
  1089. """
  1090. # Initialize dictionaries
  1091. output_kwargs = {
  1092. "text_kwargs": {},
  1093. "images_kwargs": {},
  1094. "audio_kwargs": {},
  1095. "videos_kwargs": {},
  1096. "common_kwargs": {},
  1097. }
  1098. default_kwargs = {
  1099. "text_kwargs": {},
  1100. "images_kwargs": {},
  1101. "audio_kwargs": {},
  1102. "videos_kwargs": {},
  1103. "common_kwargs": {},
  1104. }
  1105. possible_modality_keywords = {"text", "audio", "videos", "images"}
  1106. used_keys = set()
  1107. # get defaults from set model processor kwargs if they exist
  1108. for modality in default_kwargs:
  1109. default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
  1110. # update defaults with arguments from tokenizer init
  1111. for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__:
  1112. # init with tokenizer init kwargs if necessary
  1113. if tokenizer_init_kwargs is not None and modality_key in tokenizer_init_kwargs:
  1114. value = (
  1115. getattr(self.tokenizer, modality_key)
  1116. if hasattr(self.tokenizer, modality_key)
  1117. else tokenizer_init_kwargs[modality_key]
  1118. )
  1119. default_kwargs[modality][modality_key] = value
  1120. # now defaults kwargs are updated with the tokenizers defaults.
  1121. # pass defaults to output dictionary
  1122. output_kwargs.update(default_kwargs)
  1123. # update modality kwargs with passed kwargs
  1124. non_modality_kwargs = set(kwargs) - set(output_kwargs)
  1125. for modality, output_kwarg in output_kwargs.items():
  1126. for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__:
  1127. # check if we received a structured kwarg dict or not to handle it correctly
  1128. if modality in kwargs:
  1129. kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
  1130. # check if this key was passed as a flat kwarg.
  1131. if kwarg_value != "__empty__" and modality_key in non_modality_kwargs:
  1132. raise ValueError(
  1133. f"Keyword argument {modality_key} was passed two times:\n"
  1134. f"in a dictionary for {modality} and as a **kwarg."
  1135. )
  1136. elif modality_key in kwargs:
  1137. # we get a modality_key instead of popping it because modality-specific processors
  1138. # can have overlapping kwargs
  1139. kwarg_value = kwargs.get(modality_key, "__empty__")
  1140. else:
  1141. kwarg_value = "__empty__"
  1142. if not isinstance(kwarg_value, str) or kwarg_value != "__empty__":
  1143. output_kwarg[modality_key] = kwarg_value
  1144. used_keys.add(modality_key)
  1145. # Determine if kwargs is a flat dictionary or contains nested dictionaries
  1146. if any(key in default_kwargs for key in kwargs):
  1147. # kwargs is dictionary-based, and some keys match modality names
  1148. for modality, subdict in kwargs.items():
  1149. if modality in default_kwargs:
  1150. for subkey, subvalue in subdict.items():
  1151. if subkey not in used_keys:
  1152. output_kwargs[modality][subkey] = subvalue
  1153. used_keys.add(subkey)
  1154. else:
  1155. # kwargs is a flat dictionary
  1156. for key, kwarg in kwargs.items():
  1157. if key not in used_keys:
  1158. if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__:
  1159. output_kwargs["common_kwargs"][key] = kwarg
  1160. elif key not in possible_modality_keywords:
  1161. logger.warning_once(
  1162. f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
  1163. )
  1164. # all modality-specific kwargs are updated with common kwargs
  1165. for kwarg in output_kwargs.values():
  1166. kwarg.update(output_kwargs["common_kwargs"])
  1167. return output_kwargs
  1168. @classmethod
  1169. def from_pretrained(
  1170. cls: type[SpecificProcessorType],
  1171. pretrained_model_name_or_path: Union[str, os.PathLike],
  1172. cache_dir: Optional[Union[str, os.PathLike]] = None,
  1173. force_download: bool = False,
  1174. local_files_only: bool = False,
  1175. token: Optional[Union[str, bool]] = None,
  1176. revision: str = "main",
  1177. **kwargs,
  1178. ) -> SpecificProcessorType:
  1179. r"""
  1180. Instantiate a processor associated with a pretrained model.
  1181. <Tip>
  1182. This class method is simply calling the feature extractor
  1183. [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
  1184. [`~image_processing_utils.ImageProcessingMixin`] and the tokenizer
  1185. [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
  1186. methods above for more information.
  1187. </Tip>
  1188. Args:
  1189. pretrained_model_name_or_path (`str` or `os.PathLike`):
  1190. This can be either:
  1191. - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
  1192. huggingface.co.
  1193. - a path to a *directory* containing a feature extractor file saved using the
  1194. [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
  1195. - a path or url to a saved feature extractor JSON *file*, e.g.,
  1196. `./my_model_directory/preprocessor_config.json`.
  1197. **kwargs
  1198. Additional keyword arguments passed along to both
  1199. [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
  1200. [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
  1201. """
  1202. kwargs["cache_dir"] = cache_dir
  1203. kwargs["force_download"] = force_download
  1204. kwargs["local_files_only"] = local_files_only
  1205. kwargs["revision"] = revision
  1206. use_auth_token = kwargs.pop("use_auth_token", None)
  1207. if use_auth_token is not None:
  1208. warnings.warn(
  1209. "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
  1210. FutureWarning,
  1211. )
  1212. if token is not None:
  1213. raise ValueError(
  1214. "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
  1215. )
  1216. token = use_auth_token
  1217. if token is not None:
  1218. kwargs["token"] = token
  1219. args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
  1220. processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
  1221. return cls.from_args_and_dict(args, processor_dict, **kwargs)
  1222. @classmethod
  1223. def register_for_auto_class(cls, auto_class="AutoProcessor"):
  1224. """
  1225. Register this class with a given auto class. This should only be used for custom feature extractors as the ones
  1226. in the library are already mapped with `AutoProcessor`.
  1227. Args:
  1228. auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`):
  1229. The auto class to register this new feature extractor with.
  1230. """
  1231. if not isinstance(auto_class, str):
  1232. auto_class = auto_class.__name__
  1233. import transformers.models.auto as auto_module
  1234. if not hasattr(auto_module, auto_class):
  1235. raise ValueError(f"{auto_class} is not a valid auto class.")
  1236. cls._auto_class = auto_class
  1237. @classmethod
  1238. def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
  1239. """
  1240. Identify and instantiate the subcomponents of Processor classes, like image processors and
  1241. tokenizers. This method uses the Processor attributes like `tokenizer_class` to figure out what class those
  1242. subcomponents should be. Note that any subcomponents must either be library classes that are accessible in
  1243. the `transformers` root, or they must be custom code that has been registered with the relevant autoclass,
  1244. via methods like `AutoTokenizer.register()`. If neither of these conditions are fulfilled, this method
  1245. will be unable to find the relevant subcomponent class and will raise an error.
  1246. """
  1247. args = []
  1248. for attribute_name in cls.attributes:
  1249. class_name = getattr(cls, f"{attribute_name}_class")
  1250. if isinstance(class_name, tuple):
  1251. classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name)
  1252. if attribute_name == "image_processor":
  1253. # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
  1254. use_fast = kwargs.get("use_fast")
  1255. if use_fast is None:
  1256. logger.warning_once(
  1257. "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
  1258. "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
  1259. "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
  1260. )
  1261. else:
  1262. use_fast = kwargs.get("use_fast", True)
  1263. if use_fast and classes[1] is not None:
  1264. attribute_class = classes[1]
  1265. else:
  1266. attribute_class = classes[0]
  1267. else:
  1268. attribute_class = cls.get_possibly_dynamic_module(class_name)
  1269. args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
  1270. return args
  1271. @staticmethod
  1272. def get_possibly_dynamic_module(module_name):
  1273. if hasattr(transformers_module, module_name):
  1274. return getattr(transformers_module, module_name)
  1275. lookup_locations = [
  1276. transformers_module.IMAGE_PROCESSOR_MAPPING,
  1277. transformers_module.VIDEO_PROCESSOR_MAPPING,
  1278. transformers_module.TOKENIZER_MAPPING,
  1279. transformers_module.FEATURE_EXTRACTOR_MAPPING,
  1280. transformers_module.MODEL_FOR_AUDIO_TOKENIZATION_MAPPING,
  1281. ]
  1282. for lookup_location in lookup_locations:
  1283. for custom_class in lookup_location._extra_content.values():
  1284. if isinstance(custom_class, tuple):
  1285. for custom_subclass in custom_class:
  1286. if custom_subclass is not None and custom_subclass.__name__ == module_name:
  1287. return custom_subclass
  1288. elif custom_class is not None and custom_class.__name__ == module_name:
  1289. return custom_class
  1290. raise ValueError(
  1291. f"Could not find module {module_name} in `transformers`. If this is a custom class, "
  1292. f"it should be registered using the relevant `AutoClass.register()` function so that "
  1293. f"other functions can find it!"
  1294. )
  1295. def batch_decode(self, *args, **kwargs):
  1296. """
  1297. This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
  1298. refer to the docstring of this method for more information.
  1299. """
  1300. if not hasattr(self, "tokenizer"):
  1301. raise ValueError(f"Cannot batch decode text: {self.__class__.__name__} has no tokenizer.")
  1302. return self.tokenizer.batch_decode(*args, **kwargs)
  1303. def decode(self, *args, **kwargs):
  1304. """
  1305. This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
  1306. the docstring of this method for more information.
  1307. """
  1308. if not hasattr(self, "tokenizer"):
  1309. raise ValueError(f"Cannot decode text: {self.__class__.__name__} has no tokenizer.")
  1310. return self.tokenizer.decode(*args, **kwargs)
  1311. @property
  1312. def model_input_names(self):
  1313. model_input_names = []
  1314. for attribute_name in self.attributes:
  1315. attribute = getattr(self, attribute_name, None)
  1316. attr_input_names = getattr(attribute, "model_input_names")
  1317. model_input_names.extend(attr_input_names)
  1318. return model_input_names
  1319. @staticmethod
  1320. def validate_init_kwargs(processor_config, valid_kwargs):
  1321. kwargs_from_config = set(processor_config.keys())
  1322. valid_kwargs_set = set(valid_kwargs)
  1323. unused_keys = kwargs_from_config - valid_kwargs_set
  1324. valid_keys = kwargs_from_config & valid_kwargs_set
  1325. unused_kwargs = {k: processor_config[k] for k in unused_keys} if unused_keys else {}
  1326. valid_kwargs = {k: processor_config[k] for k in valid_keys} if valid_keys else {}
  1327. return unused_kwargs, valid_kwargs
  1328. @deprecate_kwarg("video_fps", version="4.58", new_name="fps")
  1329. @deprecate_kwarg(
  1330. "video_load_backend",
  1331. version="4.59",
  1332. additional_message=". This function will use `torchcodec` by default, or `torchvision` if `torchcodec` is not installed.",
  1333. )
  1334. def apply_chat_template(
  1335. self,
  1336. conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
  1337. chat_template: Optional[str] = None,
  1338. **kwargs: Unpack[AllKwargsForChatTemplate],
  1339. ) -> str:
  1340. """
  1341. Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
  1342. conversations to turn them into a single tokenizable string.
  1343. The input is expected to be in the following format, where each message content is a list consisting of text and
  1344. optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
  1345. `pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.
  1346. conversation = [
  1347. {
  1348. "role": "user",
  1349. "content": [
  1350. {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
  1351. {"type": "text", "text": "Please describe this image in detail."},
  1352. ],
  1353. },
  1354. ]
  1355. Args:
  1356. conversation (`Union[list[Dict, [str, str]], list[list[dict[str, str]]]]`):
  1357. The conversation to format.
  1358. chat_template (`Optional[str]`, *optional*):
  1359. The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
  1360. chat template is used.
  1361. """
  1362. if chat_template is None:
  1363. if isinstance(self.chat_template, dict) and "default" in self.chat_template:
  1364. chat_template = self.chat_template["default"]
  1365. elif isinstance(self.chat_template, dict):
  1366. raise ValueError(
  1367. 'The processor has multiple chat templates but none of them are named "default". You need to specify'
  1368. " which one to use by passing the `chat_template` argument. Available templates are: "
  1369. f"{', '.join(self.chat_template.keys())}"
  1370. )
  1371. elif self.chat_template is not None:
  1372. chat_template = self.chat_template
  1373. else:
  1374. raise ValueError(
  1375. "Cannot use apply_chat_template because this processor does not have a chat template."
  1376. )
  1377. else:
  1378. if isinstance(self.chat_template, dict) and chat_template in self.chat_template:
  1379. # It's the name of a template, not a full template string
  1380. chat_template = self.chat_template[chat_template]
  1381. else:
  1382. # It's a template string, render it directly
  1383. pass
  1384. is_tokenizers_fast = hasattr(self, "tokenizer") and self.tokenizer.__class__.__name__.endswith("Fast")
  1385. if kwargs.get("continue_final_message", False):
  1386. if kwargs.get("add_generation_prompt", False):
  1387. raise ValueError(
  1388. "continue_final_message and add_generation_prompt are not compatible. Use continue_final_message when you want the model to continue the final message, and add_generation_prompt when you want to add a header that will prompt it to start a new assistant message instead."
  1389. )
  1390. if kwargs.get("return_assistant_tokens_mask", False):
  1391. raise ValueError("continue_final_message is not compatible with return_assistant_tokens_mask.")
  1392. if kwargs.get("return_assistant_tokens_mask", False):
  1393. if not is_tokenizers_fast:
  1394. raise ValueError(
  1395. "`return_assistant_tokens_mask` is not possible with slow tokenizers. Make sure you have `tokenizers` installed. "
  1396. "If the error persists, open an issue to support a Fast tokenizer for your model."
  1397. )
  1398. else:
  1399. kwargs["return_offsets_mapping"] = True # force offset mapping so we can infer token boundaries
  1400. # Fill sets of kwargs that should be used by different parts of template
  1401. processed_kwargs = {
  1402. "mm_load_kwargs": {},
  1403. "template_kwargs": {},
  1404. }
  1405. for kwarg_type in processed_kwargs:
  1406. for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__:
  1407. kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
  1408. default_value = getattr(kwarg_type_defaults, key, None)
  1409. value = kwargs.pop(key, default_value)
  1410. if value is not None and not isinstance(value, dict):
  1411. processed_kwargs[kwarg_type][key] = value
  1412. # pop unused and deprecated kwarg
  1413. kwargs.pop("video_load_backend", None)
  1414. # Pass unprocessed custom kwargs
  1415. processed_kwargs["template_kwargs"].update(kwargs)
  1416. if isinstance(conversation, (list, tuple)) and (
  1417. isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
  1418. ):
  1419. is_batched = True
  1420. conversations = conversation
  1421. else:
  1422. is_batched = False
  1423. conversations = [conversation]
  1424. tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
  1425. return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
  1426. mm_load_kwargs = processed_kwargs["mm_load_kwargs"]
  1427. if tokenize:
  1428. batch_images, batch_videos = [], []
  1429. batch_audios = []
  1430. for conversation in conversations:
  1431. images, videos = [], []
  1432. for message in conversation:
  1433. visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
  1434. audio_fnames = [
  1435. content[key]
  1436. for content in message["content"]
  1437. for key in ["audio", "url", "path"]
  1438. if key in content and content["type"] == "audio"
  1439. ]
  1440. image_fnames = [
  1441. vision_info[key]
  1442. for vision_info in visuals
  1443. for key in ["image", "url", "path", "base64"]
  1444. if key in vision_info and vision_info["type"] == "image"
  1445. ]
  1446. images.extend(image_fnames)
  1447. video_fnames = [
  1448. vision_info[key]
  1449. for vision_info in visuals
  1450. for key in ["video", "url", "path"]
  1451. if key in vision_info and vision_info["type"] == "video"
  1452. ]
  1453. videos.extend(video_fnames)
  1454. # Audio models do not accept nested list of audios (yet!) so we construct a flat input audio list
  1455. if not mm_load_kwargs["load_audio_from_video"]:
  1456. for fname in audio_fnames:
  1457. batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
  1458. else:
  1459. for fname in video_fnames:
  1460. batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
  1461. # Currently all processors can accept nested list of batches, but not flat list of visuals
  1462. # So we'll make a batched list of images and let the processor handle it
  1463. batch_images.append(images)
  1464. batch_videos.append(videos)
  1465. prompt, generation_indices = render_jinja_template(
  1466. conversations=conversations,
  1467. chat_template=chat_template,
  1468. **processed_kwargs["template_kwargs"], # different flags such as `return_assistant_mask`
  1469. **self.tokenizer.special_tokens_map, # tokenizer special tokens are used by some templates
  1470. )
  1471. if not is_batched:
  1472. prompt = prompt[0]
  1473. if tokenize:
  1474. # Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
  1475. # But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
  1476. # and pass it to the processor. Users thus never worried about special tokens relying on processor handling
  1477. # everything internally. The below line is to keep BC for that and be able to work with model that have
  1478. # special tokens in the template (consistent with tokenizers). We dont want to raise warning, it will flood command line
  1479. # without actionable solution for users
  1480. single_prompt = prompt[0] if is_batched else prompt
  1481. if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
  1482. kwargs["add_special_tokens"] = False
  1483. # Always sample frames by default unless explicitly set to `False` by users. If users do not pass `num_frames`/`fps`
  1484. # sampling should not done for BC.
  1485. if "do_sample_frames" not in kwargs and (
  1486. kwargs.get("fps") is not None or kwargs.get("num_frames") is not None
  1487. ):
  1488. kwargs["do_sample_frames"] = True
  1489. images_exist = any((im is not None) for im_list in batch_images for im in im_list)
  1490. videos_exist = any((vid is not None) for vid_list in batch_videos for vid in vid_list)
  1491. out = self(
  1492. text=prompt,
  1493. images=batch_images if images_exist else None,
  1494. videos=batch_videos if videos_exist else None,
  1495. audio=batch_audios if batch_audios else None,
  1496. **kwargs,
  1497. )
  1498. if return_dict:
  1499. if processed_kwargs["template_kwargs"].get("return_assistant_tokens_mask", False):
  1500. assistant_masks = []
  1501. offset_mapping = out.pop("offset_mapping")
  1502. input_ids = out["input_ids"]
  1503. for i in range(len(input_ids)):
  1504. current_mask = [0] * len(input_ids[i])
  1505. offsets = offset_mapping[i]
  1506. offset_starts = [start for start, end in offsets]
  1507. for assistant_start_char, assistant_end_char in generation_indices[i]:
  1508. start_pos = bisect.bisect_left(offset_starts, assistant_start_char)
  1509. end_pos = bisect.bisect_left(offset_starts, assistant_end_char)
  1510. if not (
  1511. start_pos >= 0
  1512. and offsets[start_pos][0] <= assistant_start_char < offsets[start_pos][1]
  1513. ):
  1514. # start_token is out of bounds maybe due to truncation.
  1515. continue
  1516. for token_id in range(start_pos, end_pos if end_pos else len(input_ids[i])):
  1517. current_mask[token_id] = 1
  1518. assistant_masks.append(current_mask)
  1519. out["assistant_masks"] = assistant_masks
  1520. out.convert_to_tensors(tensor_type=kwargs.get("return_tensors"))
  1521. return out
  1522. else:
  1523. return out["input_ids"]
  1524. return prompt
  1525. def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs):
  1526. """
  1527. Post-process the output of a vlm to decode the text.
  1528. Args:
  1529. generated_outputs (`torch.Tensor` or `np.ndarray`):
  1530. The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
  1531. or `(sequence_length,)`.
  1532. skip_special_tokens (`bool`, *optional*, defaults to `True`):
  1533. Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
  1534. **kwargs:
  1535. Additional arguments to be passed to the tokenizer's `batch_decode method`.
  1536. Returns:
  1537. `list[str]`: The decoded text.
  1538. """
  1539. return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
  1540. def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
  1541. """
  1542. Checks that number of special tokens in text and processed text is same. The count can be different
  1543. if tokenized text was truncated, leading to issues in model code.
  1544. """
  1545. for modality in modalities:
  1546. token_str = getattr(self, f"{modality}_token")
  1547. token_id = getattr(self, f"{modality}_token_id")
  1548. ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]]
  1549. text_count = [sample.count(token_str) for sample in text]
  1550. if ids_count != text_count:
  1551. raise ValueError(
  1552. f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
  1553. "Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
  1554. )
# Rebind `push_to_hub` on `ProcessorMixin` to the result of `copy_func`, then, if that function has a docstring,
# fill in its `{object}`, `{object_class}` and `{object_files}` placeholders with processor-specific wording.
# NOTE(review): presumably `copy_func` returns an independent copy so the docstring edit does not leak into the
# function this was taken from; confirm against `copy_func`.
ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub)
if ProcessorMixin.push_to_hub.__doc__ is not None:
    ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format(
        object="processor", object_class="AutoProcessor", object_files="processor files"
    )