video_processing_utils.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895
  1. # coding=utf-8
  2. # Copyright 2025 The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import json
  16. import os
  17. import warnings
  18. from copy import deepcopy
  19. from functools import partial
  20. from typing import Any, Callable, Optional, Union
  21. import numpy as np
  22. from .dynamic_module_utils import custom_object_save
  23. from .image_processing_utils import (
  24. BatchFeature,
  25. get_size_dict,
  26. )
  27. from .image_processing_utils_fast import BaseImageProcessorFast
  28. from .image_utils import (
  29. ChannelDimension,
  30. SizeDict,
  31. validate_kwargs,
  32. )
  33. from .processing_utils import Unpack, VideosKwargs
  34. from .utils import (
  35. IMAGE_PROCESSOR_NAME,
  36. PROCESSOR_NAME,
  37. VIDEO_PROCESSOR_NAME,
  38. TensorType,
  39. add_start_docstrings,
  40. copy_func,
  41. download_url,
  42. is_offline_mode,
  43. is_remote_url,
  44. is_torch_available,
  45. is_torchcodec_available,
  46. is_torchvision_v2_available,
  47. logging,
  48. )
  49. from .utils.hub import cached_file
  50. from .utils.import_utils import requires
  51. from .video_utils import (
  52. VideoInput,
  53. VideoMetadata,
  54. group_videos_by_shape,
  55. is_valid_video,
  56. load_video,
  57. make_batched_metadata,
  58. make_batched_videos,
  59. reorder_videos,
  60. to_channel_dimension_format,
  61. )
  62. if is_torch_available():
  63. import torch
  64. if is_torchvision_v2_available():
  65. from torchvision.transforms.v2 import functional as F
  66. logger = logging.get_logger(__name__)
  # Shared argument documentation that is attached (through `add_start_docstrings`) to the base video
  # processor class and to its `preprocess` method.
  BASE_VIDEO_PROCESSOR_DOCSTRING = r"""
      Args:
          do_resize (`bool`, *optional*, defaults to `self.do_resize`):
              Whether to resize the video's (height, width) dimensions to the specified `size`. Can be overridden by the
              `do_resize` parameter in the `preprocess` method.
          size (`dict`, *optional*, defaults to `self.size`):
              Size of the output video after resizing. Can be overridden by the `size` parameter in the `preprocess`
              method.
          size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
              The size by which to make sure both the height and width can be divided.
          default_to_square (`bool`, *optional*, defaults to `self.default_to_square`):
              Whether to default to a square video when resizing, if size is an int.
          resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
              Resampling filter to use if resizing the video. Only has an effect if `do_resize` is set to `True`. Can be
              overridden by the `resample` parameter in the `preprocess` method.
          do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
              Whether to center crop the video to the specified `crop_size`. Can be overridden by `do_center_crop` in the
              `preprocess` method.
          crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
              Size of the output video after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
              method.
          do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
              Whether to rescale the video by the specified scale `rescale_factor`. Can be overridden by the
              `do_rescale` parameter in the `preprocess` method.
          rescale_factor (`int` or `float`, *optional*, defaults to `self.rescale_factor`):
              Scale factor to use if rescaling the video. Only has an effect if `do_rescale` is set to `True`. Can be
              overridden by the `rescale_factor` parameter in the `preprocess` method.
          do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
              Whether to normalize the video. Can be overridden by the `do_normalize` parameter in the `preprocess`
              method.
          image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
              Mean to use if normalizing the video. This is a float or list of floats the length of the number of
              channels in the video. Can be overridden by the `image_mean` parameter in the `preprocess` method.
          image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
              Standard deviation to use if normalizing the video. This is a float or list of floats the length of the
              number of channels in the video. Can be overridden by the `image_std` parameter in the `preprocess` method.
          do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
              Whether to convert the video to RGB.
          video_metadata (`VideoMetadata`, *optional*):
              Metadata of the video containing information about total duration, fps and total number of frames.
          do_sample_frames (`bool`, *optional*, defaults to `self.do_sample_frames`):
              Whether to sample frames from the video before processing or to process the whole video.
          num_frames (`int`, *optional*, defaults to `self.num_frames`):
              Maximum number of frames to sample when `do_sample_frames=True`.
          fps (`int` or `float`, *optional*, defaults to `self.fps`):
              Target frames to sample per second when `do_sample_frames=True`.
          return_tensors (`str` or `TensorType`, *optional*):
              Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
          data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
              The channel dimension format for the output video. Can be one of:
              - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_channels, height, width) format.
              - `"channels_last"` or `ChannelDimension.LAST`: video in (height, width, num_channels) format.
              - Unset: Use the channel dimension format of the input video.
          input_data_format (`ChannelDimension` or `str`, *optional*):
              The channel dimension format for the input video. If unset, the channel dimension format is inferred
              from the input video. Can be one of:
              - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_channels, height, width) format.
              - `"channels_last"` or `ChannelDimension.LAST`: video in (height, width, num_channels) format.
              - `"none"` or `ChannelDimension.NONE`: video in (height, width) format.
          device (`torch.device`, *optional*):
              The device to process the videos on. If unset, the device is inferred from the input videos.
          return_metadata (`bool`, *optional*):
              Whether to return video metadata or not.
  """
  133. @add_start_docstrings(
  134. "Constructs a base VideoProcessor.",
  135. BASE_VIDEO_PROCESSOR_DOCSTRING,
  136. )
  137. @requires(backends=("vision", "torchvision"))
  138. class BaseVideoProcessor(BaseImageProcessorFast):
  139. _auto_class = None
  140. resample = None
  141. image_mean = None
  142. image_std = None
  143. size = None
  144. size_divisor = None
  145. default_to_square = True
  146. crop_size = None
  147. do_resize = None
  148. do_center_crop = None
  149. do_rescale = None
  150. rescale_factor = 1 / 255
  151. do_normalize = None
  152. do_convert_rgb = None
  153. do_sample_frames = None
  154. fps = None
  155. num_frames = None
  156. video_metadata = None
  157. return_metadata = False
  158. valid_kwargs = VideosKwargs
  159. model_input_names = ["pixel_values_videos"]
  def __init__(self, **kwargs: Unpack[VideosKwargs]) -> None:
      """
      Build the processor from keyword arguments.

      Every keyword argument is first mirrored onto the instance as an attribute. `size` and `crop_size` are
      then normalised through `get_size_dict`, and finally each key declared in `valid_kwargs` is guaranteed to
      exist on the instance (explicit value, else a deep copy of the current/class-level value, else `None`).

      Raises:
          AttributeError: re-raised (after logging) when a keyword argument cannot be set on the instance.
      """
      super().__init__()
      self._processor_class = kwargs.pop("processor_class", None)

      # Mirror all remaining keyword arguments onto the instance, including ones without class-level defaults.
      for attr_name, attr_value in kwargs.items():
          try:
              setattr(self, attr_name, attr_value)
          except AttributeError as err:
              logger.error(f"Can't set {attr_name} with value {attr_value} for {self}")
              raise err

      # Normalise the size-related entries. `default_to_square` is only consumed when a size is present.
      requested_size = kwargs.pop("size", self.size)
      if requested_size is None:
          self.size = None
      else:
          square_by_default = kwargs.pop("default_to_square", self.default_to_square)
          self.size = get_size_dict(size=requested_size, default_to_square=square_by_default)

      requested_crop = kwargs.pop("crop_size", self.crop_size)
      if requested_crop is None:
          self.crop_size = None
      else:
          self.crop_size = get_size_dict(requested_crop, param_name="crop_size")

      # Remember which keys are valid for processing and make sure each of them is an instance attribute.
      # Values not passed explicitly are deep-copied so instances never share mutable defaults.
      self.model_valid_processing_keys = list(self.valid_kwargs.__annotations__.keys())
      for key in self.model_valid_processing_keys:
          explicit_value = kwargs.get(key)
          if explicit_value is None:
              setattr(self, key, deepcopy(getattr(self, key, None)))
          else:
              setattr(self, key, explicit_value)
  186. def __call__(self, videos, **kwargs) -> BatchFeature:
  187. return self.preprocess(videos, **kwargs)
  def convert_to_rgb(
      self,
      video: "torch.Tensor",
  ) -> "torch.Tensor":
      """
      Converts a video to RGB format.

      Args:
          video (`"torch.Tensor"`):
              The video to convert. The channel axis is read at position `-3`
              (NOTE(review): presumably `(..., channels, height, width)` — confirm against callers).

      Returns:
          `torch.Tensor`: The converted video with 3 channels. When an alpha channel had to be blended the
          result is a floating point tensor; otherwise the input dtype is kept.
      """
      video = F.grayscale_to_rgb(video)
      if video.shape[-3] == 3:
          return video

      # More than three channels: treat channel 3 as alpha on a 0-255 scale.
      alpha_channel = video[..., 3, :, :]
      rgb = video[..., :3, :, :]

      if not (alpha_channel < 255).any():
          # Fully opaque: nothing to blend, but the alpha channel must still be dropped so the
          # result really is RGB (previously the 4-channel tensor was returned unchanged).
          return rgb

      # There is a transparency layer, blend it with a white background.
      # Calculate the alpha proportion for blending.
      alpha = alpha_channel / 255.0
      return (1 - alpha[..., None, :, :]) * 255 + alpha[..., None, :, :] * rgb
  208. def sample_frames(
  209. self,
  210. metadata: VideoMetadata,
  211. num_frames: Optional[int] = None,
  212. fps: Optional[Union[int, float]] = None,
  213. **kwargs,
  214. ):
  215. """
  216. Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
  217. If `fps` is passed along with metadata, `fps` frames per second are sampled uniformty. Arguments `num_frames`
  218. and `fps` are mutually exclusive.
  219. Args:
  220. metadata (`VideoMetadata`):
  221. Metadata of the video containing information about total duration, fps and total number of frames.
  222. num_frames (`int`, *optional*):
  223. Maximum number of frames to sample. Defaults to `self.num_frames`.
  224. fps (`int` or `float`, *optional*):
  225. Target frames to sample per second. Defaults to `self.fps`.
  226. Returns:
  227. np.ndarray:
  228. Indices to sample video frames.
  229. """
  230. if fps is not None and num_frames is not None:
  231. raise ValueError(
  232. "`num_frames`, `fps`, and `sample_indices_fn` are mutually exclusive arguments, please use only one!"
  233. )
  234. num_frames = num_frames if num_frames is not None else self.num_frames
  235. fps = fps if fps is not None else self.fps
  236. total_num_frames = metadata.total_num_frames
  237. # If num_frames is not given but fps is, calculate num_frames from fps
  238. if num_frames is None and fps is not None:
  239. if metadata is None or metadata.fps is None:
  240. raise ValueError(
  241. "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
  242. "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
  243. )
  244. num_frames = int(total_num_frames / metadata.fps * fps)
  245. if num_frames > total_num_frames:
  246. raise ValueError(
  247. f"Video can't be sampled. The `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
  248. )
  249. if num_frames is not None:
  250. indices = torch.arange(0, total_num_frames, total_num_frames / num_frames).int()
  251. else:
  252. indices = torch.arange(0, total_num_frames).int()
  253. return indices
  def _decode_and_sample_videos(
      self,
      videos: VideoInput,
      video_metadata: Union[VideoMetadata, dict],
      do_sample_frames: Optional[bool] = None,
      sample_indices_fn: Optional[Callable] = None,
  ) -> tuple[list["torch.Tensor"], Any]:
      """
      Decode input videos and sample frames if needed.

      Args:
          videos (`VideoInput`):
              Videos to prepare; batched through `make_batched_videos`.
          video_metadata (`VideoMetadata` or `dict`):
              Metadata for the videos; batched through `make_batched_metadata`.
          do_sample_frames (`bool`, *optional*):
              Whether frames should be sampled from already-decoded videos.
          sample_indices_fn (`Callable`, *optional*):
              Called as `sample_indices_fn(metadata=...)` to obtain frame indices for decoded videos, and
              forwarded to `fetch_videos` for inputs that still have to be loaded.

      Returns:
          A `(videos, video_metadata)` tuple.

      Raises:
          ValueError: if frame sampling is requested for videos passed as lists of images.
      """
      videos = make_batched_videos(videos)
      video_metadata = make_batched_metadata(videos, video_metadata=video_metadata)

      # Already-decoded (array) videos: only sample frames if asked to, nothing has to be loaded.
      if is_valid_video(videos[0]):
          if do_sample_frames:
              sampled_videos = []
              sampled_metadata = []
              for video, metadata in zip(videos, video_metadata):
                  indices = sample_indices_fn(metadata=metadata)
                  # Record which frames were kept on the metadata object itself.
                  metadata.frames_indices = indices
                  sampled_videos.append(video[indices])
                  sampled_metadata.append(metadata)
              videos = sampled_videos
              video_metadata = sampled_metadata
          return videos, video_metadata

      if isinstance(videos[0], list):
          # Fail fast: reject the unsupported combination before any image is fetched.
          if do_sample_frames:
              raise ValueError(
                  "Sampling frames from a list of images is not supported! Set `do_sample_frames=False`."
              )
          # Videos sometimes are passed as a list of image URLs, especially through templates
          videos = [
              torch.stack([F.pil_to_tensor(image) for image in images], dim=0)
              for images in self.fetch_images(videos)
          ]
      else:
          # Anything else is handed to `fetch_videos`, which decodes and samples in one go.
          videos, video_metadata = self.fetch_videos(videos, sample_indices_fn=sample_indices_fn)

      return videos, video_metadata
  291. def _prepare_input_videos(
  292. self,
  293. videos: VideoInput,
  294. input_data_format: Optional[Union[str, ChannelDimension]] = None,
  295. device: Optional[str] = None,
  296. ) -> list["torch.Tensor"]:
  297. """
  298. Prepare the input videos for processing.
  299. """
  300. processed_videos = []
  301. for video in videos:
  302. # `make_batched_videos` always returns a 4D array per video
  303. if isinstance(video, np.ndarray):
  304. video = to_channel_dimension_format(video, ChannelDimension.FIRST, input_data_format)
  305. # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
  306. video = torch.from_numpy(video).contiguous()
  307. if device is not None:
  308. video = video.to(device)
  309. processed_videos.append(video)
  310. return processed_videos
  @add_start_docstrings(
      BASE_VIDEO_PROCESSOR_DOCSTRING,
  )
  def preprocess(
      self,
      videos: VideoInput,
      **kwargs: Unpack[VideosKwargs],
  ) -> BatchFeature:
      # Entry point for video preprocessing: validates the keyword arguments, fills in instance defaults,
      # decodes/samples the videos, converts them to tensors and delegates the transforms to `_preprocess`.
      accepted_keys = list(self.valid_kwargs.__annotations__.keys())
      validate_kwargs(
          captured_kwargs=kwargs.keys(),
          valid_processor_keys=accepted_keys + ["return_tensors"],
      )

      # Set default kwargs from self. This ensures that if a kwarg is not provided
      # by the user, it gets its default value from the instance, or is set to None.
      for name in accepted_keys:
          kwargs.setdefault(name, getattr(self, name, None))

      # These four are consumed here and must not reach the sampling function or `_preprocess`.
      input_data_format = kwargs.pop("input_data_format")
      do_sample_frames = kwargs.pop("do_sample_frames")
      device = kwargs.pop("device")
      video_metadata = kwargs.pop("video_metadata")

      # The sampler is bound to the remaining kwargs so later only `metadata` has to be supplied.
      if do_sample_frames:
          sample_indices_fn = partial(self.sample_frames, **kwargs)
      else:
          sample_indices_fn = None

      videos, video_metadata = self._decode_and_sample_videos(
          videos,
          video_metadata=video_metadata,
          do_sample_frames=do_sample_frames,
          sample_indices_fn=sample_indices_fn,
      )
      videos = self._prepare_input_videos(videos=videos, input_data_format=input_data_format, device=device)

      kwargs = self._further_process_kwargs(**kwargs)
      self._validate_preprocess_kwargs(**kwargs)

      # Pop kwargs that are not needed in _preprocess
      kwargs.pop("data_format")
      return_metadata = kwargs.pop("return_metadata")

      batch = self._preprocess(videos=videos, **kwargs)
      if return_metadata:
          batch["video_metadata"] = video_metadata
      return batch
  def _preprocess(
      self,
      videos: list["torch.Tensor"],
      do_convert_rgb: bool,
      do_resize: bool,
      size: SizeDict,
      interpolation: Optional["F.InterpolationMode"],
      do_center_crop: bool,
      crop_size: SizeDict,
      do_rescale: bool,
      rescale_factor: float,
      do_normalize: bool,
      image_mean: Optional[Union[float, list[float]]],
      image_std: Optional[Union[float, list[float]]],
      return_tensors: Optional[Union[str, TensorType]] = None,
      **kwargs,
  ) -> BatchFeature:
      """
      Apply the configured transforms to already-decoded videos.

      Videos are grouped by shape so each transform runs once per group: first RGB conversion and resizing,
      then (after regrouping) center cropping and the fused rescale/normalize step. The result is returned
      under the `pixel_values_videos` key, stacked into one tensor when `return_tensors` is set.
      """
      # Stage 1: group videos by size for batched RGB conversion and resizing.
      shape_groups, shape_index = group_videos_by_shape(videos)
      resized_by_shape = {}
      for shape, batch in shape_groups.items():
          if do_convert_rgb:
              batch = self.convert_to_rgb(batch)
          if do_resize:
              batch = self.resize(batch, size=size, interpolation=interpolation)
          resized_by_shape[shape] = batch
      resized_videos = reorder_videos(resized_by_shape, shape_index)

      # Stage 2: group again for further processing.
      # Needed in case do_resize is False, or resize returns videos with different sizes
      shape_groups, shape_index = group_videos_by_shape(resized_videos)
      processed_by_shape = {}
      for shape, batch in shape_groups.items():
          if do_center_crop:
              batch = self.center_crop(batch, crop_size)
          # Fused rescale and normalize
          processed_by_shape[shape] = self.rescale_and_normalize(
              batch, do_rescale, rescale_factor, do_normalize, image_mean, image_std
          )
      processed_videos = reorder_videos(processed_by_shape, shape_index)

      if return_tensors:
          processed_videos = torch.stack(processed_videos, dim=0)
      return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)
  @classmethod
  def from_pretrained(
      cls,
      pretrained_model_name_or_path: Union[str, os.PathLike],
      cache_dir: Optional[Union[str, os.PathLike]] = None,
      force_download: bool = False,
      local_files_only: bool = False,
      token: Optional[Union[str, bool]] = None,
      revision: str = "main",
      **kwargs,
  ):
      r"""
      Instantiate a video processor from a saved video processor configuration.

      Args:
          pretrained_model_name_or_path (`str` or `os.PathLike`):
              This can be either:

              - a string, the *model id* of a pretrained video processor hosted inside a model repo on
                huggingface.co.
              - a path to a *directory* containing a video processor file saved using the `save_pretrained`
                method of this class, e.g., `./my_model_directory/`.
              - a path or url to a saved video processor JSON *file*, e.g.,
                `./my_model_directory/video_preprocessor_config.json`.
          cache_dir (`str` or `os.PathLike`, *optional*):
              Path to a directory in which a downloaded pretrained model video processor should be cached if the
              standard cache should not be used.
          force_download (`bool`, *optional*, defaults to `False`):
              Whether or not to force to (re-)download the video processor files and override the cached versions
              if they exist.
          local_files_only (`bool`, *optional*, defaults to `False`):
              Forwarded to `get_video_processor_dict` under the same name.
          resume_download:
              Deprecated and ignored. All downloads are now resumed by default when possible.
              Will be removed in v5 of Transformers.
          proxies (`dict[str, str]`, *optional*):
              A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
              'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
          token (`str` or `bool`, *optional*):
              The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
              the token generated when running `hf auth login` (stored in `~/.huggingface`).
          revision (`str`, *optional*, defaults to `"main"`):
              The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
              git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
              identifier allowed by git.

              <Tip>

              To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

              </Tip>

          return_unused_kwargs (`bool`, *optional*, defaults to `False`):
              If `False`, then this function returns just the final video processor object. If `True`, then this
              functions returns a `Tuple(video_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
              consisting of the key/value pairs whose keys are not video processor attributes: i.e., the part of
              `kwargs` which has not been used to update `video_processor` and is otherwise ignored.
          subfolder (`str`, *optional*, defaults to `""`):
              In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
              specify the folder name here.
          kwargs (`dict[str, Any]`, *optional*):
              The values in kwargs of any keys which are video processor attributes will be used to override the
              loaded values. Behavior concerning key/value pairs whose keys are *not* video processor attributes is
              controlled by the `return_unused_kwargs` keyword parameter.

      Returns:
          The object produced by `cls.from_dict` from the resolved configuration dictionary.

      Raises:
          ValueError: if both `token` and the deprecated `use_auth_token` are given.

      Examples:

      ```python
      # We can't instantiate directly the base class so let's show the examples on a
      # derived class: *LlavaOnevisionVideoProcessor*
      video_processor = LlavaOnevisionVideoProcessor.from_pretrained(
          "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
      )  # Download video_processing_config from huggingface.co and cache.
      video_processor = LlavaOnevisionVideoProcessor.from_pretrained(
          "./test/saved_model/"
      )  # E.g. video processor (or model) was saved using *save_pretrained('./test/saved_model/')*
      video_processor = LlavaOnevisionVideoProcessor.from_pretrained("./test/saved_model/video_preprocessor_config.json")
      video_processor = LlavaOnevisionVideoProcessor.from_pretrained(
          "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", do_normalize=False, foo=False
      )
      assert video_processor.do_normalize is False
      video_processor, unused_kwargs = LlavaOnevisionVideoProcessor.from_pretrained(
          "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", do_normalize=False, foo=False, return_unused_kwargs=True
      )
      assert video_processor.do_normalize is False
      assert unused_kwargs == {"foo": False}
      ```"""
      # Fold the explicit download options back into kwargs so they travel with the rest.
      kwargs.update(
          cache_dir=cache_dir,
          force_download=force_download,
          local_files_only=local_files_only,
          revision=revision,
      )

      # Backward compatibility: accept the deprecated `use_auth_token` as an alias of `token`.
      legacy_token = kwargs.pop("use_auth_token", None)
      if legacy_token is not None:
          warnings.warn(
              "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
              FutureWarning,
          )
          if token is not None:
              raise ValueError(
                  "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
              )
          token = legacy_token

      # Only forward a token that was actually provided.
      if token is not None:
          kwargs["token"] = token

      video_processor_dict, kwargs = cls.get_video_processor_dict(pretrained_model_name_or_path, **kwargs)
      return cls.from_dict(video_processor_dict, **kwargs)
  def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
      """
      Save a video processor object to the directory `save_directory`, so that it can be re-loaded using the
      `from_pretrained` class method.

      Args:
          save_directory (`str` or `os.PathLike`):
              Directory where the video processor JSON file will be saved (will be created if it does not exist).
          push_to_hub (`bool`, *optional*, defaults to `False`):
              Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
              repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
              namespace).
          kwargs (`dict[str, Any]`, *optional*):
              Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

      Returns:
          `list[str]`: A single-element list with the path of the written video processor file.

      Raises:
          ValueError: if both `token` and the deprecated `use_auth_token` are given.
          AssertionError: if `save_directory` points to an existing file.
      """
      # Backward compatibility: accept the deprecated `use_auth_token` as an alias of `token`.
      use_auth_token = kwargs.pop("use_auth_token", None)
      if use_auth_token is not None:
          warnings.warn(
              "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
              FutureWarning,
          )
          if kwargs.get("token") is not None:
              raise ValueError(
                  "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
              )
          kwargs["token"] = use_auth_token

      # Normalise path-like objects to a plain string once; the default `repo_id` below relies on
      # `str.split`, which `pathlib.Path` and other `os.PathLike` objects do not provide.
      save_directory = os.fspath(save_directory)

      if os.path.isfile(save_directory):
          raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
      os.makedirs(save_directory, exist_ok=True)

      if push_to_hub:
          commit_message = kwargs.pop("commit_message", None)
          repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
          repo_id = self._create_repo(repo_id, **kwargs)
          # Snapshot taken before saving so that only files modified by this call are uploaded.
          files_timestamps = self._get_files_timestamps(save_directory)

      # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
      # loaded from the Hub.
      if self._auto_class is not None:
          custom_object_save(self, save_directory, config=self)

      # If we save using the predefined names, we can load using `from_pretrained`
      output_video_processor_file = os.path.join(save_directory, VIDEO_PROCESSOR_NAME)
      self.to_json_file(output_video_processor_file)
      logger.info(f"Video processor saved in {output_video_processor_file}")

      if push_to_hub:
          self._upload_modified_files(
              save_directory,
              repo_id,
              files_timestamps,
              commit_message=commit_message,
              token=kwargs.get("token"),
          )

      return [output_video_processor_file]
  539. @classmethod
  540. def get_video_processor_dict(
  541. cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
  542. ) -> tuple[dict[str, Any], dict[str, Any]]:
  543. """
  544. From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
  545. video processor of type [`~video_processing_utils.VideoProcessorBase`] using `from_dict`.
  546. Parameters:
  547. pretrained_model_name_or_path (`str` or `os.PathLike`):
  548. The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
  549. subfolder (`str`, *optional*, defaults to `""`):
  550. In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
  551. specify the folder name here.
  552. Returns:
  553. `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the video processor object.
  554. """
  555. cache_dir = kwargs.pop("cache_dir", None)
  556. force_download = kwargs.pop("force_download", False)
  557. resume_download = kwargs.pop("resume_download", None)
  558. proxies = kwargs.pop("proxies", None)
  559. token = kwargs.pop("token", None)
  560. use_auth_token = kwargs.pop("use_auth_token", None)
  561. local_files_only = kwargs.pop("local_files_only", False)
  562. revision = kwargs.pop("revision", None)
  563. subfolder = kwargs.pop("subfolder", "")
  564. from_pipeline = kwargs.pop("_from_pipeline", None)
  565. from_auto_class = kwargs.pop("_from_auto", False)
  566. if use_auth_token is not None:
  567. warnings.warn(
  568. "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
  569. FutureWarning,
  570. )
  571. if token is not None:
  572. raise ValueError(
  573. "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
  574. )
  575. token = use_auth_token
  576. user_agent = {"file_type": "video processor", "from_auto_class": from_auto_class}
  577. if from_pipeline is not None:
  578. user_agent["using_pipeline"] = from_pipeline
  579. if is_offline_mode() and not local_files_only:
  580. logger.info("Offline mode: forcing local_files_only=True")
  581. local_files_only = True
  582. pretrained_model_name_or_path = str(pretrained_model_name_or_path)
  583. is_local = os.path.isdir(pretrained_model_name_or_path)
  584. if os.path.isfile(pretrained_model_name_or_path):
  585. resolved_video_processor_file = pretrained_model_name_or_path
  586. is_local = True
  587. elif is_remote_url(pretrained_model_name_or_path):
  588. video_processor_file = pretrained_model_name_or_path
  589. resolved_video_processor_file = download_url(pretrained_model_name_or_path)
  590. else:
  591. video_processor_file = VIDEO_PROCESSOR_NAME
  592. try:
  593. # Try to load with a new config name first and if not successful try with the old file name
  594. # NOTE: we will gradually change to saving all processor configs as nested dict in PROCESSOR_NAME
  595. resolved_video_processor_files = [
  596. resolved_file
  597. for filename in [VIDEO_PROCESSOR_NAME, IMAGE_PROCESSOR_NAME, PROCESSOR_NAME]
  598. if (
  599. resolved_file := cached_file(
  600. pretrained_model_name_or_path,
  601. filename=filename,
  602. cache_dir=cache_dir,
  603. force_download=force_download,
  604. proxies=proxies,
  605. resume_download=resume_download,
  606. local_files_only=local_files_only,
  607. token=token,
  608. user_agent=user_agent,
  609. revision=revision,
  610. subfolder=subfolder,
  611. _raise_exceptions_for_missing_entries=False,
  612. )
  613. )
  614. is not None
  615. ]
  616. resolved_video_processor_file = resolved_video_processor_files[0]
  617. except OSError:
  618. # Raise any OS error raise by `cached_file`. It will have a helpful error message adapted to
  619. # the original exception.
  620. raise
  621. except Exception:
  622. # For any other exception, we throw a generic error.
  623. raise OSError(
  624. f"Can't load video processor for '{pretrained_model_name_or_path}'. If you were trying to load"
  625. " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
  626. f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
  627. f" directory containing a {VIDEO_PROCESSOR_NAME} file"
  628. )
  629. try:
  630. # Load video_processor dict
  631. with open(resolved_video_processor_file, "r", encoding="utf-8") as reader:
  632. text = reader.read()
  633. video_processor_dict = json.loads(text)
  634. video_processor_dict = video_processor_dict.get("video_processor", video_processor_dict)
  635. except json.JSONDecodeError:
  636. raise OSError(
  637. f"It looks like the config file at '{resolved_video_processor_file}' is not a valid JSON file."
  638. )
  639. if is_local:
  640. logger.info(f"loading configuration file {resolved_video_processor_file}")
  641. else:
  642. logger.info(
  643. f"loading configuration file {video_processor_file} from cache at {resolved_video_processor_file}"
  644. )
  645. return video_processor_dict, kwargs
  @classmethod
  def from_dict(cls, video_processor_dict: dict[str, Any], **kwargs):
      """
      Build a [`~video_processing_utils.VideoProcessorBase`] from a plain Python dictionary of parameters.

      Args:
          video_processor_dict (`dict[str, Any]`):
              Parameters forwarded to the class constructor. A shallow copy is made first, so the mapping passed
              in is not modified. Such a dictionary can be obtained from
              [`~video_processing_utils.VideoProcessorBase.to_dict`].
          kwargs (`dict[str, Any]`):
              Additional values. `return_unused_kwargs` is consumed here. `size` and `crop_size` replace the
              entry of the same name in `video_processor_dict` when they appear in both. Every other key that
              names an existing attribute of the new instance is set on it and removed from `kwargs`.

      Returns:
          [`~video_processing_utils.VideoProcessorBase`]: The instantiated video processor, or the tuple
          `(video_processor, kwargs)` holding the leftover keyword arguments when `return_unused_kwargs` is
          truthy.
      """
      init_params = video_processor_dict.copy()
      return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

      # `size` used to be an int or tuple in feature extractors and is now a dict. Routing the override through
      # the constructor arguments lets the video processor convert it to the appropriate dict, instead of a raw
      # value being written over the attribute afterwards. `crop_size` is handled the same way.
      for dict_valued_key in ("size", "crop_size"):
          if dict_valued_key in kwargs and dict_valued_key in init_params:
              init_params[dict_valued_key] = kwargs.pop(dict_valued_key)

      video_processor = cls(**init_params)

      # Apply the remaining overrides that match an attribute of the instance. Iterating over a snapshot of the
      # keys makes it safe to pop from `kwargs` inside the loop.
      for key in list(kwargs):
          if hasattr(video_processor, key):
              setattr(video_processor, key, kwargs.pop(key))

      logger.info(f"Video processor {video_processor}")

      if return_unused_kwargs:
          return video_processor, kwargs
      return video_processor
  def to_dict(self) -> dict[str, Any]:
      """
      Serialize this instance to a Python dictionary.

      Returns:
          `dict[str, Any]`: A deep copy of the instance attributes, without the internal
          `model_valid_processing_keys` and `_valid_kwargs_names` entries, plus a `video_processor_type` key
          holding the class name.
      """
      serialized = deepcopy(self.__dict__)

      # Internal bookkeeping attributes are dropped from the serialized form.
      for internal_key in ("model_valid_processing_keys", "_valid_kwargs_names"):
          serialized.pop(internal_key, None)

      serialized["video_processor_type"] = self.__class__.__name__
      return serialized
  def to_json_string(self) -> str:
      """
      Serialize this instance to a JSON string.

      Returns:
          `str`: The attributes of this video processor instance as JSON, indented by two spaces, with sorted
          keys and a trailing newline.
      """
      payload = self.to_dict()

      # Top-level NumPy arrays are not JSON serializable; swap them for nested lists. The replacement mapping is
      # fully built before `update` runs, so `payload` is never modified while it is being iterated.
      payload.update({key: value.tolist() for key, value in payload.items() if isinstance(value, np.ndarray)})

      # The private attribute `_processor_class` is stored under the public key `processor_class`; a `None`
      # value is dropped rather than written out.
      processor_class = payload.pop("_processor_class", None)
      if processor_class is not None:
          payload["processor_class"] = processor_class

      return json.dumps(payload, indent=2, sort_keys=True) + "\n"
  def to_json_file(self, json_file_path: Union[str, os.PathLike]):
      """
      Write the JSON representation of this instance to a file.

      Args:
          json_file_path (`str` or `os.PathLike`):
              Path of the JSON file in which this video processor instance's parameters will be saved. The file
              is opened in write mode with UTF-8 encoding.
      """
      with open(json_file_path, mode="w", encoding="utf-8") as handle:
          handle.write(self.to_json_string())
  def __repr__(self):
      """Return the class name followed by a space and the JSON string of this instance."""
      return "{} {}".format(self.__class__.__name__, self.to_json_string())
  @classmethod
  def from_json_file(cls, json_file: Union[str, os.PathLike]):
      """
      Instantiate a video processor of type [`~video_processing_utils.VideoProcessorBase`] from a JSON file of
      parameters.

      Args:
          json_file (`str` or `os.PathLike`):
              Path to the UTF-8 encoded JSON file containing the parameters.

      Returns:
          A video processor of type [`~video_processing_utils.VideoProcessorBase`]: The object built by passing
          the top-level entries of the JSON document to the class constructor as keyword arguments.
      """
      with open(json_file, "r", encoding="utf-8") as reader:
          init_params = json.load(reader)
      return cls(**init_params)
  @classmethod
  def register_for_auto_class(cls, auto_class="AutoVideoProcessor"):
      """
      Register this class with a given auto class. This should only be used for custom video processors as the
      ones in the library are already mapped with `AutoVideoProcessor`.

      <Tip warning={true}>

      This API is experimental and may have some slight breaking changes in the next releases.

      </Tip>

      Args:
          auto_class (`str` or `type`, *optional*, defaults to `"AutoVideoProcessor"`):
              The auto class to register this new video processor with. When a class is given, its `__name__` is
              used.

      Raises:
          ValueError: If `transformers.models.auto` has no attribute with the resolved name.
      """
      auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__

      # Imported inside the method, as in the original code, rather than at module import time.
      import transformers.models.auto as auto_module

      if not hasattr(auto_module, auto_class_name):
          raise ValueError(f"{auto_class_name} is not a valid auto class.")

      cls._auto_class = auto_class_name
  def fetch_videos(self, video_url_or_urls: Union[str, list[str], list[list[str]]], sample_indices_fn=None):
      """
      Load the video(s) found at a single url or at a (possibly nested) list of urls.

      A non-list input is passed straight to `load_video` and its result is returned unchanged. A list is
      processed element by element through a recursive call, and the per-element results are then transposed
      with `zip(*...)` and returned as a list of tuples.

      Args:
          video_url_or_urls (`str`, `list[str]` or `list[list[str]]`):
              The url, or list of urls, to load.
          sample_indices_fn (*optional*):
              Forwarded unchanged to `load_video`.
      """
      if is_torchcodec_available():
          backend = "torchcodec"
      else:
          warnings.warn(
              "`torchcodec` is not installed and cannot be used to decode the video by default. "
              "Falling back to `torchvision`. Note that `torchvision` decoding is deprecated and will be removed in future versions. "
          )
          backend = "torchvision"

      if not isinstance(video_url_or_urls, list):
          return load_video(video_url_or_urls, backend=backend, sample_indices_fn=sample_indices_fn)

      # NOTE(review): the transposition suggests `load_video` returns a fixed-size tuple per video (presumably
      # the frames and their metadata), so that like items end up grouped together -- confirm against
      # `load_video`.
      fetched = [self.fetch_videos(entry, sample_indices_fn=sample_indices_fn) for entry in video_url_or_urls]
      return list(zip(*fetched))
  # Rebind `push_to_hub` to the result of `copy_func` before touching its docstring.
  # NOTE(review): presumably `copy_func` returns an independent copy, so that formatting the docstring below
  # does not alter the function object the method was taken from -- confirm against `copy_func`.
  BaseVideoProcessor.push_to_hub = copy_func(BaseVideoProcessor.push_to_hub)

  # Fill the `{object}`, `{object_class}` and `{object_files}` placeholders of the docstring, when there is one.
  if BaseVideoProcessor.push_to_hub.__doc__ is not None:
      BaseVideoProcessor.push_to_hub.__doc__ = BaseVideoProcessor.push_to_hub.__doc__.format(
          object="video processor",
          object_class="AutoVideoProcessor",
          object_files="video processor file",
      )