configuration_aimv2.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  2. # This file was automatically generated from src/transformers/models/aimv2/modular_aimv2.py.
  3. # Do NOT edit this file manually as any edits will be overwritten by the generation of
  4. # the file from the modular. If any change should be done, please apply the change to the
  5. # modular_aimv2.py file directly. One of our CI enforces this.
  6. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  7. # coding=utf-8
  8. # Copyright 2025 Apple Inc. and The HuggingFace Team. All rights reserved.
  9. #
  10. # Licensed under the Apache License, Version 2.0 (the "License");
  11. # you may not use this file except in compliance with the License.
  12. # You may obtain a copy of the License at
  13. #
  14. # http://www.apache.org/licenses/LICENSE-2.0
  15. #
  16. # Unless required by applicable law or agreed to in writing, software
  17. # distributed under the License is distributed on an "AS IS" BASIS,
  18. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19. # See the License for the specific language governing permissions and
  20. # limitations under the License.
  21. from typing import Optional
  22. from ...configuration_utils import PretrainedConfig
  23. from ...utils import logging
# Module-level logger named after this module; used below for informational messages.
logger = logging.get_logger(__name__)
  25. class Aimv2VisionConfig(PretrainedConfig):
  26. r"""
  27. This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
  28. AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
  29. configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
  30. [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
  31. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  32. documentation from [`PretrainedConfig`] for more information.
  33. Args:
  34. hidden_size (`int`, *optional*, defaults to 1024):
  35. Dimensionality of the encoder layers and the pooler layer.
  36. intermediate_size (`int`, *optional*, defaults to 2816):
  37. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
  38. num_hidden_layers (`int`, *optional*, defaults to 24):
  39. Number of hidden layers in the Transformer encoder.
  40. num_attention_heads (`int`, *optional*, defaults to 8):
  41. Number of attention heads for each attention layer in the Transformer encoder.
  42. num_channels (`int`, *optional*, defaults to 3):
  43. Number of channels in the input images.
  44. image_size (`int`, *optional*, defaults to 224):
  45. The size (resolution) of each image.
  46. patch_size (`int`, *optional*, defaults to 14):
  47. The size (resolution) of each patch.
  48. rms_norm_eps (`float`, *optional*, defaults to 1e-05):
  49. The epsilon used by the rms normalization layers.
  50. attention_dropout (`float`, *optional*, defaults to 0.0):
  51. The dropout ratio for the attention probabilities.
  52. qkv_bias (`bool`, *optional*, defaults to `False`):
  53. Whether to add a bias to the queries, keys and values.
  54. mlp_bias (`bool`, *optional*, defaults to `False`):
  55. Whether to add a bias to the Linear layers or Not.
  56. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  57. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  58. `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
  59. initializer_range (`float`, *optional*, defaults to 0.02):
  60. The standard deviation of the for initializing all weight matrices.
  61. use_head (`str`, *optional*, defaults to `True`):
  62. Whether to use Attention Pooling Head or Not.
  63. is_native (`str`, *optional*, defaults to `False`):
  64. Whether to use ckpt trained for image native resolution or not.
  65. Example:
  66. ```python
  67. >>> from transformers import SiglipVisionConfig, SiglipVisionModel
  68. >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
  69. >>> configuration = Aimv2VisionConfig()
  70. >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
  71. >>> model = Aimv2VisionModel(configuration)
  72. >>> # Accessing the model configuration
  73. >>> configuration = model.config
  74. ```"""
  75. model_type = "aimv2_vision_model"
  76. base_config_key = "vision_config"
  77. def __init__(
  78. self,
  79. hidden_size: int = 1024,
  80. intermediate_size: int = 2816,
  81. num_hidden_layers: int = 24,
  82. num_attention_heads: int = 8,
  83. num_channels: int = 3,
  84. image_size: int = 224,
  85. patch_size: int = 14,
  86. rms_norm_eps: float = 1e-5,
  87. attention_dropout: float = 0.0,
  88. qkv_bias: bool = False,
  89. mlp_bias: bool = False,
  90. hidden_act: str = "silu",
  91. initializer_range: float = 0.02,
  92. use_head: bool = True,
  93. is_native: bool = False,
  94. **kwargs,
  95. ):
  96. super().__init__(**kwargs)
  97. self.hidden_size = hidden_size
  98. self.intermediate_size = intermediate_size
  99. self.num_hidden_layers = num_hidden_layers
  100. self.num_attention_heads = num_attention_heads
  101. self.num_channels = num_channels
  102. self.patch_size = patch_size
  103. self.image_size = image_size
  104. self.attention_dropout = attention_dropout
  105. self.hidden_act = hidden_act
  106. self.use_head = use_head
  107. self.initializer_range = initializer_range
  108. self.mlp_bias = mlp_bias
  109. self.qkv_bias = qkv_bias
  110. self.rms_norm_eps = rms_norm_eps
  111. self.is_native = is_native
  112. class Aimv2TextConfig(PretrainedConfig):
  113. r"""
  114. This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
  115. AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
  116. configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
  117. [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.
  118. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  119. documentation from [`PretrainedConfig`] for more information.
  120. Args:
  121. vocab_size (`int`, *optional*, defaults to 49408):
  122. Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
  123. the `inputs_ids` passed when calling [`Aimv2Model`].
  124. hidden_size (`int`, *optional*, defaults to 768):
  125. Dimensionality of the encoder layers and the pooler layer.
  126. intermediate_size (`int`, *optional*, defaults to 2048):
  127. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
  128. num_hidden_layers (`int`, *optional*, defaults to 12):
  129. Number of hidden layers in the Transformer encoder.
  130. num_attention_heads (`int`, *optional*, defaults to 6):
  131. Number of attention heads for each attention layer in the Transformer encoder.
  132. rms_norm_eps (`float`, *optional*, defaults to 1e-05):
  133. The epsilon used by the rms normalization layers.
  134. attention_dropout (`float`, *optional*, defaults to 0.0):
  135. The dropout ratio for the attention probabilities.
  136. qkv_bias (`bool`, *optional*, defaults to `False`):
  137. Whether to add a bias to the queries, keys and values.
  138. mlp_bias (`bool`, *optional*, defaults to `False`):
  139. Whether to add a bias to the Linear layers or Not.
  140. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  141. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  142. `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
  143. pad_token_id (`int`, *optional*, defaults to 1):
  144. The id of the padding token in the vocabulary.
  145. bos_token_id (`int`, *optional*, defaults to 49406):
  146. The id of the beginning-of-sequence token in the vocabulary.
  147. eos_token_id (`int`, *optional*, defaults to 49407):
  148. The id of the end-of-sequence token in the vocabulary.
  149. max_position_embeddings (`int`, *optional*, defaults to 77):
  150. The maximum sequence length that this model might ever be used with. Typically set this to something large
  151. just in case (e.g., 512 or 1024 or 2048).
  152. initializer_range (`float`, *optional*, defaults to 0.02):
  153. The standard deviation of the for initializing all weight matrices.
  154. """
  155. model_type = "aimv2_text_model"
  156. base_config_key = "text_config"
  157. def __init__(
  158. self,
  159. vocab_size: int = 49408,
  160. hidden_size: int = 768,
  161. intermediate_size: int = 2048,
  162. num_hidden_layers: int = 12,
  163. num_attention_heads: int = 6,
  164. rms_norm_eps: float = 1e-5,
  165. attention_dropout: float = 0.0,
  166. qkv_bias: bool = False,
  167. mlp_bias: bool = False,
  168. hidden_act: str = "silu",
  169. pad_token_id: Optional[int] = None,
  170. bos_token_id: Optional[int] = None,
  171. eos_token_id: int = 49407,
  172. max_position_embeddings: int = 77,
  173. initializer_range: bool = 0.02,
  174. **kwargs,
  175. ):
  176. super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
  177. self.vocab_size = vocab_size
  178. self.hidden_size = hidden_size
  179. self.intermediate_size = intermediate_size
  180. self.num_hidden_layers = num_hidden_layers
  181. self.num_attention_heads = num_attention_heads
  182. self.max_position_embeddings = max_position_embeddings
  183. self.hidden_act = hidden_act
  184. self.attention_dropout = attention_dropout
  185. self.initializer_range = initializer_range
  186. self.mlp_bias = mlp_bias
  187. self.qkv_bias = qkv_bias
  188. self.rms_norm_eps = rms_norm_eps
  189. class Aimv2Config(PretrainedConfig):
  190. r"""
  191. [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
  192. instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
  193. Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
  194. [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.
  195. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  196. documentation from [`PretrainedConfig`] for more information.
  197. Args:
  198. text_config (`dict`, *optional*):
  199. Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
  200. vision_config (`dict`, *optional*):
  201. Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
  202. projection_dim (`int`, *optional*, defaults to 512):
  203. Dimensionality of text and vision projection layers.
  204. logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
  205. The initial value of the *logit_scale* parameter.
  206. kwargs (*optional*):
  207. Dictionary of keyword arguments.
  208. Example:
  209. ```python
  210. >>> from transformers import Aimv2Config, Aimv2Model
  211. >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
  212. >>> configuration = Aimv2Config()
  213. >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
  214. >>> model = Aimv2Model(configuration)
  215. >>> # Accessing the model configuration
  216. >>> configuration = model.config
  217. >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
  218. >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig
  219. >>> # Initializing a AIMv2Text and AIMv2Vision configuration
  220. >>> config_text = Aimv2TextConfig()
  221. >>> config_vision = Aimv2VisionConfig()
  222. >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
  223. ```"""
  224. model_type = "aimv2"
  225. sub_configs = {"text_config": Aimv2TextConfig, "vision_config": Aimv2VisionConfig}
  226. def __init__(
  227. self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
  228. ):
  229. super().__init__(**kwargs)
  230. if text_config is None:
  231. text_config = {}
  232. logger.info("`text_config` is `None`. Initializing the `Aimv2TextConfig` with default values.")
  233. if vision_config is None:
  234. vision_config = {}
  235. logger.info("`vision_config` is `None`. initializing the `Aimv2VisionConfig` with default values.")
  236. self.text_config = Aimv2TextConfig(**text_config)
  237. self.vision_config = Aimv2VisionConfig(**vision_config)
  238. self.projection_dim = projection_dim
  239. self.logit_scale_init_value = logit_scale_init_value
  240. self.max_logit_scale = 100.0
  241. __all__ = ["Aimv2Config", "Aimv2VisionConfig", "Aimv2TextConfig"]