configuration_align.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. # coding=utf-8
  2. # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """ALIGN model configuration"""
  16. from ...configuration_utils import PretrainedConfig
  17. from ...utils import logging
# Module-level logger; AlignConfig uses it to report when a sub-config falls back to its defaults.
logger = logging.get_logger(__name__)
  19. class AlignTextConfig(PretrainedConfig):
  20. r"""
  21. This is the configuration class to store the configuration of a [`AlignTextModel`]. It is used to instantiate a
  22. ALIGN text encoder according to the specified arguments, defining the model architecture. Instantiating a
  23. configuration with the defaults will yield a similar configuration to that of the text encoder of the ALIGN
  24. [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. The default values here are
  25. copied from BERT.
  26. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  27. documentation from [`PretrainedConfig`] for more information.
  28. Args:
  29. vocab_size (`int`, *optional*, defaults to 30522):
  30. Vocabulary size of the Align Text model. Defines the number of different tokens that can be represented by
  31. the `inputs_ids` passed when calling [`AlignTextModel`].
  32. hidden_size (`int`, *optional*, defaults to 768):
  33. Dimensionality of the encoder layers and the pooler layer.
  34. num_hidden_layers (`int`, *optional*, defaults to 12):
  35. Number of hidden layers in the Transformer encoder.
  36. num_attention_heads (`int`, *optional*, defaults to 12):
  37. Number of attention heads for each attention layer in the Transformer encoder.
  38. intermediate_size (`int`, *optional*, defaults to 3072):
  39. Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
  40. hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
  41. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  42. `"relu"`, `"silu"` and `"gelu_new"` are supported.
  43. hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
  44. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  45. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
  46. The dropout ratio for the attention probabilities.
  47. max_position_embeddings (`int`, *optional*, defaults to 512):
  48. The maximum sequence length that this model might ever be used with. Typically set this to something large
  49. just in case (e.g., 512 or 1024 or 2048).
  50. type_vocab_size (`int`, *optional*, defaults to 2):
  51. The vocabulary size of the `token_type_ids` passed when calling [`AlignTextModel`].
  52. initializer_range (`float`, *optional*, defaults to 0.02):
  53. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  54. layer_norm_eps (`float`, *optional*, defaults to 1e-12):
  55. The epsilon used by the layer normalization layers.
  56. pad_token_id (`int`, *optional*, defaults to 0):
  57. Padding token id.
  58. position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
  59. Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
  60. positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
  61. [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
  62. For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
  63. with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
  64. use_cache (`bool`, *optional*, defaults to `True`):
  65. Whether or not the model should return the last key/values attentions (not used by all models). Only
  66. relevant if `config.is_decoder=True`.
  67. Example:
  68. ```python
  69. >>> from transformers import AlignTextConfig, AlignTextModel
  70. >>> # Initializing a AlignTextConfig with kakaobrain/align-base style configuration
  71. >>> configuration = AlignTextConfig()
  72. >>> # Initializing a AlignTextModel (with random weights) from the kakaobrain/align-base style configuration
  73. >>> model = AlignTextModel(configuration)
  74. >>> # Accessing the model configuration
  75. >>> configuration = model.config
  76. ```"""
  77. model_type = "align_text_model"
  78. base_config_key = "text_config"
  79. def __init__(
  80. self,
  81. vocab_size=30522,
  82. hidden_size=768,
  83. num_hidden_layers=12,
  84. num_attention_heads=12,
  85. intermediate_size=3072,
  86. hidden_act="gelu",
  87. hidden_dropout_prob=0.1,
  88. attention_probs_dropout_prob=0.1,
  89. max_position_embeddings=512,
  90. type_vocab_size=2,
  91. initializer_range=0.02,
  92. layer_norm_eps=1e-12,
  93. pad_token_id=0,
  94. position_embedding_type="absolute",
  95. use_cache=True,
  96. **kwargs,
  97. ):
  98. super().__init__(**kwargs)
  99. self.vocab_size = vocab_size
  100. self.hidden_size = hidden_size
  101. self.num_hidden_layers = num_hidden_layers
  102. self.num_attention_heads = num_attention_heads
  103. self.hidden_act = hidden_act
  104. self.intermediate_size = intermediate_size
  105. self.hidden_dropout_prob = hidden_dropout_prob
  106. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  107. self.max_position_embeddings = max_position_embeddings
  108. self.type_vocab_size = type_vocab_size
  109. self.initializer_range = initializer_range
  110. self.layer_norm_eps = layer_norm_eps
  111. self.position_embedding_type = position_embedding_type
  112. self.use_cache = use_cache
  113. self.pad_token_id = pad_token_id
  114. class AlignVisionConfig(PretrainedConfig):
  115. r"""
  116. This is the configuration class to store the configuration of a [`AlignVisionModel`]. It is used to instantiate a
  117. ALIGN vision encoder according to the specified arguments, defining the model architecture. Instantiating a
  118. configuration with the defaults will yield a similar configuration to that of the vision encoder of the ALIGN
  119. [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. The default values are copied
  120. from EfficientNet (efficientnet-b7)
  121. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  122. documentation from [`PretrainedConfig`] for more information.
  123. Args:
  124. num_channels (`int`, *optional*, defaults to 3):
  125. The number of input channels.
  126. image_size (`int`, *optional*, defaults to 600):
  127. The input image size.
  128. width_coefficient (`float`, *optional*, defaults to 2.0):
  129. Scaling coefficient for network width at each stage.
  130. depth_coefficient (`float`, *optional*, defaults to 3.1):
  131. Scaling coefficient for network depth at each stage.
  132. depth_divisor `int`, *optional*, defaults to 8):
  133. A unit of network width.
  134. kernel_sizes (`list[int]`, *optional*, defaults to `[3, 3, 5, 3, 5, 5, 3]`):
  135. List of kernel sizes to be used in each block.
  136. in_channels (`list[int]`, *optional*, defaults to `[32, 16, 24, 40, 80, 112, 192]`):
  137. List of input channel sizes to be used in each block for convolutional layers.
  138. out_channels (`list[int]`, *optional*, defaults to `[16, 24, 40, 80, 112, 192, 320]`):
  139. List of output channel sizes to be used in each block for convolutional layers.
  140. depthwise_padding (`list[int]`, *optional*, defaults to `[]`):
  141. List of block indices with square padding.
  142. strides (`list[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
  143. List of stride sizes to be used in each block for convolutional layers.
  144. num_block_repeats (`list[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
  145. List of the number of times each block is to repeated.
  146. expand_ratios (`list[int]`, *optional*, defaults to `[1, 6, 6, 6, 6, 6, 6]`):
  147. List of scaling coefficient of each block.
  148. squeeze_expansion_ratio (`float`, *optional*, defaults to 0.25):
  149. Squeeze expansion ratio.
  150. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  151. The non-linear activation function (function or string) in each block. If string, `"gelu"`, `"relu"`,
  152. `"selu", `"gelu_new"`, `"silu"` and `"mish"` are supported.
  153. hidden_dim (`int`, *optional*, defaults to 1280):
  154. The hidden dimension of the layer before the classification head.
  155. pooling_type (`str` or `function`, *optional*, defaults to `"mean"`):
  156. Type of final pooling to be applied before the dense classification head. Available options are [`"mean"`,
  157. `"max"`]
  158. initializer_range (`float`, *optional*, defaults to 0.02):
  159. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  160. batch_norm_eps (`float`, *optional*, defaults to 1e-3):
  161. The epsilon used by the batch normalization layers.
  162. batch_norm_momentum (`float`, *optional*, defaults to 0.99):
  163. The momentum used by the batch normalization layers.
  164. drop_connect_rate (`float`, *optional*, defaults to 0.2):
  165. The drop rate for skip connections.
  166. Example:
  167. ```python
  168. >>> from transformers import AlignVisionConfig, AlignVisionModel
  169. >>> # Initializing a AlignVisionConfig with kakaobrain/align-base style configuration
  170. >>> configuration = AlignVisionConfig()
  171. >>> # Initializing a AlignVisionModel (with random weights) from the kakaobrain/align-base style configuration
  172. >>> model = AlignVisionModel(configuration)
  173. >>> # Accessing the model configuration
  174. >>> configuration = model.config
  175. ```"""
  176. model_type = "align_vision_model"
  177. base_config_key = "vision_config"
  178. def __init__(
  179. self,
  180. num_channels: int = 3,
  181. image_size: int = 600,
  182. width_coefficient: float = 2.0,
  183. depth_coefficient: float = 3.1,
  184. depth_divisor: int = 8,
  185. kernel_sizes: list[int] = [3, 3, 5, 3, 5, 5, 3],
  186. in_channels: list[int] = [32, 16, 24, 40, 80, 112, 192],
  187. out_channels: list[int] = [16, 24, 40, 80, 112, 192, 320],
  188. depthwise_padding: list[int] = [],
  189. strides: list[int] = [1, 2, 2, 2, 1, 2, 1],
  190. num_block_repeats: list[int] = [1, 2, 2, 3, 3, 4, 1],
  191. expand_ratios: list[int] = [1, 6, 6, 6, 6, 6, 6],
  192. squeeze_expansion_ratio: float = 0.25,
  193. hidden_act: str = "swish",
  194. hidden_dim: int = 2560,
  195. pooling_type: str = "mean",
  196. initializer_range: float = 0.02,
  197. batch_norm_eps: float = 0.001,
  198. batch_norm_momentum: float = 0.99,
  199. drop_connect_rate: float = 0.2,
  200. **kwargs,
  201. ):
  202. super().__init__(**kwargs)
  203. self.num_channels = num_channels
  204. self.image_size = image_size
  205. self.width_coefficient = width_coefficient
  206. self.depth_coefficient = depth_coefficient
  207. self.depth_divisor = depth_divisor
  208. self.kernel_sizes = kernel_sizes
  209. self.in_channels = in_channels
  210. self.out_channels = out_channels
  211. self.depthwise_padding = depthwise_padding
  212. self.strides = strides
  213. self.num_block_repeats = num_block_repeats
  214. self.expand_ratios = expand_ratios
  215. self.squeeze_expansion_ratio = squeeze_expansion_ratio
  216. self.hidden_act = hidden_act
  217. self.hidden_dim = hidden_dim
  218. self.pooling_type = pooling_type
  219. self.initializer_range = initializer_range
  220. self.batch_norm_eps = batch_norm_eps
  221. self.batch_norm_momentum = batch_norm_momentum
  222. self.drop_connect_rate = drop_connect_rate
  223. self.num_hidden_layers = sum(num_block_repeats) * 4
  224. class AlignConfig(PretrainedConfig):
  225. r"""
  226. [`AlignConfig`] is the configuration class to store the configuration of a [`AlignModel`]. It is used to
  227. instantiate a ALIGN model according to the specified arguments, defining the text model and vision model configs.
  228. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALIGN
  229. [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture.
  230. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  231. documentation from [`PretrainedConfig`] for more information.
  232. Args:
  233. text_config (`dict`, *optional*):
  234. Dictionary of configuration options used to initialize [`AlignTextConfig`].
  235. vision_config (`dict`, *optional*):
  236. Dictionary of configuration options used to initialize [`AlignVisionConfig`].
  237. projection_dim (`int`, *optional*, defaults to 640):
  238. Dimensionality of text and vision projection layers.
  239. temperature_init_value (`float`, *optional*, defaults to 1.0):
  240. The initial value of the *temperature* parameter. Default is used as per the original ALIGN implementation.
  241. initializer_range (`float`, *optional*, defaults to 0.02):
  242. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  243. kwargs (*optional*):
  244. Dictionary of keyword arguments.
  245. Example:
  246. ```python
  247. >>> from transformers import AlignConfig, AlignModel
  248. >>> # Initializing a AlignConfig with kakaobrain/align-base style configuration
  249. >>> configuration = AlignConfig()
  250. >>> # Initializing a AlignModel (with random weights) from the kakaobrain/align-base style configuration
  251. >>> model = AlignModel(configuration)
  252. >>> # Accessing the model configuration
  253. >>> configuration = model.config
  254. >>> # We can also initialize a AlignConfig from a AlignTextConfig and a AlignVisionConfig
  255. >>> from transformers import AlignTextConfig, AlignVisionConfig
  256. >>> # Initializing ALIGN Text and Vision configurations
  257. >>> config_text = AlignTextConfig()
  258. >>> config_vision = AlignVisionConfig()
  259. >>> config = AlignConfig.from_text_vision_configs(config_text, config_vision)
  260. ```"""
  261. model_type = "align"
  262. sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig}
  263. def __init__(
  264. self,
  265. text_config=None,
  266. vision_config=None,
  267. projection_dim=640,
  268. temperature_init_value=1.0,
  269. initializer_range=0.02,
  270. **kwargs,
  271. ):
  272. super().__init__(**kwargs)
  273. if text_config is None:
  274. text_config = {}
  275. logger.info("text_config is None. Initializing the AlignTextConfig with default values.")
  276. if vision_config is None:
  277. vision_config = {}
  278. logger.info("vision_config is None. Initializing the AlignVisionConfig with default values.")
  279. self.text_config = AlignTextConfig(**text_config)
  280. self.vision_config = AlignVisionConfig(**vision_config)
  281. self.projection_dim = projection_dim
  282. self.temperature_init_value = temperature_init_value
  283. self.initializer_range = initializer_range
# Public API of this module: the three configuration classes defined above.
__all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"]