configuration_janus.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  2. # This file was automatically generated from src/transformers/models/janus/modular_janus.py.
  3. # Do NOT edit this file manually as any edits will be overwritten by the generation of
  4. # the file from the modular. If any change should be done, please apply the change to the
  5. # modular_janus.py file directly. One of our CI enforces this.
  6. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  7. # coding=utf-8
  8. # Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
  9. #
  10. # Licensed under the Apache License, Version 2.0 (the "License");
  11. # you may not use this file except in compliance with the License.
  12. # You may obtain a copy of the License at
  13. #
  14. # http://www.apache.org/licenses/LICENSE-2.0
  15. #
  16. # Unless required by applicable law or agreed to in writing, software
  17. # distributed under the License is distributed on an "AS IS" BASIS,
  18. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19. # See the License for the specific language governing permissions and
  20. # limitations under the License.
  21. from ...configuration_utils import PretrainedConfig
  22. from ...utils import logging
  23. from ..auto import CONFIG_MAPPING, AutoConfig
  24. logger = logging.get_logger(__name__)
  25. class JanusVisionConfig(PretrainedConfig):
  26. r"""
  27. This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
  28. `JanusVisionModel` according to the specified arguments, defining the model architecture.
  29. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  30. documentation from [`PretrainedConfig`] for more information.
  31. Args:
  32. hidden_size (`int`, *optional*, defaults to 1024):
  33. Dimensionality of the encoder layers and the pooler layer.
  34. num_hidden_layers (`int`, *optional*, defaults to 24):
  35. Number of hidden layers in the Transformer encoder.
  36. num_attention_heads (`int`, *optional*, defaults to 16):
  37. Number of attention heads for each attention layer in the Transformer encoder.
  38. num_channels (`int`, *optional*, defaults to 3):
  39. The number of input channels.
  40. patch_size (`int`, *optional*, defaults to 16):
  41. The size (resolution) of each patch.
  42. image_size (`int`, *optional*, defaults to 384):
  43. The size (resolution) of each image.
  44. attention_dropout (`float`, *optional*, defaults to 0.0):
  45. Dropout probability for attention weights.
  46. layer_norm_eps (`float`, *optional*, defaults to 1e-06):
  47. The epsilon used by the layer normalization layers.
  48. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
  49. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  50. `"relu"`, `"selu"`, and `"gelu_new"` are supported.
  51. mlp_ratio (`float`, *optional*, defaults to 4.0):
  52. Ratio of MLP hidden dimensionality to embedding dimensionality.
  53. attention_bias (`bool`, *optional*, defaults to `True`):
  54. Whether to add a bias to the queries, keys, and values in the attention layers.
  55. hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
  56. The dropout probability for fully connected layers in the encoder.
  57. projection_dim (`int`, *optional*, defaults to 2048):
  58. Dimensionality of the MLP projection head.
  59. projection_dropout (`float`, *optional*, defaults to 0.0):
  60. Dropout probability for the projection layer.
  61. use_qk_norm (`bool`, *optional*, defaults to `False`):
  62. Whether to normalize the query and key matrices.
  63. initializer_range (`float`, *optional*, defaults to 0.02):
  64. The standard deviation of the truncated normal initializer for initializing all weight matrices.
  65. depth (`int`, *optional*, defaults to 2):
  66. Number of hidden layers in the aligner module.
  67. num_image_tokens (`int`, *optional*, defaults to 576):
  68. Number of image tokens.
  69. """
  70. model_type = "janus_vision_model"
  71. base_config_key = "vision_config"
  72. def __init__(
  73. self,
  74. hidden_size=1024,
  75. num_hidden_layers=24,
  76. num_attention_heads=16,
  77. num_channels=3,
  78. patch_size=16,
  79. image_size=384,
  80. attention_dropout=0.0,
  81. layer_norm_eps=1e-6,
  82. hidden_act="gelu",
  83. mlp_ratio=4.0,
  84. attention_bias=True,
  85. hidden_dropout_rate=0.0,
  86. projection_dim=2048,
  87. projection_dropout=0.0,
  88. use_qk_norm=False,
  89. initializer_range=0.02,
  90. depth=2,
  91. num_image_tokens=576,
  92. **kwargs,
  93. ):
  94. super().__init__(**kwargs)
  95. self.hidden_size = hidden_size
  96. self.num_hidden_layers = num_hidden_layers
  97. self.num_attention_heads = num_attention_heads
  98. self.num_channels = num_channels
  99. self.patch_size = patch_size
  100. self.image_size = image_size
  101. self.attention_dropout = attention_dropout
  102. self.layer_norm_eps = layer_norm_eps
  103. self.hidden_act = hidden_act
  104. self.mlp_ratio = mlp_ratio
  105. self.attention_bias = attention_bias
  106. self.hidden_dropout_rate = hidden_dropout_rate
  107. self.projection_dim = projection_dim
  108. self.projection_dropout = projection_dropout
  109. self.use_qk_norm = use_qk_norm
  110. self.initializer_range = initializer_range
  111. self.depth = depth
  112. self.num_image_tokens = num_image_tokens
  113. class JanusVQVAEConfig(PretrainedConfig):
  114. r"""
  115. This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
  116. `JanusVQVAEModel` according to the specified arguments, defining the model architecture.
  117. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  118. documentation from [`PretrainedConfig`] for more information. Instantiating a
  119. configuration with the defaults will yield a similar configuration to the VQModel of the
  120. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).
  121. Args:
  122. embed_dim (`int`, *optional*, defaults to 8):
  123. Dimensionality of each embedding vector.
  124. num_embeddings (`int`, *optional*, defaults to 16384):
  125. Number of codebook embeddings.
  126. double_latent (`bool`, *optional*, defaults to `False`):
  127. Whether to use double z channels.
  128. latent_channels (`int`, *optional*, defaults to 256):
  129. Number of channels for the latent space.
  130. num_patches (`int`, *optional*, defaults to 32):
  131. Num of patches the input images can be divided into.
  132. in_channels (`int`, *optional*, defaults to 3):
  133. Number of input channels.
  134. out_channels (`int`, *optional*, defaults to 3):
  135. Number of out channels.
  136. base_channels (`int`, *optional*, defaults to 128):
  137. Base channel count.
  138. channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
  139. Channel multipliers for each resolution.
  140. num_res_blocks (`int`, *optional*, defaults to 2):
  141. Number of residual blocks.
  142. dropout (`float`, *optional*, defaults to 0.0):
  143. Dropout rate.
  144. initializer_range (`float`, *optional*, defaults to 0.02):
  145. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  146. projection_dim (`int`, *optional*, defaults to 2048):
  147. Dimensionality of the MLP projection head.
  148. num_hidden_layers (`int`, *optional*, defaults to 2):
  149. Number of hidden layers in VAVAE MLP Connecter module.
  150. hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
  151. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  152. `"relu"`, `"silu"` and `"gelu_new"` are supported.
  153. image_token_embed_dim (`int`, *optional*, defaults to 2048):
  154. Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
  155. """
  156. model_type = "janus_vqgan"
  157. base_config_key = "vq_config"
  158. def __init__(
  159. self,
  160. embed_dim: int = 8,
  161. num_embeddings: int = 16384,
  162. double_latent: bool = False,
  163. latent_channels: int = 256,
  164. num_patches: int = 32,
  165. in_channels: int = 3,
  166. out_channels: int = 3,
  167. base_channels: int = 128,
  168. channel_multiplier: list[int] = [1, 1, 2, 2, 4],
  169. num_res_blocks: int = 2,
  170. dropout: float = 0.0,
  171. initializer_range=0.02,
  172. projection_dim=2048,
  173. num_hidden_layers=2,
  174. hidden_act="gelu",
  175. image_token_embed_dim=2048,
  176. **kwargs,
  177. ):
  178. super().__init__(**kwargs)
  179. self.embed_dim = embed_dim
  180. self.num_embeddings = num_embeddings
  181. self.double_latent = double_latent
  182. self.latent_channels = latent_channels
  183. self.in_channels = in_channels
  184. self.base_channels = base_channels
  185. self.channel_multiplier = channel_multiplier
  186. self.num_res_blocks = num_res_blocks
  187. self.dropout = dropout
  188. self.initializer_range = initializer_range
  189. self.num_patches = num_patches
  190. self.out_channels = out_channels
  191. self.projection_dim = projection_dim
  192. self.num_hidden_layers = num_hidden_layers
  193. self.hidden_act = hidden_act
  194. self.image_token_embed_dim = image_token_embed_dim
  195. class JanusConfig(PretrainedConfig):
  196. r"""
  197. This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
  198. Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
  199. with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.
  200. e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
  201. [deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)
  202. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  203. documentation from [`PretrainedConfig`] for more information.
  204. Args:
  205. text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
  206. The config object or dictionary of the text backbone.
  207. vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `JanusVisionConfig`):
  208. The config object or dictionary of the vision backbone.
  209. vq_config (`Union[AutoConfig, dict]`, *optional*, defaults to `JanusVQVAEConfig`):
  210. The config object or dictionary of the VQVAE backbone.
  211. image_token_id (`int`, *optional*, defaults to 100581):
  212. Token index of a placeholder image token.
  213. Example:
  214. ```python
  215. >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig
  216. >>> # Initializing a Janus vision config
  217. >>> vision_config = JanusVisionConfig()
  218. >>> # Initializing a Llama config
  219. >>> text_config = LlamaConfig()
  220. >>> # Initializing a VQ config
  221. >>> vq_config = JanusVQVAEConfig()
  222. >>> # Initializing a Janus Pro 1B style configuration
  223. >>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)
  224. >>> # Initializing a model from the Janus Pro 1B style configuration
  225. >>> model = JanusForConditionalGeneration(configuration)
  226. >>> # Accessing the model configuration
  227. >>> configuration = model.config
  228. ```"""
  229. model_type = "janus"
  230. sub_configs = {
  231. "text_config": AutoConfig,
  232. "vision_config": JanusVisionConfig,
  233. "vq_config": JanusVQVAEConfig,
  234. }
  235. def __init__(
  236. self,
  237. text_config=None,
  238. vision_config=None,
  239. vq_config=None,
  240. image_token_id=100581,
  241. **kwargs,
  242. ):
  243. if isinstance(text_config, dict):
  244. text_config["model_type"] = text_config.get("model_type", "llama")
  245. self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
  246. elif text_config is None:
  247. logger.info("`text_config` is None. Initializing with default values")
  248. self.text_config = CONFIG_MAPPING["llama"]()
  249. elif isinstance(text_config, PretrainedConfig):
  250. self.text_config = text_config
  251. else:
  252. raise ValueError(
  253. f"Invalid type for `text_config`. Must be either `dict` or `LlamaConfig`."
  254. f" Type found: {type(text_config)}"
  255. )
  256. if vision_config is None:
  257. logger.info("`vision_config` is None. Initializing with default JanusVisionConfig values")
  258. self.vision_config = JanusVisionConfig()
  259. elif isinstance(vision_config, dict):
  260. self.vision_config = JanusVisionConfig(**vision_config)
  261. elif isinstance(vision_config, JanusVisionConfig):
  262. self.vision_config = vision_config
  263. else:
  264. raise ValueError(
  265. f"Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`."
  266. f" Type found: {type(vision_config)}"
  267. )
  268. if vq_config is None:
  269. logger.info("`vq_config` is None. Initializing with default JanusVQVAEConfig values")
  270. self.vq_config = JanusVQVAEConfig()
  271. elif isinstance(vq_config, dict):
  272. self.vq_config = JanusVQVAEConfig(**vq_config)
  273. elif isinstance(vq_config, JanusVQVAEConfig):
  274. self.vq_config = vq_config
  275. else:
  276. raise ValueError(
  277. f"Invalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`."
  278. f" Type found: {type(vq_config)}"
  279. )
  280. self.initializer_range = self.vision_config.initializer_range
  281. # This dimension is required when decoding discrete image tokens to continuous input.
  282. self.vq_config.num_patches = self.vision_config.image_size // self.vision_config.patch_size
  283. # The default is only the index for the 1B model, 7B uses a different one
  284. self.image_token_id = image_token_id
  285. super().__init__(**kwargs)
  286. __all__ = ["JanusVQVAEConfig", "JanusVisionConfig", "JanusConfig"]