configuration_evolla.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. # coding=utf-8
  2. # Copyright 2025 Westlake Representational Learning Lab (Fajie Yuan Lab) team and the HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Evolla model configuration"""
  16. from ...configuration_utils import PretrainedConfig
  17. from ...modeling_rope_utils import rope_config_validation
  18. from ...utils import logging
# Module-level logger obtained from the library's logging utilities for this module's name.
# `EvollaConfig.__init__` uses it to report when the protein-encoder sub-config falls back to defaults.
logger = logging.get_logger(__name__)
  20. class SaProtConfig(PretrainedConfig):
  21. r"""This is the configuration class to store the configuration of a [`EvollaSaProtProteinEncoder`]. It is used to instantiate a
  22. SaProt model according to the specified arguments, defining the model architecture.
  23. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  24. documentation from [`PretrainedConfig`] for more information.
  25. Args:
  26. vocab_size (`int`, *optional*, defaults to 446):
  27. Vocabulary size of the protein sequence model. Defines the number of different tokens that can be represented
  28. by the `inputs_ids` passed when calling [`EvollaModel`].
  29. mask_token_id (`int`, *optional*, defaults to 4):
  30. The id of the *mask* token in the protein sequence model.
  31. pad_token_id (`int`, *optional*, defaults to 1):
  32. The id of the *padding* token in the protein sequence model.
  33. hidden_size (`int`, *optional*, defaults to 1280):
  34. Dimensionality of the protein sequence model layers and the pooler layer.
  35. num_hidden_layers (`int`, *optional*, defaults to 33):
  36. Number of hidden layers in the protein sequence model.
  37. num_attention_heads (`int`, *optional*, defaults to 20):
  38. Number of attention heads for each attention layer in the protein sequence model.
  39. intermediate_size (`int`, *optional*, defaults to 5120):
  40. Dimensionality of the intermediate layers in the protein sequence model.
  41. hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
  42. The dropout ratio for the hidden layers in the protein sequence model.
  43. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
  44. The dropout ratio for the attention probabilities in the protein sequence model.
  45. max_position_embeddings (`int`, *optional*, defaults to 1026):
  46. The maximum sequence length that the protein sequence model might ever be used with. Typically set this to
  47. something large just in case (e.g., 512 or 1024 or 2048).
  48. layer_norm_eps (`float`, *optional*, defaults to 1e-05):
  49. The epsilon value for the layer normalization layer in the protein sequence model.
  50. position_embedding_type (`str`, *optional*, defaults to `"rotary"`):
  51. The type of position embedding to use in the protein sequence model. Currently only `"rotary"` is supported.
  52. emb_layer_norm_before (`bool`, *optional*, defaults to `False`):
  53. Whether to apply layer normalization before the position embedding in the protein sequence model.
  54. token_dropout (`bool`, *optional*, defaults to `True`):
  55. Whether to apply dropout to the tokens in the protein sequence model."""
  56. def __init__(
  57. self,
  58. vocab_size=446,
  59. mask_token_id=4,
  60. pad_token_id=1,
  61. hidden_size=1280,
  62. num_hidden_layers=33,
  63. num_attention_heads=20,
  64. intermediate_size=5120,
  65. hidden_dropout_prob=0.1,
  66. attention_probs_dropout_prob=0.1,
  67. max_position_embeddings=1026,
  68. initializer_range=0.02,
  69. layer_norm_eps=1e-05,
  70. position_embedding_type="rotary",
  71. emb_layer_norm_before=False,
  72. token_dropout=True,
  73. **kwargs,
  74. ):
  75. super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)
  76. self.vocab_size = vocab_size
  77. self.hidden_size = hidden_size
  78. self.num_hidden_layers = num_hidden_layers
  79. self.num_attention_heads = num_attention_heads
  80. self.intermediate_size = intermediate_size
  81. self.hidden_dropout_prob = hidden_dropout_prob
  82. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  83. self.max_position_embeddings = max_position_embeddings
  84. self.initializer_range = initializer_range
  85. self.layer_norm_eps = layer_norm_eps
  86. self.position_embedding_type = position_embedding_type
  87. self.emb_layer_norm_before = emb_layer_norm_before
  88. self.token_dropout = token_dropout
  89. class EvollaConfig(PretrainedConfig):
  90. r"""
  91. This is the configuration class to store the configuration of a [`EvollaModel`]. It is used to instantiate an
  92. Evolla model according to the specified arguments, defining the model architecture. Instantiating a configuration
  93. with the defaults will yield a similar configuration to that of the Evolla-10B.
  94. e.g. [westlake-repl/Evolla-10B-hf](https://huggingface.co/westlake-repl/Evolla-10B-hf)
  95. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  96. documentation from [`PretrainedConfig`] for more information.
  97. Args:
  98. protein_encoder_config (`dict`, *optional*):
  99. Dictionary of configuration options used to initialize [`SaProtConfig`].
  100. vocab_size (`int`, *optional*, defaults to 128256):
  101. Vocabulary size of the Evolla llama model. Defines the number of different tokens that can be represented by the
  102. `inputs_ids` passed when calling [`EvollaModel`].
  103. hidden_size (`int`, *optional*, defaults to 4096):
  104. Dimensionality of the llama layers and the pooler layer.
  105. intermediate_size (`int`, *optional*, defaults to 14336):
  106. Dimensionality of the intermediate layers in the llama model.
  107. num_hidden_layers (`int`, *optional*, defaults to 32):
  108. Number of hidden layers in the llama model.
  109. num_attention_heads (`int`, *optional*, defaults to 32):
  110. Number of attention heads for each attention layer in the llama model.
  111. num_key_value_heads (`int`, *optional*, defaults to 8):
  112. Number of key-value pairs for each attention layer in the llama model.
  113. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  114. The non-linear activation function (function or string) in the llama model. If string, `"gelu"`, `"relu"`,
  115. `"selu"` and `"silu"` are supported.
  116. max_position_embeddings (`int`, *optional*, defaults to 8192):
  117. The maximum sequence length that this model might ever be used with. Typically set this to something large
  118. just in case (e.g., 512 or 1024 or 2048).
  119. rms_norm_eps (`float`, *optional*, defaults to 1e-05):
  120. The epsilon value for the RMS-norm layer in the llama model.
  121. rope_theta (`float`, *optional*, defaults to 500000.0):
  122. The threshold value for the RoPE layer in the llama model.
  123. rope_scaling (`float`, *optional*):
  124. The scaling factor for the RoPE layer in the llama model.
  125. attention_bias (`bool`, *optional*, defaults to `False`):
  126. Whether to use bias in the attention layer.
  127. attention_dropout (`float`, *optional*, defaults to 0.0):
  128. The dropout ratio for the attention layer.
  129. mlp_bias (`bool`, *optional*, defaults to `False`):
  130. Whether to use bias in the MLP layer.
  131. aligner_ffn_mult (`int`, *optional*, defaults to 4):
  132. The FFN multiplier for the aligner layer.
  133. aligner_enable_bias (`bool`, *optional*, defaults to `True`):
  134. Whether to use bias in the aligner layer.
  135. aligner_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
  136. The dropout ratio for the attention probabilities in the aligner layer.
  137. aligner_num_add_layers (`int`, *optional*, defaults to 8):
  138. The number of additional layers for the aligner layer.
  139. resampler_depth (`int`, *optional*, defaults to 6):
  140. The depth of the resampler layer in the llama model.
  141. resampler_dim_head (`int`, *optional*, defaults to 64):
  142. The dimension of the heads in the resampler layer in the llama model.
  143. resampler_heads (`int`, *optional*, defaults to 8):
  144. The number of heads in the resampler layer in the llama model.
  145. resampler_num_latents (`int`, *optional*, defaults to 64):
  146. The number of latents in the resampler layer in the llama model.
  147. resampler_ff_mult (`int`, *optional*, defaults to 4):
  148. The FFN multiplier for the resampler layer.
  149. initializer_range (`float`, *optional*, defaults to 0.02):
  150. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  151. pad_token_id (`int`, *optional*):
  152. The id of the *padding* token.
  153. bos_token_id (`int`, *optional*, defaults to 128000):
  154. The id of the *beginning-of-sequence* token.
  155. eos_token_id (`int`, *optional*, defaults to 128009):
  156. The id of the *end-of-sequence* token.
  157. use_cache (`bool`, *optional*, defaults to `False`):
  158. Whether or not the model should return the last key/values attentions (not used by all models).
  159. tie_word_embeddings (`bool`, *optional*, defaults to `False`):
  160. Whether or not to tie the input and output word embeddings.
  161. Example:
  162. ```python
  163. >>> from transformers import EvollaModel, EvollaConfig
  164. >>> # Initializing a Evolla evolla-10b style configuration
  165. >>> configuration = EvollaConfig()
  166. >>> # Initializing a model from the evolla-10b style configuration
  167. >>> model = EvollaModel(configuration)
  168. >>> # Accessing the model configuration
  169. >>> configuration = model.config
  170. ```"""
  171. model_type = "EvollaModel"
  172. sub_configs = {"protein_encoder_config": SaProtConfig}
  173. def __init__(
  174. self,
  175. protein_encoder_config=None,
  176. vocab_size=128256, # llama vocab size
  177. hidden_size=4096, # llama hidden size
  178. intermediate_size=14336, # llama intermediate size
  179. num_hidden_layers=32, # llama num layers
  180. num_attention_heads=32, # llama num heads
  181. num_key_value_heads=8, # llama num key-value heads
  182. hidden_act="silu", # llama activation function
  183. max_position_embeddings=8192, # llama rope max length
  184. rms_norm_eps=1e-05,
  185. rope_theta=500000.0,
  186. rope_scaling=None,
  187. attention_bias=False,
  188. attention_dropout=0.0,
  189. mlp_bias=False,
  190. aligner_ffn_mult=4,
  191. aligner_enable_bias=True,
  192. aligner_attention_probs_dropout_prob=0.1,
  193. aligner_num_add_layers=8,
  194. resampler_depth=6,
  195. resampler_dim_head=64,
  196. resampler_heads=8,
  197. resampler_num_latents=64,
  198. resampler_ff_mult=4,
  199. initializer_range=0.02,
  200. pad_token_id=None,
  201. bos_token_id=128000,
  202. eos_token_id=128009,
  203. use_cache=False,
  204. tie_word_embeddings=False,
  205. **kwargs,
  206. ):
  207. self.vocab_size = vocab_size
  208. self.hidden_size = hidden_size
  209. self.intermediate_size = intermediate_size
  210. self.num_hidden_layers = num_hidden_layers
  211. self.num_attention_heads = num_attention_heads
  212. self.num_key_value_heads = num_key_value_heads
  213. self.hidden_act = hidden_act
  214. self.max_position_embeddings = max_position_embeddings
  215. self.rms_norm_eps = rms_norm_eps
  216. self.tie_word_embeddings = tie_word_embeddings
  217. self.attention_bias = attention_bias
  218. self.attention_dropout = attention_dropout
  219. self.mlp_bias = mlp_bias
  220. self.aligner_ffn_mult = aligner_ffn_mult
  221. self.aligner_enable_bias = aligner_enable_bias
  222. self.aligner_attention_probs_dropout_prob = aligner_attention_probs_dropout_prob
  223. self.aligner_num_add_layers = aligner_num_add_layers
  224. self.use_cache = use_cache
  225. self.initializer_range = initializer_range
  226. self.resampler_depth = resampler_depth
  227. self.resampler_dim_head = resampler_dim_head
  228. self.resampler_heads = resampler_heads
  229. self.resampler_num_latents = resampler_num_latents
  230. self.resampler_ff_mult = resampler_ff_mult
  231. self.rope_theta = rope_theta
  232. self.rope_scaling = rope_scaling
  233. # Validate the correctness of rotary position embeddings parameters
  234. # BC: if there is a 'type' field, copy it it to 'rope_type'.
  235. if self.rope_scaling is not None and "type" in self.rope_scaling:
  236. self.rope_scaling["rope_type"] = self.rope_scaling["type"]
  237. rope_config_validation(self)
  238. # Subconfig
  239. if protein_encoder_config is None:
  240. protein_encoder_config = {}
  241. logger.info("`protein_encoder_config` is `None`. Initializing the `SaProtConfig` with default values.")
  242. self.protein_encoder_config = SaProtConfig(**protein_encoder_config)
  243. super().__init__(
  244. pad_token_id=pad_token_id,
  245. bos_token_id=bos_token_id,
  246. eos_token_id=eos_token_id,
  247. tie_word_embeddings=tie_word_embeddings,
  248. **kwargs,
  249. )
# Public names exported by this module. Only `EvollaConfig` is listed; `SaProtConfig` is not exported here
# and is referenced through `EvollaConfig.sub_configs`.
__all__ = ["EvollaConfig"]