configuration.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
  2. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """ PEER model configuration """
  17. # NOTE: the import paths below were adjusted to match the local directory layout (csssl_4_15/cssl/) and its environment.
  18. from transformers.configuration_utils import PretrainedConfig
  19. from modelscope.utils import logger as logging
  20. logger = logging.get_logger()
  21. class PeerConfig(PretrainedConfig):
  22. r"""
  23. This is the configuration class to store the configuration of a :class:`~transformers.PeerModel` or a
  24. :class:`~transformers.TFPeerModel`. It is used to instantiate a PEER model according to the specified
  25. arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
  26. configuration to that of the PEER `google/peer-small-discriminator
  27. <https://huggingface.co/google/peer-small-discriminator>`__ architecture.
  28. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
  29. outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
  30. Args:
  31. vocab_size (:obj:`int`, `onal`, defaults to 30522)
  32. Vocabulary size of the PEER model. Defines the number of different tokens that can be represented by the
  33. :obj:`inputs_ids` passed when calling :class:`~transformers.PeerModel` or
  34. :class:`~transformers.TFPeerModel`.
  35. embedding_size (:obj:`int`, `onal`, defaults to 128)
  36. Dimensionality of the encoder layers and the pooler layer.
  37. hidden_size (:obj:`int`, `onal`, defaults to 256)
  38. Dimensionality of the encoder layers and the pooler layer.
  39. num_hidden_layers (:obj:`int`, `onal`, defaults to 12)
  40. Number of hidden layers in the Transformer encoder.
  41. num_attention_heads (:obj:`int`, `onal`, defaults to 4)
  42. Number of attention heads for each attention layer in the Transformer encoder.
  43. intermediate_size (:obj:`int`, `onal`, defaults to 1024)
  44. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
  45. hidden_act (:obj:`str` or :obj:`Callable`, `onal`, defaults to :obj:`"gelu"`)
  46. The non-linear activation function (function or string) in the encoder and pooler. If string,
  47. :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
  48. hidden_dropout_prob (:obj:`float`, `onal`, defaults to 0.1)
  49. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  50. attention_probs_dropout_prob (:obj:`float`, `onal`, defaults to 0.1)
  51. The dropout ratio for the attention probabilities.
  52. max_position_embeddings (:obj:`int`, `onal`, defaults to 512)
  53. The maximum sequence length that this model might ever be used with. Typically set this to something large
  54. just in case (e.g., 512 or 1024 or 2048).
  55. type_vocab_size (:obj:`int`, `onal`, defaults to 2)
  56. The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.PeerModel` or
  57. :class:`~transformers.TFPeerModel`.
  58. initializer_range (:obj:`float`, `onal`, defaults to 0.02)
  59. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  60. layer_norm_eps (:obj:`float`, `onal`, defaults to 1e-12)
  61. The epsilon used by the layer normalization layers.
  62. summary_type (:obj:`str`, `onal`, defaults to :obj:`"first"`)
  63. Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
  64. Has to be one of the following ones
  65. - :obj:`"last"`: Take the last token hidden state (like XLNet).
  66. - :obj:`"first"`: Take the first token hidden state (like BERT).
  67. - :obj:`"mean"`: Take the mean of all tokens hidden states.
  68. - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
  69. - :obj:`"attn"`: Not implemented now, use multi-head attention.
  70. summary_use_proj (:obj:`bool`, `onal`, defaults to :obj:`True`)
  71. Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
  72. Whether or not to add a projection after the vector extraction.
  73. summary_activation (:obj:`str`, `onal`)
  74. Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
  75. Pass :obj:`"gelu"` for a gelu activation to the output, any other value will result in no activation.
  76. summary_last_dropout (:obj:`float`, `onal`, defaults to 0.0)
  77. Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
  78. The dropout ratio to be used after the projection and activation.
  79. position_embedding_type (:obj:`str`, `onal`, defaults to :obj:`"absolute"`)
  80. Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
  81. :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
  82. :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
  83. <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
  84. `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
  85. <https://arxiv.org/abs/2009.13658>`__.
  86. Examples::
  87. >>> from transformers import PeerModel, PeerConfig
  88. >>> # Initializing a PEER peer-base-uncased style configuration
  89. >>> configuration = PeerConfig()
  90. >>> # Initializing a model from the peer-base-uncased style configuration
  91. >>> model = PeerModel(configuration)
  92. >>> # Accessing the model configuration
  93. >>> configuration = model.config
  94. """
  95. model_type = 'peer'
  96. def __init__(self,
  97. vocab_size=30522,
  98. embedding_size=128,
  99. hidden_size=256,
  100. num_hidden_layers=12,
  101. num_hidden_layers_shared=3,
  102. num_hidden_layers_gen=6,
  103. num_attention_heads=4,
  104. intermediate_size=1024,
  105. hidden_act='gelu',
  106. hidden_dropout_prob=0.1,
  107. attention_probs_dropout_prob=0.1,
  108. max_position_embeddings=512,
  109. type_vocab_size=2,
  110. initializer_range=0.02,
  111. layer_norm_eps=1e-12,
  112. summary_type='first',
  113. summary_use_proj=True,
  114. summary_activation='gelu',
  115. summary_last_dropout=0.1,
  116. pad_token_id=0,
  117. position_embedding_type='absolute',
  118. gen_weight=1,
  119. dis_weight=50,
  120. dis_weight_scheduler=1,
  121. augmentation_copies=1,
  122. augmentation_temperature=1,
  123. absolute_position_embedding=1,
  124. relative_position_embedding=32,
  125. seq_side_info_embeddings=0,
  126. cold_start_epochs=1.25,
  127. debug_config=dict(),
  128. rtd_levels=2,
  129. rtd_level_thresholds='',
  130. ranking_start_epoch=1.0,
  131. real_token_rank_for_good_estimate=5,
  132. rank_sampl_prop=0.3,
  133. rank_sampl_range=100,
  134. rank_delta_factor=0.0,
  135. rank_level_compare_method=0,
  136. weight_loss_low_levels=1.0,
  137. weight_loss_low_levels_setting='1.0-1.0',
  138. weight_loss_low_levels_scheduler=0,
  139. weight_loss_level_compos=1,
  140. mask_da=0,
  141. mask_da_start_epoch=0.0,
  142. mask_da_mlm_topk_val=0,
  143. mask_ratio_setting='0.15-0.15',
  144. mask_ratio_scheduler=0,
  145. mask_ratio_stage1_epochs=0.0,
  146. **kwargs):
  147. super().__init__(pad_token_id=pad_token_id, **kwargs)
  148. self.vocab_size = vocab_size
  149. self.embedding_size = embedding_size
  150. self.hidden_size = hidden_size
  151. self.num_hidden_layers = num_hidden_layers
  152. self.num_hidden_layers_shared = num_hidden_layers_shared
  153. self.num_hidden_layers_gen = num_hidden_layers_gen
  154. self.num_attention_heads = num_attention_heads
  155. self.intermediate_size = intermediate_size
  156. self.hidden_act = hidden_act
  157. self.hidden_dropout_prob = hidden_dropout_prob
  158. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  159. self.max_position_embeddings = max_position_embeddings
  160. self.type_vocab_size = type_vocab_size
  161. self.initializer_range = initializer_range
  162. self.layer_norm_eps = layer_norm_eps
  163. self.summary_type = summary_type
  164. self.summary_use_proj = summary_use_proj
  165. self.summary_activation = summary_activation
  166. self.summary_last_dropout = summary_last_dropout
  167. if type(position_embedding_type) == str:
  168. position_embedding_type = position_embedding_type.split('+')
  169. self.position_embedding_type = position_embedding_type
  170. self.augmentation_temperature = augmentation_temperature
  171. self.gen_weight = gen_weight
  172. self.dis_weight = dis_weight
  173. self.dis_weight_scheduler = dis_weight_scheduler
  174. self.augmentation_copies = augmentation_copies
  175. self.absolute_position_embedding = absolute_position_embedding
  176. self.relative_position_embedding = relative_position_embedding
  177. self.seq_side_info_embeddings = seq_side_info_embeddings
  178. self.cold_start_epochs = cold_start_epochs
  179. self.debug_config = debug_config
  180. self.rtd_levels = rtd_levels
  181. self.rtd_level_thresholds = rtd_level_thresholds
  182. self.ranking_start_epoch = ranking_start_epoch
  183. self.real_token_rank_for_good_estimate = real_token_rank_for_good_estimate
  184. self.rank_sampl_prop = rank_sampl_prop
  185. self.rank_sampl_range = rank_sampl_range
  186. self.rank_delta_factor = rank_delta_factor
  187. self.rank_level_compare_method = rank_level_compare_method
  188. self.weight_loss_low_levels = weight_loss_low_levels
  189. self.weight_loss_low_levels_setting = weight_loss_low_levels_setting
  190. self.weight_loss_low_levels_scheduler = weight_loss_low_levels_scheduler
  191. self.weight_loss_level_compos = weight_loss_level_compos
  192. self.mask_da = mask_da
  193. self.mask_da_start_epoch = mask_da_start_epoch
  194. self.mask_da_mlm_topk_val = mask_da_mlm_topk_val
  195. self.mask_ratio_setting = mask_ratio_setting
  196. self.mask_ratio_scheduler = mask_ratio_scheduler
  197. self.mask_ratio_stage1_epochs = mask_ratio_stage1_epochs