configuration.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
  2. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. import copy
  17. import json
  18. from transformers import PretrainedConfig
  19. from modelscope.utils import logger as logging
  20. logger = logging.get_logger()
  21. class PlugNLUConfig(PretrainedConfig):
  22. model_type = 'plugNLU'
  23. def __init__(self,
  24. vocab_size=21504,
  25. original_vocab_size=21128,
  26. hidden_size=8192,
  27. num_hidden_layers=24,
  28. num_attention_heads=128,
  29. intermediate_size=32768,
  30. hidden_act='gelu',
  31. hidden_dropout_prob=0.1,
  32. attention_probs_dropout_prob=0.1,
  33. max_position_embeddings=2048,
  34. type_vocab_size=3,
  35. initializer_range=0.00707,
  36. lr_decay_style='linear',
  37. weight_decay=1e-2,
  38. clip_grad=1.0,
  39. warmup=0.0333,
  40. pre_ln=True,
  41. fp16=True,
  42. fp32_layernorm=True,
  43. fp32_embedding=False,
  44. fp32_tokentypes=False,
  45. layernorm_epsilon=1e-5,
  46. dec_hidden_layers=6,
  47. attn_separate=False,
  48. **kwargs):
  49. super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)
  50. self.vocab_size = vocab_size
  51. self.original_vocab_size = original_vocab_size
  52. self.hidden_size = hidden_size
  53. self.num_hidden_layers = num_hidden_layers
  54. self.num_attention_heads = num_attention_heads
  55. self.hidden_act = hidden_act
  56. self.intermediate_size = intermediate_size
  57. self.hidden_dropout_prob = hidden_dropout_prob
  58. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  59. self.max_position_embeddings = max_position_embeddings
  60. self.type_vocab_size = type_vocab_size
  61. self.initializer_range = initializer_range
  62. self.lr_decay_style = lr_decay_style
  63. self.weight_decay = weight_decay
  64. self.clip_grad = clip_grad
  65. self.warmup = warmup
  66. self.pre_ln = pre_ln
  67. self.fp16 = fp16
  68. self.fp32_layernorm = fp32_layernorm
  69. self.fp32_embedding = fp32_embedding
  70. self.layernorm_epsilon = layernorm_epsilon
  71. self.fp32_tokentypes = fp32_tokentypes
  72. self.dec_hidden_layers = dec_hidden_layers
  73. self.attn_separate = attn_separate
  74. @classmethod
  75. def from_dict(cls, json_object):
  76. """Constructs a `BertConfig` from a Python dictionary of parameters."""
  77. config = PlugNLUConfig()
  78. for key, value in json_object.items():
  79. config.__dict__[key] = value
  80. return config
  81. @classmethod
  82. def from_json_file(cls, json_file):
  83. """Constructs a `BertConfig` from a json file of parameters."""
  84. with open(json_file, 'r', encoding='utf-8') as reader:
  85. text = reader.read()
  86. return cls.from_dict(json.loads(text))
  87. def merge_args(self, args):
  88. """merge values a `BertConfig` from a json file of parameters."""
  89. local_keys = self.__dict__.keys()
  90. for key, value in args.__dict__.items():
  91. if key in local_keys:
  92. continue
  93. self.__dict__[key] = value
  94. return self
  95. def __repr__(self):
  96. return str(self.to_json_string())
  97. def to_dict(self):
  98. """Serializes this instance to a Python dictionary."""
  99. output = copy.deepcopy(self.__dict__)
  100. return output
  101. def to_json_string(self):
  102. """Serializes this instance to a JSON string."""
  103. return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'
  104. class PlugNLGConfig(PlugNLUConfig):
  105. """
  106. This is the configuration class to store the configuration of a [`PlugModel`]. It is used to instantiate a
  107. PLUG understanding model according to the specified arguments, defining the model architecture. Instantiating a
  108. configuration with the defaults will yield a similar configuration to that of the PLUG
  109. [PLUG](https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary) architecture.
  110. Configuration objects inherit from [`PlugNLUConfig`] and can be used to control the model outputs. Read the
  111. documentation from [`PlugNLUConfig`] for more information.
  112. Args:
  113. vocab_size (`int`, *optional*, defaults to 21504):
  114. Padded vocabulary size of the PLUG model for vocab tensor parallel. Defines the number of different tokens
  115. that can be represented by the `inputs_ids` passed when calling [`PlugModel`].
  116. original_vocab_size (`int`, *optional*, defaults to 21128):
  117. True vocabulary size of the PLUG model. Defines the number of different tokens that can be represented.
  118. hidden_size (`int`, *optional*, defaults to 8192):
  119. Dimensionality of the encoder layers and the pooler layer.
  120. num_hidden_layers (`int`, *optional*, defaults to 24):
  121. Number of hidden layers in the Transformer encoder.
  122. dec_hidden_layers (`int`, *optional*, defaults to 6):
  123. Number of hidden layers in the Transformer decoder.
  124. num_attention_heads (`int`, *optional*, defaults to 128):
  125. Number of attention heads for each attention layer in the Transformer encoder.
  126. intermediate_size (`int`, *optional*, defaults to 32768):
  127. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
  128. hidden_act (`str`, *optional*, defaults to `"gelu"`):
  129. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  130. `"relu"`, `"selu"` and `"gelu_new"` are supported.
  131. hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
  132. The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler.
  133. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
  134. The dropout ratio for the Transformer Attention.
  135. max_position_embeddings (`int`, *optional*, defaults to 2048):
  136. The maximum sequence length that this model might ever be used with. Typically set this to something large
  137. just in case (e.g., 512 or 1024 or 2048).
  138. type_vocab_size (`int`, *optional*, defaults to 3):
  139. The vocabulary size of the `token_type_ids` passed when calling [`PlugModel`].
  140. initializer_range (`float`, *optional*, defaults to 0.00707):
  141. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  142. lr_decay_style (`str`, *optional*, defaults to 'linear'):
  143. The decay style of learning rate during fine-tunining. If string, `"linear"`, `"cosine"`, `"exponential"`,
  144. `"constant"`, `"None"` are supported.
  145. weight_decay (`float`, *optional*, defaults to 1e-2):
  146. Decoupled weight decay to apply.
  147. clip_grad (`float`, *optional*, defaults to 1.0):
  148. Maximum gradient norm for gradient clipping.
  149. warmup (`float`, *optional*, defaults to 0.01):
  150. Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
  151. pre_ln (`boolean`, *optional*, defaults to `True`):
  152. Whether or not to apply LayerNorm to the input instead of the output in the blocks.
  153. fp16 (`boolean`, *optional*, defaults to `True`):
  154. Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
  155. fp32_layernorm (`boolean`, *optional*, defaults to `True`):
  156. Whether to use fp32 32-bit precision LayerNorm training while the argument `fp16` set to `True`.
  157. fp32_embedding (`boolean`, *optional*, defaults to `False`):
  158. Whether to use fp32 32-bit precision Embedding training while the argument `fp16` set to `True`.
  159. fp32_tokentypes (`boolean`, *optional*, defaults to `False`):
  160. Whether to use fp32 32-bit precision token types training while the argument `fp16` set to `True`.
  161. layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
  162. The epsilon to use in the layer normalization layers.
  163. attn_separate (`boolean`, *optional*, defaults to `False`):
  164. Whether or not to separate query-key-value to query, key, value in the Attention.
  165. Example:
  166. >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. The example given
  167. >>> # here only initializes a slice of the model on a single GPU.
  168. >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model.
  169. >>> from modelscope.models.nlp.plug import PlugNLGConfig, PlugModel
  170. >>> # Initializing a Plug configuration
  171. >>> configuration = PlugNLGConfig()
  172. >>> # Initializing a model from the configuration
  173. >>> model = PlugModel(configuration)
  174. >>> # Accessing the model configuration
  175. >>> configuration = model.config
  176. """
  177. model_type = 'plugNLG'
  178. def __init__(self,
  179. vocab_size=21504,
  180. original_vocab_size=21128,
  181. hidden_size=8192,
  182. num_hidden_layers=24,
  183. dec_hidden_layers=6,
  184. num_attention_heads=128,
  185. intermediate_size=32768,
  186. hidden_act='gelu',
  187. hidden_dropout_prob=0.1,
  188. attention_probs_dropout_prob=0.1,
  189. max_position_embeddings=2048,
  190. type_vocab_size=3,
  191. initializer_range=0.00707,
  192. lr_decay_style='linear',
  193. weight_decay=1e-2,
  194. clip_grad=1.0,
  195. warmup=0.01,
  196. pre_ln=True,
  197. fp16=True,
  198. fp32_layernorm=True,
  199. fp32_embedding=False,
  200. fp32_tokentypes=False,
  201. layernorm_epsilon=1e-12,
  202. attn_separate=False,
  203. **kwargs):
  204. super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)
  205. self.vocab_size = vocab_size
  206. self.hidden_size = hidden_size
  207. self.num_hidden_layers = num_hidden_layers
  208. self.num_attention_heads = num_attention_heads
  209. self.hidden_act = hidden_act
  210. self.intermediate_size = intermediate_size
  211. self.hidden_dropout_prob = hidden_dropout_prob
  212. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  213. self.max_position_embeddings = max_position_embeddings
  214. self.type_vocab_size = type_vocab_size
  215. self.initializer_range = initializer_range
  216. self.lr_decay_style = lr_decay_style
  217. self.weight_decay = weight_decay
  218. self.clip_grad = clip_grad
  219. self.warmup = warmup
  220. self.pre_ln = pre_ln
  221. self.fp16 = fp16
  222. self.fp32_layernorm = fp32_layernorm
  223. self.fp32_embedding = fp32_embedding
  224. self.layernorm_epsilon = layernorm_epsilon
  225. self.fp32_tokentypes = fp32_tokentypes
  226. self.dec_hidden_layers = dec_hidden_layers
  227. self.attn_separate = attn_separate