configuration_codegen.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
# coding=utf-8
# Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  15. """CodeGen model configuration"""
  16. from collections import OrderedDict
  17. from collections.abc import Mapping
  18. from typing import Any, Optional
  19. from ... import PreTrainedTokenizer, TensorType, is_torch_available
  20. from ...configuration_utils import PretrainedConfig
  21. from ...onnx import OnnxConfigWithPast, PatchingSpec
  22. from ...utils import logging
# Module-level logger, obtained from the package's `logging` utilities and named after this module.
logger = logging.get_logger(__name__)
  24. class CodeGenConfig(PretrainedConfig):
  25. r"""
  26. This is the configuration class to store the configuration of a [`CodeGenModel`]. It is used to instantiate a
  27. CodeGen model according to the specified arguments, defining the model architecture. Instantiating a configuration
  28. with the defaults will yield a similar configuration to that of the CodeGen
  29. [Salesforce/codegen-2B-mono](https://huggingface.co/Salesforce/codegen-2B-mono) architecture. Configuration objects
  30. inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
  31. [`PretrainedConfig`] for more information.
  32. Args:
  33. vocab_size (`int`, *optional*, defaults to 50400):
  34. Vocabulary size of the CodeGen model. Defines the number of different tokens that can be represented by the
  35. `inputs_ids` passed when calling [`CodeGenModel`].
  36. n_positions (`int`, *optional*, defaults to 2048):
  37. The maximum sequence length that this model might ever be used with. Typically set this to something large
  38. just in case (e.g., 512 or 1024 or 2048).
  39. n_ctx (`int`, *optional*, defaults to 2048):
  40. This attribute is used in `CodeGenModel.__init__` without any real effect.
  41. n_embd (`int`, *optional*, defaults to 4096):
  42. Dimensionality of the embeddings and hidden states.
  43. n_layer (`int`, *optional*, defaults to 28):
  44. Number of hidden layers in the Transformer encoder.
  45. n_head (`int`, *optional*, defaults to 16):
  46. Number of attention heads for each attention layer in the Transformer encoder.
  47. rotary_dim (`int`, *optional*, defaults to 64):
  48. Number of dimensions in the embedding that Rotary Position Embedding is applied to.
  49. n_inner (`int`, *optional*):
  50. Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
  51. activation_function (`str`, *optional*, defaults to `"gelu_new"`):
  52. Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
  53. resid_pdrop (`float`, *optional*, defaults to 0.0):
  54. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  55. embd_pdrop (`int`, *optional*, defaults to 0.0):
  56. The dropout ratio for the embeddings.
  57. attn_pdrop (`float`, *optional*, defaults to 0.0):
  58. The dropout ratio for the attention.
  59. layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
  60. The epsilon to use in the layer normalization layers.
  61. initializer_range (`float`, *optional*, defaults to 0.02):
  62. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  63. use_cache (`bool`, *optional*, defaults to `True`):
  64. Whether or not the model should return the last key/values attentions (not used by all models).
  65. bos_token_id (`int`, *optional*, defaults to 50256):
  66. Beginning of stream token id.
  67. eos_token_id (`int`, *optional*, defaults to 50256):
  68. End of stream token id.
  69. tie_word_embeddings (`bool`, *optional*, defaults to `False`):
  70. Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
  71. model has a output word embedding layer.
  72. Example:
  73. ```python
  74. >>> from transformers import CodeGenConfig, CodeGenModel
  75. >>> # Initializing a CodeGen 6B configuration
  76. >>> configuration = CodeGenConfig()
  77. >>> # Initializing a model (with random weights) from the configuration
  78. >>> model = CodeGenModel(configuration)
  79. >>> # Accessing the model configuration
  80. >>> configuration = model.config
  81. ```"""
  82. model_type = "codegen"
  83. attribute_map = {
  84. "max_position_embeddings": "n_positions",
  85. "hidden_size": "n_embd",
  86. "num_attention_heads": "n_head",
  87. "num_hidden_layers": "n_layer",
  88. }
  89. def __init__(
  90. self,
  91. vocab_size=50400,
  92. n_positions=2048,
  93. n_ctx=2048,
  94. n_embd=4096,
  95. n_layer=28,
  96. n_head=16,
  97. rotary_dim=64,
  98. n_inner=None,
  99. activation_function="gelu_new",
  100. resid_pdrop=0.0,
  101. embd_pdrop=0.0,
  102. attn_pdrop=0.0,
  103. layer_norm_epsilon=1e-5,
  104. initializer_range=0.02,
  105. use_cache=True,
  106. bos_token_id=50256,
  107. eos_token_id=50256,
  108. tie_word_embeddings=False,
  109. **kwargs,
  110. ):
  111. self.vocab_size = vocab_size
  112. self.n_ctx = n_ctx
  113. self.n_positions = n_positions
  114. self.n_embd = n_embd
  115. self.n_layer = n_layer
  116. self.n_head = n_head
  117. self.n_inner = n_inner
  118. self.rotary_dim = rotary_dim
  119. self.activation_function = activation_function
  120. self.resid_pdrop = resid_pdrop
  121. self.embd_pdrop = embd_pdrop
  122. self.attn_pdrop = attn_pdrop
  123. self.layer_norm_epsilon = layer_norm_epsilon
  124. self.initializer_range = initializer_range
  125. self.use_cache = use_cache
  126. self.bos_token_id = bos_token_id
  127. self.eos_token_id = eos_token_id
  128. super().__init__(
  129. bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
  130. )
  131. # Copied from transformers.models.gpt2.configuration_gpt2.GPT2OnnxConfig
  132. class CodeGenOnnxConfig(OnnxConfigWithPast):
  133. def __init__(
  134. self,
  135. config: PretrainedConfig,
  136. task: str = "default",
  137. patching_specs: Optional[list[PatchingSpec]] = None,
  138. use_past: bool = False,
  139. ):
  140. super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
  141. if not getattr(self._config, "pad_token_id", None):
  142. # TODO: how to do that better?
  143. self._config.pad_token_id = 0
  144. @property
  145. def inputs(self) -> Mapping[str, Mapping[int, str]]:
  146. common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
  147. if self.use_past:
  148. self.fill_with_past_key_values_(common_inputs, direction="inputs")
  149. common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
  150. else:
  151. common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
  152. return common_inputs
  153. @property
  154. def num_layers(self) -> int:
  155. return self._config.n_layer
  156. @property
  157. def num_attention_heads(self) -> int:
  158. return self._config.n_head
  159. def generate_dummy_inputs(
  160. self,
  161. tokenizer: PreTrainedTokenizer,
  162. batch_size: int = -1,
  163. seq_length: int = -1,
  164. is_pair: bool = False,
  165. framework: Optional[TensorType] = None,
  166. ) -> Mapping[str, Any]:
  167. common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
  168. tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
  169. )
  170. # We need to order the input in the way they appears in the forward()
  171. ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
  172. # Need to add the past_keys
  173. if self.use_past:
  174. if not is_torch_available():
  175. raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
  176. else:
  177. import torch
  178. batch, seqlen = common_inputs["input_ids"].shape
  179. # Not using the same length for past_key_values
  180. past_key_values_length = seqlen + 2
  181. past_shape = (
  182. batch,
  183. self.num_attention_heads,
  184. past_key_values_length,
  185. self._config.hidden_size // self.num_attention_heads,
  186. )
  187. ordered_inputs["past_key_values"] = [
  188. (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
  189. ]
  190. ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
  191. if self.use_past:
  192. mask_dtype = ordered_inputs["attention_mask"].dtype
  193. ordered_inputs["attention_mask"] = torch.cat(
  194. [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
  195. )
  196. return ordered_inputs
  197. @property
  198. def default_onnx_opset(self) -> int:
  199. return 13
# Names exported by `from <this module> import *`.
__all__ = ["CodeGenConfig", "CodeGenOnnxConfig"]