configuration_bloom.py 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. # coding=utf-8
  2. # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Bloom configuration"""
  16. from collections import OrderedDict
  17. from collections.abc import Mapping
  18. from typing import TYPE_CHECKING, Any, Optional
  19. from packaging import version
  20. if TYPE_CHECKING:
  21. from ... import PreTrainedTokenizer, TensorType
  22. from ...configuration_utils import PretrainedConfig
  23. from ...onnx import OnnxConfigWithPast, PatchingSpec
  24. from ...utils import is_torch_available, logging
  25. logger = logging.get_logger(__name__)
  26. class BloomConfig(PretrainedConfig):
  27. """
  28. This is the configuration class to store the configuration of a [`BloomModel`]. It is used to instantiate a Bloom
  29. model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
  30. defaults will yield a similar configuration to the Bloom architecture
  31. [bigscience/bloom](https://huggingface.co/bigscience/bloom).
  32. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  33. documentation from [`PretrainedConfig`] for more information.
  34. Args:
  35. vocab_size (`int`, *optional*, defaults to 250880):
  36. Vocabulary size of the Bloom model. Defines the maximum number of different tokens that can be represented
  37. by the `inputs_ids` passed when calling [`BloomModel`]. Check [this
  38. discussion](https://huggingface.co/bigscience/bloom/discussions/120#633d28389addb8530b406c2a) on how the
  39. `vocab_size` has been defined.
  40. hidden_size (`int`, *optional*, defaults to 64):
  41. Dimensionality of the embeddings and hidden states.
  42. n_layer (`int`, *optional*, defaults to 2):
  43. Number of hidden layers in the Transformer encoder.
  44. n_head (`int`, *optional*, defaults to 8):
  45. Number of attention heads for each attention layer in the Transformer encoder.
  46. layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
  47. The epsilon to use in the layer normalization layers.
  48. initializer_range (`float`, *optional*, defaults to 0.02):
  49. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  50. apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
  51. If enabled, use the layer norm of the hidden states as the residual in the transformer blocks
  52. hidden_dropout (`float`, *optional*, defaults to 0.1):
  53. Dropout rate of the dropout function on the bias dropout.
  54. attention_dropout (`float`, *optional*, defaults to 0.1):
  55. Dropout rate applied to the attention probs
  56. use_cache (`bool`, *optional*, defaults to `True`):
  57. Whether or not the model should return the last key/values attentions (not used by all models).
  58. pretraining_tp (`int`, *optional*, defaults to `1`):
  59. Experimental feature. Tensor parallelism rank used during pretraining with Megatron. Please refer to [this
  60. document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
  61. necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
  62. issue](https://github.com/pytorch/pytorch/issues/76232). Note also that this is enabled only when
  63. `slow_but_exact=True`.
  64. slow_but_exact (`bool`, *optional*, defaults to `False`):
  65. Experimental feature. Whether to use slow but exact implementation of the attention mechanism. While
  66. merging the TP rank tensors, due to slicing operations the results may be slightly different between the
  67. model trained on Megatron and our model. Please refer to [this
  68. issue](https://github.com/pytorch/pytorch/issues/76232). A solution to obtain more accurate results is to
  69. enable this feature. Enabling this will hurt the computational time of the inference. Will be probably
  70. resolved in the future once the main model has been fine-tuned with TP_rank=1.
  71. Example:
  72. ```python
  73. >>> from transformers import BloomConfig, BloomModel
  74. >>> # Initializing a Bloom configuration
  75. >>> configuration = BloomConfig()
  76. >>> # Initializing a model (with random weights) from the configuration
  77. >>> model = BloomModel(configuration)
  78. >>> # Accessing the model configuration
  79. >>> configuration = model.config
  80. ```"""
  81. model_type = "bloom"
  82. keys_to_ignore_at_inference = ["past_key_values"]
  83. attribute_map = {
  84. "num_hidden_layers": "n_layer",
  85. "num_attention_heads": "n_head",
  86. }
  87. def __init__(
  88. self,
  89. vocab_size=250880,
  90. hidden_size=64,
  91. n_layer=2,
  92. n_head=8,
  93. layer_norm_epsilon=1e-5,
  94. initializer_range=0.02,
  95. use_cache=True,
  96. bos_token_id=1,
  97. eos_token_id=2,
  98. apply_residual_connection_post_layernorm=False,
  99. hidden_dropout=0.0,
  100. attention_dropout=0.0,
  101. pretraining_tp=1, # TP rank used when training with megatron
  102. slow_but_exact=False,
  103. **kwargs,
  104. ):
  105. self.vocab_size = vocab_size
  106. # Backward compatibility with n_embed kwarg
  107. n_embed = kwargs.pop("n_embed", None)
  108. self.hidden_size = hidden_size if n_embed is None else n_embed
  109. self.n_layer = n_layer
  110. self.n_head = n_head
  111. self.layer_norm_epsilon = layer_norm_epsilon
  112. self.initializer_range = initializer_range
  113. self.use_cache = use_cache
  114. self.pretraining_tp = pretraining_tp
  115. self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
  116. self.hidden_dropout = hidden_dropout
  117. self.attention_dropout = attention_dropout
  118. self.bos_token_id = bos_token_id
  119. self.eos_token_id = eos_token_id
  120. self.slow_but_exact = slow_but_exact
  121. super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
  122. class BloomOnnxConfig(OnnxConfigWithPast):
  123. torch_onnx_minimum_version = version.parse("1.12")
  124. def __init__(
  125. self,
  126. config: PretrainedConfig,
  127. task: str = "default",
  128. patching_specs: Optional[list[PatchingSpec]] = None,
  129. use_past: bool = False,
  130. ):
  131. super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
  132. if not getattr(self._config, "pad_token_id", None):
  133. # TODO: how to do that better?
  134. self._config.pad_token_id = 0
  135. @property
  136. def inputs(self) -> Mapping[str, Mapping[int, str]]:
  137. common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
  138. if self.use_past:
  139. # BLOOM stores values on dynamic axis 2. For more details see: https://github.com/huggingface/transformers/pull/18344
  140. self.fill_with_past_key_values_(common_inputs, direction="inputs", inverted_values_shape=True)
  141. common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
  142. else:
  143. common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
  144. return common_inputs
  145. @property
  146. def num_layers(self) -> int:
  147. return self._config.n_layer
  148. @property
  149. def num_attention_heads(self) -> int:
  150. return self._config.n_head
  151. @property
  152. def atol_for_validation(self) -> float:
  153. return 1e-3
  154. def generate_dummy_inputs(
  155. self,
  156. tokenizer: "PreTrainedTokenizer",
  157. batch_size: int = -1,
  158. seq_length: int = -1,
  159. is_pair: bool = False,
  160. framework: Optional["TensorType"] = None,
  161. ) -> Mapping[str, Any]:
  162. common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
  163. tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
  164. )
  165. # We need to order the input in the way they appears in the forward()
  166. ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
  167. # Need to add the past_keys
  168. if self.use_past:
  169. if not is_torch_available():
  170. raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
  171. else:
  172. import torch
  173. batch, seqlen = common_inputs["input_ids"].shape
  174. # Not using the same length for past_key_values
  175. past_key_values_length = seqlen + 2
  176. head_dim = self._config.hidden_size // self.num_attention_heads
  177. past_key_shape = (
  178. batch * self.num_attention_heads,
  179. head_dim,
  180. past_key_values_length,
  181. )
  182. past_value_shape = (
  183. batch * self.num_attention_heads,
  184. past_key_values_length,
  185. head_dim,
  186. )
  187. ordered_inputs["past_key_values"] = [
  188. (torch.zeros(past_key_shape), torch.zeros(past_value_shape)) for _ in range(self.num_layers)
  189. ]
  190. ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
  191. if self.use_past:
  192. mask_dtype = ordered_inputs["attention_mask"].dtype
  193. ordered_inputs["attention_mask"] = torch.cat(
  194. [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
  195. )
  196. return ordered_inputs
  197. @property
  198. def default_onnx_opset(self) -> int:
  199. return 13
  200. __all__ = ["BloomConfig", "BloomOnnxConfig"]