configuration.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. """ ChatGLM model configuration """
  2. from transformers.configuration_utils import PretrainedConfig
  3. from modelscope.utils import logger as logging
# Module-level logger, obtained from modelscope's logger helper with its
# default arguments.
logger = logging.get_logger()
  5. class ChatGLMConfig(PretrainedConfig):
  6. r"""
  7. This is the configuration class to store the configuration of a [`~ChatGLMModel`].
  8. It is used to instantiate an ChatGLM model according to the specified arguments, defining the model
  9. architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
  10. the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.
  11. Configuration objects inherit from [`PretrainedConfig`] and can be used
  12. to control the model outputs. Read the documentation from [`PretrainedConfig`]
  13. for more information.
  14. Args:
  15. vocab_size (`int`, *optional*, defaults to 150528):
  16. Vocabulary size of the ChatGLM-6B model.
  17. Defines the number of different tokens that can be represented by the
  18. `inputs_ids` passed when calling [`~ChatGLMModel`] or
  19. [`~TFChatGLMModel`].
  20. hidden_size (`int`, *optional*, defaults to 4096):
  21. Dimension of the encoder layers and the pooler layer.
  22. num_hidden_layers (`int`, *optional*, defaults to 28):
  23. Number of hidden layers in the Transformer encoder.
  24. num_attention_heads (`int`, *optional*, defaults to 32):
  25. Number of attention heads for each attention layer in the Transformer encoder.
  26. inner_hidden_size (`int`, *optional*, defaults to 16384):
  27. Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
  28. max_sequence_length (`int`, *optional*, defaults to 512):
  29. The maximum sequence length that this model might ever be used with.
  30. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
  31. layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
  32. The epsilon used by the layer normalization layers.
  33. use_cache (`bool`, *optional*, defaults to `True`):
  34. Whether the model should return the last key/values attentions (not used by all models).
  35. Example:
  36. ```python
  37. >>> from modelscope.models.nlp.chatglm.configuration import ChatGLMConfig
  38. >>> from modelscope.models.nlp.chatglm.text_generation import ChatGLMModel
  39. >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
  40. >>> configuration = ChatGLMConfig()
  41. >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
  42. >>> model = ChatGLMModel(configuration)
  43. >>> # Accessing the model configuration
  44. >>> configuration = model.config
  45. ```
  46. """
  47. model_type = 'chatglm'
  48. def __init__(self,
  49. vocab_size=150528,
  50. hidden_size=4096,
  51. num_layers=28,
  52. num_attention_heads=32,
  53. layernorm_epsilon=1e-5,
  54. use_cache=False,
  55. bos_token_id=150004,
  56. eos_token_id=150005,
  57. mask_token_id=150000,
  58. gmask_token_id=150001,
  59. pad_token_id=0,
  60. max_sequence_length=2048,
  61. inner_hidden_size=16384,
  62. position_encoding_2d=True,
  63. quantization_bit=0,
  64. pre_seq_len=None,
  65. prefix_projection=False,
  66. **kwargs):
  67. self.num_layers = num_layers
  68. self.vocab_size = vocab_size
  69. self.hidden_size = hidden_size
  70. self.num_attention_heads = num_attention_heads
  71. self.max_sequence_length = max_sequence_length
  72. self.layernorm_epsilon = layernorm_epsilon
  73. self.inner_hidden_size = inner_hidden_size
  74. self.use_cache = use_cache
  75. self.bos_token_id = bos_token_id
  76. self.eos_token_id = eos_token_id
  77. self.pad_token_id = pad_token_id
  78. self.mask_token_id = mask_token_id
  79. self.gmask_token_id = gmask_token_id
  80. self.position_encoding_2d = position_encoding_2d
  81. self.quantization_bit = quantization_bit
  82. self.pre_seq_len = pre_seq_len
  83. self.prefix_projection = prefix_projection
  84. super().__init__(
  85. pad_token_id=pad_token_id,
  86. bos_token_id=bos_token_id,
  87. eos_token_id=eos_token_id,
  88. **kwargs)