configuration_pvt.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # coding=utf-8
  2. # Copyright 2023 Authors: Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan,
  3. # Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao and The HuggingFace Inc. team.
  4. # All rights reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. """Pvt model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Callable, Union

from packaging import version

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
# Module-level logger, named after this module via `__name__`.
logger = logging.get_logger(__name__)
  26. class PvtConfig(PretrainedConfig):
  27. r"""
  28. This is the configuration class to store the configuration of a [`PvtModel`]. It is used to instantiate an Pvt
  29. model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
  30. defaults will yield a similar configuration to that of the Pvt
  31. [Xrenya/pvt-tiny-224](https://huggingface.co/Xrenya/pvt-tiny-224) architecture.
  32. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  33. documentation from [`PretrainedConfig`] for more information.
  34. Args:
  35. image_size (`int`, *optional*, defaults to 224):
  36. The input image size
  37. num_channels (`int`, *optional*, defaults to 3):
  38. The number of input channels.
  39. num_encoder_blocks (`int`, *optional*, defaults to 4):
  40. The number of encoder blocks (i.e. stages in the Mix Transformer encoder).
  41. depths (`list[int]`, *optional*, defaults to `[2, 2, 2, 2]`):
  42. The number of layers in each encoder block.
  43. sequence_reduction_ratios (`list[int]`, *optional*, defaults to `[8, 4, 2, 1]`):
  44. Sequence reduction ratios in each encoder block.
  45. hidden_sizes (`list[int]`, *optional*, defaults to `[64, 128, 320, 512]`):
  46. Dimension of each of the encoder blocks.
  47. patch_sizes (`list[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
  48. Patch size before each encoder block.
  49. strides (`list[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
  50. Stride before each encoder block.
  51. num_attention_heads (`list[int]`, *optional*, defaults to `[1, 2, 5, 8]`):
  52. Number of attention heads for each attention layer in each block of the Transformer encoder.
  53. mlp_ratios (`list[int]`, *optional*, defaults to `[8, 8, 4, 4]`):
  54. Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
  55. encoder blocks.
  56. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
  57. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  58. `"relu"`, `"selu"` and `"gelu_new"` are supported.
  59. hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
  60. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  61. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
  62. The dropout ratio for the attention probabilities.
  63. initializer_range (`float`, *optional*, defaults to 0.02):
  64. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  65. drop_path_rate (`float`, *optional*, defaults to 0.0):
  66. The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
  67. layer_norm_eps (`float`, *optional*, defaults to 1e-06):
  68. The epsilon used by the layer normalization layers.
  69. qkv_bias (`bool`, *optional*, defaults to `True`):
  70. Whether or not a learnable bias should be added to the queries, keys and values.
  71. num_labels ('int', *optional*, defaults to 1000):
  72. The number of classes.
  73. Example:
  74. ```python
  75. >>> from transformers import PvtModel, PvtConfig
  76. >>> # Initializing a PVT Xrenya/pvt-tiny-224 style configuration
  77. >>> configuration = PvtConfig()
  78. >>> # Initializing a model from the Xrenya/pvt-tiny-224 style configuration
  79. >>> model = PvtModel(configuration)
  80. >>> # Accessing the model configuration
  81. >>> configuration = model.config
  82. ```"""
  83. model_type = "pvt"
  84. def __init__(
  85. self,
  86. image_size: int = 224,
  87. num_channels: int = 3,
  88. num_encoder_blocks: int = 4,
  89. depths: list[int] = [2, 2, 2, 2],
  90. sequence_reduction_ratios: list[int] = [8, 4, 2, 1],
  91. hidden_sizes: list[int] = [64, 128, 320, 512],
  92. patch_sizes: list[int] = [4, 2, 2, 2],
  93. strides: list[int] = [4, 2, 2, 2],
  94. num_attention_heads: list[int] = [1, 2, 5, 8],
  95. mlp_ratios: list[int] = [8, 8, 4, 4],
  96. hidden_act: Mapping[str, Callable] = "gelu",
  97. hidden_dropout_prob: float = 0.0,
  98. attention_probs_dropout_prob: float = 0.0,
  99. initializer_range: float = 0.02,
  100. drop_path_rate: float = 0.0,
  101. layer_norm_eps: float = 1e-6,
  102. qkv_bias: bool = True,
  103. num_labels: int = 1000,
  104. **kwargs,
  105. ):
  106. super().__init__(**kwargs)
  107. self.image_size = image_size
  108. self.num_channels = num_channels
  109. self.num_encoder_blocks = num_encoder_blocks
  110. self.depths = depths
  111. self.sequence_reduction_ratios = sequence_reduction_ratios
  112. self.hidden_sizes = hidden_sizes
  113. self.patch_sizes = patch_sizes
  114. self.strides = strides
  115. self.mlp_ratios = mlp_ratios
  116. self.num_attention_heads = num_attention_heads
  117. self.hidden_act = hidden_act
  118. self.hidden_dropout_prob = hidden_dropout_prob
  119. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  120. self.initializer_range = initializer_range
  121. self.drop_path_rate = drop_path_rate
  122. self.layer_norm_eps = layer_norm_eps
  123. self.num_labels = num_labels
  124. self.qkv_bias = qkv_bias
  125. class PvtOnnxConfig(OnnxConfig):
  126. torch_onnx_minimum_version = version.parse("1.11")
  127. @property
  128. def inputs(self) -> Mapping[str, Mapping[int, str]]:
  129. return OrderedDict(
  130. [
  131. ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
  132. ]
  133. )
  134. @property
  135. def atol_for_validation(self) -> float:
  136. return 1e-4
  137. @property
  138. def default_onnx_opset(self) -> int:
  139. return 12
# Public names of this module (what `from ... import *` exposes).
__all__ = ["PvtConfig", "PvtOnnxConfig"]