configuration_dpt.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. # coding=utf-8
  2. # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """DPT model configuration"""
  16. import copy
  17. from ...configuration_utils import PretrainedConfig
  18. from ...utils import logging
  19. from ...utils.backbone_utils import verify_backbone_config_arguments
  20. from ..auto.configuration_auto import CONFIG_MAPPING
  21. from ..bit import BitConfig
# Module-level logger, obtained from the library's logging utilities and keyed by this module's name.
logger = logging.get_logger(__name__)
  23. class DPTConfig(PretrainedConfig):
  24. r"""
  25. This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate an DPT
  26. model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
  27. defaults will yield a similar configuration to that of the DPT
  28. [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture.
  29. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  30. documentation from [`PretrainedConfig`] for more information.
  31. Args:
  32. hidden_size (`int`, *optional*, defaults to 768):
  33. Dimensionality of the encoder layers and the pooler layer.
  34. num_hidden_layers (`int`, *optional*, defaults to 12):
  35. Number of hidden layers in the Transformer encoder.
  36. num_attention_heads (`int`, *optional*, defaults to 12):
  37. Number of attention heads for each attention layer in the Transformer encoder.
  38. intermediate_size (`int`, *optional*, defaults to 3072):
  39. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
  40. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
  41. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  42. `"relu"`, `"selu"` and `"gelu_new"` are supported.
  43. hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
  44. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  45. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
  46. The dropout ratio for the attention probabilities.
  47. initializer_range (`float`, *optional*, defaults to 0.02):
  48. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  49. layer_norm_eps (`float`, *optional*, defaults to 1e-12):
  50. The epsilon used by the layer normalization layers.
  51. image_size (`int`, *optional*, defaults to 384):
  52. The size (resolution) of each image.
  53. patch_size (`int`, *optional*, defaults to 16):
  54. The size (resolution) of each patch.
  55. num_channels (`int`, *optional*, defaults to 3):
  56. The number of input channels.
  57. is_hybrid (`bool`, *optional*, defaults to `False`):
  58. Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models.
  59. qkv_bias (`bool`, *optional*, defaults to `True`):
  60. Whether to add a bias to the queries, keys and values.
  61. backbone_out_indices (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
  62. Indices of the intermediate hidden states to use from backbone.
  63. readout_type (`str`, *optional*, defaults to `"project"`):
  64. The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of
  65. the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`].
  66. - "ignore" simply ignores the CLS token.
  67. - "add" passes the information from the CLS token to all other tokens by adding the representations.
  68. - "project" passes information to the other tokens by concatenating the readout to all other tokens before
  69. projecting the
  70. representation to the original feature dimension D using a linear layer followed by a GELU non-linearity.
  71. reassemble_factors (`list[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
  72. The up/downsampling factors of the reassemble layers.
  73. neck_hidden_sizes (`list[str]`, *optional*, defaults to `[96, 192, 384, 768]`):
  74. The hidden sizes to project to for the feature maps of the backbone.
  75. fusion_hidden_size (`int`, *optional*, defaults to 256):
  76. The number of channels before fusion.
  77. head_in_index (`int`, *optional*, defaults to -1):
  78. The index of the features to use in the heads.
  79. use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
  80. Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
  81. use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
  82. Whether to use bias in the pre-activate residual units of the fusion blocks.
  83. add_projection (`bool`, *optional*, defaults to `False`):
  84. Whether to add a projection layer before the depth estimation head.
  85. use_auxiliary_head (`bool`, *optional*, defaults to `True`):
  86. Whether to use an auxiliary head during training.
  87. auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
  88. Weight of the cross-entropy loss of the auxiliary head.
  89. semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
  90. The index that is ignored by the loss function of the semantic segmentation model.
  91. semantic_classifier_dropout (`float`, *optional*, defaults to 0.1):
  92. The dropout ratio for the semantic classification head.
  93. backbone_featmap_shape (`list[int]`, *optional*, defaults to `[1, 1024, 24, 24]`):
  94. Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone.
  95. neck_ignore_stages (`list[int]`, *optional*, defaults to `[0, 1]`):
  96. Used only for the `hybrid` embedding type. The stages of the readout layers to ignore.
  97. backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
  98. The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
  99. leverage the [`AutoBackbone`] API.
  100. backbone (`str`, *optional*):
  101. Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
  102. will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
  103. is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
  104. use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
  105. Whether to use pretrained weights for the backbone.
  106. use_timm_backbone (`bool`, *optional*, defaults to `False`):
  107. Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
  108. library.
  109. backbone_kwargs (`dict`, *optional*):
  110. Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
  111. e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
  112. pooler_output_size (`int`, *optional*):
  113. Dimensionality of the pooler layer. If None, defaults to `hidden_size`.
  114. pooler_act (`str`, *optional*, defaults to `"tanh"`):
  115. The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and
  116. Pytorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are
  117. supported for Tensorflow.
  118. Example:
  119. ```python
  120. >>> from transformers import DPTModel, DPTConfig
  121. >>> # Initializing a DPT dpt-large style configuration
  122. >>> configuration = DPTConfig()
  123. >>> # Initializing a model from the dpt-large style configuration
  124. >>> model = DPTModel(configuration)
  125. >>> # Accessing the model configuration
  126. >>> configuration = model.config
  127. ```"""
  128. model_type = "dpt"
  129. def __init__(
  130. self,
  131. hidden_size=768,
  132. num_hidden_layers=12,
  133. num_attention_heads=12,
  134. intermediate_size=3072,
  135. hidden_act="gelu",
  136. hidden_dropout_prob=0.0,
  137. attention_probs_dropout_prob=0.0,
  138. initializer_range=0.02,
  139. layer_norm_eps=1e-12,
  140. image_size=384,
  141. patch_size=16,
  142. num_channels=3,
  143. is_hybrid=False,
  144. qkv_bias=True,
  145. backbone_out_indices=[2, 5, 8, 11],
  146. readout_type="project",
  147. reassemble_factors=[4, 2, 1, 0.5],
  148. neck_hidden_sizes=[96, 192, 384, 768],
  149. fusion_hidden_size=256,
  150. head_in_index=-1,
  151. use_batch_norm_in_fusion_residual=False,
  152. use_bias_in_fusion_residual=None,
  153. add_projection=False,
  154. use_auxiliary_head=True,
  155. auxiliary_loss_weight=0.4,
  156. semantic_loss_ignore_index=255,
  157. semantic_classifier_dropout=0.1,
  158. backbone_featmap_shape=[1, 1024, 24, 24],
  159. neck_ignore_stages=[0, 1],
  160. backbone_config=None,
  161. backbone=None,
  162. use_pretrained_backbone=False,
  163. use_timm_backbone=False,
  164. backbone_kwargs=None,
  165. pooler_output_size=None,
  166. pooler_act="tanh",
  167. **kwargs,
  168. ):
  169. super().__init__(**kwargs)
  170. self.hidden_size = hidden_size
  171. self.is_hybrid = is_hybrid
  172. use_autobackbone = False
  173. if self.is_hybrid:
  174. if backbone_config is None:
  175. backbone_config = {
  176. "global_padding": "same",
  177. "layer_type": "bottleneck",
  178. "depths": [3, 4, 9],
  179. "out_features": ["stage1", "stage2", "stage3"],
  180. "embedding_dynamic_padding": True,
  181. }
  182. if isinstance(backbone_config, dict):
  183. logger.info("Initializing the config with a `BiT` backbone.")
  184. backbone_config = BitConfig(**backbone_config)
  185. elif not isinstance(backbone_config, PretrainedConfig):
  186. raise ValueError(
  187. f"backbone_config must be a dictionary or a `PretrainedConfig`, got {backbone_config.__class__}."
  188. )
  189. self.backbone_config = backbone_config
  190. self.backbone_featmap_shape = backbone_featmap_shape
  191. self.neck_ignore_stages = neck_ignore_stages
  192. if readout_type != "project":
  193. raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.")
  194. elif backbone is not None or backbone_config is not None:
  195. use_autobackbone = True
  196. if isinstance(backbone_config, dict):
  197. backbone_model_type = backbone_config.get("model_type")
  198. config_class = CONFIG_MAPPING[backbone_model_type]
  199. backbone_config = config_class.from_dict(backbone_config)
  200. self.backbone_config = backbone_config
  201. self.backbone_featmap_shape = None
  202. self.neck_ignore_stages = []
  203. # We only use load_backbone when config.is_hydrid is False
  204. verify_backbone_config_arguments(
  205. use_timm_backbone=use_timm_backbone,
  206. use_pretrained_backbone=use_pretrained_backbone,
  207. backbone=backbone,
  208. backbone_config=backbone_config,
  209. backbone_kwargs=backbone_kwargs,
  210. )
  211. else:
  212. self.backbone_config = None
  213. self.backbone_featmap_shape = None
  214. self.neck_ignore_stages = []
  215. self.backbone = backbone
  216. self.use_pretrained_backbone = use_pretrained_backbone
  217. self.use_timm_backbone = use_timm_backbone
  218. self.backbone_kwargs = backbone_kwargs
  219. # ViT parameters used if not using a hybrid backbone
  220. self.num_hidden_layers = num_hidden_layers
  221. self.num_attention_heads = num_attention_heads
  222. self.intermediate_size = intermediate_size
  223. self.hidden_dropout_prob = hidden_dropout_prob
  224. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  225. self.layer_norm_eps = layer_norm_eps
  226. self.image_size = image_size
  227. self.patch_size = patch_size
  228. self.num_channels = num_channels
  229. self.qkv_bias = qkv_bias
  230. self.use_autobackbone = use_autobackbone
  231. self.backbone_out_indices = None if use_autobackbone else backbone_out_indices
  232. if readout_type not in ["ignore", "add", "project"]:
  233. raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']")
  234. self.hidden_act = hidden_act
  235. self.initializer_range = initializer_range
  236. self.readout_type = readout_type
  237. self.reassemble_factors = reassemble_factors
  238. self.neck_hidden_sizes = neck_hidden_sizes
  239. self.fusion_hidden_size = fusion_hidden_size
  240. self.head_in_index = head_in_index
  241. self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
  242. self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
  243. self.add_projection = add_projection
  244. # auxiliary head attributes (semantic segmentation)
  245. self.use_auxiliary_head = use_auxiliary_head
  246. self.auxiliary_loss_weight = auxiliary_loss_weight
  247. self.semantic_loss_ignore_index = semantic_loss_ignore_index
  248. self.semantic_classifier_dropout = semantic_classifier_dropout
  249. self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size
  250. self.pooler_act = pooler_act
  251. def to_dict(self):
  252. """
  253. Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns:
  254. `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
  255. """
  256. output = copy.deepcopy(self.__dict__)
  257. if output["backbone_config"] is not None:
  258. output["backbone_config"] = self.backbone_config.to_dict()
  259. output["model_type"] = self.__class__.model_type
  260. return output
  261. @property
  262. def sub_configs(self):
  263. return (
  264. {"backbone_config": type(self.backbone_config)}
  265. if getattr(self, "backbone_config", None) is not None
  266. else {}
  267. )
# Public API of this module: only the configuration class is exported.
__all__ = ["DPTConfig"]