configuration_zoedepth.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. # coding=utf-8
  2. # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """ZoeDepth model configuration"""
  16. from ...configuration_utils import PretrainedConfig
  17. from ...utils import logging
  18. from ..auto.configuration_auto import CONFIG_MAPPING
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Maps a checkpoint identifier to the URL of its hosted `config.json`.
ZOEDEPTH_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Intel/zoedepth-nyu": "https://huggingface.co/Intel/zoedepth-nyu/resolve/main/config.json",
}
  23. class ZoeDepthConfig(PretrainedConfig):
  24. r"""
  25. This is the configuration class to store the configuration of a [`ZoeDepthForDepthEstimation`]. It is used to instantiate an ZoeDepth
  26. model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
  27. defaults will yield a similar configuration to that of the ZoeDepth
  28. [Intel/zoedepth-nyu](https://huggingface.co/Intel/zoedepth-nyu) architecture.
  29. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  30. documentation from [`PretrainedConfig`] for more information.
  31. Args:
  32. backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*, defaults to `BeitConfig()`):
  33. The configuration of the backbone model.
  34. backbone (`str`, *optional*):
  35. Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
  36. will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
  37. is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
  38. use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
  39. Whether to use pretrained weights for the backbone.
  40. backbone_kwargs (`dict`, *optional*):
  41. Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
  42. e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
  43. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
  44. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  45. `"relu"`, `"selu"` and `"gelu_new"` are supported.
  46. initializer_range (`float`, *optional*, defaults to 0.02):
  47. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  48. batch_norm_eps (`float`, *optional*, defaults to 1e-05):
  49. The epsilon used by the batch normalization layers.
  50. readout_type (`str`, *optional*, defaults to `"project"`):
  51. The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of
  52. the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`].
  53. - "ignore" simply ignores the CLS token.
  54. - "add" passes the information from the CLS token to all other tokens by adding the representations.
  55. - "project" passes information to the other tokens by concatenating the readout to all other tokens before
  56. projecting the
  57. representation to the original feature dimension D using a linear layer followed by a GELU non-linearity.
  58. reassemble_factors (`list[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
  59. The up/downsampling factors of the reassemble layers.
  60. neck_hidden_sizes (`list[str]`, *optional*, defaults to `[96, 192, 384, 768]`):
  61. The hidden sizes to project to for the feature maps of the backbone.
  62. fusion_hidden_size (`int`, *optional*, defaults to 256):
  63. The number of channels before fusion.
  64. head_in_index (`int`, *optional*, defaults to -1):
  65. The index of the features to use in the heads.
  66. use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
  67. Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
  68. use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
  69. Whether to use bias in the pre-activate residual units of the fusion blocks.
  70. num_relative_features (`int`, *optional*, defaults to 32):
  71. The number of features to use in the relative depth estimation head.
  72. add_projection (`bool`, *optional*, defaults to `False`):
  73. Whether to add a projection layer before the depth estimation head.
  74. bottleneck_features (`int`, *optional*, defaults to 256):
  75. The number of features in the bottleneck layer.
  76. num_attractors (`list[int], *optional*, defaults to `[16, 8, 4, 1]`):
  77. The number of attractors to use in each stage.
  78. bin_embedding_dim (`int`, *optional*, defaults to 128):
  79. The dimension of the bin embeddings.
  80. attractor_alpha (`int`, *optional*, defaults to 1000):
  81. The alpha value to use in the attractor.
  82. attractor_gamma (`int`, *optional*, defaults to 2):
  83. The gamma value to use in the attractor.
  84. attractor_kind (`str`, *optional*, defaults to `"mean"`):
  85. The kind of attractor to use. Can be one of [`"mean"`, `"sum"`].
  86. min_temp (`float`, *optional*, defaults to 0.0212):
  87. The minimum temperature value to consider.
  88. max_temp (`float`, *optional*, defaults to 50.0):
  89. The maximum temperature value to consider.
  90. bin_centers_type (`str`, *optional*, defaults to `"softplus"`):
  91. Activation type used for bin centers. Can be "normed" or "softplus". For "normed" bin centers, linear normalization trick
  92. is applied. This results in bounded bin centers. For "softplus", softplus activation is used and thus are unbounded.
  93. bin_configurations (`list[dict]`, *optional*, defaults to `[{'n_bins': 64, 'min_depth': 0.001, 'max_depth': 10.0}]`):
  94. Configuration for each of the bin heads.
  95. Each configuration should consist of the following keys:
  96. - name (`str`): The name of the bin head - only required in case of multiple bin configurations.
  97. - `n_bins` (`int`): The number of bins to use.
  98. - `min_depth` (`float`): The minimum depth value to consider.
  99. - `max_depth` (`float`): The maximum depth value to consider.
  100. In case only a single configuration is passed, the model will use a single head with the specified configuration.
  101. In case multiple configurations are passed, the model will use multiple heads with the specified configurations.
  102. num_patch_transformer_layers (`int`, *optional*):
  103. The number of transformer layers to use in the patch transformer. Only used in case of multiple bin configurations.
  104. patch_transformer_hidden_size (`int`, *optional*):
  105. The hidden size to use in the patch transformer. Only used in case of multiple bin configurations.
  106. patch_transformer_intermediate_size (`int`, *optional*):
  107. The intermediate size to use in the patch transformer. Only used in case of multiple bin configurations.
  108. patch_transformer_num_attention_heads (`int`, *optional*):
  109. The number of attention heads to use in the patch transformer. Only used in case of multiple bin configurations.
  110. Example:
  111. ```python
  112. >>> from transformers import ZoeDepthConfig, ZoeDepthForDepthEstimation
  113. >>> # Initializing a ZoeDepth zoedepth-large style configuration
  114. >>> configuration = ZoeDepthConfig()
  115. >>> # Initializing a model from the zoedepth-large style configuration
  116. >>> model = ZoeDepthForDepthEstimation(configuration)
  117. >>> # Accessing the model configuration
  118. >>> configuration = model.config
  119. ```"""
  120. model_type = "zoedepth"
  121. def __init__(
  122. self,
  123. backbone_config=None,
  124. backbone=None,
  125. use_pretrained_backbone=False,
  126. backbone_kwargs=None,
  127. hidden_act="gelu",
  128. initializer_range=0.02,
  129. batch_norm_eps=1e-05,
  130. readout_type="project",
  131. reassemble_factors=[4, 2, 1, 0.5],
  132. neck_hidden_sizes=[96, 192, 384, 768],
  133. fusion_hidden_size=256,
  134. head_in_index=-1,
  135. use_batch_norm_in_fusion_residual=False,
  136. use_bias_in_fusion_residual=None,
  137. num_relative_features=32,
  138. add_projection=False,
  139. bottleneck_features=256,
  140. num_attractors=[16, 8, 4, 1],
  141. bin_embedding_dim=128,
  142. attractor_alpha=1000,
  143. attractor_gamma=2,
  144. attractor_kind="mean",
  145. min_temp=0.0212,
  146. max_temp=50.0,
  147. bin_centers_type="softplus",
  148. bin_configurations=[{"n_bins": 64, "min_depth": 0.001, "max_depth": 10.0}],
  149. num_patch_transformer_layers=None,
  150. patch_transformer_hidden_size=None,
  151. patch_transformer_intermediate_size=None,
  152. patch_transformer_num_attention_heads=None,
  153. **kwargs,
  154. ):
  155. super().__init__(**kwargs)
  156. if readout_type not in ["ignore", "add", "project"]:
  157. raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']")
  158. if attractor_kind not in ["mean", "sum"]:
  159. raise ValueError("Attractor_kind must be one of ['mean', 'sum']")
  160. if use_pretrained_backbone:
  161. raise ValueError("Pretrained backbones are not supported yet.")
  162. if backbone_config is not None and backbone is not None:
  163. raise ValueError("You can't specify both `backbone` and `backbone_config`.")
  164. if backbone_config is None and backbone is None:
  165. logger.info("`backbone_config` is `None`. Initializing the config with the default `BEiT` backbone.")
  166. backbone_config = CONFIG_MAPPING["beit"](
  167. image_size=384,
  168. num_hidden_layers=24,
  169. hidden_size=1024,
  170. intermediate_size=4096,
  171. num_attention_heads=16,
  172. use_relative_position_bias=True,
  173. reshape_hidden_states=False,
  174. out_features=["stage6", "stage12", "stage18", "stage24"],
  175. )
  176. elif isinstance(backbone_config, dict):
  177. backbone_model_type = backbone_config.get("model_type")
  178. config_class = CONFIG_MAPPING[backbone_model_type]
  179. backbone_config = config_class.from_dict(backbone_config)
  180. if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
  181. raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
  182. self.backbone_config = backbone_config
  183. self.backbone = backbone
  184. self.hidden_act = hidden_act
  185. self.use_pretrained_backbone = use_pretrained_backbone
  186. self.initializer_range = initializer_range
  187. self.batch_norm_eps = batch_norm_eps
  188. self.readout_type = readout_type
  189. self.reassemble_factors = reassemble_factors
  190. self.neck_hidden_sizes = neck_hidden_sizes
  191. self.fusion_hidden_size = fusion_hidden_size
  192. self.head_in_index = head_in_index
  193. self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
  194. self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
  195. self.num_relative_features = num_relative_features
  196. self.add_projection = add_projection
  197. self.bottleneck_features = bottleneck_features
  198. self.num_attractors = num_attractors
  199. self.bin_embedding_dim = bin_embedding_dim
  200. self.attractor_alpha = attractor_alpha
  201. self.attractor_gamma = attractor_gamma
  202. self.attractor_kind = attractor_kind
  203. self.min_temp = min_temp
  204. self.max_temp = max_temp
  205. self.bin_centers_type = bin_centers_type
  206. self.bin_configurations = bin_configurations
  207. self.num_patch_transformer_layers = num_patch_transformer_layers
  208. self.patch_transformer_hidden_size = patch_transformer_hidden_size
  209. self.patch_transformer_intermediate_size = patch_transformer_intermediate_size
  210. self.patch_transformer_num_attention_heads = patch_transformer_num_attention_heads
  211. @property
  212. def sub_configs(self):
  213. return (
  214. {"backbone_config": type(self.backbone_config)}
  215. if getattr(self, "backbone_config", None) is not None
  216. else {}
  217. )
# Public names exported by this module.
__all__ = ["ZOEDEPTH_PRETRAINED_CONFIG_ARCHIVE_MAP", "ZoeDepthConfig"]