utils.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. # Copyright 2021 The HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from ctypes import c_float, sizeof
  15. from enum import Enum
  16. from typing import TYPE_CHECKING, Optional, Union
  17. if TYPE_CHECKING:
  18. from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer # tests_ignore
  class ParameterFormat(Enum):
      """
      Data formats in which a model parameter can be stored.

      The value of each member is the ctypes type representing one parameter.
      """

      Float = c_float

      @property
      def size(self) -> int:
          """
          Size of one value stored in this format.

          Returns:
              Number of bytes (integer > 0) taken by the member's ctypes type.
          """
          ctype = self.value
          return sizeof(ctype)
  def compute_effective_axis_dimension(dimension: int, fixed_dimension: int, num_token_to_add: int = 0) -> int:
      """
      Resolve the concrete size to use for an axis.

      Args:
          dimension: Size requested for the axis. A value <= 0 (possible when the axis is dynamic) selects
              `fixed_dimension` instead.
          fixed_dimension: Fallback size used when `dimension` is <= 0.
          num_token_to_add: Amount subtracted from the selected size before it is returned.

      Returns:
          The selected size minus `num_token_to_add`.
      """
      # A dynamic axis can report a non-positive size; fall back to the fixed one in that case.
      base = fixed_dimension if dimension <= 0 else dimension
      return base - num_token_to_add
  def compute_serialized_parameters_size(num_parameters: int, dtype: ParameterFormat) -> int:
      """
      Compute the space taken by all the parameters of a model once serialized in the given storage format.

      Args:
          num_parameters: Number of parameters to be saved
          dtype: The data format in which each parameter will be saved

      Returns:
          Size (in bytes) needed to save all the parameters
      """
      bytes_per_parameter = dtype.size
      return num_parameters * bytes_per_parameter
  def get_preprocessor(model_name: str) -> Optional[Union["AutoTokenizer", "AutoFeatureExtractor", "AutoProcessor"]]:
      """
      Look up the preprocessor (processor, tokenizer or feature extractor) available for `model_name`.

      Args:
          model_name (`str`): Name of the model whose preprocessor should be loaded.

      Returns:
          `Optional[Union[AutoTokenizer, AutoFeatureExtractor, AutoProcessor]]`:
              The processor when one can be loaded. Otherwise the tokenizer or the feature extractor, whichever
              one exists, or `None` when neither does.

      Raises:
          ValueError: If no processor can be loaded but both a tokenizer and a feature extractor are found.
      """
      # Imported here rather than at module level to avoid circular imports.
      from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer  # tests_ignore

      try:
          return AutoProcessor.from_pretrained(model_name)
      except (ValueError, OSError, KeyError):
          # No processor could be loaded: probe the tokenizer first, then the feature extractor.
          loaded = []
          for auto_class in (AutoTokenizer, AutoFeatureExtractor):
              try:
                  loaded.append(auto_class.from_pretrained(model_name))
              except (OSError, KeyError):
                  # A failed load just means this kind of preprocessor is not available.
                  loaded.append(None)
          tokenizer, feature_extractor = loaded

          if tokenizer is None:
              # Either the feature extractor alone was found, or nothing was (then this is `None`).
              return feature_extractor
          if feature_extractor is None:
              return tokenizer
          raise ValueError(
              f"Couldn't auto-detect preprocessor for {model_name}. Found both a tokenizer and a feature extractor."
          )