_validators.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. # coding=utf-8
  2. # Copyright 2022-present, the HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Contains utilities to validate argument values in `huggingface_hub`."""
  16. import inspect
  17. import re
  18. import warnings
  19. from functools import wraps
  20. from itertools import chain
  21. from typing import Any, Dict
  22. from huggingface_hub.errors import HFValidationError
  23. from ._typing import CallableT
# Client-side pattern for repo ids: an optional "namespace/" prefix followed by a
# repo name of 1 to 96 characters drawn from `\w`, "-" and ".". The `\b` anchors
# force the namespace and the repo name to each start and end on a word character.
# Compiled with `re.VERBOSE`, so whitespace and `#` comments inside the pattern are
# ignored by the regex engine.
# NOTE(review): `$` also matches just before a trailing newline, so callers that need
# a strict whole-string check should prefer `fullmatch` over `match`.
REPO_ID_REGEX = re.compile(
    r"""
^
(\b[\w\-.]+\b/)? # optional namespace (username or organization)
\b # starts with a word boundary
[\w\-.]{1,96} # repo_name: alphanumeric + . _ -
\b # ends with a word boundary
$
""",
    flags=re.VERBOSE,
)
  35. def validate_hf_hub_args(fn: CallableT) -> CallableT:
  36. """Validate values received as argument for any public method of `huggingface_hub`.
  37. The goal of this decorator is to harmonize validation of arguments reused
  38. everywhere. By default, all defined validators are tested.
  39. Validators:
  40. - [`~utils.validate_repo_id`]: `repo_id` must be `"repo_name"`
  41. or `"namespace/repo_name"`. Namespace is a username or an organization.
  42. - [`~utils.smoothly_deprecate_use_auth_token`]: Use `token` instead of
  43. `use_auth_token` (only if `use_auth_token` is not expected by the decorated
  44. function - in practice, always the case in `huggingface_hub`).
  45. Example:
  46. ```py
  47. >>> from huggingface_hub.utils import validate_hf_hub_args
  48. >>> @validate_hf_hub_args
  49. ... def my_cool_method(repo_id: str):
  50. ... print(repo_id)
  51. >>> my_cool_method(repo_id="valid_repo_id")
  52. valid_repo_id
  53. >>> my_cool_method("other..repo..id")
  54. huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.
  55. >>> my_cool_method(repo_id="other..repo..id")
  56. huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.
  57. >>> @validate_hf_hub_args
  58. ... def my_cool_auth_method(token: str):
  59. ... print(token)
  60. >>> my_cool_auth_method(token="a token")
  61. "a token"
  62. >>> my_cool_auth_method(use_auth_token="a use_auth_token")
  63. "a use_auth_token"
  64. >>> my_cool_auth_method(token="a token", use_auth_token="a use_auth_token")
  65. UserWarning: Both `token` and `use_auth_token` are passed (...)
  66. "a token"
  67. ```
  68. Raises:
  69. [`~utils.HFValidationError`]:
  70. If an input is not valid.
  71. """
  72. # TODO: add an argument to opt-out validation for specific argument?
  73. signature = inspect.signature(fn)
  74. # Should the validator switch `use_auth_token` values to `token`? In practice, always
  75. # True in `huggingface_hub`. Might not be the case in a downstream library.
  76. check_use_auth_token = "use_auth_token" not in signature.parameters and "token" in signature.parameters
  77. @wraps(fn)
  78. def _inner_fn(*args, **kwargs):
  79. has_token = False
  80. for arg_name, arg_value in chain(
  81. zip(signature.parameters, args), # Args values
  82. kwargs.items(), # Kwargs values
  83. ):
  84. if arg_name in ["repo_id", "from_id", "to_id"]:
  85. validate_repo_id(arg_value)
  86. elif arg_name == "token" and arg_value is not None:
  87. has_token = True
  88. if check_use_auth_token:
  89. kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
  90. return fn(*args, **kwargs)
  91. return _inner_fn # type: ignore
  92. def validate_repo_id(repo_id: str) -> None:
  93. """Validate `repo_id` is valid.
  94. This is not meant to replace the proper validation made on the Hub but rather to
  95. avoid local inconsistencies whenever possible (example: passing `repo_type` in the
  96. `repo_id` is forbidden).
  97. Rules:
  98. - Between 1 and 96 characters.
  99. - Either "repo_name" or "namespace/repo_name"
  100. - [a-zA-Z0-9] or "-", "_", "."
  101. - "--" and ".." are forbidden
  102. Valid: `"foo"`, `"foo/bar"`, `"123"`, `"Foo-BAR_foo.bar123"`
  103. Not valid: `"datasets/foo/bar"`, `".repo_id"`, `"foo--bar"`, `"foo.git"`
  104. Example:
  105. ```py
  106. >>> from huggingface_hub.utils import validate_repo_id
  107. >>> validate_repo_id(repo_id="valid_repo_id")
  108. >>> validate_repo_id(repo_id="other..repo..id")
  109. huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.
  110. ```
  111. Discussed in https://github.com/huggingface/huggingface_hub/issues/1008.
  112. In moon-landing (internal repository):
  113. - https://github.com/huggingface/moon-landing/blob/main/server/lib/Names.ts#L27
  114. - https://github.com/huggingface/moon-landing/blob/main/server/views/components/NewRepoForm/NewRepoForm.svelte#L138
  115. """
  116. if not isinstance(repo_id, str):
  117. # Typically, a Path is not a repo_id
  118. raise HFValidationError(f"Repo id must be a string, not {type(repo_id)}: '{repo_id}'.")
  119. if repo_id.count("/") > 1:
  120. raise HFValidationError(
  121. "Repo id must be in the form 'repo_name' or 'namespace/repo_name':"
  122. f" '{repo_id}'. Use `repo_type` argument if needed."
  123. )
  124. if not REPO_ID_REGEX.match(repo_id):
  125. raise HFValidationError(
  126. "Repo id must use alphanumeric chars, '-', '_' or '.'."
  127. " The name cannot start or end with '-' or '.' and the maximum length is 96:"
  128. f" '{repo_id}'."
  129. )
  130. if "--" in repo_id or ".." in repo_id:
  131. raise HFValidationError(f"Cannot have -- or .. in repo_id: '{repo_id}'.")
  132. if repo_id.endswith(".git"):
  133. raise HFValidationError(f"Repo_id cannot end by '.git': '{repo_id}'.")
  134. def smoothly_deprecate_use_auth_token(fn_name: str, has_token: bool, kwargs: Dict[str, Any]) -> Dict[str, Any]:
  135. """Smoothly deprecate `use_auth_token` in the `huggingface_hub` codebase.
  136. The long-term goal is to remove any mention of `use_auth_token` in the codebase in
  137. favor of a unique and less verbose `token` argument. This will be done a few steps:
  138. 0. Step 0: methods that require a read-access to the Hub use the `use_auth_token`
  139. argument (`str`, `bool` or `None`). Methods requiring write-access have a `token`
  140. argument (`str`, `None`). This implicit rule exists to be able to not send the
  141. token when not necessary (`use_auth_token=False`) even if logged in.
  142. 1. Step 1: we want to harmonize everything and use `token` everywhere (supporting
  143. `token=False` for read-only methods). In order not to break existing code, if
  144. `use_auth_token` is passed to a function, the `use_auth_token` value is passed
  145. as `token` instead, without any warning.
  146. a. Corner case: if both `use_auth_token` and `token` values are passed, a warning
  147. is thrown and the `use_auth_token` value is ignored.
  148. 2. Step 2: Once it is release, we should push downstream libraries to switch from
  149. `use_auth_token` to `token` as much as possible, but without throwing a warning
  150. (e.g. manually create issues on the corresponding repos).
  151. 3. Step 3: After a transitional period (6 months e.g. until April 2023?), we update
  152. `huggingface_hub` to throw a warning on `use_auth_token`. Hopefully, very few
  153. users will be impacted as it would have already been fixed.
  154. In addition, unit tests in `huggingface_hub` must be adapted to expect warnings
  155. to be thrown (but still use `use_auth_token` as before).
  156. 4. Step 4: After a normal deprecation cycle (3 releases ?), remove this validator.
  157. `use_auth_token` will definitely not be supported.
  158. In addition, we update unit tests in `huggingface_hub` to use `token` everywhere.
  159. This has been discussed in:
  160. - https://github.com/huggingface/huggingface_hub/issues/1094.
  161. - https://github.com/huggingface/huggingface_hub/pull/928
  162. - (related) https://github.com/huggingface/huggingface_hub/pull/1064
  163. """
  164. new_kwargs = kwargs.copy() # do not mutate input !
  165. use_auth_token = new_kwargs.pop("use_auth_token", None) # remove from kwargs
  166. if use_auth_token is not None:
  167. if has_token:
  168. warnings.warn(
  169. "Both `token` and `use_auth_token` are passed to"
  170. f" `{fn_name}` with non-None values. `token` is now the"
  171. " preferred argument to pass a User Access Token."
  172. " `use_auth_token` value will be ignored."
  173. )
  174. else:
  175. # `token` argument is not passed and a non-None value is passed in
  176. # `use_auth_token` => use `use_auth_token` value as `token` kwarg.
  177. new_kwargs["token"] = use_auth_token
  178. return new_kwargs