_cache_assets.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. # coding=utf-8
  2. # Copyright 2019-present, the HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. from pathlib import Path
  16. from typing import Union
  17. from ..constants import HF_ASSETS_CACHE
  18. def cached_assets_path(
  19. library_name: str,
  20. namespace: str = "default",
  21. subfolder: str = "default",
  22. *,
  23. assets_dir: Union[str, Path, None] = None,
  24. ):
  25. """Return a folder path to cache arbitrary files.
  26. `huggingface_hub` provides a canonical folder path to store assets. This is the
  27. recommended way to integrate cache in a downstream library as it will benefit from
  28. the builtins tools to scan and delete the cache properly.
  29. The distinction is made between files cached from the Hub and assets. Files from the
  30. Hub are cached in a git-aware manner and entirely managed by `huggingface_hub`. See
  31. [related documentation](https://huggingface.co/docs/huggingface_hub/how-to-cache).
  32. All other files that a downstream library caches are considered to be "assets"
  33. (files downloaded from external sources, extracted from a .tar archive, preprocessed
  34. for training,...).
  35. Once the folder path is generated, it is guaranteed to exist and to be a directory.
  36. The path is based on 3 levels of depth: the library name, a namespace and a
  37. subfolder. Those 3 levels grants flexibility while allowing `huggingface_hub` to
  38. expect folders when scanning/deleting parts of the assets cache. Within a library,
  39. it is expected that all namespaces share the same subset of subfolder names but this
  40. is not a mandatory rule. The downstream library has then full control on which file
  41. structure to adopt within its cache. Namespace and subfolder are optional (would
  42. default to a `"default/"` subfolder) but library name is mandatory as we want every
  43. downstream library to manage its own cache.
  44. Expected tree:
  45. ```text
  46. assets/
  47. └── datasets/
  48. │ ├── SQuAD/
  49. │ │ ├── downloaded/
  50. │ │ ├── extracted/
  51. │ │ └── processed/
  52. │ ├── Helsinki-NLP--tatoeba_mt/
  53. │ ├── downloaded/
  54. │ ├── extracted/
  55. │ └── processed/
  56. └── transformers/
  57. ├── default/
  58. │ ├── something/
  59. ├── bert-base-cased/
  60. │ ├── default/
  61. │ └── training/
  62. hub/
  63. └── models--julien-c--EsperBERTo-small/
  64. ├── blobs/
  65. │ ├── (...)
  66. │ ├── (...)
  67. ├── refs/
  68. │ └── (...)
  69. └── [ 128] snapshots/
  70. ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/
  71. │ ├── (...)
  72. └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/
  73. └── (...)
  74. ```
  75. Args:
  76. library_name (`str`):
  77. Name of the library that will manage the cache folder. Example: `"dataset"`.
  78. namespace (`str`, *optional*, defaults to "default"):
  79. Namespace to which the data belongs. Example: `"SQuAD"`.
  80. subfolder (`str`, *optional*, defaults to "default"):
  81. Subfolder in which the data will be stored. Example: `extracted`.
  82. assets_dir (`str`, `Path`, *optional*):
  83. Path to the folder where assets are cached. This must not be the same folder
  84. where Hub files are cached. Defaults to `HF_HOME / "assets"` if not provided.
  85. Can also be set with `HF_ASSETS_CACHE` environment variable.
  86. Returns:
  87. Path to the cache folder (`Path`).
  88. Example:
  89. ```py
  90. >>> from huggingface_hub import cached_assets_path
  91. >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
  92. PosixPath('/home/wauplin/.cache/huggingface/extra/datasets/SQuAD/download')
  93. >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="extracted")
  94. PosixPath('/home/wauplin/.cache/huggingface/extra/datasets/SQuAD/extracted')
  95. >>> cached_assets_path(library_name="datasets", namespace="Helsinki-NLP/tatoeba_mt")
  96. PosixPath('/home/wauplin/.cache/huggingface/extra/datasets/Helsinki-NLP--tatoeba_mt/default')
  97. >>> cached_assets_path(library_name="datasets", assets_dir="/tmp/tmp123456")
  98. PosixPath('/tmp/tmp123456/datasets/default/default')
  99. ```
  100. """
  101. # Resolve assets_dir
  102. if assets_dir is None:
  103. assets_dir = HF_ASSETS_CACHE
  104. assets_dir = Path(assets_dir).expanduser().resolve()
  105. # Avoid names that could create path issues
  106. for part in (" ", "/", "\\"):
  107. library_name = library_name.replace(part, "--")
  108. namespace = namespace.replace(part, "--")
  109. subfolder = subfolder.replace(part, "--")
  110. # Path to subfolder is created
  111. path = assets_dir / library_name / namespace / subfolder
  112. try:
  113. path.mkdir(exist_ok=True, parents=True)
  114. except (FileExistsError, NotADirectoryError):
  115. raise ValueError(f"Corrupted assets folder: cannot create directory because of an existing file ({path}).")
  116. # Return
  117. return path