_local_folder.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. # coding=utf-8
  2. # Copyright 2024-present, the HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Contains utilities to handle the `./.cache/huggingface` folder in local directories.
  16. First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
  17. download metadata when downloading files from the hub to a local directory (without
  18. using the cache).
  19. ./.cache/huggingface folder structure:
  20. [4.0K] data
  21. ├── [4.0K] .cache
  22. │ └── [4.0K] huggingface
  23. │ └── [4.0K] download
  24. │ ├── [ 16] file.parquet.metadata
  25. │ ├── [ 16] file.txt.metadata
  26. │ └── [4.0K] folder
  27. │ └── [ 16] file.parquet.metadata
  28. ├── [6.5G] file.parquet
  29. ├── [1.5K] file.txt
  30. └── [4.0K] folder
  31. └── [ 16] file.parquet
  32. Download metadata file structure:
  33. ```
  34. # file.txt.metadata
  35. 11c5a3d5811f50298f278a704980280950aedb10
  36. a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
  37. 1712656091.123
  38. # file.parquet.metadata
  39. 11c5a3d5811f50298f278a704980280950aedb10
  40. 7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
  41. 1712656091.123
  42. }
  43. ```
  44. """
  45. import base64
  46. import hashlib
  47. import logging
  48. import os
  49. import time
  50. from dataclasses import dataclass
  51. from pathlib import Path
  52. from typing import Optional
  53. from .utils import WeakFileLock
  54. logger = logging.getLogger(__name__)
  55. @dataclass
  56. class LocalDownloadFilePaths:
  57. """
  58. Paths to the files related to a download process in a local dir.
  59. Returned by [`get_local_download_paths`].
  60. Attributes:
  61. file_path (`Path`):
  62. Path where the file will be saved.
  63. lock_path (`Path`):
  64. Path to the lock file used to ensure atomicity when reading/writing metadata.
  65. metadata_path (`Path`):
  66. Path to the metadata file.
  67. """
  68. file_path: Path
  69. lock_path: Path
  70. metadata_path: Path
  71. def incomplete_path(self, etag: str) -> Path:
  72. """Return the path where a file will be temporarily downloaded before being moved to `file_path`."""
  73. path = self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"
  74. resolved_path = str(path.resolve())
  75. # Some Windows versions do not allow for paths longer than 255 characters.
  76. # In this case, we must specify it as an extended path by using the "\\?\" prefix.
  77. if os.name == "nt" and len(resolved_path) > 255 and not resolved_path.startswith("\\\\?\\"):
  78. path = Path("\\\\?\\" + resolved_path)
  79. return path
  80. @dataclass(frozen=True)
  81. class LocalUploadFilePaths:
  82. """
  83. Paths to the files related to an upload process in a local dir.
  84. Returned by [`get_local_upload_paths`].
  85. Attributes:
  86. path_in_repo (`str`):
  87. Path of the file in the repo.
  88. file_path (`Path`):
  89. Path where the file will be saved.
  90. lock_path (`Path`):
  91. Path to the lock file used to ensure atomicity when reading/writing metadata.
  92. metadata_path (`Path`):
  93. Path to the metadata file.
  94. """
  95. path_in_repo: str
  96. file_path: Path
  97. lock_path: Path
  98. metadata_path: Path
  99. @dataclass
  100. class LocalDownloadFileMetadata:
  101. """
  102. Metadata about a file in the local directory related to a download process.
  103. Attributes:
  104. filename (`str`):
  105. Path of the file in the repo.
  106. commit_hash (`str`):
  107. Commit hash of the file in the repo.
  108. etag (`str`):
  109. ETag of the file in the repo. Used to check if the file has changed.
  110. For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash.
  111. timestamp (`int`):
  112. Unix timestamp of when the metadata was saved i.e. when the metadata was accurate.
  113. """
  114. filename: str
  115. commit_hash: str
  116. etag: str
  117. timestamp: float
  118. @dataclass
  119. class LocalUploadFileMetadata:
  120. """
  121. Metadata about a file in the local directory related to an upload process.
  122. """
  123. size: int
  124. # Default values correspond to "we don't know yet"
  125. timestamp: Optional[float] = None
  126. should_ignore: Optional[bool] = None
  127. sha256: Optional[str] = None
  128. upload_mode: Optional[str] = None
  129. remote_oid: Optional[str] = None
  130. is_uploaded: bool = False
  131. is_committed: bool = False
  132. def save(self, paths: LocalUploadFilePaths) -> None:
  133. """Save the metadata to disk."""
  134. with WeakFileLock(paths.lock_path):
  135. with paths.metadata_path.open("w") as f:
  136. new_timestamp = time.time()
  137. f.write(str(new_timestamp) + "\n")
  138. f.write(str(self.size)) # never None
  139. f.write("\n")
  140. if self.should_ignore is not None:
  141. f.write(str(int(self.should_ignore)))
  142. f.write("\n")
  143. if self.sha256 is not None:
  144. f.write(self.sha256)
  145. f.write("\n")
  146. if self.upload_mode is not None:
  147. f.write(self.upload_mode)
  148. f.write("\n")
  149. if self.remote_oid is not None:
  150. f.write(self.remote_oid)
  151. f.write("\n")
  152. f.write(str(int(self.is_uploaded)) + "\n")
  153. f.write(str(int(self.is_committed)) + "\n")
  154. self.timestamp = new_timestamp
  155. def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
  156. """Compute paths to the files related to a download process.
  157. Folders containing the paths are all guaranteed to exist.
  158. Args:
  159. local_dir (`Path`):
  160. Path to the local directory in which files are downloaded.
  161. filename (`str`):
  162. Path of the file in the repo.
  163. Return:
  164. [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path, incomplete_path).
  165. """
  166. # filename is the path in the Hub repository (separated by '/')
  167. # make sure to have a cross platform transcription
  168. sanitized_filename = os.path.join(*filename.split("/"))
  169. if os.name == "nt":
  170. if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
  171. raise ValueError(
  172. f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
  173. " owner to rename this file."
  174. )
  175. file_path = local_dir / sanitized_filename
  176. metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata"
  177. lock_path = metadata_path.with_suffix(".lock")
  178. # Some Windows versions do not allow for paths longer than 255 characters.
  179. # In this case, we must specify it as an extended path by using the "\\?\" prefix
  180. if os.name == "nt":
  181. if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
  182. file_path = Path("\\\\?\\" + os.path.abspath(file_path))
  183. lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
  184. metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
  185. file_path.parent.mkdir(parents=True, exist_ok=True)
  186. metadata_path.parent.mkdir(parents=True, exist_ok=True)
  187. return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)
  188. def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths:
  189. """Compute paths to the files related to an upload process.
  190. Folders containing the paths are all guaranteed to exist.
  191. Args:
  192. local_dir (`Path`):
  193. Path to the local directory that is uploaded.
  194. filename (`str`):
  195. Path of the file in the repo.
  196. Return:
  197. [`LocalUploadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path).
  198. """
  199. # filename is the path in the Hub repository (separated by '/')
  200. # make sure to have a cross platform transcription
  201. sanitized_filename = os.path.join(*filename.split("/"))
  202. if os.name == "nt":
  203. if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
  204. raise ValueError(
  205. f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
  206. " owner to rename this file."
  207. )
  208. file_path = local_dir / sanitized_filename
  209. metadata_path = _huggingface_dir(local_dir) / "upload" / f"{sanitized_filename}.metadata"
  210. lock_path = metadata_path.with_suffix(".lock")
  211. # Some Windows versions do not allow for paths longer than 255 characters.
  212. # In this case, we must specify it as an extended path by using the "\\?\" prefix
  213. if os.name == "nt":
  214. if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
  215. file_path = Path("\\\\?\\" + os.path.abspath(file_path))
  216. lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
  217. metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
  218. file_path.parent.mkdir(parents=True, exist_ok=True)
  219. metadata_path.parent.mkdir(parents=True, exist_ok=True)
  220. return LocalUploadFilePaths(
  221. path_in_repo=filename, file_path=file_path, lock_path=lock_path, metadata_path=metadata_path
  222. )
  223. def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]:
  224. """Read metadata about a file in the local directory related to a download process.
  225. Args:
  226. local_dir (`Path`):
  227. Path to the local directory in which files are downloaded.
  228. filename (`str`):
  229. Path of the file in the repo.
  230. Return:
  231. `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
  232. """
  233. paths = get_local_download_paths(local_dir, filename)
  234. with WeakFileLock(paths.lock_path):
  235. if paths.metadata_path.exists():
  236. try:
  237. with paths.metadata_path.open() as f:
  238. commit_hash = f.readline().strip()
  239. etag = f.readline().strip()
  240. timestamp = float(f.readline().strip())
  241. metadata = LocalDownloadFileMetadata(
  242. filename=filename,
  243. commit_hash=commit_hash,
  244. etag=etag,
  245. timestamp=timestamp,
  246. )
  247. except Exception as e:
  248. # remove the metadata file if it is corrupted / not the right format
  249. logger.warning(
  250. f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
  251. )
  252. try:
  253. paths.metadata_path.unlink()
  254. except Exception as e:
  255. logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
  256. try:
  257. # check if the file exists and hasn't been modified since the metadata was saved
  258. stat = paths.file_path.stat()
  259. if (
  260. stat.st_mtime - 1 <= metadata.timestamp
  261. ): # allow 1s difference as stat.st_mtime might not be precise
  262. return metadata
  263. logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
  264. except FileNotFoundError:
  265. # file does not exist => metadata is outdated
  266. return None
  267. return None
  268. def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetadata:
  269. """Read metadata about a file in the local directory related to an upload process.
  270. TODO: factorize logic with `read_download_metadata`.
  271. Args:
  272. local_dir (`Path`):
  273. Path to the local directory in which files are downloaded.
  274. filename (`str`):
  275. Path of the file in the repo.
  276. Return:
  277. `[LocalUploadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
  278. """
  279. paths = get_local_upload_paths(local_dir, filename)
  280. with WeakFileLock(paths.lock_path):
  281. if paths.metadata_path.exists():
  282. try:
  283. with paths.metadata_path.open() as f:
  284. timestamp = float(f.readline().strip())
  285. size = int(f.readline().strip()) # never None
  286. _should_ignore = f.readline().strip()
  287. should_ignore = None if _should_ignore == "" else bool(int(_should_ignore))
  288. _sha256 = f.readline().strip()
  289. sha256 = None if _sha256 == "" else _sha256
  290. _upload_mode = f.readline().strip()
  291. upload_mode = None if _upload_mode == "" else _upload_mode
  292. if upload_mode not in (None, "regular", "lfs"):
  293. raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")
  294. _remote_oid = f.readline().strip()
  295. remote_oid = None if _remote_oid == "" else _remote_oid
  296. is_uploaded = bool(int(f.readline().strip()))
  297. is_committed = bool(int(f.readline().strip()))
  298. metadata = LocalUploadFileMetadata(
  299. timestamp=timestamp,
  300. size=size,
  301. should_ignore=should_ignore,
  302. sha256=sha256,
  303. upload_mode=upload_mode,
  304. remote_oid=remote_oid,
  305. is_uploaded=is_uploaded,
  306. is_committed=is_committed,
  307. )
  308. except Exception as e:
  309. # remove the metadata file if it is corrupted / not the right format
  310. logger.warning(
  311. f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
  312. )
  313. try:
  314. paths.metadata_path.unlink()
  315. except Exception as e:
  316. logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
  317. # TODO: can we do better?
  318. if (
  319. metadata.timestamp is not None
  320. and metadata.is_uploaded # file was uploaded
  321. and not metadata.is_committed # but not committed
  322. and time.time() - metadata.timestamp > 20 * 3600 # and it's been more than 20 hours
  323. ): # => we consider it as garbage-collected by S3
  324. metadata.is_uploaded = False
  325. # check if the file exists and hasn't been modified since the metadata was saved
  326. try:
  327. if metadata.timestamp is not None and paths.file_path.stat().st_mtime <= metadata.timestamp:
  328. return metadata
  329. logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
  330. except FileNotFoundError:
  331. # file does not exist => metadata is outdated
  332. pass
  333. # empty metadata => we don't know anything expect its size
  334. return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)
  335. def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
  336. """Write metadata about a file in the local directory related to a download process.
  337. Args:
  338. local_dir (`Path`):
  339. Path to the local directory in which files are downloaded.
  340. """
  341. paths = get_local_download_paths(local_dir, filename)
  342. with WeakFileLock(paths.lock_path):
  343. with paths.metadata_path.open("w") as f:
  344. f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")
  345. def _huggingface_dir(local_dir: Path) -> Path:
  346. """Return the path to the `.cache/huggingface` directory in a local directory."""
  347. # Wrap in lru_cache to avoid overwriting the .gitignore file if called multiple times
  348. path = local_dir / ".cache" / "huggingface"
  349. path.mkdir(exist_ok=True, parents=True)
  350. # Create a .gitignore file in the .cache/huggingface directory if it doesn't exist
  351. # Should be thread-safe enough like this.
  352. gitignore = path / ".gitignore"
  353. gitignore_lock = path / ".gitignore.lock"
  354. if not gitignore.exists():
  355. try:
  356. with WeakFileLock(gitignore_lock, timeout=0.1):
  357. gitignore.write_text("*")
  358. except IndexError:
  359. pass
  360. except OSError: # TimeoutError, FileNotFoundError, PermissionError, etc.
  361. pass
  362. try:
  363. gitignore_lock.unlink()
  364. except OSError:
  365. pass
  366. return path
  367. def _short_hash(filename: str) -> str:
  368. return base64.urlsafe_b64encode(hashlib.sha1(filename.encode()).digest()).decode()