# local.py (17 KB)
  1. import datetime
  2. import io
  3. import logging
  4. import os
  5. import os.path as osp
  6. import shutil
  7. import stat
  8. import tempfile
  9. from functools import lru_cache
  10. from fsspec import AbstractFileSystem
  11. from fsspec.compression import compr
  12. from fsspec.core import get_compression
  13. from fsspec.utils import isfilelike, stringify_path
# Module-level logger; LocalFileOpener emits a debug record for each file it opens.
logger = logging.getLogger("fsspec.local")
  15. class LocalFileSystem(AbstractFileSystem):
  16. """Interface to files on local storage
  17. Parameters
  18. ----------
  19. auto_mkdir: bool
  20. Whether, when opening a file, the directory containing it should
  21. be created (if it doesn't already exist). This is assumed by pyarrow
  22. code.
  23. """
  24. root_marker = "/"
  25. protocol = "file", "local"
  26. local_file = True
  27. def __init__(self, auto_mkdir=False, **kwargs):
  28. super().__init__(**kwargs)
  29. self.auto_mkdir = auto_mkdir
  30. @property
  31. def fsid(self):
  32. return "local"
  33. def mkdir(self, path, create_parents=True, **kwargs):
  34. path = self._strip_protocol(path)
  35. if self.exists(path):
  36. raise FileExistsError(path)
  37. if create_parents:
  38. self.makedirs(path, exist_ok=True)
  39. else:
  40. os.mkdir(path, **kwargs)
  41. def makedirs(self, path, exist_ok=False):
  42. path = self._strip_protocol(path)
  43. os.makedirs(path, exist_ok=exist_ok)
  44. def rmdir(self, path):
  45. path = self._strip_protocol(path)
  46. os.rmdir(path)
  47. def ls(self, path, detail=False, **kwargs):
  48. path = self._strip_protocol(path)
  49. path_info = self.info(path)
  50. infos = []
  51. if path_info["type"] == "directory":
  52. with os.scandir(path) as it:
  53. for f in it:
  54. try:
  55. # Only get the info if requested since it is a bit expensive (the stat call inside)
  56. # The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
  57. info = self.info(f) if detail else self._strip_protocol(f.path)
  58. infos.append(info)
  59. except FileNotFoundError:
  60. pass
  61. else:
  62. infos = [path_info] if detail else [path_info["name"]]
  63. return infos
  64. def info(self, path, **kwargs):
  65. if isinstance(path, os.DirEntry):
  66. # scandir DirEntry
  67. out = path.stat(follow_symlinks=False)
  68. link = path.is_symlink()
  69. if path.is_dir(follow_symlinks=False):
  70. t = "directory"
  71. elif path.is_file(follow_symlinks=False):
  72. t = "file"
  73. else:
  74. t = "other"
  75. size = out.st_size
  76. if link:
  77. try:
  78. out2 = path.stat(follow_symlinks=True)
  79. size = out2.st_size
  80. except OSError:
  81. size = 0
  82. path = self._strip_protocol(path.path)
  83. else:
  84. # str or path-like
  85. path = self._strip_protocol(path)
  86. out = os.stat(path, follow_symlinks=False)
  87. link = stat.S_ISLNK(out.st_mode)
  88. if link:
  89. out = os.stat(path, follow_symlinks=True)
  90. size = out.st_size
  91. if stat.S_ISDIR(out.st_mode):
  92. t = "directory"
  93. elif stat.S_ISREG(out.st_mode):
  94. t = "file"
  95. else:
  96. t = "other"
  97. # Check for the 'st_birthtime' attribute, which is not always present; fallback to st_ctime
  98. created_time = getattr(out, "st_birthtime", out.st_ctime)
  99. result = {
  100. "name": path,
  101. "size": size,
  102. "type": t,
  103. "created": created_time,
  104. "islink": link,
  105. }
  106. for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
  107. result[field] = getattr(out, f"st_{field}")
  108. if link:
  109. result["destination"] = os.readlink(path)
  110. return result
  111. def lexists(self, path, **kwargs):
  112. return osp.lexists(path)
  113. def cp_file(self, path1, path2, **kwargs):
  114. path1 = self._strip_protocol(path1)
  115. path2 = self._strip_protocol(path2)
  116. if self.auto_mkdir:
  117. self.makedirs(self._parent(path2), exist_ok=True)
  118. if self.isfile(path1):
  119. shutil.copyfile(path1, path2)
  120. elif self.isdir(path1):
  121. self.mkdirs(path2, exist_ok=True)
  122. else:
  123. raise FileNotFoundError(path1)
  124. def isfile(self, path):
  125. path = self._strip_protocol(path)
  126. return os.path.isfile(path)
  127. def isdir(self, path):
  128. path = self._strip_protocol(path)
  129. return os.path.isdir(path)
  130. def get_file(self, path1, path2, callback=None, **kwargs):
  131. if isfilelike(path2):
  132. with open(path1, "rb") as f:
  133. shutil.copyfileobj(f, path2)
  134. else:
  135. return self.cp_file(path1, path2, **kwargs)
  136. def put_file(self, path1, path2, callback=None, **kwargs):
  137. return self.cp_file(path1, path2, **kwargs)
  138. def mv(self, path1, path2, recursive: bool = True, **kwargs):
  139. """Move files/directories
  140. For the specific case of local, all ops on directories are recursive and
  141. the recursive= kwarg is ignored.
  142. """
  143. path1 = self._strip_protocol(path1)
  144. path2 = self._strip_protocol(path2)
  145. shutil.move(path1, path2)
  146. def link(self, src, dst, **kwargs):
  147. src = self._strip_protocol(src)
  148. dst = self._strip_protocol(dst)
  149. os.link(src, dst, **kwargs)
  150. def symlink(self, src, dst, **kwargs):
  151. src = self._strip_protocol(src)
  152. dst = self._strip_protocol(dst)
  153. os.symlink(src, dst, **kwargs)
  154. def islink(self, path) -> bool:
  155. return os.path.islink(self._strip_protocol(path))
  156. def rm_file(self, path):
  157. os.remove(self._strip_protocol(path))
  158. def rm(self, path, recursive=False, maxdepth=None):
  159. if not isinstance(path, list):
  160. path = [path]
  161. for p in path:
  162. p = self._strip_protocol(p)
  163. if self.isdir(p):
  164. if not recursive:
  165. raise ValueError("Cannot delete directory, set recursive=True")
  166. if osp.abspath(p) == os.getcwd():
  167. raise ValueError("Cannot delete current working directory")
  168. shutil.rmtree(p)
  169. else:
  170. os.remove(p)
  171. def unstrip_protocol(self, name):
  172. name = self._strip_protocol(name) # normalise for local/win/...
  173. return f"file://{name}"
  174. def _open(self, path, mode="rb", block_size=None, **kwargs):
  175. path = self._strip_protocol(path)
  176. if self.auto_mkdir and "w" in mode:
  177. self.makedirs(self._parent(path), exist_ok=True)
  178. return LocalFileOpener(path, mode, fs=self, **kwargs)
  179. def touch(self, path, truncate=True, **kwargs):
  180. path = self._strip_protocol(path)
  181. if self.auto_mkdir:
  182. self.makedirs(self._parent(path), exist_ok=True)
  183. if self.exists(path):
  184. os.utime(path, None)
  185. else:
  186. open(path, "a").close()
  187. if truncate:
  188. os.truncate(path, 0)
  189. def created(self, path):
  190. info = self.info(path=path)
  191. return datetime.datetime.fromtimestamp(
  192. info["created"], tz=datetime.timezone.utc
  193. )
  194. def modified(self, path):
  195. info = self.info(path=path)
  196. return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
  197. @classmethod
  198. def _parent(cls, path):
  199. path = cls._strip_protocol(path)
  200. if os.sep == "/":
  201. # posix native
  202. return path.rsplit("/", 1)[0] or "/"
  203. else:
  204. # NT
  205. path_ = path.rsplit("/", 1)[0]
  206. if len(path_) <= 3:
  207. if path_[1:2] == ":":
  208. # nt root (something like c:/)
  209. return path_[0] + ":/"
  210. # More cases may be required here
  211. return path_
  212. @classmethod
  213. def _strip_protocol(cls, path):
  214. path = stringify_path(path)
  215. if path.startswith("file://"):
  216. path = path[7:]
  217. elif path.startswith("file:"):
  218. path = path[5:]
  219. elif path.startswith("local://"):
  220. path = path[8:]
  221. elif path.startswith("local:"):
  222. path = path[6:]
  223. path = make_path_posix(path)
  224. if os.sep != "/":
  225. # This code-path is a stripped down version of
  226. # > drive, path = ntpath.splitdrive(path)
  227. if path[1:2] == ":":
  228. # Absolute drive-letter path, e.g. X:\Windows
  229. # Relative path with drive, e.g. X:Windows
  230. drive, path = path[:2], path[2:]
  231. elif path[:2] == "//":
  232. # UNC drives, e.g. \\server\share or \\?\UNC\server\share
  233. # Device drives, e.g. \\.\device or \\?\device
  234. if (index1 := path.find("/", 2)) == -1 or (
  235. index2 := path.find("/", index1 + 1)
  236. ) == -1:
  237. drive, path = path, ""
  238. else:
  239. drive, path = path[:index2], path[index2:]
  240. else:
  241. # Relative path, e.g. Windows
  242. drive = ""
  243. path = path.rstrip("/") or cls.root_marker
  244. return drive + path
  245. else:
  246. return path.rstrip("/") or cls.root_marker
  247. def _isfilestore(self):
  248. # Inheriting from DaskFileSystem makes this False (S3, etc. were)
  249. # the original motivation. But we are a posix-like file system.
  250. # See https://github.com/dask/dask/issues/5526
  251. return True
  252. def chmod(self, path, mode):
  253. path = stringify_path(path)
  254. return os.chmod(path, mode)
  255. def make_path_posix(path):
  256. """Make path generic and absolute for current OS"""
  257. if not isinstance(path, str):
  258. if isinstance(path, (list, set, tuple)):
  259. return type(path)(make_path_posix(p) for p in path)
  260. else:
  261. path = stringify_path(path)
  262. if not isinstance(path, str):
  263. raise TypeError(f"could not convert {path!r} to string")
  264. if os.sep == "/":
  265. # Native posix
  266. if path.startswith("/"):
  267. # most common fast case for posix
  268. return path
  269. elif path.startswith("~"):
  270. return osp.expanduser(path)
  271. elif path.startswith("./"):
  272. path = path[2:]
  273. elif path == ".":
  274. path = ""
  275. return f"{os.getcwd()}/{path}"
  276. else:
  277. # NT handling
  278. if path[0:1] == "/" and path[2:3] == ":":
  279. # path is like "/c:/local/path"
  280. path = path[1:]
  281. if path[1:2] == ":":
  282. # windows full path like "C:\\local\\path"
  283. if len(path) <= 3:
  284. # nt root (something like c:/)
  285. return path[0] + ":/"
  286. path = path.replace("\\", "/")
  287. return path
  288. elif path[0:1] == "~":
  289. return make_path_posix(osp.expanduser(path))
  290. elif path.startswith(("\\\\", "//")):
  291. # windows UNC/DFS-style paths
  292. return "//" + path[2:].replace("\\", "/")
  293. elif path.startswith(("\\", "/")):
  294. # windows relative path with root
  295. path = path.replace("\\", "/")
  296. return f"{osp.splitdrive(os.getcwd())[0]}{path}"
  297. else:
  298. path = path.replace("\\", "/")
  299. if path.startswith("./"):
  300. path = path[2:]
  301. elif path == ".":
  302. path = ""
  303. return f"{make_path_posix(os.getcwd())}/{path}"
  304. def trailing_sep(path):
  305. """Return True if the path ends with a path separator.
  306. A forward slash is always considered a path separator, even on Operating
  307. Systems that normally use a backslash.
  308. """
  309. # TODO: if all incoming paths were posix-compliant then separator would
  310. # always be a forward slash, simplifying this function.
  311. # See https://github.com/fsspec/filesystem_spec/pull/1250
  312. return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
  313. @lru_cache(maxsize=1)
  314. def get_umask(mask: int = 0o666) -> int:
  315. """Get the current umask.
  316. Follows https://stackoverflow.com/a/44130549 to get the umask.
  317. Temporarily sets the umask to the given value, and then resets it to the
  318. original value.
  319. """
  320. value = os.umask(mask)
  321. os.umask(value)
  322. return value
  323. class LocalFileOpener(io.IOBase):
  324. def __init__(
  325. self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
  326. ):
  327. logger.debug("open file: %s", path)
  328. self.path = path
  329. self.mode = mode
  330. self.fs = fs
  331. self.f = None
  332. self.autocommit = autocommit
  333. self.compression = get_compression(path, compression)
  334. self.blocksize = io.DEFAULT_BUFFER_SIZE
  335. self._open()
  336. def _open(self):
  337. if self.f is None or self.f.closed:
  338. if self.autocommit or "w" not in self.mode:
  339. self.f = open(self.path, mode=self.mode)
  340. if self.compression:
  341. compress = compr[self.compression]
  342. self.f = compress(self.f, mode=self.mode)
  343. else:
  344. # TODO: check if path is writable?
  345. i, name = tempfile.mkstemp()
  346. os.close(i) # we want normal open and normal buffered file
  347. self.temp = name
  348. self.f = open(name, mode=self.mode)
  349. if "w" not in self.mode:
  350. self.size = self.f.seek(0, 2)
  351. self.f.seek(0)
  352. self.f.size = self.size
  353. def _fetch_range(self, start, end):
  354. # probably only used by cached FS
  355. if "r" not in self.mode:
  356. raise ValueError
  357. self._open()
  358. self.f.seek(start)
  359. return self.f.read(end - start)
  360. def __setstate__(self, state):
  361. self.f = None
  362. loc = state.pop("loc", None)
  363. self.__dict__.update(state)
  364. if "r" in state["mode"]:
  365. self.f = None
  366. self._open()
  367. self.f.seek(loc)
  368. def __getstate__(self):
  369. d = self.__dict__.copy()
  370. d.pop("f")
  371. if "r" in self.mode:
  372. d["loc"] = self.f.tell()
  373. else:
  374. if not self.f.closed:
  375. raise ValueError("Cannot serialise open write-mode local file")
  376. return d
  377. def commit(self):
  378. if self.autocommit:
  379. raise RuntimeError("Can only commit if not already set to autocommit")
  380. try:
  381. shutil.move(self.temp, self.path)
  382. except PermissionError as e:
  383. # shutil.move raises PermissionError if os.rename
  384. # and the default copy2 fallback with shutil.copystats fail.
  385. # The file should be there nonetheless, but without copied permissions.
  386. # If it doesn't exist, there was no permission to create the file.
  387. if not os.path.exists(self.path):
  388. raise e
  389. else:
  390. # If PermissionError is not raised, permissions can be set.
  391. try:
  392. mask = 0o666
  393. os.chmod(self.path, mask & ~get_umask(mask))
  394. except RuntimeError:
  395. pass
  396. def discard(self):
  397. if self.autocommit:
  398. raise RuntimeError("Cannot discard if set to autocommit")
  399. os.remove(self.temp)
  400. def readable(self) -> bool:
  401. return True
  402. def writable(self) -> bool:
  403. return "r" not in self.mode
  404. def read(self, *args, **kwargs):
  405. return self.f.read(*args, **kwargs)
  406. def write(self, *args, **kwargs):
  407. return self.f.write(*args, **kwargs)
  408. def tell(self, *args, **kwargs):
  409. return self.f.tell(*args, **kwargs)
  410. def seek(self, *args, **kwargs):
  411. return self.f.seek(*args, **kwargs)
  412. def seekable(self, *args, **kwargs):
  413. return self.f.seekable(*args, **kwargs)
  414. def readline(self, *args, **kwargs):
  415. return self.f.readline(*args, **kwargs)
  416. def readlines(self, *args, **kwargs):
  417. return self.f.readlines(*args, **kwargs)
  418. def close(self):
  419. return self.f.close()
  420. def truncate(self, size=None) -> int:
  421. return self.f.truncate(size)
  422. @property
  423. def closed(self):
  424. return self.f.closed
  425. def fileno(self):
  426. return self.raw.fileno()
  427. def flush(self) -> None:
  428. self.f.flush()
  429. def __iter__(self):
  430. return self.f.__iter__()
  431. def __getattr__(self, item):
  432. return getattr(self.f, item)
  433. def __enter__(self):
  434. self._incontext = True
  435. return self
  436. def __exit__(self, exc_type, exc_value, traceback):
  437. self._incontext = False
  438. self.f.__exit__(exc_type, exc_value, traceback)