spawn.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. # mypy: allow-untyped-defs
  2. import logging
  3. import multiprocessing
  4. import multiprocessing.connection
  5. import os
  6. import pickle
  7. import signal
  8. import sys
  9. import tempfile
  10. import time
  11. import warnings
  12. from concurrent.futures import as_completed, ThreadPoolExecutor
  13. from typing import Optional
  14. from torch.numa.binding import (
  15. maybe_temporarily_apply_numa_binding_to_current_thread,
  16. NumaOptions,
  17. )
  18. from . import _prctl_pr_set_pdeathsig # type: ignore[attr-defined]
# Name of the environment variable that opts in to starting child processes
# in parallel; start_processes only honors it when start_method is
# "forkserver" and the variable is set to "1".
ENV_VAR_PARALLEL_START = "TORCH_MP_PARALLEL_START"

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Names exported by ``from <this module> import *``.
__all__ = [
    "ProcessContext",
    "ProcessException",
    "ProcessExitedException",
    "ProcessRaisedException",
    "spawn",
    "SpawnContext",
    "start_processes",
]
  30. class ProcessException(Exception):
  31. __slots__ = ["error_index", "error_pid"]
  32. def __init__(self, msg: str, error_index: int, pid: int):
  33. super().__init__(msg)
  34. self.msg = msg
  35. self.error_index = error_index
  36. self.pid = pid
  37. def __reduce__(self):
  38. return type(self), (self.msg, self.error_index, self.pid)
  39. class ProcessRaisedException(ProcessException):
  40. """Exception raised when a process failed due to an exception raised by the code."""
  41. def __init__(
  42. self,
  43. msg: str,
  44. error_index: int,
  45. error_pid: int,
  46. ):
  47. super().__init__(msg, error_index, error_pid)
  48. class ProcessExitedException(ProcessException):
  49. """Exception raised when a process failed due to signal or exited with a specific code."""
  50. __slots__ = ["exit_code"]
  51. def __init__(
  52. self,
  53. msg: str,
  54. error_index: int,
  55. error_pid: int,
  56. exit_code: int,
  57. signal_name: Optional[str] = None,
  58. ):
  59. super().__init__(msg, error_index, error_pid)
  60. self.exit_code = exit_code
  61. self.signal_name = signal_name
  62. def __reduce__(self):
  63. return (
  64. type(self),
  65. (self.msg, self.error_index, self.pid, self.exit_code, self.signal_name),
  66. )
  67. def _wrap(fn, i, args, error_file):
  68. # prctl(2) is a Linux specific system call.
  69. # On other systems the following function call has no effect.
  70. # This is set to ensure that non-daemonic child processes can
  71. # terminate if their parent terminates before they do.
  72. _prctl_pr_set_pdeathsig(signal.SIGINT)
  73. try:
  74. fn(i, *args)
  75. except KeyboardInterrupt:
  76. pass # SIGINT; Killed by parent, do nothing
  77. except Exception:
  78. # Propagate exception to parent process, keeping original traceback
  79. import traceback
  80. with open(error_file, "wb") as fh:
  81. pickle.dump(traceback.format_exc(), fh)
  82. sys.exit(1)
  83. class ProcessContext:
  84. def __init__(self, processes, error_files):
  85. self.error_files = error_files
  86. self.processes = processes
  87. self.sentinels = {
  88. process.sentinel: index for index, process in enumerate(processes)
  89. }
  90. def pids(self):
  91. return [int(process.pid) for process in self.processes]
  92. def _join_procs_with_timeout(self, timeout: float):
  93. """Attempt to join all processes with a shared timeout."""
  94. end = time.monotonic() + timeout
  95. for process in self.processes:
  96. time_to_wait = max(0, end - time.monotonic())
  97. process.join(time_to_wait)
  98. def join(
  99. self, timeout: Optional[float] = None, grace_period: Optional[float] = None
  100. ):
  101. r"""Join one or more processes within spawn context.
  102. Attempt to join one or more processes in this spawn context.
  103. If one of them exited with a non-zero exit status, this function
  104. kills the remaining processes (optionally with a grace period)
  105. and raises an exception with the cause of the first process exiting.
  106. Returns ``True`` if all processes have been joined successfully,
  107. ``False`` if there are more processes that need to be joined.
  108. Args:
  109. timeout (float): Wait this long (in seconds) before giving up on waiting.
  110. grace_period (float): When any processes fail, wait this long (in seconds)
  111. for others to shutdown gracefully before terminating them. If they
  112. still don't exit, wait another grace period before killing them.
  113. """
  114. # Ensure this function can be called even when we're done.
  115. if len(self.sentinels) == 0:
  116. return True
  117. # Wait for any process to fail or all of them to succeed.
  118. ready = multiprocessing.connection.wait(
  119. self.sentinels.keys(),
  120. timeout=timeout,
  121. )
  122. error_index = None
  123. for sentinel in ready:
  124. index = self.sentinels.pop(sentinel)
  125. process = self.processes[index]
  126. process.join()
  127. if process.exitcode != 0:
  128. error_index = index
  129. break
  130. # Return if there was no error.
  131. if error_index is None:
  132. # Return whether or not all processes have been joined.
  133. return len(self.sentinels) == 0
  134. # An error occurred. Clean-up all processes before returning.
  135. # First, allow a grace period for processes to shutdown themselves.
  136. if grace_period is not None:
  137. self._join_procs_with_timeout(grace_period)
  138. # Then, terminate processes that are still alive. Try SIGTERM first.
  139. for process in self.processes:
  140. if process.is_alive():
  141. log.warning("Terminating process %s via signal SIGTERM", process.pid)
  142. process.terminate()
  143. # Try SIGKILL if the process isn't going down after another grace_period.
  144. # The reason is related to python signal handling is limited
  145. # to main thread and if that is in c/c++ land and stuck it won't
  146. # to handle it. We have seen processes getting stuck not handling
  147. # SIGTERM for the above reason.
  148. self._join_procs_with_timeout(30 if grace_period is None else grace_period)
  149. for process in self.processes:
  150. if process.is_alive():
  151. log.warning(
  152. "Unable to shutdown process %s via SIGTERM , forcefully exiting via SIGKILL",
  153. process.pid,
  154. )
  155. process.kill()
  156. process.join()
  157. # The file will only be created if the process crashed.
  158. failed_process = self.processes[error_index]
  159. if not os.access(self.error_files[error_index], os.R_OK):
  160. exitcode = self.processes[error_index].exitcode
  161. if exitcode < 0:
  162. try:
  163. name = signal.Signals(-exitcode).name
  164. except ValueError:
  165. name = f"<Unknown signal {-exitcode}>"
  166. raise ProcessExitedException(
  167. f"process {error_index:d} terminated with signal {name}",
  168. error_index=error_index,
  169. error_pid=failed_process.pid,
  170. exit_code=exitcode,
  171. signal_name=name,
  172. )
  173. else:
  174. raise ProcessExitedException(
  175. f"process {error_index:d} terminated with exit code {exitcode:d}",
  176. error_index=error_index,
  177. error_pid=failed_process.pid,
  178. exit_code=exitcode,
  179. )
  180. with open(self.error_files[error_index], "rb") as fh:
  181. original_trace = pickle.load(fh)
  182. msg = f"\n\n-- Process {error_index:d} terminated with the following error:\n"
  183. msg += original_trace
  184. raise ProcessRaisedException(msg, error_index, failed_process.pid)
  185. class SpawnContext(ProcessContext):
  186. def __init__(self, processes, error_files):
  187. warnings.warn("SpawnContext is renamed to ProcessContext since 1.4 release.")
  188. super().__init__(processes, error_files)
# Note: [start_processes]
# mp.start_processes handles both start_method='spawn' and 'fork'. It is meant to be a
# more general API than mp.spawn. Currently we only document mp.spawn, as it uses the
# CUDA-compatible start_method. However, in environments like IPython notebooks, 'fork'
# works better than 'spawn'. Every helper function we created for mp.spawn is
# general enough that backends like XLA can reuse them in Colab notebooks as well.
# For now we only add this API; we can consider adding it to the documentation as
# needed in the future.
  197. def start_processes(
  198. fn,
  199. args=(),
  200. nprocs=1,
  201. join=True,
  202. daemon=False,
  203. start_method="spawn",
  204. numa_options: Optional[NumaOptions] = None,
  205. ):
  206. # To speed up performance in certain cases (see https://github.com/pytorch/pytorch/issues/133010),
  207. # this func will start processes in parallel if start_method is 'forkserver'.
  208. # Please opt in to this perf optimization by setting env var (TORCH_MP_PARALLEL_START) to 1.
  209. # todo: investigate why spawn does not work with threadpool and raises SIGINT
  210. if (
  211. start_method == "forkserver"
  212. and os.environ.get(ENV_VAR_PARALLEL_START, "0") == "1"
  213. ):
  214. log.info("Starting processes in parallel.")
  215. start_parallel = True
  216. else:
  217. # Set env var TORCH_MP_PARALLEL_START to 0 to disable parallel start
  218. start_parallel = False
  219. mp = multiprocessing.get_context(start_method)
  220. error_files = [None] * nprocs
  221. processes = [None] * nprocs
  222. def start_process(i):
  223. # Each process is assigned a file to write tracebacks to. We
  224. # use the file being non-empty to indicate an exception
  225. # occurred (vs an expected shutdown). Note: this previously
  226. # used a multiprocessing.Queue but that can be prone to
  227. # deadlocks, so we went with a simpler solution for a one-shot
  228. # message between processes.
  229. tf = tempfile.NamedTemporaryFile(
  230. prefix="pytorch-errorfile-", suffix=".pickle", delete=False
  231. )
  232. tf.close()
  233. os.unlink(tf.name)
  234. process = mp.Process(
  235. target=_wrap,
  236. args=(fn, i, args, tf.name),
  237. daemon=daemon,
  238. )
  239. # HACK [NUMA inheritance]: Subprocesses inherit the parent thread's CPU
  240. # affinity. So, we temporarily apply the bindings to the current thread,
  241. # and then immediately undo them.
  242. # This is necessary because the alternatives would be to
  243. # either
  244. # 1. Use numactl CLI. However, Python's multiprocessing library
  245. # does not provide an API which would allow us to prepend
  246. # the command it runs with numactl options.
  247. # 2. Wrap the provided function such that it first applies
  248. # NUMA bindings, and then executes as expected. However, this
  249. # can result in worse memory locality, because torch and CUDA
  250. # initialization would occur before applying the bindings, thus
  251. # allowing some memory to be allocated on the wrong NUMA nodes.
  252. with maybe_temporarily_apply_numa_binding_to_current_thread(
  253. gpu_index=i, numa_options=numa_options
  254. ):
  255. process.start()
  256. return i, process, tf.name
  257. if not start_parallel:
  258. for i in range(nprocs):
  259. idx, process, tf_name = start_process(i)
  260. error_files[idx] = tf_name
  261. processes[idx] = process
  262. else:
  263. with ThreadPoolExecutor(max_workers=nprocs) as executor:
  264. futures = [executor.submit(start_process, i) for i in range(nprocs)]
  265. for fut in as_completed(futures):
  266. idx, process, tf_name = fut.result()
  267. # idx and process rank needs to be the same.
  268. error_files[idx] = tf_name
  269. processes[idx] = process
  270. context = ProcessContext(processes, error_files)
  271. if not join:
  272. return context
  273. # Loop on join until it returns True or raises an exception.
  274. while not context.join():
  275. pass
  276. def spawn(fn, args=(), nprocs=1, join=True, daemon=False, start_method="spawn"):
  277. r"""Spawns ``nprocs`` processes that run ``fn`` with ``args``.
  278. If one of the processes exits with a non-zero exit status, the
  279. remaining processes are killed and an exception is raised with the
  280. cause of termination. In the case an exception was caught in the
  281. child process, it is forwarded and its traceback is included in
  282. the exception raised in the parent process.
  283. Args:
  284. fn (function): Function is called as the entrypoint of the
  285. spawned process. This function must be defined at the top
  286. level of a module so it can be pickled and spawned. This
  287. is a requirement imposed by multiprocessing.
  288. The function is called as ``fn(i, *args)``, where ``i`` is
  289. the process index and ``args`` is the passed through tuple
  290. of arguments.
  291. args (tuple): Arguments passed to ``fn``.
  292. nprocs (int): Number of processes to spawn.
  293. join (bool): Perform a blocking join on all processes.
  294. daemon (bool): The spawned processes' daemon flag. If set to True,
  295. daemonic processes will be created.
  296. start_method (str): (deprecated) this method will always use ``spawn``
  297. as the start method. To use a different start method
  298. use ``start_processes()``.
  299. Returns:
  300. None if ``join`` is ``True``,
  301. :class:`~ProcessContext` if ``join`` is ``False``
  302. """
  303. if start_method != "spawn":
  304. msg = (
  305. f"This method only supports start_method=spawn (got: {start_method}).\n"
  306. "To use a different start_method use:\n\t\t"
  307. " torch.multiprocessing.start_processes(...)"
  308. )
  309. warnings.warn(msg, FutureWarning, stacklevel=2)
  310. return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")