device_mesh.py 63 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370
  1. # mypy: allow-untyped-defs
  2. # Copyright (c) Meta Platforms, Inc. and affiliates
  3. import logging
  4. import os
  5. import threading
  6. import warnings
  7. from collections.abc import Iterator
  8. from itertools import zip_longest
  9. from typing import Optional, TYPE_CHECKING, Union
  10. import torch
  11. from torch.distributed import is_available
  12. from torch.distributed._mesh_layout import _MeshLayout
  13. from torch.distributed._pycute import IntTuple, is_int, suffix_product
  14. from torch.utils._typing_utils import not_none
  15. __all__ = ["init_device_mesh", "DeviceMesh"]
  if not is_available():
      import sys

      # We need to create the stubs when distributed is not available.
      # Otherwise, we would fail the doc tests (```./.ci/pytorch/docs-test.sh```),
      # since it would try to import ``torch.distributed.device_mesh`` or
      # ``torch.distributed.init_device_mesh`` but cannot find them.

      # Empty placeholder published under the name ``DeviceMesh`` below.
      class _DeviceMeshStub:
          pass

      # No-op placeholder published under the name ``init_device_mesh`` below.
      # Note that it accepts no arguments.
      def _init_device_mesh_stub():
          pass

      # Attach both stubs to the already-registered module object so that
      # attribute lookups on ``torch.distributed.device_mesh`` succeed.
      sys.modules["torch.distributed.device_mesh"].DeviceMesh = _DeviceMeshStub  # type: ignore[attr-defined]
      sys.modules[
          "torch.distributed.device_mesh"
      ].init_device_mesh = _init_device_mesh_stub  # type: ignore[attr-defined]
  30. else:
  31. from torch._C._distributed_c10d import Backend as C10dBackend
  32. from torch.distributed.distributed_c10d import (
  33. _get_default_group,
  34. _resolve_process_group,
  35. get_backend,
  36. get_process_group_ranks,
  37. get_rank,
  38. get_world_size,
  39. GroupName,
  40. init_process_group,
  41. is_initialized,
  42. new_group,
  43. ProcessGroup,
  44. split_group,
  45. )
  # Module-level logger, named after this module.
  logger = logging.getLogger(__name__)

  # only import numpy typing when type checking
  if TYPE_CHECKING:
      try:
          from numpy.typing import ArrayLike
      except ImportError:
          logger.warning(
              "DeviceMesh requires numpy >= 1.21 to be installed for type checking"
          )

  # Per-mesh-dimension backend override: a (backend name, backend options)
  # pair in which either element may be None.
  BackendConfig = tuple[str | None, C10dBackend.Options | None]

  # Register _MeshLayout with torch.serialization's safe-globals list.
  # NOTE(review): presumably so serialized DeviceMesh objects can be loaded
  # in restricted (weights-only) mode -- confirm against torch.serialization.
  torch.serialization.add_safe_globals([_MeshLayout])
  57. class _MeshEnv(threading.local):
  58. def __init__(self) -> None:
  59. self.mesh_stack: list[DeviceMesh] = []
  60. def get_current_mesh(self) -> "DeviceMesh":
  61. if len(self.mesh_stack) == 0:
  62. raise RuntimeError("No device mesh is currently active!")
  63. return self.mesh_stack[-1]
  64. # TODO: to remove it once we move all use cases into new API.
  65. def get_root_mesh(self, device_mesh: "DeviceMesh") -> "DeviceMesh":
  66. # If a mesh could not be found in the child_to_root_mapping, it is a root mesh itself.
  67. # A root mesh is not created through slicing.
  68. # We considers the root mesh of a root mesh is itself.
  69. # We keep this function for backward compatibility.
  70. warnings.warn(
  71. "This get_root_mesh API will be deprecated soon."
  72. "Please use `get_root_mesh` inside DeviceMesh instead.",
  73. stacklevel=2,
  74. )
  75. if not device_mesh:
  76. return device_mesh
  77. return device_mesh._get_root_mesh()
  78. @staticmethod
  79. def num_devices_per_host(device_type: str) -> int:
  80. return _get_device_handle(device_type).device_count()
  81. @staticmethod
  82. def num_hosts(device_type: str) -> int:
  83. # ProcessGroup can't tell us this info so we have to infer it, assume
  84. # homogeneous hardware for now
  85. return get_world_size() // _MeshEnv.num_devices_per_host(device_type)
  86. # TODO: to remove it once we move all use cases into new API.
  87. # We keep this API for backward compatibility.
  88. def _get_all_submeshes(
  89. self, device_mesh: "DeviceMesh", mesh_dim_name: str
  90. ) -> list["DeviceMesh"]:
  91. warnings.warn(
  92. "This _get_all_submeshes API will be deprecated soon."
  93. "Please use `_get_all_submeshes` inside DeviceMesh instead.",
  94. stacklevel=2,
  95. )
  96. return device_mesh._get_all_submeshes(mesh_dim_name)
  97. _mesh_resources: _MeshEnv = _MeshEnv()
  98. def _get_device_handle(device_type: str = "cuda"):
  99. """
  100. Get the module corresponding to the device_type which is cuda or cuda-like device.
  101. For example, when the device_type is cuda, the module `torch.cuda` is returned.
  102. Return None when there is no corresponding module for device_type, otherwise
  103. return the corresponding module.
  104. """
  105. return getattr(torch, device_type, None)
  106. class DeviceMesh:
  107. """
  108. DeviceMesh represents a mesh of devices, where layout of devices could be
  109. represented as a n-d dimension array, and each value of the n-d dimensional
  110. array is the global id of the default process group ranks.
  111. DeviceMesh could be used to setup the N dimensional device connections across the cluster,
  112. and manage the ProcessGroups for N dimensional parallelisms. Communications could happen on
  113. each dimension of the DeviceMesh separately. DeviceMesh respects the device that user selects
  114. already (i.e. if user call `torch.cuda.set_device` before the DeviceMesh initialization),
  115. and will select/set the device for the current process if user does not set the device
  116. beforehand. Note that manual device selection should happen BEFORE the DeviceMesh initialization.
  117. DeviceMesh can also be used as a context manager when using together with DTensor APIs.
  118. .. note::
  119. DeviceMesh follows SPMD programming model, which means the same PyTorch Python program
  120. is running on all processes/ranks in the cluster. Therefore, users need to make sure the
  121. `mesh` array (which describes the layout of devices) should be identical across all ranks.
  122. Inconsistent `mesh` will lead to silent hang.
  123. Args:
  124. device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
  125. mesh (ndarray): A multi-dimensional array or an integer tensor describing the layout
  126. of devices, where the IDs are global IDs of the default process group.
  127. _rank (int): (experimental/internal)
  128. The global rank of the current process. If not provided, it will
  129. be inferred from the default process group.
  130. Returns:
  131. DeviceMesh: A :class:`DeviceMesh` object representing the device layout.
  132. The following program runs on each process/rank in an SPMD manner. In this example, we have 2
  133. hosts with 4 GPUs each.
  134. A reduction over the first dimension of mesh will reduce across
  135. columns (0, 4), .. and (3, 7), a reduction over the second dimension
  136. of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7).
  137. Example::
  138. >>> # xdoctest: +SKIP("no rank")
  139. >>> from torch.distributed.device_mesh import DeviceMesh
  140. >>>
  141. >>> # Initialize device mesh as (2, 4) to represent the topology
  142. >>> # of cross-host(dim 0), and within-host (dim 1).
  143. >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
  144. """
  145. _device_type: str
  146. _rank_map: torch.Tensor
  147. _mesh_dim_names: tuple[str, ...] | None
  148. _layout: _MeshLayout
  149. _root_mesh: Optional["DeviceMesh"] = None
  150. # Record flatten mesh name to its flattened mesh in root mesh.
  151. _flatten_mapping: dict[str, "DeviceMesh"]
  def __init__(
      self,
      device_type: str,
      mesh: Union[torch.Tensor, "ArrayLike"] | None = None,
      *,
      mesh_dim_names: tuple[str, ...] | None = None,
      backend_override: tuple[BackendConfig, ...] | None = None,
      _init_backend: bool = True,
      _rank: int | None = None,
      _layout: _MeshLayout | None = None,
      _rank_map: torch.Tensor | None = None,
      _root_mesh: Optional["DeviceMesh"] = None,
  ) -> None:
      """
      Build a DeviceMesh either from an explicit ``mesh`` or, for private use,
      from a ``_layout`` / ``_rank_map`` pair.

      Args:
          device_type: device type of the mesh; ``"xla"`` skips all process
              group and coordinate setup.
          mesh: array-like or CPU tensor of global ranks. It is converted to a
              contiguous int tensor; its sizes/strides become the layout and
              its flattened values become the rank map.
          mesh_dim_names: optional name for each mesh dimension.
          backend_override: optional per-dimension ``(backend, options)``
              pairs; must have one entry per mesh dimension.
          _init_backend: when False, skip world-group/device setup and
              per-dimension process group creation.
          _rank: rank used to locate this process in the mesh; defaults to
              ``get_rank()``.
          _layout, _rank_map: private alternative to ``mesh``; both are
              required when ``mesh`` is None and forbidden otherwise.
          _root_mesh: mesh this one was derived from, if any.

      Raises:
          TypeError: on an invalid combination of ``mesh`` / ``_layout`` /
              ``_rank_map``.
          ValueError: if ``mesh`` is a non-CPU tensor or ``backend_override``
              has the wrong length.
          AssertionError: if the layout overlaps, the rank map is not a
              contiguous 1-D tensor that is large enough, or the layout is
              inconsistent with the mesh size.
      """
      # no-op in OSS, logs API usage metrics in meta-internal runs
      torch._C._log_api_usage_once(
          "torch.distributed.device_mesh.DeviceMesh.__init__"
      )
      if mesh is not None:
          if _layout is not None or _rank_map is not None:
              raise TypeError(
                  "Cannot provide _layout and/or _rank_map if passing explicit mesh"
              )
          if isinstance(mesh, torch.Tensor) and mesh.device.type != "cpu":
              raise ValueError(f"`mesh` must be a CPU tensor, got {mesh}")
          mesh_tensor = (
              mesh.detach().to(dtype=torch.int).contiguous()
              if isinstance(mesh, torch.Tensor)
              else torch.tensor(mesh, device="cpu", dtype=torch.int)
          )
          _layout = _MeshLayout(mesh_tensor.size(), mesh_tensor.stride())
          _rank_map = mesh_tensor.flatten()
      else:
          if _layout is None or _rank_map is None:
              raise TypeError(
                  "The mesh argument is required except for PRIVATE USAGE ONLY!"
              )

      # Validate with explicit raises (not ``assert``) so the checks are not
      # stripped when Python runs with -O; the exception type is unchanged.
      if not _layout.check_non_overlap():
          raise AssertionError(
              "Please use a non-overlapping layout when creating a DeviceMesh."
          )
      if _rank_map.ndim != 1:
          raise AssertionError("The rank map must be 1-dimensional")
      if not _rank_map.is_contiguous():
          raise AssertionError("The rank map must be contiguous")
      if _rank_map.numel() < _layout.cosize():
          raise AssertionError(
              f"The rank map contains {_rank_map.numel()} element, "
              f"which isn't large enough for layout {_layout}"
          )

      self._device_type = device_type
      self._layout = _layout
      self._rank_map = _rank_map
      self._mesh_dim_names = tuple(mesh_dim_names) if mesh_dim_names else None
      self._root_mesh = _root_mesh

      if backend_override is None:
          backend_override = ((None, None),) * len(self._layout)
      elif len(backend_override) != len(self._layout):
          raise ValueError(
              f"backend_override should have the same length as the number of mesh dimensions, "
              f"but got {len(backend_override)} and {len(self._layout)}."
          )

      # Internal bookkeeping for the device mesh.
      self._layout = (
          _layout
          if _layout
          else _MeshLayout(self.mesh.size(), self.mesh.stride())
      )
      if not self._layout.check_non_overlap():
          raise AssertionError(
              "Please use a non-overlapping layout when creating a DeviceMesh."
          )
      # Because we still need to support slicing of flattened dim from root mesh, so we don't check stride here.
      if self._layout.numel() != self.mesh.numel():
          raise AssertionError(
              "Please use a valid layout when creating a DeviceMesh. "
              f"The layout {self._layout} is not consistent with the mesh size {self.mesh.size()}."
          )

      # private field to pre-generate DeviceMesh's hash
      self._flatten_rank_map = tuple(self._rank_map.tolist())
      self._thread_id = None
      # Initialize instance-specific flatten mapping
      self._flatten_mapping = {}

      # Skip process group initialization if xla device or init backend is False
      # TODO(yeounoh) implement DeviceMesh backend and register XLA backend.
      if device_type != "xla":
          # always try to create default (world) pg, even if it is not initialized
          # already. The world pg is used for device mesh identity (rank) on each
          # process (we need to know if the current global rank is in the mesh or not).
          if _init_backend:
              self._setup_world_group_and_device()
              self._dim_group_names = self._init_process_groups(
                  self._layout,
                  self._rank_map,
                  self._mesh_dim_names,
                  backend_override,
              )

          if is_initialized() and get_backend() == "threaded":
              # pyrefly: ignore [bad-assignment]
              self._thread_id = threading.get_ident()

          if _rank is None:
              _rank = get_rank()

          # calculate the coordinates of the current global rank on the mesh
          rank_coords = (self.mesh == _rank).nonzero()
          if rank_coords.size(0) not in (0, 1):
              raise AssertionError(
                  f"rank_coords.size(0) must be 0 or 1, got {rank_coords.size(0)}"
              )
          # None means the current rank is not part of this mesh.
          self._coordinate_on_dim: list[int] | None = (
              rank_coords[0].tolist() if rank_coords.size(0) > 0 else None
          )
  258. @property
  259. def device_type(self) -> str:
  260. """Returns the device type of the mesh."""
  261. return self._device_type
  262. @property
  263. def mesh(self) -> torch.Tensor:
  264. """Returns the tensor representing the layout of devices."""
  265. full_mesh = self._layout.remap_to_tensor(self._rank_map)
  266. if full_mesh.size(0) == 1:
  267. return full_mesh[0]
  268. my_coords = (full_mesh == get_rank()).nonzero()
  269. if my_coords.size(0) > 0:
  270. return full_mesh[my_coords[0, 0]]
  271. raise RuntimeError(
  272. "In order to get the mesh Tensor of a DeviceMesh it needs to "
  273. "either have all its original dimensions (e.g., no slicing) "
  274. "or it needs to contain the local rank"
  275. )
  276. @property
  277. def mesh_dim_names(self) -> tuple[str, ...] | None:
  278. """Returns the names of mesh dimensions."""
  279. return self._mesh_dim_names
  def _setup_world_group_and_device(self):
      """
      Ensure the default (world) process group exists and pick a device.

      Initializes the default process group if it is not initialized yet,
      checks that the mesh is not larger than the world size, and, only when
      the device module for ``self._device_type`` exists and reports that it
      is not initialized, sets the current device: from ``LOCAL_RANK`` if that
      environment variable is set, otherwise via
      ``global_rank % device_count``.

      Returns:
          The default process group.

      Raises:
          RuntimeError: if the mesh has more ranks than the world size, if no
              device of ``self._device_type`` is visible when the heuristic
              is needed, or if the world size is larger than and not a
              multiple of the per-host device count.
      """
      # TODO: think about how to allow pg options to be passed to world group
      # or mesh dimension groups
      if not is_initialized():
          init_process_group()

      world_size = get_world_size()
      if self._layout.numel() > world_size:
          raise RuntimeError(
              f"Mesh should not be bigger than default world size {world_size}, but found {self._layout.numel()} ranks!"
          )

      # ONLY set the device if the current device is not initialized, if user already
      # set the device before DeviceMesh init, we respect the user's choice.
      device_handle = _get_device_handle(self._device_type)
      if device_handle and not device_handle.is_initialized():
          # auto set the cuda/cuda-like device only if user has not set it, if there's LOCAL_RANK
          # env variable from launchers, we use it to set the device.
          if "LOCAL_RANK" in os.environ:
              local_rank = int(os.environ["LOCAL_RANK"])
              logger.info(
                  "Setting default device for the current process based on LOCAL_RANK=%s",
                  local_rank,
              )
              device_handle.set_device(local_rank)
          else:
              warnings.warn(
                  "It seems like you did not set/select the default device for the current process before the DeviceMesh "
                  "initialization or use a launcher (i.e. torchrun) which populates `LOCAL_RANK` environment variable. "
                  "It is recommended to set the current device for the process BEFORE the DeviceMesh initialization so that "
                  "the underlying communicator (i.e. NCCL) can be initialized properly. "
                  "Given that the current process has no default device selected, DeviceMesh will use a heuristic to set the "
                  "device_id via `global_rank % num_devices_per_host`, assuming homogeneous hardware cluster. ",
                  stacklevel=2,
              )
              # heuristic to set the current cuda/cuda-like device base on num of gpu devices available in each host
              # NOTE: This device selection would only work for homogeneous hardware.
              num_devices_per_host = device_handle.device_count()
              # Without this guard the modulo operations below would fail with
              # an uninformative ZeroDivisionError.
              if num_devices_per_host == 0:
                  raise RuntimeError(
                      f"DeviceMesh cannot select a default device because no {self._device_type} devices were found!"
                  )
              if (
                  world_size > num_devices_per_host
                  and world_size % num_devices_per_host != 0
              ):
                  raise RuntimeError(
                      f"DeviceMesh only support homogeneous hardware, but found "
                      f"{world_size} ranks and {num_devices_per_host} {self._device_type} devices!"
                  )
              device_handle.set_device(get_rank() % num_devices_per_host)

      return _get_default_group()
  @staticmethod
  def _init_one_process_group(
      sub_layout: _MeshLayout,
      rank_map: torch.Tensor,
      dim_name: str,
      backend_override: BackendConfig,
  ) -> GroupName | None:
      """
      Create (or reuse) the process group for one mesh dimension.

      Args:
          sub_layout: layout of the single mesh dimension being set up.
          rank_map: flat tensor of ranks the layout is applied to.
          dim_name: name used to build the group description
              (``f"mesh_{dim_name}"``).
          backend_override: ``(backend, pg_options)`` pair for this dimension.

      Returns:
          The name of the group the current rank belongs to, or ``None`` when
          the ``new_group`` path is taken and the current rank is in none of
          the created subgroups.

      Raises:
          RuntimeError: if the current rank shows up in more than one subgroup
              of this dimension.
      """
      # Generate a 2D global mesh tensor for the current dim for PG creation.
      # Each row is iterated below as the rank list of one subgroup.
      pg_ranks_by_dim = sub_layout.nest().remap_to_tensor(rank_map)
      backend, pg_options = backend_override
      # We need to explicitly pass in timeout when specified in option, otherwise
      # the default timeout will be used to override the timeout set in option.
      # TODO: remove this once we have fixed inside c10d level.
      timeout = pg_options._timeout if pg_options else None

      # The group description is `mesh_<dim_name>`. The caller in this file
      # passes the mesh dim name (e.g. "dp" -> `mesh_dp`) or, when the mesh has
      # no mesh_dim_names, `dim_<index>` (e.g. `mesh_dim_0`, `mesh_dim_1`).
      group_desc = f"mesh_{dim_name}"

      dim_group = None
      default_group = _get_default_group()

      # Early return when this dimension spans the whole world and no backend
      # override was requested: the world group can serve as the dim group.
      if sub_layout.numel() == get_world_size() and backend_override == (
          None,
          None,
      ):
          # Reuse the default pg unless CUDA is available and the default pg's
          # backend is gloo; in that case create a new world-sized group with
          # both a gloo (cpu) and an nccl (cuda) backend.
          ranks = list(range(get_world_size()))
          dim_group = (
              new_group(
                  backend="cpu:gloo,cuda:nccl",
                  ranks=ranks,
                  group_desc="mesh_default",
              )
              if torch.cuda.is_available()
              and get_backend(default_group) == "gloo"
              else default_group
          )
          return dim_group.group_name  # type: ignore[union-attr]

      # If bound_device_id exists, it means the nccl communicator has been eagerly initialized
      # so that we can use `split_group` to create subgroups through `ncclCommSplit`.
      # In this case, we only need to make one API call (`split_group`) for the subgroup creation
      # for each mesh dimension. In a 2 * 4 mesh, we only need to make two API calls per rank to create
      # all the subgroups.
      # Otherwise, we need to make more than one API call (`new_group`) for subgroup creation: the
      # loop at the end of this function calls `new_group` once per row of `pg_ranks_by_dim`, i.e.
      # once per subgroup of this mesh dimension.
      if (
          getattr(default_group, "bound_device_id", None) is not None
          and torch.cuda.is_available()
          and (
              backend is None
              or default_group._get_backend(torch.device("cuda")).name()
              == backend
          )
      ):
          dim_group = split_group(
              parent_pg=default_group,
              timeout=timeout,
              pg_options=pg_options,
              split_ranks=pg_ranks_by_dim.tolist(),
              group_desc=group_desc,
          )
          return dim_group.group_name  # type: ignore[union-attr]

      # Fallback: create every subgroup of this dimension with `new_group`.
      # `new_group` is called for every row (not only the row containing the
      # current rank), but only the name of the group that contains the current
      # rank is remembered.
      pg_name = None
      for dim_mesh in pg_ranks_by_dim:
          subgroup_ranks = dim_mesh.tolist()
          dim_group = new_group(
              ranks=subgroup_ranks,
              timeout=timeout,
              backend=backend,
              pg_options=pg_options,
              group_desc=group_desc,
          )

          # only record the group if the current rank is in the subgroup
          if get_rank() in subgroup_ranks:
              if pg_name is not None:
                  raise RuntimeError(
                      f"Each device mesh dimension should get only one process group, but got {get_rank()} "
                      f"in {subgroup_ranks}!"
                  )
              pg_name = dim_group.group_name
      return pg_name
  415. @staticmethod
  416. def _init_process_groups(
  417. layout: _MeshLayout,
  418. rank_map: torch.Tensor,
  419. mesh_dim_names: tuple[str, ...] | None,
  420. backend_override: tuple[BackendConfig, ...],
  421. ) -> list[GroupName]:
  422. # group_name associated with each mesh dimension, each
  423. # mesh dimension should have one sub-group per rank
  424. dim_group_names: list[GroupName | None] = []
  425. # create sub pgs base on the mesh argument specified
  426. for dim in range(len(layout)):
  427. dim_name = mesh_dim_names[dim] if mesh_dim_names else f"dim_{dim}"
  428. dim_group_names.append(
  429. DeviceMesh._init_one_process_group(
  430. layout[dim], rank_map, dim_name, backend_override[dim]
  431. )
  432. )
  433. # Filter out None values. If any are None then they should all be None.
  434. dim_non_none_group_names = [n for n in dim_group_names if n is not None]
  435. assert not dim_non_none_group_names or len(dim_non_none_group_names) == len(
  436. dim_group_names
  437. )
  438. return dim_non_none_group_names
  439. def _get_root_mesh(self) -> "DeviceMesh":
  440. return self._root_mesh if self._root_mesh else self
  def __enter__(self) -> "DeviceMesh":
      """Push this mesh onto the mesh stack of the mesh env and return it."""
      active_meshes = _mesh_resources.mesh_stack
      active_meshes.append(self)
      return self
  # pyre-fixme[2]: Parameter must be annotated.
  def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
      """
      Pop the top entry from the mesh stack of the mesh env.

      The exception arguments are ignored, and the popped entry is not
      checked against ``self``.
      """
      active_meshes = _mesh_resources.mesh_stack
      active_meshes.pop()
  449. def __repr__(self) -> str:
  450. device_mesh_repr = (
  451. f"({', '.join(f'{k}={v}' for k, v in zip(self._mesh_dim_names, self._layout.top_level_sizes))})"
  452. if self._mesh_dim_names
  453. else f"{self._layout.top_level_sizes}"
  454. )
  455. device_mesh_repr = f"DeviceMesh({device_mesh_repr}, '{self.device_type}', stride={self._layout.strides}"
  456. # We only print the mesh tensor if the debug mode is turned on.
  457. if os.environ.get("TORCH_DISTRIBUTED_DEBUG", "") == "DETAIL":
  458. device_mesh_repr += f", Mesh: {self.mesh.tolist()}"
  459. return f"{device_mesh_repr})"
  460. def __hash__(self):
  461. # lazily compute hash
  462. self._hash = getattr(self, "_hash", None)
  463. if not self._hash:
  464. self._hash = hash(
  465. (
  466. self._flatten_rank_map,
  467. self._layout,
  468. self._device_type,
  469. self._mesh_dim_names,
  470. self._thread_id,
  471. )
  472. )
  473. return self._hash
  474. def __eq__(self, other: object) -> bool:
  475. if self is other:
  476. return True
  477. if not isinstance(other, DeviceMesh):
  478. return False
  479. return (
  480. self._flatten_rank_map == other._flatten_rank_map
  481. and self._layout == other._layout
  482. and self._device_type == other._device_type
  483. and self._mesh_dim_names == other._mesh_dim_names
  484. and self._thread_id == other._thread_id
  485. )
  486. def __getitem__(self, mesh_dim_names: str | tuple[str, ...]) -> "DeviceMesh":
  487. """
  488. Slice the current DeviceMesh based on the mesh_dim_names given to create a submesh.
  489. The submesh created consists of the dimensions and the communicators indicated by
  490. ``mesh_dim_names``
  491. Args:
  492. mesh_dim_names (Union[str, tuple[str, ...]]): the name or the tuple of names of the
  493. mesh dimension of the DeviceMesh to create the submesh for.
  494. Returns:
  495. A :class:`DeviceMesh` object
  496. The following program runs on each process/rank in an SPMD manner in a world size of 8.
  497. In the first example:
  498. Calling mesh_2d["tp"] on rank 0, 1, 2, 3 returns a 1D submesh of DeviceMesh:([0, 1, 2, 3]).
  499. Calling mesh_2d["tp"] on rank 4, 5, 6, 7 returns a 1D submesh of DeviceMesh:([4, 5, 6, 7]).
  500. Calling mesh_2d["dp"] on rank 0, 4 returns a 1D submesh of DeviceMesh:([0, 4]).
  501. Calling mesh_2d["dp"] on rank 1, 5 returns a 1D submesh of DeviceMesh:([1, 5]).
  502. Calling mesh_2d["dp"] on rank 2, 6 returns a 1D submesh of DeviceMesh:([2, 6]).
  503. Calling mesh_2d["dp"] on rank 3, 7 returns a 1D submesh of DeviceMesh:([3, 7]).
  504. In the second example:
  505. Calling mesh_3d["dp", "cp"] on rank 0, 1, 4, 5 returns a 2D submesh of DeviceMesh:([[0, 1], [4, 5]]).
  506. Calling mesh_3d["dp", "cp"] on rank 2, 3, 6, 7 returns a 2D submesh of DeviceMesh:([[2, 3], [6, 7]]).
  507. Calling mesh_3d["cp", "dp"] on rank 0, 1, 4, 5 returns a 2D submesh of DeviceMesh:([[0, 4], [1, 5]]).
  508. Calling mesh_3d["cp", "dp"] on rank 2, 3, 6, 7 returns a 2D submesh of DeviceMesh:([[2, 6], [3, 7]]).
  509. Example::
  510. >>> # xdoctest: +SKIP("no rank")
  511. >>> from torch.distributed.device_mesh import DeviceMesh
  512. >>>
  513. >>> # Initialize a 2D device mesh as (2, 4) to represent the topology
  514. >>> # of cross-host(dim 0), and within-host (dim 1).
  515. >>> mesh_2d = init_device_mesh(device_type="cuda", (2,4), mesh_dim_names=("dp", "tp"))
  516. >>> tp_mesh = mesh_2d["tp"]
  517. >>> dp_mesh = mesh_2d["dp"]
  518. >>>
  519. >>> # Initialize a 3D mesh.
  520. >>> mesh_3d = init_device_mesh(device_type="cuda", (2,2,2), mesh_dim_names=("dp", "pp", "cp"))
  521. >>> # The order of the mesh_dim_names provided deteremines the order of dimensions in the submesh.
  522. >>> dp_cp_mesh = mesh_3d["dp", "cp"]
  523. >>> cp_dp_mesh = mesh_3d["cp", "dp"]
  524. """
  525. if not self._mesh_dim_names:
  526. raise RuntimeError("Cannot slice a DeviceMesh without mesh_dim_names!")
  527. mesh_dim_names = (
  528. (mesh_dim_names,) if isinstance(mesh_dim_names, str) else mesh_dim_names
  529. )
  530. if mesh_dim_names == self._mesh_dim_names:
  531. return self
  532. else:
  533. sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
  534. # When using FakeTensorMode to trace the model, `_create_sub_mesh()` will
  535. # fail as it will require a real tensor to manipulate.
  536. # `unset_fake_temporarily()` will allow us to materialize the tensors
  537. # within `_create_sub_mesh`, which should not affect modling.
  538. #
  539. # Note that this should be orthogonal to torch.compile(). But whether
  540. # we can compile device_mesh `slicing` (no graph break) is not verified
  541. # yet and need a follow-up,
  542. # TODO: compiler + device_mesh slicing.
  543. with torch._subclasses.fake_tensor.unset_fake_temporarily():
  544. submesh = self._create_sub_mesh(sliced_mesh_layout, mesh_dim_names)
  545. return submesh
  546. def get_group(self, mesh_dim: int | str | None = None) -> ProcessGroup:
  547. """
  548. Returns the single ProcessGroup specified by mesh_dim, or, if mesh_dim is not specified and the
  549. DeviceMesh is 1-dimensional, returns the only ProcessGroup in the mesh.
  550. Args:
  551. mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
  552. of the mesh dimension. Default is None.
  553. Returns:
  554. A :class:`ProcessGroup` object.
  555. """
  556. if not hasattr(self, "_dim_group_names"):
  557. raise RuntimeError("DeviceMesh process groups not initialized!")
  558. if len(self._layout) > 1 and mesh_dim is None:
  559. raise RuntimeError(
  560. f"Found the DeviceMesh have {len(self._layout)} dimensions",
  561. "Optional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.",
  562. "If you want to get the list of all the ProcessGroups in the DeviceMesh,"
  563. "please use `get_all_groups()` instead.",
  564. )
  565. # Quick return if the current device_mesh is a 1D mesh.
  566. if len(self._layout) == 1 and mesh_dim is None:
  567. return not_none(_resolve_process_group(self._dim_group_names[0]))
  568. root_mesh = self._get_root_mesh()
  569. root_to_flatten_mapping = root_mesh._flatten_mapping
  570. if root_to_flatten_mapping and mesh_dim in root_to_flatten_mapping:
  571. dim_group_name = root_to_flatten_mapping[
  572. mesh_dim # type: ignore[index]
  573. ]._dim_group_names[0]
  574. return not_none(_resolve_process_group(dim_group_name))
  575. else:
  576. mesh_dim = (
  577. self._get_mesh_dim_by_name(mesh_dim)
  578. if isinstance(mesh_dim, str)
  579. else mesh_dim
  580. )
  581. if not isinstance(mesh_dim, int):
  582. raise AssertionError(
  583. f"mesh_dim must be an int, got {type(mesh_dim)}"
  584. )
  585. return not_none(_resolve_process_group(self._dim_group_names[mesh_dim]))
  586. def get_all_groups(self) -> list[ProcessGroup]:
  587. """
  588. Returns a list of ProcessGroups for all mesh dimensions.
  589. Returns:
  590. A list of :class:`ProcessGroup` object.
  591. """
  592. return [self.get_group(i) for i in range(len(self._layout))]
  593. def _create_sub_mesh(
  594. self,
  595. layout: _MeshLayout,
  596. submesh_dim_names: tuple[str, ...],
  597. ) -> "DeviceMesh":
  598. root_mesh = self._get_root_mesh()
  599. slice_dim_group_name = []
  600. for name in submesh_dim_names:
  601. if name in not_none(self._mesh_dim_names):
  602. slice_dim_group_name.append(
  603. self._dim_group_names[ # type: ignore[has-type]
  604. not_none(self._mesh_dim_names).index(name)
  605. ]
  606. )
  607. else:
  608. # If device_mesh is not root_mesh, we already throw error in _get_slice_mesh_layout
  609. # Since we will deprecate the slicing of flattened dim_name from root mesh soon,
  610. # we don't want to optimize the code furthermore.
  611. flatten_mesh = self._flatten_mapping[name]
  612. slice_dim_group_name.append(
  613. flatten_mesh._dim_group_names[ # type: ignore[has-type]
  614. not_none(flatten_mesh._mesh_dim_names).index(name)
  615. ]
  616. )
  617. res_submesh = DeviceMesh(
  618. self._device_type,
  619. _layout=layout,
  620. _rank_map=root_mesh._rank_map,
  621. mesh_dim_names=submesh_dim_names,
  622. _root_mesh=root_mesh,
  623. _init_backend=False,
  624. )
  625. res_submesh._dim_group_names = slice_dim_group_name
  626. return res_submesh
  627. def _create_flatten_mesh(
  628. self,
  629. mesh_dim_name: str | None = None,
  630. backend_override: BackendConfig = (None, None),
  631. ) -> "DeviceMesh":
  632. root_mesh = self._get_root_mesh()
  633. if not mesh_dim_name:
  634. mesh_dim_name = "_".join(not_none(self._mesh_dim_names))
  635. # Flatten a 1D device mesh into its original mesh_dim_name will return itself.
  636. if self.ndim == 1 and mesh_dim_name in not_none(self._mesh_dim_names):
  637. return self
  638. # Check whether the mesh_dim_name for flattened mesh is valid.
  639. invalid_dim_names = not_none(root_mesh._mesh_dim_names)
  640. if mesh_dim_name in invalid_dim_names:
  641. raise ValueError(
  642. f"{mesh_dim_name} already exists for submesh of the {root_mesh}. ",
  643. f"The mesh_dim_names of submesh and flattened mesh are {invalid_dim_names}. "
  644. f"Please specify another valid mesh_dim_name.",
  645. )
  646. flattened_mesh_layout = self._layout.coalesce()
  647. if len(flattened_mesh_layout) > 1:
  648. flattened_mesh_layout = flattened_mesh_layout.nest()
  649. # Quick return if the flatten mesh has been created before.
  650. if mesh_dim_name in root_mesh._flatten_mapping:
  651. if (
  652. flattened_mesh_layout
  653. == root_mesh._flatten_mapping[mesh_dim_name]._layout
  654. ):
  655. return root_mesh._flatten_mapping[mesh_dim_name]
  656. else:
  657. raise ValueError(
  658. f"Flatten mesh with mesh_dim_name {mesh_dim_name} has been created before, "
  659. f"Please specify another valid mesh_dim_name."
  660. )
  661. res_flattened_mesh = DeviceMesh(
  662. root_mesh._device_type,
  663. _layout=flattened_mesh_layout,
  664. _rank_map=root_mesh._rank_map,
  665. mesh_dim_names=(mesh_dim_name,),
  666. _root_mesh=root_mesh,
  667. backend_override=(backend_override,),
  668. )
  669. root_mesh._flatten_mapping[mesh_dim_name] = res_flattened_mesh
  670. return res_flattened_mesh
  671. def _get_root_mesh_dim(self) -> int | None:
  672. """
  673. Returns the index of the mesh dim in the root mesh.
  674. The device_mesh passed in needs to be sliced out from the root mesh
  675. or submesh of the root mesh.
  676. """
  677. root_mesh = self._get_root_mesh()
  678. child_mesh_dim_names = self._mesh_dim_names
  679. if root_mesh and child_mesh_dim_names:
  680. if len(child_mesh_dim_names) != 1:
  681. raise AssertionError("The submesh can only be a 1D mesh.")
  682. child_mesh_dim_name = child_mesh_dim_names[0]
  683. return root_mesh._get_mesh_dim_by_name(child_mesh_dim_name)
  684. return None
  685. def _get_mesh_dim_by_name(self, mesh_dim_name: str) -> int:
  686. if self._mesh_dim_names is None or len(self._mesh_dim_names) == 0:
  687. raise KeyError(
  688. "No `mesh_dim_names` found.",
  689. )
  690. if mesh_dim_name not in self._mesh_dim_names:
  691. raise KeyError(
  692. f"Mesh dimension '{mesh_dim_name}' does not exist.",
  693. f"Available mesh dimensions are: mesh_dim_names={self._mesh_dim_names}",
  694. )
  695. return not_none(self._mesh_dim_names.index(mesh_dim_name))
  696. def _get_slice_mesh_layout(
  697. self, mesh_dim_names: tuple[str, ...]
  698. ) -> _MeshLayout:
  699. """
  700. Validate whether the mesh_dim_names is valid for slicing the given device_mesh.
  701. If valid, return dim indexes of the slice mesh in the device mesh.
  702. """
  703. slice_from_root = True
  704. if self != self._get_root_mesh():
  705. slice_from_root = False
  706. # The slice mesh_dim_names should consist either the current device_mesh's mesh_dim_names
  707. # or its flattened mesh's mesh_dim_names if it's root_mesh.
  708. flatten_name_to_root_layout = (
  709. {
  710. key: mesh._layout
  711. for key, mesh in self._get_root_mesh()._flatten_mapping.items()
  712. }
  713. if slice_from_root
  714. else {}
  715. )
  716. valid_mesh_dim_names = [
  717. *not_none(self._mesh_dim_names),
  718. *flatten_name_to_root_layout,
  719. ]
  720. if not all(
  721. mesh_dim_name in valid_mesh_dim_names
  722. for mesh_dim_name in mesh_dim_names
  723. ):
  724. raise KeyError(
  725. f"Invalid mesh_dim_names {mesh_dim_names} specified. "
  726. f"Valid mesh_dim_names are {valid_mesh_dim_names}."
  727. )
  728. layout_sliced = []
  729. for name in mesh_dim_names:
  730. if name in not_none(self._mesh_dim_names):
  731. layout_sliced.append(
  732. self._layout[not_none(self._mesh_dim_names).index(name)]
  733. )
  734. elif name in flatten_name_to_root_layout:
  735. warnings.warn(
  736. "Slicing a flattened dim from root mesh will be deprecated in PT 2.11. "
  737. "Users need to bookkeep the flattened mesh directly. ",
  738. stacklevel=2,
  739. )
  740. layout_sliced.append(flatten_name_to_root_layout[name])
  741. sliced_sizes = tuple(l.sizes for l in layout_sliced)
  742. sliced_strides = tuple(l.strides for l in layout_sliced)
  743. # The check below is from DeviceMesh's implementation before adopting CuTe layout for internal
  744. # bookkeeping and it can be removed but we need to define what is the expected behavior.
  745. # TODO: Remove the below check and define the expected behavior.
  746. # Validate the order of the slice mesh dim indices.
  747. # This needs to be in ascending order.
  748. pre_stride = -1
  749. for stride in reversed(sliced_strides):
  750. # Note that with CuTe layout, we can support slicing flattened non-contiguous mesh dims with no problem.
  751. # But this will make this behavior complicated so we decided to not support it for now.
  752. if not is_int(stride):
  753. raise NotImplementedError(
  754. "Currently, this only allows slicing out a contiguous flattened dim."
  755. )
  756. if stride < pre_stride:
  757. raise KeyError(
  758. f"Invalid mesh_dim_names {mesh_dim_names} specified. "
  759. "Mesh dim indices should be in ascending order."
  760. )
  761. pre_stride = stride
  762. # When users sliced dim_names outside from current mesh, we will check whether
  763. # there is layout overlap.
  764. # TODO: Eventually we will just directly throw error here because
  765. # we will deprecate the slicing of flattened dim_name from root mesh.
  766. layout_sliced = _MeshLayout(sliced_sizes, sliced_strides)
  767. if not layout_sliced.check_non_overlap():
  768. raise RuntimeError(
  769. f"Slicing overlapping dim_names {mesh_dim_names} is not allowed."
  770. )
  771. return layout_sliced
# TODO: expose this as a public API in the future, since other components use it.
  773. def _get_all_submeshes(self, mesh_dim_name: str) -> list["DeviceMesh"]:
  774. """
  775. Return all the submeshes of a given mesh dimension of the device mesh.
  776. """
  777. mesh_dim = self._get_mesh_dim_by_name(mesh_dim_name)
  778. layout = self._layout[mesh_dim]
  779. pg_ranks_by_dim = layout.remap_to_tensor(self._rank_map)
  780. cur_rank = self.get_rank()
  781. res_submeshes = []
  782. for mesh_1d in pg_ranks_by_dim:
  783. submesh = DeviceMesh(
  784. self._device_type,
  785. mesh_1d,
  786. mesh_dim_names=(mesh_dim_name,),
  787. _init_backend=False,
  788. )
  789. submesh._dim_group_names = ( # type: ignore[has-type]
  790. [self._dim_group_names[mesh_dim]] # type: ignore[has-type]
  791. if cur_rank in mesh_1d
  792. else []
  793. )
  794. res_submeshes.append(submesh)
  795. return res_submeshes
  796. @staticmethod
  797. def from_group(
  798. group: ProcessGroup | list[ProcessGroup],
  799. device_type: str,
  800. mesh: Union[torch.Tensor, "ArrayLike"] | None = None,
  801. *,
  802. mesh_dim_names: tuple[str, ...] | None = None,
  803. ) -> "DeviceMesh":
  804. """
  805. Constructs a :class:`DeviceMesh` with ``device_type`` from an
  806. existing :class:`ProcessGroup` or a list of existing :class:`ProcessGroup`.
  807. The constructed device mesh has number of dimensions equal to the
  808. number of groups passed. For example, if a single process group is passed in,
  809. the resulted DeviceMesh is a 1D mesh. If a list of 2 process groups is passed in,
  810. the resulted DeviceMesh is a 2D mesh.
  811. If more than one group is passed, then the ``mesh`` and ``mesh_dim_names`` arguments
  812. are required. The order of the process groups passed in determines the topology of
  813. the mesh. For example, the first process group will be the 0th dimension of the DeviceMesh.
  814. The `mesh` tensor passed in must have the same number of dimensions as the number of process
  815. groups passed in, and the order of the dimensions in the `mesh` tensor must match the order
  816. in the process groups passed in.
  817. Args:
  818. group (ProcessGroup or list[ProcessGroup]): the existing ProcessGroup
  819. or a list of existing ProcessGroups.
  820. device_type (str): The device type of the mesh. Currently supports: "cpu",
  821. "cuda/cuda-like". Passing in a device type with a GPU index, such as "cuda:0",
  822. is not allowed.
  823. mesh (torch.Tensor or ArrayLike, optional): A multi-dimensional array or an
  824. integer tensor describing the layout of devices, where the IDs are global IDs
  825. of the default process group. Default is None.
  826. mesh_dim_names (tuple[str, ...], optional): A tuple of mesh dimension names to assign
  827. to each dimension of the multi-dimensional array describing the layout of devices.
  828. Its length must match the length of `mesh_shape`. Each string in `mesh_dim_names`
  829. must be unique. Default is None.
  830. Returns:
  831. DeviceMesh: A :class:`DeviceMesh` object representing the device layout.
  832. """
  833. # 1D scenario
  834. if isinstance(group, ProcessGroup):
  835. group_ranks = get_process_group_ranks(group)
  836. if (
  837. isinstance(mesh, torch.Tensor) and mesh.tolist() != group_ranks
  838. ) or (
  839. mesh is not None
  840. and not isinstance(mesh, torch.Tensor)
  841. and mesh != group_ranks
  842. ):
  843. raise ValueError(
  844. f"Invalid mesh {str(mesh)} for ProcessGroup with ranks {group_ranks}"
  845. )
  846. mesh = torch.tensor(group_ranks, device="cpu", dtype=torch.int)
  847. device_mesh = DeviceMesh(
  848. device_type,
  849. mesh,
  850. mesh_dim_names=mesh_dim_names,
  851. _init_backend=False,
  852. )
  853. device_mesh._dim_group_names = [group.group_name]
  854. return device_mesh
  855. # nD scenario
  856. groups = list(group)
  857. if len(groups) == 0:
  858. raise ValueError("Expects at least one ProcessGroup to be passed")
  859. if mesh is None:
  860. raise ValueError("Must pass mesh if passing multiple ProcessGroups")
  861. if mesh_dim_names is None:
  862. raise ValueError(
  863. "Must pass mesh_dim_names if passing multiple ProcessGroups"
  864. )
  865. # When init a DeviceMesh with multiple ProcessGroups directly, we need to make sure
  866. # the mesh tensor is contiguous. Otherwise, the layout we inferred from the mesh tensor
  867. # will have larger span than the actual tensor. This is just internal implementation detail
  868. # and does not affect user facing behavior.
  869. mesh = (
  870. mesh.detach().to(dtype=torch.int, device="cpu")
  871. if isinstance(mesh, torch.Tensor)
  872. else torch.tensor(mesh, device="cpu", dtype=torch.int)
  873. )
  874. if mesh.ndim != len(groups):
  875. raise ValueError(
  876. "Expects mesh with ndim equal to number of ProcessGroups but got "
  877. f"mesh {mesh.tolist()} and {len(groups)} ProcessGroups"
  878. )
  879. device_mesh = DeviceMesh(
  880. device_type, mesh, mesh_dim_names=mesh_dim_names, _init_backend=False
  881. )
  882. device_mesh._dim_group_names = [group.group_name for group in groups]
  883. return device_mesh
  884. def size(self, mesh_dim: int | None = None) -> int:
  885. if mesh_dim is not None:
  886. return self._layout[mesh_dim].numel()
  887. return self._layout.numel()
  888. @property
  889. def ndim(self) -> int:
  890. return len(self._layout)
  891. @property
  892. def shape(self) -> tuple[int, ...]:
  893. return self._layout.top_level_sizes
  894. def get_rank(self) -> int:
  895. """
  896. Returns the current global rank.
  897. """
  898. return get_rank()
  899. def get_local_rank(self, mesh_dim: int | str | None = None) -> int:
  900. """
  901. Returns the local rank of the given mesh_dim of the DeviceMesh.
  902. Args:
  903. mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
  904. of the mesh dimension. Default is None.
  905. Returns:
  906. An integer denotes the local rank.
  907. The following program runs on each process/rank in an SPMD manner. In this example, we have 2
  908. hosts with 4 GPUs each.
  909. Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 0, 1, 2, 3 would return 0.
  910. Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 4, 5, 6, 7 would return 1.
  911. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 0, 4 would return 0.
  912. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 1, 5 would return 1.
  913. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 2, 6 would return 2.
  914. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 3, 7 would return 3.
  915. Example::
  916. >>> # xdoctest: +SKIP("no rank")
  917. >>> from torch.distributed.device_mesh import DeviceMesh
  918. >>>
  919. >>> # Initialize device mesh as (2, 4) to represent the topology
  920. >>> # of cross-host(dim 0), and within-host (dim 1).
  921. >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
  922. """
  923. if self.ndim > 1 and mesh_dim is None:
  924. raise RuntimeError(
  925. f"Found the DeviceMesh have {len(self._layout)} dimensions",
  926. "Optional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.",
  927. )
  928. elif mesh_dim is None:
  929. mesh_dim = 0
  930. mesh_dim_group = not_none(self.get_group(mesh_dim))
  931. if not isinstance(mesh_dim_group, ProcessGroup):
  932. raise AssertionError(
  933. "We expect ProcessGroup before calling `get_rank`!"
  934. )
  935. return not_none(get_rank(mesh_dim_group))
  936. def get_coordinate(self) -> list[int] | None:
  937. """
  938. Return the relative indices of this rank relative to all
  939. dimensions of the mesh. If this rank is not part of the mesh, return None.
  940. """
  941. return self._coordinate_on_dim if self._coordinate_on_dim else None
  942. def _flatten(
  943. self,
  944. mesh_dim_name: str | None = None,
  945. backend_override: None
  946. | str
  947. | C10dBackend.Options
  948. | tuple[str, C10dBackend.Options] = None,
  949. ) -> "DeviceMesh":
  950. """
  951. Returns a 1D DeviceMesh by flattening the current DeviceMesh.
  952. If no mesh_dim_name is provided, the default is a string concatenating the mesh_dim_names of the
  953. given submesh with each mesh_dim_name separated by "_". For example, if we have a 3D mesh
  954. DeviceMesh([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], mesh_dim_names=("dp", "cp", "tp")), calling
  955. mesh_3d["dp", "cp"]._flatten() will create a 1D submesh DeviceMesh([0, 2, 4, 6], mesh_dim_names=("dp_cp",))
  956. on rank 0, 2, 4, 6 and a 1D submesh DeviceMesh([1, 3, 5, 7], mesh_dim_names=("dp_cp",)) on rank 1, 3, 5, 7.
  957. After the flattened dimension is created, to access the flattened dimension in mesh_3d, one can use the
  958. existing slicing method to obtain the flattened mesh through calling mesh_3d["dp_cp"].
  959. """
  960. if not self._mesh_dim_names:
  961. raise RuntimeError(
  962. "Cannot flatten a DeviceMesh without mesh_dim_names!"
  963. )
  964. if backend_override is not None:
  965. (backend_override_tuple,) = _normalize_backend_override(
  966. {0: backend_override}, 1
  967. )
  968. else:
  969. backend_override_tuple = (None, None)
  970. return self._create_flatten_mesh(mesh_dim_name, backend_override_tuple)
  971. def _create_unflatten_mesh(
  972. self,
  973. dim: int,
  974. mesh_sizes: tuple[int, ...],
  975. mesh_dim_names: tuple[str, ...],
  976. backend_override: tuple[
  977. tuple[str | None, C10dBackend.Options | None], ...
  978. ] = ((None, None),),
  979. ) -> "DeviceMesh":
  980. inner_layout = _MeshLayout(tuple(mesh_sizes), suffix_product(mesh_sizes))
  981. if inner_layout.numel() != self._layout[dim].numel():
  982. raise ValueError(
  983. f"The product of {mesh_sizes=} is {inner_layout.numel()}, "
  984. f"but the original dimension at dim={dim} has size {self._layout[dim].numel()}. "
  985. f"These must be equal for unflatten to work correctly."
  986. )
  987. partial_layout = self._layout[dim].composition(inner_layout)
  988. unflattened_layout = self._layout.splice(dim, dim + 1, partial_layout)
  989. unflattened_mesh_dim_names = list(not_none(self.mesh_dim_names))
  990. unflattened_mesh_dim_names[dim : dim + 1] = list(mesh_dim_names)
  991. root_mesh = self._get_root_mesh()
  992. res_mesh = DeviceMesh(
  993. self.device_type,
  994. _layout=unflattened_layout,
  995. _rank_map=root_mesh._rank_map,
  996. mesh_dim_names=tuple(unflattened_mesh_dim_names),
  997. _root_mesh=root_mesh,
  998. _init_backend=False,
  999. )
  1000. # If original mesh has initiated its backend, we need to initialize the backend
  1001. # of unflatten dims as well.
  1002. # TODO: To make backend init more efficient with cute layout representation and support
  1003. # per dim backend init.
  1004. if hasattr(self, "_dim_group_names"):
  1005. dim_group_names = self._dim_group_names.copy()
  1006. dim_group_names[dim : dim + 1] = self._init_process_groups(
  1007. partial_layout,
  1008. root_mesh._rank_map,
  1009. mesh_dim_names,
  1010. backend_override,
  1011. )
  1012. res_mesh._dim_group_names = dim_group_names
  1013. return res_mesh
  1014. def _unflatten(
  1015. self,
  1016. dim: int | str,
  1017. mesh_sizes: tuple[int, ...],
  1018. mesh_dim_names: tuple[str, ...],
  1019. backend_override: dict[
  1020. str, str | C10dBackend.Options | tuple[str, C10dBackend.Options]
  1021. ]
  1022. | None = None,
  1023. ) -> "DeviceMesh":
  1024. """
  1025. Returns a DeviceMesh by unflatten the current DeviceMesh.
  1026. This api can be used to unflatten a N-D DeviceMesh into N-1+len(mesh_sizes)-D meshes or submeshes.
  1027. The dim is the dimension to be unflattened which can be either a string or an integer.
  1028. The mesh_sizes is a tuple which specifies the shape of the mesh unflatten into for the given dim.
  1029. The mesh_dim_names is a list of strings which specifies the names of the dimensions of the mesh unflatten into.
  1030. Its length must match the length of mesh_sizes.
  1031. For example, if we have a 1D mesh DeviceMesh([0, 1, 2, 3, 4, 5, 6, 7], mesh_dim_names=("world")),
  1032. calling mesh_1d._unflatten(0, (2, 2, 4), ["dp", "pp", "tp"]) will create a 3D mesh
  1033. DeviceMesh([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], mesh_dim_names=("dp", "cp", "tp")).
  1034. Note that after calling the unflatten, there is no access to the unflattened dimension in mesh_1d, one can only
  1035. use the newly unflattened mesh to slice out the unflattened mesh dims.
  1036. """
  1037. if isinstance(dim, int) and dim >= self.ndim:
  1038. raise ValueError(
  1039. f"dim {dim} specified in `_unflatten` is out of range {self.ndim}"
  1040. )
  1041. elif isinstance(dim, str) and dim in not_none(self.mesh_dim_names):
  1042. raise ValueError(
  1043. f"dim {dim} specified in `_unflatten` is not in {self.mesh_dim_names}"
  1044. )
  1045. if len(mesh_sizes) != len(mesh_dim_names):
  1046. raise RuntimeError(
  1047. "mesh_dim_names must have same length as mesh_sizes in _unflatten!"
  1048. )
  1049. if isinstance(dim, str):
  1050. dim = not_none(self.mesh_dim_names).index(dim)
  1051. if backend_override is not None:
  1052. backend_override_tuple = tuple(
  1053. _normalize_backend_override(
  1054. backend_override, # type: ignore[arg-type]
  1055. len(mesh_sizes),
  1056. mesh_dim_names,
  1057. )
  1058. )
  1059. else:
  1060. backend_override_tuple = ((None, None),) * len(mesh_dim_names)
  1061. return self._create_unflatten_mesh(
  1062. dim,
  1063. mesh_sizes,
  1064. mesh_dim_names,
  1065. backend_override_tuple,
  1066. )
  1067. @staticmethod
  1068. def _concatenate(device_mesh_list: list["DeviceMesh"]) -> "DeviceMesh":
  1069. concat_dim_names: list[str] = []
  1070. concat_sizes: list[IntTuple] = []
  1071. concat_strides: list[IntTuple] = []
  1072. concat_dim_group_name: list[GroupName] = []
  1073. flatten_rank_map = device_mesh_list[0]._flatten_rank_map
  1074. for dm in device_mesh_list:
  1075. for i in range(len(dm._layout)):
  1076. concat_sizes.append(dm._layout[i].sizes)
  1077. concat_strides.append(dm._layout[i].strides)
  1078. concat_dim_names.extend(not_none(dm.mesh_dim_names))
  1079. concat_dim_group_name.extend(not_none(dm._dim_group_names))
  1080. # Concatenate device mesh having different root mesh tensors are meaningless
  1081. # because the concatenated indices should be indexed by the same root mesh tensor.
  1082. if dm._flatten_rank_map != flatten_rank_map:
  1083. raise RuntimeError(
  1084. "Cannot concatenate DeviceMeshes derived from different device meshs"
  1085. )
  1086. concat_mesh_layout = _MeshLayout(tuple(concat_sizes), tuple(concat_strides))
  1087. if not concat_mesh_layout.check_non_overlap():
  1088. raise RuntimeError(
  1089. f"Cannot concatenate overlapping meshes: {device_mesh_list}"
  1090. )
  1091. res_mesh = DeviceMesh(
  1092. device_mesh_list[0].device_type,
  1093. _layout=concat_mesh_layout,
  1094. _rank_map=device_mesh_list[0]._rank_map,
  1095. mesh_dim_names=tuple(concat_dim_names),
  1096. _root_mesh=device_mesh_list[0]._get_root_mesh(),
  1097. _init_backend=False,
  1098. )
  1099. res_mesh._dim_group_names = concat_dim_group_name
  1100. return res_mesh
  1101. def _normalize_backend_override(
  1102. backend_override: dict[
  1103. int | str,
  1104. str | C10dBackend.Options | tuple[str, C10dBackend.Options],
  1105. ],
  1106. ndim: int,
  1107. mesh_dim_names: tuple[str, ...] | None = None,
  1108. ) -> Iterator[BackendConfig]:
  1109. if mesh_dim_names is None:
  1110. mesh_dim_names = ()
  1111. for dim_idx, dim_name in zip_longest(range(ndim), mesh_dim_names):
  1112. if dim_name is not None and dim_name in backend_override:
  1113. if dim_idx in backend_override:
  1114. raise RuntimeError(
  1115. f"Found redundant dim index {dim_idx} and "
  1116. f"name {dim_name} in backend_override"
  1117. )
  1118. val = backend_override.pop(dim_name)
  1119. elif dim_idx in backend_override:
  1120. val = backend_override.pop(dim_idx)
  1121. else:
  1122. yield (None, None)
  1123. continue
  1124. if isinstance(val, str):
  1125. yield (val, None)
  1126. elif isinstance(val, C10dBackend.Options):
  1127. yield (None, val)
  1128. else:
  1129. yield val
  1130. if backend_override:
  1131. raise RuntimeError(
  1132. f"Found invalid keys in backend_override: got {list(backend_override.keys())}, "
  1133. f"expected integers in range [0, {ndim}) or one of {mesh_dim_names}"
  1134. )
  1135. def init_device_mesh(
  1136. device_type: str,
  1137. mesh_shape: tuple[int, ...],
  1138. *,
  1139. mesh_dim_names: tuple[str, ...] | None = None,
  1140. backend_override: dict[
  1141. int | str, str | C10dBackend.Options | tuple[str, C10dBackend.Options]
  1142. ]
  1143. | None = None,
  1144. ) -> DeviceMesh:
  1145. """
  1146. Initializes a `DeviceMesh` based on `device_type`, `mesh_shape`, and `mesh_dim_names` parameters.
  1147. This creates a DeviceMesh with an n-dimensional array layout, where `n` is the length of `mesh_shape`.
  1148. If `mesh_dim_names` is provided, each dimension is labeled as `mesh_dim_names[i]`.
  1149. .. note::
  1150. `init_device_mesh` follows SPMD programming model, meaning the same PyTorch Python program
  1151. runs on all processes/ranks in the cluster. Ensure `mesh_shape` (the dimensions of the nD array
  1152. describing device layout) is identical across all ranks. Inconsistent `mesh_shape` may lead to hanging.
  1153. .. note::
  1154. If no process group is found, init_device_mesh will initialize distributed process group/groups
  1155. required for distributed communications behind the scene.
  1156. Args:
  1157. device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like", "xpu".
  1158. Passing in a device type with a GPU index, such as "cuda:0", is not allowed.
  1159. mesh_shape (Tuple[int]): A tuple defining the dimensions of the multi-dimensional array
  1160. describing the layout of devices.
  1161. mesh_dim_names (tuple[str, ...], optional): A tuple of mesh dimension names to assign to each dimension
  1162. of the multi-dimensional array describing the layout of devices. Its length must match the length
  1163. of `mesh_shape`. Each string in `mesh_dim_names` must be unique.
  1164. backend_override (Dict[int | str, tuple[str, Options] | str | Options], optional): Overrides for some or all of
  1165. the ProcessGroups that will be created for each mesh dimension. Each key can be either the index of a
  1166. dimension or its name (if mesh_dim_names is provided). Each value can be a tuple containing the name
  1167. of the backend and its options, or just one of these two components (in which case the other will be
  1168. set to its default value).
  1169. Returns:
  1170. DeviceMesh: A :class:`DeviceMesh` object representing the device layout.
  1171. Example::
  1172. >>> # xdoctest: +SKIP("no rank")
  1173. >>> from torch.distributed.device_mesh import init_device_mesh
  1174. >>>
  1175. >>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,))
  1176. >>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp"))
  1177. """
  1178. if mesh_dim_names is not None:
  1179. if len(set(mesh_dim_names)) != len(mesh_dim_names):
  1180. raise RuntimeError(
  1181. "Each mesh_dim_name must be unique.",
  1182. f"Found repeated mesh_dim_name in mesh_dim_names {mesh_dim_names}",
  1183. )
  1184. if len(mesh_shape) != len(mesh_dim_names):
  1185. raise RuntimeError(
  1186. "mesh_shape and mesh_dim_names should have same length!",
  1187. f"Found len(mesh_dim_names): {len(mesh_dim_names)} and len(mesh_shape):{len(mesh_shape)}.",
  1188. )
  1189. if backend_override is not None:
  1190. backend_override_tuple = tuple(
  1191. _normalize_backend_override(
  1192. backend_override, len(mesh_shape), mesh_dim_names
  1193. )
  1194. )
  1195. else:
  1196. backend_override_tuple = None
  1197. # assume valid device types are all letters
  1198. if device_type and not device_type.isalpha():
  1199. raise RuntimeError(
  1200. f"Device type with index is not supported but got {device_type}. ",
  1201. "If you maintained a 'torch.device' object, it's recommended to pass in 'device.type'.",
  1202. )
  1203. layout = _MeshLayout(tuple(mesh_shape), suffix_product(tuple(mesh_shape)))
  1204. # Always initialize the (identity) rank map on CPU, regardless of what the
  1205. # external device type has been set to be (e.g. meta)
  1206. with torch.device("cpu"):
  1207. rank_map = torch.arange(layout.numel(), dtype=torch.int)
  1208. device_mesh = DeviceMesh(
  1209. device_type=device_type,
  1210. _layout=layout,
  1211. _rank_map=rank_map,
  1212. mesh_dim_names=mesh_dim_names,
  1213. backend_override=backend_override_tuple,
  1214. )
  1215. return device_mesh