api.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. # mypy: allow-untyped-defs
  2. from contextlib import contextmanager
  3. from typing import Optional
  4. import torch
  5. import torch.distributed as dist
  6. import torch.nn as nn
  7. from torch.distributed import distributed_c10d
  8. from torch.distributed._shard.sharded_tensor import ShardedTensor
  9. from .sharder import Sharder
  10. from .sharding_plan import ShardingPlan
  11. from .sharding_spec import ChunkShardingSpec, ShardingSpec
  12. def _shard_tensor(
  13. tensor: torch.Tensor, sharding_spec: ShardingSpec, src_rank=0, process_group=None
  14. ) -> ShardedTensor:
  15. """
  16. Given a :class:`torch.Tensor`, it shards that tensor according to the provided
  17. ``sharding_spec``. ``src_rank`` denotes the source rank which would be
  18. used as the ground truth of the data which would be scattered as shards
  19. across the rest of the ranks.
  20. Args:
  21. tensor (:class:`torch.Tensor`): Tensor needs to be sharded.
  22. sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
  23. describing how to shard the Tensor.
  24. Keyword args:
  25. src_rank (int, optional): The source rank which is used as the ground truth of
  26. the data for the parameter that would be sharded and scattered
  27. across the rest of the ranks.
  28. Default: 0.
  29. process_group (ProcessGroup, optional): The process group to work on. If None,
  30. the default process group will be used.
  31. Returns:
  32. A :class:`ShardedTensor` sharded from the given tensor.
  33. .. warning::
  34. Only :class:`torch.distributed._shard.sharding_spec.ChunkShardingSpec` is
  35. currently supported as the ``sharding_spec``.
  36. """
  37. if not tensor.is_contiguous():
  38. raise ValueError("input tensor is not a contiguous Tensor")
  39. pg = (
  40. process_group
  41. if process_group is not None
  42. else distributed_c10d._get_default_group()
  43. )
  44. world_size = dist.get_world_size(pg)
  45. current_rank = dist.get_rank(pg)
  46. # Validate src_rank and sharding_spec are same across all ranks.
  47. gathered_list = [None] * world_size
  48. dist.all_gather_object(gathered_list, (src_rank, sharding_spec), group=pg)
  49. for idx, entry in enumerate(gathered_list):
  50. if src_rank != entry[0]: # type: ignore[index]
  51. raise ValueError(
  52. f"src_rank={src_rank} on rank: {current_rank} does not " # type: ignore[index]
  53. f"match with src_rank={entry[0]} on rank: {idx}" # type: ignore[index]
  54. )
  55. if sharding_spec != entry[1]: # type: ignore[index]
  56. raise ValueError(
  57. f"sharding_spec={sharding_spec} on rank: {current_rank} does not " # type: ignore[index]
  58. f"match with sharding_spec={entry[1]} on rank: {idx}" # type: ignore[index]
  59. )
  60. st = sharding_spec.shard(tensor, src_rank=src_rank, process_group=pg)
  61. return st
  62. def shard_parameter(
  63. module: torch.nn.Module,
  64. param_name: str,
  65. sharding_spec: ShardingSpec,
  66. src_rank=0,
  67. process_group=None,
  68. ):
  69. """
  70. Given a :class:`torch.nn.Module`, a ``param_name`` for a parameter in that
  71. module, it shards that parameter according to the provided
  72. ``sharding_spec``. ``src_rank`` denotes the source rank which would be
  73. used as the ground truth of the data which would be scattered as shards
  74. across the rest of the ranks.
  75. This method replaces ``module.param_name`` with a
  76. :class:`torch.distributed._sharded_tensor.ShardedTensor`
  77. Args:
  78. module (:class:`torch.nn.Module`): Module whose parameter needs to be sharded.
  79. param_name (str): Name of the parameter of ``module`` that needs to be sharded.
  80. sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
  81. describing how to shard the Tensor.
  82. Keyword args:
  83. src_rank (int, optional): The source rank which is used as the ground truth of
  84. the data for the parameter that would be sharded and scattered
  85. across the rest of the ranks.
  86. Default: 0.
  87. process_group (ProcessGroup, optional): The process group to work on. If None,
  88. the default process group will be used.
  89. .. warning::
  90. Only :class:`torch.distributed._shard.sharding_spec.ChunkShardingSpec` is
  91. currently supported as the ``sharding_spec``.
  92. """
  93. # Perform some validation first.
  94. if not hasattr(module, param_name):
  95. raise AttributeError(f"{module._get_name()} has no attribute `{param_name}`")
  96. tensor = getattr(module, param_name)
  97. if not isinstance(tensor, torch.Tensor):
  98. raise ValueError(
  99. f"Expected {type(module).__name__}.{param_name} to be a Tensor, but found {type(tensor).__name__}"
  100. )
  101. if not tensor.is_contiguous():
  102. raise ValueError(f"param: {param_name} is not a contiguous Tensor")
  103. st = _shard_tensor(tensor, sharding_spec, src_rank, process_group)
  104. # Replace param with ShardedTensor.
  105. module.register_parameter(param_name, nn.Parameter(st))
# Process group installed by the ``load_with_process_group`` context manager.
# It is ``None`` whenever no such context is active, in which case
# ``_get_current_process_group`` falls back to the default process group.
_CURRENT_PROCESS_GROUP: Optional[dist.ProcessGroup] = None
  108. @contextmanager
  109. def load_with_process_group(process_group):
  110. """
  111. Context manager to set the process group with which to load a ShardedTensor.
  112. """
  113. global _CURRENT_PROCESS_GROUP
  114. if _CURRENT_PROCESS_GROUP is not None:
  115. raise RuntimeError(
  116. 'ProcessGroup already set by previous "load_with_process_group" '
  117. "context manager"
  118. )
  119. _CURRENT_PROCESS_GROUP = process_group
  120. try:
  121. yield process_group
  122. finally:
  123. _CURRENT_PROCESS_GROUP = None
  124. def _get_current_process_group():
  125. """
  126. Retrieves the current process group set by ``load_with_process_group``.
  127. If not set, it just returns the default group.
  128. """
  129. global _CURRENT_PROCESS_GROUP
  130. if _CURRENT_PROCESS_GROUP is None:
  131. return distributed_c10d._get_default_group()
  132. else:
  133. return _CURRENT_PROCESS_GROUP
  134. def _reshard_output(
  135. module: torch.nn.Module, resharding_spec: ShardingSpec
  136. ) -> torch.nn.Module:
  137. """
  138. Hook a module with output resharding in the forward pass according
  139. to the given ``resharding_spec``.
  140. Args:
  141. module (:class:`torch.nn.Module`): Module whose output needs to be resharded.
  142. resharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`):
  143. The specification describing how the output of the module will be resharded.
  144. Returns:
  145. A :class:`torch.nn.Module` object with reshard API hooked.
  146. """
  147. def hook_func(_module, _input, output):
  148. if isinstance(output, ShardedTensor):
  149. return output.reshard(resharding_spec)
  150. return output
  151. module.register_forward_hook(hook_func)
  152. return module
  153. def _collect_local_shard(module: torch.nn.Module) -> torch.nn.Module:
  154. """
  155. Hook a module with local shards collection in the forward pass.
  156. This API is typically used to convert a sharded representation back to data parallel
  157. representation. In particular, it returns the local tensor for this Shard. If the
  158. size along the sharding dimension for the local tensor is 1, this dimension is removed
  159. from the final result. For example a [4, 16] ShardedTensor across 4 ranks is typically
  160. a local Tensor of size [16] across each rank and not [1, 16] across each rank.
  161. Args:
  162. module (:class:`torch.nn.Module`): Module whose output is ShardedTensor and the
  163. local tensor value needs to be returned.
  164. Returns:
  165. A :class:`torch.nn.Module` object with collection API hooked.
  166. """
  167. def hook_func(_module, _input, output):
  168. if isinstance(output, ShardedTensor):
  169. local_tensor = output.local_tensor()
  170. # Squeeze the # of dimensions manually, only applicable to ChunkShardingSpec
  171. sharding_spec = output._sharding_spec
  172. if (
  173. isinstance(sharding_spec, ChunkShardingSpec)
  174. and local_tensor.size(sharding_spec.dim) == 1 # type: ignore[attr-defined, arg-type]
  175. ):
  176. local_tensor = local_tensor.squeeze(
  177. output._sharding_spec.dim # type: ignore[attr-defined]
  178. )
  179. return local_tensor
  180. module.register_forward_hook(hook_func)
  181. return module
  182. def shard_module(module: nn.Module, plan: ShardingPlan, src_rank=0, process_group=None):
  183. """
  184. Shards a given module according to the provided sharding `plan`. This method
  185. first shards all the parameters according to the given sharding `plan`. Then if
  186. `output_plan` and `return_local_tensor` are specified in the sharding `plan`, it
  187. will tag the output of modules according `output_plan`, convert the module's
  188. output back to data parallel according to `return_local_tensor`.
  189. Needs to be called on all ranks in an SPMD fashion.
  190. Args:
  191. module (:class:`torch.nn.Module`): The module to apply sharding to
  192. plan (:class:`torch.distributed._shard.sharding_plan.ShardingPlan`):
  193. The ShardingPlan which specified param name to ShardingSpec to apply to
  194. each parameter.
  195. Keyword args:
  196. src_rank (int, optional): The source rank which is used as the ground truth of
  197. the data for the module that would be sharded and scattered across the rest
  198. of the ranks.
  199. Default: 0.
  200. process_group (ProcessGroup, optional): The process group to work on. If None,
  201. the default process group will be used.
  202. """
  203. # record Sharder paths for sanity check on the plan to ensure items in the plan
  204. # does not conflict with the submodule tree that the Sharder is working with
  205. sharder_paths = []
  206. for name, spec in plan.plan.items():
  207. if isinstance(spec, Sharder):
  208. sharder_paths.append(name)
  209. # shard the parameter according to the ShardingPlan
  210. for name, spec in plan.plan.items():
  211. if isinstance(spec, ShardingSpec):
  212. # if found a sharding spec, try to shard the parameter
  213. module_path, _, param_name = name.rpartition(".")
  214. for sharder_path in sharder_paths:
  215. if module_path.startswith(sharder_path):
  216. raise RuntimeError(
  217. f"ShardingPlan is in-valid, trying to shard a parameter: {name},"
  218. f" but there's already a Sharder entry for module {sharder_path},"
  219. f" parameter sharding should not conflict with the submodule tree"
  220. f" that a Sharder is working with!"
  221. )
  222. mod = module.get_submodule(module_path)
  223. shard_parameter(
  224. mod, param_name, spec, src_rank=src_rank, process_group=process_group
  225. )
  226. elif isinstance(spec, Sharder):
  227. parent_mod_path, _, _mod_name = name.rpartition(".")
  228. if name == "":
  229. raise KeyError("Module path must not be empty for custom sharder!")
  230. mod = module.get_submodule(name)
  231. parent_mod = module.get_submodule(parent_mod_path)
  232. sharded_mod = spec.shard(mod)
  233. # swap this submodule with the sharded module
  234. parent_mod.mod_name = sharded_mod
  235. else:
  236. raise TypeError(
  237. f"Only `ShardingSpec` and `Sharder` are supported to shard '{name}'"
  238. )
  239. # reshard output if there's an entry in `reshard_output` for this module
  240. if plan.output_plan is not None:
  241. for module_path, output_spec in plan.output_plan.items():
  242. if isinstance(output_spec, ShardingSpec):
  243. mod = module.get_submodule(module_path)
  244. _reshard_output(mod, output_spec)
  245. else:
  246. raise TypeError(
  247. f"Only `ShardingSpec` is supported as output_plan for '{module_path}'"
  248. )
  249. # convert the output back to data parallel for the modules appears in
  250. # `return_local_tensor` of the plan, we will call `_collect_local_shard`
  251. # to collect the local tensor for output of modules
  252. if plan.return_local_tensor is not None:
  253. for module_path in plan.return_local_tensor:
  254. mod = module.get_submodule(module_path)
  255. _collect_local_shard(mod)