distributed.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import torch
  15. import torch.distributed as dist
  16. from megatron_util import mpu
  17. from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
  18. from torch.autograd import Variable
  19. from torch.nn.modules import Module
  20. from torch.nn.parallel.distributed import DistributedDataParallel as DDP
  class PyTorchDistributedDataParallel(DDP):
      """``torch`` DistributedDataParallel whose parameter and state-dict
      accessors delegate straight to the wrapped ``self.module``.

      Because every accessor is forwarded to ``self.module``, parameter names
      and state-dict keys are relative to the wrapped module (plus whatever
      ``prefix`` the caller supplies) rather than to this wrapper.
      """

      def named_parameters(self, prefix: str = '', recurse: bool = True):
          """Return the wrapped module's ``(name, parameter)`` iterator.

          Args:
              prefix: String prepended to every parameter name.
              recurse: Forwarded unchanged to the wrapped module.
          """
          return self.module.named_parameters(prefix=prefix, recurse=recurse)

      def state_dict(self, destination=None, prefix='', keep_vars=False):
          """Return the wrapped module's state dict.

          The arguments are forwarded by keyword: recent PyTorch releases
          deprecate passing them to ``Module.state_dict`` positionally and
          emit a warning on every call when that is done.
          """
          return self.module.state_dict(
              destination=destination, prefix=prefix, keep_vars=keep_vars)

      def load_state_dict(self, state_dict, strict=True):
          """Load ``state_dict`` into the wrapped module and return its result."""
          return self.module.load_state_dict(state_dict, strict=strict)
  class DistributedDataParallel(Module):
      """Data-parallel wrapper that all-reduces gradients on explicit request.

      On construction the wrapped module's parameters are broadcast across the
      data-parallel group obtained from ``mpu``. No autograd hooks are
      registered, so gradients are only synchronised when the caller invokes
      :meth:`allreduce_params` after the backward pass.
      """

      def __init__(self, module):
          """Wrap ``module`` and broadcast its parameters over the group.

          Args:
              module: The ``torch.nn.Module`` to run data-parallel.
          """
          super(DistributedDataParallel, self).__init__()
          # Use the public backend query; the legacy ``dist._backend`` /
          # ``dist.dist_backend`` aliases do not track the backend that was
          # actually initialised.
          self.warn_on_half = dist.get_backend() == dist.Backend.GLOO
          self.module = module
          self.data_parallel_group = mpu.get_data_parallel_group()
          # Raised by forward(), cleared by allreduce_params(). Initialised
          # here so that reducing before the first forward pass is a no-op
          # instead of an AttributeError.
          self.needs_reduction = False
          # NOTE(review): presumably the model-parallel rank equals the global
          # rank of the first member of this data-parallel group under mpu's
          # rank layout -- confirm against megatron_util.
          src_rank = mpu.get_model_parallel_rank()
          for p in self.module.parameters():
              if torch.is_tensor(p):
                  dist.broadcast(p, src_rank, group=self.data_parallel_group)
          # Kept for compatibility; no hooks are registered by this class.
          self.hook_handles = []
          self.hooks = []

      def allreduce_params(self, reduce_after=True, no_scale=False,
                           fp32_allreduce=False):
          """All-reduce the gradients of the wrapped module in place.

          Does nothing unless :meth:`forward` has run since the last call.
          Gradients are grouped by tensor type, flattened into one buffer per
          type, all-reduced over the data-parallel group and copied back.

          Args:
              reduce_after: If true, divide by the group's world size after
                  the all-reduce; otherwise divide before it.
              no_scale: If true, skip the division by world size entirely.
              fp32_allreduce: If true, cast each flattened buffer to fp32
                  before the all-reduce.
          """
          if not self.needs_reduction:
              return
          self.needs_reduction = False

          # Group parameters that carry a gradient by their tensor type string.
          buckets = {}
          for _, param in self.module.named_parameters():
              if param.requires_grad and param.grad is not None:
                  buckets.setdefault(param.data.type(), []).append(param)

          if self.warn_on_half:
              # Bucket keys are the strings returned by ``Tensor.type()``, so
              # the membership test must use the string, not the tensor class.
              if 'torch.cuda.HalfTensor' in buckets:
                  print(
                      'WARNING: gloo dist backend for half parameters may be extremely slow. It is recommended to use the NCCL backend in this case.'  # noqa
                  )
                  self.warn_on_half = False

          for bucket in buckets.values():
              grads = [param.grad.data for param in bucket]
              coalesced = _flatten_dense_tensors(grads)
              if fp32_allreduce:
                  coalesced = coalesced.float()
              if not no_scale and not reduce_after:
                  coalesced /= dist.get_world_size(
                      group=self.data_parallel_group)
              dist.all_reduce(coalesced, group=self.data_parallel_group)
              # Only CUDA work needs a device sync; calling it for CPU tensors
              # fails on hosts without CUDA.
              if coalesced.is_cuda:
                  torch.cuda.synchronize()
              if not no_scale and reduce_after:
                  coalesced /= dist.get_world_size(
                      group=self.data_parallel_group)
              for buf, synced in zip(
                      grads, _unflatten_dense_tensors(coalesced, grads)):
                  buf.copy_(synced)

      def forward(self, *inputs, **kwargs):
          """Run the wrapped module and mark its gradients as needing reduction."""
          self.needs_reduction = True
          return self.module(*inputs, **kwargs)

      def state_dict(self, destination=None, prefix='', keep_vars=False):
          """Return the wrapped module's state dict.

          Arguments are forwarded by keyword because recent PyTorch releases
          deprecate passing them to ``Module.state_dict`` positionally.
          """
          return self.module.state_dict(
              destination=destination, prefix=prefix, keep_vars=keep_vars)

      def load_state_dict(self, state_dict, strict=True):
          """Load ``state_dict`` into the wrapped module and return its result."""
          return self.module.load_state_dict(state_dict, strict=strict)

      def named_parameters(self, prefix: str = '', recurse: bool = True):
          """Return the wrapped module's ``(name, parameter)`` iterator."""
          return self.module.named_parameters(prefix=prefix, recurse=recurse)

      # Disabled legacy code, kept for reference only (it was previously held
      # in a bare string literal and has never been executed):
      #
      # def _sync_buffers(self):
      #     buffers = list(self.module._all_buffers())
      #     if len(buffers) > 0:
      #         # cross-node buffer sync
      #         flat_buffers = _flatten_dense_tensors(buffers)
      #         dist.broadcast(flat_buffers, 0)
      #         for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
      #             buf.copy_(synced)
      # def train(self, mode=True):
      #     # Clear NCCL communicator and CUDA event cache of the default group ID,
      #     # These cache will be recreated at the later call. This is currently a
      #     # work-around for a potential NCCL deadlock.
      #     if dist._backend == dist.dist_backend.NCCL:
      #         dist._clear_group_cache()
      #     super(DistributedDataParallel, self).train(mode)
      #     self.module.train(mode)