default.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. #!/usr/bin/env python
  2. # Copyright 2021 The HuggingFace Team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. from pathlib import Path
  16. import torch
  17. from ...utils import (
  18. is_hpu_available,
  19. is_mlu_available,
  20. is_musa_available,
  21. is_npu_available,
  22. is_sdaa_available,
  23. is_xpu_available,
  24. )
  25. from .config_args import ClusterConfig, default_json_config_file
  26. from .config_utils import SubcommandHelpFormatter
# One-line summary of the `default` subcommand; passed as `help=` when the subcommand parser is registered.
description = "Create a default config file for Accelerate with only a few flags set."
  28. def write_basic_config(mixed_precision="no", save_location: str = default_json_config_file):
  29. """
  30. Creates and saves a basic cluster config to be used on a local machine with potentially multiple GPUs. Will also
  31. set CPU if it is a CPU-only machine.
  32. Args:
  33. mixed_precision (`str`, *optional*, defaults to "no"):
  34. Mixed Precision to use. Should be one of "no", "fp16", or "bf16"
  35. save_location (`str`, *optional*, defaults to `default_json_config_file`):
  36. Optional custom save location. Should be passed to `--config_file` when using `accelerate launch`. Default
  37. location is inside the huggingface cache folder (`~/.cache/huggingface`) but can be overridden by setting
  38. the `HF_HOME` environmental variable, followed by `accelerate/default_config.yaml`.
  39. """
  40. path = Path(save_location)
  41. path.parent.mkdir(parents=True, exist_ok=True)
  42. if path.exists():
  43. print(
  44. f"Configuration already exists at {save_location}, will not override. Run `accelerate config` manually or pass a different `save_location`."
  45. )
  46. return False
  47. mixed_precision = mixed_precision.lower()
  48. if mixed_precision not in ["no", "fp16", "bf16", "fp8"]:
  49. raise ValueError(
  50. f"`mixed_precision` should be one of 'no', 'fp16', 'bf16', or 'fp8'. Received {mixed_precision}"
  51. )
  52. config = {
  53. "compute_environment": "LOCAL_MACHINE",
  54. "mixed_precision": mixed_precision,
  55. }
  56. if is_mlu_available():
  57. num_mlus = torch.mlu.device_count()
  58. config["num_processes"] = num_mlus
  59. config["use_cpu"] = False
  60. if num_mlus > 1:
  61. config["distributed_type"] = "MULTI_MLU"
  62. else:
  63. config["distributed_type"] = "NO"
  64. if is_sdaa_available():
  65. num_sdaas = torch.sdaa.device_count()
  66. config["num_processes"] = num_sdaas
  67. config["use_cpu"] = False
  68. if num_sdaas > 1:
  69. config["distributed_type"] = "MULTI_SDAA"
  70. else:
  71. config["distributed_type"] = "NO"
  72. elif is_musa_available():
  73. num_musas = torch.musa.device_count()
  74. config["num_processes"] = num_musas
  75. config["use_cpu"] = False
  76. if num_musas > 1:
  77. config["distributed_type"] = "MULTI_MUSA"
  78. else:
  79. config["distributed_type"] = "NO"
  80. elif is_hpu_available():
  81. num_hpus = torch.hpu.device_count()
  82. config["num_processes"] = num_hpus
  83. config["use_cpu"] = False
  84. if num_hpus > 1:
  85. config["distributed_type"] = "MULTI_HPU"
  86. else:
  87. config["distributed_type"] = "NO"
  88. elif torch.cuda.is_available():
  89. num_gpus = torch.cuda.device_count()
  90. config["num_processes"] = num_gpus
  91. config["use_cpu"] = False
  92. if num_gpus > 1:
  93. config["distributed_type"] = "MULTI_GPU"
  94. else:
  95. config["distributed_type"] = "NO"
  96. elif is_xpu_available():
  97. num_xpus = torch.xpu.device_count()
  98. config["num_processes"] = num_xpus
  99. config["use_cpu"] = False
  100. if num_xpus > 1:
  101. config["distributed_type"] = "MULTI_XPU"
  102. else:
  103. config["distributed_type"] = "NO"
  104. elif is_npu_available():
  105. num_npus = torch.npu.device_count()
  106. config["num_processes"] = num_npus
  107. config["use_cpu"] = False
  108. if num_npus > 1:
  109. config["distributed_type"] = "MULTI_NPU"
  110. else:
  111. config["distributed_type"] = "NO"
  112. else:
  113. num_xpus = 0
  114. config["use_cpu"] = True
  115. config["num_processes"] = 1
  116. config["distributed_type"] = "NO"
  117. config["debug"] = False
  118. config["enable_cpu_affinity"] = False
  119. config = ClusterConfig(**config)
  120. config.to_json_file(path)
  121. return path
  122. def default_command_parser(parser, parents):
  123. parser = parser.add_parser("default", parents=parents, help=description, formatter_class=SubcommandHelpFormatter)
  124. parser.add_argument(
  125. "--config_file",
  126. default=default_json_config_file,
  127. help=(
  128. "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
  129. "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
  130. "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
  131. "with 'huggingface'."
  132. ),
  133. dest="save_location",
  134. )
  135. parser.add_argument(
  136. "--mixed_precision",
  137. choices=["no", "fp16", "bf16"],
  138. type=str,
  139. help="Whether or not to use mixed precision training. "
  140. "Choose between FP16 and BF16 (bfloat16) training. "
  141. "BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.",
  142. default="no",
  143. )
  144. parser.set_defaults(func=default_config_command)
  145. return parser
  146. def default_config_command(args):
  147. config_file = write_basic_config(args.mixed_precision, args.save_location)
  148. if config_file:
  149. print(f"accelerate configuration saved at {config_file}")