| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917 |
- #!/usr/bin/env python
- # Copyright 2021 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import os
- from ...utils import (
- ComputeEnvironment,
- DistributedType,
- is_deepspeed_available,
- is_fp8_available,
- is_hpu_available,
- is_mlu_available,
- is_mps_available,
- is_msamp_available,
- is_musa_available,
- is_npu_available,
- is_sdaa_available,
- is_transformer_engine_available,
- is_transformers_available,
- is_xpu_available,
- )
- from ...utils.constants import (
- DEEPSPEED_MULTINODE_LAUNCHERS,
- FSDP2_STATE_DICT_TYPE,
- FSDP_AUTO_WRAP_POLICY,
- FSDP_BACKWARD_PREFETCH,
- FSDP_SHARDING_STRATEGY,
- FSDP_STATE_DICT_TYPE,
- TORCH_DYNAMO_MODES,
- )
- from .config_args import ClusterConfig
- from .config_utils import (
- DYNAMO_BACKENDS,
- _ask_field,
- _ask_options,
- _convert_distributed_mode,
- _convert_dynamo_backend,
- _convert_fp8_backend,
- _convert_mixed_precision,
- _convert_yes_no_to_bool,
- )
def _ask_yes_no(question, default=False):
    """Ask a yes/no question and return the converted answer.

    Thin wrapper around `_ask_field` so that every yes/no prompt uses the same
    converter (`_convert_yes_no_to_bool`) and the same error message.

    Args:
        question: Prompt text forwarded to `_ask_field`.
        default: Forwarded to `_ask_field` as its `default`.
    """
    return _ask_field(
        question,
        _convert_yes_no_to_bool,
        default=default,
        error_message="Please enter yes or no.",
    )


def get_cluster_input():
    """Interactively collect the launch settings for a local machine or cluster.

    Walks the user through a sequence of prompts (machine type, multi-node
    networking, CPU/IPEX/mpirun, torch dynamo, DeepSpeed, FSDP, parallelism,
    Megatron-LM, process count, device ids, TPU options and mixed precision /
    FP8) and packs the answers into a `ClusterConfig`.

    Returns:
        ClusterConfig: built with `compute_environment=ComputeEnvironment.LOCAL_MACHINE`
        and the answers gathered below.

    Raises:
        AssertionError: DeepSpeed was requested but `is_deepspeed_available()` is false.
        Exception: `zero3_init_flag` was enabled but `is_transformers_available()` is false.
        ValueError: multi-GPU was selected with a single machine and a single process,
            or FP8 / the chosen FP8 backend is not available.
    """
    # Accelerator back-ends that share the same multi-device questions below.
    multi_accelerator_types = [
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_HPU,
    ]

    distributed_type = _ask_options(
        "Which type of machine are you using?",
        [
            "No distributed training",
            "multi-CPU",
            "multi-XPU",
            "multi-HPU",
            "multi-GPU",
            "multi-NPU",
            "multi-MLU",
            "multi-SDAA",
            "multi-MUSA",
            "TPU",
        ],
        _convert_distributed_mode,
    )

    # Defaults used when the corresponding question is never asked.
    machine_rank = 0
    num_machines = 1
    num_processes = 1
    gpu_ids = None
    main_process_ip = None
    main_process_port = None
    rdzv_backend = "static"
    same_network = True
    debug = False

    # ------------------------------------------------------------------
    # Multi-node networking
    # ------------------------------------------------------------------
    if distributed_type in multi_accelerator_types + [DistributedType.MULTI_CPU]:
        num_machines = _ask_field(
            "How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
            int,
            default=1,
        )
        if num_machines > 1:
            machine_rank = _ask_options(
                "What is the rank of this machine?",
                list(range(num_machines)),
                int,
            )
            main_process_ip = _ask_field(
                "What is the IP address of the machine that will host the main process? ",
            )
            main_process_port = _ask_field(
                "What is the port you will use to communicate with the main process? ",
                int,
            )
            same_network = _ask_yes_no(
                "Are all the machines on the same local network? Answer `no` if nodes are on the cloud and/or on different network hosts [YES/no]: ",
                default=True,
            )
            if not same_network:
                rdzv_backend = _ask_field(
                    "What rendezvous backend will you use? ('static', 'c10d', ...): ", default="static"
                )
        debug = _ask_yes_no(
            "Should distributed operations be checked while running for errors? This can avoid timeout issues but will be slower. [yes/NO]: ",
        )

    # ------------------------------------------------------------------
    # CPU / IPEX / mpirun
    # ------------------------------------------------------------------
    if distributed_type == DistributedType.NO:
        use_cpu = _ask_yes_no(
            "Do you want to run your training on CPU only (even if a GPU / Apple Silicon / Ascend NPU device is available)? [yes/NO]:",
        )
    elif distributed_type == DistributedType.MULTI_CPU:
        use_cpu = True
    else:
        use_cpu = False

    ipex_config = {}
    mpirun_config = {}
    if use_cpu or is_xpu_available():
        ipex_config["ipex"] = _ask_yes_no(
            "Do you want to use Intel PyTorch Extension (IPEX) to speed up training on CPU/XPU? [yes/NO]:",
        )
    if use_cpu:
        if distributed_type == DistributedType.MULTI_CPU:
            use_mpirun = _ask_yes_no("Do you want accelerate to launch mpirun? [yes/NO]: ")
            if use_mpirun:
                mpirun_hostfile = _ask_field(
                    "Please enter the path to the hostfile to use with mpirun [~/hostfile]: ",
                    str,
                    default="~/hostfile",
                )
                mpirun_config["mpirun_hostfile"] = os.path.expanduser(mpirun_hostfile.strip())
                mpirun_config["mpirun_ccl"] = _ask_field("Enter the number of oneCCL worker threads [1]: ", default=1)

    # ------------------------------------------------------------------
    # Torch dynamo
    # ------------------------------------------------------------------
    dynamo_config = {}
    use_dynamo = _ask_yes_no("Do you wish to optimize your script with torch dynamo?[yes/NO]:")
    if use_dynamo:
        prefix = "dynamo_"
        dynamo_config[prefix + "backend"] = _ask_options(
            "Which dynamo backend would you like to use?",
            [x.lower() for x in DYNAMO_BACKENDS],
            _convert_dynamo_backend,
            default=2,
        )
        use_custom_options = _ask_yes_no(
            "Do you want to customize the defaults sent to torch.compile? [yes/NO]: ",
        )

        if use_custom_options:
            dynamo_config[prefix + "mode"] = _ask_options(
                "Which mode do you want to use?",
                TORCH_DYNAMO_MODES,
                lambda x: TORCH_DYNAMO_MODES[int(x)],
                default=0,
            )
            dynamo_config[prefix + "use_fullgraph"] = _ask_yes_no(
                "Do you want the fullgraph mode or it is ok to break model into several subgraphs? [yes/NO]: ",
            )
            dynamo_config[prefix + "use_dynamic"] = _ask_yes_no(
                "Do you want to enable dynamic shape tracing? [yes/NO]: ",
            )
            dynamo_config[prefix + "use_regional_compilation"] = _ask_yes_no(
                "Do you want to enable regional compilation? [yes/NO]: ",
            )

    use_mps = not use_cpu and is_mps_available()

    # ------------------------------------------------------------------
    # DeepSpeed
    # ------------------------------------------------------------------
    deepspeed_config = {}
    if distributed_type in multi_accelerator_types + [DistributedType.NO] and not use_mps:
        use_deepspeed = _ask_yes_no("Do you want to use DeepSpeed? [yes/NO]: ")
        if use_deepspeed:
            distributed_type = DistributedType.DEEPSPEED
            assert is_deepspeed_available(), (
                "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
            )

        if distributed_type == DistributedType.DEEPSPEED:
            use_deepspeed_config = _ask_yes_no(
                "Do you want to specify a json file to a DeepSpeed config? [yes/NO]: ",
            )

            if use_deepspeed_config:
                deepspeed_config["deepspeed_config_file"] = _ask_field(
                    "Please enter the path to the json DeepSpeed config file: ",
                    str,
                    default="none",
                )
            else:
                deepspeed_config["zero_stage"] = _ask_options(
                    "What should be your DeepSpeed's ZeRO optimization stage?",
                    [0, 1, 2, 3],
                    int,
                    default=2,
                )

                deepspeed_devices = ["none", "cpu", "nvme"]
                # Offload questions are only asked for ZeRO stage 2 and 3.
                if deepspeed_config["zero_stage"] >= 2:
                    deepspeed_config["offload_optimizer_device"] = _ask_options(
                        "Where to offload optimizer states?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
                    )
                    deepspeed_config["offload_param_device"] = _ask_options(
                        "Where to offload parameters?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
                    )
                    if deepspeed_config["offload_param_device"] == "nvme":
                        deepspeed_config["offload_param_nvme_path"] = _ask_field(
                            "Nvme Path to offload parameters?",
                            str,
                            default="/nvme",
                        )
                    if deepspeed_config["offload_optimizer_device"] == "nvme":
                        deepspeed_config["offload_optimizer_nvme_path"] = _ask_field(
                            "Nvme Path to offload optimizer states?",
                            str,
                            default="/nvme",
                        )
                deepspeed_config["gradient_accumulation_steps"] = _ask_field(
                    "How many gradient accumulation steps you're passing in your script? [1]: ",
                    int,
                    default=1,
                )
                use_gradient_clipping = _ask_yes_no("Do you want to use gradient clipping? [yes/NO]: ")
                if use_gradient_clipping:
                    deepspeed_config["gradient_clipping"] = _ask_field(
                        "What is the gradient clipping value? [1.0]: ",
                        float,
                        default=1.0,
                    )
                if deepspeed_config["zero_stage"] == 3:
                    deepspeed_config["zero3_save_16bit_model"] = _ask_yes_no(
                        "Do you want to save 16-bit model weights when using ZeRO Stage-3? [yes/NO]: ",
                    )

            deepspeed_config["zero3_init_flag"] = _ask_yes_no(
                "Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: ",
            )
            if deepspeed_config["zero3_init_flag"]:
                if not is_transformers_available():
                    raise Exception(
                        "When `zero3_init_flag` is set, it requires Transformers to be installed. "
                        "Please run `pip3 install transformers`."
                    )

            use_moe = _ask_yes_no("Do you want to enable Mixture-of-Experts training (MoE)? [yes/NO]: ")
            if use_moe:
                deepspeed_config["deepspeed_moe_layer_cls_names"] = _ask_field(
                    "Specify the comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
                    " `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ... : ",
                    str,
                )

            if num_machines > 1:
                launcher_query = "Which Type of launcher do you want to use?"
                deepspeed_config["deepspeed_multinode_launcher"] = _ask_options(
                    launcher_query,
                    DEEPSPEED_MULTINODE_LAUNCHERS,
                    lambda x: DEEPSPEED_MULTINODE_LAUNCHERS[int(x)],
                )

                # Every launcher except the one at index 1 is configured through a hostfile.
                if deepspeed_config["deepspeed_multinode_launcher"] != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
                    deepspeed_config["deepspeed_hostfile"] = _ask_field(
                        "DeepSpeed configures multi-node compute resources with hostfile. "
                        "Each row is of the format `hostname slots=[num_gpus]`, e.g., `localhost slots=2`; "
                        "for more information please refer official [documentation]"
                        "(https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node). "
                        "Please specify the location of hostfile: ",
                        str,
                    )

                    is_exclusion_filter = _ask_yes_no(
                        "Do you want to specify exclusion filter string? [yes/NO]: ",
                    )
                    if is_exclusion_filter:
                        deepspeed_config["deepspeed_exclusion_filter"] = _ask_field(
                            "DeepSpeed exclusion filter string: ",
                            str,
                        )

                    is_inclusion_filter = _ask_yes_no(
                        "Do you want to specify inclusion filter string? [yes/NO]: ",
                    )
                    if is_inclusion_filter:
                        deepspeed_config["deepspeed_inclusion_filter"] = _ask_field(
                            "DeepSpeed inclusion filter string: ",
                            str,
                        )

    # ------------------------------------------------------------------
    # FSDP
    # ------------------------------------------------------------------
    fsdp_config = {}
    if distributed_type in multi_accelerator_types:
        use_fsdp = _ask_yes_no("Do you want to use FullyShardedDataParallel? [yes/NO]: ")
        if use_fsdp:
            distributed_type = DistributedType.FSDP
        if distributed_type == DistributedType.FSDP:
            # The menu returns the 0-based position, hence the `+ 1` to get the version number.
            fsdp_config["fsdp_version"] = _ask_options(
                "What should be your FSDP version? [2]: ",
                [1, 2],
                lambda x: int(x) + 1,
                default=1,
            )
            fsdp_version = fsdp_config["fsdp_version"]  # extract to a variable to simplify usage later

            if fsdp_version == 1:
                sharding_strategy_query = "What should be your sharding strategy?"
                fsdp_config["fsdp_reshard_after_forward"] = _ask_options(
                    sharding_strategy_query,
                    FSDP_SHARDING_STRATEGY,
                    lambda x: FSDP_SHARDING_STRATEGY[int(x)],
                )
            else:
                fsdp_config["fsdp_reshard_after_forward"] = _ask_yes_no(
                    "Do you want to enable resharding after forward? [YES/no]: ",
                    default=True,
                )

            fsdp_config["fsdp_offload_params"] = _ask_yes_no(
                "Do you want to offload parameters and gradients to CPU? [yes/NO]: ",
            )

            fsdp_wrap_query = "What should be your auto wrap policy?"
            fsdp_config["fsdp_auto_wrap_policy"] = _ask_options(
                fsdp_wrap_query,
                FSDP_AUTO_WRAP_POLICY,
                lambda x: FSDP_AUTO_WRAP_POLICY[int(x)],
            )
            if fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[0]:
                use_no_split_modules = _ask_yes_no(
                    "Do you want to use the model's `_no_split_modules` to wrap. Only applicable for 🤗 Transformers [yes/NO]: ",
                )
                if not use_no_split_modules:
                    fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = _ask_field(
                        "Specify the comma-separated list of transformer layer class names (case-sensitive) to wrap ,e.g, :"
                        "`BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput` ...? : ",
                        str,
                    )
            elif fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[1]:
                fsdp_config["fsdp_min_num_params"] = _ask_field(
                    "What should be your FSDP's minimum number of parameters for Default Auto Wrapping Policy? [1e8]: ",
                    int,
                    default=100000000,
                )

            # Removed in FSDP2, ask for user input for FSDP1
            if fsdp_version == 1:
                fsdp_backward_prefetch_query = "What should be your FSDP's backward prefetch policy?"
                fsdp_config["fsdp_backward_prefetch"] = _ask_options(
                    fsdp_backward_prefetch_query,
                    FSDP_BACKWARD_PREFETCH,
                    lambda x: FSDP_BACKWARD_PREFETCH[int(x)],
                )

            fsdp_state_dict_type_query = "What should be your FSDP's state dict type?"
            fsdp_config["fsdp_state_dict_type"] = _ask_options(
                fsdp_state_dict_type_query,
                FSDP_STATE_DICT_TYPE if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE,
                lambda x: FSDP_STATE_DICT_TYPE[int(x)] if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE[int(x)],
                default=0,
            )

            # Not implemented in FSDP2, ask for user input for FSDP1
            if fsdp_version == 1:
                fsdp_config["fsdp_forward_prefetch"] = _ask_yes_no(
                    "Do you want to enable FSDP's forward prefetch policy? [yes/NO]: ",
                )

            # Obsolete in FSDP2, ask for user input for FSDP1
            if fsdp_version == 1:
                fsdp_config["fsdp_use_orig_params"] = _ask_yes_no(
                    "Do you want to enable FSDP's `use_orig_params` feature? [YES/no]: ",
                    default=True,
                )

            fsdp_config["fsdp_cpu_ram_efficient_loading"] = _ask_yes_no(
                "Do you want to enable CPU RAM efficient model loading? Only applicable for 🤗 Transformers models. [YES/no]: ",
                default=True,
            )

            # Obsolete in FSDP2, ask for user input for FSDP1
            if fsdp_version == 1:
                if fsdp_config["fsdp_cpu_ram_efficient_loading"]:
                    # Forced on (not asked) whenever CPU-RAM-efficient loading is enabled.
                    fsdp_config["fsdp_sync_module_states"] = True
                else:
                    fsdp_config["fsdp_sync_module_states"] = _ask_yes_no(
                        "Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [YES/no]: ",
                        default=True,
                    )

            fsdp_config["fsdp_activation_checkpointing"] = _ask_yes_no(
                "Do you want to enable FSDP activation checkpointing? [yes/NO]: ",
            )

    # ------------------------------------------------------------------
    # Parallelism config (only offered together with FSDP version 2)
    # ------------------------------------------------------------------
    parallelism_config = {}
    if fsdp_config.get("fsdp_version", 1) == 2:
        use_parallelism_config = _ask_yes_no("Do you want to use the parallelism config? [yes/NO]: ")

        if use_parallelism_config:
            prefix = "parallelism_config_"
            parallelism_config[prefix + "dp_replicate_size"] = _ask_field(
                "What is the data parallelism replicate size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            parallelism_config[prefix + "dp_shard_size"] = _ask_field(
                "What is the FSDP shard size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            parallelism_config[prefix + "tp_size"] = _ask_field(
                "What is the tensor parallelism size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            parallelism_config[prefix + "cp_size"] = _ask_field(
                "What is the context parallelism size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            if parallelism_config[prefix + "cp_size"] > 1:
                cp_comm_strategies = ["allgather", "alltoall"]
                # "cp" is context parallelism (see the `cp_size` prompt above).
                parallelism_config[prefix + "cp_comm_strategy"] = _ask_options(
                    "What is the context parallelism communication strategy?",
                    cp_comm_strategies,
                    lambda x: cp_comm_strategies[int(x)],
                    default=0,
                )

    # ------------------------------------------------------------------
    # Megatron-LM
    # ------------------------------------------------------------------
    megatron_lm_config = {}
    if distributed_type in [DistributedType.MULTI_GPU]:
        use_megatron_lm = _ask_yes_no("Do you want to use Megatron-LM ? [yes/NO]: ")
        if use_megatron_lm:
            distributed_type = DistributedType.MEGATRON_LM
        if distributed_type == DistributedType.MEGATRON_LM:
            prefix = "megatron_lm_"
            megatron_lm_config[prefix + "tp_degree"] = _ask_field(
                "What is the Tensor Parallelism degree/size? [1]:",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            if megatron_lm_config[prefix + "tp_degree"] > 1:
                megatron_lm_config[prefix + "sequence_parallelism"] = _ask_yes_no(
                    "Do you want to enable Sequence Parallelism? [YES/no]: ",
                    default=True,
                )

            megatron_lm_config[prefix + "pp_degree"] = _ask_field(
                "What is the Pipeline Parallelism degree/size? [1]:",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            if megatron_lm_config[prefix + "pp_degree"] > 1:
                megatron_lm_config[prefix + "num_micro_batches"] = _ask_field(
                    "What is the number of micro-batches? [1]:",
                    int,
                    default=1,
                    error_message="Please enter an integer.",
                )

            megatron_lm_config[prefix + "recompute_activations"] = _ask_yes_no(
                "Do you want to enable selective activation recomputation? [YES/no]: ",
                default=True,
            )

            megatron_lm_config[prefix + "use_distributed_optimizer"] = _ask_yes_no(
                "Do you want to use distributed optimizer "
                "which shards optimizer state and gradients across data parallel ranks? [YES/no]: ",
                default=True,
            )

            megatron_lm_config[prefix + "gradient_clipping"] = _ask_field(
                "What is the gradient clipping value based on global L2 Norm (0 to disable)? [1.0]: ",
                float,
                default=1.0,
            )

    # TPU specific defaults
    tpu_commands = None
    tpu_command_file = None
    tpu_downcast_bf16 = "no"
    tpu_env = []
    tpu_name = None
    tpu_vm = None
    tpu_zone = None
    tpu_use_sudo = False
    tpu_use_cluster = False

    # ------------------------------------------------------------------
    # Number of processes
    # ------------------------------------------------------------------
    if distributed_type in multi_accelerator_types + [DistributedType.MULTI_CPU, DistributedType.XLA]:
        # Derive a human-readable device label from the enum member name.
        machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
        # The "TPU" menu choice is handled as `DistributedType.XLA` in this function, so the
        # member name seen here is "XLA"; accept both spellings so the prompt says "TPU cores".
        if machine_type in ("TPU", "XLA"):
            machine_type = "TPU cores"
        elif machine_type == "CPU":
            machine_type = "processes"
        else:
            machine_type += "(s)"
        num_processes = _ask_field(
            f"How many {machine_type} should be used for distributed training? [1]:",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
    elif distributed_type in [DistributedType.FSDP, DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
        num_processes = _ask_field(
            "How many GPU(s) should be used for distributed training? [1]:",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
    else:
        num_processes = 1

    if (distributed_type == DistributedType.MULTI_GPU) and (num_machines == 1) and (num_processes == 1):
        raise ValueError(
            f"Specified distributed type {distributed_type} but only using 1 GPU on a single machine. Please select `No distributed training` for the type of machine you are using."
        )

    # ------------------------------------------------------------------
    # Device ids
    # ------------------------------------------------------------------
    if distributed_type in multi_accelerator_types + [DistributedType.NO] and not use_cpu and not use_mps:
        if is_npu_available():
            machine_type = "NPU(s)"
        elif is_mlu_available():
            machine_type = "MLU(s)"
        elif is_sdaa_available():
            machine_type = "SDAA(s)"
        elif is_musa_available():
            machine_type = "MUSA(s)"
        elif is_xpu_available():
            machine_type = "XPU(s)"
        elif is_hpu_available():
            machine_type = "HPU(s)"
        else:
            machine_type = "GPU(s)"
        gpu_ids = _ask_field(
            f"What {machine_type} (by id) should be used for training on this machine as a comma-separated list? [all]:",
            default="all",
        )

    # CPU affinity is only supported on NVIDIA hardware for now
    enable_cpu_affinity = False
    if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
        enable_cpu_affinity = _ask_yes_no(
            "Would you like to enable numa efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
        )

    # ------------------------------------------------------------------
    # TPU options, or mixed precision / FP8 for everything else
    # ------------------------------------------------------------------
    fp8_config = None
    if distributed_type == DistributedType.XLA:
        mixed_precision = "no"
        main_training_function = _ask_field(
            "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
            default="main",
        )
        tpu_use_cluster = _ask_yes_no("Are you using a TPU cluster? [yes/NO]: ")
        if tpu_use_cluster:
            tpu_name = _ask_field(
                "What is the name of your TPU cluster? ",
                default=None,
                error_message="Please enter the name of your TPU cluster.",
            )
            tpu_zone = _ask_field(
                "What is the zone of your TPU cluster? ",
                default=None,
                error_message="Please enter the zone of your TPU cluster.",
            )
            # Converted to a bool like every other yes/no answer; without the converter the raw
            # answer string would be stored and a typed "no" would evaluate as truthy.
            tpu_use_sudo = _ask_yes_no(
                "To run a python script in a TPU pod, should `sudo` be used? [yes/NO]: ",
            )
            run_commands = _ask_yes_no(
                "Do you have code you wish to run on startup in each pod? [yes/NO]: ",
            )
            if run_commands:
                use_command_file = _ask_yes_no("Is this code located in a bash script? [yes/NO]: ")
                if use_command_file:
                    tpu_command_file = _ask_field(
                        "What is the path to your bash script? ",
                        default=None,
                        error_message="Please enter the path to your bash script.",
                    )
                    tpu_command_file = os.path.abspath(tpu_command_file)
                else:
                    print("Please enter each command separately you wish to run on startup in each pod.")
                    tpu_commands = []
                    another_command = True
                    # At least one command is collected; keep going while the user asks for more.
                    while another_command:
                        tpu_commands.append(
                            _ask_field(
                                "Please enter a single command to be ran ",
                                default=None,
                                error_message="Please enter the commands you wish to run on startup in each pod as a single string.",
                            )
                        )
                        another_command = _ask_yes_no("Do you wish to add another command? [yes/NO]: ")
            tpu_vm = _ask_field(
                "If not using an instance group, what are the names of the Compute VM instances to be used, separated by a comma: ",
                default="",
            ).split(",")
            tpu_env = _ask_field(
                "What environment variables do you wish to set in each pod, separated by a comma: ",
                default="",
            ).split(",")

    else:
        main_training_function = "main"
        # `use_deepspeed_config` only exists on the DeepSpeed path; the short-circuit keeps this safe.
        if distributed_type == DistributedType.DEEPSPEED and use_deepspeed_config:
            mixed_precision = None
        else:
            mixed_precision = _ask_options(
                "Do you wish to use mixed precision?",
                ["no", "fp16", "bf16", "fp8"],
                _convert_mixed_precision,
            )
        if mixed_precision == "fp8":
            if not is_fp8_available():
                raise ValueError("FP8 (either Transformer Engine or MSAMP) is not installed on this machine.")
            fp8_config = {}
            fp8_config["backend"] = _ask_options(
                "Which FP8 backend do you want to use?",
                ["te", "msamp"],
                _convert_fp8_backend,
            )
            if fp8_config["backend"] == "TE":
                if not is_transformer_engine_available():
                    raise ValueError("TransformersEngine was selected, but it is not installed on this machine.")
                fp8_config["use_autocast_during_eval"] = _ask_yes_no(
                    "Do you want to use FP8 autocast during eval mode? Generally better metrics are found when this is disabled [yes/NO]: ",
                )
                fp8_config["margin"] = _ask_field(
                    "What margin should be used for gradient scaling? [0]: ",
                    int,
                    default=0,
                )
                fp8_config["interval"] = _ask_field(
                    "What interval should be used for for how often the scaling factor is recomputed? [1]: ",
                    int,
                    default=1,
                )
                fp8_config["fp8_format"] = _ask_options(
                    "Which weight format should be used?",
                    ["HYBRID", "E4M3", "E5M2"],
                    lambda i: ["HYBRID", "E4M3", "E5M2"][i],
                    default=0,
                )
                fp8_config["amax_history_length"] = _ask_field(
                    "What length of history should be used for the amax scaling factor computation? [1024]: ",
                    int,
                    default=1024,
                )
                fp8_config["amax_compute_algorithm"] = _ask_options(
                    "Which algorithm should be used for the amax scaling factor computation?",
                    ["max", "most_recent"],
                    lambda x: "max" if x == 0 else "most_recent",
                    default=0,
                )
                fp8_config["override_linear_precision"] = _ask_yes_no(
                    "Do you want to to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision? [yes/NO]: ",
                )
                # The yes/no answer is replaced by a (fprop, dgrad, wgrad) tuple of booleans.
                if fp8_config["override_linear_precision"]:
                    fprop = _ask_yes_no("Should `fprop` be executed in higher precision? [yes/NO]: ")
                    dgrad = _ask_yes_no("Should `dgrad` be executed in higher precision? [yes/NO]: ")
                    wgrad = _ask_yes_no("Should `wgrad` be executed in higher precision? [yes/NO]: ")
                    fp8_config["override_linear_precision"] = (fprop, dgrad, wgrad)
                else:
                    fp8_config["override_linear_precision"] = (False, False, False)

            elif fp8_config["backend"] == "MSAMP":
                if not is_msamp_available():
                    raise ValueError("MSAMP was selected, but it is not installed on this machine.")
                fp8_config["optimization_level"] = _ask_options(
                    "Which optimization level should be used?",
                    ["O1", "O2"],
                    lambda x: "O1" if x == 0 else "O2",
                    default=1,
                )

    if use_dynamo and mixed_precision == "no" and not use_cpu:
        print(
            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
        )

    # NOTE(review): `mixed_precision` is hard-coded to "no" on the XLA path above, so this
    # condition cannot currently be true and `downcast_bf16` always stays "no" — confirm intent.
    if distributed_type == DistributedType.XLA and mixed_precision == "bf16":
        tpu_downcast_bf16 = _ask_field(
            "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no"
        )

    return ClusterConfig(
        compute_environment=ComputeEnvironment.LOCAL_MACHINE,
        distributed_type=distributed_type,
        num_processes=num_processes,
        gpu_ids=gpu_ids,
        mixed_precision=mixed_precision,
        downcast_bf16=tpu_downcast_bf16,
        machine_rank=machine_rank,
        num_machines=num_machines,
        main_process_ip=main_process_ip,
        main_process_port=main_process_port,
        main_training_function=main_training_function,
        fp8_config=fp8_config,
        deepspeed_config=deepspeed_config,
        fsdp_config=fsdp_config,
        parallelism_config=parallelism_config,
        megatron_lm_config=megatron_lm_config,
        ipex_config=ipex_config,
        mpirun_config=mpirun_config,
        use_cpu=use_cpu,
        rdzv_backend=rdzv_backend,
        same_network=same_network,
        commands=tpu_commands,
        command_file=tpu_command_file,
        tpu_env=tpu_env,
        tpu_name=tpu_name,
        tpu_vm=tpu_vm,
        tpu_zone=tpu_zone,
        tpu_use_sudo=tpu_use_sudo,
        tpu_use_cluster=tpu_use_cluster,
        dynamo_config=dynamo_config,
        debug=debug,
        enable_cpu_affinity=enable_cpu_affinity,
    )
|