| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
- #!/usr/bin/env python
- # Copyright 2022 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import argparse
- import os
- import subprocess
- from packaging.version import Version, parse
- from accelerate.commands.config.config_args import default_config_file, load_config_from_file
_description = "Run commands across TPU VMs for initial setup before running `accelerate launch`."


def tpu_command_parser(subparsers=None):
    """Create the argument parser for the ``tpu-config`` command.

    Args:
        subparsers: Optional sub-parser collection (as returned by
            ``ArgumentParser.add_subparsers``). When given, the command is
            registered on it as ``tpu-config`` and its ``func`` default is set
            to ``tpu_command_launcher``; otherwise a standalone parser is built.

    Returns:
        The configured parser.
    """
    if subparsers is None:
        parser = argparse.ArgumentParser("Accelerate tpu-config command", description=_description)
    else:
        parser = subparsers.add_parser("tpu-config", description=_description)

    # Options that can also be supplied through the accelerate config file.
    config_group = parser.add_argument_group(
        "Config Arguments", "Arguments that can be configured through `accelerate config`."
    )
    config_options = (
        (
            "--config_file",
            {
                "type": str,
                "default": None,
                "help": "Path to the config file to use for accelerate.",
            },
        ),
        (
            "--tpu_name",
            {
                "default": None,
                "help": "The name of the TPU to use. If not specified, will use the TPU specified in the config file.",
            },
        ),
        (
            "--tpu_zone",
            {
                "default": None,
                "help": "The zone of the TPU to use. If not specified, will use the zone specified in the config file.",
            },
        ),
    )
    for flag, options in config_options:
        config_group.add_argument(flag, **options)

    # Options describing what gets executed on the pod.
    tpu_group = parser.add_argument_group("TPU Arguments", "Arguments for options ran inside the TPU.")
    tpu_options = (
        (
            "--use_alpha",
            {
                "action": "store_true",
                "help": "Whether to use `gcloud alpha` when running the TPU training script instead of `gcloud`.",
            },
        ),
        (
            "--command_file",
            {
                "default": None,
                "help": "The path to the file containing the commands to run on the pod on startup.",
            },
        ),
        (
            "--command",
            {
                # Each occurrence appends one list of tokens, so the parsed
                # value is a list of lists (or None when never passed).
                "action": "append",
                "nargs": "+",
                "help": "A command to run on the pod. Can be passed multiple times.",
            },
        ),
        (
            "--install_accelerate",
            {
                "action": "store_true",
                "help": "Whether to install accelerate on the pod. Defaults to False.",
            },
        ),
        (
            "--accelerate_version",
            {
                "default": "latest",
                "help": "The version of accelerate to install on the pod. If not specified, will use the latest pypi version. Specify 'dev' to install from GitHub.",
            },
        ),
        (
            "--debug",
            {
                "action": "store_true",
                "help": "If set, will print the command that would be run instead of running it.",
            },
        ),
    )
    for flag, options in tpu_options:
        tpu_group.add_argument(flag, **options)

    # Only a sub-command needs a dispatch target; a standalone parser is
    # driven directly by the caller.
    if subparsers is not None:
        parser.set_defaults(func=tpu_command_launcher)
    return parser
def tpu_command_launcher(args):
    """Assemble the pod setup command and run it on all TPU VM workers via ``gcloud``.

    Values not given on the command line are filled in from the accelerate
    config file when one is available (``args.config_file`` if set, otherwise
    the default config file if it exists on disk). ``args`` is updated in
    place: ``command_file``, ``command``, ``tpu_name``, ``tpu_zone`` and
    ``accelerate_version`` may all be rewritten.

    Args:
        args: Parsed namespace as produced by the parser built in
            ``tpu_command_parser``.

    Raises:
        ValueError: If neither a command file nor a command is available, or
            if the TPU name or zone is missing from both the command line and
            the config file.
        subprocess.CalledProcessError: If the ``gcloud`` invocation exits with
            a non-zero status.
    """
    defaults = None

    # Get the default from the config file if it exists.
    if args.config_file is not None or os.path.isfile(default_config_file):
        # NOTE(review): when only the default file exists this passes None;
        # presumably the loader falls back to the default path -- confirm.
        defaults = load_config_from_file(args.config_file)
        # An explicit --command wins over a command file from the config.
        if not args.command_file and defaults.command_file is not None and not args.command:
            args.command_file = defaults.command_file
        if not args.command and defaults.commands is not None:
            args.command = defaults.commands
        if not args.tpu_name:
            args.tpu_name = defaults.tpu_name
        if not args.tpu_zone:
            args.tpu_zone = defaults.tpu_zone

    # Turn the requested version into the argument handed to `pip install`.
    if args.accelerate_version == "dev":
        args.accelerate_version = "git+https://github.com/huggingface/accelerate.git"
    elif args.accelerate_version == "latest":
        args.accelerate_version = "accelerate -U"
    elif isinstance(parse(args.accelerate_version), Version):
        args.accelerate_version = f"accelerate=={args.accelerate_version}"
    # Any other value is passed to pip unchanged.

    if not args.command_file and not args.command:
        raise ValueError("You must specify either a command file or a command to run on the pod.")

    # A command file replaces any other commands; every line is one command.
    if args.command_file:
        with open(args.command_file) as f:
            args.command = [f.read().splitlines()]

    # To turn list of lists into list of strings
    if isinstance(args.command[0], list):
        args.command = [line for cmd in args.command for line in cmd]

    # Default to the shared folder and install accelerate
    new_cmd = ["cd /usr/share"]
    if args.install_accelerate:
        new_cmd += [f"pip install {args.accelerate_version}"]
    new_cmd += args.command
    args.command = "; ".join(new_cmd)

    # Without a config file and without explicit flags these are still None,
    # which would otherwise surface as an opaque TypeError when the argument
    # list is joined or handed to subprocess.
    missing = [
        flag for flag, value in (("--tpu_name", args.tpu_name), ("--tpu_zone", args.tpu_zone)) if not value
    ]
    if missing:
        raise ValueError(
            f"You must specify {' and '.join(missing)}, either on the command line or in the config file."
        )

    # Then send it to gcloud
    # Eventually try to use google-api-core to do this instead of subprocess
    cmd = ["gcloud"]
    if args.use_alpha:
        cmd += ["alpha"]
    cmd += [
        "compute",
        "tpus",
        "tpu-vm",
        "ssh",
        args.tpu_name,
        "--zone",
        args.tpu_zone,
        "--command",
        args.command,
        "--worker",
        "all",
    ]
    if args.debug:
        print(f"Running {' '.join(cmd)}")
        return

    # check=True so a failing gcloud call raises instead of being reported
    # as a successful setup below.
    subprocess.run(cmd, check=True)
    print("Successfully setup pod.")
def main():
    """Parse the command line with a standalone parser and run the launcher."""
    cli_args = tpu_command_parser().parse_args()
    tpu_command_launcher(cli_args)
|