| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488 |
- # -------------------------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License. See License.txt in the project root for
- # license information.
- # --------------------------------------------------------------------------
- import argparse
- import datetime
- import json
- import logging
- import os
- import subprocess
- import torch
- from benchmark_helper import setup_logger
- from metrics import BenchmarkRecord
# Module-level logger named after this module; the functions in this file log through it.
logger = logging.getLogger(__name__)
def get_args():
    """Parse the command line and return the populated namespace.

    Besides the declared options, the returned namespace is post-processed:

    * ``model_size`` is added: the last ``/``-separated component of
      ``--model-name`` with every ``.`` replaced by ``-``.
    * ``log_folder`` falls back to ``./<model_size>_<precision>`` when it was
      not supplied, and the folder is created if it does not exist.
    * ``timeout`` is converted from minutes to seconds.
    """
    cli = argparse.ArgumentParser()

    # Space-separated workload shapes, kept as raw strings.
    for short_flag, long_flag, fallback in (
        ("-b", "--batch-sizes", "1 2"),
        ("-s", "--sequence-lengths", "8 16 32 64 128 256 512"),
    ):
        cli.add_argument(short_flag, long_flag, type=str, default=fallback)

    # Iteration counts.
    for short_flag, long_flag, fallback in (
        ("-w", "--warmup-runs", 5),
        ("-n", "--num-runs", 1000),
    ):
        cli.add_argument(short_flag, long_flag, type=int, default=fallback)

    # Boolean switches selecting the PyTorch variants.
    for flag, description in (
        ("--hf-pt-eager", "Benchmark in PyTorch without `torch.compile`"),
        ("--hf-pt-compile", "Benchmark in PyTorch with `torch.compile`"),
    ):
        cli.add_argument(flag, default=False, action="store_true", help=description)

    # Model locations; an empty string means "not provided".
    for flag, description in (
        ("--hf-ort-dir-path", "Path to folder containing ONNX models for Optimum + ORT benchmarking"),
        ("--ort-msft-model-path", "Path to ONNX model from https://github.com/microsoft/Llama-2-Onnx"),
        ("--ort-convert-to-onnx-model-path", "Path to ONNX model from convert_to_onnx"),
    ):
        cli.add_argument(flag, type=str, default="", help=description)

    cli.add_argument(
        "--cache-dir", type=str, default="./model_cache", help="Cache dir where Hugging Face files are stored"
    )
    cli.add_argument("--model-name", type=str, required=True, help="Model name in Hugging Face")
    cli.add_argument(
        "--precision",
        type=str,
        required=True,
        choices=["int4", "int8", "fp16", "fp32"],
        help="Precision to run model",
    )
    cli.add_argument(
        "--device",
        type=str,
        required=True,
        choices=["cpu", "cuda", "rocm"],
        help="Device to benchmark models",
    )
    cli.add_argument("--device-id", type=int, default=0, help="GPU device ID")
    cli.add_argument("--verbose", default=False, action="store_true", help="Print detailed logs")
    cli.add_argument(
        "--timeout", type=int, default=10, help="Number of mins to attempt the benchmark before moving on"
    )
    cli.add_argument("--log-folder", type=str, default=None, help="Path to folder to save logs and results")

    parsed = cli.parse_args()

    # Derive a filesystem-friendly model tag from the Hugging Face model name.
    last_component = parsed.model_name.split("/")[-1]
    parsed.model_size = last_component.replace(".", "-")

    if not parsed.log_folder:
        parsed.log_folder = f"./{parsed.model_size}_{parsed.precision}"
    os.makedirs(parsed.log_folder, exist_ok=True)

    # The option is given in minutes; the rest of the script works in seconds.
    parsed.timeout *= 60
    return parsed
def _text_after(line, pattern):
    """Return the part of *line* that follows the first occurrence of *pattern*."""
    return line[line.find(pattern) + len(pattern) :]


def _number_before_unit(line, pattern):
    """Return the float found between *pattern* and the last space of *line*.

    The last space is the one separating the number from its trailing unit.
    """
    return float(line[line.find(pattern) + len(pattern) : line.rfind(" ")])


def process_log_file(device_id, log_file, base_results):
    """Parse a benchmark log and return one result row per memory line.

    The file is scanned line by line. Batch size, sequence length, step,
    latency and throughput are remembered as they are seen; each time a
    memory-usage line (one containing ``peak=``) is read, a row is emitted
    from the values remembered so far. Values not yet seen stay ``None``.

    Values are located relative to where the marker text occurs in the line,
    so lines carrying a prefix (for example a timestamp or log level) parse
    the same way as lines that start with the marker.

    Args:
        device_id: Index into the per-device list of a GPU memory line.
        log_file: Path of the log file to read.
        base_results: List of leading columns copied into every row; it is
            not modified.

    Returns:
        A list of rows, each ``base_results`` followed by batch size,
        sequence length, step (``"prompt"`` or ``"per-token"``), latency in
        seconds, latency in milliseconds, throughput, and peak memory in MB
        divided by 1000.

    Raises:
        ValueError: If a matched line does not hold a parsable number.
    """
    entries = []
    batch_size = sequence_length = step = None
    latency_s = latency_ms = throughput = memory = None

    batch_pattern = "Batch Size: "
    sequence_pattern = "Sequence Length: "
    prompt_step_pattern = "to get past_key_values"
    per_token_step_pattern = "with past_key_values"
    latency_pattern = "Latency: "
    throughput_pattern = "Throughput: "
    memory_pattern = "peak="

    with open(log_file) as f:
        for input_line in f:
            line = input_line.replace("\n", "")

            if batch_pattern in line:
                batch_size = int(_text_after(line, batch_pattern))
            elif sequence_pattern in line:
                sequence_length = int(_text_after(line, sequence_pattern))
            elif prompt_step_pattern in line:
                step = "prompt"
            elif per_token_step_pattern in line:
                step = "per-token"
            elif latency_pattern in line:
                latency_s = _number_before_unit(line, latency_pattern)
                latency_ms = latency_s * 1000
            elif throughput_pattern in line:
                throughput = _number_before_unit(line, throughput_pattern)
            elif memory_pattern in line:
                if "CPU" in line:
                    # Example format for log entry:
                    # CPU memory usage: before=1000.0 MB, peak=2000.0 MB
                    memory = float(line[line.rfind("=") + 1 : line.rfind(" MB")]) / 1000
                else:
                    # Example format for log entry:
                    # GPU memory usage: before=[{'device_id': 0, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 69637.25}] peak=[{'device_id': 0, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 73861.25}]
                    # The list is a Python repr; swapping the quotes makes it valid JSON.
                    peak = _text_after(line, memory_pattern).replace("'", '"')
                    usage = json.loads(peak)[device_id]["max_used_MB"]
                    memory = float(usage) / 1000

                # The memory line closes a measurement: emit a row from the
                # values remembered so far.
                entry = base_results + [  # noqa: RUF005
                    batch_size,
                    sequence_length,
                    step,
                    latency_s,
                    latency_ms,
                    throughput,
                    memory,
                ]
                entries.append(entry)

    return entries
def save_results(results, filename):
    """Write the collected benchmark rows to a CSV file and a sibling JSON file.

    The rows are loaded into a DataFrame with fixed column names, numeric
    columns are cast, and each row is turned into a ``BenchmarkRecord`` that
    is saved through ``BenchmarkRecord.save_as_csv`` / ``save_as_json``.

    Args:
        results: Iterable of 13-item rows in the column order listed below.
        filename: Destination path of the CSV file. The JSON file uses the
            same path with the final extension replaced by ``.json``.
    """
    # Local imports, matching the function's existing style of importing
    # result-writing dependencies only when results are actually saved.
    import importlib.metadata  # noqa: PLC0415

    import pandas as pd  # noqa: PLC0415

    df = pd.DataFrame(
        results,
        columns=[
            "Warmup Runs",
            "Measured Runs",
            "Model Name",
            "Engine",
            "Precision",
            "Device",
            "Batch Size",
            "Sequence Length",
            "Step",
            "Latency (s)",
            "Latency (ms)",
            "Throughput (tps)",
            "Memory (GB)",
        ],
    )

    # Set column types
    for column in ("Warmup Runs", "Measured Runs", "Batch Size", "Sequence Length"):
        df[column] = df[column].astype("int")
    for column in ("Latency (s)", "Latency (ms)", "Throughput (tps)", "Memory (GB)"):
        df[column] = df[column].astype("float")

    # Look up the installed ONNX Runtime distribution through the standard
    # library instead of the deprecated setuptools `pkg_resources` API.
    # "onnxruntime-gpu" is probed first so that it is the one reported when
    # both distributions are installed.
    ort_pkg_name = ""
    ort_pkg_version = ""
    for candidate in ("onnxruntime-gpu", "onnxruntime"):
        try:
            ort_pkg_version = importlib.metadata.version(candidate)
        except importlib.metadata.PackageNotFoundError:
            continue
        ort_pkg_name = candidate
        break

    # Save results to csv with standard format
    records = []
    for _, row in df.iterrows():
        if row["Engine"] in ["optimum-ort", "onnxruntime"]:
            record = BenchmarkRecord(
                row["Model Name"], row["Precision"], "onnxruntime", row["Device"], ort_pkg_name, ort_pkg_version
            )
        elif row["Engine"] in ["pytorch-eager", "pytorch-compile"]:
            record = BenchmarkRecord(
                row["Model Name"], row["Precision"], "pytorch", row["Device"], torch.__name__, torch.__version__
            )
        else:
            # Any other engine is recorded under its own name with no package info.
            record = BenchmarkRecord(row["Model Name"], row["Precision"], row["Engine"], row["Device"], "", "")

        record.config.warmup_runs = row["Warmup Runs"]
        record.config.measured_runs = row["Measured Runs"]
        record.config.batch_size = row["Batch Size"]
        record.config.seq_length = row["Sequence Length"]
        record.config.customized["measure_step"] = row["Step"]
        record.config.customized["engine"] = row["Engine"]
        record.metrics.customized["latency_s_mean"] = row["Latency (s)"]
        record.metrics.latency_ms_mean = row["Latency (ms)"]
        record.metrics.customized["throughput_tps"] = row["Throughput (tps)"]
        record.metrics.max_memory_usage_GB = row["Memory (GB)"]
        records.append(record)

    BenchmarkRecord.save_as_csv(filename, records)
    # Swap only the final extension: a plain text replace would also rewrite
    # ".csv" inside directory names, and would leave the path unchanged (so the
    # JSON would overwrite the CSV) when the name has no ".csv" in it.
    json_filename = os.path.splitext(filename)[0] + ".json"
    BenchmarkRecord.save_as_json(json_filename, records)
    logger.info(f"Results saved in {filename}!")
def benchmark(args, benchmark_cmd, engine):
    """Run one benchmark command in a subprocess and parse its log.

    The child's stdout and stderr are both redirected to a timestamped log
    file inside ``args.log_folder``. The child is given ``args.timeout``
    seconds to finish; if it does not, it is killed. Whatever was written to
    the log by then is parsed with ``process_log_file``.

    Args:
        args: Parsed options; ``log_folder``, ``timeout``, ``warmup_runs``,
            ``num_runs``, ``model_name``, ``precision``, ``device`` and
            ``device_id`` are read.
        benchmark_cmd: Argument list passed to ``subprocess.Popen``.
        engine: Engine label used in the log file name and in every result row.

    Returns:
        The list of result rows produced by ``process_log_file``.
    """
    log_filename = f"{engine}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.log"
    log_path = os.path.join(args.log_folder, log_filename)

    with open(log_path, "w") as log_file:
        process = subprocess.Popen(benchmark_cmd, stdout=log_file, stderr=log_file)
        try:
            process.wait(args.timeout)
        except subprocess.TimeoutExpired:
            logger.warning(f"Benchmark for {engine} exceeded {args.timeout} secs and was stopped")
            process.kill()
            # Reap the killed child so it does not linger as a zombie and so
            # it has fully stopped writing before the log is parsed.
            process.wait()
        else:
            if process.returncode != 0:
                logger.warning(f"Benchmark for {engine} exited with code {process.returncode}, see {log_path}")

    # Create entries for csv
    logger.info("Gathering data from log files...")
    base_results = [args.warmup_runs, args.num_runs, args.model_name, engine, args.precision, args.device]
    results = process_log_file(args.device_id, log_path, base_results)

    return results
def _build_benchmark_cmd(args, benchmark_type, extra_args=(), use_auth=False):
    """Assemble the argv list for one ``models.llama.benchmark`` subprocess.

    ``extra_args`` are inserted right after the benchmark type; ``--auth`` is
    appended last when ``use_auth`` is true.
    """
    cmd = [
        "python",
        "-m",
        "models.llama.benchmark",
        "--benchmark-type",
        benchmark_type,
        *extra_args,
        "--model-name",
        args.model_name,
        "--precision",
        args.precision,
        "--batch-sizes",
        args.batch_sizes,
        "--sequence-lengths",
        args.sequence_lengths,
        "--device",
        args.device,
        "--warmup-runs",
        str(args.warmup_runs),
        "--num-runs",
        str(args.num_runs),
        "--log-folder",
        args.log_folder,
        "--cache-dir",
        args.cache_dir,
    ]
    if use_auth:
        cmd.append("--auth")
    return cmd


def main():
    """Run every benchmark variant selected on the command line and save the results.

    Each enabled variant is launched through ``benchmark`` in a fixed order
    (PyTorch eager, PyTorch compile, Optimum + ORT, Microsoft ONNX model,
    convert_to_onnx model). All rows are then written by ``save_results`` to a
    timestamped CSV inside ``args.log_folder``.
    """
    args = get_args()
    setup_logger(args.verbose)
    logger.info(args.__dict__)
    torch.backends.cudnn.benchmark = True

    all_results = []
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device_id)

    # One entry per variant:
    # (enabled, benchmark type, extra argv, pass --auth, log message, engine label)
    runs = [
        (
            args.hf_pt_eager,
            "hf-pt-eager",
            [],
            True,
            "Benchmark PyTorch without torch.compile",
            "pytorch-eager",
        ),
        (
            args.hf_pt_compile,
            "hf-pt-compile",
            [],
            True,
            "Benchmark PyTorch with torch.compile",
            "pytorch-compile",
        ),
        (
            args.hf_ort_dir_path,
            "hf-ort",
            ["--hf-ort-dir-path", args.hf_ort_dir_path],
            True,
            "Benchmark Optimum + ONNX Runtime",
            "optimum-ort",
        ),
        (
            args.ort_msft_model_path,
            "ort-msft",
            ["--ort-model-path", args.ort_msft_model_path],
            False,
            "Benchmark Microsoft model in ONNX Runtime",
            "ort-msft",
        ),
        (
            args.ort_convert_to_onnx_model_path,
            "ort-convert-to-onnx",
            ["--ort-model-path", args.ort_convert_to_onnx_model_path],
            False,
            "Benchmark convert_to_onnx model in ONNX Runtime",
            "onnxruntime",
        ),
    ]

    for enabled, benchmark_type, extra_args, use_auth, message, engine in runs:
        if not enabled:
            continue
        benchmark_cmd = _build_benchmark_cmd(args, benchmark_type, extra_args, use_auth)
        logger.info(message)
        results = benchmark(args, benchmark_cmd, engine)
        all_results.extend(results)

    csv_file = f"{args.model_size}_{args.precision}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv"
    save_results(all_results, os.path.join(args.log_folder, csv_file))
# Run the benchmark driver only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|