| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526 |
- # -------------------------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License. See License.txt in the project root for
- # license information.
- # --------------------------------------------------------------------------
- import argparse
- import datetime
- import json
- import logging
- import os
- import subprocess
- import librosa
- import torch
- from benchmark_helper import setup_logger
- from metrics import BenchmarkRecord
- from transformers import WhisperConfig, WhisperProcessor
# Module-level logger used by the functions below
logger = logging.getLogger(__name__)
def get_args():
    """Parse and post-process the command-line options of the benchmark sweep.

    After parsing, the namespace is extended/adjusted as follows:

    * ``model_size`` is derived from the last ``/``-separated component of
      ``--model-name`` with ``.`` replaced by ``-``.
    * ``log_folder`` falls back to ``./<model_size>-<precision>`` when it was
      not supplied, and the folder is created if it does not exist.
    * ``timeout`` is converted from minutes to seconds.

    Returns:
        argparse.Namespace: The parsed arguments with the adjustments above.
    """
    cli = argparse.ArgumentParser()

    # Input audio and decoding options
    cli.add_argument(
        "-a", "--audio-path", type=str, required=True, help="Path to folder of audio files for E2E evaluation"
    )
    cli.add_argument("-l", "--language", default=None, help="Language of audio file")
    cli.add_argument("-t", "--task", default=None, choices=["transcribe", "translate"], help="Task to complete")

    # Number of un-timed and timed iterations
    cli.add_argument("-w", "--warmup-runs", type=int, default=5)
    cli.add_argument("-n", "--num-runs", type=int, default=10)

    # Which engines to benchmark
    cli.add_argument(
        "--hf-pt-eager", default=False, action="store_true", help="Benchmark in PyTorch without `torch.compile`"
    )
    cli.add_argument(
        "--hf-pt-compile", default=False, action="store_true", help="Benchmark in PyTorch with `torch.compile`"
    )
    cli.add_argument(
        "--hf-ort-dir-path", type=str, help="Path to folder containing ONNX models for Optimum + ORT benchmarking"
    )
    cli.add_argument("--ort-model-path", type=str, help="Path to ONNX model for ORT benchmarking")

    # Model and hardware selection
    cli.add_argument(
        "--model-name", type=str, required=True, help="Model name in Hugging Face (e.g. openai/whisper-large-v2)"
    )
    cli.add_argument(
        "--precision", type=str, required=True, choices=["int8", "fp16", "fp32"], help="Precision to run model"
    )
    cli.add_argument(
        "--device", type=str, required=True, choices=["cpu", "cuda", "rocm"], help="Device to benchmark models"
    )
    cli.add_argument("--device-id", type=int, default=0, help="GPU device ID")

    # Miscellaneous
    cli.add_argument("--verbose", default=False, action="store_true", help="Print detailed logs")
    cli.add_argument(
        "--timeout", type=int, default=5, help="Number of mins to attempt the benchmark before moving on"
    )
    cli.add_argument("--log-folder", type=str, default=None, help="Path to folder to save logs and results")
    cli.add_argument("--tune", default=False, action="store_true")

    options = cli.parse_args()

    # Last component of the model name, with dots swapped for dashes
    options.model_size = options.model_name.split("/")[-1].replace(".", "-")

    # Default the log folder to one named after the model size and precision
    if not options.log_folder:
        options.log_folder = f"./{options.model_size}-{options.precision}"
    os.makedirs(options.log_folder, exist_ok=True)

    # The command line takes minutes; the rest of the script works in seconds
    options.timeout = options.timeout * 60

    return options
def _metric_value(line, pattern):
    """Return the first whitespace-delimited token that follows `pattern` in `line`.

    The pattern is located with `str.find`, so any text printed before it on
    the same line (such as a logger prefix) is ignored. An empty string is
    returned when nothing follows the pattern.
    """
    start = line.find(pattern) + len(pattern)
    tokens = line[start:].split()
    return tokens[0] if tokens else ""


def process_log_file(device_id, log_file, base_results):
    """Extract benchmark metrics from a log file and build one result row.

    The log is scanned line by line. Marker lines select the current pipeline
    step (audio loading, feature extraction, model evaluation); the
    "Latency: " / "Throughput: " lines that follow are attributed to that step.

    Args:
        device_id: Index into the per-device list of the GPU "peak=" memory entry.
        log_file: Path of the log file to parse.
        base_results: Leading columns of the row; its last element is read as
            the audio duration in seconds.

    Returns:
        A list containing a single row: `base_results` followed by token
        length, load-audio latency/throughput, feature-extraction
        latency/throughput (-1 when missing), latency, per-token latency in ms,
        throughput, memory, and the real-time factor (-1 when the duration is
        missing or not positive).

    Raises:
        ValueError: If a metric line carries a value that cannot be parsed.
    """
    entries = []

    # Detect steps in speech pipeline
    step = None
    load_audio_pattern = "Load audio: "
    feat_ext_pattern = "Feature extraction: "
    pytorch_pattern = "Evaluating PyTorch..."
    onnxruntime_pattern = "Evaluating ONNX Runtime..."

    load_audio_latency_s, load_audio_throughput_s = None, None
    feat_ext_latency_s, feat_ext_throughput_s = None, None
    token_length, latency_s, per_token_latency_ms = None, None, None
    throughput, memory = None, None

    # Detect metrics
    latency_pattern = "Latency: "
    throughput_pattern = "Throughput: "
    token_length_pattern = "Generated token length: "
    memory_pattern = "peak="

    with open(log_file) as f:
        for input_line in f:
            line = input_line.replace("\n", "")

            # Get step in speech recognition pipeline
            if load_audio_pattern in line:
                step = "load-audio"
            elif feat_ext_pattern in line:
                step = "feature-extraction"
            elif pytorch_pattern in line or onnxruntime_pattern in line:
                step = "process"

            # Check metrics
            if latency_pattern in line:
                latency_s = float(_metric_value(line, latency_pattern))
            elif throughput_pattern in line:
                throughput = float(_metric_value(line, throughput_pattern))
                # A throughput line closes the load-audio / feature-extraction step
                if step == "load-audio":
                    load_audio_latency_s, load_audio_throughput_s = latency_s, throughput
                    step = None
                elif step == "feature-extraction":
                    feat_ext_latency_s, feat_ext_throughput_s = latency_s, throughput
                    step = None
            elif token_length_pattern in line:
                token_length = int(_metric_value(line, token_length_pattern))
                # Avoid TypeError / ZeroDivisionError when no latency has been
                # seen yet or no tokens were generated
                if latency_s is not None and token_length > 0:
                    per_token_latency_ms = latency_s / token_length * 1000
                else:
                    per_token_latency_ms = None
            elif memory_pattern in line:
                if "CPU" in line:
                    # Example format for log entry:
                    # CPU memory usage: before=1000.0 MB, peak=2000.0 MB
                    memory = float(line[line.rfind("=") + 1 : line.rfind(" MB")]) / 1000
                else:
                    # Example format for log entry:
                    # GPU memory usage: before=[{'device_id': 0, 'name': '...', 'max_used_MB': 1638.875}, ...], peak=[{'device_id': 0, 'name': '...', 'max_used_MB': 1780.875}, ...]
                    # The text after "peak=" is turned into JSON by swapping the quote style
                    peak = line[line.find(memory_pattern) + len(memory_pattern) :].replace("'", '"')
                    usage = json.loads(peak)[device_id]["max_used_MB"]
                    memory = float(usage) / 1000

    # Calculate real-time factor (RTF):
    # RTF = total latency / audio duration
    total_latency = (
        (load_audio_latency_s if load_audio_latency_s else 0)
        + (feat_ext_latency_s if feat_ext_latency_s else 0)
        + (latency_s if latency_s else 0)
    )
    audio_duration = base_results[-1]
    # Only a positive duration yields a meaningful ratio; a missing, zero or
    # negative duration (e.g. a -1 sentinel) is reported as -1
    rtf = (total_latency / audio_duration) if audio_duration and audio_duration > 0 else -1
    logger.info("Total latency: %s s", total_latency)
    logger.info("Audio duration: %s s", audio_duration)
    logger.info("Real-time factor: %s", rtf)

    # Append log entry to list of entries
    entry = base_results + [  # noqa: RUF005
        token_length,
        load_audio_latency_s,
        load_audio_throughput_s,
        feat_ext_latency_s if feat_ext_latency_s else -1,
        feat_ext_throughput_s if feat_ext_throughput_s else -1,
        latency_s,
        per_token_latency_ms,
        throughput,
        memory,
        rtf,
    ]
    entries.append(entry)

    return entries
def _installed_ort_package():
    """Return ``(name, version)`` of the installed ONNX Runtime distribution.

    ``onnxruntime-gpu`` takes precedence when both distributions are installed.
    ``("", "")`` is returned when neither is found.
    """
    # importlib.metadata is used instead of the deprecated pkg_resources API,
    # which is not guaranteed to be installed alongside the interpreter
    from importlib import metadata  # noqa: PLC0415

    for name in ("onnxruntime-gpu", "onnxruntime"):
        try:
            return name, metadata.version(name)
        except metadata.PackageNotFoundError:
            continue
    return "", ""


def save_results(results, filename):
    """Write the collected benchmark rows to a CSV file and a sibling JSON file.

    Args:
        results: Rows whose values follow the column order listed below.
        filename: Destination path of the CSV file. The JSON file uses the same
            path with a trailing ``.csv`` swapped for ``.json`` (or ``.json``
            appended when the path does not end in ``.csv``).
    """
    import pandas as pd  # noqa: PLC0415

    int_columns = ["Warmup Runs", "Measured Runs", "Token Length"]
    float_columns = [
        "Duration (s)",
        "Load Audio Latency (s)",
        "Load Audio Throughput (qps)",
        "Feature Extractor Latency (s)",
        "Feature Extractor Throughput (qps)",
        "Latency (s)",
        "Per Token Latency (ms/token)",
        "Throughput (qps)",
        "Memory (GB)",
        "Real Time Factor (RTF)",
    ]

    df = pd.DataFrame(
        results,
        columns=[
            "Warmup Runs",
            "Measured Runs",
            "Model Name",
            "Engine",
            "Precision",
            "Device",
            "Audio File",
            "Duration (s)",
            "Token Length",
            "Load Audio Latency (s)",
            "Load Audio Throughput (qps)",
            "Feature Extractor Latency (s)",
            "Feature Extractor Throughput (qps)",
            "Latency (s)",
            "Per Token Latency (ms/token)",
            "Throughput (qps)",
            "Memory (GB)",
            "Real Time Factor (RTF)",
        ],
    )

    # Set column types
    # NOTE(review): the integer cast fails if a row has no token length (None) —
    # confirm whether such rows should be dropped or given a default
    for column in int_columns:
        df[column] = df[column].astype("int")
    for column in float_columns:
        df[column] = df[column].astype("float")

    # get package name and version
    ort_pkg_name, ort_pkg_version = _installed_ort_package()

    # Save results to csv with standard format
    records = []
    for _, row in df.iterrows():
        # ONNX Runtime rows are tagged with the ORT package; all others with torch
        if row["Engine"] == "onnxruntime":
            record = BenchmarkRecord(
                row["Model Name"], row["Precision"], row["Engine"], row["Device"], ort_pkg_name, ort_pkg_version
            )
        else:
            record = BenchmarkRecord(
                row["Model Name"], row["Precision"], row["Engine"], row["Device"], torch.__name__, torch.__version__
            )

        record.config.customized["audio_file"] = row["Audio File"]
        record.config.warmup_runs = row["Warmup Runs"]
        record.config.measured_runs = row["Measured Runs"]

        record.metrics.customized["duration"] = row["Duration (s)"]
        record.metrics.customized["token_length"] = row["Token Length"]
        record.metrics.customized["load_audio_latency"] = row["Load Audio Latency (s)"]
        record.metrics.customized["load_audio_throughput"] = row["Load Audio Throughput (qps)"]
        record.metrics.customized["feature_extractor_latency_s"] = row["Feature Extractor Latency (s)"]
        record.metrics.customized["feature_extractor_throughput_qps"] = row["Feature Extractor Throughput (qps)"]
        record.metrics.customized["per_token_latency_ms"] = row["Per Token Latency (ms/token)"]
        record.metrics.customized["rtf"] = row["Real Time Factor (RTF)"]

        # The table stores seconds; the record field is in milliseconds
        record.metrics.latency_ms_mean = row["Latency (s)"] * 1000
        record.metrics.throughput_qps = row["Throughput (qps)"]
        record.metrics.max_memory_usage_GB = row["Memory (GB)"]

        records.append(record)

    # Swap only a trailing ".csv" so directory names are untouched and the JSON
    # output can never land on the CSV path itself
    csv_suffix = ".csv"
    stem = filename[: -len(csv_suffix)] if filename.endswith(csv_suffix) else filename
    json_filename = stem + ".json"

    BenchmarkRecord.save_as_csv(filename, records)
    BenchmarkRecord.save_as_json(json_filename, records)
    logger.info(f"Results saved in {filename}!")
def benchmark(args, benchmark_cmd, engine, audio_file, duration):
    """Run one benchmark command in a subprocess and parse its log into result rows.

    The subprocess's stdout and stderr are both written to a timestamped log
    file inside ``args.log_folder``. The subprocess is killed when it exceeds
    ``args.timeout`` seconds; the log written so far is still parsed.

    Args:
        args: Parsed command-line arguments (reads ``log_folder``, ``timeout``,
            ``warmup_runs``, ``num_runs``, ``model_name``, ``precision``,
            ``device`` and ``device_id``).
        benchmark_cmd: Argument list passed to ``subprocess.Popen``.
        engine: Engine label used in the log file name and the result row.
        audio_file: Audio file name recorded in the result row.
        duration: Audio duration recorded as the last base column.

    Returns:
        The list of rows produced by ``process_log_file`` for this run.
    """
    # NOTE(review): the ":" characters in the timestamp are not valid in
    # Windows file names — confirm whether Windows needs to be supported
    log_filename = f"{engine}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.log"
    log_path = os.path.join(args.log_folder, log_filename)

    with open(log_path, "w") as log_file:
        process = subprocess.Popen(benchmark_cmd, stdout=log_file, stderr=log_file)
        try:
            return_code = process.wait(args.timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            # Reap the killed child so it has stopped writing to the log (and
            # does not linger as a zombie) before the log is closed and parsed
            process.wait()
            logger.warning("Benchmark for %s timed out after %s s; see %s", engine, args.timeout, log_path)
        else:
            if return_code != 0:
                logger.warning("Benchmark for %s exited with code %s; see %s", engine, return_code, log_path)

    # Create entries for csv
    logger.info("Gathering data from log files...")
    base_results = [
        args.warmup_runs,
        args.num_runs,
        args.model_name,
        engine,
        args.precision,
        args.device,
        audio_file,
        duration,
    ]
    results = process_log_file(args.device_id, log_path, base_results)

    return results
def _build_benchmark_cmd(args, audio_path, benchmark_type, model_args=(), trailing_args=()):
    """Assemble the argument list for one ``models.whisper.benchmark`` subprocess.

    ``model_args`` are inserted right after the benchmark type and
    ``trailing_args`` are appended at the end of the command.
    """
    import sys  # noqa: PLC0415

    return [
        # Use the interpreter running this script so the child sees the same
        # environment; a bare "python" may be missing or a different install
        sys.executable or "python",
        "-m",
        "models.whisper.benchmark",
        "--audio-path",
        audio_path,
        "--benchmark-type",
        benchmark_type,
        *model_args,
        "--model-name",
        args.model_name,
        "--precision",
        args.precision,
        "--device",
        args.device,
        "--device-id",
        str(args.device_id),
        "--warmup-runs",
        str(args.warmup_runs),
        "--num-runs",
        str(args.num_runs),
        "--log-folder",
        args.log_folder,
        *trailing_args,
    ]


def main():
    """Benchmark every audio file in ``--audio-path`` with each selected engine.

    For each file, one subprocess is launched per enabled engine (PyTorch
    eager, PyTorch with ``torch.compile``, Optimum + ONNX Runtime, ONNX
    Runtime). All collected rows are written to a timestamped CSV (and JSON)
    file in the log folder.
    """
    args = get_args()
    setup_logger(args.verbose)
    logger.info(args.__dict__)
    torch.backends.cudnn.benchmark = True

    config = WhisperConfig.from_pretrained(args.model_name)
    processor = WhisperProcessor.from_pretrained(args.model_name)

    # Calculate forced decoder input ids
    hf_forced_decoder_ids = processor.get_decoder_prompt_ids(language=args.language, task=args.task)
    ort_forced_decoder_ids = [config.decoder_start_token_id] + [token_id[1] for token_id in hf_forced_decoder_ids]

    # The ids are only forwarded when both a language and a task were given
    forward_decoder_ids = args.language and args.task
    hf_decoder_input_ids_cmd = ["--decoder-input-ids", str(hf_forced_decoder_ids)] if forward_decoder_ids else []
    ort_decoder_input_ids_cmd = ["--decoder-input-ids", str(ort_forced_decoder_ids)] if forward_decoder_ids else []
    ort_tune_cmd = ["--tune"] if args.tune else []

    # One tuple per enabled engine:
    # (engine label, benchmark type, model-location args, trailing args, log message)
    runs = []
    if args.hf_pt_eager:
        # Benchmark PyTorch without torch.compile
        runs.append(
            ("pytorch-eager", "hf-pt-eager", [], hf_decoder_input_ids_cmd, "Benchmark PyTorch without torch.compile")
        )
    if args.hf_pt_compile:
        # Benchmark PyTorch with torch.compile
        runs.append(
            ("pytorch-compile", "hf-pt-compile", [], hf_decoder_input_ids_cmd, "Benchmark PyTorch with torch.compile")
        )
    if args.hf_ort_dir_path:
        # Benchmark Optimum + ONNX Runtime
        runs.append(
            (
                "optimum-ort",
                "hf-ort",
                ["--hf-ort-dir-path", args.hf_ort_dir_path],
                hf_decoder_input_ids_cmd,
                "Benchmark Optimum + ONNX Runtime",
            )
        )
    if args.ort_model_path:
        # Benchmark ONNX Runtime
        runs.append(
            (
                "onnxruntime",
                "ort",
                ["--ort-model-path", args.ort_model_path],
                ort_decoder_input_ids_cmd + ort_tune_cmd,
                "Benchmark ONNX Runtime",
            )
        )

    all_results = []
    # Sorted so the order of result rows is reproducible across runs
    for audio_file in sorted(os.listdir(args.audio_path)):
        audio_path = os.path.join(args.audio_path, audio_file)
        try:
            duration = librosa.get_duration(path=audio_path)
        except Exception as e:
            # -1 marks the duration as unknown; the benchmarks still run
            duration = -1
            logger.warning(f"An error occurred while trying to calculate the audio duration: {e}", exc_info=True)
            logger.warning(
                f"If you get an error that says:\n\tsoundfile.LibsndfileError: Error opening '{audio_file}': File contains data in an unknown format.\nyou may not have installed `ffmpeg` in addition to installing `librosa`."
            )
        logger.info(f"Testing {audio_path}...")

        for engine, benchmark_type, model_args, trailing_args, description in runs:
            benchmark_cmd = _build_benchmark_cmd(args, audio_path, benchmark_type, model_args, trailing_args)
            logger.info(description)
            results = benchmark(args, benchmark_cmd, engine, audio_file, duration)
            all_results.extend(results)

    csv_file = f"{args.model_size}-{args.precision}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv"
    save_results(all_results, os.path.join(args.log_folder, csv_file))
# Run the benchmark sweep only when this file is executed as a script
if __name__ == "__main__":
    main()
|