# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.  All rights reserved.
# Licensed under the MIT License.  See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import argparse
import datetime
import importlib.metadata
import json
import logging
import os
import subprocess
import sys

import librosa
import torch
from benchmark_helper import setup_logger
from metrics import BenchmarkRecord
from transformers import WhisperConfig, WhisperProcessor
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
  18. def get_args():
  19. parser = argparse.ArgumentParser()
  20. parser.add_argument(
  21. "-a",
  22. "--audio-path",
  23. type=str,
  24. required=True,
  25. help="Path to folder of audio files for E2E evaluation",
  26. )
  27. parser.add_argument(
  28. "-l",
  29. "--language",
  30. default=None,
  31. help="Language of audio file",
  32. )
  33. parser.add_argument(
  34. "-t",
  35. "--task",
  36. default=None,
  37. choices=["transcribe", "translate"],
  38. help="Task to complete",
  39. )
  40. parser.add_argument(
  41. "-w",
  42. "--warmup-runs",
  43. type=int,
  44. default=5,
  45. )
  46. parser.add_argument(
  47. "-n",
  48. "--num-runs",
  49. type=int,
  50. default=10,
  51. )
  52. parser.add_argument(
  53. "--hf-pt-eager",
  54. default=False,
  55. action="store_true",
  56. help="Benchmark in PyTorch without `torch.compile`",
  57. )
  58. parser.add_argument(
  59. "--hf-pt-compile",
  60. default=False,
  61. action="store_true",
  62. help="Benchmark in PyTorch with `torch.compile`",
  63. )
  64. parser.add_argument(
  65. "--hf-ort-dir-path",
  66. type=str,
  67. help="Path to folder containing ONNX models for Optimum + ORT benchmarking",
  68. )
  69. parser.add_argument(
  70. "--ort-model-path",
  71. type=str,
  72. help="Path to ONNX model for ORT benchmarking",
  73. )
  74. parser.add_argument(
  75. "--model-name",
  76. type=str,
  77. required=True,
  78. help="Model name in Hugging Face (e.g. openai/whisper-large-v2)",
  79. )
  80. parser.add_argument(
  81. "--precision",
  82. type=str,
  83. required=True,
  84. choices=["int8", "fp16", "fp32"],
  85. help="Precision to run model",
  86. )
  87. parser.add_argument(
  88. "--device",
  89. type=str,
  90. required=True,
  91. choices=["cpu", "cuda", "rocm"],
  92. help="Device to benchmark models",
  93. )
  94. parser.add_argument(
  95. "--device-id",
  96. type=int,
  97. default=0,
  98. help="GPU device ID",
  99. )
  100. parser.add_argument(
  101. "--verbose",
  102. default=False,
  103. action="store_true",
  104. help="Print detailed logs",
  105. )
  106. parser.add_argument(
  107. "--timeout",
  108. type=int,
  109. default=5,
  110. help="Number of mins to attempt the benchmark before moving on",
  111. )
  112. parser.add_argument(
  113. "--log-folder",
  114. type=str,
  115. default=None,
  116. help="Path to folder to save logs and results",
  117. )
  118. parser.add_argument("--tune", default=False, action="store_true")
  119. args = parser.parse_args()
  120. setattr(args, "model_size", args.model_name.split("/")[-1].replace(".", "-")) # noqa: B010
  121. log_folder_name = f"./{args.model_size}-{args.precision}"
  122. if not args.log_folder:
  123. args.log_folder = log_folder_name
  124. os.makedirs(args.log_folder, exist_ok=True)
  125. # Convert timeout value to secs
  126. args.timeout *= 60
  127. return args
  128. def process_log_file(device_id, log_file, base_results):
  129. entries = []
  130. # Detect steps in speech pipeline
  131. step = None
  132. load_audio_pattern = "Load audio: "
  133. feat_ext_pattern = "Feature extraction: "
  134. pytorch_pattern = "Evaluating PyTorch..."
  135. onnxruntime_pattern = "Evaluating ONNX Runtime..."
  136. load_audio_latency_s, load_audio_throughput_s = None, None
  137. feat_ext_latency_s, feat_ext_throughput_s = None, None
  138. token_length, latency_s, per_token_latency_s, per_token_latency_ms = None, None, None, None
  139. throughput, memory = None, None
  140. # Detect metrics
  141. latency_pattern = "Latency: "
  142. throughput_pattern = "Throughput: "
  143. token_length_pattern = "Generated token length: "
  144. memory_pattern = "peak="
  145. with open(log_file) as f:
  146. for input_line in f:
  147. line = input_line.replace("\n", "")
  148. # Get step in speech recognition pipeline
  149. if load_audio_pattern in line:
  150. step = "load-audio"
  151. elif feat_ext_pattern in line:
  152. step = "feature-extraction"
  153. elif pytorch_pattern in line or onnxruntime_pattern in line:
  154. step = "process"
  155. # Check metrics
  156. if latency_pattern in line:
  157. latency_s = float(line[len(latency_pattern) : line.rfind(" ")])
  158. elif throughput_pattern in line:
  159. throughput = float(line[len(throughput_pattern) : line.rfind(" ")])
  160. if step == "load-audio":
  161. load_audio_latency_s, load_audio_throughput_s = latency_s, throughput
  162. step = None
  163. if step == "feature-extraction":
  164. feat_ext_latency_s, feat_ext_throughput_s = latency_s, throughput
  165. step = None
  166. elif token_length_pattern in line:
  167. token_length = int(line[len(token_length_pattern) : line.rfind(" ")])
  168. per_token_latency_s = latency_s / token_length
  169. per_token_latency_ms = per_token_latency_s * 1000
  170. elif memory_pattern in line:
  171. if "CPU" in line:
  172. # Example format for log entry:
  173. # CPU memory usage: before=1000.0 MB, peak=2000.0 MB
  174. memory = float(line[line.rfind("=") + 1 : line.rfind(" MB")]) / 1000
  175. else:
  176. # Example format for log entry:
  177. # GPU memory usage: before=[{'device_id': 0, 'name': 'Tesla V100-PCIE-16GB', 'max_used_MB': 1638.875}, {'device_id': 1, 'name': 'Tesla V100-PCIE-16GB', 'max_used_MB': 236.875}, peak=[{'device_id': 0, 'name': 'Tesla V100-PCIE-16GB', 'max_used_MB': 1780.875}, {'device_id': 1, 'name': 'Tesla V100-PCIE-16GB', 'max_used_MB': 236.875}]
  178. peak = line[line.find(memory_pattern) + len(memory_pattern) :].replace("'", '"')
  179. usage = json.loads(peak)[device_id]["max_used_MB"]
  180. memory = float(usage) / 1000
  181. # Calculate real-time factor (RTF):
  182. # RTF = total latency / audio duration
  183. total_latency = (
  184. (load_audio_latency_s if load_audio_latency_s else 0)
  185. + (feat_ext_latency_s if feat_ext_latency_s else 0)
  186. + (latency_s if latency_s else 0)
  187. )
  188. audio_duration = base_results[-1]
  189. rtf = (total_latency / audio_duration) if audio_duration else -1
  190. logger.info(f"Total latency: {total_latency} s")
  191. logger.info(f"Audio duration: {audio_duration} s")
  192. logger.info(f"Real-time factor: {rtf}")
  193. # Append log entry to list of entries
  194. entry = base_results + [ # noqa: RUF005
  195. token_length,
  196. load_audio_latency_s,
  197. load_audio_throughput_s,
  198. feat_ext_latency_s if feat_ext_latency_s else -1,
  199. feat_ext_throughput_s if feat_ext_throughput_s else -1,
  200. latency_s,
  201. per_token_latency_ms,
  202. throughput,
  203. memory,
  204. rtf,
  205. ]
  206. entries.append(entry)
  207. return entries
  208. def save_results(results, filename):
  209. import pandas as pd # noqa: PLC0415
  210. df = pd.DataFrame(
  211. results,
  212. columns=[
  213. "Warmup Runs",
  214. "Measured Runs",
  215. "Model Name",
  216. "Engine",
  217. "Precision",
  218. "Device",
  219. "Audio File",
  220. "Duration (s)",
  221. "Token Length",
  222. "Load Audio Latency (s)",
  223. "Load Audio Throughput (qps)",
  224. "Feature Extractor Latency (s)",
  225. "Feature Extractor Throughput (qps)",
  226. "Latency (s)",
  227. "Per Token Latency (ms/token)",
  228. "Throughput (qps)",
  229. "Memory (GB)",
  230. "Real Time Factor (RTF)",
  231. ],
  232. )
  233. # Set column types
  234. df["Warmup Runs"] = df["Warmup Runs"].astype("int")
  235. df["Measured Runs"] = df["Measured Runs"].astype("int")
  236. df["Duration (s)"] = df["Duration (s)"].astype("float")
  237. df["Token Length"] = df["Token Length"].astype("int")
  238. df["Load Audio Latency (s)"] = df["Load Audio Latency (s)"].astype("float")
  239. df["Load Audio Throughput (qps)"] = df["Load Audio Throughput (qps)"].astype("float")
  240. df["Feature Extractor Latency (s)"] = df["Feature Extractor Latency (s)"].astype("float")
  241. df["Feature Extractor Throughput (qps)"] = df["Feature Extractor Throughput (qps)"].astype("float")
  242. df["Latency (s)"] = df["Latency (s)"].astype("float")
  243. df["Per Token Latency (ms/token)"] = df["Per Token Latency (ms/token)"].astype("float")
  244. df["Throughput (qps)"] = df["Throughput (qps)"].astype("float")
  245. df["Memory (GB)"] = df["Memory (GB)"].astype("float")
  246. df["Real Time Factor (RTF)"] = df["Real Time Factor (RTF)"].astype("float")
  247. # get package name and version
  248. import pkg_resources # noqa: PLC0415
  249. installed_packages = pkg_resources.working_set
  250. installed_packages_list = sorted(
  251. [f"{i.key}=={i.version}" for i in installed_packages if i.key in ["onnxruntime", "onnxruntime-gpu"]]
  252. )
  253. ort_pkg_name = ""
  254. ort_pkg_version = ""
  255. if installed_packages_list:
  256. ort_pkg_name = installed_packages_list[0].split("==")[0]
  257. ort_pkg_version = installed_packages_list[0].split("==")[1]
  258. # Save results to csv with standard format
  259. records = []
  260. for _, row in df.iterrows():
  261. if row["Engine"] == "onnxruntime":
  262. record = BenchmarkRecord(
  263. row["Model Name"], row["Precision"], row["Engine"], row["Device"], ort_pkg_name, ort_pkg_version
  264. )
  265. else:
  266. record = BenchmarkRecord(
  267. row["Model Name"], row["Precision"], row["Engine"], row["Device"], torch.__name__, torch.__version__
  268. )
  269. record.config.customized["audio_file"] = row["Audio File"]
  270. record.config.warmup_runs = row["Warmup Runs"]
  271. record.config.measured_runs = row["Measured Runs"]
  272. record.metrics.customized["duration"] = row["Duration (s)"]
  273. record.metrics.customized["token_length"] = row["Token Length"]
  274. record.metrics.customized["load_audio_latency"] = row["Load Audio Latency (s)"]
  275. record.metrics.customized["load_audio_throughput"] = row["Load Audio Throughput (qps)"]
  276. record.metrics.customized["feature_extractor_latency_s"] = row["Feature Extractor Latency (s)"]
  277. record.metrics.customized["feature_extractor_throughput_qps"] = row["Feature Extractor Throughput (qps)"]
  278. record.metrics.customized["per_token_latency_ms"] = row["Per Token Latency (ms/token)"]
  279. record.metrics.customized["rtf"] = row["Real Time Factor (RTF)"]
  280. record.metrics.latency_ms_mean = row["Latency (s)"] * 1000
  281. record.metrics.throughput_qps = row["Throughput (qps)"]
  282. record.metrics.max_memory_usage_GB = row["Memory (GB)"]
  283. records.append(record)
  284. BenchmarkRecord.save_as_csv(filename, records)
  285. BenchmarkRecord.save_as_json(filename.replace(".csv", ".json"), records)
  286. logger.info(f"Results saved in {filename}!")
  287. def benchmark(args, benchmark_cmd, engine, audio_file, duration):
  288. log_filename = f"{engine}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.log"
  289. log_path = os.path.join(args.log_folder, log_filename)
  290. with open(log_path, "w") as log_file:
  291. process = subprocess.Popen(benchmark_cmd, stdout=log_file, stderr=log_file)
  292. try:
  293. process.wait(args.timeout)
  294. except subprocess.TimeoutExpired:
  295. process.kill()
  296. # Create entries for csv
  297. logger.info("Gathering data from log files...")
  298. base_results = [
  299. args.warmup_runs,
  300. args.num_runs,
  301. args.model_name,
  302. engine,
  303. args.precision,
  304. args.device,
  305. audio_file,
  306. duration,
  307. ]
  308. results = process_log_file(args.device_id, log_path, base_results)
  309. return results
  310. def main():
  311. args = get_args()
  312. setup_logger(args.verbose)
  313. logger.info(args.__dict__)
  314. torch.backends.cudnn.benchmark = True
  315. config = WhisperConfig.from_pretrained(args.model_name)
  316. processor = WhisperProcessor.from_pretrained(args.model_name)
  317. # Calculate forced decoder input ids
  318. hf_forced_decoder_ids = processor.get_decoder_prompt_ids(language=args.language, task=args.task)
  319. ort_forced_decoder_ids = [config.decoder_start_token_id] + [token_id[1] for token_id in hf_forced_decoder_ids]
  320. hf_decoder_input_ids_cmd = (
  321. ["--decoder-input-ids", str(hf_forced_decoder_ids)] if args.language and args.task else []
  322. )
  323. ort_decoder_input_ids_cmd = (
  324. ["--decoder-input-ids", str(ort_forced_decoder_ids)] if args.language and args.task else []
  325. )
  326. ort_tune_cmd = ["--tune"] if args.tune else []
  327. all_results = []
  328. for audio_file in os.listdir(args.audio_path):
  329. audio_path = os.path.join(args.audio_path, audio_file)
  330. try:
  331. duration = librosa.get_duration(path=audio_path)
  332. except Exception as e:
  333. duration = -1
  334. logger.warning(f"An error occurred while trying to calculate the audio duration: {e}", exc_info=True)
  335. logger.warning(
  336. f"If you get an error that says:\n\tsoundfile.LibsndfileError: Error opening '{audio_file}': File contains data in an unknown format.\nyou may not have installed `ffmpeg` in addition to installing `librosa`."
  337. )
  338. logger.info(f"Testing {audio_path}...")
  339. # Benchmark PyTorch without torch.compile
  340. if args.hf_pt_eager:
  341. benchmark_cmd = [ # noqa: RUF005
  342. "python",
  343. "-m",
  344. "models.whisper.benchmark",
  345. "--audio-path",
  346. audio_path,
  347. "--benchmark-type",
  348. "hf-pt-eager",
  349. "--model-name",
  350. args.model_name,
  351. "--precision",
  352. args.precision,
  353. "--device",
  354. args.device,
  355. "--device-id",
  356. str(args.device_id),
  357. "--warmup-runs",
  358. str(args.warmup_runs),
  359. "--num-runs",
  360. str(args.num_runs),
  361. "--log-folder",
  362. args.log_folder,
  363. ] + hf_decoder_input_ids_cmd
  364. logger.info("Benchmark PyTorch without torch.compile")
  365. results = benchmark(args, benchmark_cmd, "pytorch-eager", audio_file, duration)
  366. all_results.extend(results)
  367. # Benchmark PyTorch with torch.compile
  368. if args.hf_pt_compile:
  369. benchmark_cmd = [ # noqa: RUF005
  370. "python",
  371. "-m",
  372. "models.whisper.benchmark",
  373. "--audio-path",
  374. audio_path,
  375. "--benchmark-type",
  376. "hf-pt-compile",
  377. "--model-name",
  378. args.model_name,
  379. "--precision",
  380. args.precision,
  381. "--device",
  382. args.device,
  383. "--device-id",
  384. str(args.device_id),
  385. "--warmup-runs",
  386. str(args.warmup_runs),
  387. "--num-runs",
  388. str(args.num_runs),
  389. "--log-folder",
  390. args.log_folder,
  391. ] + hf_decoder_input_ids_cmd
  392. logger.info("Benchmark PyTorch with torch.compile")
  393. results = benchmark(args, benchmark_cmd, "pytorch-compile", audio_file, duration)
  394. all_results.extend(results)
  395. # Benchmark Optimum + ONNX Runtime
  396. if args.hf_ort_dir_path:
  397. benchmark_cmd = [ # noqa: RUF005
  398. "python",
  399. "-m",
  400. "models.whisper.benchmark",
  401. "--audio-path",
  402. audio_path,
  403. "--benchmark-type",
  404. "hf-ort",
  405. "--hf-ort-dir-path",
  406. args.hf_ort_dir_path,
  407. "--model-name",
  408. args.model_name,
  409. "--precision",
  410. args.precision,
  411. "--device",
  412. args.device,
  413. "--device-id",
  414. str(args.device_id),
  415. "--warmup-runs",
  416. str(args.warmup_runs),
  417. "--num-runs",
  418. str(args.num_runs),
  419. "--log-folder",
  420. args.log_folder,
  421. ] + hf_decoder_input_ids_cmd
  422. logger.info("Benchmark Optimum + ONNX Runtime")
  423. results = benchmark(args, benchmark_cmd, "optimum-ort", audio_file, duration)
  424. all_results.extend(results)
  425. # Benchmark ONNX Runtime
  426. if args.ort_model_path:
  427. benchmark_cmd = (
  428. [ # noqa: RUF005
  429. "python",
  430. "-m",
  431. "models.whisper.benchmark",
  432. "--audio-path",
  433. audio_path,
  434. "--benchmark-type",
  435. "ort",
  436. "--ort-model-path",
  437. args.ort_model_path,
  438. "--model-name",
  439. args.model_name,
  440. "--precision",
  441. args.precision,
  442. "--device",
  443. args.device,
  444. "--device-id",
  445. str(args.device_id),
  446. "--warmup-runs",
  447. str(args.warmup_runs),
  448. "--num-runs",
  449. str(args.num_runs),
  450. "--log-folder",
  451. args.log_folder,
  452. ]
  453. + ort_decoder_input_ids_cmd
  454. + ort_tune_cmd
  455. )
  456. logger.info("Benchmark ONNX Runtime")
  457. results = benchmark(args, benchmark_cmd, "onnxruntime", audio_file, duration)
  458. all_results.extend(results)
  459. csv_file = f"{args.model_size}-{args.precision}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv"
  460. save_results(all_results, os.path.join(args.log_folder, csv_file))
# Run the benchmark sweep only when this file is executed as a script.
if __name__ == "__main__":
    main()