# benchmark_all.py
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import argparse
import datetime
import json
import logging
import os
import subprocess
import sys

import torch
from benchmark_helper import setup_logger
from metrics import BenchmarkRecord
  15. logger = logging.getLogger(__name__)
  16. def get_args():
  17. parser = argparse.ArgumentParser()
  18. parser.add_argument(
  19. "-b",
  20. "--batch-sizes",
  21. type=str,
  22. default="1 2",
  23. )
  24. parser.add_argument(
  25. "-s",
  26. "--sequence-lengths",
  27. type=str,
  28. default="8 16 32 64 128 256 512",
  29. )
  30. parser.add_argument(
  31. "-w",
  32. "--warmup-runs",
  33. type=int,
  34. default=5,
  35. )
  36. parser.add_argument(
  37. "-n",
  38. "--num-runs",
  39. type=int,
  40. default=1000,
  41. )
  42. parser.add_argument(
  43. "--hf-pt-eager",
  44. default=False,
  45. action="store_true",
  46. help="Benchmark in PyTorch without `torch.compile`",
  47. )
  48. parser.add_argument(
  49. "--hf-pt-compile",
  50. default=False,
  51. action="store_true",
  52. help="Benchmark in PyTorch with `torch.compile`",
  53. )
  54. parser.add_argument(
  55. "--hf-ort-dir-path",
  56. type=str,
  57. default="",
  58. help="Path to folder containing ONNX models for Optimum + ORT benchmarking",
  59. )
  60. parser.add_argument(
  61. "--ort-msft-model-path",
  62. type=str,
  63. default="",
  64. help="Path to ONNX model from https://github.com/microsoft/Llama-2-Onnx",
  65. )
  66. parser.add_argument(
  67. "--ort-convert-to-onnx-model-path",
  68. type=str,
  69. default="",
  70. help="Path to ONNX model from convert_to_onnx",
  71. )
  72. parser.add_argument(
  73. "--cache-dir",
  74. type=str,
  75. default="./model_cache",
  76. help="Cache dir where Hugging Face files are stored",
  77. )
  78. parser.add_argument(
  79. "--model-name",
  80. type=str,
  81. required=True,
  82. help="Model name in Hugging Face",
  83. )
  84. parser.add_argument(
  85. "--precision",
  86. type=str,
  87. required=True,
  88. choices=["int4", "int8", "fp16", "fp32"],
  89. help="Precision to run model",
  90. )
  91. parser.add_argument(
  92. "--device",
  93. type=str,
  94. required=True,
  95. choices=["cpu", "cuda", "rocm"],
  96. help="Device to benchmark models",
  97. )
  98. parser.add_argument(
  99. "--device-id",
  100. type=int,
  101. default=0,
  102. help="GPU device ID",
  103. )
  104. parser.add_argument(
  105. "--verbose",
  106. default=False,
  107. action="store_true",
  108. help="Print detailed logs",
  109. )
  110. parser.add_argument(
  111. "--timeout",
  112. type=int,
  113. default=10,
  114. help="Number of mins to attempt the benchmark before moving on",
  115. )
  116. parser.add_argument(
  117. "--log-folder",
  118. type=str,
  119. default=None,
  120. help="Path to folder to save logs and results",
  121. )
  122. args = parser.parse_args()
  123. setattr(args, "model_size", args.model_name.split("/")[-1].replace(".", "-")) # noqa: B010
  124. log_folder_name = f"./{args.model_size}_{args.precision}"
  125. if not args.log_folder:
  126. args.log_folder = log_folder_name
  127. os.makedirs(args.log_folder, exist_ok=True)
  128. # Convert timeout value to secs
  129. args.timeout *= 60
  130. return args
  131. def process_log_file(device_id, log_file, base_results):
  132. entries = []
  133. batch_size, sequence_length, step = None, None, None
  134. latency_s, latency_ms, throughput, memory = None, None, None, None
  135. batch_pattern = "Batch Size: "
  136. sequence_pattern = "Sequence Length: "
  137. prompt_step_pattern = "to get past_key_values"
  138. per_token_step_pattern = "with past_key_values"
  139. latency_pattern = "Latency: "
  140. throughput_pattern = "Throughput: "
  141. memory_pattern = "peak="
  142. with open(log_file) as f:
  143. for input_line in f:
  144. line = input_line.replace("\n", "")
  145. if batch_pattern in line:
  146. batch_size = int(line[len(batch_pattern) :])
  147. elif sequence_pattern in line:
  148. sequence_length = int(line[len(sequence_pattern) :])
  149. elif prompt_step_pattern in line:
  150. step = "prompt"
  151. elif per_token_step_pattern in line:
  152. step = "per-token"
  153. elif latency_pattern in line:
  154. latency_s = float(line[len(latency_pattern) : line.rfind(" ")])
  155. latency_ms = latency_s * 1000
  156. elif throughput_pattern in line:
  157. throughput = float(line[len(throughput_pattern) : line.rfind(" ")])
  158. elif memory_pattern in line:
  159. if "CPU" in line:
  160. # Example format for log entry:
  161. # CPU memory usage: before=1000.0 MB, peak=2000.0 MB
  162. memory = float(line[line.rfind("=") + 1 : line.rfind(" MB")]) / 1000
  163. else:
  164. # Example format for log entry:
  165. # GPU memory usage: before=[{'device_id': 0, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 69637.25}, {'device_id': 1, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 890.625}] peak=[{'device_id': 0, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 73861.25}, {'device_id': 1, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 890.625}]
  166. peak = line[line.find(memory_pattern) + len(memory_pattern) :].replace("'", '"')
  167. usage = json.loads(peak)[device_id]["max_used_MB"]
  168. memory = float(usage) / 1000
  169. # Append log entry to list of entries
  170. entry = base_results + [ # noqa: RUF005
  171. batch_size,
  172. sequence_length,
  173. step,
  174. latency_s,
  175. latency_ms,
  176. throughput,
  177. memory,
  178. ]
  179. entries.append(entry)
  180. return entries
  181. def save_results(results, filename):
  182. import pandas as pd # noqa: PLC0415
  183. df = pd.DataFrame(
  184. results,
  185. columns=[
  186. "Warmup Runs",
  187. "Measured Runs",
  188. "Model Name",
  189. "Engine",
  190. "Precision",
  191. "Device",
  192. "Batch Size",
  193. "Sequence Length",
  194. "Step",
  195. "Latency (s)",
  196. "Latency (ms)",
  197. "Throughput (tps)",
  198. "Memory (GB)",
  199. ],
  200. )
  201. # Set column types
  202. df["Warmup Runs"] = df["Warmup Runs"].astype("int")
  203. df["Measured Runs"] = df["Measured Runs"].astype("int")
  204. df["Batch Size"] = df["Batch Size"].astype("int")
  205. df["Sequence Length"] = df["Sequence Length"].astype("int")
  206. df["Latency (s)"] = df["Latency (s)"].astype("float")
  207. df["Latency (ms)"] = df["Latency (ms)"].astype("float")
  208. df["Throughput (tps)"] = df["Throughput (tps)"].astype("float")
  209. df["Memory (GB)"] = df["Memory (GB)"].astype("float")
  210. # get package name and version
  211. import pkg_resources # noqa: PLC0415
  212. installed_packages = pkg_resources.working_set
  213. installed_packages_list = sorted(
  214. [f"{i.key}=={i.version}" for i in installed_packages if i.key in ["onnxruntime", "onnxruntime-gpu"]]
  215. )
  216. ort_pkg_name = ""
  217. ort_pkg_version = ""
  218. if installed_packages_list:
  219. ort_pkg_name = installed_packages_list[0].split("==")[0]
  220. ort_pkg_version = installed_packages_list[0].split("==")[1]
  221. # Save results to csv with standard format
  222. records = []
  223. for _, row in df.iterrows():
  224. if row["Engine"] in ["optimum-ort", "onnxruntime"]:
  225. record = BenchmarkRecord(
  226. row["Model Name"], row["Precision"], "onnxruntime", row["Device"], ort_pkg_name, ort_pkg_version
  227. )
  228. elif row["Engine"] in ["pytorch-eager", "pytorch-compile"]:
  229. record = BenchmarkRecord(
  230. row["Model Name"], row["Precision"], "pytorch", row["Device"], torch.__name__, torch.__version__
  231. )
  232. else:
  233. record = BenchmarkRecord(row["Model Name"], row["Precision"], row["Engine"], row["Device"], "", "")
  234. record.config.warmup_runs = row["Warmup Runs"]
  235. record.config.measured_runs = row["Measured Runs"]
  236. record.config.batch_size = row["Batch Size"]
  237. record.config.seq_length = row["Sequence Length"]
  238. record.config.customized["measure_step"] = row["Step"]
  239. record.config.customized["engine"] = row["Engine"]
  240. record.metrics.customized["latency_s_mean"] = row["Latency (s)"]
  241. record.metrics.latency_ms_mean = row["Latency (ms)"]
  242. record.metrics.customized["throughput_tps"] = row["Throughput (tps)"]
  243. record.metrics.max_memory_usage_GB = row["Memory (GB)"]
  244. records.append(record)
  245. BenchmarkRecord.save_as_csv(filename, records)
  246. BenchmarkRecord.save_as_json(filename.replace(".csv", ".json"), records)
  247. logger.info(f"Results saved in {filename}!")
  248. def benchmark(args, benchmark_cmd, engine):
  249. log_filename = f"{engine}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.log"
  250. log_path = os.path.join(args.log_folder, log_filename)
  251. with open(log_path, "w") as log_file:
  252. process = subprocess.Popen(benchmark_cmd, stdout=log_file, stderr=log_file)
  253. try:
  254. process.wait(args.timeout)
  255. except subprocess.TimeoutExpired:
  256. process.kill()
  257. # Create entries for csv
  258. logger.info("Gathering data from log files...")
  259. base_results = [args.warmup_runs, args.num_runs, args.model_name, engine, args.precision, args.device]
  260. results = process_log_file(args.device_id, log_path, base_results)
  261. return results
  262. def main():
  263. args = get_args()
  264. setup_logger(args.verbose)
  265. logger.info(args.__dict__)
  266. torch.backends.cudnn.benchmark = True
  267. all_results = []
  268. os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device_id)
  269. # Benchmark PyTorch without torch.compile
  270. if args.hf_pt_eager:
  271. benchmark_cmd = [
  272. "python",
  273. "-m",
  274. "models.llama.benchmark",
  275. "--benchmark-type",
  276. "hf-pt-eager",
  277. "--model-name",
  278. args.model_name,
  279. "--precision",
  280. args.precision,
  281. "--batch-sizes",
  282. args.batch_sizes,
  283. "--sequence-lengths",
  284. args.sequence_lengths,
  285. "--device",
  286. args.device,
  287. "--warmup-runs",
  288. str(args.warmup_runs),
  289. "--num-runs",
  290. str(args.num_runs),
  291. "--log-folder",
  292. args.log_folder,
  293. "--cache-dir",
  294. args.cache_dir,
  295. "--auth",
  296. ]
  297. logger.info("Benchmark PyTorch without torch.compile")
  298. results = benchmark(args, benchmark_cmd, "pytorch-eager")
  299. all_results.extend(results)
  300. # Benchmark PyTorch with torch.compile
  301. if args.hf_pt_compile:
  302. benchmark_cmd = [
  303. "python",
  304. "-m",
  305. "models.llama.benchmark",
  306. "--benchmark-type",
  307. "hf-pt-compile",
  308. "--model-name",
  309. args.model_name,
  310. "--precision",
  311. args.precision,
  312. "--batch-sizes",
  313. args.batch_sizes,
  314. "--sequence-lengths",
  315. args.sequence_lengths,
  316. "--device",
  317. args.device,
  318. "--warmup-runs",
  319. str(args.warmup_runs),
  320. "--num-runs",
  321. str(args.num_runs),
  322. "--log-folder",
  323. args.log_folder,
  324. "--cache-dir",
  325. args.cache_dir,
  326. "--auth",
  327. ]
  328. logger.info("Benchmark PyTorch with torch.compile")
  329. results = benchmark(args, benchmark_cmd, "pytorch-compile")
  330. all_results.extend(results)
  331. # Benchmark Optimum + ONNX Runtime
  332. if args.hf_ort_dir_path:
  333. benchmark_cmd = [
  334. "python",
  335. "-m",
  336. "models.llama.benchmark",
  337. "--benchmark-type",
  338. "hf-ort",
  339. "--hf-ort-dir-path",
  340. args.hf_ort_dir_path,
  341. "--model-name",
  342. args.model_name,
  343. "--precision",
  344. args.precision,
  345. "--batch-sizes",
  346. args.batch_sizes,
  347. "--sequence-lengths",
  348. args.sequence_lengths,
  349. "--device",
  350. args.device,
  351. "--warmup-runs",
  352. str(args.warmup_runs),
  353. "--num-runs",
  354. str(args.num_runs),
  355. "--log-folder",
  356. args.log_folder,
  357. "--cache-dir",
  358. args.cache_dir,
  359. "--auth",
  360. ]
  361. logger.info("Benchmark Optimum + ONNX Runtime")
  362. results = benchmark(args, benchmark_cmd, "optimum-ort")
  363. all_results.extend(results)
  364. # Benchmark Microsoft model in ONNX Runtime
  365. if args.ort_msft_model_path:
  366. benchmark_cmd = [
  367. "python",
  368. "-m",
  369. "models.llama.benchmark",
  370. "--benchmark-type",
  371. "ort-msft",
  372. "--ort-model-path",
  373. args.ort_msft_model_path,
  374. "--model-name",
  375. args.model_name,
  376. "--precision",
  377. args.precision,
  378. "--batch-sizes",
  379. args.batch_sizes,
  380. "--sequence-lengths",
  381. args.sequence_lengths,
  382. "--device",
  383. args.device,
  384. "--warmup-runs",
  385. str(args.warmup_runs),
  386. "--num-runs",
  387. str(args.num_runs),
  388. "--log-folder",
  389. args.log_folder,
  390. "--cache-dir",
  391. args.cache_dir,
  392. ]
  393. logger.info("Benchmark Microsoft model in ONNX Runtime")
  394. results = benchmark(args, benchmark_cmd, "ort-msft")
  395. all_results.extend(results)
  396. # Benchmark convert_to_onnx model in ONNX Runtime
  397. if args.ort_convert_to_onnx_model_path:
  398. benchmark_cmd = [
  399. "python",
  400. "-m",
  401. "models.llama.benchmark",
  402. "--benchmark-type",
  403. "ort-convert-to-onnx",
  404. "--ort-model-path",
  405. args.ort_convert_to_onnx_model_path,
  406. "--model-name",
  407. args.model_name,
  408. "--precision",
  409. args.precision,
  410. "--batch-sizes",
  411. args.batch_sizes,
  412. "--sequence-lengths",
  413. args.sequence_lengths,
  414. "--device",
  415. args.device,
  416. "--warmup-runs",
  417. str(args.warmup_runs),
  418. "--num-runs",
  419. str(args.num_runs),
  420. "--log-folder",
  421. args.log_folder,
  422. "--cache-dir",
  423. args.cache_dir,
  424. ]
  425. logger.info("Benchmark convert_to_onnx model in ONNX Runtime")
  426. results = benchmark(args, benchmark_cmd, "onnxruntime")
  427. all_results.extend(results)
  428. csv_file = f"{args.model_size}_{args.precision}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv"
  429. save_results(all_results, os.path.join(args.log_folder, csv_file))
  430. if __name__ == "__main__":
  431. main()