profile_result_processor.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. # -------------------------------------------------------------------------
  2. # Copyright (c) Microsoft Corporation. All rights reserved.
  3. # Licensed under the MIT License.
  4. # --------------------------------------------------------------------------
  5. """This profiler result processor print out the kernel time spent on each Node of the model.
  6. Example of importing profile result file from onnxruntime_perf_test:
  7. python profile_result_processor.py --input profile_2021-10-25_12-02-41.json
  8. """
  9. import argparse
  10. import json
  11. _NODES_TYPE_CONTAINING_SUBGRAPH = frozenset(("Scan", "Loop", "If"))
  12. def parse_arguments(argv=None):
  13. parser = argparse.ArgumentParser()
  14. parser.add_argument(
  15. "-i",
  16. "--input",
  17. required=False,
  18. type=str,
  19. help="Set the input file for reading the profile results",
  20. )
  21. parser.add_argument(
  22. "--threshold",
  23. required=False,
  24. type=float,
  25. default=0.01,
  26. help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
  27. )
  28. parser.add_argument(
  29. "--provider",
  30. required=False,
  31. type=str,
  32. default="cuda",
  33. help="Execution provider to use",
  34. )
  35. parser.add_argument(
  36. "--kernel_time_only",
  37. required=False,
  38. action="store_true",
  39. help="Only include the kernel time and no fence time",
  40. )
  41. parser.set_defaults(kernel_time_only=False)
  42. parser.add_argument("-v", "--verbose", required=False, action="store_true")
  43. parser.set_defaults(verbose=False)
  44. return parser.parse_args(argv)
  45. def load_profile_json(profile_file):
  46. print(f"loading profile output {profile_file} ...")
  47. with open(profile_file) as opened_file:
  48. sess_time = json.load(opened_file)
  49. assert isinstance(sess_time, list)
  50. return sess_time
  51. def parse_kernel_results(sess_time, threshold=0):
  52. """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
  53. Args:
  54. sess_time (List[Dict]): profile data
  55. threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
  56. Returns:
  57. List[str]: lines of string for output.
  58. """
  59. kernel_name_to_op_name = {}
  60. kernel_time = {}
  61. kernel_freq = {}
  62. total = 0
  63. session_init = False
  64. for item in sess_time:
  65. # Skip all MemcpyHostToDevice before session_initialization
  66. if item["cat"] == "Session" and item["name"] == "session_initialization":
  67. session_init = True
  68. if not session_init:
  69. continue
  70. if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
  71. kernel_name = item["name"]
  72. op_name = item["args"]["op_name"]
  73. if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
  74. continue
  75. # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
  76. if not op_name:
  77. op_name = f"({kernel_name})"
  78. if kernel_name in kernel_time:
  79. kernel_time[kernel_name] += item["dur"]
  80. kernel_freq[kernel_name] += 1
  81. else:
  82. kernel_time[kernel_name] = item["dur"]
  83. kernel_freq[kernel_name] = 1
  84. kernel_name_to_op_name[kernel_name] = op_name
  85. total += item["dur"]
  86. if not kernel_time:
  87. return ["No kernel record found!"]
  88. # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
  89. lines = []
  90. lines.append(f"\nTop expensive kernels with Time% >= {threshold * 100:.2f}:")
  91. lines.append("-" * 64)
  92. lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
  93. for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
  94. ratio = duration / total
  95. if ratio < threshold:
  96. continue
  97. calls = kernel_freq[kernel_name]
  98. avg_time = duration / float(calls)
  99. lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")
  100. # Group by operator
  101. op_time = {}
  102. for kernel_name, op_name in kernel_name_to_op_name.items():
  103. duration = kernel_time[kernel_name]
  104. if op_name in op_time:
  105. op_time[op_name] += duration
  106. else:
  107. op_time[op_name] = duration
  108. lines.append("\nGroup kernel time by operator:")
  109. lines.append("-" * 64)
  110. lines.append("Total(μs)\tTime%\tOperator")
  111. for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
  112. ratio = duration / total
  113. lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")
  114. return lines
  115. def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
  116. """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
  117. Args:
  118. sess_time (List[Dict]): profile data
  119. kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
  120. threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
  121. Returns:
  122. List[str]: lines of string for output.
  123. """
  124. node_name_list = []
  125. node_time = {}
  126. node_freq = {}
  127. node_provider = {}
  128. total = 0
  129. for item in sess_time:
  130. if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
  131. node_name = (
  132. item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
  133. )
  134. if "provider" in item["args"]:
  135. if item["args"]["provider"] == "CPUExecutionProvider":
  136. device = "CPU"
  137. elif item["args"]["provider"] == "CUDAExecutionProvider":
  138. device = "CUDA"
  139. elif item["args"]["provider"] == "DmlExecutionProvider":
  140. device = "DML"
  141. if node_name not in node_provider:
  142. node_provider[node_name] = device
  143. else:
  144. assert node_provider[node_name] == device
  145. elif kernel_time_only:
  146. continue
  147. op_name = item["args"]["op_name"]
  148. if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
  149. continue
  150. if node_name in node_time:
  151. node_time[node_name] += item["dur"]
  152. node_freq[node_name] += 1
  153. else:
  154. node_time[node_name] = item["dur"]
  155. node_freq[node_name] = 1
  156. node_name_list.append(node_name)
  157. total += item["dur"]
  158. # Output items in the original order.
  159. lines = [
  160. "\nNodes in the original order:",
  161. "-" * 64,
  162. "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
  163. ]
  164. before_percentage = 0.0
  165. for node_name in node_name_list:
  166. duration = node_time[node_name]
  167. calls = node_freq[node_name]
  168. avg_time = duration / float(calls)
  169. percentage = (duration / total) * 100.0
  170. provider = node_provider.get(node_name, "")
  171. before_percentage += percentage
  172. lines.append(
  173. f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
  174. )
  175. # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
  176. lines.append(f"\nTop expensive nodes with Time% >= {threshold * 100:.2f}:")
  177. lines.append("-" * 64)
  178. lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
  179. for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
  180. ratio = duration / total
  181. if ratio < threshold:
  182. continue
  183. calls = node_freq[node_name]
  184. avg_time = duration / float(calls)
  185. percentage = (duration / total) * 100.0
  186. provider = node_provider.get(node_name, "")
  187. lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")
  188. return lines
  189. def group_node_results(sess_time):
  190. """Group results by operator name.
  191. Args:
  192. sess_time (List[Dict]): profile data
  193. Returns:
  194. List[str]: lines of string for output.
  195. """
  196. op_kernel_time = {}
  197. op_kernel_records = {}
  198. total_kernel_time = 0
  199. provider_op_kernel_time = {}
  200. provider_op_kernel_records = {}
  201. provider_kernel_time = {}
  202. op_fence_time = {}
  203. total_fence_time = 0
  204. provider_counter = {}
  205. for item in sess_time:
  206. if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
  207. op_name = item["args"]["op_name"]
  208. # TODO: shall we have a separated group for nodes with subgraph?
  209. if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
  210. continue
  211. if "provider" not in item["args"]:
  212. if "fence" in item["name"]:
  213. if op_name in op_fence_time:
  214. op_fence_time[op_name] += item["dur"]
  215. else:
  216. op_fence_time[op_name] = item["dur"]
  217. total_fence_time += item["dur"]
  218. continue
  219. provider = item["args"].get("provider", "")
  220. if provider in provider_counter:
  221. provider_counter[provider] += 1
  222. else:
  223. provider_counter[provider] = 1
  224. key = f"{provider}:{op_name}"
  225. if key in provider_op_kernel_time:
  226. provider_op_kernel_time[key] += item["dur"]
  227. provider_op_kernel_records[key] += 1
  228. else:
  229. provider_op_kernel_time[key] = item["dur"]
  230. provider_op_kernel_records[key] = 1
  231. if provider in provider_kernel_time:
  232. provider_kernel_time[provider] += item["dur"]
  233. else:
  234. provider_kernel_time[provider] = item["dur"]
  235. if op_name in op_kernel_time:
  236. op_kernel_time[op_name] += item["dur"]
  237. op_kernel_records[op_name] += 1
  238. else:
  239. op_kernel_time[op_name] = item["dur"]
  240. op_kernel_records[op_name] = 1
  241. total_kernel_time += item["dur"]
  242. lines = ["", "Grouped by operator"]
  243. lines.append("-" * 64)
  244. lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
  245. for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
  246. fence_time = op_fence_time.get(op_name, 0)
  247. kernel_time_ratio = kernel_time / total_kernel_time
  248. total_time = kernel_time + fence_time
  249. time_ratio = total_time / (total_kernel_time + total_fence_time)
  250. kernel_calls = op_kernel_records[op_name]
  251. avg_kernel_time = kernel_time / kernel_calls
  252. lines.append(
  253. f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
  254. )
  255. lines += ["", "Grouped by provider + operator"]
  256. lines.append("-" * 64)
  257. lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
  258. for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
  259. parts = key.split(":")
  260. provider = parts[0]
  261. op_name = parts[1]
  262. short_ep = provider.replace("ExecutionProvider", "")
  263. calls = provider_op_kernel_records[key]
  264. avg_kernel_time = kernel_time / calls
  265. provider_time_ratio = kernel_time / provider_kernel_time[provider]
  266. lines.append(
  267. f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
  268. )
  269. return lines
  270. def process_results(profile_file, args):
  271. profile_records = load_profile_json(profile_file)
  272. lines = parse_kernel_results(profile_records, args.threshold)
  273. lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)
  274. lines += group_node_results(profile_records)
  275. return lines
  276. if __name__ == "__main__":
  277. arguments = parse_arguments()
  278. print("Arguments", arguments)
  279. from benchmark_helper import setup_logger
  280. setup_logger(arguments.verbose)
  281. profile_file = arguments.input
  282. results = process_results(profile_file, arguments)
  283. for line in results:
  284. print(line)