| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358 |
- # -------------------------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License.
- # --------------------------------------------------------------------------
- """This profiler result processor print out the kernel time spent on each Node of the model.
- Example of importing profile result file from onnxruntime_perf_test:
- python profile_result_processor.py --input profile_2021-10-25_12-02-41.json
- """
- import argparse
- import json
- _NODES_TYPE_CONTAINING_SUBGRAPH = frozenset(("Scan", "Loop", "If"))
- def parse_arguments(argv=None):
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "-i",
- "--input",
- required=False,
- type=str,
- help="Set the input file for reading the profile results",
- )
- parser.add_argument(
- "--threshold",
- required=False,
- type=float,
- default=0.01,
- help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
- )
- parser.add_argument(
- "--provider",
- required=False,
- type=str,
- default="cuda",
- help="Execution provider to use",
- )
- parser.add_argument(
- "--kernel_time_only",
- required=False,
- action="store_true",
- help="Only include the kernel time and no fence time",
- )
- parser.set_defaults(kernel_time_only=False)
- parser.add_argument("-v", "--verbose", required=False, action="store_true")
- parser.set_defaults(verbose=False)
- return parser.parse_args(argv)
- def load_profile_json(profile_file):
- print(f"loading profile output {profile_file} ...")
- with open(profile_file) as opened_file:
- sess_time = json.load(opened_file)
- assert isinstance(sess_time, list)
- return sess_time
- def parse_kernel_results(sess_time, threshold=0):
- """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
- Args:
- sess_time (List[Dict]): profile data
- threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
- Returns:
- List[str]: lines of string for output.
- """
- kernel_name_to_op_name = {}
- kernel_time = {}
- kernel_freq = {}
- total = 0
- session_init = False
- for item in sess_time:
- # Skip all MemcpyHostToDevice before session_initialization
- if item["cat"] == "Session" and item["name"] == "session_initialization":
- session_init = True
- if not session_init:
- continue
- if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
- kernel_name = item["name"]
- op_name = item["args"]["op_name"]
- if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
- continue
- # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
- if not op_name:
- op_name = f"({kernel_name})"
- if kernel_name in kernel_time:
- kernel_time[kernel_name] += item["dur"]
- kernel_freq[kernel_name] += 1
- else:
- kernel_time[kernel_name] = item["dur"]
- kernel_freq[kernel_name] = 1
- kernel_name_to_op_name[kernel_name] = op_name
- total += item["dur"]
- if not kernel_time:
- return ["No kernel record found!"]
- # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
- lines = []
- lines.append(f"\nTop expensive kernels with Time% >= {threshold * 100:.2f}:")
- lines.append("-" * 64)
- lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
- for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
- ratio = duration / total
- if ratio < threshold:
- continue
- calls = kernel_freq[kernel_name]
- avg_time = duration / float(calls)
- lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")
- # Group by operator
- op_time = {}
- for kernel_name, op_name in kernel_name_to_op_name.items():
- duration = kernel_time[kernel_name]
- if op_name in op_time:
- op_time[op_name] += duration
- else:
- op_time[op_name] = duration
- lines.append("\nGroup kernel time by operator:")
- lines.append("-" * 64)
- lines.append("Total(μs)\tTime%\tOperator")
- for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
- ratio = duration / total
- lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")
- return lines
- def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
- """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
- Args:
- sess_time (List[Dict]): profile data
- kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
- threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
- Returns:
- List[str]: lines of string for output.
- """
- node_name_list = []
- node_time = {}
- node_freq = {}
- node_provider = {}
- total = 0
- for item in sess_time:
- if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
- node_name = (
- item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
- )
- if "provider" in item["args"]:
- if item["args"]["provider"] == "CPUExecutionProvider":
- device = "CPU"
- elif item["args"]["provider"] == "CUDAExecutionProvider":
- device = "CUDA"
- elif item["args"]["provider"] == "DmlExecutionProvider":
- device = "DML"
- if node_name not in node_provider:
- node_provider[node_name] = device
- else:
- assert node_provider[node_name] == device
- elif kernel_time_only:
- continue
- op_name = item["args"]["op_name"]
- if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
- continue
- if node_name in node_time:
- node_time[node_name] += item["dur"]
- node_freq[node_name] += 1
- else:
- node_time[node_name] = item["dur"]
- node_freq[node_name] = 1
- node_name_list.append(node_name)
- total += item["dur"]
- # Output items in the original order.
- lines = [
- "\nNodes in the original order:",
- "-" * 64,
- "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
- ]
- before_percentage = 0.0
- for node_name in node_name_list:
- duration = node_time[node_name]
- calls = node_freq[node_name]
- avg_time = duration / float(calls)
- percentage = (duration / total) * 100.0
- provider = node_provider.get(node_name, "")
- before_percentage += percentage
- lines.append(
- f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
- )
- # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
- lines.append(f"\nTop expensive nodes with Time% >= {threshold * 100:.2f}:")
- lines.append("-" * 64)
- lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
- for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
- ratio = duration / total
- if ratio < threshold:
- continue
- calls = node_freq[node_name]
- avg_time = duration / float(calls)
- percentage = (duration / total) * 100.0
- provider = node_provider.get(node_name, "")
- lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")
- return lines
- def group_node_results(sess_time):
- """Group results by operator name.
- Args:
- sess_time (List[Dict]): profile data
- Returns:
- List[str]: lines of string for output.
- """
- op_kernel_time = {}
- op_kernel_records = {}
- total_kernel_time = 0
- provider_op_kernel_time = {}
- provider_op_kernel_records = {}
- provider_kernel_time = {}
- op_fence_time = {}
- total_fence_time = 0
- provider_counter = {}
- for item in sess_time:
- if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
- op_name = item["args"]["op_name"]
- # TODO: shall we have a separated group for nodes with subgraph?
- if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
- continue
- if "provider" not in item["args"]:
- if "fence" in item["name"]:
- if op_name in op_fence_time:
- op_fence_time[op_name] += item["dur"]
- else:
- op_fence_time[op_name] = item["dur"]
- total_fence_time += item["dur"]
- continue
- provider = item["args"].get("provider", "")
- if provider in provider_counter:
- provider_counter[provider] += 1
- else:
- provider_counter[provider] = 1
- key = f"{provider}:{op_name}"
- if key in provider_op_kernel_time:
- provider_op_kernel_time[key] += item["dur"]
- provider_op_kernel_records[key] += 1
- else:
- provider_op_kernel_time[key] = item["dur"]
- provider_op_kernel_records[key] = 1
- if provider in provider_kernel_time:
- provider_kernel_time[provider] += item["dur"]
- else:
- provider_kernel_time[provider] = item["dur"]
- if op_name in op_kernel_time:
- op_kernel_time[op_name] += item["dur"]
- op_kernel_records[op_name] += 1
- else:
- op_kernel_time[op_name] = item["dur"]
- op_kernel_records[op_name] = 1
- total_kernel_time += item["dur"]
- lines = ["", "Grouped by operator"]
- lines.append("-" * 64)
- lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
- for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
- fence_time = op_fence_time.get(op_name, 0)
- kernel_time_ratio = kernel_time / total_kernel_time
- total_time = kernel_time + fence_time
- time_ratio = total_time / (total_kernel_time + total_fence_time)
- kernel_calls = op_kernel_records[op_name]
- avg_kernel_time = kernel_time / kernel_calls
- lines.append(
- f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
- )
- lines += ["", "Grouped by provider + operator"]
- lines.append("-" * 64)
- lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
- for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
- parts = key.split(":")
- provider = parts[0]
- op_name = parts[1]
- short_ep = provider.replace("ExecutionProvider", "")
- calls = provider_op_kernel_records[key]
- avg_kernel_time = kernel_time / calls
- provider_time_ratio = kernel_time / provider_kernel_time[provider]
- lines.append(
- f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
- )
- return lines
- def process_results(profile_file, args):
- profile_records = load_profile_json(profile_file)
- lines = parse_kernel_results(profile_records, args.threshold)
- lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)
- lines += group_node_results(profile_records)
- return lines
- if __name__ == "__main__":
- arguments = parse_arguments()
- print("Arguments", arguments)
- from benchmark_helper import setup_logger
- setup_logger(arguments.verbose)
- profile_file = arguments.input
- results = process_results(profile_file, arguments)
- for line in results:
- print(line)
|