# Repository: yichael/AIStoryBoard
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import re
from enum import Enum

from paddle.base.core import TracerEventType, TracerMemEventType
from paddle.utils.flops import flops

from .statistic_helper import (
    intersection_ranges,
    merge_ranges,
    merge_self_ranges,
    sum_ranges,
)

# Explicit list of TracerEventType members known to this module.
# NOTE(review): no use of this list is visible in this part of the file;
# presumably it is consumed by the summary-table code further down — confirm.
_AllTracerEventType = [
    TracerEventType.Operator,
    TracerEventType.Dataloader,
    TracerEventType.ProfileStep,
    TracerEventType.CudaRuntime,
    TracerEventType.Kernel,
    TracerEventType.Memcpy,
    TracerEventType.Memset,
    TracerEventType.UserDefined,
    TracerEventType.OperatorInner,
    TracerEventType.Forward,
    TracerEventType.Backward,
    TracerEventType.Optimization,
    TracerEventType.Communication,
    TracerEventType.PythonOp,
    TracerEventType.PythonUserDefined,
]

# Lower-case substrings; DistributedSummary.parse treats an Operator node as a
# communication op when its lower-cased name contains any of them.
_CommunicationOpName = ['allreduce', 'broadcast', 'rpc']


class SortedKeys(Enum):
    r"""
    SortedKeys specifies how items are sorted when printing the
    ``paddle.profiler.Profiler.summary`` table.

    The meaning of each SortedKeys member is as follows:

    - **SortedKeys.CPUTotal** : Sorted by CPU total time.

    - **SortedKeys.CPUAvg** : Sorted by CPU average time.

    - **SortedKeys.CPUMax** : Sorted by CPU max time.

    - **SortedKeys.CPUMin** : Sorted by CPU min time.

    - **SortedKeys.GPUTotal** : Sorted by GPU total time.

    - **SortedKeys.GPUAvg** : Sorted by GPU average time.

    - **SortedKeys.GPUMax** : Sorted by GPU max time.

    - **SortedKeys.GPUMin** : Sorted by GPU min time.
    """
    # CPU-side sort keys.
    CPUTotal = 0
    CPUAvg = 1
    CPUMax = 2
    CPUMin = 3
    # GPU-side sort keys.
    GPUTotal = 4
    GPUAvg = 5
    GPUMax = 6
    GPUMin = 7


def _nodename2opname(name):
    r'''
    convert static host node name to operator name
    '''
    op_name = name.replace(' compute', '')
    op_name = op_name.replace(' dygraph', '')
    op_name = op_name.replace(' pybind_imperative_func', '')
    return op_name


class HostStatisticNode:
    r'''
    Wrap original node for calculating statistic metrics.

    The wrapper keeps its own ``children_node`` and ``runtime_node`` lists
    (expected to hold other HostStatisticNode wrappers, see ``wrap_tree``)
    and accumulates time and FLOPs statistics in ``cal_statistic``. Any
    attribute not found on the wrapper is looked up on the wrapped node.
    '''

    def __init__(self, hostnode):
        self.hostnode = hostnode
        # Wrapped children / runtime nodes; left empty here and filled in by
        # whoever builds the statistic tree.
        self.children_node = []
        self.runtime_node = []
        # All times are differences of end_ns and start_ns.
        self.cpu_time = 0
        self.self_cpu_time = 0  # cpu_time minus children and runtime nodes
        self.gpu_time = 0  # kernel time
        self.self_gpu_time = 0
        self.general_gpu_time = 0  # besides kernel, include time of gpu events like memcpy and memset
        self.self_general_gpu_time = 0
        self.flops = 0

    def cal_flops(self):
        r'''
        Set ``self.flops`` for Operator nodes that expose ``input_shapes``.

        Nodes of any other type, or without ``input_shapes``, keep their
        current ``flops`` value.
        '''
        if self.hostnode.type == TracerEventType.Operator:
            if hasattr(self.hostnode, 'input_shapes'):
                op_name = _nodename2opname(self.hostnode.name)
                self.flops = flops(
                    op_name,
                    self.hostnode.input_shapes,
                    self.hostnode.attributes,
                )

    def cal_statistic(self):
        r'''
        Recursively compute time and FLOPs statistics for this subtree.

        The counters are accumulated with ``+=``, so this is meant to be
        called once per node; a second call would add the same times again.
        '''
        self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
        self.self_cpu_time = self.cpu_time
        self.cal_flops()
        for child in self.children_node:
            # child.cal_statistic() computes the child's own flops first, so
            # no separate child.cal_flops() call is needed here.
            child.cal_statistic()
            self.gpu_time += child.gpu_time
            self.general_gpu_time += child.general_gpu_time
            self.self_cpu_time -= child.end_ns - child.start_ns
            self.flops += child.flops

        for rt in self.runtime_node:
            rt.cal_statistic()
            self.self_cpu_time -= rt.end_ns - rt.start_ns
            # Runtime nodes count towards both the total and the "self" GPU
            # time of the node that issued them.
            self.gpu_time += rt.gpu_time
            self.self_gpu_time += rt.gpu_time
            self.general_gpu_time += rt.general_gpu_time
            self.self_general_gpu_time += rt.general_gpu_time

        for device in self.hostnode.device_node:
            duration = device.end_ns - device.start_ns
            # Only kernels count as gpu_time; every device event counts as
            # general_gpu_time.
            if device.type == TracerEventType.Kernel:
                self.gpu_time += duration
                self.self_gpu_time += duration
            self.general_gpu_time += duration
            self.self_general_gpu_time += duration

    @property
    def end_ns(self):
        return self.hostnode.end_ns

    @property
    def start_ns(self):
        return self.hostnode.start_ns

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If 'hostnode' itself
        # is missing (e.g. the instance was created without __init__, as copy
        # and pickle do), delegating through self.hostnode would re-enter this
        # method until RecursionError; raise AttributeError instead.
        if name == 'hostnode':
            raise AttributeError(name)
        return getattr(self.hostnode, name)


def traverse_tree(nodetrees):
    r'''
    Flatten each thread's node tree into a list.

    ``nodetrees`` maps a thread id to a root node. For every thread the
    result holds the nodes in the order an explicit stack visits them:
    the root first, then the children, last child first.
    '''
    results = collections.defaultdict(list)
    for thread_id, rootnode in nodetrees.items():
        visited = results[thread_id]
        pending = [rootnode]
        while pending:
            node = pending.pop()
            visited.append(node)
            pending.extend(node.children_node)
    return results


def get_device_nodes(hostnode):
    '''
    Collect the device nodes of every runtime node found under hostnode.

    The subtree is walked with an explicit stack (hostnode included); for
    each visited node, the device nodes of its runtime nodes are appended
    to the result in visiting order.
    '''
    pending = [hostnode]
    collected = []
    while pending:
        node = pending.pop()
        pending.extend(node.children_node)
        for runtime in node.runtime_node:
            collected.extend(runtime.device_node)
    return collected


def _build_layer_from_tree(nodetrees):
    def build_layer(node, depth=0):
        if "GradNode" in node.name:
            return [], 0

        if node.type in [
            TracerEventType.Backward,
            TracerEventType.Optimization,
        ]:
            return [], 0

        if node.type == TracerEventType.Operator:
            stat_node = HostStatisticNode(node)
            stat_node.cal_statistic()
            return stat_node, stat_node.flops

        layer = []
        nflops = 0
        for c in node.children_node:
            l, f = build_layer(c, depth + 1)
            if l:
                nflops += f
                layer.append(l)

        if node.type == TracerEventType.Forward:
            stat_node = HostStatisticNode(node)
            stat_node.cal_statistic()
            stat_node.flops = nflops
            return [stat_node, layer], nflops

        return layer, nflops

    ret = []
    for _, rootnode in nodetrees.items():
        layer, _ = build_layer(rootnode)
        ret.append(layer)

    return ret


def _format_large_number(n, precision=2):
    if n // 1e12 > 0:
        return f"{round(n / 1e12, precision)} T"
    if n // 1e9 > 0:
        return f"{round(n / 1e9, precision)} G"
    if n // 1e6 > 0:
        return f"{round(n / 1e6, precision)} M"
    if n // 1e3 > 0:
        return f"{round(n / 1e3, precision)} K"
    return f"{round(n, precision)}"


def _format_time(n, precision=2):
    if n // 1e9 > 0:
        return f"{round(n / 1e9, precision)} s"
    if n // 1e6 > 0:
        return f"{round(n / 1e6, precision)} ms"
    if n // 1e3 > 0:
        return f"{round(n / 1e3, precision)} us"
    return f"{round(n, precision)} ns"


def _gen_layer_flops(node, repeat=1):
    ret = []
    offset = []
    loop = []

    def print_layer_tree(node, depth=0):
        if isinstance(node, list):
            for n in node:
                print_layer_tree(n, depth + 1)

        elif node.type in [TracerEventType.Forward, TracerEventType.Operator]:
            if len(offset) == 0:
                offset.append(depth)

            name = _nodename2opname(node.name)

            if (
                depth == offset[-1] and len(ret) > 0 and ret[0].startswith(name)
            ):  # repeat begin
                loop.append(1)

            if len(loop) >= repeat:
                return "".join(ret)

            align = " " * (depth - offset[-1])
            tm = _format_time(node.cpu_time)
            flops_n = _format_large_number(node.flops)
            flops_s = _format_large_number(node.flops * 1e9 / node.cpu_time)
            ret.append(
                f"{align}{name} latency: {tm}, FLOPs: {flops_n}, FLOPS: {flops_s}\n"
            )

    for n in node[1:]:
        print_layer_tree(n)

    return "".join(ret)


def gen_layer_flops(nodetrees, repeat=1):
    r'''
    Build the layer tree for ``nodetrees`` and render its per-layer /
    per-operator latency and FLOPs report as a string.
    '''
    return _gen_layer_flops(_build_layer_from_tree(nodetrees), repeat)


def wrap_tree(nodetrees):
    '''
    Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.

    Args:
        nodetrees: mapping of thread id to the root node of that thread.

    Returns:
        A tuple ``(node_statistic_tree, newresults)``: the first maps each
        thread id to its wrapped root node, the second maps each thread id
        to the list of wrapped host nodes in traversal order (root first).
        Wrapped runtime nodes are attached to their parent but are not part
        of that list.
    '''
    node_statistic_tree = {}
    newresults = collections.defaultdict(list)
    for thread_id, rootnode in nodetrees.items():
        root_statistic_node = HostStatisticNode(rootnode)
        node_statistic_tree[thread_id] = root_statistic_node
        newthreadlist = newresults[thread_id]
        # Each stack entry pairs an original node with its wrapper, so the two
        # trees are walked together.
        stack = [(rootnode, root_statistic_node)]
        while stack:
            current_node, current_statistic_node = stack.pop()
            newthreadlist.append(current_statistic_node)
            for childnode in current_node.children_node:
                child_statistic_node = HostStatisticNode(childnode)
                current_statistic_node.children_node.append(
                    child_statistic_node
                )
                stack.append((childnode, child_statistic_node))
            for runtimenode in current_node.runtime_node:
                current_statistic_node.runtime_node.append(
                    HostStatisticNode(runtimenode)
                )
    # recursive calculate node statistic values
    for root_statistic_node in node_statistic_tree.values():
        root_statistic_node.cal_statistic()

    return node_statistic_tree, newresults


class TimeRangeSummary:
    r"""
    Collect the time ranges of each TracerEventType and sum them up.

    CPU ranges are keyed by event type; GPU ranges are keyed by device id
    and then by event type.
    """

    def __init__(self):
        # event type -> merged list of (start_ns, end_ns)
        self.CPUTimeRange = collections.defaultdict(list)
        # device id -> event type -> merged list of (start_ns, end_ns)
        self.GPUTimeRange = collections.defaultdict(
            lambda: collections.defaultdict(list)
        )
        # Same keys as above, holding the summed range length.
        self.CPUTimeRangeSum = collections.defaultdict(int)
        self.GPUTimeRangeSum = collections.defaultdict(
            lambda: collections.defaultdict(int)
        )
        # event type -> number of events seen
        self.call_times = collections.defaultdict(int)

    def parse(self, nodetrees):
        r"""
        Walk the node trees and record the time ranges of every host,
        runtime and device event, grouped by tracer event type.
        """
        for hostnodes in traverse_tree(nodetrees).values():
            cpu_ranges = collections.defaultdict(list)
            # device_id -> event type -> stream_id -> ranges
            gpu_ranges = collections.defaultdict(
                lambda: collections.defaultdict(
                    lambda: collections.defaultdict(list)
                )
            )
            for hostnode in hostnodes[1:]:  # the root node is not an event
                cpu_ranges[hostnode.type].append(
                    (hostnode.start_ns, hostnode.end_ns)
                )
                self.call_times[hostnode.type] += 1
                for runtimenode in hostnode.runtime_node:
                    cpu_ranges[runtimenode.type].append(
                        (runtimenode.start_ns, runtimenode.end_ns)
                    )
                    self.call_times[runtimenode.type] += 1
                    for devicenode in runtimenode.device_node:
                        stream_ranges = gpu_ranges[devicenode.device_id][
                            devicenode.type
                        ][devicenode.stream_id]
                        stream_ranges.append(
                            (devicenode.start_ns, devicenode.end_ns)
                        )
                        self.call_times[devicenode.type] += 1

            # Fold this thread's ranges into the accumulated ones.
            for event_type, ranges in cpu_ranges.items():
                merged = merge_self_ranges(ranges, is_sorted=False)
                self.CPUTimeRange[event_type] = merge_ranges(
                    self.CPUTimeRange[event_type], merged, is_sorted=True
                )
            for device_id, by_type in gpu_ranges.items():
                for event_type, by_stream in by_type.items():
                    for ranges in by_stream.values():
                        merged = merge_self_ranges(ranges, is_sorted=False)
                        self.GPUTimeRange[device_id][event_type] = merge_ranges(
                            self.GPUTimeRange[device_id][event_type],
                            merged,
                            is_sorted=True,
                        )

        for event_type, ranges in self.CPUTimeRange.items():
            self.CPUTimeRangeSum[event_type] = sum_ranges(ranges)
        for device_id, by_type in self.GPUTimeRange.items():
            device_sums = self.GPUTimeRangeSum[device_id]
            for event_type, ranges in by_type.items():
                device_sums[event_type] = sum_ranges(ranges)

    def get_gpu_devices(self):
        r"""Return the device ids for which GPU ranges are stored."""
        return self.GPUTimeRange.keys()

    def get_gpu_range_sum(self, device_id, event_type):
        r"""Return the summed GPU time of event_type on device_id."""
        return self.GPUTimeRangeSum[device_id][event_type]

    def get_cpu_range_sum(self, event_type):
        r"""Return the summed CPU time of event_type."""
        return self.CPUTimeRangeSum[event_type]


class DistributedSummary:
    r"""
    Analyse communication and computation time ranges and their overlap.
    Computation time covers every kernel except communication kernels
    (for example nccl kernels).
    """

    def __init__(self):
        self.cpu_calls = 0
        self.gpu_calls = 0
        self.cpu_communication_range = []
        self.gpu_communication_range = []
        self.communication_range = []
        self.computation_range = []
        self.overlap_range = []

    def parse(self, nodetrees):
        '''
        Gather communication and computation time ranges from the node
        trees, then merge them and compute their overlap.
        '''
        for hostnodes in traverse_tree(nodetrees).values():
            for hostnode in hostnodes[1:]:  # the root node is skipped
                # A host node is communication when its type says so, or when
                # it is an operator whose name contains a communication op name.
                is_communication = (
                    hostnode.type == TracerEventType.Communication
                    or (
                        hostnode.type == TracerEventType.Operator
                        and any(
                            name in hostnode.name.lower()
                            for name in _CommunicationOpName
                        )
                    )
                )
                if is_communication:
                    self.cpu_communication_range.append(
                        (hostnode.start_ns, hostnode.end_ns)
                    )
                    # Every kernel launched below it counts as communication.
                    self.gpu_communication_range.extend(
                        (device_node.start_ns, device_node.end_ns)
                        for device_node in get_device_nodes(hostnode)
                        if device_node.type == TracerEventType.Kernel
                    )
                    continue

                # Other host nodes: classify their own kernels by name.
                for runtimenode in hostnode.runtime_node:
                    for devicenode in runtimenode.device_node:
                        if devicenode.type == TracerEventType.Kernel:
                            kernel_name = devicenode.name.lower()
                            if 'nccl' in kernel_name or 'xccl' in kernel_name:
                                target = self.gpu_communication_range
                            else:
                                target = self.computation_range
                            target.append(
                                (devicenode.start_ns, devicenode.end_ns)
                            )

        # Call counts are taken on distinct ranges before merging.
        self.cpu_calls = len(set(self.cpu_communication_range))
        self.gpu_calls = len(set(self.gpu_communication_range))
        self.cpu_communication_range = merge_self_ranges(
            self.cpu_communication_range, is_sorted=False
        )
        self.gpu_communication_range = merge_self_ranges(
            self.gpu_communication_range, is_sorted=False
        )
        self.communication_range = merge_ranges(
            self.cpu_communication_range,
            self.gpu_communication_range,
            is_sorted=True,
        )
        self.computation_range = merge_self_ranges(
            self.computation_range, is_sorted=False
        )
        self.overlap_range = intersection_ranges(
            self.communication_range, self.computation_range, is_sorted=True
        )


class EventSummary:
    r"""
    Aggregate profiling events by name: operators (with their inner events
    and device events), user defined events, memory manipulation events,
    model perspective phases and kernels.
    """

    class ItemBase:
        r"""Accumulator of call count, CPU/GPU times and flops for one name."""

        def __init__(self, name):
            self.name = name
            self.call = 0
            self._flops = 0
            # CPU time: total / max / min
            self.cpu_time = 0
            self.max_cpu_time = 0
            self.min_cpu_time = float('inf')
            # GPU (kernel) time: total / max / min
            self.gpu_time = 0
            self.max_gpu_time = 0
            self.min_gpu_time = float('inf')
            # General GPU time: total / max / min
            self.general_gpu_time = 0
            self.max_general_gpu_time = 0
            self.min_general_gpu_time = float('inf')
            # Nested items keyed by name.
            self.devices = {}
            self.operator_inners = {}

        @property
        def flops(self):
            return self._flops

        @property
        def avg_cpu_time(self):
            return self.cpu_time / self.call

        @property
        def avg_gpu_time(self):
            return self.gpu_time / self.call

        @property
        def avg_general_gpu_time(self):
            return self.general_gpu_time / self.call

        def add_cpu_time(self, time):
            self.max_cpu_time = max(self.max_cpu_time, time)
            self.min_cpu_time = min(self.min_cpu_time, time)
            self.cpu_time += time

        def add_gpu_time(self, time):
            self.max_gpu_time = max(self.max_gpu_time, time)
            self.min_gpu_time = min(self.min_gpu_time, time)
            self.gpu_time += time

        def add_general_gpu_time(self, time):
            self.max_general_gpu_time = max(self.max_general_gpu_time, time)
            self.min_general_gpu_time = min(self.min_general_gpu_time, time)
            self.general_gpu_time += time

        def add_call(self):
            self.call += 1

        def add_flops(self, flops):
            self._flops += flops

        def add_item(self, node):
            raise NotImplementedError

        def _add_inner_items(self, node):
            # Children that are not operators themselves are recorded as
            # inner items of this item, keyed by the child's name.
            inners = self.operator_inners
            for child in node.children_node:
                if child.type != TracerEventType.Operator:
                    if child.name not in inners:
                        inners[child.name] = EventSummary.OperatorItem(
                            child.name
                        )
                    inners[child.name].add_item(child)

    class DeviceItem(ItemBase):
        r"""Item for a device event; its duration is counted as GPU time."""

        def add_item(self, node):
            self.call += 1
            self.add_gpu_time(node.end_ns - node.start_ns)

    class OperatorItem(ItemBase):
        r"""Item for an operator, with inner events and device events."""

        def add_item(self, node):
            self.add_call()
            self.add_cpu_time(node.cpu_time)
            self.add_gpu_time(node.gpu_time)
            self.add_general_gpu_time(node.general_gpu_time)
            self.add_flops(node.flops)
            self._add_inner_items(node)

            device_items = self.devices
            for runtimenode in node.runtime_node:
                for devicenode in runtimenode.device_node:
                    device_name = devicenode.name
                    if device_name not in device_items:
                        device_items[device_name] = EventSummary.DeviceItem(
                            device_name
                        )
                    device_items[device_name].add_item(devicenode)

    class ForwardItem(ItemBase):
        r"""Item for a forward event, with inner events but no device events."""

        def add_item(self, node):
            self.add_call()
            self.add_cpu_time(node.cpu_time)
            self.add_gpu_time(node.gpu_time)
            self.add_general_gpu_time(node.general_gpu_time)
            self.add_flops(node.flops)
            self._add_inner_items(node)

    class GeneralItem(ItemBase):
        r"""Item that only tracks call count and times."""

        def add_item(self, node):
            self.add_call()
            self.add_cpu_time(node.cpu_time)
            self.add_gpu_time(node.gpu_time)
            self.add_general_gpu_time(node.general_gpu_time)

    def __init__(self):
        # operator summary: overall and per thread
        self.items = {}
        self.thread_items = collections.defaultdict(dict)
        # userdefined summary: overall and per thread
        self.userdefined_items = {}
        self.userdefined_thread_items = collections.defaultdict(dict)
        # model summary
        self.model_perspective_items = {}
        # memory manipulation summary
        self.memory_manipulation_items = {}
        # kernel summary
        self.kernel_items = {}

    def parse(self, nodetrees):
        r"""
        Wrap the node trees with statistic nodes and fill every summary
        table from them.
        """
        node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees)
        memory_keywords = ('memcpy', 'memorycopy', 'memset')
        for stat_nodes in thread2host_statistic_nodes.values():
            for stat_node in stat_nodes[1:]:  # the root node is skipped
                if stat_node.type == TracerEventType.Operator:
                    self.add_operator_item(stat_node)
                if stat_node.type in (
                    TracerEventType.UserDefined,
                    TracerEventType.PythonUserDefined,
                ):
                    lowered_name = stat_node.name.lower()
                    if any(word in lowered_name for word in memory_keywords):
                        self.add_memory_manipulation_item(stat_node)
                    elif stat_node.type == TracerEventType.PythonUserDefined:
                        # Only Python-side user defined events are listed.
                        self.add_userdefined_item(stat_node)
            self.add_kernel_item(stat_nodes[0])

        model_types = (
            TracerEventType.Forward,
            TracerEventType.Dataloader,
            TracerEventType.Backward,
            TracerEventType.Optimization,
        )
        for root_statistic_node in node_statistic_trees.values():
            queue = collections.deque([root_statistic_node])
            while queue:
                current_node = queue.popleft()
                for child in current_node.children_node:
                    if child.type in model_types:
                        # First model perspective node on this path: record it
                        # and do not descend any further.
                        self.add_model_perspective_item(child)
                        continue
                    if child.type == TracerEventType.ProfileStep:
                        self.add_model_perspective_item(child)
                    queue.append(child)

    def add_forward_item(self, operator_node):
        r"""Placeholder; does nothing."""

    def add_operator_item(self, operator_node):
        r"""Record operator_node in the overall and per-thread operator tables."""
        name = operator_node.name
        if name not in self.items:
            self.items[name] = EventSummary.OperatorItem(name)
        self.items[name].add_item(operator_node)

        per_thread = self.thread_items[operator_node.thread_id]
        if name not in per_thread:
            per_thread[name] = EventSummary.OperatorItem(name)
        per_thread[name].add_item(operator_node)

    def add_userdefined_item(self, userdefined_node):
        r"""Record userdefined_node in the overall and per-thread userdefined tables."""
        name = userdefined_node.name
        if name not in self.userdefined_items:
            self.userdefined_items[name] = EventSummary.GeneralItem(name)
        self.userdefined_items[name].add_item(userdefined_node)

        per_thread = self.userdefined_thread_items[userdefined_node.thread_id]
        if name not in per_thread:
            per_thread[name] = EventSummary.GeneralItem(name)
        per_thread[name].add_item(userdefined_node)

    def add_memory_manipulation_item(self, memory_manipulation_node):
        r"""Record memory_manipulation_node in the memory manipulation table."""
        name = memory_manipulation_node.name
        table = self.memory_manipulation_items
        if name not in table:
            table[name] = EventSummary.GeneralItem(name)
        table[name].add_item(memory_manipulation_node)

    def add_model_perspective_item(self, model_perspective_node):
        r"""
        Record a Forward / Backward / Optimization / Dataloader / ProfileStep
        node under its phase name; nodes of any other type are ignored.
        """
        phase_names = (
            (TracerEventType.Forward, 'Forward'),
            (TracerEventType.Backward, 'Backward'),
            (TracerEventType.Optimization, 'Optimization'),
            (TracerEventType.Dataloader, 'Dataloader'),
            (TracerEventType.ProfileStep, 'ProfileStep'),
        )
        for event_type, phase in phase_names:
            if model_perspective_node.type == event_type:
                name = phase
                break
        else:
            return
        if name not in self.model_perspective_items:
            self.model_perspective_items[name] = EventSummary.GeneralItem(name)
        self.model_perspective_items[name].add_item(model_perspective_node)

    def add_kernel_item(self, root_node):
        r"""Record every Kernel device node found under root_node."""
        for device_node in get_device_nodes(root_node):
            if device_node.type == TracerEventType.Kernel:
                name = device_node.name
                if name not in self.kernel_items:
                    self.kernel_items[name] = EventSummary.DeviceItem(name)
                self.kernel_items[name].add_item(device_node)


class MemorySummary:
    r"""
    Aggregate the memory events attached to host nodes in profiling data.
    """

    class MemoryItem:
        r"""
        Running allocation / free totals for one event name on one place.
        """

        def __init__(self, event_name, place, memory_type='Allocated'):
            self.event_name = event_name
            self.place = place
            self.memory_type = memory_type
            # All counters start at zero and are updated by add_memory_record.
            self.allocation_count = 0
            self.allocation_size = 0
            self.free_count = 0
            self.free_size = 0
            self.increase_size = 0

        def add_memory_record(self, size, allocation_type):
            r"""
            Fold one memory record of the given type into the counters.

            A type that is neither an allocation nor a free only prints a
            notice; increase_size is recomputed in every case.
            """
            if allocation_type in (
                TracerMemEventType.Allocate,
                TracerMemEventType.ReservedAllocate,
            ):
                self.allocation_count += 1
                self.allocation_size += size
            elif allocation_type in (
                TracerMemEventType.Free,
                TracerMemEventType.ReservedFree,
            ):
                self.free_count += 1
                # Free records carry a negative size, so subtracting it
                # accumulates a positive freed total.
                self.free_size -= size
            else:
                print("No corresponding type.")
            self.increase_size = self.allocation_size - self.free_size

    def __init__(self):
        # place -> {event name -> MemoryItem}, for allocated memory events
        self.allocated_items = collections.defaultdict(dict)
        # place -> {event name -> MemoryItem}, for reserved memory events
        self.reserved_items = collections.defaultdict(dict)
        # place -> largest peak value seen so far
        self.peak_allocation_values = collections.defaultdict(int)
        self.peak_reserved_values = collections.defaultdict(int)

    def _analyse_node_memory(self, event_name, node):
        r"""
        Account the memory nodes owned by ``node`` under ``event_name``.
        """
        for record in node.mem_node:  # memory nodes of this node itself
            place = record.place
            record_type = record.type
            if record_type in (
                TracerMemEventType.Allocate,
                TracerMemEventType.Free,
            ):
                bucket = self.allocated_items[place]
                memory_type = 'Allocated'
            elif record_type in (
                TracerMemEventType.ReservedAllocate,
                TracerMemEventType.ReservedFree,
            ):
                bucket = self.reserved_items[place]
                memory_type = 'Reserved'
            else:
                bucket = None

            if bucket is not None:
                if event_name not in bucket:
                    bucket[event_name] = MemorySummary.MemoryItem(
                        event_name, place, memory_type
                    )
                bucket[event_name].add_memory_record(
                    record.increase_bytes, record_type
                )

            # Peaks are tracked for every record, whatever its type.
            self.peak_allocation_values[place] = max(
                self.peak_allocation_values[place], record.peak_allocated
            )
            self.peak_reserved_values[place] = max(
                self.peak_reserved_values[place], record.peak_reserved
            )

    def parse(self, nodetrees):
        r"""
        Analyse the memory events in the node trees.
        """
        for host_nodes in traverse_tree(nodetrees).values():
            for host_node in host_nodes[1:]:  # skip root node
                node_type = host_node.type
                if node_type == TracerEventType.OperatorInner:
                    continue
                if node_type == TracerEventType.Operator:
                    # Memory of an operator's direct children is reported
                    # under the operator's own name.
                    for child in host_node.children_node:
                        self._analyse_node_memory(host_node.name, child)
                self._analyse_node_memory(host_node.name, host_node)


class StatisticData:
    r"""
    Container for the node trees, the extra info and every summary built
    from them.
    """

    def __init__(self, node_trees, extra_info):
        self.node_trees = node_trees
        self.extra_info = extra_info
        # Create all summaries first, then parse them in a fixed order.
        self.time_range_summary = TimeRangeSummary()
        self.event_summary = EventSummary()
        self.distributed_summary = DistributedSummary()
        self.memory_summary = MemorySummary()
        summaries_in_parse_order = (
            self.time_range_summary,
            self.event_summary,
            self.distributed_summary,
            self.memory_summary,
        )
        for summary in summaries_in_parse_order:
            summary.parse(node_trees)


def _build_table(
    statistic_data,
    sorted_by=SortedKeys.CPUTotal,
    op_detail=True,
    thread_sep=False,
    time_unit='ms',
    row_limit=100,
    max_src_column_width=75,
    views=None,
):
    from .profiler import SummaryView

    """Prints a summary of events."""
    # format table row
    SPACING_SIZE = 2
    row_format_list = [""]
    header_sep_list = [""]
    line_length_list = [-SPACING_SIZE]

    def add_column(padding, text_dir='<'):
        row_format_list[0] += (
            '{: ' + text_dir + str(padding) + '}' + (' ' * SPACING_SIZE)
        )
        header_sep_list[0] += '-' * padding + (' ' * SPACING_SIZE)
        line_length_list[0] += padding + SPACING_SIZE

    def add_title(padding, text):
        left_length = padding - len(text)
        half = left_length // 2
        return '-' * half + text + '-' * (left_length - half)

    result = []

    def append(s):
        result.append(s)
        result.append('\n')

    def format_time(time, unit='ms', indent=0):
        r"""
        Transform time in ns to time in unit.
        """
        if time == float('inf'):
            return '-'
        else:
            result = float(time)
            if unit == 's':
                result /= 1e9
            elif unit == 'ms':
                result /= 1e6
            elif unit == 'us':
                result /= 1e3
            return '{}{:.2f}'.format(' ' * indent, result)

    def format_ratio(ratio, indent=0):
        r"""
        Transform ratio within [0, 1] to percentage presentation.
        """
        return '{}{:.2f}'.format(' ' * indent, ratio * 100)

    total_time = statistic_data.time_range_summary.get_cpu_range_sum(
        TracerEventType.ProfileStep
    )

    if views is None or SummaryView.DeviceView in views:
        # ----- Print Device Summary ----- #
        headers = ['Device', 'Utilization (%)']
        name_column_width = 30
        DEFAULT_COLUMN_WIDTH = 20
        add_column(name_column_width)
        for _ in headers[1:]:
            add_column(DEFAULT_COLUMN_WIDTH)

        row_format = row_format_list[0]
        header_sep = header_sep_list[0]
        line_length = line_length_list[0]

        # construct table string

        append(add_title(line_length, "Device Summary"))
        append(header_sep)
        append(row_format.format(*headers))
        append(header_sep)
        row_values = [
            'CPU(Process)',
            format_ratio(
                float(statistic_data.extra_info['Process Cpu Utilization'])
            ),
        ]
        append(row_format.format(*row_values))
        row_values = [
            'CPU(System)',
            format_ratio(
                float(statistic_data.extra_info['System Cpu Utilization'])
            ),
        ]
        append(row_format.format(*row_values))
        for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
            gpu_time = float(
                statistic_data.time_range_summary.get_gpu_range_sum(
                    gpu_name, TracerEventType.Kernel
                )
            )
            utilization = gpu_time / total_time
            row_values = [f'GPU{gpu_name}', format_ratio(utilization)]
            append(row_format.format(*row_values))

        append(header_sep)
        append(
            "Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
            "CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
            "GPU Utilization = Current process GPU time / elapsed time."
        )
        append('-' * line_length)
        append('')
        append('')

        if total_time == 0:
            return ''.join(result)

    if views is None or SummaryView.OverView in views:
        # ----- Print Overview Summary ----- #
        headers = ['Event Type', 'Calls', 'CPU Time', 'Ratio (%)']
        row_format_list = [""]
        header_sep_list = [""]
        line_length_list = [-SPACING_SIZE]

        DEFAULT_COLUMN_WIDTH = 25
        for _ in headers:
            add_column(DEFAULT_COLUMN_WIDTH)

        row_format = row_format_list[0]
        header_sep = header_sep_list[0]
        line_length = line_length_list[0]

        # construct table string
        append(add_title(line_length, "Overview Summary"))
        append(f'Time unit: {time_unit}')
        append(header_sep)
        append(row_format.format(*headers))
        append(header_sep)
        cpu_type_time = collections.defaultdict(int)
        gpu_type_time = collections.defaultdict(int)
        cpu_call_times = collections.defaultdict(int)
        gpu_call_times = collections.defaultdict(int)
        cpu_call_times.update(statistic_data.time_range_summary.call_times)
        gpu_call_times.update(statistic_data.time_range_summary.call_times)

        for (
            event_type,
            value,
        ) in statistic_data.time_range_summary.CPUTimeRangeSum.items():
            if event_type != TracerEventType.Communication:
                cpu_type_time[event_type] = value
        if statistic_data.distributed_summary.cpu_communication_range:
            cpu_type_time[TracerEventType.Communication] = sum_ranges(
                statistic_data.distributed_summary.cpu_communication_range
            )
            cpu_call_times[
                TracerEventType.Communication
            ] = statistic_data.distributed_summary.cpu_calls

        for event_type in [
            TracerEventType.Dataloader,
            TracerEventType.Forward,
            TracerEventType.Backward,
            TracerEventType.Optimization,
        ]:
            event_type_name = str(event_type).split('.')[1]
            if (
                event_type in cpu_call_times
                and event_type_name
                in statistic_data.event_summary.model_perspective_items
            ):
                cpu_call_times[
                    event_type
                ] = statistic_data.event_summary.model_perspective_items[
                    event_type_name
                ].call
                cpu_type_time[
                    event_type
                ] = statistic_data.event_summary.model_perspective_items[
                    event_type_name
                ].cpu_time

        gpu_time_range = collections.defaultdict(list)
        for (
            device_id,
            device_time_ranges,
        ) in statistic_data.time_range_summary.GPUTimeRange.items():
            for event_type, time_range in device_time_ranges.items():
                gpu_time_range[event_type] = merge_ranges(
                    gpu_time_range[event_type], time_range, is_sorted=True
                )
        for event_type, time_range in gpu_time_range.items():
            gpu_type_time[event_type] = sum_ranges(time_range)
        if statistic_data.distributed_summary.gpu_communication_range:
            gpu_type_time[TracerEventType.Communication] = sum_ranges(
                statistic_data.distributed_summary.gpu_communication_range
            )
            gpu_call_times[
                TracerEventType.Communication
            ] = statistic_data.distributed_summary.gpu_calls

        sorted_items = sorted(
            cpu_type_time.items(), key=lambda x: x[1], reverse=True
        )
        event_type, time = sorted_items[0]
        row_values = [
            '{}'.format(str(event_type).split('.')[1]),
            cpu_call_times[event_type],
            format_time(time, unit=time_unit),
            format_ratio(float(time) / total_time),
        ]
        append(row_format.format(*row_values))
        for event_type, time in sorted_items[1:]:
            row_values = [
                '  {}'.format(str(event_type).split('.')[1]),
                cpu_call_times[event_type],
                format_time(time, unit=time_unit),
                format_ratio(float(time) / total_time),
            ]
            append(row_format.format(*row_values))
        append(header_sep)
        headers = ['', 'Calls', 'GPU Time', 'Ratio (%)']
        append(row_format.format(*headers))
        append(header_sep)
        for event_type, time in gpu_type_time.items():
            row_values = [
                '  {}'.format(str(event_type).split('.')[1]),
                gpu_call_times[event_type],
                format_time(time, unit=time_unit),
                format_ratio(float(time) / total_time),
            ]
            append(row_format.format(*row_values))

        append(header_sep)
        append(
            "Note:\nIn this table, We sum up all collected events in terms of event type.\n"
            "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
            "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
            "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
            "Example:\n"
            "Thread 1:\n"
            "  Operator: |___________|     |__________|\n"
            "Thread 2:\n"
            "  Operator:   |____________|     |___|\n"
            "After merged:\n"
            "  Result:   |______________|  |__________|\n"
        )
        append('-' * line_length)
        append('')
        append('')

    if views is None or SummaryView.ModelView in views:
        # ----- Print Model Summary Report ----- #
        model_perspective_items = (
            statistic_data.event_summary.model_perspective_items
        )
        if len(model_perspective_items) > 1:
            all_row_values = []
            accumulation_time = 0
            gpu_accumulation_time = 0
            gpu_total_time = (
                statistic_data.event_summary.model_perspective_items[
                    'ProfileStep'
                ].gpu_time
            )
            for name in [
                'ProfileStep',
                'Dataloader',
                'Forward',
                'Backward',
                'Optimization',
            ]:
                if name in model_perspective_items:
                    item = model_perspective_items[name]
                    if gpu_total_time == 0:
                        gpu_ratio = 0
                    else:
                        gpu_ratio = float(item.gpu_time) / gpu_total_time
                    name = f'{name}' if 'ProfileStep' in name else f'  {name}'
                    row_values = [
                        f'{name}',
                        item.call,
                        f'{format_time(item.cpu_time, unit=time_unit)} / {format_time(item.avg_cpu_time, unit=time_unit)} / {format_time(item.max_cpu_time, unit=time_unit)} / {format_time(item.min_cpu_time, unit=time_unit)} / {format_ratio(float(item.cpu_time) / total_time)}',
                        f'{format_time(item.gpu_time, unit=time_unit)} / {format_time(item.avg_gpu_time, unit=time_unit)} / {format_time(item.max_gpu_time, unit=time_unit)} / {format_time(item.min_gpu_time, unit=time_unit)} / {format_ratio(gpu_ratio)}',
                    ]
                    all_row_values.append(row_values)
                    if 'ProfileStep' not in name:
                        accumulation_time += item.cpu_time
                        gpu_accumulation_time += item.gpu_time

            other_time = total_time - accumulation_time
            other_gpu_time = gpu_total_time - gpu_accumulation_time
            if gpu_total_time == 0:
                gpu_ratio = 0
            else:
                gpu_ratio = float(other_gpu_time) / gpu_total_time
            row_values = [
                '  Others',
                '-',
                f'{format_time(other_time, unit=time_unit)} / - / - / - / {format_ratio(float(other_time) / total_time)}',
                f'{format_time(other_gpu_time, unit=time_unit)} / - / - / - / {format_ratio(gpu_ratio)}',
            ]
            all_row_values.append(row_values)
            # Calculate the column width
            calltime_width = 6
            cpu_data_description_width = 40
            gpu_data_description_width = 40
            for row_values in all_row_values:
                if (
                    isinstance(row_values[1], int)
                    and len(str(row_values[1])) > calltime_width
                ):
                    calltime_width = len(str(row_values[1]))
                if len(row_values[2]) > cpu_data_description_width:
                    cpu_data_description_width = len(row_values[2])
                if len(row_values[3]) > gpu_data_description_width:
                    gpu_data_description_width = len(row_values[3])
            headers = [
                'Name',
                'Calls',
                'CPU Total / Avg / Max / Min / Ratio(%)',
                'GPU Total / Avg / Max / Min / Ratio(%)',
            ]
            row_format_list = [""]
            header_sep_list = [""]
            line_length_list = [-SPACING_SIZE]
            name_column_width = 15
            add_column(name_column_width)
            add_column(calltime_width)
            add_column(cpu_data_description_width)
            add_column(gpu_data_description_width)

            row_format = row_format_list[0]
            header_sep = header_sep_list[0]
            line_length = line_length_list[0]

            # construct table string
            append(add_title(line_length, "Model Summary"))
            append(f'Time unit: {time_unit}')
            append(header_sep)
            append(row_format.format(*headers))
            append(header_sep)
            for row_values in all_row_values:
                append(row_format.format(*row_values))
            append(header_sep)
            append(
                "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
                "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
            )
            append('-' * line_length)
            append('')
            append('')

    if views is None or SummaryView.DistributedView in views:
        # ----- Print Distribution Summary Report ----- #
        if statistic_data.distributed_summary.communication_range:
            headers = [
                'Name',
                'Total Time',
                'Ratio (%)',
            ]
            row_format_list = [""]
            header_sep_list = [""]
            line_length_list = [-SPACING_SIZE]

            DEFAULT_COLUMN_WIDTH = 25
            for _ in headers:
                add_column(DEFAULT_COLUMN_WIDTH)

            row_format = row_format_list[0]
            header_sep = header_sep_list[0]
            line_length = line_length_list[0]

            # construct table string
            append(add_title(line_length, "Distribution Summary"))
            append(f'Time unit: {time_unit}')
            append(header_sep)
            append(row_format.format(*headers))
            append(header_sep)
            communication_time = sum_ranges(
                statistic_data.distributed_summary.communication_range
            )
            computation_time = sum_ranges(
                statistic_data.distributed_summary.computation_range
            )
            overlap_time = sum_ranges(
                statistic_data.distributed_summary.overlap_range
            )
            row_values = [
                'ProfileStep',
                format_time(total_time, unit=time_unit),
                format_ratio(float(total_time) / total_time),
            ]
            append(row_format.format(*row_values))
            row_values = [
                '  Communication',
                format_time(communication_time, unit=time_unit),
                format_ratio(float(communication_time) / total_time),
            ]
            append(row_format.format(*row_values))

            row_values = [
                '  Computation',
                format_time(computation_time, unit=time_unit),
                format_ratio(float(computation_time) / total_time),
            ]
            append(row_format.format(*row_values))

            row_values = [
                '  Overlap',
                format_time(overlap_time, unit=time_unit),
                format_ratio(float(overlap_time) / total_time),
            ]
            append(row_format.format(*row_values))
            append(header_sep)
            append(
                "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
                "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
                "Overlap time: Communication time intersects with computation time.\n"
                "Example:\n"
                "Communication:\n"
                "  CPU:              |_________________|\n"
                "  GPU:                                  |______________|\n"
                "  Total:            |_________________| |______________|\n"
                "Computation time(Kernel):\n"
                "  GPU:         |________________|\n"
                "Overlap time:       |___________|\n"
            )
            append('-' * line_length)
            append('')
            append('')

    if views is None or SummaryView.OperatorView in views:
        # ----- Print Operator Summary Report ----- #
        if statistic_data.event_summary.items:
            all_row_values = []
            name_column_width = 52
            if thread_sep:
                thread_items = statistic_data.event_summary.thread_items
            else:
                thread_items = {
                    'All threads merged': statistic_data.event_summary.items
                }
            for thread_id, items in thread_items.items():
                all_row_values.append(f"Thread: {thread_id}")
                if sorted_by == SortedKeys.CPUTotal:
                    sorted_items = sorted(
                        items.items(), key=lambda x: x[1].cpu_time, reverse=True
                    )
                elif sorted_by == SortedKeys.CPUAvg:
                    sorted_items = sorted(
                        items.items(),
                        key=lambda x: x[1].avg_cpu_time,
                        reverse=True,
                    )
                elif sorted_by == SortedKeys.CPUMax:
                    sorted_items = sorted(
                        items.items(),
                        key=lambda x: x[1].max_cpu_time,
                        reverse=True,
                    )
                elif sorted_by == SortedKeys.CPUMin:
                    sorted_items = sorted(
                        items.items(), key=lambda x: x[1].min_cpu_time
                    )
                elif sorted_by == SortedKeys.GPUTotal:
                    sorted_items = sorted(
                        items.items(),
                        key=lambda x: x[1].general_gpu_time,
                        reverse=True,
                    )
                elif sorted_by == SortedKeys.GPUAvg:
                    sorted_items = sorted(
                        items.items(),
                        key=lambda x: x[1].avg_general_gpu_time,
                        reverse=True,
                    )
                elif sorted_by == SortedKeys.GPUMax:
                    sorted_items = sorted(
                        items.items(),
                        key=lambda x: x[1].max_general_gpu_time,
                        reverse=True,
                    )
                elif sorted_by == SortedKeys.GPUMin:
                    sorted_items = sorted(
                        items.items(), key=lambda x: x[1].min_general_gpu_time
                    )
                total_op_cpu_time = 0
                total_op_gpu_time = 0

                for name, item in sorted_items:
                    total_op_cpu_time += item.cpu_time
                    total_op_gpu_time += item.general_gpu_time

                for name, item in sorted_items:
                    if total_op_cpu_time == 0:
                        cpu_ratio = 0
                    else:
                        cpu_ratio = float(item.cpu_time) / total_op_cpu_time
                    if total_op_gpu_time == 0:
                        gpu_ratio = 0
                    else:
                        gpu_ratio = (
                            float(item.general_gpu_time) / total_op_gpu_time
                        )
                    row_values = [
                        name,
                        item.call,
                        f'{format_time(item.cpu_time, unit=time_unit)} / {format_time(item.avg_cpu_time, unit=time_unit)} / {format_time(item.max_cpu_time, unit=time_unit)} / {format_time(item.min_cpu_time, unit=time_unit)} / {format_ratio(cpu_ratio)}',
                        '{} / {} / {} / {} / {}'.format(
                            format_time(item.general_gpu_time, unit=time_unit),
                            format_time(
                                item.avg_general_gpu_time, unit=time_unit
                            ),
                            format_time(
                                item.max_general_gpu_time, unit=time_unit
                            ),
                            format_time(
                                item.min_general_gpu_time, unit=time_unit
                            ),
                            format_ratio(gpu_ratio),
                        ),
                        item.flops,
                    ]
                    all_row_values.append(row_values)
                    if op_detail:
                        for (
                            innerop_name,
                            innerop_node,
                        ) in item.operator_inners.items():
                            if item.cpu_time == 0:
                                cpu_ratio = 0
                            else:
                                cpu_ratio = (
                                    float(innerop_node.cpu_time) / item.cpu_time
                                )
                            if item.general_gpu_time == 0:
                                gpu_ratio = 0
                            else:
                                gpu_ratio = (
                                    float(innerop_node.general_gpu_time)
                                    / item.general_gpu_time
                                )
                            if len(innerop_name) + 2 > name_column_width:
                                innerop_name = innerop_name[
                                    : name_column_width - 5
                                ]
                                innerop_name += "..."
                            row_values = [
                                f'  {innerop_name}',
                                innerop_node.call,
                                '{} / {} / {} / {} / {}'.format(
                                    format_time(
                                        innerop_node.cpu_time, unit=time_unit
                                    ),
                                    format_time(
                                        innerop_node.avg_cpu_time,
                                        unit=time_unit,
                                    ),
                                    format_time(
                                        innerop_node.max_cpu_time,
                                        unit=time_unit,
                                    ),
                                    format_time(
                                        innerop_node.min_cpu_time,
                                        unit=time_unit,
                                    ),
                                    format_ratio(cpu_ratio),
                                ),
                                '{} / {} / {} / {} / {}'.format(
                                    format_time(
                                        innerop_node.general_gpu_time,
                                        unit=time_unit,
                                    ),
                                    format_time(
                                        innerop_node.avg_general_gpu_time,
                                        unit=time_unit,
                                    ),
                                    format_time(
                                        innerop_node.max_general_gpu_time,
                                        unit=time_unit,
                                    ),
                                    format_time(
                                        innerop_node.min_general_gpu_time,
                                        unit=time_unit,
                                    ),
                                    format_ratio(gpu_ratio),
                                ),
                                '-',
                            ]
                            all_row_values.append(row_values)
                            for (
                                device_node_name,
                                device_node,
                            ) in innerop_node.devices.items():
                                if innerop_node.general_gpu_time == 0:
                                    gpu_ratio = 0
                                else:
                                    gpu_ratio = (
                                        float(device_node.gpu_time)
                                        / innerop_node.general_gpu_time
                                    )
                                if (
                                    len(device_node_name) + 4
                                    > name_column_width
                                ):
                                    device_node_name = device_node_name[
                                        : name_column_width - 7
                                    ]
                                    device_node_name += "..."
                                row_values = [
                                    f'    {device_node_name}',
                                    device_node.call,
                                    '- / - / - / - / -',
                                    '{} / {} / {} / {} / {}'.format(
                                        format_time(
                                            device_node.gpu_time, unit=time_unit
                                        ),
                                        format_time(
                                            device_node.avg_gpu_time,
                                            unit=time_unit,
                                        ),
                                        format_time(
                                            device_node.max_gpu_time,
                                            unit=time_unit,
                                        ),
                                        format_time(
                                            device_node.min_gpu_time,
                                            unit=time_unit,
                                        ),
                                        format_ratio(gpu_ratio),
                                    ),
                                    '-',
                                ]
                                all_row_values.append(row_values)
                        for (
                            device_node_name,
                            device_node,
                        ) in item.devices.items():
                            if item.general_gpu_time == 0:
                                gpu_ratio = 0
                            else:
                                gpu_ratio = (
                                    float(device_node.gpu_time)
                                    / item.general_gpu_time
                                )
                            if len(device_node_name) + 2 > name_column_width:
                                device_node_name = device_node_name[
                                    : name_column_width - 5
                                ]
                                device_node_name += "..."
                            row_values = [
                                f'  {device_node_name}',
                                device_node.call,
                                '- / - / - / - / -',
                                '{} / {} / {} / {} / {}'.format(
                                    format_time(
                                        device_node.gpu_time, unit=time_unit
                                    ),
                                    format_time(
                                        device_node.avg_gpu_time, unit=time_unit
                                    ),
                                    format_time(
                                        device_node.max_gpu_time, unit=time_unit
                                    ),
                                    format_time(
                                        device_node.min_gpu_time, unit=time_unit
                                    ),
                                    format_ratio(gpu_ratio),
                                ),
                                '-',
                            ]
                            all_row_values.append(row_values)
            # Calculate the column width
            calltime_width = 6
            cpu_data_description_width = 40
            gpu_data_description_width = 40
            flops_width = 10
            for row_values in all_row_values:
                if isinstance(row_values, str):
                    continue
                if (
                    isinstance(row_values[1], int)
                    and len(str(row_values[1])) > calltime_width
                ):
                    calltime_width = len(str(row_values[1]))
                if len(row_values[2]) > cpu_data_description_width:
                    cpu_data_description_width = len(row_values[2])
                if len(row_values[3]) > gpu_data_description_width:
                    gpu_data_description_width = len(row_values[3])
            headers = [
                'Name',
                'Calls',
                'CPU Total / Avg / Max / Min / Ratio(%)',
                'GPU Total / Avg / Max / Min / Ratio(%)',
                'FLOPs',
            ]
            row_format_list = [""]
            header_sep_list = [""]
            line_length_list = [-SPACING_SIZE]
            add_column(name_column_width)
            add_column(calltime_width)
            add_column(cpu_data_description_width)
            add_column(gpu_data_description_width)
            add_column(flops_width)

            row_format = row_format_list[0]
            header_sep = header_sep_list[0]
            line_length = line_length_list[0]

            # construct table string
            append(add_title(line_length, "Operator Summary"))
            append(f'Time unit: {time_unit}')
            append(header_sep)
            append(row_format.format(*headers))
            append(header_sep)
            for row_values in all_row_values:
                if isinstance(row_values, str):
                    append(add_title(line_length, row_values))
                else:
                    append(row_format.format(*row_values))
            append(header_sep)
            append('')
            append('')

    if views is None or SummaryView.KernelView in views:
        # ----- Print Kernel Summary Report ----- #
        # One row per kernel: name, call count, and GPU total / avg / max /
        # min time followed by the kernel's share of the summed GPU time of
        # all listed kernels.
        if statistic_data.event_summary.kernel_items:
            kernel_items = statistic_data.event_summary.kernel_items

            # Only the GPU avg / max / min keys are special-cased here; every
            # other sort key orders by total GPU time. GPUMin is the single
            # ascending order.
            kernel_sort_attr, kernel_sort_descending = 'gpu_time', True
            for candidate_key, candidate_attr, candidate_descending in (
                (SortedKeys.GPUAvg, 'avg_gpu_time', True),
                (SortedKeys.GPUMax, 'max_gpu_time', True),
                (SortedKeys.GPUMin, 'min_gpu_time', False),
            ):
                if sorted_by == candidate_key:
                    kernel_sort_attr = candidate_attr
                    kernel_sort_descending = candidate_descending
                    break
            sorted_items = sorted(
                kernel_items.items(),
                key=lambda entry: getattr(entry[1], kernel_sort_attr),
                reverse=kernel_sort_descending,
            )

            # Denominator of the per-kernel ratio column.
            total_kernel_gpu_time = 0
            for _, item in sorted_items:
                total_kernel_gpu_time += item.gpu_time

            all_row_values = []
            for name, item in sorted_items:
                # A zero total would make the ratio undefined; report 0.
                gpu_ratio = (
                    0
                    if total_kernel_gpu_time == 0
                    else float(item.gpu_time) / total_kernel_gpu_time
                )
                all_row_values.append(
                    [
                        name,
                        item.call,
                        '{} / {} / {} / {} / {}'.format(
                            format_time(item.gpu_time, unit=time_unit),
                            format_time(item.avg_gpu_time, unit=time_unit),
                            format_time(item.max_gpu_time, unit=time_unit),
                            format_time(item.min_gpu_time, unit=time_unit),
                            format_ratio(gpu_ratio),
                        ),
                    ]
                )

            headers = [
                'Name',
                'Calls',
                'GPU Total / Avg / Max / Min / Ratio(%)',
            ]

            # Column widths: the name column is fixed, the other two grow
            # from their minimum to fit the widest cell.
            name_column_width = 90
            calltime_width = max(
                6,
                max(
                    (
                        len(str(row[1]))
                        for row in all_row_values
                        if isinstance(row[1], int)
                    ),
                    default=0,
                ),
            )
            gpu_data_description_width = max(
                40,
                max((len(row[2]) for row in all_row_values), default=0),
            )

            # These single-element lists are reset before every table and
            # then read back after the add_column calls (add_column is
            # defined outside this section; presumably it updates them).
            row_format_list = [""]
            header_sep_list = [""]
            line_length_list = [-SPACING_SIZE]
            for column_width in (
                name_column_width,
                calltime_width,
                gpu_data_description_width,
            ):
                add_column(column_width)

            row_format = row_format_list[0]
            header_sep = header_sep_list[0]
            line_length = line_length_list[0]

            # construct table string
            append(add_title(line_length, "Kernel Summary"))
            append(f'Time unit: {time_unit}')
            append(header_sep)
            append(row_format.format(*headers))
            append(header_sep)
            # Splits a kernel name into prefix, '<...>' part and a trailing
            # '(...)' part; the displayed name drops the trailing part.
            kernel_name_pattern = re.compile(r'(.+?)(<.*>)(\(.*\))')
            for row_values in all_row_values:
                match = kernel_name_pattern.match(row_values[0])
                name = (
                    match.group(1) + match.group(2) if match else row_values[0]
                )
                # Truncate over-long names so they fit the fixed column.
                if len(name) > name_column_width:
                    name = name[: name_column_width - 3] + '...'
                row_values[0] = name
                append(row_format.format(*row_values))
            append(header_sep)
            append('')
            append('')

    if views is None or SummaryView.MemoryManipulationView in views:
        # ----- Print Memory Manipulation Summary Report ----- #
        # One row per memory-manipulation event: name, call count, then the
        # CPU and GPU "total / avg / max / min / ratio" descriptions.
        if statistic_data.event_summary.memory_manipulation_items:
            all_row_values = []
            memory_manipulation_items = (
                statistic_data.event_summary.memory_manipulation_items
            )
            # GPU ratios are relative to the 'ProfileStep' entry's general
            # GPU time; CPU ratios are relative to total_time (computed
            # earlier in this function).
            gpu_total_time = (
                statistic_data.event_summary.model_perspective_items[
                    'ProfileStep'
                ].general_gpu_time
            )
            for name, item in memory_manipulation_items.items():
                # Guard both denominators: a zero total would otherwise
                # raise ZeroDivisionError while formatting the row.
                if total_time == 0:
                    cpu_ratio = 0
                else:
                    cpu_ratio = float(item.cpu_time) / total_time
                if gpu_total_time == 0:
                    gpu_ratio = 0
                else:
                    gpu_ratio = float(item.general_gpu_time) / gpu_total_time
                row_values = [
                    name,
                    item.call,
                    '{} / {} / {} / {} / {}'.format(
                        format_time(item.cpu_time, unit=time_unit),
                        format_time(item.avg_cpu_time, unit=time_unit),
                        format_time(item.max_cpu_time, unit=time_unit),
                        format_time(item.min_cpu_time, unit=time_unit),
                        format_ratio(cpu_ratio),
                    ),
                    '{} / {} / {} / {} / {}'.format(
                        format_time(item.general_gpu_time, unit=time_unit),
                        format_time(item.avg_general_gpu_time, unit=time_unit),
                        format_time(item.max_general_gpu_time, unit=time_unit),
                        format_time(item.min_general_gpu_time, unit=time_unit),
                        format_ratio(gpu_ratio),
                    ),
                ]
                all_row_values.append(row_values)

            headers = [
                'Name',
                'Calls',
                'CPU Total / Avg / Max / Min / Ratio(%)',
                'GPU Total / Avg / Max / Min / Ratio(%)',
            ]
            # Calculate the column width: each column starts at its minimum
            # and grows to fit the widest cell.
            name_column_width = 0
            calltime_width = 6
            cpu_data_description_width = 40
            gpu_data_description_width = 40
            for row_values in all_row_values:
                name_column_width = max(name_column_width, len(row_values[0]))
                if isinstance(row_values[1], int):
                    calltime_width = max(
                        calltime_width, len(str(row_values[1]))
                    )
                cpu_data_description_width = max(
                    cpu_data_description_width, len(row_values[2])
                )
                gpu_data_description_width = max(
                    gpu_data_description_width, len(row_values[3])
                )

            # These single-element lists are reset before every table and
            # then read back after the add_column calls (add_column is
            # defined outside this section; presumably it updates them).
            row_format_list = [""]
            header_sep_list = [""]
            line_length_list = [-SPACING_SIZE]
            add_column(name_column_width)
            add_column(calltime_width)
            add_column(cpu_data_description_width)
            add_column(gpu_data_description_width)

            row_format = row_format_list[0]
            header_sep = header_sep_list[0]
            line_length = line_length_list[0]

            # construct table string
            append(add_title(line_length, "Memory Manipulation Summary"))
            append(f'Time unit: {time_unit}')
            append(header_sep)
            append(row_format.format(*headers))
            append(header_sep)
            for row_values in all_row_values:
                append(row_format.format(*row_values))
            append(header_sep)
            append('')
            append('')

    if views is None or SummaryView.UDFView in views:
        # ----- Print UserDefined Summary Report ----- #
        # One row per user-defined event, grouped under a "Thread: ..." title
        # row per thread (or a single merged group when thread_sep is falsy).
        if statistic_data.event_summary.userdefined_items:
            all_row_values = []
            # GPU ratios are relative to the 'ProfileStep' entry's general
            # GPU time; CPU ratios are relative to total_time (computed
            # earlier in this function).
            gpu_total_time = (
                statistic_data.event_summary.model_perspective_items[
                    'ProfileStep'
                ].general_gpu_time
            )
            if thread_sep:
                userdefined_thread_items = (
                    statistic_data.event_summary.userdefined_thread_items
                )
            else:
                userdefined_thread_items = {
                    'All threads merged': statistic_data.event_summary.userdefined_items
                }

            # Resolve the ordering once; it does not depend on the thread.
            # Each rule is (sort key, item attribute, descending?). The two
            # *Min keys sort ascending, everything else descending.
            udf_sort_rules = (
                (SortedKeys.CPUTotal, 'cpu_time', True),
                (SortedKeys.CPUAvg, 'avg_cpu_time', True),
                (SortedKeys.CPUMax, 'max_cpu_time', True),
                (SortedKeys.CPUMin, 'min_cpu_time', False),
                (SortedKeys.GPUTotal, 'general_gpu_time', True),
                (SortedKeys.GPUAvg, 'avg_general_gpu_time', True),
                (SortedKeys.GPUMax, 'max_general_gpu_time', True),
                (SortedKeys.GPUMin, 'min_general_gpu_time', False),
            )
            # Fallback for an unrecognised sorted_by: total CPU time,
            # descending. Without a fallback `sorted_items` would be unbound
            # here, or would still hold the rows of a previously printed
            # table that used the same variable name.
            udf_sort_attr, udf_sort_descending = 'cpu_time', True
            for rule_key, rule_attr, rule_descending in udf_sort_rules:
                if sorted_by == rule_key:
                    udf_sort_attr = rule_attr
                    udf_sort_descending = rule_descending
                    break

            for thread_id, items in userdefined_thread_items.items():
                # A plain string marks a title row; list entries are data.
                all_row_values.append(f"Thread: {thread_id}")
                sorted_items = sorted(
                    items.items(),
                    key=lambda entry: getattr(entry[1], udf_sort_attr),
                    reverse=udf_sort_descending,
                )

                for name, item in sorted_items:
                    # Guard both denominators: a zero total would otherwise
                    # raise ZeroDivisionError while formatting the row.
                    if total_time == 0:
                        cpu_ratio = 0
                    else:
                        cpu_ratio = float(item.cpu_time) / total_time
                    if gpu_total_time == 0:
                        gpu_ratio = 0
                    else:
                        gpu_ratio = (
                            float(item.general_gpu_time) / gpu_total_time
                        )
                    row_values = [
                        name,
                        item.call,
                        '{} / {} / {} / {} / {}'.format(
                            format_time(item.cpu_time, unit=time_unit),
                            format_time(item.avg_cpu_time, unit=time_unit),
                            format_time(item.max_cpu_time, unit=time_unit),
                            format_time(item.min_cpu_time, unit=time_unit),
                            format_ratio(cpu_ratio),
                        ),
                        '{} / {} / {} / {} / {}'.format(
                            format_time(item.general_gpu_time, unit=time_unit),
                            format_time(
                                item.avg_general_gpu_time, unit=time_unit
                            ),
                            format_time(
                                item.max_general_gpu_time, unit=time_unit
                            ),
                            format_time(
                                item.min_general_gpu_time, unit=time_unit
                            ),
                            format_ratio(gpu_ratio),
                        ),
                    ]
                    all_row_values.append(row_values)

            # Calculate the column width: each column starts at its minimum
            # and grows to fit the widest data cell (title rows are skipped).
            name_column_width = 0
            calltime_width = 6
            cpu_data_description_width = 40
            gpu_data_description_width = 40
            for row_values in all_row_values:
                if isinstance(row_values, str):
                    continue
                name_column_width = max(name_column_width, len(row_values[0]))
                if isinstance(row_values[1], int):
                    calltime_width = max(
                        calltime_width, len(str(row_values[1]))
                    )
                cpu_data_description_width = max(
                    cpu_data_description_width, len(row_values[2])
                )
                gpu_data_description_width = max(
                    gpu_data_description_width, len(row_values[3])
                )

            headers = [
                'Name',
                'Calls',
                'CPU Total / Avg / Max / Min / Ratio(%)',
                'GPU Total / Avg / Max / Min / Ratio(%)',
            ]
            # These single-element lists are reset before every table and
            # then read back after the add_column calls (add_column is
            # defined outside this section; presumably it updates them).
            row_format_list = [""]
            header_sep_list = [""]
            line_length_list = [-SPACING_SIZE]

            add_column(name_column_width)
            add_column(calltime_width)
            add_column(cpu_data_description_width)
            add_column(gpu_data_description_width)

            row_format = row_format_list[0]
            header_sep = header_sep_list[0]
            line_length = line_length_list[0]

            # construct table string
            append(add_title(line_length, "UserDefined Summary"))
            append(f'Time unit: {time_unit}')
            append(header_sep)
            append(row_format.format(*headers))
            append(header_sep)
            for row_values in all_row_values:
                if isinstance(row_values, str):
                    append(add_title(line_length, row_values))
                else:
                    append(row_format.format(*row_values))
            # NOTE(review): no closing header_sep is emitted after the rows
            # here; kept as-is so the report text does not change.
            append('')
            append('')

    if views is None or SummaryView.MemoryView in views:
        # ----- Print Memory Summary Report ----- #
        # Emits one table per device type: allocated events first, then
        # reserved events, each group ordered by increase_size descending.
        if (
            statistic_data.memory_summary.allocated_items
            or statistic_data.memory_summary.reserved_items
        ):
            # NOTE(review): the guard above also passes when only
            # reserved_items is non-empty, but tables are produced solely for
            # device types present in allocated_items -- confirm that
            # reserved-only devices are meant to be omitted.
            for (
                device_type,
                memory_events,
            ) in statistic_data.memory_summary.allocated_items.items():
                reserved_events = statistic_data.memory_summary.reserved_items[
                    device_type
                ]

                # Both event groups share the same row layout, so build them
                # with one loop instead of two copies of the same code.
                all_row_values = []
                for events in (memory_events, reserved_events):
                    ordered_events = sorted(
                        events.items(),
                        key=lambda entry: entry[1].increase_size,
                        reverse=True,
                    )
                    for event_name, item in ordered_events:
                        all_row_values.append(
                            [
                                event_name,
                                item.memory_type,
                                item.allocation_count,
                                item.free_count,
                                item.allocation_size,
                                item.free_size,
                                item.increase_size,
                            ]
                        )

                headers = [
                    'Name',
                    'Type',
                    'Allocation Count',
                    'Free Count',
                    'Allocation Size',
                    'Free Size',
                    'Increased Size',
                ]
                # These single-element lists are reset before every table
                # and then read back after the add_column calls (add_column
                # is defined outside this section; presumably it updates
                # them).
                row_format_list = [""]
                header_sep_list = [""]
                line_length_list = [-SPACING_SIZE]
                # Fixed column widths: name, type, then five numeric columns.
                name_column_width = 50
                number_column_width = 15
                add_column(name_column_width)
                add_column(12)
                for _ in range(5):
                    add_column(number_column_width)

                row_format = row_format_list[0]
                header_sep = header_sep_list[0]
                line_length = line_length_list[0]

                # construct table string
                append(
                    add_title(line_length, f"Memory Summary - {device_type}")
                )
                peak_allocated = (
                    statistic_data.memory_summary.peak_allocation_values[
                        device_type
                    ]
                )
                append(f'Peak Allocated Memory: {peak_allocated}')
                peak_reserved = (
                    statistic_data.memory_summary.peak_reserved_values[
                        device_type
                    ]
                )
                append(f'Peak Reserved Memory: {peak_reserved}')
                append(header_sep)
                append(row_format.format(*headers))
                append(header_sep)
                # Every entry is a data row (a list); this section never
                # adds string title rows, so no type check is needed.
                for row_values in all_row_values:
                    append(row_format.format(*row_values))
                append('')
                append('')

    return ''.join(result)