profiler_statistic.py 80 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013
  1. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import collections
  15. import re
  16. from enum import Enum
  17. from paddle.base.core import TracerEventType, TracerMemEventType
  18. from paddle.utils.flops import flops
  19. from .statistic_helper import (
  20. intersection_ranges,
  21. merge_ranges,
  22. merge_self_ranges,
  23. sum_ranges,
  24. )
  25. _AllTracerEventType = [
  26. TracerEventType.Operator,
  27. TracerEventType.Dataloader,
  28. TracerEventType.ProfileStep,
  29. TracerEventType.CudaRuntime,
  30. TracerEventType.Kernel,
  31. TracerEventType.Memcpy,
  32. TracerEventType.Memset,
  33. TracerEventType.UserDefined,
  34. TracerEventType.OperatorInner,
  35. TracerEventType.Forward,
  36. TracerEventType.Backward,
  37. TracerEventType.Optimization,
  38. TracerEventType.Communication,
  39. TracerEventType.PythonOp,
  40. TracerEventType.PythonUserDefined,
  41. ]
  42. _CommunicationOpName = ['allreduce', 'broadcast', 'rpc']
  43. class SortedKeys(Enum):
  44. r"""
  45. SortedKeys is used to specify how to sort items when printing ``paddle.profiler.Profiler.summary`` table.
  46. The meaning of each SortedKeys is as following
  47. - **SortedKeys.CPUTotal** : Sorted by CPU total time.
  48. - **SortedKeys.CPUAvg** : Sorted by CPU average time.
  49. - **SortedKeys.CPUMax** : Sorted by CPU max time.
  50. - **SortedKeys.CPUMin** : Sorted by CPU min time.
  51. - **SortedKeys.GPUTotal** : Sorted by GPU total time.
  52. - **SortedKeys.GPUAvg** : Sorted by GPU average time.
  53. - **SortedKeys.GPUMax** : Sorted by GPU max time.
  54. - **SortedKeys.GPUMin** : Sorted by GPU min time.
  55. """
  56. CPUTotal = 0
  57. CPUAvg = 1
  58. CPUMax = 2
  59. CPUMin = 3
  60. GPUTotal = 4
  61. GPUAvg = 5
  62. GPUMax = 6
  63. GPUMin = 7
  64. def _nodename2opname(name):
  65. r'''
  66. convert static host node name to operator name
  67. '''
  68. op_name = name.replace(' compute', '')
  69. op_name = op_name.replace(' dygraph', '')
  70. op_name = op_name.replace(' pybind_imperative_func', '')
  71. return op_name
  72. class HostStatisticNode:
  73. r'''
  74. Wrap original node for calculating statistic metrics.
  75. '''
  76. def __init__(self, hostnode):
  77. self.hostnode = hostnode
  78. self.children_node = []
  79. self.runtime_node = []
  80. self.cpu_time = 0
  81. self.self_cpu_time = 0
  82. self.gpu_time = 0 # kernel time
  83. self.self_gpu_time = 0
  84. self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset
  85. self.self_general_gpu_time = 0
  86. self.flops = 0
  87. def cal_flops(self):
  88. if self.hostnode.type == TracerEventType.Operator:
  89. if hasattr(self.hostnode, 'input_shapes'):
  90. op_name = _nodename2opname(self.hostnode.name)
  91. self.flops = flops(
  92. op_name,
  93. self.hostnode.input_shapes,
  94. self.hostnode.attributes,
  95. )
  96. def cal_statistic(self):
  97. self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
  98. self.self_cpu_time = self.cpu_time
  99. self.cal_flops()
  100. for child in self.children_node:
  101. child.cal_flops()
  102. child.cal_statistic()
  103. self.gpu_time += child.gpu_time
  104. self.general_gpu_time += child.general_gpu_time
  105. self.self_cpu_time -= child.end_ns - child.start_ns
  106. self.flops += child.flops
  107. for rt in self.runtime_node:
  108. rt.cal_statistic()
  109. self.self_cpu_time -= rt.end_ns - rt.start_ns
  110. self.gpu_time += rt.gpu_time
  111. self.self_gpu_time += rt.gpu_time
  112. self.general_gpu_time += rt.general_gpu_time
  113. self.self_general_gpu_time += rt.general_gpu_time
  114. for device in self.hostnode.device_node:
  115. if device.type == TracerEventType.Kernel:
  116. self.gpu_time += device.end_ns - device.start_ns
  117. self.self_gpu_time += device.end_ns - device.start_ns
  118. self.general_gpu_time += device.end_ns - device.start_ns
  119. self.self_general_gpu_time += device.end_ns - device.start_ns
  120. @property
  121. def end_ns(self):
  122. return self.hostnode.end_ns
  123. @property
  124. def start_ns(self):
  125. return self.hostnode.start_ns
  126. def __getattr__(self, name):
  127. return getattr(self.hostnode, name)
  128. def traverse_tree(nodetrees):
  129. results = collections.defaultdict(list)
  130. for thread_id, rootnode in nodetrees.items():
  131. stack = []
  132. stack.append(rootnode)
  133. threadlist = results[thread_id]
  134. while stack:
  135. current_node = stack.pop()
  136. threadlist.append(current_node)
  137. for childnode in current_node.children_node:
  138. stack.append(childnode)
  139. return results
  140. def get_device_nodes(hostnode):
  141. '''
  142. Get all device nodes called in the time range of hostnode.
  143. '''
  144. stack = []
  145. device_nodes = []
  146. stack.append(hostnode)
  147. while stack:
  148. current_node = stack.pop()
  149. for childnode in current_node.children_node:
  150. stack.append(childnode)
  151. for runtimenode in current_node.runtime_node:
  152. for devicenode in runtimenode.device_node:
  153. device_nodes.append(devicenode)
  154. return device_nodes
  155. def _build_layer_from_tree(nodetrees):
  156. def build_layer(node, depth=0):
  157. if "GradNode" in node.name:
  158. return [], 0
  159. if node.type in [
  160. TracerEventType.Backward,
  161. TracerEventType.Optimization,
  162. ]:
  163. return [], 0
  164. if node.type == TracerEventType.Operator:
  165. stat_node = HostStatisticNode(node)
  166. stat_node.cal_statistic()
  167. return stat_node, stat_node.flops
  168. layer = []
  169. nflops = 0
  170. for c in node.children_node:
  171. l, f = build_layer(c, depth + 1)
  172. if l:
  173. nflops += f
  174. layer.append(l)
  175. if node.type == TracerEventType.Forward:
  176. stat_node = HostStatisticNode(node)
  177. stat_node.cal_statistic()
  178. stat_node.flops = nflops
  179. return [stat_node, layer], nflops
  180. return layer, nflops
  181. ret = []
  182. for _, rootnode in nodetrees.items():
  183. layer, _ = build_layer(rootnode)
  184. ret.append(layer)
  185. return ret
  186. def _format_large_number(n, precision=2):
  187. if n // 1e12 > 0:
  188. return f"{round(n / 1e12, precision)} T"
  189. if n // 1e9 > 0:
  190. return f"{round(n / 1e9, precision)} G"
  191. if n // 1e6 > 0:
  192. return f"{round(n / 1e6, precision)} M"
  193. if n // 1e3 > 0:
  194. return f"{round(n / 1e3, precision)} K"
  195. return f"{round(n, precision)}"
  196. def _format_time(n, precision=2):
  197. if n // 1e9 > 0:
  198. return f"{round(n / 1e9, precision)} s"
  199. if n // 1e6 > 0:
  200. return f"{round(n / 1e6, precision)} ms"
  201. if n // 1e3 > 0:
  202. return f"{round(n / 1e3, precision)} us"
  203. return f"{round(n, precision)} ns"
  204. def _gen_layer_flops(node, repeat=1):
  205. ret = []
  206. offset = []
  207. loop = []
  208. def print_layer_tree(node, depth=0):
  209. if isinstance(node, list):
  210. for n in node:
  211. print_layer_tree(n, depth + 1)
  212. elif node.type in [TracerEventType.Forward, TracerEventType.Operator]:
  213. if len(offset) == 0:
  214. offset.append(depth)
  215. name = _nodename2opname(node.name)
  216. if (
  217. depth == offset[-1] and len(ret) > 0 and ret[0].startswith(name)
  218. ): # repeat begin
  219. loop.append(1)
  220. if len(loop) >= repeat:
  221. return "".join(ret)
  222. align = " " * (depth - offset[-1])
  223. tm = _format_time(node.cpu_time)
  224. flops_n = _format_large_number(node.flops)
  225. flops_s = _format_large_number(node.flops * 1e9 / node.cpu_time)
  226. ret.append(
  227. f"{align}{name} latency: {tm}, FLOPs: {flops_n}, FLOPS: {flops_s}\n"
  228. )
  229. for n in node[1:]:
  230. print_layer_tree(n)
  231. return "".join(ret)
  232. def gen_layer_flops(nodetrees, repeat=1):
  233. r'''
  234. gen_layer_flops generate flops/runtime information depend on layer/operator.
  235. '''
  236. layer_tree = _build_layer_from_tree(nodetrees)
  237. return _gen_layer_flops(layer_tree, repeat)
  238. def wrap_tree(nodetrees):
  239. '''
  240. Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
  241. '''
  242. node_statistic_tree = {}
  243. results = collections.defaultdict(list)
  244. newresults = collections.defaultdict(list)
  245. for thread_id, rootnode in nodetrees.items():
  246. stack = []
  247. stack.append(rootnode)
  248. root_statistic_node = HostStatisticNode(rootnode)
  249. newstack = []
  250. newstack.append(root_statistic_node)
  251. node_statistic_tree[thread_id] = root_statistic_node
  252. threadlist = results[thread_id]
  253. newthreadlist = newresults[thread_id]
  254. while stack:
  255. current_node = stack.pop()
  256. threadlist.append(current_node)
  257. current_statistic_node = newstack.pop()
  258. newthreadlist.append(current_statistic_node)
  259. for childnode in current_node.children_node:
  260. stack.append(childnode)
  261. child_statistic_node = HostStatisticNode(childnode)
  262. current_statistic_node.children_node.append(
  263. child_statistic_node
  264. )
  265. newstack.append(child_statistic_node)
  266. for runtimenode in current_node.runtime_node:
  267. runtime_statistic_node = HostStatisticNode(runtimenode)
  268. current_statistic_node.runtime_node.append(
  269. runtime_statistic_node
  270. )
  271. # recursive calculate node statistic values
  272. for thread_id, root_statistic_node in node_statistic_tree.items():
  273. root_statistic_node.cal_statistic()
  274. return node_statistic_tree, newresults
  275. class TimeRangeSummary:
  276. r"""
  277. Analyse time ranges for each TracerEventType, and summarize the time.
  278. """
  279. def __init__(self):
  280. self.CPUTimeRange = collections.defaultdict(list)
  281. self.GPUTimeRange = collections.defaultdict(
  282. lambda: collections.defaultdict(list)
  283. ) # GPU events should be divided into different devices
  284. self.CPUTimeRangeSum = collections.defaultdict(int)
  285. self.GPUTimeRangeSum = collections.defaultdict(
  286. lambda: collections.defaultdict(int)
  287. )
  288. self.call_times = collections.defaultdict(int)
  289. def parse(self, nodetrees):
  290. r"""
  291. Analysis node trees in profiler result, and get time range for different tracer event type.
  292. """
  293. thread2hostnodes = traverse_tree(nodetrees)
  294. for threadid, hostnodes in thread2hostnodes.items():
  295. CPUTimeRange = collections.defaultdict(list)
  296. GPUTimeRange = collections.defaultdict(
  297. lambda: collections.defaultdict(
  298. lambda: collections.defaultdict(list)
  299. )
  300. ) # device_id/type/stream_id
  301. for hostnode in hostnodes[1:]: # skip root node
  302. CPUTimeRange[hostnode.type].append(
  303. (hostnode.start_ns, hostnode.end_ns)
  304. )
  305. self.call_times[hostnode.type] += 1
  306. for runtimenode in hostnode.runtime_node:
  307. CPUTimeRange[runtimenode.type].append(
  308. (runtimenode.start_ns, runtimenode.end_ns)
  309. )
  310. self.call_times[runtimenode.type] += 1
  311. for devicenode in runtimenode.device_node:
  312. GPUTimeRange[devicenode.device_id][devicenode.type][
  313. devicenode.stream_id
  314. ].append((devicenode.start_ns, devicenode.end_ns))
  315. self.call_times[devicenode.type] += 1
  316. for event_type, time_ranges in CPUTimeRange.items():
  317. time_ranges = merge_self_ranges(time_ranges, is_sorted=False)
  318. self.CPUTimeRange[event_type] = merge_ranges(
  319. self.CPUTimeRange[event_type], time_ranges, is_sorted=True
  320. )
  321. for device_id, device_time_ranges in GPUTimeRange.items():
  322. for event_type, event_time_ranges in device_time_ranges.items():
  323. for stream_id, time_ranges in event_time_ranges.items():
  324. time_ranges = merge_self_ranges(
  325. time_ranges, is_sorted=False
  326. )
  327. self.GPUTimeRange[device_id][event_type] = merge_ranges(
  328. self.GPUTimeRange[device_id][event_type],
  329. time_ranges,
  330. is_sorted=True,
  331. )
  332. for event_type, time_ranges in self.CPUTimeRange.items():
  333. self.CPUTimeRangeSum[event_type] = sum_ranges(time_ranges)
  334. for device_id, device_time_ranges in self.GPUTimeRange.items():
  335. for event_type, time_ranges in device_time_ranges.items():
  336. self.GPUTimeRangeSum[device_id][event_type] = sum_ranges(
  337. time_ranges
  338. )
  339. def get_gpu_devices(self):
  340. return self.GPUTimeRange.keys()
  341. def get_gpu_range_sum(self, device_id, event_type):
  342. return self.GPUTimeRangeSum[device_id][event_type]
  343. def get_cpu_range_sum(self, event_type):
  344. return self.CPUTimeRangeSum[event_type]
  345. class DistributedSummary:
  346. r"""
  347. Analysis communication and computation time range, and their overlap.
  348. The computation time is all kernel except kernels for communication like nccl.
  349. """
  350. def __init__(self):
  351. self.cpu_communication_range = []
  352. self.gpu_communication_range = []
  353. self.communication_range = []
  354. self.computation_range = []
  355. self.overlap_range = []
  356. self.cpu_calls = 0
  357. self.gpu_calls = 0
  358. def parse(self, nodetrees):
  359. '''
  360. Collect all communication and computation time ranges.
  361. '''
  362. thread2hostnodes = traverse_tree(nodetrees)
  363. for threadid, hostnodes in thread2hostnodes.items():
  364. for hostnode in hostnodes[1:]: # skip root node
  365. # case 1: TracerEventType is Communication
  366. if hostnode.type == TracerEventType.Communication:
  367. self.cpu_communication_range.append(
  368. (hostnode.start_ns, hostnode.end_ns)
  369. )
  370. device_nodes = get_device_nodes(hostnode)
  371. for device_node in device_nodes:
  372. if device_node.type == TracerEventType.Kernel:
  373. self.gpu_communication_range.append(
  374. (device_node.start_ns, device_node.end_ns)
  375. )
  376. # case 2: TracerEventType is Operator but is communication op
  377. elif hostnode.type == TracerEventType.Operator and any(
  378. name in hostnode.name.lower()
  379. for name in _CommunicationOpName
  380. ):
  381. self.cpu_communication_range.append(
  382. (hostnode.start_ns, hostnode.end_ns)
  383. )
  384. device_nodes = get_device_nodes(hostnode)
  385. for device_node in device_nodes:
  386. if device_node.type == TracerEventType.Kernel:
  387. self.gpu_communication_range.append(
  388. (device_node.start_ns, device_node.end_ns)
  389. )
  390. # case 3: Others, filter kernels named with nccl
  391. else:
  392. for runtimenode in hostnode.runtime_node:
  393. for devicenode in runtimenode.device_node:
  394. if devicenode.type == TracerEventType.Kernel:
  395. kernel_name = devicenode.name.lower()
  396. if (
  397. 'nccl' in kernel_name
  398. or 'xccl' in kernel_name
  399. ):
  400. self.gpu_communication_range.append(
  401. (devicenode.start_ns, devicenode.end_ns)
  402. )
  403. else:
  404. self.computation_range.append(
  405. (devicenode.start_ns, devicenode.end_ns)
  406. )
  407. self.cpu_calls = len(set(self.cpu_communication_range))
  408. self.gpu_calls = len(set(self.gpu_communication_range))
  409. self.cpu_communication_range = merge_self_ranges(
  410. self.cpu_communication_range, is_sorted=False
  411. )
  412. self.gpu_communication_range = merge_self_ranges(
  413. self.gpu_communication_range, is_sorted=False
  414. )
  415. self.communication_range = merge_ranges(
  416. self.cpu_communication_range,
  417. self.gpu_communication_range,
  418. is_sorted=True,
  419. )
  420. self.computation_range = merge_self_ranges(
  421. self.computation_range, is_sorted=False
  422. )
  423. self.overlap_range = intersection_ranges(
  424. self.communication_range, self.computation_range, is_sorted=True
  425. )
  426. class EventSummary:
  427. r"""
  428. Analyse operator event in profiling data, correlate with its device event.
  429. """
  430. class ItemBase:
  431. def __init__(self, name):
  432. self.name = name
  433. self.call = 0
  434. self.cpu_time = 0
  435. self.gpu_time = 0
  436. self.max_cpu_time = 0
  437. self.min_cpu_time = float('inf')
  438. self.max_gpu_time = 0
  439. self.min_gpu_time = float('inf')
  440. self.devices = {}
  441. self.operator_inners = {}
  442. self.general_gpu_time = 0
  443. self.min_general_gpu_time = float('inf')
  444. self.max_general_gpu_time = 0
  445. self._flops = 0
  446. @property
  447. def flops(self):
  448. return self._flops
  449. @property
  450. def avg_cpu_time(self):
  451. return self.cpu_time / self.call
  452. @property
  453. def avg_gpu_time(self):
  454. return self.gpu_time / self.call
  455. @property
  456. def avg_general_gpu_time(self):
  457. return self.general_gpu_time / self.call
  458. def add_cpu_time(self, time):
  459. if time > self.max_cpu_time:
  460. self.max_cpu_time = time
  461. if time < self.min_cpu_time:
  462. self.min_cpu_time = time
  463. self.cpu_time += time
  464. def add_gpu_time(self, time):
  465. if time > self.max_gpu_time:
  466. self.max_gpu_time = time
  467. if time < self.min_gpu_time:
  468. self.min_gpu_time = time
  469. self.gpu_time += time
  470. def add_general_gpu_time(self, time):
  471. if time > self.max_general_gpu_time:
  472. self.max_general_gpu_time = time
  473. if time < self.min_general_gpu_time:
  474. self.min_general_gpu_time = time
  475. self.general_gpu_time += time
  476. def add_call(self):
  477. self.call += 1
  478. def add_flops(self, flops):
  479. self._flops += flops
  480. def add_item(self, node):
  481. raise NotImplementedError
  482. class DeviceItem(ItemBase):
  483. def add_item(self, node):
  484. self.call += 1
  485. self.add_gpu_time(node.end_ns - node.start_ns)
  486. class OperatorItem(ItemBase):
  487. def add_item(self, node):
  488. self.add_call()
  489. self.add_cpu_time(node.cpu_time)
  490. self.add_gpu_time(node.gpu_time)
  491. self.add_general_gpu_time(node.general_gpu_time)
  492. self.add_flops(node.flops)
  493. for child in node.children_node:
  494. if child.type != TracerEventType.Operator:
  495. if child.name not in self.operator_inners:
  496. self.operator_inners[
  497. child.name
  498. ] = EventSummary.OperatorItem(child.name)
  499. self.operator_inners[child.name].add_item(child)
  500. for runtimenode in node.runtime_node:
  501. for devicenode in runtimenode.device_node:
  502. name = devicenode.name
  503. if name not in self.devices:
  504. self.devices[name] = EventSummary.DeviceItem(name)
  505. self.devices[name].add_item(devicenode)
  506. class ForwardItem(ItemBase):
  507. def add_item(self, node):
  508. self.add_call()
  509. self.add_cpu_time(node.cpu_time)
  510. self.add_gpu_time(node.gpu_time)
  511. self.add_general_gpu_time(node.general_gpu_time)
  512. self.add_flops(node.flops)
  513. for child in node.children_node:
  514. if child.type != TracerEventType.Operator:
  515. if child.name not in self.operator_inners:
  516. self.operator_inners[
  517. child.name
  518. ] = EventSummary.OperatorItem(child.name)
  519. self.operator_inners[child.name].add_item(child)
  520. class GeneralItem(ItemBase):
  521. def add_item(self, node):
  522. self.add_call()
  523. self.add_cpu_time(node.cpu_time)
  524. self.add_gpu_time(node.gpu_time)
  525. self.add_general_gpu_time(node.general_gpu_time)
  526. def __init__(self):
  527. self.items = {} # for operator summary
  528. self.thread_items = collections.defaultdict(
  529. dict
  530. ) # for operator summary
  531. self.userdefined_items = {} # for userdefined summary
  532. self.userdefined_thread_items = collections.defaultdict(
  533. dict
  534. ) # for userdefined summary
  535. self.model_perspective_items = {} # for model summary
  536. self.memory_manipulation_items = {} # for memory manipulation summary
  537. self.kernel_items = {} # for kernel summary
  538. def parse(self, nodetrees):
  539. r"""
  540. Analysis operator event in the nodetress.
  541. """
  542. node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees)
  543. for (
  544. threadid,
  545. host_statistic_nodes,
  546. ) in thread2host_statistic_nodes.items():
  547. for host_statistic_node in host_statistic_nodes[
  548. 1:
  549. ]: # skip root node
  550. if host_statistic_node.type == TracerEventType.Operator:
  551. self.add_operator_item(host_statistic_node)
  552. if (
  553. host_statistic_node.type == TracerEventType.UserDefined
  554. or host_statistic_node.type
  555. == TracerEventType.PythonUserDefined
  556. ):
  557. if (
  558. 'memcpy' in host_statistic_node.name.lower()
  559. or 'memorycopy' in host_statistic_node.name.lower()
  560. or 'memset' in host_statistic_node.name.lower()
  561. ):
  562. self.add_memory_manipulation_item(host_statistic_node)
  563. else:
  564. if (
  565. host_statistic_node.type
  566. == TracerEventType.PythonUserDefined
  567. ):
  568. self.add_userdefined_item(host_statistic_node)
  569. self.add_kernel_item(host_statistic_nodes[0])
  570. for threadid, root_statistic_node in node_statistic_trees.items():
  571. deque = collections.deque()
  572. deque.append(root_statistic_node)
  573. while deque:
  574. current_node = deque.popleft()
  575. for child in current_node.children_node:
  576. if (
  577. child.type == TracerEventType.Forward
  578. or child.type == TracerEventType.Dataloader
  579. or child.type == TracerEventType.Backward
  580. or child.type == TracerEventType.Optimization
  581. ):
  582. self.add_model_perspective_item(
  583. child
  584. ) # find first model perspective node
  585. else:
  586. if child.type == TracerEventType.ProfileStep:
  587. self.add_model_perspective_item(child)
  588. deque.append(child)
  589. def add_forward_item(self, operator_node):
  590. pass
  591. def add_operator_item(self, operator_node):
  592. if operator_node.name not in self.items:
  593. self.items[operator_node.name] = EventSummary.OperatorItem(
  594. operator_node.name
  595. )
  596. self.items[operator_node.name].add_item(operator_node)
  597. if operator_node.name not in self.thread_items[operator_node.thread_id]:
  598. self.thread_items[operator_node.thread_id][
  599. operator_node.name
  600. ] = EventSummary.OperatorItem(operator_node.name)
  601. self.thread_items[operator_node.thread_id][operator_node.name].add_item(
  602. operator_node
  603. )
  604. def add_userdefined_item(self, userdefined_node):
  605. if userdefined_node.name not in self.userdefined_items:
  606. self.userdefined_items[
  607. userdefined_node.name
  608. ] = EventSummary.GeneralItem(userdefined_node.name)
  609. self.userdefined_items[userdefined_node.name].add_item(userdefined_node)
  610. if (
  611. userdefined_node.name
  612. not in self.userdefined_thread_items[userdefined_node.thread_id]
  613. ):
  614. self.userdefined_thread_items[userdefined_node.thread_id][
  615. userdefined_node.name
  616. ] = EventSummary.GeneralItem(userdefined_node.name)
  617. self.userdefined_thread_items[userdefined_node.thread_id][
  618. userdefined_node.name
  619. ].add_item(userdefined_node)
  620. def add_memory_manipulation_item(self, memory_manipulation_node):
  621. if memory_manipulation_node.name not in self.memory_manipulation_items:
  622. self.memory_manipulation_items[
  623. memory_manipulation_node.name
  624. ] = EventSummary.GeneralItem(memory_manipulation_node.name)
  625. self.memory_manipulation_items[memory_manipulation_node.name].add_item(
  626. memory_manipulation_node
  627. )
  628. def add_model_perspective_item(self, model_perspective_node):
  629. if model_perspective_node.type == TracerEventType.Forward:
  630. name = 'Forward'
  631. elif model_perspective_node.type == TracerEventType.Backward:
  632. name = 'Backward'
  633. elif model_perspective_node.type == TracerEventType.Optimization:
  634. name = 'Optimization'
  635. elif model_perspective_node.type == TracerEventType.Dataloader:
  636. name = 'Dataloader'
  637. elif model_perspective_node.type == TracerEventType.ProfileStep:
  638. name = 'ProfileStep'
  639. else:
  640. return
  641. if name not in self.model_perspective_items:
  642. self.model_perspective_items[name] = EventSummary.GeneralItem(name)
  643. self.model_perspective_items[name].add_item(model_perspective_node)
  644. def add_kernel_item(self, root_node):
  645. device_nodes = get_device_nodes(root_node)
  646. for device_node in device_nodes:
  647. if device_node.type == TracerEventType.Kernel:
  648. name = device_node.name
  649. if name not in self.kernel_items:
  650. self.kernel_items[name] = EventSummary.DeviceItem(name)
  651. self.kernel_items[name].add_item(device_node)
  652. class MemorySummary:
  653. r"""
  654. Analyse memory events in profiling data.
  655. """
  656. class MemoryItem:
  657. def __init__(self, event_name, place, memory_type='Allocated'):
  658. self.event_name = event_name
  659. self.place = place
  660. self.allocation_count = 0
  661. self.free_count = 0
  662. self.allocation_size = 0
  663. self.free_size = 0
  664. self.increase_size = 0
  665. self.memory_type = memory_type
  666. def add_memory_record(self, size, allocation_type):
  667. if (
  668. allocation_type == TracerMemEventType.Allocate
  669. or allocation_type == TracerMemEventType.ReservedAllocate
  670. ):
  671. self.allocation_count += 1
  672. self.allocation_size += size
  673. elif (
  674. allocation_type == TracerMemEventType.Free
  675. or allocation_type == TracerMemEventType.ReservedFree
  676. ):
  677. self.free_count += 1
  678. self.free_size -= size # size is sign(-) when free.
  679. else:
  680. print("No corresponding type.")
  681. self.increase_size = self.allocation_size - self.free_size
  682. def __init__(self):
  683. self.allocated_items = collections.defaultdict(
  684. dict
  685. ) # for memory summary, device type: event
  686. self.reserved_items = collections.defaultdict(
  687. dict
  688. ) # for memory summary, device type: event
  689. self.peak_allocation_values = collections.defaultdict(int)
  690. self.peak_reserved_values = collections.defaultdict(int)
  691. def _analyse_node_memory(self, event_name, node):
  692. for memnode in node.mem_node: # self mem node
  693. if (
  694. memnode.type == TracerMemEventType.Allocate
  695. or memnode.type == TracerMemEventType.Free
  696. ):
  697. if event_name not in self.allocated_items[memnode.place]:
  698. self.allocated_items[memnode.place][
  699. event_name
  700. ] = MemorySummary.MemoryItem(
  701. event_name, memnode.place, 'Allocated'
  702. )
  703. self.allocated_items[memnode.place][
  704. event_name
  705. ].add_memory_record(memnode.increase_bytes, memnode.type)
  706. elif (
  707. memnode.type == TracerMemEventType.ReservedAllocate
  708. or memnode.type == TracerMemEventType.ReservedFree
  709. ):
  710. if event_name not in self.reserved_items[memnode.place]:
  711. self.reserved_items[memnode.place][
  712. event_name
  713. ] = MemorySummary.MemoryItem(
  714. event_name, memnode.place, 'Reserved'
  715. )
  716. self.reserved_items[memnode.place][
  717. event_name
  718. ].add_memory_record(memnode.increase_bytes, memnode.type)
  719. self.peak_allocation_values[memnode.place] = max(
  720. self.peak_allocation_values[memnode.place],
  721. memnode.peak_allocated,
  722. )
  723. self.peak_reserved_values[memnode.place] = max(
  724. self.peak_reserved_values[memnode.place], memnode.peak_reserved
  725. )
  726. def parse(self, nodetrees):
  727. r"""
  728. Analyse memory event in the nodetress.
  729. """
  730. thread2hostnodes = traverse_tree(nodetrees)
  731. for threadid, host_nodes in thread2hostnodes.items():
  732. for host_node in host_nodes[1:]: # skip root node
  733. if host_node.type == TracerEventType.OperatorInner:
  734. continue
  735. if host_node.type == TracerEventType.Operator:
  736. for child in host_node.children_node:
  737. self._analyse_node_memory(host_node.name, child)
  738. self._analyse_node_memory(host_node.name, host_node)
  739. class StatisticData:
  740. r"""
  741. Hold all analysed results.
  742. """
  743. def __init__(self, node_trees, extra_info):
  744. self.node_trees = node_trees
  745. self.extra_info = extra_info
  746. self.time_range_summary = TimeRangeSummary()
  747. self.event_summary = EventSummary()
  748. self.distributed_summary = DistributedSummary()
  749. self.memory_summary = MemorySummary()
  750. self.time_range_summary.parse(node_trees)
  751. self.event_summary.parse(node_trees)
  752. self.distributed_summary.parse(node_trees)
  753. self.memory_summary.parse(node_trees)
  754. def _build_table(
  755. statistic_data,
  756. sorted_by=SortedKeys.CPUTotal,
  757. op_detail=True,
  758. thread_sep=False,
  759. time_unit='ms',
  760. row_limit=100,
  761. max_src_column_width=75,
  762. views=None,
  763. ):
  764. from .profiler import SummaryView
  765. """Prints a summary of events."""
  766. # format table row
  767. SPACING_SIZE = 2
  768. row_format_list = [""]
  769. header_sep_list = [""]
  770. line_length_list = [-SPACING_SIZE]
  771. def add_column(padding, text_dir='<'):
  772. row_format_list[0] += (
  773. '{: ' + text_dir + str(padding) + '}' + (' ' * SPACING_SIZE)
  774. )
  775. header_sep_list[0] += '-' * padding + (' ' * SPACING_SIZE)
  776. line_length_list[0] += padding + SPACING_SIZE
  777. def add_title(padding, text):
  778. left_length = padding - len(text)
  779. half = left_length // 2
  780. return '-' * half + text + '-' * (left_length - half)
  781. result = []
  782. def append(s):
  783. result.append(s)
  784. result.append('\n')
  785. def format_time(time, unit='ms', indent=0):
  786. r"""
  787. Transform time in ns to time in unit.
  788. """
  789. if time == float('inf'):
  790. return '-'
  791. else:
  792. result = float(time)
  793. if unit == 's':
  794. result /= 1e9
  795. elif unit == 'ms':
  796. result /= 1e6
  797. elif unit == 'us':
  798. result /= 1e3
  799. return '{}{:.2f}'.format(' ' * indent, result)
  800. def format_ratio(ratio, indent=0):
  801. r"""
  802. Transform ratio within [0, 1] to percentage presentation.
  803. """
  804. return '{}{:.2f}'.format(' ' * indent, ratio * 100)
  805. total_time = statistic_data.time_range_summary.get_cpu_range_sum(
  806. TracerEventType.ProfileStep
  807. )
  808. if views is None or SummaryView.DeviceView in views:
  809. # ----- Print Device Summary ----- #
  810. headers = ['Device', 'Utilization (%)']
  811. name_column_width = 30
  812. DEFAULT_COLUMN_WIDTH = 20
  813. add_column(name_column_width)
  814. for _ in headers[1:]:
  815. add_column(DEFAULT_COLUMN_WIDTH)
  816. row_format = row_format_list[0]
  817. header_sep = header_sep_list[0]
  818. line_length = line_length_list[0]
  819. # construct table string
  820. append(add_title(line_length, "Device Summary"))
  821. append(header_sep)
  822. append(row_format.format(*headers))
  823. append(header_sep)
  824. row_values = [
  825. 'CPU(Process)',
  826. format_ratio(
  827. float(statistic_data.extra_info['Process Cpu Utilization'])
  828. ),
  829. ]
  830. append(row_format.format(*row_values))
  831. row_values = [
  832. 'CPU(System)',
  833. format_ratio(
  834. float(statistic_data.extra_info['System Cpu Utilization'])
  835. ),
  836. ]
  837. append(row_format.format(*row_values))
  838. for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
  839. gpu_time = float(
  840. statistic_data.time_range_summary.get_gpu_range_sum(
  841. gpu_name, TracerEventType.Kernel
  842. )
  843. )
  844. utilization = gpu_time / total_time
  845. row_values = [f'GPU{gpu_name}', format_ratio(utilization)]
  846. append(row_format.format(*row_values))
  847. append(header_sep)
  848. append(
  849. "Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
  850. "CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
  851. "GPU Utilization = Current process GPU time / elapsed time."
  852. )
  853. append('-' * line_length)
  854. append('')
  855. append('')
  856. if total_time == 0:
  857. return ''.join(result)
  858. if views is None or SummaryView.OverView in views:
  859. # ----- Print Overview Summary ----- #
  860. headers = ['Event Type', 'Calls', 'CPU Time', 'Ratio (%)']
  861. row_format_list = [""]
  862. header_sep_list = [""]
  863. line_length_list = [-SPACING_SIZE]
  864. DEFAULT_COLUMN_WIDTH = 25
  865. for _ in headers:
  866. add_column(DEFAULT_COLUMN_WIDTH)
  867. row_format = row_format_list[0]
  868. header_sep = header_sep_list[0]
  869. line_length = line_length_list[0]
  870. # construct table string
  871. append(add_title(line_length, "Overview Summary"))
  872. append(f'Time unit: {time_unit}')
  873. append(header_sep)
  874. append(row_format.format(*headers))
  875. append(header_sep)
  876. cpu_type_time = collections.defaultdict(int)
  877. gpu_type_time = collections.defaultdict(int)
  878. cpu_call_times = collections.defaultdict(int)
  879. gpu_call_times = collections.defaultdict(int)
  880. cpu_call_times.update(statistic_data.time_range_summary.call_times)
  881. gpu_call_times.update(statistic_data.time_range_summary.call_times)
  882. for (
  883. event_type,
  884. value,
  885. ) in statistic_data.time_range_summary.CPUTimeRangeSum.items():
  886. if event_type != TracerEventType.Communication:
  887. cpu_type_time[event_type] = value
  888. if statistic_data.distributed_summary.cpu_communication_range:
  889. cpu_type_time[TracerEventType.Communication] = sum_ranges(
  890. statistic_data.distributed_summary.cpu_communication_range
  891. )
  892. cpu_call_times[
  893. TracerEventType.Communication
  894. ] = statistic_data.distributed_summary.cpu_calls
  895. for event_type in [
  896. TracerEventType.Dataloader,
  897. TracerEventType.Forward,
  898. TracerEventType.Backward,
  899. TracerEventType.Optimization,
  900. ]:
  901. event_type_name = str(event_type).split('.')[1]
  902. if (
  903. event_type in cpu_call_times
  904. and event_type_name
  905. in statistic_data.event_summary.model_perspective_items
  906. ):
  907. cpu_call_times[
  908. event_type
  909. ] = statistic_data.event_summary.model_perspective_items[
  910. event_type_name
  911. ].call
  912. cpu_type_time[
  913. event_type
  914. ] = statistic_data.event_summary.model_perspective_items[
  915. event_type_name
  916. ].cpu_time
  917. gpu_time_range = collections.defaultdict(list)
  918. for (
  919. device_id,
  920. device_time_ranges,
  921. ) in statistic_data.time_range_summary.GPUTimeRange.items():
  922. for event_type, time_range in device_time_ranges.items():
  923. gpu_time_range[event_type] = merge_ranges(
  924. gpu_time_range[event_type], time_range, is_sorted=True
  925. )
  926. for event_type, time_range in gpu_time_range.items():
  927. gpu_type_time[event_type] = sum_ranges(time_range)
  928. if statistic_data.distributed_summary.gpu_communication_range:
  929. gpu_type_time[TracerEventType.Communication] = sum_ranges(
  930. statistic_data.distributed_summary.gpu_communication_range
  931. )
  932. gpu_call_times[
  933. TracerEventType.Communication
  934. ] = statistic_data.distributed_summary.gpu_calls
  935. sorted_items = sorted(
  936. cpu_type_time.items(), key=lambda x: x[1], reverse=True
  937. )
  938. event_type, time = sorted_items[0]
  939. row_values = [
  940. '{}'.format(str(event_type).split('.')[1]),
  941. cpu_call_times[event_type],
  942. format_time(time, unit=time_unit),
  943. format_ratio(float(time) / total_time),
  944. ]
  945. append(row_format.format(*row_values))
  946. for event_type, time in sorted_items[1:]:
  947. row_values = [
  948. ' {}'.format(str(event_type).split('.')[1]),
  949. cpu_call_times[event_type],
  950. format_time(time, unit=time_unit),
  951. format_ratio(float(time) / total_time),
  952. ]
  953. append(row_format.format(*row_values))
  954. append(header_sep)
  955. headers = ['', 'Calls', 'GPU Time', 'Ratio (%)']
  956. append(row_format.format(*headers))
  957. append(header_sep)
  958. for event_type, time in gpu_type_time.items():
  959. row_values = [
  960. ' {}'.format(str(event_type).split('.')[1]),
  961. gpu_call_times[event_type],
  962. format_time(time, unit=time_unit),
  963. format_ratio(float(time) / total_time),
  964. ]
  965. append(row_format.format(*row_values))
  966. append(header_sep)
  967. append(
  968. "Note:\nIn this table, We sum up all collected events in terms of event type.\n"
  969. "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
  970. "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
  971. "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
  972. "Example:\n"
  973. "Thread 1:\n"
  974. " Operator: |___________| |__________|\n"
  975. "Thread 2:\n"
  976. " Operator: |____________| |___|\n"
  977. "After merged:\n"
  978. " Result: |______________| |__________|\n"
  979. )
  980. append('-' * line_length)
  981. append('')
  982. append('')
  983. if views is None or SummaryView.ModelView in views:
  984. # ----- Print Model Summary Report ----- #
  985. model_perspective_items = (
  986. statistic_data.event_summary.model_perspective_items
  987. )
  988. if len(model_perspective_items) > 1:
  989. all_row_values = []
  990. accumulation_time = 0
  991. gpu_accumulation_time = 0
  992. gpu_total_time = (
  993. statistic_data.event_summary.model_perspective_items[
  994. 'ProfileStep'
  995. ].gpu_time
  996. )
  997. for name in [
  998. 'ProfileStep',
  999. 'Dataloader',
  1000. 'Forward',
  1001. 'Backward',
  1002. 'Optimization',
  1003. ]:
  1004. if name in model_perspective_items:
  1005. item = model_perspective_items[name]
  1006. if gpu_total_time == 0:
  1007. gpu_ratio = 0
  1008. else:
  1009. gpu_ratio = float(item.gpu_time) / gpu_total_time
  1010. name = f'{name}' if 'ProfileStep' in name else f' {name}'
  1011. row_values = [
  1012. f'{name}',
  1013. item.call,
  1014. f'{format_time(item.cpu_time, unit=time_unit)} / {format_time(item.avg_cpu_time, unit=time_unit)} / {format_time(item.max_cpu_time, unit=time_unit)} / {format_time(item.min_cpu_time, unit=time_unit)} / {format_ratio(float(item.cpu_time) / total_time)}',
  1015. f'{format_time(item.gpu_time, unit=time_unit)} / {format_time(item.avg_gpu_time, unit=time_unit)} / {format_time(item.max_gpu_time, unit=time_unit)} / {format_time(item.min_gpu_time, unit=time_unit)} / {format_ratio(gpu_ratio)}',
  1016. ]
  1017. all_row_values.append(row_values)
  1018. if 'ProfileStep' not in name:
  1019. accumulation_time += item.cpu_time
  1020. gpu_accumulation_time += item.gpu_time
  1021. other_time = total_time - accumulation_time
  1022. other_gpu_time = gpu_total_time - gpu_accumulation_time
  1023. if gpu_total_time == 0:
  1024. gpu_ratio = 0
  1025. else:
  1026. gpu_ratio = float(other_gpu_time) / gpu_total_time
  1027. row_values = [
  1028. ' Others',
  1029. '-',
  1030. f'{format_time(other_time, unit=time_unit)} / - / - / - / {format_ratio(float(other_time) / total_time)}',
  1031. f'{format_time(other_gpu_time, unit=time_unit)} / - / - / - / {format_ratio(gpu_ratio)}',
  1032. ]
  1033. all_row_values.append(row_values)
  1034. # Calculate the column width
  1035. calltime_width = 6
  1036. cpu_data_description_width = 40
  1037. gpu_data_description_width = 40
  1038. for row_values in all_row_values:
  1039. if (
  1040. isinstance(row_values[1], int)
  1041. and len(str(row_values[1])) > calltime_width
  1042. ):
  1043. calltime_width = len(str(row_values[1]))
  1044. if len(row_values[2]) > cpu_data_description_width:
  1045. cpu_data_description_width = len(row_values[2])
  1046. if len(row_values[3]) > gpu_data_description_width:
  1047. gpu_data_description_width = len(row_values[3])
  1048. headers = [
  1049. 'Name',
  1050. 'Calls',
  1051. 'CPU Total / Avg / Max / Min / Ratio(%)',
  1052. 'GPU Total / Avg / Max / Min / Ratio(%)',
  1053. ]
  1054. row_format_list = [""]
  1055. header_sep_list = [""]
  1056. line_length_list = [-SPACING_SIZE]
  1057. name_column_width = 15
  1058. add_column(name_column_width)
  1059. add_column(calltime_width)
  1060. add_column(cpu_data_description_width)
  1061. add_column(gpu_data_description_width)
  1062. row_format = row_format_list[0]
  1063. header_sep = header_sep_list[0]
  1064. line_length = line_length_list[0]
  1065. # construct table string
  1066. append(add_title(line_length, "Model Summary"))
  1067. append(f'Time unit: {time_unit}')
  1068. append(header_sep)
  1069. append(row_format.format(*headers))
  1070. append(header_sep)
  1071. for row_values in all_row_values:
  1072. append(row_format.format(*row_values))
  1073. append(header_sep)
  1074. append(
  1075. "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
  1076. "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
  1077. )
  1078. append('-' * line_length)
  1079. append('')
  1080. append('')
  1081. if views is None or SummaryView.DistributedView in views:
  1082. # ----- Print Distribution Summary Report ----- #
  1083. if statistic_data.distributed_summary.communication_range:
  1084. headers = [
  1085. 'Name',
  1086. 'Total Time',
  1087. 'Ratio (%)',
  1088. ]
  1089. row_format_list = [""]
  1090. header_sep_list = [""]
  1091. line_length_list = [-SPACING_SIZE]
  1092. DEFAULT_COLUMN_WIDTH = 25
  1093. for _ in headers:
  1094. add_column(DEFAULT_COLUMN_WIDTH)
  1095. row_format = row_format_list[0]
  1096. header_sep = header_sep_list[0]
  1097. line_length = line_length_list[0]
  1098. # construct table string
  1099. append(add_title(line_length, "Distribution Summary"))
  1100. append(f'Time unit: {time_unit}')
  1101. append(header_sep)
  1102. append(row_format.format(*headers))
  1103. append(header_sep)
  1104. communication_time = sum_ranges(
  1105. statistic_data.distributed_summary.communication_range
  1106. )
  1107. computation_time = sum_ranges(
  1108. statistic_data.distributed_summary.computation_range
  1109. )
  1110. overlap_time = sum_ranges(
  1111. statistic_data.distributed_summary.overlap_range
  1112. )
  1113. row_values = [
  1114. 'ProfileStep',
  1115. format_time(total_time, unit=time_unit),
  1116. format_ratio(float(total_time) / total_time),
  1117. ]
  1118. append(row_format.format(*row_values))
  1119. row_values = [
  1120. ' Communication',
  1121. format_time(communication_time, unit=time_unit),
  1122. format_ratio(float(communication_time) / total_time),
  1123. ]
  1124. append(row_format.format(*row_values))
  1125. row_values = [
  1126. ' Computation',
  1127. format_time(computation_time, unit=time_unit),
  1128. format_ratio(float(computation_time) / total_time),
  1129. ]
  1130. append(row_format.format(*row_values))
  1131. row_values = [
  1132. ' Overlap',
  1133. format_time(overlap_time, unit=time_unit),
  1134. format_ratio(float(overlap_time) / total_time),
  1135. ]
  1136. append(row_format.format(*row_values))
  1137. append(header_sep)
  1138. append(
  1139. "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
  1140. "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
  1141. "Overlap time: Communication time intersects with computation time.\n"
  1142. "Example:\n"
  1143. "Communication:\n"
  1144. " CPU: |_________________|\n"
  1145. " GPU: |______________|\n"
  1146. " Total: |_________________| |______________|\n"
  1147. "Computation time(Kernel):\n"
  1148. " GPU: |________________|\n"
  1149. "Overlap time: |___________|\n"
  1150. )
  1151. append('-' * line_length)
  1152. append('')
  1153. append('')
  1154. if views is None or SummaryView.OperatorView in views:
  1155. # ----- Print Operator Summary Report ----- #
  1156. if statistic_data.event_summary.items:
  1157. all_row_values = []
  1158. name_column_width = 52
  1159. if thread_sep:
  1160. thread_items = statistic_data.event_summary.thread_items
  1161. else:
  1162. thread_items = {
  1163. 'All threads merged': statistic_data.event_summary.items
  1164. }
  1165. for thread_id, items in thread_items.items():
  1166. all_row_values.append(f"Thread: {thread_id}")
  1167. if sorted_by == SortedKeys.CPUTotal:
  1168. sorted_items = sorted(
  1169. items.items(), key=lambda x: x[1].cpu_time, reverse=True
  1170. )
  1171. elif sorted_by == SortedKeys.CPUAvg:
  1172. sorted_items = sorted(
  1173. items.items(),
  1174. key=lambda x: x[1].avg_cpu_time,
  1175. reverse=True,
  1176. )
  1177. elif sorted_by == SortedKeys.CPUMax:
  1178. sorted_items = sorted(
  1179. items.items(),
  1180. key=lambda x: x[1].max_cpu_time,
  1181. reverse=True,
  1182. )
  1183. elif sorted_by == SortedKeys.CPUMin:
  1184. sorted_items = sorted(
  1185. items.items(), key=lambda x: x[1].min_cpu_time
  1186. )
  1187. elif sorted_by == SortedKeys.GPUTotal:
  1188. sorted_items = sorted(
  1189. items.items(),
  1190. key=lambda x: x[1].general_gpu_time,
  1191. reverse=True,
  1192. )
  1193. elif sorted_by == SortedKeys.GPUAvg:
  1194. sorted_items = sorted(
  1195. items.items(),
  1196. key=lambda x: x[1].avg_general_gpu_time,
  1197. reverse=True,
  1198. )
  1199. elif sorted_by == SortedKeys.GPUMax:
  1200. sorted_items = sorted(
  1201. items.items(),
  1202. key=lambda x: x[1].max_general_gpu_time,
  1203. reverse=True,
  1204. )
  1205. elif sorted_by == SortedKeys.GPUMin:
  1206. sorted_items = sorted(
  1207. items.items(), key=lambda x: x[1].min_general_gpu_time
  1208. )
  1209. total_op_cpu_time = 0
  1210. total_op_gpu_time = 0
  1211. for name, item in sorted_items:
  1212. total_op_cpu_time += item.cpu_time
  1213. total_op_gpu_time += item.general_gpu_time
  1214. for name, item in sorted_items:
  1215. if total_op_cpu_time == 0:
  1216. cpu_ratio = 0
  1217. else:
  1218. cpu_ratio = float(item.cpu_time) / total_op_cpu_time
  1219. if total_op_gpu_time == 0:
  1220. gpu_ratio = 0
  1221. else:
  1222. gpu_ratio = (
  1223. float(item.general_gpu_time) / total_op_gpu_time
  1224. )
  1225. row_values = [
  1226. name,
  1227. item.call,
  1228. f'{format_time(item.cpu_time, unit=time_unit)} / {format_time(item.avg_cpu_time, unit=time_unit)} / {format_time(item.max_cpu_time, unit=time_unit)} / {format_time(item.min_cpu_time, unit=time_unit)} / {format_ratio(cpu_ratio)}',
  1229. '{} / {} / {} / {} / {}'.format(
  1230. format_time(item.general_gpu_time, unit=time_unit),
  1231. format_time(
  1232. item.avg_general_gpu_time, unit=time_unit
  1233. ),
  1234. format_time(
  1235. item.max_general_gpu_time, unit=time_unit
  1236. ),
  1237. format_time(
  1238. item.min_general_gpu_time, unit=time_unit
  1239. ),
  1240. format_ratio(gpu_ratio),
  1241. ),
  1242. item.flops,
  1243. ]
  1244. all_row_values.append(row_values)
  1245. if op_detail:
  1246. for (
  1247. innerop_name,
  1248. innerop_node,
  1249. ) in item.operator_inners.items():
  1250. if item.cpu_time == 0:
  1251. cpu_ratio = 0
  1252. else:
  1253. cpu_ratio = (
  1254. float(innerop_node.cpu_time) / item.cpu_time
  1255. )
  1256. if item.general_gpu_time == 0:
  1257. gpu_ratio = 0
  1258. else:
  1259. gpu_ratio = (
  1260. float(innerop_node.general_gpu_time)
  1261. / item.general_gpu_time
  1262. )
  1263. if len(innerop_name) + 2 > name_column_width:
  1264. innerop_name = innerop_name[
  1265. : name_column_width - 5
  1266. ]
  1267. innerop_name += "..."
  1268. row_values = [
  1269. f' {innerop_name}',
  1270. innerop_node.call,
  1271. '{} / {} / {} / {} / {}'.format(
  1272. format_time(
  1273. innerop_node.cpu_time, unit=time_unit
  1274. ),
  1275. format_time(
  1276. innerop_node.avg_cpu_time,
  1277. unit=time_unit,
  1278. ),
  1279. format_time(
  1280. innerop_node.max_cpu_time,
  1281. unit=time_unit,
  1282. ),
  1283. format_time(
  1284. innerop_node.min_cpu_time,
  1285. unit=time_unit,
  1286. ),
  1287. format_ratio(cpu_ratio),
  1288. ),
  1289. '{} / {} / {} / {} / {}'.format(
  1290. format_time(
  1291. innerop_node.general_gpu_time,
  1292. unit=time_unit,
  1293. ),
  1294. format_time(
  1295. innerop_node.avg_general_gpu_time,
  1296. unit=time_unit,
  1297. ),
  1298. format_time(
  1299. innerop_node.max_general_gpu_time,
  1300. unit=time_unit,
  1301. ),
  1302. format_time(
  1303. innerop_node.min_general_gpu_time,
  1304. unit=time_unit,
  1305. ),
  1306. format_ratio(gpu_ratio),
  1307. ),
  1308. '-',
  1309. ]
  1310. all_row_values.append(row_values)
  1311. for (
  1312. device_node_name,
  1313. device_node,
  1314. ) in innerop_node.devices.items():
  1315. if innerop_node.general_gpu_time == 0:
  1316. gpu_ratio = 0
  1317. else:
  1318. gpu_ratio = (
  1319. float(device_node.gpu_time)
  1320. / innerop_node.general_gpu_time
  1321. )
  1322. if (
  1323. len(device_node_name) + 4
  1324. > name_column_width
  1325. ):
  1326. device_node_name = device_node_name[
  1327. : name_column_width - 7
  1328. ]
  1329. device_node_name += "..."
  1330. row_values = [
  1331. f' {device_node_name}',
  1332. device_node.call,
  1333. '- / - / - / - / -',
  1334. '{} / {} / {} / {} / {}'.format(
  1335. format_time(
  1336. device_node.gpu_time, unit=time_unit
  1337. ),
  1338. format_time(
  1339. device_node.avg_gpu_time,
  1340. unit=time_unit,
  1341. ),
  1342. format_time(
  1343. device_node.max_gpu_time,
  1344. unit=time_unit,
  1345. ),
  1346. format_time(
  1347. device_node.min_gpu_time,
  1348. unit=time_unit,
  1349. ),
  1350. format_ratio(gpu_ratio),
  1351. ),
  1352. '-',
  1353. ]
  1354. all_row_values.append(row_values)
  1355. for (
  1356. device_node_name,
  1357. device_node,
  1358. ) in item.devices.items():
  1359. if item.general_gpu_time == 0:
  1360. gpu_ratio = 0
  1361. else:
  1362. gpu_ratio = (
  1363. float(device_node.gpu_time)
  1364. / item.general_gpu_time
  1365. )
  1366. if len(device_node_name) + 2 > name_column_width:
  1367. device_node_name = device_node_name[
  1368. : name_column_width - 5
  1369. ]
  1370. device_node_name += "..."
  1371. row_values = [
  1372. f' {device_node_name}',
  1373. device_node.call,
  1374. '- / - / - / - / -',
  1375. '{} / {} / {} / {} / {}'.format(
  1376. format_time(
  1377. device_node.gpu_time, unit=time_unit
  1378. ),
  1379. format_time(
  1380. device_node.avg_gpu_time, unit=time_unit
  1381. ),
  1382. format_time(
  1383. device_node.max_gpu_time, unit=time_unit
  1384. ),
  1385. format_time(
  1386. device_node.min_gpu_time, unit=time_unit
  1387. ),
  1388. format_ratio(gpu_ratio),
  1389. ),
  1390. '-',
  1391. ]
  1392. all_row_values.append(row_values)
  1393. # Calculate the column width
  1394. calltime_width = 6
  1395. cpu_data_description_width = 40
  1396. gpu_data_description_width = 40
  1397. flops_width = 10
  1398. for row_values in all_row_values:
  1399. if isinstance(row_values, str):
  1400. continue
  1401. if (
  1402. isinstance(row_values[1], int)
  1403. and len(str(row_values[1])) > calltime_width
  1404. ):
  1405. calltime_width = len(str(row_values[1]))
  1406. if len(row_values[2]) > cpu_data_description_width:
  1407. cpu_data_description_width = len(row_values[2])
  1408. if len(row_values[3]) > gpu_data_description_width:
  1409. gpu_data_description_width = len(row_values[3])
  1410. headers = [
  1411. 'Name',
  1412. 'Calls',
  1413. 'CPU Total / Avg / Max / Min / Ratio(%)',
  1414. 'GPU Total / Avg / Max / Min / Ratio(%)',
  1415. 'FLOPs',
  1416. ]
  1417. row_format_list = [""]
  1418. header_sep_list = [""]
  1419. line_length_list = [-SPACING_SIZE]
  1420. add_column(name_column_width)
  1421. add_column(calltime_width)
  1422. add_column(cpu_data_description_width)
  1423. add_column(gpu_data_description_width)
  1424. add_column(flops_width)
  1425. row_format = row_format_list[0]
  1426. header_sep = header_sep_list[0]
  1427. line_length = line_length_list[0]
  1428. # construct table string
  1429. append(add_title(line_length, "Operator Summary"))
  1430. append(f'Time unit: {time_unit}')
  1431. append(header_sep)
  1432. append(row_format.format(*headers))
  1433. append(header_sep)
  1434. for row_values in all_row_values:
  1435. if isinstance(row_values, str):
  1436. append(add_title(line_length, row_values))
  1437. else:
  1438. append(row_format.format(*row_values))
  1439. append(header_sep)
  1440. append('')
  1441. append('')
  1442. if views is None or SummaryView.KernelView in views:
  1443. # ----- Print Kernel Summary Report ----- #
  1444. if statistic_data.event_summary.kernel_items:
  1445. all_row_values = []
  1446. kernel_items = statistic_data.event_summary.kernel_items
  1447. if sorted_by == SortedKeys.GPUAvg:
  1448. sorted_items = sorted(
  1449. kernel_items.items(),
  1450. key=lambda x: x[1].avg_gpu_time,
  1451. reverse=True,
  1452. )
  1453. elif sorted_by == SortedKeys.GPUMax:
  1454. sorted_items = sorted(
  1455. kernel_items.items(),
  1456. key=lambda x: x[1].max_gpu_time,
  1457. reverse=True,
  1458. )
  1459. elif sorted_by == SortedKeys.GPUMin:
  1460. sorted_items = sorted(
  1461. kernel_items.items(), key=lambda x: x[1].min_gpu_time
  1462. )
  1463. else:
  1464. sorted_items = sorted(
  1465. kernel_items.items(),
  1466. key=lambda x: x[1].gpu_time,
  1467. reverse=True,
  1468. )
  1469. total_kernel_gpu_time = 0
  1470. for name, item in sorted_items:
  1471. total_kernel_gpu_time += item.gpu_time
  1472. for name, item in sorted_items:
  1473. if total_kernel_gpu_time == 0:
  1474. gpu_ratio = 0
  1475. else:
  1476. gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
  1477. row_values = [
  1478. name,
  1479. item.call,
  1480. f'{format_time(item.gpu_time, unit=time_unit)} / {format_time(item.avg_gpu_time, unit=time_unit)} / {format_time(item.max_gpu_time, unit=time_unit)} / {format_time(item.min_gpu_time, unit=time_unit)} / {format_ratio(gpu_ratio)}',
  1481. ]
  1482. all_row_values.append(row_values)
  1483. headers = [
  1484. 'Name',
  1485. 'Calls',
  1486. 'GPU Total / Avg / Max / Min / Ratio(%)',
  1487. ]
  1488. # Calculate the column width
  1489. name_column_width = 90
  1490. calltime_width = 6
  1491. gpu_data_description_width = 40
  1492. for row_values in all_row_values:
  1493. if (
  1494. isinstance(row_values[1], int)
  1495. and len(str(row_values[1])) > calltime_width
  1496. ):
  1497. calltime_width = len(str(row_values[1]))
  1498. if len(row_values[2]) > gpu_data_description_width:
  1499. gpu_data_description_width = len(row_values[2])
  1500. row_format_list = [""]
  1501. header_sep_list = [""]
  1502. line_length_list = [-SPACING_SIZE]
  1503. add_column(name_column_width)
  1504. add_column(calltime_width)
  1505. add_column(gpu_data_description_width)
  1506. row_format = row_format_list[0]
  1507. header_sep = header_sep_list[0]
  1508. line_length = line_length_list[0]
  1509. # construct table string
  1510. append(add_title(line_length, "Kernel Summary"))
  1511. append(f'Time unit: {time_unit}')
  1512. append(header_sep)
  1513. append(row_format.format(*headers))
  1514. append(header_sep)
  1515. kernel_name_pattern = re.compile(r'(.+?)(<.*>)(\(.*\))')
  1516. for row_values in all_row_values:
  1517. match = kernel_name_pattern.match(row_values[0])
  1518. if match:
  1519. name = match.group(1) + match.group(2)
  1520. else:
  1521. name = row_values[0]
  1522. if len(name) > name_column_width:
  1523. row_values[0] = name[: name_column_width - 3] + '...'
  1524. else:
  1525. row_values[0] = name
  1526. append(row_format.format(*row_values))
  1527. append(header_sep)
  1528. append('')
  1529. append('')
  1530. if views is None or SummaryView.MemoryManipulationView in views:
  1531. # ----- Print Memory Manipulation Summary Report ----- #
  1532. if statistic_data.event_summary.memory_manipulation_items:
  1533. all_row_values = []
  1534. memory_manipulation_items = (
  1535. statistic_data.event_summary.memory_manipulation_items
  1536. )
  1537. gpu_total_time = (
  1538. statistic_data.event_summary.model_perspective_items[
  1539. 'ProfileStep'
  1540. ].general_gpu_time
  1541. )
  1542. for name, item in memory_manipulation_items.items():
  1543. if gpu_total_time == 0:
  1544. gpu_ratio = 0
  1545. else:
  1546. gpu_ratio = float(item.general_gpu_time) / gpu_total_time
  1547. row_values = [
  1548. name,
  1549. item.call,
  1550. f'{format_time(item.cpu_time, unit=time_unit)} / {format_time(item.avg_cpu_time, unit=time_unit)} / {format_time(item.max_cpu_time, unit=time_unit)} / {format_time(item.min_cpu_time, unit=time_unit)} / {format_ratio(float(item.cpu_time) / total_time)}',
  1551. f'{format_time(item.general_gpu_time, unit=time_unit)} / {format_time(item.avg_general_gpu_time, unit=time_unit)} / {format_time(item.max_general_gpu_time, unit=time_unit)} / {format_time(item.min_general_gpu_time, unit=time_unit)} / {format_ratio(gpu_ratio)}',
  1552. ]
  1553. all_row_values.append(row_values)
  1554. headers = [
  1555. 'Name',
  1556. 'Calls',
  1557. 'CPU Total / Avg / Max / Min / Ratio(%)',
  1558. 'GPU Total / Avg / Max / Min / Ratio(%)',
  1559. ]
  1560. # Calculate the column width
  1561. name_column_width = 0
  1562. calltime_width = 6
  1563. cpu_data_description_width = 40
  1564. gpu_data_description_width = 40
  1565. for row_values in all_row_values:
  1566. if len(row_values[0]) > name_column_width:
  1567. name_column_width = len(row_values[0])
  1568. if (
  1569. isinstance(row_values[1], int)
  1570. and len(str(row_values[1])) > calltime_width
  1571. ):
  1572. calltime_width = len(str(row_values[1]))
  1573. if len(row_values[2]) > cpu_data_description_width:
  1574. cpu_data_description_width = len(row_values[2])
  1575. if len(row_values[3]) > gpu_data_description_width:
  1576. gpu_data_description_width = len(row_values[3])
  1577. row_format_list = [""]
  1578. header_sep_list = [""]
  1579. line_length_list = [-SPACING_SIZE]
  1580. add_column(name_column_width)
  1581. add_column(calltime_width)
  1582. add_column(cpu_data_description_width)
  1583. add_column(gpu_data_description_width)
  1584. row_format = row_format_list[0]
  1585. header_sep = header_sep_list[0]
  1586. line_length = line_length_list[0]
  1587. # construct table string
  1588. append(add_title(line_length, "Memory Manipulation Summary"))
  1589. append(f'Time unit: {time_unit}')
  1590. append(header_sep)
  1591. append(row_format.format(*headers))
  1592. append(header_sep)
  1593. for row_values in all_row_values:
  1594. append(row_format.format(*row_values))
  1595. append(header_sep)
  1596. append('')
  1597. append('')
  1598. if views is None or SummaryView.UDFView in views:
  1599. # ----- Print UserDefined Summary Report ----- #
  1600. if statistic_data.event_summary.userdefined_items:
  1601. all_row_values = []
  1602. gpu_total_time = (
  1603. statistic_data.event_summary.model_perspective_items[
  1604. 'ProfileStep'
  1605. ].general_gpu_time
  1606. )
  1607. if thread_sep:
  1608. userdefined_thread_items = (
  1609. statistic_data.event_summary.userdefined_thread_items
  1610. )
  1611. else:
  1612. userdefined_thread_items = {
  1613. 'All threads merged': statistic_data.event_summary.userdefined_items
  1614. }
  1615. for thread_id, items in userdefined_thread_items.items():
  1616. all_row_values.append(f"Thread: {thread_id}")
  1617. if sorted_by == SortedKeys.CPUTotal:
  1618. sorted_items = sorted(
  1619. items.items(), key=lambda x: x[1].cpu_time, reverse=True
  1620. )
  1621. elif sorted_by == SortedKeys.CPUAvg:
  1622. sorted_items = sorted(
  1623. items.items(),
  1624. key=lambda x: x[1].avg_cpu_time,
  1625. reverse=True,
  1626. )
  1627. elif sorted_by == SortedKeys.CPUMax:
  1628. sorted_items = sorted(
  1629. items.items(),
  1630. key=lambda x: x[1].max_cpu_time,
  1631. reverse=True,
  1632. )
  1633. elif sorted_by == SortedKeys.CPUMin:
  1634. sorted_items = sorted(
  1635. items.items(), key=lambda x: x[1].min_cpu_time
  1636. )
  1637. elif sorted_by == SortedKeys.GPUTotal:
  1638. sorted_items = sorted(
  1639. items.items(),
  1640. key=lambda x: x[1].general_gpu_time,
  1641. reverse=True,
  1642. )
  1643. elif sorted_by == SortedKeys.GPUAvg:
  1644. sorted_items = sorted(
  1645. items.items(),
  1646. key=lambda x: x[1].avg_general_gpu_time,
  1647. reverse=True,
  1648. )
  1649. elif sorted_by == SortedKeys.GPUMax:
  1650. sorted_items = sorted(
  1651. items.items(),
  1652. key=lambda x: x[1].max_general_gpu_time,
  1653. reverse=True,
  1654. )
  1655. elif sorted_by == SortedKeys.GPUMin:
  1656. sorted_items = sorted(
  1657. items.items(), key=lambda x: x[1].min_general_gpu_time
  1658. )
  1659. for name, item in sorted_items:
  1660. if gpu_total_time == 0:
  1661. gpu_ratio = 0
  1662. else:
  1663. gpu_ratio = (
  1664. float(item.general_gpu_time) / gpu_total_time
  1665. )
  1666. row_values = [
  1667. name,
  1668. item.call,
  1669. f'{format_time(item.cpu_time, unit=time_unit)} / {format_time(item.avg_cpu_time, unit=time_unit)} / {format_time(item.max_cpu_time, unit=time_unit)} / {format_time(item.min_cpu_time, unit=time_unit)} / {format_ratio(float(item.cpu_time) / total_time)}',
  1670. '{} / {} / {} / {} / {}'.format(
  1671. format_time(item.general_gpu_time, unit=time_unit),
  1672. format_time(
  1673. item.avg_general_gpu_time, unit=time_unit
  1674. ),
  1675. format_time(
  1676. item.max_general_gpu_time, unit=time_unit
  1677. ),
  1678. format_time(
  1679. item.min_general_gpu_time, unit=time_unit
  1680. ),
  1681. format_ratio(gpu_ratio),
  1682. ),
  1683. ]
  1684. all_row_values.append(row_values)
  1685. # Calculate the column width
  1686. name_column_width = 0
  1687. calltime_width = 6
  1688. cpu_data_description_width = 40
  1689. gpu_data_description_width = 40
  1690. for row_values in all_row_values:
  1691. if isinstance(row_values, str):
  1692. continue
  1693. if len(row_values[0]) > name_column_width:
  1694. name_column_width = len(row_values[0])
  1695. if (
  1696. isinstance(row_values[1], int)
  1697. and len(str(row_values[1])) > calltime_width
  1698. ):
  1699. calltime_width = len(str(row_values[1]))
  1700. if len(row_values[2]) > cpu_data_description_width:
  1701. cpu_data_description_width = len(row_values[2])
  1702. if len(row_values[3]) > gpu_data_description_width:
  1703. gpu_data_description_width = len(row_values[3])
  1704. headers = [
  1705. 'Name',
  1706. 'Calls',
  1707. 'CPU Total / Avg / Max / Min / Ratio(%)',
  1708. 'GPU Total / Avg / Max / Min / Ratio(%)',
  1709. ]
  1710. row_format_list = [""]
  1711. header_sep_list = [""]
  1712. line_length_list = [-SPACING_SIZE]
  1713. add_column(name_column_width)
  1714. add_column(calltime_width)
  1715. add_column(cpu_data_description_width)
  1716. add_column(gpu_data_description_width)
  1717. row_format = row_format_list[0]
  1718. header_sep = header_sep_list[0]
  1719. line_length = line_length_list[0]
  1720. # construct table string
  1721. append(add_title(line_length, "UserDefined Summary"))
  1722. append(f'Time unit: {time_unit}')
  1723. append(header_sep)
  1724. append(row_format.format(*headers))
  1725. append(header_sep)
  1726. for row_values in all_row_values:
  1727. if isinstance(row_values, str):
  1728. append(add_title(line_length, row_values))
  1729. else:
  1730. append(row_format.format(*row_values))
  1731. append('')
  1732. append('')
  1733. if views is None or SummaryView.MemoryView in views:
  1734. # ----- Print Memory Summary Report ----- #
  1735. if (
  1736. statistic_data.memory_summary.allocated_items
  1737. or statistic_data.memory_summary.reserved_items
  1738. ):
  1739. for (
  1740. device_type,
  1741. memory_events,
  1742. ) in statistic_data.memory_summary.allocated_items.items():
  1743. all_row_values = []
  1744. sorted_items = sorted(
  1745. memory_events.items(),
  1746. key=lambda x: x[1].increase_size,
  1747. reverse=True,
  1748. )
  1749. for event_name, item in sorted_items:
  1750. row_values = [
  1751. event_name,
  1752. item.memory_type,
  1753. item.allocation_count,
  1754. item.free_count,
  1755. item.allocation_size,
  1756. item.free_size,
  1757. item.increase_size,
  1758. ]
  1759. all_row_values.append(row_values)
  1760. sorted_reserved_items = sorted(
  1761. statistic_data.memory_summary.reserved_items[
  1762. device_type
  1763. ].items(),
  1764. key=lambda x: x[1].increase_size,
  1765. reverse=True,
  1766. )
  1767. for event_name, item in sorted_reserved_items:
  1768. row_values = [
  1769. event_name,
  1770. item.memory_type,
  1771. item.allocation_count,
  1772. item.free_count,
  1773. item.allocation_size,
  1774. item.free_size,
  1775. item.increase_size,
  1776. ]
  1777. all_row_values.append(row_values)
  1778. # Calculate the column width
  1779. headers = [
  1780. 'Name',
  1781. 'Type',
  1782. 'Allocation Count',
  1783. 'Free Count',
  1784. 'Allocation Size',
  1785. 'Free Size',
  1786. 'Increased Size',
  1787. ]
  1788. row_format_list = [""]
  1789. header_sep_list = [""]
  1790. line_length_list = [-SPACING_SIZE]
  1791. name_column_width = 50
  1792. number_column_width = 15
  1793. add_column(name_column_width)
  1794. add_column(12)
  1795. add_column(number_column_width)
  1796. add_column(number_column_width)
  1797. add_column(number_column_width)
  1798. add_column(number_column_width)
  1799. add_column(number_column_width)
  1800. row_format = row_format_list[0]
  1801. header_sep = header_sep_list[0]
  1802. line_length = line_length_list[0]
  1803. # construct table string
  1804. append(
  1805. add_title(line_length, f"Memory Summary - {device_type}")
  1806. )
  1807. append(
  1808. 'Peak Allocated Memory: {}'.format(
  1809. statistic_data.memory_summary.peak_allocation_values[
  1810. device_type
  1811. ]
  1812. )
  1813. )
  1814. append(
  1815. 'Peak Reserved Memory: {}'.format(
  1816. statistic_data.memory_summary.peak_reserved_values[
  1817. device_type
  1818. ]
  1819. )
  1820. )
  1821. append(header_sep)
  1822. append(row_format.format(*headers))
  1823. append(header_sep)
  1824. for row_values in all_row_values:
  1825. if isinstance(row_values, str):
  1826. append(add_title(line_length, row_values))
  1827. else:
  1828. append(row_format.format(*row_values))
  1829. append('')
  1830. append('')
  1831. return ''.join(result)