profiler_util.py 42 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183
  1. # mypy: allow-untyped-defs
  2. import bisect
  3. import itertools
  4. import math
  5. from collections import defaultdict, namedtuple
  6. from operator import attrgetter
  7. from typing import Any, Optional
  8. from typing_extensions import deprecated
  9. import torch
  10. from torch.autograd import DeviceType
  11. __all__ = [
  12. "EventList",
  13. "FormattedTimesMixin",
  14. "Interval",
  15. "Kernel",
  16. "FunctionEvent",
  17. "FunctionEventAvg",
  18. "StringTable",
  19. "MemRecordsAcc",
  20. ]
  21. class EventList(list):
  22. """A list of Events (for pretty printing)."""
  23. def __init__(self, *args, **kwargs):
  24. use_device = kwargs.pop("use_device", None)
  25. profile_memory = kwargs.pop("profile_memory", False)
  26. with_flops = kwargs.pop("with_flops", False)
  27. super().__init__(*args, **kwargs)
  28. self._use_device = use_device
  29. self._profile_memory = profile_memory
  30. self._tree_built = False
  31. self._with_flops = with_flops
  32. def _build_tree(self):
  33. self._populate_cpu_children()
  34. self._remove_dup_nodes()
  35. self._set_backward_stacktraces()
  36. self._tree_built = True
  37. def __str__(self):
  38. return self.table()
  39. def _remove_dup_nodes(self):
  40. while True:
  41. to_delete = set()
  42. for idx in range(len(self)):
  43. if (
  44. self[idx].cpu_parent is not None
  45. and self[idx].cpu_parent.name == self[idx].name
  46. and len(self[idx].cpu_parent.cpu_children) == 1
  47. ):
  48. self[idx].cpu_parent.cpu_children = self[idx].cpu_children
  49. self[idx].cpu_parent.kernels = self[idx].kernels # lift kernels up
  50. for ch in self[idx].cpu_children:
  51. ch.cpu_parent = self[idx].cpu_parent
  52. to_delete.add(idx)
  53. if len(to_delete) == 0:
  54. break
  55. new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
  56. self.clear()
  57. self.extend(new_evts)
  58. def _populate_cpu_children(self):
  59. """Populate child events into each underlying FunctionEvent object.
  60. One event is a child of another if [s1, e1) is inside [s2, e2). Where
  61. s1 and e1 would be start and end of the child event's interval. And
  62. s2 and e2 start and end of the parent event's interval
  63. Example: In event list [[0, 10], [1, 3], [3, 4]] would have make [0, 10]
  64. be a parent of two other intervals.
  65. If for any reason two intervals intersect only partially, this function
  66. will not record a parent child relationship between then.
  67. """
  68. # Some events can be async (i.e. start and end on different threads),
  69. # since it's generally undefined how to attribute children ranges to
  70. # async ranges, we do not use them when calculating nested ranges and stats
  71. sync_events = [
  72. evt
  73. for evt in self
  74. if not evt.is_async and evt.device_type == DeviceType.CPU
  75. ]
  76. events = sorted(
  77. sync_events,
  78. key=attrgetter("thread"),
  79. )
  80. # Group by both thread and node_id, so that events that happen to have
  81. # the same thread_id but are from different nodes aren't incorrectly
  82. # grouped together.
  83. threads = itertools.groupby(
  84. events, key=lambda event: (event.thread, event.node_id)
  85. )
  86. # For each thread we keep a stack of current nested parents.
  87. # We maintain the invariant that each interval is a subset of all other
  88. # intervals lower in the stack.
  89. #
  90. # First we sort the intervals by their start time. Then we iterate over them.
  91. # Every time we see a new interval we remove several parents from
  92. # the top until we restore the invariant. Then parent child relationship
  93. # if recorded if the stack is not empty.
  94. # Finally we add new interval to the list
  95. #
  96. # Algorithm has O(N * log(N)) complexity where N is number of
  97. # intervals
  98. for _thread_id, thread_events in threads:
  99. thread_events_ = sorted(
  100. thread_events,
  101. key=lambda event: [event.time_range.start, -event.time_range.end],
  102. )
  103. current_events: list[FunctionEvent] = []
  104. for event in thread_events_:
  105. while len(current_events) > 0:
  106. parent = current_events[-1]
  107. if (
  108. event.time_range.start >= parent.time_range.end
  109. or event.time_range.end > parent.time_range.end
  110. ):
  111. # this can't be a parent
  112. current_events.pop()
  113. else:
  114. parent.append_cpu_child(event)
  115. assert event.cpu_parent is None, (
  116. f"There is already a CPU parent event for {event.key}"
  117. )
  118. event.set_cpu_parent(parent)
  119. break
  120. current_events.append(event)
  121. def _set_backward_stacktraces(self):
  122. def bw_parent(evt):
  123. if evt is None:
  124. return None
  125. elif evt.scope == 1: # BACKWARD_FUNCTION
  126. return evt
  127. else:
  128. return bw_parent(evt.cpu_parent)
  129. fwd_stacks = {}
  130. for evt in self:
  131. if bw_parent(evt) is None and evt.stack is not None:
  132. t = (evt.sequence_nr, evt.thread)
  133. if t not in fwd_stacks:
  134. fwd_stacks[t] = evt.stack
  135. for evt in self:
  136. p = bw_parent(evt)
  137. if p is not None:
  138. assert p.fwd_thread is not None
  139. t = (p.sequence_nr, p.fwd_thread)
  140. if t in fwd_stacks:
  141. evt.stack = fwd_stacks[t]
  142. else:
  143. evt.stack = []
  144. @property
  145. def self_cpu_time_total(self):
  146. return sum(event.self_cpu_time_total for event in self)
  147. def table(
  148. self,
  149. sort_by=None,
  150. row_limit=100,
  151. max_src_column_width=75,
  152. max_name_column_width=55,
  153. max_shapes_column_width=80,
  154. header=None,
  155. top_level_events_only=False,
  156. time_unit=None,
  157. ):
  158. """Print an EventList as a nicely formatted table.
  159. Args:
  160. sort_by (str, optional): Attribute used to sort entries. By default
  161. they are printed in the same order as they were registered.
  162. Valid keys include: ``cpu_time``, ``cuda_time``, ``xpu_time``,
  163. ``cpu_time_total``, ``cuda_time_total``, ``xpu_time_total``,
  164. ``cpu_memory_usage``, ``cuda_memory_usage``, ``xpu_memory_usage``,
  165. ``self_cpu_memory_usage``, ``self_cuda_memory_usage``,
  166. ``self_xpu_memory_usage``, ``count``.
  167. top_level_events_only(bool, optional): Boolean flag to determine the
  168. selection of events to display. If true, the profiler will only
  169. display events at top level like top-level invocation of python
  170. `lstm`, python `add` or other functions, nested events like low-level
  171. cpu/cuda/xpu ops events are omitted for profiler result readability.
  172. time_unit(str, optional): A time unit to be used for all values in the
  173. table. Valid options are: ``s``, ``ms`` and ``us``.
  174. Returns:
  175. A string containing the table.
  176. """
  177. return _build_table(
  178. self,
  179. sort_by=sort_by,
  180. row_limit=row_limit,
  181. max_src_column_width=max_src_column_width,
  182. max_name_column_width=max_name_column_width,
  183. max_shapes_column_width=max_shapes_column_width,
  184. header=header,
  185. profile_memory=self._profile_memory,
  186. with_flops=self._with_flops,
  187. top_level_events_only=top_level_events_only,
  188. time_unit=time_unit,
  189. )
  190. def export_chrome_trace(self, path):
  191. """Export an EventList as a Chrome tracing tools file.
  192. The checkpoint can be later loaded and inspected under ``chrome://tracing`` URL.
  193. Args:
  194. path (str): Path where the trace will be written.
  195. """
  196. import os
  197. device_name = "cuda" if not self._use_device else self._use_device
  198. with open(path, "w") as f:
  199. next_id = 0
  200. # Use file IO over using json.dump since JSON dumping is very slow and
  201. # this technique is proven to give a 4x speedup.
  202. f.write("[")
  203. for evt in self:
  204. if evt.trace_name is None:
  205. continue
  206. f.write(
  207. '{{"name": "{}", '
  208. '"ph": "X", '
  209. '"ts": {}, '
  210. '"dur": {}, '
  211. '"tid": {}, '
  212. '"pid": "CPU functions", '
  213. '"args": {{}}}}, '.format(
  214. evt.trace_name,
  215. evt.time_range.start,
  216. evt.time_range.elapsed_us(),
  217. evt.thread
  218. if not evt.is_remote
  219. else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "',
  220. )
  221. )
  222. for _ in evt.kernels:
  223. # 's' and 'f' draw Flow arrows from
  224. # the CPU launch to the GPU kernel
  225. f.write(
  226. f'{{"name": "{evt.trace_name}", '
  227. '"ph": "s", '
  228. f'"ts": {evt.time_range.start}, '
  229. f'"tid": {evt.thread}, '
  230. '"pid": "CPU functions", '
  231. f'"id": {next_id}, '
  232. f'"cat": "cpu_to_{device_name}", '
  233. '"args": {}}, '
  234. )
  235. # Note: use torch.profiler to get device kernel trace
  236. next_id += 1
  237. if len(self) > 0:
  238. # remove trailing whitespace and comma
  239. f.seek(f.tell() - 2, os.SEEK_SET)
  240. f.truncate()
  241. f.write("]")
  242. def supported_export_stacks_metrics(self):
  243. return [
  244. "self_cpu_time_total",
  245. "self_cuda_time_total",
  246. "self_xpu_time_total",
  247. "self_privateuse1_time_total",
  248. ]
  249. def export_stacks(self, path: str, metric: str):
  250. if metric not in self.supported_export_stacks_metrics():
  251. raise ValueError(
  252. "metric should be one of: "
  253. + str(self.supported_export_stacks_metrics())
  254. )
  255. translate_table = str.maketrans(" ;\t\n", "____")
  256. with open(path, "w") as f:
  257. for evt in self:
  258. if evt.stack and len(evt.stack) > 0:
  259. metric_value = getattr(
  260. evt,
  261. metric.replace("cuda", "device")
  262. .replace("xpu", "device")
  263. .replace("privateuse1", "device"),
  264. )
  265. if int(metric_value) > 0:
  266. stack_str = ""
  267. for entry in reversed(evt.stack):
  268. stack_str += entry.translate(translate_table)
  269. stack_str += ";"
  270. stack_str = stack_str[:-1] + " " + str(int(metric_value))
  271. f.write(stack_str + "\n")
  272. def key_averages(
  273. self,
  274. group_by_input_shapes=False,
  275. group_by_stack_n=0,
  276. group_by_overload_name=False,
  277. ):
  278. """Averages all function events over their keys.
  279. Args:
  280. group_by_input_shapes: group entries by
  281. (event name, input shapes) rather than just event name.
  282. This is useful to see which input shapes contribute to the runtime
  283. the most and may help with size-specific optimizations or
  284. choosing the best candidates for quantization (aka fitting a roof line)
  285. group_by_stack_n: group by top n stack trace entries
  286. group_by_overload_name: Differentiate operators by their overload name e.g. aten::add.Tensor
  287. and aten::add.out will be aggregated separately
  288. Returns:
  289. An EventList containing FunctionEventAvg objects.
  290. """
  291. assert self._tree_built
  292. stats: dict[tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)
  293. def get_key(
  294. event, group_by_input_shapes, group_by_stack_n, group_by_overload_name
  295. ) -> tuple[str, ...]:
  296. key = [
  297. str(event.key),
  298. str(event.node_id),
  299. str(event.device_type),
  300. str(event.is_legacy),
  301. str(event.is_user_annotation),
  302. ]
  303. if group_by_overload_name:
  304. key.append(evt.overload_name)
  305. if group_by_input_shapes:
  306. key.append(str(event.input_shapes))
  307. if group_by_stack_n > 0:
  308. key += event.stack[:group_by_stack_n]
  309. return tuple(key)
  310. for evt in self:
  311. stats[
  312. get_key(
  313. evt, group_by_input_shapes, group_by_stack_n, group_by_overload_name
  314. )
  315. ].add(evt)
  316. avg_list = EventList(
  317. stats.values(),
  318. use_device=self._use_device,
  319. profile_memory=self._profile_memory,
  320. with_flops=self._with_flops,
  321. )
  322. for evt in avg_list:
  323. evt.stack = evt.stack[:group_by_stack_n]
  324. if not group_by_input_shapes:
  325. evt.input_shapes = ""
  326. if not group_by_overload_name:
  327. evt.overload_name = ""
  328. return avg_list
  329. def total_average(self):
  330. """Averages all events.
  331. Returns:
  332. A FunctionEventAvg object.
  333. """
  334. total_stat = FunctionEventAvg()
  335. for evt in self:
  336. total_stat += evt
  337. total_stat.key = None
  338. total_stat.key = "Total"
  339. return total_stat
  340. def _format_time(time_us):
  341. """Define how to format time in FunctionEvent."""
  342. US_IN_SECOND = 1000.0 * 1000.0
  343. US_IN_MS = 1000.0
  344. if time_us >= US_IN_SECOND:
  345. return f"{time_us / US_IN_SECOND:.3f}s"
  346. if time_us >= US_IN_MS:
  347. return f"{time_us / US_IN_MS:.3f}ms"
  348. return f"{time_us:.3f}us"
  349. def _format_time_share(time_us, total_time_us):
  350. """Define how to format time in FunctionEvent."""
  351. if total_time_us == 0:
  352. assert time_us == 0, f"Expected time_us == 0 but got {time_us}"
  353. return "NaN"
  354. return f"{time_us * 100.0 / total_time_us:.2f}%"
  355. def _format_memory(nbytes):
  356. """Return a formatted memory size string."""
  357. KB = 1024
  358. MB = 1024 * KB
  359. GB = 1024 * MB
  360. if abs(nbytes) >= GB:
  361. return f"{nbytes * 1.0 / GB:.2f} GB"
  362. elif abs(nbytes) >= MB:
  363. return f"{nbytes * 1.0 / MB:.2f} MB"
  364. elif abs(nbytes) >= KB:
  365. return f"{nbytes * 1.0 / KB:.2f} KB"
  366. else:
  367. return str(nbytes) + " B"
  368. def _attr_formatter(name):
  369. return property(lambda self: _format_time(getattr(self, name)))
  370. class FormattedTimesMixin:
  371. """Helpers for FunctionEvent and FunctionEventAvg.
  372. The subclass should define `*_time_total` and `count` attributes.
  373. """
  374. cpu_time_str = _attr_formatter("cpu_time")
  375. device_time_str = _attr_formatter("device_time")
  376. cpu_time_total_str = _attr_formatter("cpu_time_total")
  377. device_time_total_str = _attr_formatter("device_time_total")
  378. self_cpu_time_total_str = _attr_formatter("self_cpu_time_total")
  379. self_device_time_total_str = _attr_formatter("self_device_time_total")
  380. @property
  381. def cpu_time(self):
  382. return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count # type: ignore[attr-defined]
  383. @property
  384. def device_time(self):
  385. return 0.0 if self.count == 0 else 1.0 * self.device_time_total / self.count # type: ignore[attr-defined]
  386. @property
  387. @deprecated(
  388. "`cuda_time` is deprecated, please use `device_time` instead.",
  389. category=FutureWarning,
  390. )
  391. def cuda_time(self): # To be deprecated
  392. return self.device_time
  393. class Interval:
  394. def __init__(self, start, end):
  395. self.start = start
  396. self.end = end
  397. def elapsed_us(self):
  398. r"""
  399. Returns the length of the interval
  400. """
  401. return self.end - self.start
  402. Kernel = namedtuple("Kernel", ["name", "device", "duration"])
  403. class FunctionEvent(FormattedTimesMixin):
  404. """Profiling information about a single function."""
  405. def __init__(
  406. self,
  407. id,
  408. name,
  409. thread,
  410. start_us,
  411. end_us,
  412. overload_name=None,
  413. fwd_thread=None,
  414. input_shapes=None,
  415. stack=None,
  416. scope=0,
  417. use_device=None,
  418. cpu_memory_usage=0,
  419. device_memory_usage=0,
  420. is_async=False,
  421. is_remote=False,
  422. sequence_nr=-1,
  423. node_id=-1,
  424. device_type=DeviceType.CPU,
  425. device_index=0,
  426. device_resource_id=None,
  427. is_legacy=False,
  428. flops=None,
  429. trace_name=None,
  430. concrete_inputs=None,
  431. kwinputs=None,
  432. is_user_annotation=False,
  433. ):
  434. self.id: int = id
  435. self.node_id: int = node_id
  436. self.name: str = name
  437. self.overload_name: str = overload_name
  438. self.trace_name: str = trace_name
  439. self.time_range: Interval = Interval(start_us, end_us)
  440. self.thread: int = thread
  441. self.fwd_thread: Optional[int] = fwd_thread
  442. self.kernels: list[Kernel] = []
  443. self.count: int = 1
  444. self.cpu_children: list[FunctionEvent] = []
  445. self.cpu_parent: Optional[FunctionEvent] = None
  446. self.input_shapes: tuple[int, ...] = input_shapes
  447. self.concrete_inputs: list[Any] = concrete_inputs
  448. self.kwinputs: dict[str, Any] = kwinputs
  449. self.stack: list = stack
  450. self.scope: int = scope
  451. self.use_device: Optional[str] = use_device
  452. self.cpu_memory_usage: int = cpu_memory_usage
  453. self.device_memory_usage: int = device_memory_usage
  454. self.is_async: bool = is_async
  455. self.is_remote: bool = is_remote
  456. self.sequence_nr: int = sequence_nr
  457. self.device_type: DeviceType = device_type
  458. self.device_index: int = device_index
  459. self.device_resource_id: int = (
  460. thread if device_resource_id is None else device_resource_id
  461. )
  462. self.is_legacy: bool = is_legacy
  463. self.flops: Optional[int] = flops
  464. self.is_user_annotation: Optional[bool] = is_user_annotation
  465. self.self_cpu_percent = -1
  466. self.total_cpu_percent = -1
  467. self.total_device_percent = -1
  468. def append_kernel(self, name, device, duration):
  469. assert self.device_type == DeviceType.CPU
  470. self.kernels.append(Kernel(name, device, duration))
  471. def append_cpu_child(self, child):
  472. """Append a CPU child of type FunctionEvent.
  473. One is supposed to append only direct children to the event to have
  474. correct self cpu time being reported.
  475. """
  476. assert self.device_type == DeviceType.CPU
  477. assert isinstance(child, FunctionEvent)
  478. assert child.device_type == DeviceType.CPU
  479. self.cpu_children.append(child)
  480. def set_cpu_parent(self, parent):
  481. """Set the immediate CPU parent of type FunctionEvent.
  482. One profiling FunctionEvent should have only one CPU parent such that
  483. the child's range interval is completely inside the parent's. We use
  484. this connection to determine the event is from top-level op or not.
  485. """
  486. assert self.device_type == DeviceType.CPU
  487. assert isinstance(parent, FunctionEvent)
  488. assert parent.device_type == DeviceType.CPU
  489. self.cpu_parent = parent
  490. # Note: async events don't have children, are not used when computing 'self'
  491. # metrics of other events, have only total cpu time
  492. @property
  493. def self_cpu_memory_usage(self):
  494. if self.is_async or self.device_type != DeviceType.CPU:
  495. return 0
  496. return self.cpu_memory_usage - sum(
  497. child.cpu_memory_usage for child in self.cpu_children
  498. )
  499. @property
  500. def self_device_memory_usage(self):
  501. if self.is_async or self.device_type != DeviceType.CPU:
  502. return 0
  503. return self.device_memory_usage - sum(
  504. child.device_memory_usage for child in self.cpu_children
  505. )
  506. @property
  507. @deprecated(
  508. "`self_cuda_memory_usage` is deprecated. Use `self_device_memory_usage` instead.",
  509. category=FutureWarning,
  510. )
  511. def self_cuda_memory_usage(self): # To be deprecated
  512. return self.self_device_memory_usage
  513. @property
  514. def cpu_time_total(self):
  515. if self.device_type == DeviceType.CPU:
  516. return self.time_range.elapsed_us()
  517. else:
  518. return 0
  519. @property
  520. def self_cpu_time_total(self):
  521. if self.is_async or self.device_type != DeviceType.CPU:
  522. return 0
  523. return self.cpu_time_total - sum(
  524. child.cpu_time_total for child in self.cpu_children
  525. )
  526. @property
  527. def device_time_total(self):
  528. if self.is_async or not self.use_device:
  529. return 0
  530. if self.device_type == DeviceType.CPU:
  531. if not self.is_legacy:
  532. # account for the kernels in the children ops
  533. return sum(kinfo.duration for kinfo in self.kernels) + sum(
  534. ch.device_time_total for ch in self.cpu_children
  535. )
  536. else:
  537. # each legacy cpu events has a single (fake) kernel
  538. return sum(kinfo.duration for kinfo in self.kernels)
  539. else:
  540. assert self.device_type in [
  541. DeviceType.CUDA,
  542. DeviceType.PrivateUse1,
  543. DeviceType.MTIA,
  544. DeviceType.HPU,
  545. ]
  546. return self.time_range.elapsed_us()
  547. @property
  548. @deprecated(
  549. "`cuda_time_total` is deprecated. Use `device_time_total` instead.",
  550. category=FutureWarning,
  551. )
  552. def cuda_time_total(self): # To be deprecated
  553. return self.device_time_total
  554. @property
  555. def self_device_time_total(self):
  556. if self.is_async or not self.use_device:
  557. return 0
  558. if self.device_type == DeviceType.CPU:
  559. return self.device_time_total - sum(
  560. child.device_time_total for child in self.cpu_children
  561. )
  562. else:
  563. assert self.device_type in [
  564. DeviceType.CUDA,
  565. DeviceType.PrivateUse1,
  566. DeviceType.MTIA,
  567. DeviceType.HPU,
  568. ]
  569. return self.device_time_total
  570. @property
  571. @deprecated(
  572. "`self_cuda_time_total` is deprecated. Use `self_device_time_total` instead.",
  573. category=FutureWarning,
  574. )
  575. def self_cuda_time_total(self): # To be deprecated
  576. return self.self_device_time_total
  577. @property
  578. def key(self):
  579. return self.name
  580. def __repr__(self):
  581. device_name = self.use_device
  582. device_time = self.device_time_str
  583. device_memory_usage = self.device_memory_usage
  584. return (
  585. f"<FunctionEvent id={self.id} name={self.name} overload_name={self.overload_name} "
  586. f"device_type={self.device_type} node_id={self.node_id} cpu_time={self.cpu_time_str} "
  587. f"start_us={self.time_range.start} end_us={self.time_range.end} "
  588. f"cpu_children={str([child.id for child in self.cpu_children])} {device_name}_time={device_time} "
  589. f"name={self.name} thread={self.thread} input_shapes={str(self.input_shapes)} "
  590. f"cpu_memory_usage={self.cpu_memory_usage} {device_name}_memory_usage={device_memory_usage} "
  591. f"is_async={self.is_async} is_remote={self.is_remote} seq_nr={self.sequence_nr} is_legacy={self.is_legacy}>"
  592. )
  593. class FunctionEventAvg(FormattedTimesMixin):
  594. """Used to average stats over multiple FunctionEvent objects."""
  595. def __init__(self) -> None:
  596. self.key: Optional[str] = None
  597. self.count: int = 0
  598. self.node_id: int = 0
  599. self.is_async: bool = False
  600. self.is_remote: bool = False
  601. self.use_device: Optional[str] = None
  602. self.cpu_time_total: int = 0
  603. self.device_time_total: int = 0
  604. self.self_cpu_time_total: int = 0
  605. self.self_device_time_total: int = 0
  606. self.input_shapes: Optional[list[list[int]]] = None
  607. self.overload_name: Optional[str] = None
  608. self.stack: Optional[list] = None
  609. self.scope: Optional[int] = None
  610. self.cpu_memory_usage: int = 0
  611. self.device_memory_usage: int = 0
  612. self.self_cpu_memory_usage: int = 0
  613. self.self_device_memory_usage: int = 0
  614. self.cpu_children: Optional[list[FunctionEvent]] = None
  615. self.cpu_parent: Optional[FunctionEvent] = None
  616. self.device_type: DeviceType = DeviceType.CPU
  617. self.is_legacy: bool = False
  618. self.flops: int = 0
  619. def add(self, other):
  620. if self.key is None:
  621. # First function being recorded as part of FunctionEventAvg, propagate
  622. # fields.
  623. self.key = other.key
  624. self.node_id = other.node_id
  625. self.is_async = other.is_async
  626. self.is_remote = other.is_remote
  627. self.cpu_parent = other.cpu_parent
  628. self.cpu_children = other.cpu_children
  629. self.overload_name = other.overload_name
  630. self.input_shapes = other.input_shapes
  631. self.stack = other.stack
  632. self.scope = other.scope
  633. self.device_type = other.device_type
  634. self.is_legacy = other.is_legacy
  635. self.use_device = other.use_device
  636. self.is_user_annotation = other.is_user_annotation
  637. assert isinstance(other, (FunctionEvent, FunctionEventAvg))
  638. assert other.key == self.key
  639. self.cpu_time_total += other.cpu_time_total
  640. self.device_time_total += other.device_time_total
  641. self.self_cpu_time_total += other.self_cpu_time_total
  642. self.self_device_time_total += other.self_device_time_total
  643. self.cpu_memory_usage += other.cpu_memory_usage
  644. self.device_memory_usage += other.device_memory_usage
  645. self.self_cpu_memory_usage += other.self_cpu_memory_usage
  646. self.self_device_memory_usage += other.self_device_memory_usage
  647. self.count += other.count
  648. if self.flops is None:
  649. self.flops = other.flops
  650. elif other.flops is not None:
  651. self.flops += other.flops
  652. return self
  653. def __iadd__(self, other):
  654. return self.add(other)
  655. def __repr__(self):
  656. device_name = "cuda" if not self.use_device else self.use_device
  657. self_device_time = self.self_device_time_total_str
  658. device_time = self.device_time_str
  659. device_memory = self.device_memory_usage
  660. return (
  661. f"<FunctionEventAvg key={self.key} self_cpu_time={self.self_cpu_time_total_str} cpu_time={self.cpu_time_str} "
  662. f" self_{device_name}_time={self_device_time} {device_name}_time={device_time} input_shapes={str(self.input_shapes)} "
  663. f"cpu_memory_usage={self.cpu_memory_usage} {device_name}_memory_usage={device_memory}>"
  664. )
  665. class StringTable(defaultdict):
  666. def __missing__(self, key):
  667. # manage cases like 't' (demangled to 'unsigned short') separately,
  668. # for now simply check the length to avoid unexpected results for
  669. # the short sequences
  670. self[key] = torch._C._demangle(key) if len(key) > 1 else key
  671. return self[key]
  672. class MemRecordsAcc:
  673. """Acceleration structure for accessing mem_records in interval."""
  674. def __init__(self, mem_records):
  675. self._mem_records = mem_records
  676. self._start_nses: list[int] = []
  677. self._indices: list[int] = []
  678. if len(mem_records) > 0:
  679. tmp = sorted([(r[0].start_ns(), i) for i, r in enumerate(mem_records)])
  680. self._start_nses, self._indices = zip(*tmp) # type: ignore[assignment]
  681. def in_interval(self, start_us, end_us):
  682. r"""
  683. Return all records in the given interval
  684. To maintain backward compatibility, convert us to ns in function
  685. """
  686. start_idx = bisect.bisect_left(self._start_nses, start_us * 1000)
  687. end_idx = bisect.bisect_right(self._start_nses, end_us * 1000)
  688. for i in range(start_idx, end_idx):
  689. yield self._mem_records[self._indices[i]]
  690. def _filter_stack_entry(entry):
  691. filtered_entries = [
  692. ("autograd/__init__", "_make_grads"),
  693. ("autograd/__init__", "backward"),
  694. ("torch/tensor", "backward"),
  695. ("_internal/common_utils", "prof_callable"),
  696. ("_internal/common_utils", "prof_func_call"),
  697. ("_internal/common_utils", "prof_meth_call"),
  698. ]
  699. return all(not (f[0] in entry and f[1] in entry) for f in filtered_entries)
  700. MEMORY_EVENT_NAME = "[memory]"
  701. OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]"
  702. def _filter_name(name):
  703. # ignoring the following utility ops
  704. filtered_out_names = [
  705. MEMORY_EVENT_NAME, # used only for the top-level memory events
  706. OUT_OF_MEMORY_EVENT_NAME,
  707. "profiler::_record_function_enter",
  708. "profiler::_record_function_enter_new",
  709. "profiler::_record_function_exit",
  710. "aten::is_leaf",
  711. "aten::output_nr",
  712. "aten::_version",
  713. ]
  714. return name in filtered_out_names
  715. # Demangles and optionally rewrites the provided event name,
  716. # with_wildcard - whether to replace certain numbered event names
  717. # with a wildcard name to aggregate them together in the profiler table
  718. # output
  719. def _rewrite_name(name, with_wildcard=False):
  720. string_table = StringTable()
  721. name = string_table[name]
  722. if with_wildcard:
  723. if name.startswith("ProfilerStep#"):
  724. name = "ProfilerStep*"
  725. return name
  726. def _build_table(
  727. events,
  728. sort_by=None,
  729. header=None,
  730. row_limit=100,
  731. max_src_column_width=75,
  732. max_name_column_width=55,
  733. max_shapes_column_width=80,
  734. with_flops=False,
  735. profile_memory=False,
  736. top_level_events_only=False,
  737. time_unit=None,
  738. ):
  739. """Print a summary of events (which can be a list of FunctionEvent or FunctionEventAvg)."""
  740. if len(events) == 0:
  741. return ""
  742. has_device_time = any(event.self_device_time_total > 0 for event in events)
  743. has_device_mem = any(event.self_device_memory_usage > 0 for event in events)
  744. use_device = events[0].use_device
  745. # Running on PrivateUse1 device with profiler but not enable
  746. # ProfilerActivity.PrivateUse1 can also catch privateuse1 memory usage.
  747. # Here only need to check has_privateuse1_time if not use_device.
  748. if not use_device and has_device_time:
  749. raise RuntimeError("use_device is None, but there is device performance data.")
  750. has_input_shapes = any(
  751. (event.input_shapes is not None and len(event.input_shapes) > 0)
  752. for event in events
  753. )
  754. has_overload_names = any(
  755. (event.overload_name is not None and len(event.overload_name) > 0)
  756. for event in events
  757. )
  758. if sort_by is not None:
  759. events = EventList(
  760. sorted(
  761. events,
  762. key=lambda evt: getattr(
  763. evt,
  764. sort_by.replace("cuda", "device")
  765. .replace("xpu", "device")
  766. .replace("privateuse1", "device"),
  767. ),
  768. reverse=True,
  769. ),
  770. use_device=use_device,
  771. profile_memory=profile_memory,
  772. with_flops=with_flops,
  773. )
  774. name_column_width = max(len(evt.key) for evt in events) + 4
  775. if max_name_column_width is not None:
  776. name_column_width = min(name_column_width, max_name_column_width)
  777. shapes_column_width = max(len(str(evt.input_shapes)) for evt in events) + 4
  778. if max_shapes_column_width is not None:
  779. shapes_column_width = min(shapes_column_width, max_shapes_column_width)
  780. DEFAULT_COLUMN_WIDTH = 12
  781. flops_column_width = DEFAULT_COLUMN_WIDTH
  782. src_column_width = None
  783. stacks = [
  784. evt.stack for evt in events if evt.stack is not None and len(evt.stack) > 0
  785. ]
  786. has_stack = len(stacks) > 0
  787. if has_stack:
  788. src_column_width = (
  789. max(max(len(entry) for entry in stack) for stack in stacks) + 4
  790. )
  791. if max_src_column_width is not None:
  792. src_column_width = min(src_column_width, max_src_column_width)
  793. headers = ["Name"]
  794. if has_overload_names:
  795. headers.append("Overload Name")
  796. headers += [
  797. "Self CPU %",
  798. "Self CPU",
  799. "CPU total %",
  800. "CPU total",
  801. "CPU time avg",
  802. ]
  803. device_name = use_device.upper() if use_device is not None else "None"
  804. if has_device_time:
  805. headers.extend(
  806. [
  807. f"Self {device_name}",
  808. f"Self {device_name} %",
  809. f"{device_name} total",
  810. f"{device_name} time avg",
  811. ]
  812. )
  813. if profile_memory:
  814. headers.extend(
  815. [
  816. "CPU Mem",
  817. "Self CPU Mem",
  818. ]
  819. )
  820. if use_device and has_device_mem:
  821. headers.extend(
  822. [
  823. f"{device_name} Mem",
  824. f"Self {device_name} Mem",
  825. ]
  826. )
  827. headers.append("# of Calls")
  828. # Only append Node ID if any event has a valid (>= 0) Node ID
  829. append_node_id = any(evt.node_id != -1 for evt in events)
  830. if append_node_id:
  831. headers.append("Node ID")
  832. # Have to use a list because nonlocal is Py3 only...
  833. SPACING_SIZE = 2
  834. row_format_lst = [""]
  835. header_sep_lst = [""]
  836. line_length_lst = [-SPACING_SIZE]
  837. def add_column(padding, text_dir=">"):
  838. row_format_lst[0] += (
  839. "{: " + text_dir + str(padding) + "}" + (" " * SPACING_SIZE)
  840. )
  841. header_sep_lst[0] += "-" * padding + (" " * SPACING_SIZE)
  842. line_length_lst[0] += padding + SPACING_SIZE
  843. def auto_scale_flops(flops):
  844. flop_headers = [
  845. "FLOPs",
  846. "KFLOPs",
  847. "MFLOPs",
  848. "GFLOPs",
  849. "TFLOPs",
  850. "PFLOPs",
  851. ]
  852. assert flops > 0
  853. log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
  854. assert log_flops >= 0 and log_flops < len(flop_headers)
  855. return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])
  856. add_column(name_column_width)
  857. if has_overload_names:
  858. add_column(name_column_width)
  859. for _ in headers[1 + has_overload_names :]:
  860. add_column(DEFAULT_COLUMN_WIDTH)
  861. if has_input_shapes:
  862. headers.append("Input Shapes")
  863. add_column(shapes_column_width)
  864. if has_stack:
  865. headers.append("Source Location")
  866. add_column(src_column_width, text_dir="<")
  867. if with_flops:
  868. # Auto-scaling of flops header
  869. raw_flops = [evt.flops for evt in events if evt.flops > 0]
  870. if len(raw_flops) != 0:
  871. (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
  872. headers.append(f"Total {flops_header}")
  873. add_column(flops_column_width)
  874. else:
  875. with_flops = False # can't find any valid flops
  876. row_format = row_format_lst[0]
  877. header_sep = header_sep_lst[0]
  878. line_length = line_length_lst[0]
  879. add_column = None # type: ignore[assignment]
  880. # Have to use a list because nonlocal is Py3 only...
  881. result = []
  882. def append(s):
  883. result.append(s)
  884. result.append("\n") # Yes, newline after the end as well
  885. sum_self_cpu_time_total = 0
  886. sum_self_device_time_total = 0
  887. for evt in events:
  888. sum_self_cpu_time_total += evt.self_cpu_time_total
  889. if evt.device_type == DeviceType.CPU and evt.is_legacy:
  890. # in legacy profiler, kernel info is stored in cpu events
  891. sum_self_device_time_total += evt.self_device_time_total
  892. elif (
  893. evt.device_type
  894. in [
  895. DeviceType.CUDA,
  896. DeviceType.PrivateUse1,
  897. DeviceType.MTIA,
  898. ]
  899. and not evt.is_user_annotation
  900. ):
  901. # in kineto profiler, there're events with the correct device type (e.g. CUDA)
  902. sum_self_device_time_total += evt.self_device_time_total
  903. # Actual printing
  904. if header is not None:
  905. append("=" * line_length)
  906. append(header)
  907. if top_level_events_only:
  908. append("=" * line_length)
  909. append("This report only display top-level ops statistics")
  910. append(header_sep)
  911. append(row_format.format(*headers))
  912. append(header_sep)
  913. def trim_path(path, src_column_width):
  914. if len(path) > src_column_width:
  915. offset = len(path) - src_column_width
  916. path = path[offset:]
  917. if len(path) > 3:
  918. path = "..." + path[3:]
  919. return path
  920. def override_time_unit(time_us, default_str, time_unit):
  921. US_IN_SECOND = 1000.0 * 1000.0
  922. US_IN_MS = 1000.0
  923. if time_unit == "s":
  924. return f"{time_us / US_IN_SECOND:.3f}s"
  925. elif time_unit == "ms":
  926. return f"{time_us / US_IN_MS:.3f}ms"
  927. elif time_unit == "us":
  928. return f"{time_us:.3f}us"
  929. else:
  930. return default_str
  931. event_limit = 0
  932. for evt in events:
  933. if event_limit == row_limit:
  934. break
  935. if top_level_events_only and evt.cpu_parent is not None:
  936. continue
  937. else:
  938. event_limit += 1
  939. name = evt.key
  940. if max_name_column_width is not None and len(name) >= max_name_column_width - 3:
  941. name = name[: (max_name_column_width - 3)] + "..."
  942. evt.self_cpu_percent = _format_time_share(
  943. evt.self_cpu_time_total, sum_self_cpu_time_total
  944. )
  945. evt.total_cpu_percent = (
  946. _format_time_share(evt.cpu_time_total, sum_self_cpu_time_total)
  947. if not evt.is_async
  948. else 0
  949. )
  950. row_values = [name]
  951. if has_overload_names:
  952. overload_name = evt.overload_name
  953. if (
  954. max_name_column_width is not None
  955. and len(overload_name) >= max_name_column_width - 3
  956. ):
  957. overload_name = overload_name[: (max_name_column_width - 3)] + "..."
  958. row_values += [overload_name]
  959. row_values += [
  960. # Self CPU total %, 0 for async events.
  961. evt.self_cpu_percent,
  962. override_time_unit(
  963. evt.self_cpu_time_total, evt.self_cpu_time_total_str, time_unit
  964. ), # Self CPU total
  965. # CPU total %, 0 for async events.
  966. evt.total_cpu_percent,
  967. override_time_unit(
  968. evt.cpu_time_total, evt.cpu_time_total_str, time_unit
  969. ), # CPU total
  970. override_time_unit(
  971. evt.cpu_time, evt.cpu_time_str, time_unit
  972. ), # CPU time avg
  973. ]
  974. if has_device_time:
  975. evt.total_device_percent = _format_time_share(
  976. evt.self_device_time_total, sum_self_device_time_total
  977. )
  978. row_values.extend(
  979. [
  980. override_time_unit(
  981. evt.self_device_time_total,
  982. evt.self_device_time_total_str,
  983. time_unit,
  984. ),
  985. # device time total %
  986. evt.total_device_percent,
  987. override_time_unit(
  988. evt.device_time_total, evt.device_time_total_str, time_unit
  989. ),
  990. override_time_unit(
  991. evt.device_time, evt.device_time_str, time_unit
  992. ), # device time avg
  993. ]
  994. )
  995. if profile_memory:
  996. row_values.extend(
  997. [
  998. # CPU Mem Total
  999. _format_memory(evt.cpu_memory_usage),
  1000. # Self CPU Mem Total
  1001. _format_memory(evt.self_cpu_memory_usage),
  1002. ]
  1003. )
  1004. if use_device and has_device_mem:
  1005. row_values.extend(
  1006. [
  1007. # Device Mem Total
  1008. _format_memory(evt.device_memory_usage),
  1009. # Self Device Mem Total
  1010. _format_memory(evt.self_device_memory_usage),
  1011. ]
  1012. )
  1013. row_values.append(
  1014. evt.count, # Number of calls
  1015. )
  1016. if append_node_id:
  1017. row_values.append(evt.node_id)
  1018. if has_input_shapes:
  1019. row_values.append(str(evt.input_shapes)[:shapes_column_width])
  1020. if with_flops:
  1021. if evt.flops <= 0:
  1022. row_values.append("--")
  1023. else:
  1024. row_values.append(f"{evt.flops * flops_scale:8.3f}") # type: ignore[possibly-undefined]
  1025. if has_stack:
  1026. src_field = ""
  1027. if len(evt.stack) > 0:
  1028. src_field = trim_path(evt.stack[0], src_column_width)
  1029. row_values.append(src_field)
  1030. append(row_format.format(*row_values))
  1031. if has_stack:
  1032. empty_headers = [""] * (len(headers) - 1)
  1033. for entry in evt.stack[1:]:
  1034. append(
  1035. row_format.format(
  1036. *(empty_headers + [trim_path(entry, src_column_width)])
  1037. )
  1038. )
  1039. empty_headers.append("")
  1040. append(row_format.format(*empty_headers))
  1041. append(header_sep)
  1042. append(
  1043. f"Self CPU time total: {override_time_unit(sum_self_cpu_time_total, _format_time(sum_self_cpu_time_total), time_unit)}"
  1044. )
  1045. if has_device_time:
  1046. append(
  1047. f"Self {use_device.upper() if use_device is not None else 'None'} "
  1048. f"time total: {override_time_unit(sum_self_device_time_total, _format_time(sum_self_device_time_total), time_unit)}"
  1049. )
  1050. return "".join(result)