accuracy_compare.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700
  1. # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import numpy as np
  16. # Judge whether the value is within the range indicated by fp16
  17. def is_infinite(value, dtype=np.float16):
  18. # return value > np.finfo(np.float16).max or value < np.finfo(np.float16).min
  19. array = np.array([value]).astype(dtype)
  20. return np.isinf(array) or np.isnan(array)
  21. # Judge whether the value of fp32 is equal to that of fp16
  22. def is_allclose(actual, expected, atol=1e-2, rtol=1e-2):
  23. return np.allclose(
  24. np.array([actual]), np.array([expected]), atol=atol, rtol=rtol
  25. )
  26. class TensorInfo:
  27. def __init__(self):
  28. self.device = None
  29. self.op_type = None
  30. self.tensor_name = None
  31. self.dtype = None
  32. self.numel = None
  33. self.max_value = None
  34. self.min_value = None
  35. self.mean_value = None
  36. self.has_inf = None
  37. self.has_nan = None
  38. self.num_zero = None
  39. def __str__(self):
  40. return f"[TensorInfo] device={self.device}, op_type={self.op_type}, tensor_name={self.tensor_name}, dtype={self.dtype}, numel={self.numel}, num_inf={self.has_inf}, num_nan={self.has_nan}, num_zero={self.num_zero}, max_value={self.max_value:.6f}, min_value={self.min_value:.6f}, mean_value={self.mean_value:.6f}"
  41. def key(
  42. self,
  43. ):
  44. return self.op_type + "/" + self.tensor_name
  45. def init_from_string(self, line):
  46. try:
  47. line_frags = line.strip().split(" ")
  48. for frag in line_frags:
  49. word_str = (
  50. frag.replace("[", "").replace("]", "").replace(",", "")
  51. )
  52. words = word_str.split("=")
  53. if words[0] == "op":
  54. self.op_type = words[1]
  55. elif words[0] == "device":
  56. self.device = words[1]
  57. elif words[0] == "tensor":
  58. self.tensor_name = words[1]
  59. elif words[0] == "dtype":
  60. self.dtype = words[1]
  61. elif words[0] == "numel":
  62. self.numel = np.int64(words[1])
  63. elif words[0] == "max":
  64. self.max_value = np.float32(words[1])
  65. elif words[0] == "min":
  66. self.min_value = np.float32(words[1])
  67. elif words[0] == "mean":
  68. self.mean_value = np.float32(words[1])
  69. elif words[0] == "num_inf":
  70. self.has_inf = int(words[1])
  71. elif words[0] == "num_nan":
  72. self.has_nan = int(words[1])
  73. elif words[0] == "num_zero":
  74. self.num_zero = np.int64(words[1])
  75. except Exception as e:
  76. print(f"!! Error parsing {line}")
  77. return self
  78. class MixedPrecisionTensorInfo:
  79. def __init__(
  80. self, fp32_tensor_info, fp16_tensor_info, fp32_idx=0, grad_scale=1.0
  81. ):
  82. self.is_normal = True
  83. self.fp32_idx = fp32_idx
  84. self.fp32_tensor_name = None
  85. self.fp32_dtype = None
  86. self.fp32_max_value = None
  87. self.fp32_min_value = None
  88. self.fp32_mean_value = None
  89. self.fp32_num_zero = None
  90. self.scaled_fp32_max_value = None
  91. self.scaled_fp32_min_value = None
  92. self.fp16_tensor_name = None
  93. self.fp16_dtype = None
  94. self.fp16_max_value = None
  95. self.fp16_min_value = None
  96. self.fp16_mean_value = None
  97. self.fp16_num_zero = None
  98. self.fp16_has_inf = None
  99. self.fp16_has_nan = None
  100. self.fp32_div_fp16_max_value = None
  101. self.fp32_div_fp16_min_value = None
  102. self.fp32_div_fp16_mean_value = None
  103. if fp32_tensor_info is not None:
  104. self.op_type = fp32_tensor_info.op_type
  105. self.numel = fp32_tensor_info.numel
  106. self.fp32_num_zero = fp32_tensor_info.num_zero
  107. self.fp32_tensor_name = fp32_tensor_info.tensor_name
  108. self.fp32_dtype = fp32_tensor_info.dtype
  109. self.fp32_max_value = fp32_tensor_info.max_value
  110. self.fp32_min_value = fp32_tensor_info.min_value
  111. self.fp32_mean_value = fp32_tensor_info.mean_value
  112. if "GRAD" in self.fp32_tensor_name:
  113. self.scaled_fp32_max_value = (
  114. grad_scale * fp32_tensor_info.max_value
  115. )
  116. self.scaled_fp32_min_value = (
  117. grad_scale * fp32_tensor_info.min_value
  118. )
  119. if fp16_tensor_info is not None:
  120. self.op_type = fp16_tensor_info.op_type
  121. self.numel = fp16_tensor_info.numel
  122. self.fp16_num_zero = fp16_tensor_info.num_zero
  123. self.fp16_tensor_name = fp16_tensor_info.tensor_name
  124. self.fp16_dtype = fp16_tensor_info.dtype
  125. self.fp16_max_value = fp16_tensor_info.max_value
  126. self.fp16_min_value = fp16_tensor_info.min_value
  127. self.fp16_mean_value = fp16_tensor_info.mean_value
  128. self.fp16_has_inf = fp16_tensor_info.has_inf
  129. self.fp16_has_nan = fp16_tensor_info.has_nan
  130. if fp32_tensor_info is not None and fp16_tensor_info is not None:
  131. # Check whether the op name and data are equal
  132. assert fp32_tensor_info.op_type == fp16_tensor_info.op_type
  133. assert (
  134. fp32_tensor_info.numel == fp16_tensor_info.numel
  135. ), f"Error:\n\tFP32 Tensor Info:{fp32_tensor_info}\n\tFP16 Tensor Info:{fp16_tensor_info}"
  136. # Fp16 divided by fp32
  137. self.fp32_div_fp16_max_value = self._div(
  138. self.fp16_max_value, self.fp32_max_value
  139. )
  140. self.fp32_div_fp16_min_value = self._div(
  141. self.fp16_min_value, self.fp32_min_value
  142. )
  143. self.fp32_div_fp16_mean_value = self._div(
  144. self.fp16_mean_value, self.fp32_mean_value
  145. )
  146. self._check_normal()
  147. def __str__(self):
  148. def _float_str(value):
  149. return f"{value:.6f}" if value is not None else value
  150. debug_str = f"[MixedPrecisionTensorInfo] op_type={self.op_type}, numel={self.numel}"
  151. debug_str += f"\n FP32: tensor_name={self.fp32_tensor_name}, dtype={self.fp32_dtype}, max_value={_float_str(self.fp32_max_value)}, min_value={_float_str(self.fp32_min_value)}, mean_value={_float_str(self.fp32_mean_value)}"
  152. debug_str += f"\n FP16: tensor_name={self.fp16_tensor_name}, dtype={self.fp16_dtype}, max_value={_float_str(self.fp16_max_value)}, min_value={_float_str(self.fp16_min_value)}, mean_value={_float_str(self.fp16_mean_value)}, has_inf={self.fp16_has_inf}, has_nan={self.fp16_has_nan}"
  153. return debug_str
  154. def _div(self, a, b):
  155. if a is not None and b is not None:
  156. return a / b if b != 0 else 1
  157. return None
  158. def get_tensor_name(self):
  159. if self.fp32_tensor_name is None:
  160. return self.fp16_tensor_name # + "#" + str(self.idx)
  161. elif self.fp16_tensor_name is None:
  162. return self.fp32_tensor_name + "#" + str(self.fp32_idx)
  163. else:
  164. return (
  165. self.fp16_tensor_name.replace(".cast_fp16", "/.cast_fp16/")
  166. + "#"
  167. + str(self.fp32_idx)
  168. )
  169. def _check_normal(self):
  170. # When the OP meets the following conditions, it is abnormal data, and use --skip_normal_tensors to retain the data in Excel:
  171. # 1. The number of OP outputs exceeds the indication range of int32
  172. # 2. The output data exceeds the representation range of fp16
  173. # 3. Nan or inf appears in fp16 output data
  174. # 4. The maximum value of fp32 is not equal to the maximum value of fp16
  175. # 5. The minimum value of fp32 is not equal to the minimum value of fp16
  176. if self.numel is not None and self.numel > np.iinfo(np.int32).max:
  177. self.is_normal = False
  178. return
  179. check_list = [
  180. self.fp32_max_value,
  181. self.fp32_min_value,
  182. self.scaled_fp32_max_value,
  183. self.scaled_fp32_min_value,
  184. self.fp16_max_value,
  185. self.fp16_min_value,
  186. ]
  187. for value in check_list:
  188. if value is not None and is_infinite(value):
  189. self.is_normal = False
  190. return
  191. if self.fp16_has_inf is not None and self.fp16_has_inf:
  192. self.is_normal = False
  193. return
  194. if self.fp16_has_nan is not None and self.fp16_has_nan:
  195. self.is_normal = False
  196. return
  197. if (
  198. self.scaled_fp32_max_value is not None
  199. and self.fp16_max_value is not None
  200. and not is_allclose(self.fp16_max_value, self.scaled_fp32_max_value)
  201. ):
  202. self.is_normal = False
  203. return
  204. if (
  205. self.scaled_fp32_min_value is not None
  206. and self.fp16_min_value is not None
  207. and not is_allclose(self.fp16_min_value, self.scaled_fp32_min_value)
  208. ):
  209. self.is_normal = False
  210. return
  211. class ExcelWriter:
  212. def __init__(self, log_fp32_dir, log_fp16_dir, output_path):
  213. self.log_fp32_dir = log_fp32_dir
  214. self.log_fp16_dir = log_fp16_dir
  215. try:
  216. import xlsxwriter as xlw
  217. except ImportError:
  218. print(
  219. "import xlsxwriter failed. please run 'pip install xlsxwriter==3.0.9' to install it"
  220. )
  221. self.workbook = xlw.Workbook(output_path)
  222. self.title_format = self.workbook.add_format(
  223. {
  224. 'bold': True,
  225. 'border': 1,
  226. 'font_color': 'black',
  227. 'bg_color': '#6495ED',
  228. 'align': 'center',
  229. }
  230. )
  231. self.tensor_name_format = self.workbook.add_format(
  232. {'bold': True, 'bg_color': '#F5F5F5'}
  233. )
  234. self.red_bg_cell_format = self.workbook.add_format(
  235. {'bold': True, 'bg_color': 'red'}
  236. )
  237. self.yellow_bg_cell_format = self.workbook.add_format(
  238. {'bold': True, 'bg_color': 'yellow'}
  239. )
  240. self.orange_bg_cell_format = self.workbook.add_format(
  241. {'bold': True, 'bg_color': 'orange'}
  242. )
  243. def close(self):
  244. self.workbook.close()
  245. self.workbook = None
  246. def _write_dtype(self, worksheet, value, row, col):
  247. if value is None:
  248. worksheet.write(row, col, "--")
  249. else:
  250. if value == "fp16":
  251. worksheet.write(row, col, value, self.yellow_bg_cell_format)
  252. else:
  253. worksheet.write(row, col, value)
  254. def _write_tensor_name(self, worksheet, mp_tensor_info, row, col):
  255. tensor_name = mp_tensor_info.get_tensor_name()
  256. if (
  257. mp_tensor_info.fp32_tensor_name is not None
  258. and mp_tensor_info.fp16_tensor_name
  259. ):
  260. worksheet.write(row, col, tensor_name, self.tensor_name_format)
  261. else:
  262. worksheet.write(row, col, tensor_name)
  263. def _write_maxmin_value(
  264. self, worksheet, value, row, col, check_finite=True
  265. ):
  266. if value is None:
  267. worksheet.write(row, col, "--")
  268. else:
  269. if abs(value) < 1e-5:
  270. value_str = f"{value:.6E}"
  271. else:
  272. value_str = f"{value:.6f}"
  273. if check_finite and is_infinite(value, np.float16):
  274. worksheet.write(row, col, value_str, self.red_bg_cell_format)
  275. else:
  276. worksheet.write(row, col, value_str)
  277. def _write_tensor_num_zero(
  278. self, worksheet, value, row, col, check_finite=True
  279. ):
  280. if value is None:
  281. worksheet.write(row, col, "--")
  282. else:
  283. value_str = f"{value:>10d}"
  284. worksheet.write(row, col, value_str)
  285. def _write_infinite_status(self, worksheet, value, row, col):
  286. if value is None:
  287. worksheet.write(row, col, "--")
  288. else:
  289. if value == 1:
  290. worksheet.write(row, col, value, self.red_bg_cell_format)
  291. else:
  292. worksheet.write(row, col, value)
  293. def _write_fp32divfp16_value(self, worksheet, value, row, col, loss_scale):
  294. def _in_range(value, scale=1):
  295. return value > scale * 0.95 and value < scale * 1.05
  296. if value is None:
  297. worksheet.write(row, col, "--")
  298. else:
  299. value_str = f"{value:.6f}"
  300. if _in_range(value, scale=1) or _in_range(value, loss_scale):
  301. worksheet.write(row, col, value_str)
  302. else:
  303. worksheet.write(row, col, value_str, self.orange_bg_cell_format)
  304. def _write_titles(self, worksheet, loss_scale, row):
  305. column_width_dict = {
  306. "op_type": 24,
  307. "tensor_name": 60,
  308. "numel": 10,
  309. "num_zero": 10,
  310. "infinite": 8,
  311. "dtype": 8,
  312. "max_value": 16,
  313. "min_value": 16,
  314. "mean_value": 16,
  315. "num_inf": 8,
  316. "num_nan": 8,
  317. }
  318. title_names = ["op_type", "tensor_name", "numel", "infinite"]
  319. if self.log_fp16_dir is None:
  320. # only fp32 values
  321. worksheet.merge_range("E1:H1", "fp32", self.title_format)
  322. worksheet.merge_range(
  323. "I1:J1", f"fp32 (scale={loss_scale})", self.title_format
  324. )
  325. title_names.extend(
  326. [
  327. "dtype",
  328. "max_value",
  329. "min_value",
  330. "mean_value",
  331. "max_value",
  332. "min_value",
  333. ]
  334. )
  335. elif self.log_fp32_dir is None:
  336. # only fp16 values
  337. worksheet.merge_range(
  338. "E1:J1", f"fp16 (scale={loss_scale})", self.title_format
  339. )
  340. title_names.extend(
  341. [
  342. "dtype",
  343. "max_value",
  344. "min_value",
  345. "mean_value",
  346. "num_zero",
  347. "num_inf",
  348. "num_nan",
  349. ]
  350. )
  351. else:
  352. # fp32 and fp16 values
  353. worksheet.merge_range("E1:H1", "fp32", self.title_format)
  354. worksheet.merge_range(
  355. "I1:N1", f"fp16 (scale={loss_scale})", self.title_format
  356. )
  357. worksheet.merge_range("O1:Q1", "fp16 / fp32", self.title_format)
  358. title_names.extend(
  359. [
  360. "dtype",
  361. "max_value",
  362. "min_value",
  363. "mean_value",
  364. "num_zero",
  365. "dtype",
  366. "max_value",
  367. "min_value",
  368. "mean_value",
  369. "num_zero",
  370. "num_inf",
  371. "num_nan",
  372. "max_value",
  373. "min_value",
  374. "mean_value",
  375. ]
  376. )
  377. for col in range(len(title_names)):
  378. col_char = chr(ord("A") + col)
  379. worksheet.set_column(
  380. col_char + ":" + col_char, column_width_dict[title_names[col]]
  381. )
  382. for col in range(len(title_names)):
  383. worksheet.write(row, col, title_names[col], self.title_format)
  384. def add_worksheet(
  385. self, mp_tensor_info_list, sheetname, loss_scale, skip_normal_tensors
  386. ):
  387. assert self.workbook is not None
  388. worksheet = self.workbook.add_worksheet(sheetname)
  389. row = 1
  390. self._write_titles(worksheet, loss_scale, row)
  391. row += 1
  392. infinite_op_types = []
  393. for tensor_info in mp_tensor_info_list:
  394. if (
  395. not tensor_info.is_normal
  396. and tensor_info.op_type not in infinite_op_types
  397. ):
  398. infinite_op_types.append(tensor_info.op_type)
  399. if skip_normal_tensors and tensor_info.is_normal:
  400. continue
  401. worksheet.write(row, 0, tensor_info.op_type)
  402. self._write_tensor_name(worksheet, tensor_info, row, 1)
  403. if tensor_info.numel > np.iinfo(np.int32).max:
  404. worksheet.write(
  405. row, 2, tensor_info.numel, self.bad_value_format
  406. )
  407. else:
  408. worksheet.write(row, 2, tensor_info.numel)
  409. if tensor_info.is_normal:
  410. worksheet.write(row, 3, "0")
  411. else:
  412. worksheet.write(row, 3, "1", self.red_bg_cell_format)
  413. col = 4
  414. if self.log_fp32_dir is not None:
  415. self._write_dtype(worksheet, tensor_info.fp32_dtype, row, col)
  416. self._write_maxmin_value(
  417. worksheet, tensor_info.fp32_max_value, row, col + 1
  418. )
  419. self._write_maxmin_value(
  420. worksheet, tensor_info.fp32_min_value, row, col + 2
  421. )
  422. self._write_maxmin_value(
  423. worksheet, tensor_info.fp32_mean_value, row, col + 3
  424. )
  425. self._write_tensor_num_zero(
  426. worksheet, tensor_info.fp32_num_zero, row, col + 4
  427. )
  428. col += 5
  429. if self.log_fp16_dir is None:
  430. self._write_maxmin_value(
  431. worksheet, tensor_info.scaled_fp32_max_value, row, col
  432. )
  433. self._write_maxmin_value(
  434. worksheet,
  435. tensor_info.scaled_fp32_min_value,
  436. row,
  437. col + 1,
  438. )
  439. col += 2
  440. if self.log_fp16_dir is not None:
  441. self._write_dtype(worksheet, tensor_info.fp16_dtype, row, col)
  442. self._write_maxmin_value(
  443. worksheet, tensor_info.fp16_max_value, row, col + 1
  444. )
  445. self._write_maxmin_value(
  446. worksheet, tensor_info.fp16_min_value, row, col + 2
  447. )
  448. self._write_maxmin_value(
  449. worksheet, tensor_info.fp16_mean_value, row, col + 3
  450. )
  451. self._write_tensor_num_zero(
  452. worksheet, tensor_info.fp32_num_zero, row, col + 4
  453. )
  454. col += 5
  455. self._write_infinite_status(
  456. worksheet, tensor_info.fp16_has_inf, row, col
  457. )
  458. self._write_infinite_status(
  459. worksheet, tensor_info.fp16_has_nan, row, col + 1
  460. )
  461. col += 2
  462. if self.log_fp32_dir is not None and self.log_fp16_dir is not None:
  463. self._write_fp32divfp16_value(
  464. worksheet,
  465. tensor_info.fp32_div_fp16_max_value,
  466. row,
  467. col,
  468. loss_scale,
  469. )
  470. self._write_fp32divfp16_value(
  471. worksheet,
  472. tensor_info.fp32_div_fp16_min_value,
  473. row,
  474. col + 1,
  475. loss_scale,
  476. )
  477. self._write_fp32divfp16_value(
  478. worksheet,
  479. tensor_info.fp32_div_fp16_mean_value,
  480. row,
  481. col + 2,
  482. loss_scale,
  483. )
  484. col += 3
  485. row += 1
  486. print(f"-- OP Types produce infinite outputs: {infinite_op_types}")
  487. def parse_lines(lines, specified_op_list=None):
  488. tensor_info_list = []
  489. for i in range(len(lines)):
  490. if i % 10 == 0:
  491. print(
  492. f"-- Processing {i:-8d} / {len(lines):-8d} line",
  493. end="\r",
  494. )
  495. line = lines[i]
  496. if "[PRECISION]" in line:
  497. tensor_info = TensorInfo()
  498. tensor_info.init_from_string(line)
  499. if (
  500. tensor_info.tensor_name is not None
  501. and tensor_info.tensor_name != ""
  502. ):
  503. has_tensor_name = True
  504. if (
  505. specified_op_list is None
  506. or tensor_info.op_type in specified_op_list
  507. ):
  508. tensor_info_list.append(tensor_info)
  509. # print(tensor_info)
  510. return tensor_info_list
  511. def parse_log(log_dir, filename, specified_op_list=None):
  512. if log_dir is None or filename is None:
  513. return None
  514. complete_filename = log_dir + "/" + filename
  515. tensor_info_list = None
  516. has_tensor_name = False
  517. try:
  518. with open(complete_filename, 'r') as f:
  519. lines = f.readlines()
  520. tensor_info_list = parse_lines(lines, specified_op_list)
  521. except FileNotFoundError:
  522. print("the file ", complete_filename, "is not found")
  523. return None, has_tensor_name
  524. return tensor_info_list, has_tensor_name
  525. def merge_tensor_info_list(
  526. fp32_tensor_info_list, fp16_tensor_info_list, grad_scale
  527. ):
  528. mp_tensor_info_list = []
  529. if fp16_tensor_info_list is not None:
  530. fp32_tensor_info_dict = {}
  531. fp32_write_count = {}
  532. if fp32_tensor_info_list is not None:
  533. for tensor_info in fp32_tensor_info_list:
  534. tensor_info_key = tensor_info.key()
  535. count = fp32_write_count.get(tensor_info_key, 0)
  536. fp32_write_count[tensor_info_key] = count + 1
  537. fp32_tensor_info_dict[
  538. tensor_info_key + "#" + str(count)
  539. ] = tensor_info
  540. fp32_read_count = {}
  541. for i in range(len(fp16_tensor_info_list)):
  542. if i % 10 == 0:
  543. print(
  544. f"-- Processing {i:-8d} / {len(fp16_tensor_info_list):-8d} FP16 Tensor Info",
  545. end="\r",
  546. )
  547. fp16_tensor_info = fp16_tensor_info_list[i]
  548. fp32_tensor_info_key = (
  549. fp16_tensor_info.key()
  550. .replace(".cast_fp16", "")
  551. .replace(".cast_fp32", "")
  552. )
  553. count = fp32_read_count.get(fp32_tensor_info_key, 0)
  554. fp32_tensor_info = fp32_tensor_info_dict.get(
  555. fp32_tensor_info_key + "#" + str(count), None
  556. )
  557. if fp32_tensor_info is not None:
  558. fp32_read_count[fp32_tensor_info_key] = count + 1
  559. mp_tensor_info = MixedPrecisionTensorInfo(
  560. fp32_tensor_info, fp16_tensor_info, count, grad_scale
  561. )
  562. mp_tensor_info_list.append(mp_tensor_info)
  563. # print(mp_tensor_info)
  564. elif fp32_tensor_info_list is not None:
  565. fp32_count = {}
  566. for i in range(len(fp32_tensor_info_list)):
  567. if i % 10 == 0:
  568. print(
  569. f"-- Processing {i:-8d} / {len(fp32_tensor_info_list):-8d} FP32 Tensor Info",
  570. end="\r",
  571. )
  572. tensor_info = fp32_tensor_info_list[i]
  573. tensor_info_key = tensor_info.key()
  574. count = fp32_count.get(tensor_info_key, 0)
  575. fp32_count[tensor_info_key] = count + 1
  576. mp_tensor_info = MixedPrecisionTensorInfo(
  577. tensor_info, None, count, grad_scale
  578. )
  579. mp_tensor_info_list.append(mp_tensor_info)
  580. return mp_tensor_info_list
  581. def compare_accuracy(
  582. dump_path,
  583. another_dump_path,
  584. output_filename,
  585. loss_scale=1,
  586. dump_all_tensors=False,
  587. ):
  588. excel_writer = ExcelWriter(dump_path, another_dump_path, output_filename)
  589. grad_scale = loss_scale
  590. workerlog_filenames = []
  591. filenames = os.listdir(dump_path)
  592. for name in filenames:
  593. if "worker_" in name:
  594. workerlog_filenames.append(name)
  595. print(
  596. f"-- There are {len(workerlog_filenames)} workerlogs under {dump_path}: {workerlog_filenames}"
  597. )
  598. for filename in sorted(workerlog_filenames):
  599. print(f"-- [Step 1/4] Parsing FP32 logs under {dump_path}/{filename}")
  600. fp32_tensor_info_list, fp32_has_tensor_name = parse_log(
  601. dump_path, filename, None
  602. )
  603. print(
  604. f"-- [Step 2/4] Parsing FP16 logs under {another_dump_path}/{filename}"
  605. )
  606. fp16_tensor_info_list, fp16_has_tensor_name = parse_log(
  607. another_dump_path, filename, None
  608. )
  609. print(f"-- [Step 3/4] Merge FP32 and FP16 tensor info for {filename}")
  610. mp_tensor_info_list = merge_tensor_info_list(
  611. fp32_tensor_info_list, fp16_tensor_info_list, grad_scale
  612. )
  613. print(
  614. f"-- [Step 4/4] Add worksheet for mixed precision tensor info of {filename}"
  615. )
  616. excel_writer.add_worksheet(
  617. mp_tensor_info_list,
  618. filename,
  619. loss_scale,
  620. False,
  621. )
  622. print(f"-- Write to {output_filename}")
  623. print("")
  624. excel_writer.close()