graphs.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import warnings
  16. import paddle
  17. from paddle.base import core
  18. from paddle.base.core import (
  19. CUDAPlace,
  20. is_compiled_with_cuda,
  21. is_compiled_with_rocm,
  22. )
  23. if is_compiled_with_cuda() or is_compiled_with_rocm():
  24. from paddle.base.core import CUDAGraph as CoreCUDAGraph
  25. def is_cuda_graph_supported():
  26. return True
  27. else:
  28. CoreCUDAGraph = None
  29. def is_cuda_graph_supported():
  30. return False
  31. ALL_MODES = ["global", "thread_local", "relaxed"]
  32. cuda_graph_id = 0
  33. class CUDAGraph:
  34. def __init__(self, place=None, mode="thread_local"):
  35. assert (
  36. CoreCUDAGraph is not None
  37. ), "CUDA Graph is only supported on PaddlePaddle compiled with NVIDIA GPU."
  38. self._graph = None
  39. if place is None:
  40. device_id = int(os.environ.get('FLAGS_selected_gpus', 0))
  41. place = CUDAPlace(device_id)
  42. self._place = place
  43. assert mode in ALL_MODES
  44. self._mode = ALL_MODES.index(mode)
  45. def capture_begin(self):
  46. CoreCUDAGraph.begin_capture(self._place, self._mode)
  47. def capture_end(self):
  48. self._graph = CoreCUDAGraph.end_capture()
  49. def replay(self):
  50. self._graph.replay()
  51. def reset(self):
  52. self._graph.reset()
  53. def print_to_dot_files(self, dirname, flags=None):
  54. if not isinstance(dirname, (str, bytes)):
  55. dirname = dirname.name
  56. os.makedirs(name=dirname, exist_ok=True)
  57. assert os.path.isdir(
  58. dirname
  59. ), f"The dirname {dirname} should be a directory"
  60. if flags is None:
  61. flags = 2047 # only all information. It can be any integer inside [1, 2048)
  62. self._graph.print_to_dot_files(dirname, flags)
  63. def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"):
  64. assert mode in ALL_MODES
  65. if not paddle.in_dynamic_mode():
  66. # static graph mode
  67. from paddle.base.framework import _cuda_graph_guard
  68. global cuda_graph_id
  69. graph_id = str(cuda_graph_id)
  70. cuda_graph_id += 1
  71. if memory_pool == 'default':
  72. memory_pool_id = 0
  73. elif memory_pool == 'new':
  74. memory_pool_id = CoreCUDAGraph.gen_new_memory_pool_id()
  75. else:
  76. raise ValueError(
  77. "memory_pool should be one of default or new under static graph mode, but got",
  78. memory_pool,
  79. )
  80. return _cuda_graph_guard(
  81. mode + ';' + str(memory_pool_id) + ';' + graph_id
  82. )(lambda *args, **kwargs: function(*args, **kwargs))
  83. from paddle.jit import to_static
  84. from paddle.nn import Layer
  85. new_function = to_static(function)
  86. if isinstance(function, Layer):
  87. mock_func = new_function.forward
  88. else:
  89. mock_func = new_function
  90. mock_func._cuda_graph_capture_mode = mode
  91. if memory_pool == "default":
  92. mock_func._cuda_graph_pool_id = 0
  93. elif memory_pool == "new":
  94. mock_func._cuda_graph_pool_id = CoreCUDAGraph.gen_new_memory_pool_id()
  95. else:
  96. if isinstance(memory_pool, Layer):
  97. mock_func._cuda_graph_pool_id = (
  98. memory_pool.forward._cuda_graph_pool_id
  99. )
  100. else:
  101. mock_func._cuda_graph_pool_id = memory_pool._cuda_graph_pool_id
  102. return new_function
  103. def copy_var_desc(dst, src):
  104. """
  105. copy var desc from src to dst
  106. :param dst: framework.VarDesc(cpp), dst var desc, cpp VarDesc instance
  107. :param src: framework.VarDesc(cpp), src var desc, cpp VarDesc instance
  108. :return: no return
  109. """
  110. dst.set_shape(src.shape)
  111. dst.set_dtype(src.dtype)
  112. dst.set_lod_level(src.lod_level)
  113. dst.set_type(src.type)
  114. dst.set_persistable(src.persistable)
  115. dst.set_is_parameter(src.is_parameter)
  116. dst.set_stop_gradient(src.stop_gradient)
  117. def all_inputs_of_later_op(block, begin_idx):
  118. """
  119. find all inputs of ops after an idx, used to determine the logical output of a cuda graph section
  120. :param block: framework.Block, the original block
  121. :param begin_idx: int, from which idx (not include) to find the later ins
  122. :return: a list of inputs names for all ops behind begin_idx
  123. """
  124. ins = []
  125. for idx, op in enumerate(block.ops):
  126. if idx <= begin_idx:
  127. continue
  128. for in_name in op.input_arg_names:
  129. ins.append(in_name)
  130. return list(set(ins))
  131. def construct_program_and_find_ins_outs(section, origin_program, section_idx):
  132. """
  133. 1. Construct a new program for corresponding section
  134. 2. Find all the logical inputs and outputs of a program section
  135. :param section: list, one cuda graph section, list of ops
  136. :param origin_program: framework.Program, origin program
  137. :param section_idx: list, the section ops' idx corresponding to the cuda graph section, a list of idx
  138. :return: a new program for the cuda graph section
  139. the logical ins and outs of the cuda graph section
  140. """
  141. program = paddle.static.Program()
  142. block = program.global_block()
  143. origin_block = origin_program.global_block()
  144. ins = []
  145. outs = []
  146. op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
  147. later_ins = all_inputs_of_later_op(origin_block, section_idx[-1])
  148. for op in section:
  149. for in_name in op.input_arg_names:
  150. var = origin_block.var(in_name)
  151. new_var_desc = block.desc.var(var.name.encode("ascii"))
  152. copy_var_desc(new_var_desc, var)
  153. if outs.count(in_name) == 0 and ins.count(in_name) == 0:
  154. # This in var is generated from op outside this section
  155. # Only record once for same input
  156. ins.append(in_name)
  157. elif later_ins.count(in_name) == 0 and outs.count(in_name) > 0:
  158. # this is var is generated from op inside this section, and only will be used inside this section
  159. outs.remove(in_name)
  160. for out_name in op.output_arg_names:
  161. var = origin_block.var(out_name)
  162. new_var_desc = block.desc.var(var.name.encode("ascii"))
  163. copy_var_desc(new_var_desc, var)
  164. # for every output, we add it to the section's outs
  165. if outs.count(out_name) == 0:
  166. # Only record one out var even if it will be generated by multi ops.
  167. # For scenario like this:
  168. # A = op1(a)
  169. # A = op2(b)
  170. # B = op3(A)
  171. outs.append(out_name)
  172. new_op_desc = block.desc.append_op()
  173. new_op_desc.copy_from(op.desc)
  174. new_op_desc._set_attr(op_role_attr_name, op.attr(op_role_attr_name))
  175. program._sync_with_cpp()
  176. return program, [ins, outs]
  177. def get_cuda_graph_sections(program):
  178. """
  179. get all sections that should run under cuda graph and the corresponding idx
  180. :param program: framework.Program, the original program
  181. :return: A list of cuda graph sections and the corresponding ops' idx in the block.
  182. The program is under is test or not.
  183. """
  184. block = program.global_block()
  185. cuda_graph_sections = [] # record all ops in every cuda graph sections
  186. sections_idx = [] # idx of all ops in every cuda graph sections
  187. is_test = False # will be set to True is any op's 'is_test' attr is True
  188. # ops and it's idx between cuda graph wrapped op, may belong to a section
  189. internal_section = []
  190. internal_idx = []
  191. current_section = [] # current recording cuda graph sections
  192. current_idx = [] # current recording cuda graph ops' idx
  193. current_cuda_graph_id = -1 # current recording cuda graph id
  194. op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
  195. loss_op_role = int(core.op_proto_and_checker_maker.OpRole.Loss)
  196. backward_op_role = int(core.op_proto_and_checker_maker.OpRole.Backward)
  197. loss_grad_op_role = loss_op_role | backward_op_role
  198. for idx, op in enumerate(block.ops):
  199. if op.type == 'conditional_block' or op.type == 'while':
  200. assert (
  201. op._cuda_graph_attr is None
  202. ), "Cuda graph not support conditional block op and while op."
  203. if op.has_attr('is_test') and op.attr('is_test'):
  204. is_test = True
  205. # find cuda graph sections
  206. if op._cuda_graph_attr is not None:
  207. assert isinstance(
  208. op._cuda_graph_attr, str
  209. ), "cuda_graph_attr should be a str"
  210. cuda_graph_attrs = op._cuda_graph_attr.split(';')
  211. assert len(cuda_graph_attrs) == 3, (
  212. "cuda graph attr should have three fields: "
  213. "cuda graph mode, cuda graph memory pool id, cuda graph id"
  214. )
  215. local_cuda_graph_id = int(cuda_graph_attrs[2])
  216. if local_cuda_graph_id == current_cuda_graph_id:
  217. if len(internal_section) > 0:
  218. assert len(internal_section) == len(
  219. internal_idx
  220. ), "len of internal section should be equal with len of internal idx"
  221. for internal_op in internal_section:
  222. loss_related = (
  223. int(internal_op.attr(op_role_attr_name))
  224. == loss_op_role
  225. ) or int(
  226. (internal_op.attr(op_role_attr_name))
  227. == loss_grad_op_role
  228. )
  229. sub_block_related = (
  230. op.type == 'conditional_block' or op.type == 'while'
  231. )
  232. if loss_related or sub_block_related:
  233. # If loss_related is True
  234. # The internal section contains loss related ops,
  235. # although these ops are between two cuda graph sections with same graph id,
  236. # they belong to none of these two sections.
  237. # The loss related op should be wrapped by user explicitly.
  238. # If sub_block_related is True
  239. # The internal section contains while op or conditional block op.
  240. # These two ops are not supported by cuda graph. Won't extend the section.
  241. internal_section = []
  242. internal_idx = []
  243. # Beside clear the internal section, a new cuda graph section should be recorded
  244. assert len(current_section) == len(
  245. current_idx
  246. ), "num of section's op is not equal with the idx"
  247. if len(current_section) > 0:
  248. # store previous section
  249. cuda_graph_sections.append(current_section)
  250. sections_idx.append(current_idx)
  251. current_section = []
  252. current_idx = []
  253. break
  254. # some ops inserted by some optimizer, should be added to current section
  255. for i in range(len(internal_section)):
  256. current_section.append(internal_section[i])
  257. current_idx.append(internal_idx[i])
  258. internal_section = []
  259. internal_idx = []
  260. current_section.append(op)
  261. current_idx.append(idx)
  262. else:
  263. # current graph id is different with previous, start a new section of cuda graph
  264. # internal ops and idx belong to no section, just clear it
  265. internal_section = []
  266. internal_idx = []
  267. current_cuda_graph_id = (
  268. local_cuda_graph_id # start record a new section
  269. )
  270. assert len(current_section) == len(
  271. current_idx
  272. ), "num of section's op is not equal with num of idx"
  273. if len(current_section) > 0:
  274. # store previous section
  275. cuda_graph_sections.append(current_section)
  276. sections_idx.append(current_idx)
  277. current_section = [op]
  278. current_idx = [idx]
  279. else:
  280. # recode ops which cuda_graph_attr is None, may belong to a section
  281. internal_section.append(op)
  282. internal_idx.append(idx)
  283. # handle the last section
  284. assert len(current_section) == len(
  285. current_idx
  286. ), "num of section's op is not equal with num of idx"
  287. if len(current_section) > 0:
  288. # store previous section
  289. cuda_graph_sections.append(current_section)
  290. sections_idx.append(current_idx)
  291. return cuda_graph_sections, sections_idx, is_test
  292. def replace_cuda_graph_section(
  293. ins_and_outs,
  294. section_program,
  295. section_idx,
  296. origin_program,
  297. cuda_graph_section,
  298. order,
  299. is_test,
  300. ):
  301. """
  302. Use section_program and ins_and_outs to initialize a run_program_op,
  303. and replace the section_idx marks ops in the origin program.
  304. :param ins_and_outs: list, the logical ins and outs of the section program
  305. :param section_program: framework.Program, the partial program need to run under cuda graph
  306. :param section_idx: list, the idx need to be removed from origin program
  307. :param origin_program: framework.Program, the origin program
  308. :param cuda_graph_section: list, the ops in current sections, used to get the mode, memory pool id and is_test
  309. :param order: int, the order of current section, used to create unique cuda graph var
  310. :param is_test: bool, the program is running under is_test or not
  311. :return: no return
  312. """
  313. ins = ins_and_outs[0]
  314. outs = ins_and_outs[1]
  315. insert_idx = section_idx[0]
  316. origin_block = origin_program.global_block()
  317. for idx in reversed(section_idx):
  318. # remove all cuda graph marked ops from origin block
  319. origin_block._remove_op(idx, sync=False)
  320. mode = None
  321. memory_pool_id = None
  322. for op in cuda_graph_section:
  323. # find the cuda graph mode and memory pool id, determine is test or not
  324. if op._cuda_graph_attr is not None:
  325. attrs = op._cuda_graph_attr.split(';')
  326. mode = attrs[0]
  327. memory_pool_id = int(attrs[1])
  328. break
  329. assert (
  330. mode is not None and memory_pool_id is not None
  331. ), "mode and memory pool id should be specified in cuda graph attr"
  332. cuda_graph_var = origin_block.create_var(
  333. name="cuda_graph_" + str(order),
  334. type=core.VarDesc.VarType.RAW,
  335. persistable=True,
  336. stop_gradient=True,
  337. )
  338. # not used for the run_program_op, just needed by the op, but won't be used
  339. out_scope_var = origin_block.create_var(
  340. name="program_out_scope_" + str(order),
  341. type=core.VarDesc.VarType.STEP_SCOPES,
  342. persistable=True,
  343. stop_gradient=True,
  344. )
  345. program_id = paddle.utils._hash_with_id(section_program, ins_and_outs)
  346. # insert the run_program_op into the block
  347. origin_block._insert_op(
  348. insert_idx,
  349. type='run_program',
  350. inputs={'X': ins},
  351. outputs={
  352. 'Out': outs,
  353. 'OutScope': out_scope_var,
  354. 'CUDAGraph': cuda_graph_var,
  355. },
  356. attrs={
  357. 'global_block': section_program.global_block(),
  358. 'start_op_index': 0,
  359. 'end_op_index': len(section_program.global_block().ops),
  360. 'is_test': is_test,
  361. 'program_id': program_id,
  362. 'cuda_graph_capture_mode': mode,
  363. 'cuda_graph_pool_id': memory_pool_id,
  364. # Todo: now not support use interpretercore
  365. 'use_interpretorcore': False,
  366. 'forward_global_block': section_program.global_block(),
  367. 'backward_global_block': section_program.global_block(),
  368. },
  369. )
  370. def cuda_graph_transform(program):
  371. """
  372. replace the ops marked with cuda_graph_attr to run_program_op to use cuda graph
  373. :param program: framework.Program, the program to be transformed
  374. :return: the cuda graph section program, user should hold these programs!
  375. """
  376. if len(program.blocks) > 1:
  377. # some sub blocks may be inserted by optimizer but will not use during training, just warn here
  378. warnings.warn(
  379. "Sub block(s) has been detected in the program. "
  380. "Cuda graph not support op with sub block, and it will only handle the global block."
  381. )
  382. # step 1: get all cuda graph sections.
  383. # A cuda graph section contains all ops marked with same cuda graph id and
  384. # some ops inserted by some optimizers (amp, sharding for example) between ops with same id.
  385. cuda_graph_sections, sections_idx, is_test = get_cuda_graph_sections(
  386. program
  387. )
  388. assert len(cuda_graph_sections) == len(
  389. sections_idx
  390. ), "num of cuda graph sections is not equal with num of idx sections"
  391. # step 2: construct new program for each section and find inputs and outputs of each section.
  392. # The inputs are variables generated outside the section but will be used by this section.
  393. # The outputs are variables generated by this section and will be used after the end of the section.
  394. ins_and_outs = []
  395. section_programs = []
  396. for i in range(len(cuda_graph_sections)):
  397. # creating new program for current section
  398. section_program, ins_outs = construct_program_and_find_ins_outs(
  399. cuda_graph_sections[i], program, sections_idx[i]
  400. )
  401. ins_and_outs.append(ins_outs)
  402. section_programs.append(section_program)
  403. assert len(section_programs) == len(
  404. cuda_graph_sections
  405. ), "the num of cuda graph sections should be equal with the num of new program"
  406. # step 3: replace the ops in original program with run_program_op.
  407. # Will remove all ops in the section from origin program, and use run_program_op to replace them.
  408. for i in reversed(range(len(cuda_graph_sections))):
  409. # carry out the replacement in reversed order, to keep the previous idx intact
  410. replace_cuda_graph_section(
  411. ins_and_outs[i],
  412. section_programs[i],
  413. sections_idx[i],
  414. program,
  415. cuda_graph_sections[i],
  416. order=i,
  417. is_test=is_test,
  418. )
  419. # NOTE: user should hold these program, for now just return these program back to caller
  420. return section_programs