const_fold.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. # mypy: allow-untyped-defs
  2. import re
  3. from typing import Callable, Optional, Union
  4. import torch.fx
  5. from torch.fx.node import map_arg
  6. from torch.fx.passes.split_module import split_module
  7. __all__ = [
  8. "FoldedGraphModule",
  9. "get_unique_attr_name_in_module",
  10. "split_const_subgraphs",
  11. ]
  12. class FoldedGraphModule(torch.fx.GraphModule):
  13. """
  14. FoldedGraphModule is a GraphModule which also contains another
  15. `const_subgraph_module` representing a subgraph which has all const attr
  16. inputs and which can be run once before running the main standard
  17. `graph`. The `const_output_names` are the ordered list names of attrs which
  18. represent what each respective output from the const_subgraph should be set
  19. on which attrs.
  20. """
  21. def __init__(
  22. self,
  23. root: torch.nn.Module,
  24. graph: torch.fx.Graph,
  25. const_subgraph: Optional[torch.fx.Graph] = None,
  26. fx_const_folded_attrs_name: Optional[str] = None,
  27. device_for_folded_attrs: str = "cuda",
  28. ):
  29. super().__init__(root, graph)
  30. self.const_subgraph_module = (
  31. None
  32. if const_subgraph is None
  33. else torch.fx.GraphModule(root, const_subgraph)
  34. )
  35. self.has_folding_been_run = False
  36. self.fx_const_folded_attrs_name = fx_const_folded_attrs_name
  37. self.device_for_folded_attrs = device_for_folded_attrs
  38. def __call__(self, *args, **kwargs):
  39. if not self.has_folding_been_run:
  40. self.run_folding()
  41. return super().__call__(*args)
  42. def run_folding(self):
  43. # If there's no const subgraph module or attr output names to use, return
  44. # early as there is no const folding to perform.
  45. if (
  46. self.const_subgraph_module is None
  47. or self.fx_const_folded_attrs_name is None
  48. ):
  49. return
  50. assert not self.has_folding_been_run
  51. self.has_folding_been_run = True
  52. # Actually run const folding subgraph. Note that single attr const fold
  53. # subgraphs output a single Tensor while multiple outputs are returned as
  54. # Tuple[Tensor,].
  55. folded_attrs = self.const_subgraph_module()
  56. def _create_param(i):
  57. return torch.nn.Parameter(
  58. i.detach().clone()
  59. if not isinstance(i, int)
  60. else torch.Tensor([i]).to(device=self.device_for_folded_attrs),
  61. requires_grad=i.requires_grad if isinstance(i, torch.Tensor) else False,
  62. )
  63. params = (
  64. torch.nn.ParameterList([_create_param(i) for i in folded_attrs])
  65. if isinstance(folded_attrs, tuple)
  66. else _create_param(folded_attrs)
  67. )
  68. setattr(self, self.fx_const_folded_attrs_name, params)
  69. def _inline_module(gm: torch.fx.GraphModule, inline_mod_name: str):
  70. """
  71. Given `gm` and some graph module which is called with target name `inline_mod_name`,
  72. this helper will inline all of the nodes from that called graph module into `gm`.
  73. """
  74. # Fetch the inner graph module that we want to inline inside `gm`.
  75. inline_mod = dict(gm.named_modules())[inline_mod_name]
  76. assert isinstance(inline_mod, torch.fx.GraphModule)
  77. call_mod_node_to_replace = None
  78. for node in gm.graph.nodes:
  79. if node.op == "call_module" and node.target == inline_mod_name:
  80. call_mod_node_to_replace = node
  81. break
  82. assert call_mod_node_to_replace is not None
  83. # Now actually do the swap. Note that we have to keep track of new nodes that are
  84. # copied into `gm` -- we do this via replacement_mapping.
  85. call_mod_args = call_mod_node_to_replace.args
  86. call_mod_kwargs = call_mod_node_to_replace.kwargs
  87. replacement_mapping: dict[torch.fx.Node, torch.fx.Node] = {}
  88. ph_count = 0
  89. def replacement_fn(node):
  90. new_node = replacement_mapping[node]
  91. new_node.meta = node.meta.copy()
  92. return new_node
  93. for inline_node in inline_mod.graph.nodes:
  94. if inline_node.op == "placeholder":
  95. replacement_mapping[inline_node] = (
  96. call_mod_kwargs[inline_node.name]
  97. if inline_node.name in call_mod_kwargs
  98. else call_mod_args[ph_count]
  99. )
  100. ph_count += 1
  101. continue
  102. if inline_node.op == "output":
  103. outputs = inline_node.args[0]
  104. output_replacements = map_arg(outputs, replacement_fn)
  105. call_mod_node_to_replace.replace_all_uses_with(output_replacements)
  106. continue
  107. with gm.graph.inserting_before(call_mod_node_to_replace):
  108. new_node = gm.graph.node_copy(inline_node, replacement_fn)
  109. replacement_mapping[inline_node] = new_node
  110. gm.graph.eliminate_dead_code()
  111. def get_unique_attr_name_in_module(mod_traced: torch.fx.GraphModule, name: str) -> str:
  112. """
  113. Make sure the name is unique (in a module) and can represents an attr.
  114. """
  115. # Delete all characters that are illegal in a Python identifier.
  116. name = re.sub("[^0-9a-zA-Z_]+", "_", name)
  117. if name[0].isdigit():
  118. name = f"_{name}"
  119. # Now make sure it is in fact unique to the module by incrementing suffix value.
  120. while hasattr(mod_traced, name):
  121. match = re.match(r"(.*)_(\d+)$", name)
  122. if match is None:
  123. name = name + "_1"
  124. else:
  125. base, num = match.group(1, 2)
  126. name = f"{base}_{int(num) + 1}"
  127. return name
  128. def split_const_subgraphs(
  129. module: Union[torch.nn.Module, torch.fx.GraphModule],
  130. skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
  131. device_for_folded_attrs: str = "cpu",
  132. ) -> FoldedGraphModule:
  133. """
  134. Looks through `module` for any nodes that have all constant attribute inputs
  135. and separates them out into their own constant subgraph, and returns a
  136. FoldedGraphModule which runs that constant subgraph on the first run to set
  137. attributes on the module prior to running the non-constant portion of the
  138. graph.
  139. """
  140. import sympy
  141. if not isinstance(module, torch.fx.GraphModule):
  142. mod_traced = torch.fx.symbolic_trace(module)
  143. else:
  144. mod_traced = module
  145. # Build up a list of const_nodes, defined as nodes that are themselves
  146. # get_attrs, or have all get_attr or other constant node inputs.
  147. const_nodes: set[torch.fx.Node] = set()
  148. found_const_folding = False
  149. for node in mod_traced.graph.nodes:
  150. # Skip over placeholders/outputs because they can't be const folded and
  151. # we don't want to add tags to them.
  152. if node.op in {"placeholder", "output"}:
  153. continue
  154. # If the node itself is constant, or all of its inputs are constant,
  155. # then tag it as constant.
  156. if node.op != "get_attr" and not set(node.all_input_nodes).issubset(
  157. const_nodes
  158. ):
  159. continue
  160. # If provided skip folding function says to skip, then skip.
  161. if skip_folding_node_fn and skip_folding_node_fn(node):
  162. continue
  163. # Skip folding side-effectful functions
  164. if node.is_impure():
  165. continue
  166. # Skip folding nodes that have symbolic fill_value
  167. if isinstance(node.kwargs.get("fill_value", None), sympy.Expr):
  168. continue
  169. # Must be a constant foldable node at this point.
  170. const_nodes.add(node)
  171. if node.op != "get_attr":
  172. found_const_folding = True
  173. # If we did not find any const folding then return early without a const fold subgraph.
  174. if not found_const_folding:
  175. return FoldedGraphModule(mod_traced, mod_traced.graph)
  176. # Partition the module into two: submod_0 for constant folding subgraph, and
  177. # submod_1 for the rest.
  178. def mod_partition(node: torch.fx.Node):
  179. return 0 if node in const_nodes else 1
  180. split = split_module(mod_traced, module, mod_partition)
  181. const_mod_name, non_const_mod_name = "submod_0", "submod_1"
  182. # Safely get submod_1 in case there are no non-const nodes
  183. const_gm, non_const_gm = split.submod_0, getattr(split, non_const_mod_name, None)
  184. # The module that a call_module node refers to gets copied to submodules during split.
  185. # The path to the module also gets inlined, i.e. mod.a.b -> mod_a_b. Here we need to
  186. # attach inlined modules to `split` as it's the owning module now.
  187. for node in non_const_gm.graph.nodes if non_const_gm else []:
  188. if node.op == "call_module":
  189. setattr(split, node.target, getattr(non_const_gm, node.target))
  190. for node in const_gm.graph.nodes:
  191. if node.op == "call_module":
  192. setattr(split, node.target, getattr(const_gm, node.target))
  193. # split_module currently does not use get_attrs for attrs. Instead it passes
  194. # them in as args from the parent module, which used get_attrs. Here we set
  195. # them as get_attrs inside const_gm, allowing for running folding without
  196. # somehow a priori knowing the attrs that should be passed as args. We can
  197. # unconditionally do this for all placeholders because we know all
  198. # placeholders to const_gm must be constants accessible via get_attr.
  199. call_const_gm_args = None
  200. for node in split.graph.nodes:
  201. if node.op == "call_module":
  202. if node.target == const_mod_name:
  203. call_const_gm_args = node.args
  204. break
  205. assert call_const_gm_args is not None
  206. # Here we do the actual replacement of placeholders to get_attrs. Note that here we
  207. # set the const_gm.graph into a new root_const_gm with split as the root module,
  208. # because we are fetching attributes directly from the root module, instead of
  209. # fetching them from const_gm. Example: The const_gm must have some format like:
  210. # graph():
  211. # %inp : [num_users=1] = placeholder[target=const_inp]
  212. # %add : [num_users=1] = call_function[target=operator.add](args = (%inp, %inp), kwargs = {})
  213. # return add
  214. # We replace that with the following, which does not have any placeholders:
  215. # graph():
  216. # %inp_1 : [num_users=1] = get_attr[target=const_inp]
  217. # %add : [num_users=1] = call_function[target=operator.add](args = (%inp_1, %inp_1), kwargs = {})
  218. # return add
  219. root_const_gm = torch.fx.GraphModule(split, const_gm.graph)
  220. # The order of placeholders in the const_gm graph should match the order of
  221. # args in the outer module, so we can simply use an index for the
  222. # placeholder mapping
  223. ph_idx = 0
  224. for node in root_const_gm.graph.nodes:
  225. if node.op == "output":
  226. multiple_outputs = isinstance(node.args[0], tuple)
  227. continue
  228. if node.op != "placeholder":
  229. continue
  230. assert ph_idx < len(call_const_gm_args)
  231. in_node = call_const_gm_args[ph_idx]
  232. ph_idx += 1
  233. assert in_node.op == "get_attr"
  234. with root_const_gm.graph.inserting_before(node):
  235. new_node = root_const_gm.graph.get_attr(in_node.target)
  236. new_node.meta = node.meta.copy()
  237. node.replace_all_uses_with(new_node)
  238. root_const_gm.graph.erase_node(node)
  239. assert "multiple_outputs" in locals()
  240. # Now find the call to const_gm inside split, and replace it with a getattr to the
  241. # folded tensor(s) that result from constant folding. Note that we don't need to
  242. # worry about whether this is one or more tensors because the original graph
  243. # correctly uses getitem to extract individual tensors if there are multiple folded.
  244. fx_const_folded_attrs_name = get_unique_attr_name_in_module(
  245. mod_traced, "_FX_CONST_FOLDED_ATTRS"
  246. )
  247. setattr(
  248. split,
  249. fx_const_folded_attrs_name,
  250. torch.nn.ParameterList() if multiple_outputs else torch.nn.Parameter(), # type: ignore[possibly-undefined]
  251. )
  252. for node in split.graph.nodes:
  253. if node.op == "call_module" and node.target == const_mod_name:
  254. with node.graph.inserting_before(node):
  255. folded_attrs = node.graph.get_attr(fx_const_folded_attrs_name)
  256. folded_attrs.meta = node.meta.copy()
  257. node.replace_all_uses_with(folded_attrs)
  258. break
  259. # Finally, inline the non-constant submod (if it exists) into the split submod.
  260. # This is so that the original caller who may have passed in a graph module will
  261. # get back out a graph module whose graph is traced to the same granularity.
  262. if hasattr(split, non_const_mod_name):
  263. _inline_module(split, non_const_mod_name)
  264. split.graph.eliminate_dead_code()
  265. return FoldedGraphModule(
  266. split,
  267. split.graph,
  268. root_const_gm.graph,
  269. fx_const_folded_attrs_name,
  270. device_for_folded_attrs,
  271. )