// CUDAGraph.h
  1. #pragma once
  2. #include <ATen/Tensor.h>
  3. #include <c10/core/Device.h>
  4. #include <c10/cuda/CUDACachingAllocator.h>
  5. #include <c10/cuda/CUDAGraphsC10Utils.h>
  6. #include <c10/cuda/CUDAStream.h>
  7. #include <c10/util/flat_hash_map.h>
  8. namespace at {
  9. struct Generator;
  10. struct CUDAGeneratorImpl;
  11. struct CUDAGeneratorState;
  12. namespace cuda {
  13. // Standalone way to get a unique mempool id usable as a pool=... argument
  14. // to CUDAGraph::capture_begin
  15. TORCH_CUDA_CPP_API MempoolId_t graph_pool_handle();
  16. struct TORCH_CUDA_CPP_API CUDAGraph {
  17. CUDAGraph(bool keep_graph=false);
  18. ~CUDAGraph();
  19. // See Note [Explicit Registration of Generators to the CUDA Graph]
  20. void register_generator_state(c10::intrusive_ptr<at::CUDAGeneratorState> state);
  21. void register_generator_state(const at::Generator& generator);
  22. void capture_begin(
  23. MempoolId_t pool = {0, 0},
  24. cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal);
  25. void capture_end();
  26. void instantiate();
  27. void replay();
  28. void reset();
  29. MempoolId_t pool();
  30. void enable_debug_mode();
  31. void debug_dump(const std::string& debug_path);
  32. cudaGraph_t raw_cuda_graph();
  33. cudaGraphExec_t raw_cuda_graph_exec();
  34. protected:
  35. cudaGraph_t graph_ = nullptr;
  36. cudaGraphExec_t graph_exec_ = nullptr;
  37. // internal states so reset() can do its best cleaning up
  38. // Set to true in capture_end if cudaStreamEndCapture succeeded
  39. // Set back to false after instantiate() unless keep_graph=True or
  40. // enable_debug_mode() was called on any CUDAGraph instance.
  41. bool has_graph_ = false;
  42. // Set to true in capture_end if cudaStreamEndCapture succeeded
  43. bool capture_ended_ = false;
  44. // Set to true in capture_end if cudaGraphInstantiate succeeded
  45. bool has_graph_exec_ = false;
  46. // the ID assigned by cuda during graph capture,
  47. // used to identify when a stream is participating in capture
  48. CaptureId_t capture_id_ = -1;
  49. // uuid used to request a particular private mempool from CUDACachingAllocator.
  50. // By default, this will be set to {id_, 0}.
  51. //
  52. // If capture_begin is called with "pool=other_graph.pool()", this graph's mempool_id_
  53. // will be set to the other graph's mempool_id_, and therefore share a mempool with the
  54. // other graph.
  55. //
  56. // If capture_begin is called with "pool=handle" where "handle" came from graph_pool_handle(),
  57. // it will share a mempool with any other captures that used "pool=handle".
  58. //
  59. // Sharing a mempool across graphs saves memory, and it's safe if you
  60. // know you'll replay those graphs in the same order you captured them.
  61. MempoolId_t mempool_id_;
  62. // Stream on which capture began
  63. at::cuda::CUDAStream capture_stream_;
  64. // multiple generator states and their wholegraph_increments in this graph
  65. // that are managed by the CUDA Graph
  66. ska::flat_hash_map<c10::intrusive_ptr<at::CUDAGeneratorState>, uint64_t>
  67. captured_generator_states_;
  68. // Device where capture occurred. Right now, for simplicity, we require all ops
  69. // in a capture to run on the same device, but this is a limitation of CUDAGraph,
  70. // not CUDA itself. We can straightforwardly modify CUDAGraph to support multi-device
  71. // captures if needed.
  72. // init capture_dev_ as UNDEFINED_DEVICE to check that it stores the real device id in the destructor
  73. static constexpr c10::DeviceIndex UNDEFINED_DEVICE = -1;
  74. c10::DeviceIndex capture_dev_{UNDEFINED_DEVICE};
  75. bool keep_graph_;
  76. };
  77. } // namespace cuda
  78. } // namespace at