Config.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. /*
  3. * Copyright (c) Meta Platforms, Inc. and affiliates.
  4. * All rights reserved.
  5. *
  6. * This source code is licensed under the BSD-style license found in the
  7. * LICENSE file in the root directory of this source tree.
  8. */
  9. #pragma once
  10. #include "AbstractConfig.h"
  11. #include "ActivityType.h"
  12. #include <assert.h>
  13. #include <chrono>
  14. #include <functional>
  15. #include <set>
  16. #include <string>
  17. #include <vector>
  18. namespace libkineto {
  19. class Config : public AbstractConfig {
  20. public:
  21. Config();
  22. Config& operator=(const Config&) = delete;
  23. Config(Config&&) = delete;
  24. Config& operator=(Config&&) = delete;
  25. ~Config() override = default;
  26. // Return a full copy including feature config object
  27. std::unique_ptr<Config> clone() const {
  28. auto cfg = std::unique_ptr<Config>(new Config(*this));
  29. cloneFeaturesInto(*cfg);
  30. return cfg;
  31. }
  32. bool handleOption(const std::string& name, std::string& val) override;
  33. void setClientDefaults() override;
  34. // Log events to this file
  35. const std::string& eventLogFile() const {
  36. return eventLogFile_;
  37. }
  38. bool activityProfilerEnabled() const {
  39. return activityProfilerEnabled_ ||
  40. activitiesOnDemandTimestamp_.time_since_epoch().count() > 0;
  41. }
  42. // Log activitiy trace to this file
  43. const std::string& activitiesLogFile() const {
  44. return activitiesLogFile_;
  45. }
  46. // Log activitiy trace to this url
  47. const std::string& activitiesLogUrl() const {
  48. return activitiesLogUrl_;
  49. }
  50. void setActivitiesLogUrl(const std::string& url) {
  51. activitiesLogUrl_ = url;
  52. }
  53. bool activitiesLogToMemory() const {
  54. return activitiesLogToMemory_;
  55. }
  56. bool eventProfilerEnabled() const {
  57. return !eventNames_.empty() || !metricNames_.empty();
  58. }
  59. // Is profiling enabled for the given device?
  60. bool eventProfilerEnabledForDevice(uint32_t dev) const {
  61. return 0 != (eventProfilerDeviceMask_ & (1 << dev));
  62. }
  63. // Take a sample (read hardware counters) at this frequency.
  64. // This controls how often counters are read - if all counters cannot
  65. // be collected simultaneously then multiple samples are needed to
  66. // collect all requested counters - see multiplex period.
  67. std::chrono::milliseconds samplePeriod() const {
  68. return samplePeriod_;
  69. }
  70. void setSamplePeriod(std::chrono::milliseconds period) {
  71. samplePeriod_ = period;
  72. }
  73. // When all requested counters cannot be collected simultaneously,
  74. // counters will be multiplexed at this frequency.
  75. // Multiplexing can have a large performance impact if done frequently.
  76. // To avoid a perf impact, keep this at 1s or above.
  77. std::chrono::milliseconds multiplexPeriod() const {
  78. return multiplexPeriod_;
  79. }
  80. void setMultiplexPeriod(std::chrono::milliseconds period) {
  81. multiplexPeriod_ = period;
  82. }
  83. // Report counters at this frequency. Note that several samples can
  84. // be reported each time, see samplesPerReport.
  85. std::chrono::milliseconds reportPeriod() const {
  86. return reportPeriod_;
  87. }
  88. void setReportPeriod(std::chrono::milliseconds msecs);
  89. // Number of samples dispatched each report period.
  90. // Must be in the range [1, report period / sample period].
  91. // In other words, aggregation is supported but not interpolation.
  92. int samplesPerReport() const {
  93. return samplesPerReport_;
  94. }
  95. void setSamplesPerReport(int count) {
  96. samplesPerReport_ = count;
  97. }
  98. // The names of events to collect
  99. const std::set<std::string>& eventNames() const {
  100. return eventNames_;
  101. }
  102. // Add additional events to be profiled
  103. void addEvents(const std::set<std::string>& names) {
  104. eventNames_.insert(names.begin(), names.end());
  105. }
  106. // The names of metrics to collect
  107. const std::set<std::string>& metricNames() const {
  108. return metricNames_;
  109. }
  110. // Add additional metrics to be profiled
  111. void addMetrics(const std::set<std::string>& names) {
  112. metricNames_.insert(names.begin(), names.end());
  113. }
  114. const std::vector<int>& percentiles() const {
  115. return eventReportPercentiles_;
  116. }
  117. // Profile for this long, then revert to base config
  118. std::chrono::seconds eventProfilerOnDemandDuration() const {
  119. return eventProfilerOnDemandDuration_;
  120. }
  121. void setEventProfilerOnDemandDuration(std::chrono::seconds duration) {
  122. eventProfilerOnDemandDuration_ = duration;
  123. }
  124. // Too many event profilers on a single system can overload the driver.
  125. // At some point, latencies shoot through the roof and collection of samples
  126. // becomes impossible. To avoid this situation we have a limit of profilers
  127. // per GPU.
  128. // NOTE: Communication with a daemon is needed for this feature.
  129. // Library must be built with an active DaemonConfigLoader.
  130. int maxEventProfilersPerGpu() const {
  131. return eventProfilerMaxInstancesPerGpu_;
  132. }
  133. // On Cuda11 we've seen occasional hangs when reprogramming counters
  134. // Monitor profiling threads and report when a thread is not responding
  135. // for a given number of seconds.
  136. // A period of 0 means disable.
  137. std::chrono::seconds eventProfilerHeartbeatMonitorPeriod() const {
  138. return eventProfilerHeartbeatMonitorPeriod_;
  139. }
  140. // The types of activities selected in the configuration file
  141. const std::set<ActivityType>& selectedActivityTypes() const {
  142. return selectedActivityTypes_;
  143. }
  144. // Set the types of activities to be traced
  145. bool perThreadBufferEnabled() const {
  146. return perThreadBufferEnabled_;
  147. }
  148. void setSelectedActivityTypes(const std::set<ActivityType>& types) {
  149. selectedActivityTypes_ = types;
  150. }
  151. bool isReportInputShapesEnabled() const {
  152. return enableReportInputShapes_;
  153. }
  154. bool isProfileMemoryEnabled() const {
  155. return enableProfileMemory_;
  156. }
  157. bool isWithStackEnabled() const {
  158. return enableWithStack_;
  159. }
  160. bool isWithFlopsEnabled() const {
  161. return enableWithFlops_;
  162. }
  163. bool isWithModulesEnabled() const {
  164. return enableWithModules_;
  165. }
  166. // Trace for this long
  167. std::chrono::milliseconds activitiesDuration() const {
  168. return activitiesDuration_;
  169. }
  170. // Trace for this many iterations, determined by external API
  171. int activitiesRunIterations() const {
  172. return activitiesRunIterations_;
  173. }
  174. int activitiesMaxGpuBufferSize() const {
  175. return activitiesMaxGpuBufferSize_;
  176. }
  177. std::chrono::seconds activitiesWarmupDuration() const {
  178. return activitiesWarmupDuration_;
  179. }
  180. int activitiesWarmupIterations() const {
  181. return activitiesWarmupIterations_;
  182. }
  183. // Show CUDA Synchronization Stream Wait Events
  184. bool activitiesCudaSyncWaitEvents() const {
  185. return activitiesCudaSyncWaitEvents_;
  186. }
  187. void setActivitiesCudaSyncWaitEvents(bool enable) {
  188. activitiesCudaSyncWaitEvents_ = enable;
  189. }
  190. // Timestamp at which the profiling to start, requested by the user.
  191. const std::chrono::time_point<std::chrono::system_clock> requestTimestamp()
  192. const {
  193. if (profileStartTime_.time_since_epoch().count()) {
  194. return profileStartTime_;
  195. }
  196. // If no one requested timestamp, return 0.
  197. if (requestTimestamp_.time_since_epoch().count() == 0) {
  198. return requestTimestamp_;
  199. }
  200. // TODO(T94634890): Deprecate requestTimestamp
  201. return requestTimestamp_ + maxRequestAge() + activitiesWarmupDuration();
  202. }
  203. bool hasProfileStartTime() const {
  204. return requestTimestamp_.time_since_epoch().count() > 0 ||
  205. profileStartTime_.time_since_epoch().count() > 0;
  206. }
  207. int profileStartIteration() const {
  208. return profileStartIteration_;
  209. }
  210. bool hasProfileStartIteration() const {
  211. return profileStartIteration_ >= 0 && activitiesRunIterations_ > 0;
  212. }
  213. void setProfileStartIteration(int iter) {
  214. profileStartIteration_ = iter;
  215. }
  216. int profileStartIterationRoundUp() const {
  217. return profileStartIterationRoundUp_;
  218. }
  219. // calculate the start iteration accounting for warmup
  220. int startIterationIncludingWarmup() const {
  221. if (!hasProfileStartIteration()) {
  222. return -1;
  223. }
  224. return profileStartIteration_ - activitiesWarmupIterations_;
  225. }
  226. const std::chrono::seconds maxRequestAge() const;
  227. // All VLOG* macros will log if the verbose log level is >=
  228. // the verbosity specified for the verbose log message.
  229. // Default value is -1, so messages with log level 0 will log by default.
  230. int verboseLogLevel() const {
  231. return verboseLogLevel_;
  232. }
  233. // Modules for which verbose logging is enabled.
  234. // If empty, logging is enabled for all modules.
  235. const std::vector<std::string>& verboseLogModules() const {
  236. return verboseLogModules_;
  237. }
  238. bool sigUsr2Enabled() const {
  239. return enableSigUsr2_;
  240. }
  241. bool ipcFabricEnabled() const {
  242. return enableIpcFabric_;
  243. }
  244. std::chrono::seconds onDemandConfigUpdateIntervalSecs() const {
  245. return onDemandConfigUpdateIntervalSecs_;
  246. }
  247. static std::chrono::milliseconds alignUp(
  248. std::chrono::milliseconds duration,
  249. std::chrono::milliseconds alignment) {
  250. duration += alignment;
  251. return duration - (duration % alignment);
  252. }
  253. std::chrono::time_point<std::chrono::system_clock>
  254. eventProfilerOnDemandStartTime() const {
  255. return eventProfilerOnDemandTimestamp_;
  256. }
  257. std::chrono::time_point<std::chrono::system_clock>
  258. eventProfilerOnDemandEndTime() const {
  259. return eventProfilerOnDemandTimestamp_ + eventProfilerOnDemandDuration_;
  260. }
  261. std::chrono::time_point<std::chrono::system_clock>
  262. activityProfilerRequestReceivedTime() const {
  263. return activitiesOnDemandTimestamp_;
  264. }
  265. static constexpr std::chrono::milliseconds kControllerIntervalMsecs{1000};
  266. // Users may request and set trace id and group trace id.
  267. const std::string& requestTraceID() const {
  268. return requestTraceID_;
  269. }
  270. void setRequestTraceID(const std::string& tid) {
  271. requestTraceID_ = tid;
  272. }
  273. const std::string& requestGroupTraceID() const {
  274. return requestGroupTraceID_;
  275. }
  276. void setRequestGroupTraceID(const std::string& gtid) {
  277. requestGroupTraceID_ = gtid;
  278. }
  279. size_t cuptiDeviceBufferSize() const {
  280. return cuptiDeviceBufferSize_;
  281. }
  282. size_t cuptiDeviceBufferPoolLimit() const {
  283. return cuptiDeviceBufferPoolLimit_;
  284. }
  285. bool memoryProfilerEnabled() const {
  286. return memoryProfilerEnabled_;
  287. }
  288. int profileMemoryDuration() const {
  289. return profileMemoryDuration_;
  290. }
  291. void updateActivityProfilerRequestReceivedTime();
  292. void printActivityProfilerConfig(std::ostream& s) const override;
  293. void setActivityDependentConfig() override;
  294. void validate(
  295. const std::chrono::time_point<std::chrono::system_clock>&
  296. fallbackProfileStartTime) override;
  297. static void addConfigFactory(
  298. std::string name,
  299. std::function<AbstractConfig*(Config&)> factory);
  300. void print(std::ostream& s) const;
  301. // Config relies on some state with global static lifetime. If other
  302. // threads are using the config, it's possible that the global state
  303. // is destroyed before the threads stop. By hanging onto this handle,
  304. // correct destruction order can be ensured.
  305. static std::shared_ptr<void> getStaticObjectsLifetimeHandle();
  306. bool getTSCTimestampFlag() const {
  307. return useTSCTimestamp_;
  308. }
  309. void setTSCTimestampFlag(bool flag) {
  310. useTSCTimestamp_ = flag;
  311. }
  312. const std::string& getCustomConfig() const {
  313. return customConfig_;
  314. }
  315. uint32_t maxEvents() const {
  316. return maxEvents_;
  317. }
  318. private:
  319. explicit Config(const Config& other) = default;
  320. AbstractConfig* cloneDerived(AbstractConfig& parent) const override {
  321. // Clone from AbstractConfig not supported
  322. assert(false);
  323. return nullptr;
  324. }
  325. uint8_t createDeviceMask(const std::string& val);
  326. // Adds valid activity types from the user defined string list in the
  327. // configuration file
  328. void setActivityTypes(const std::vector<std::string>& selected_activities);
  329. // Sets the default activity types to be traced
  330. void selectDefaultActivityTypes() {
  331. // If the user has not specified an activity list, add all types
  332. for (ActivityType t : defaultActivityTypes()) {
  333. selectedActivityTypes_.insert(t);
  334. }
  335. }
  336. int verboseLogLevel_;
  337. std::vector<std::string> verboseLogModules_;
  338. // Event profiler
  339. // These settings are also supported in on-demand mode
  340. std::chrono::milliseconds samplePeriod_;
  341. std::chrono::milliseconds reportPeriod_;
  342. int samplesPerReport_;
  343. std::set<std::string> eventNames_;
  344. std::set<std::string> metricNames_;
  345. // On-demand duration
  346. std::chrono::seconds eventProfilerOnDemandDuration_;
  347. // Last on-demand request
  348. std::chrono::time_point<std::chrono::system_clock>
  349. eventProfilerOnDemandTimestamp_;
  350. int eventProfilerMaxInstancesPerGpu_;
  351. // Monitor whether event profiler threads are stuck
  352. // at this frequency
  353. std::chrono::seconds eventProfilerHeartbeatMonitorPeriod_;
  354. // These settings can not be changed on-demand
  355. std::string eventLogFile_;
  356. std::vector<int> eventReportPercentiles_ = {5, 25, 50, 75, 95};
  357. uint8_t eventProfilerDeviceMask_ = ~0;
  358. std::chrono::milliseconds multiplexPeriod_;
  359. // Activity profiler
  360. bool activityProfilerEnabled_;
  361. // Enable per-thread buffer
  362. bool perThreadBufferEnabled_;
  363. std::set<ActivityType> selectedActivityTypes_;
  364. // The activity profiler settings are all on-demand
  365. std::string activitiesLogFile_;
  366. std::string activitiesLogUrl_;
  367. // Log activities to memory buffer
  368. bool activitiesLogToMemory_{false};
  369. int activitiesMaxGpuBufferSize_;
  370. std::chrono::seconds activitiesWarmupDuration_;
  371. int activitiesWarmupIterations_;
  372. bool activitiesCudaSyncWaitEvents_;
  373. // Enable Profiler Config Options
  374. // Temporarily disable shape collection until we re-roll out the feature for
  375. // on-demand cases
  376. bool enableReportInputShapes_{false};
  377. bool enableProfileMemory_{false};
  378. bool enableWithStack_{false};
  379. bool enableWithFlops_{false};
  380. bool enableWithModules_{false};
  381. // Profile for specified iterations and duration
  382. std::chrono::milliseconds activitiesDuration_;
  383. int activitiesRunIterations_;
  384. // Below are not used
  385. // Use this net name for iteration count
  386. std::string activitiesExternalAPIIterationsTarget_;
  387. // Only profile nets that includes this in the name
  388. std::vector<std::string> activitiesExternalAPIFilter_;
  389. // Only profile nets with at least this many operators
  390. int activitiesExternalAPINetSizeThreshold_;
  391. // Only profile nets with at least this many GPU operators
  392. int activitiesExternalAPIGpuOpCountThreshold_;
  393. // Last activity profiler request
  394. std::chrono::time_point<std::chrono::system_clock>
  395. activitiesOnDemandTimestamp_;
  396. // ActivityProfilers are triggered by either:
  397. // Synchronized start timestamps
  398. std::chrono::time_point<std::chrono::system_clock> profileStartTime_;
  399. // Or start iterations.
  400. int profileStartIteration_;
  401. int profileStartIterationRoundUp_;
  402. // DEPRECATED
  403. std::chrono::time_point<std::chrono::system_clock> requestTimestamp_;
  404. // Enable profiling via SIGUSR2
  405. bool enableSigUsr2_;
  406. // Enable IPC Fabric instead of thrift communication
  407. bool enableIpcFabric_;
  408. std::chrono::seconds onDemandConfigUpdateIntervalSecs_;
  409. // Logger Metadata
  410. std::string requestTraceID_;
  411. std::string requestGroupTraceID_;
  412. // CUPTI Device Buffer
  413. size_t cuptiDeviceBufferSize_;
  414. size_t cuptiDeviceBufferPoolLimit_;
  415. // CUPTI Timestamp Format
  416. bool useTSCTimestamp_{true};
  417. // Memory Profiler
  418. bool memoryProfilerEnabled_{false};
  419. int profileMemoryDuration_{1000};
  420. // Used to flexibly configure some custom options, especially for custom
  421. // backends. How to parse this string is handled by the custom backend.
  422. std::string customConfig_;
  423. // Roctracer settings
  424. uint32_t maxEvents_{5000000};
  425. };
  // Environment variable name "KINETO_USE_DAEMON".
  constexpr char kUseDaemonEnvVar[]{"KINETO_USE_DAEMON"};

  // Declared here, defined elsewhere.
  // NOTE(review): presumably reports whether the kUseDaemonEnvVar environment
  // variable is set - confirm against the definition.
  bool isDaemonEnvVarSet();
  428. } // namespace libkineto
  429. #else
  430. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  431. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)