Config.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under the BSD-style license found in the
  6. * LICENSE file in the root directory of this source tree.
  7. */
  8. #pragma once
  9. #include "AbstractConfig.h"
  10. #include "ActivityType.h"
  11. #include <assert.h>
  12. #include <chrono>
  13. #include <functional>
  14. #include <set>
  15. #include <string>
  16. #include <vector>
  17. namespace libkineto {
  18. class Config : public AbstractConfig {
  19. public:
  20. Config();
  21. Config& operator=(const Config&) = delete;
  22. Config(Config&&) = delete;
  23. Config& operator=(Config&&) = delete;
  24. // Return a full copy including feature config object
  25. std::unique_ptr<Config> clone() const {
  26. auto cfg = std::unique_ptr<Config>(new Config(*this));
  27. cloneFeaturesInto(*cfg);
  28. return cfg;
  29. }
  30. bool handleOption(const std::string& name, std::string& val) override;
  31. void setClientDefaults() override;
  32. // Log events to this file
  33. const std::string& eventLogFile() const {
  34. return eventLogFile_;
  35. }
  36. bool activityProfilerEnabled() const {
  37. return activityProfilerEnabled_ ||
  38. activitiesOnDemandTimestamp_.time_since_epoch().count() > 0;
  39. }
  40. // Log activitiy trace to this file
  41. const std::string& activitiesLogFile() const {
  42. return activitiesLogFile_;
  43. }
  44. // Log activitiy trace to this url
  45. const std::string& activitiesLogUrl() const {
  46. return activitiesLogUrl_;
  47. }
  48. void setActivitiesLogUrl(const std::string& url) {
  49. activitiesLogUrl_ = url;
  50. }
  51. bool activitiesLogToMemory() const {
  52. return activitiesLogToMemory_;
  53. }
  54. bool eventProfilerEnabled() const {
  55. return !eventNames_.empty() || !metricNames_.empty();
  56. }
  57. // Is profiling enabled for the given device?
  58. bool eventProfilerEnabledForDevice(uint32_t dev) const {
  59. return 0 != (eventProfilerDeviceMask_ & (1 << dev));
  60. }
  61. // Take a sample (read hardware counters) at this frequency.
  62. // This controls how often counters are read - if all counters cannot
  63. // be collected simultaneously then multiple samples are needed to
  64. // collect all requested counters - see multiplex period.
  65. std::chrono::milliseconds samplePeriod() const {
  66. return samplePeriod_;
  67. }
  68. void setSamplePeriod(std::chrono::milliseconds period) {
  69. samplePeriod_ = period;
  70. }
  71. // When all requested counters cannot be collected simultaneously,
  72. // counters will be multiplexed at this frequency.
  73. // Multiplexing can have a large performance impact if done frequently.
  74. // To avoid a perf impact, keep this at 1s or above.
  75. std::chrono::milliseconds multiplexPeriod() const {
  76. return multiplexPeriod_;
  77. }
  78. void setMultiplexPeriod(std::chrono::milliseconds period) {
  79. multiplexPeriod_ = period;
  80. }
  81. // Report counters at this frequency. Note that several samples can
  82. // be reported each time, see samplesPerReport.
  83. std::chrono::milliseconds reportPeriod() const {
  84. return reportPeriod_;
  85. }
  86. void setReportPeriod(std::chrono::milliseconds msecs);
  87. // Number of samples dispatched each report period.
  88. // Must be in the range [1, report period / sample period].
  89. // In other words, aggregation is supported but not interpolation.
  90. int samplesPerReport() const {
  91. return samplesPerReport_;
  92. }
  93. void setSamplesPerReport(int count) {
  94. samplesPerReport_ = count;
  95. }
  96. // The names of events to collect
  97. const std::set<std::string>& eventNames() const {
  98. return eventNames_;
  99. }
  100. // Add additional events to be profiled
  101. void addEvents(const std::set<std::string>& names) {
  102. eventNames_.insert(names.begin(), names.end());
  103. }
  104. // The names of metrics to collect
  105. const std::set<std::string>& metricNames() const {
  106. return metricNames_;
  107. }
  108. // Add additional metrics to be profiled
  109. void addMetrics(const std::set<std::string>& names) {
  110. metricNames_.insert(names.begin(), names.end());
  111. }
  112. const std::vector<int>& percentiles() const {
  113. return eventReportPercentiles_;
  114. }
  115. // Profile for this long, then revert to base config
  116. std::chrono::seconds eventProfilerOnDemandDuration() const {
  117. return eventProfilerOnDemandDuration_;
  118. }
  119. void setEventProfilerOnDemandDuration(std::chrono::seconds duration) {
  120. eventProfilerOnDemandDuration_ = duration;
  121. }
  122. // Too many event profilers on a single system can overload the driver.
  123. // At some point, latencies shoot through the roof and collection of samples
  124. // becomes impossible. To avoid this situation we have a limit of profilers
  125. // per GPU.
  126. // NOTE: Communication with a daemon is needed for this feature.
  127. // Library must be built with an active DaemonConfigLoader.
  128. int maxEventProfilersPerGpu() const {
  129. return eventProfilerMaxInstancesPerGpu_;
  130. }
  131. // On Cuda11 we've seen occasional hangs when reprogramming counters
  132. // Monitor profiling threads and report when a thread is not responding
  133. // for a given number of seconds.
  134. // A period of 0 means disable.
  135. std::chrono::seconds eventProfilerHeartbeatMonitorPeriod() const {
  136. return eventProfilerHeartbeatMonitorPeriod_;
  137. }
  138. // The types of activities selected in the configuration file
  139. const std::set<ActivityType>& selectedActivityTypes() const {
  140. return selectedActivityTypes_;
  141. }
  142. // Set the types of activities to be traced
  143. bool perThreadBufferEnabled() const {
  144. return perThreadBufferEnabled_;
  145. }
  146. void setSelectedActivityTypes(const std::set<ActivityType>& types) {
  147. selectedActivityTypes_ = types;
  148. }
  149. bool isReportInputShapesEnabled() const {
  150. return enableReportInputShapes_;
  151. }
  152. bool isProfileMemoryEnabled() const {
  153. return enableProfileMemory_;
  154. }
  155. bool isWithStackEnabled() const {
  156. return enableWithStack_;
  157. }
  158. bool isWithFlopsEnabled() const {
  159. return enableWithFlops_;
  160. }
  161. bool isWithModulesEnabled() const {
  162. return enableWithModules_;
  163. }
  164. // Trace for this long
  165. std::chrono::milliseconds activitiesDuration() const {
  166. return activitiesDuration_;
  167. }
  168. // Trace for this many iterations, determined by external API
  169. int activitiesRunIterations() const {
  170. return activitiesRunIterations_;
  171. }
  172. int activitiesMaxGpuBufferSize() const {
  173. return activitiesMaxGpuBufferSize_;
  174. }
  175. std::chrono::seconds activitiesWarmupDuration() const {
  176. return activitiesWarmupDuration_;
  177. }
  178. int activitiesWarmupIterations() const {
  179. return activitiesWarmupIterations_;
  180. }
  181. // Show CUDA Synchronization Stream Wait Events
  182. bool activitiesCudaSyncWaitEvents() const {
  183. return activitiesCudaSyncWaitEvents_;
  184. }
  185. void setActivitiesCudaSyncWaitEvents(bool enable) {
  186. activitiesCudaSyncWaitEvents_ = enable;
  187. }
  188. // Timestamp at which the profiling to start, requested by the user.
  189. const std::chrono::time_point<std::chrono::system_clock> requestTimestamp()
  190. const {
  191. if (profileStartTime_.time_since_epoch().count()) {
  192. return profileStartTime_;
  193. }
  194. // If no one requested timestamp, return 0.
  195. if (requestTimestamp_.time_since_epoch().count() == 0) {
  196. return requestTimestamp_;
  197. }
  198. // TODO(T94634890): Deprecate requestTimestamp
  199. return requestTimestamp_ + maxRequestAge() + activitiesWarmupDuration();
  200. }
  201. bool hasProfileStartTime() const {
  202. return requestTimestamp_.time_since_epoch().count() > 0 ||
  203. profileStartTime_.time_since_epoch().count() > 0;
  204. }
  205. int profileStartIteration() const {
  206. return profileStartIteration_;
  207. }
  208. bool hasProfileStartIteration() const {
  209. return profileStartIteration_ >= 0 && activitiesRunIterations_ > 0;
  210. }
  211. void setProfileStartIteration(int iter) {
  212. profileStartIteration_ = iter;
  213. }
  214. int profileStartIterationRoundUp() const {
  215. return profileStartIterationRoundUp_;
  216. }
  217. // calculate the start iteration accounting for warmup
  218. int startIterationIncludingWarmup() const {
  219. if (!hasProfileStartIteration()) {
  220. return -1;
  221. }
  222. return profileStartIteration_ - activitiesWarmupIterations_;
  223. }
  224. const std::chrono::seconds maxRequestAge() const;
  225. // All VLOG* macros will log if the verbose log level is >=
  226. // the verbosity specified for the verbose log message.
  227. // Default value is -1, so messages with log level 0 will log by default.
  228. int verboseLogLevel() const {
  229. return verboseLogLevel_;
  230. }
  231. // Modules for which verbose logging is enabled.
  232. // If empty, logging is enabled for all modules.
  233. const std::vector<std::string>& verboseLogModules() const {
  234. return verboseLogModules_;
  235. }
  236. bool sigUsr2Enabled() const {
  237. return enableSigUsr2_;
  238. }
  239. bool ipcFabricEnabled() const {
  240. return enableIpcFabric_;
  241. }
  242. std::chrono::seconds onDemandConfigUpdateIntervalSecs() const {
  243. return onDemandConfigUpdateIntervalSecs_;
  244. }
  245. static std::chrono::milliseconds alignUp(
  246. std::chrono::milliseconds duration,
  247. std::chrono::milliseconds alignment) {
  248. duration += alignment;
  249. return duration - (duration % alignment);
  250. }
  251. std::chrono::time_point<std::chrono::system_clock>
  252. eventProfilerOnDemandStartTime() const {
  253. return eventProfilerOnDemandTimestamp_;
  254. }
  255. std::chrono::time_point<std::chrono::system_clock>
  256. eventProfilerOnDemandEndTime() const {
  257. return eventProfilerOnDemandTimestamp_ + eventProfilerOnDemandDuration_;
  258. }
  259. std::chrono::time_point<std::chrono::system_clock>
  260. activityProfilerRequestReceivedTime() const {
  261. return activitiesOnDemandTimestamp_;
  262. }
  263. static constexpr std::chrono::milliseconds kControllerIntervalMsecs{1000};
  264. // Users may request and set trace id and group trace id.
  265. const std::string& requestTraceID() const {
  266. return requestTraceID_;
  267. }
  268. void setRequestTraceID(const std::string& tid) {
  269. requestTraceID_ = tid;
  270. }
  271. const std::string& requestGroupTraceID() const {
  272. return requestGroupTraceID_;
  273. }
  274. void setRequestGroupTraceID(const std::string& gtid) {
  275. requestGroupTraceID_ = gtid;
  276. }
  277. size_t cuptiDeviceBufferSize() const {
  278. return cuptiDeviceBufferSize_;
  279. }
  280. size_t cuptiDeviceBufferPoolLimit() const {
  281. return cuptiDeviceBufferPoolLimit_;
  282. }
  283. bool memoryProfilerEnabled() const {
  284. return memoryProfilerEnabled_;
  285. }
  286. int profileMemoryDuration() const {
  287. return profileMemoryDuration_;
  288. }
  289. void updateActivityProfilerRequestReceivedTime();
  290. void printActivityProfilerConfig(std::ostream& s) const override;
  291. void setActivityDependentConfig() override;
  292. void validate(const std::chrono::time_point<std::chrono::system_clock>&
  293. fallbackProfileStartTime) override;
  294. static void addConfigFactory(
  295. std::string name,
  296. std::function<AbstractConfig*(Config&)> factory);
  297. void print(std::ostream& s) const;
  298. // Config relies on some state with global static lifetime. If other
  299. // threads are using the config, it's possible that the global state
  300. // is destroyed before the threads stop. By hanging onto this handle,
  301. // correct destruction order can be ensured.
  302. static std::shared_ptr<void> getStaticObjectsLifetimeHandle();
  303. bool getTSCTimestampFlag() const {
  304. return useTSCTimestamp_;
  305. }
  306. void setTSCTimestampFlag(bool flag) {
  307. useTSCTimestamp_ = flag;
  308. }
  309. const std::string& getCustomConfig() const {
  310. return customConfig_;
  311. }
  312. uint32_t maxEvents() const {
  313. return maxEvents_;
  314. }
  315. private:
  316. explicit Config(const Config& other) = default;
  317. AbstractConfig* cloneDerived(AbstractConfig& parent) const override {
  318. // Clone from AbstractConfig not supported
  319. assert(false);
  320. return nullptr;
  321. }
  322. uint8_t createDeviceMask(const std::string& val);
  323. // Adds valid activity types from the user defined string list in the
  324. // configuration file
  325. void setActivityTypes(const std::vector<std::string>& selected_activities);
  326. // Sets the default activity types to be traced
  327. void selectDefaultActivityTypes() {
  328. // If the user has not specified an activity list, add all types
  329. for (ActivityType t : defaultActivityTypes()) {
  330. selectedActivityTypes_.insert(t);
  331. }
  332. }
  333. int verboseLogLevel_;
  334. std::vector<std::string> verboseLogModules_;
  335. // Event profiler
  336. // These settings are also supported in on-demand mode
  337. std::chrono::milliseconds samplePeriod_;
  338. std::chrono::milliseconds reportPeriod_;
  339. int samplesPerReport_;
  340. std::set<std::string> eventNames_;
  341. std::set<std::string> metricNames_;
  342. // On-demand duration
  343. std::chrono::seconds eventProfilerOnDemandDuration_;
  344. // Last on-demand request
  345. std::chrono::time_point<std::chrono::system_clock>
  346. eventProfilerOnDemandTimestamp_;
  347. int eventProfilerMaxInstancesPerGpu_;
  348. // Monitor whether event profiler threads are stuck
  349. // at this frequency
  350. std::chrono::seconds eventProfilerHeartbeatMonitorPeriod_;
  351. // These settings can not be changed on-demand
  352. std::string eventLogFile_;
  353. std::vector<int> eventReportPercentiles_ = {5, 25, 50, 75, 95};
  354. uint8_t eventProfilerDeviceMask_ = ~0;
  355. std::chrono::milliseconds multiplexPeriod_;
  356. // Activity profiler
  357. bool activityProfilerEnabled_;
  358. // Enable per-thread buffer
  359. bool perThreadBufferEnabled_;
  360. std::set<ActivityType> selectedActivityTypes_;
  361. // The activity profiler settings are all on-demand
  362. std::string activitiesLogFile_;
  363. std::string activitiesLogUrl_;
  364. // Log activities to memory buffer
  365. bool activitiesLogToMemory_{false};
  366. int activitiesMaxGpuBufferSize_;
  367. std::chrono::seconds activitiesWarmupDuration_;
  368. int activitiesWarmupIterations_;
  369. bool activitiesCudaSyncWaitEvents_;
  370. // Enable Profiler Config Options
  371. // Temporarily disable shape collection until we re-roll out the feature for
  372. // on-demand cases
  373. bool enableReportInputShapes_{false};
  374. bool enableProfileMemory_{false};
  375. bool enableWithStack_{false};
  376. bool enableWithFlops_{false};
  377. bool enableWithModules_{false};
  378. // Profile for specified iterations and duration
  379. std::chrono::milliseconds activitiesDuration_;
  380. int activitiesRunIterations_;
  381. // Below are not used
  382. // Use this net name for iteration count
  383. std::string activitiesExternalAPIIterationsTarget_;
  384. // Only profile nets that includes this in the name
  385. std::vector<std::string> activitiesExternalAPIFilter_;
  386. // Only profile nets with at least this many operators
  387. int activitiesExternalAPINetSizeThreshold_;
  388. // Only profile nets with at least this many GPU operators
  389. int activitiesExternalAPIGpuOpCountThreshold_;
  390. // Last activity profiler request
  391. std::chrono::time_point<std::chrono::system_clock>
  392. activitiesOnDemandTimestamp_;
  393. // ActivityProfilers are triggered by either:
  394. // Synchronized start timestamps
  395. std::chrono::time_point<std::chrono::system_clock> profileStartTime_;
  396. // Or start iterations.
  397. int profileStartIteration_;
  398. int profileStartIterationRoundUp_;
  399. // DEPRECATED
  400. std::chrono::time_point<std::chrono::system_clock> requestTimestamp_;
  401. // Enable profiling via SIGUSR2
  402. bool enableSigUsr2_;
  403. // Enable IPC Fabric instead of thrift communication
  404. bool enableIpcFabric_;
  405. std::chrono::seconds onDemandConfigUpdateIntervalSecs_;
  406. // Logger Metadata
  407. std::string requestTraceID_;
  408. std::string requestGroupTraceID_;
  409. // CUPTI Device Buffer
  410. size_t cuptiDeviceBufferSize_;
  411. size_t cuptiDeviceBufferPoolLimit_;
  412. // CUPTI Timestamp Format
  413. bool useTSCTimestamp_{true};
  414. // Memory Profiler
  415. bool memoryProfilerEnabled_{false};
  416. int profileMemoryDuration_{1000};
  417. // Used to flexibly configure some custom options, especially for custom
  418. // backends. How to parse this string is handled by the custom backend.
  419. std::string customConfig_;
  420. // Roctracer settings
  421. uint32_t maxEvents_{1000000};
  422. };
  423. constexpr char kUseDaemonEnvVar[] = "KINETO_USE_DAEMON";
  424. bool isDaemonEnvVarSet();
  425. } // namespace libkineto