machine_info.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. # -------------------------------------------------------------------------
  2. # Copyright (c) Microsoft Corporation. All rights reserved.
  3. # Licensed under the MIT License.
  4. # --------------------------------------------------------------------------
  5. # This script dumps machine information (hardware, OS, and installed package versions) for notebooks.
  6. import argparse
  7. import json
  8. import logging
  9. import platform
  10. from os import environ
  11. import cpuinfo
  12. import psutil
  13. from py3nvml.py3nvml import (
  14. NVMLError,
  15. nvmlDeviceGetCount,
  16. nvmlDeviceGetHandleByIndex,
  17. nvmlDeviceGetMemoryInfo,
  18. nvmlDeviceGetName,
  19. nvmlInit,
  20. nvmlShutdown,
  21. nvmlSystemGetDriverVersion,
  22. )
  23. class MachineInfo:
  24. """Class encapsulating Machine Info logic."""
  25. def __init__(self, silent=False, logger=None):
  26. self.silent = silent
  27. if logger is None:
  28. logging.basicConfig(
  29. format="%(asctime)s - %(name)s - %(levelname)s: %(message)s",
  30. level=logging.INFO,
  31. )
  32. self.logger = logging.getLogger(__name__)
  33. else:
  34. self.logger = logger
  35. self.machine_info = None
  36. try:
  37. self.machine_info = self.get_machine_info()
  38. except Exception:
  39. self.logger.exception("Exception in getting machine info.")
  40. self.machine_info = None
  41. def get_machine_info(self):
  42. """Get machine info in metric format"""
  43. gpu_info = self.get_gpu_info_by_nvml()
  44. cpu_info = cpuinfo.get_cpu_info()
  45. machine_info = {
  46. "gpu": gpu_info,
  47. "cpu": self.get_cpu_info(),
  48. "memory": self.get_memory_info(),
  49. "os": platform.platform(),
  50. "python": self._try_get(cpu_info, ["python_version"]),
  51. "packages": self.get_related_packages(),
  52. "onnxruntime": self.get_onnxruntime_info(),
  53. "pytorch": self.get_pytorch_info(),
  54. "tensorflow": self.get_tensorflow_info(),
  55. }
  56. return machine_info
  57. def get_memory_info(self) -> dict:
  58. """Get memory info"""
  59. mem = psutil.virtual_memory()
  60. return {"total": mem.total, "available": mem.available}
  61. def _try_get(self, cpu_info: dict, names: list) -> str:
  62. for name in names:
  63. if name in cpu_info:
  64. value = cpu_info[name]
  65. if isinstance(value, (list, tuple)):
  66. return ",".join([str(i) for i in value])
  67. return value
  68. return ""
  69. def get_cpu_info(self) -> dict:
  70. """Get CPU info"""
  71. cpu_info = cpuinfo.get_cpu_info()
  72. return {
  73. "brand": self._try_get(cpu_info, ["brand", "brand_raw"]),
  74. "cores": psutil.cpu_count(logical=False),
  75. "logical_cores": psutil.cpu_count(logical=True),
  76. "hz": self._try_get(cpu_info, ["hz_actual"]),
  77. "l2_cache": self._try_get(cpu_info, ["l2_cache_size"]),
  78. "flags": self._try_get(cpu_info, ["flags"]),
  79. "processor": platform.uname().processor,
  80. }
  81. def get_gpu_info_by_nvml(self) -> dict:
  82. """Get GPU info using nvml"""
  83. gpu_info_list = []
  84. driver_version = None
  85. try:
  86. nvmlInit()
  87. driver_version = nvmlSystemGetDriverVersion()
  88. deviceCount = nvmlDeviceGetCount() # noqa: N806
  89. for i in range(deviceCount):
  90. handle = nvmlDeviceGetHandleByIndex(i)
  91. info = nvmlDeviceGetMemoryInfo(handle)
  92. gpu_info = {}
  93. gpu_info["memory_total"] = info.total
  94. gpu_info["memory_available"] = info.free
  95. gpu_info["name"] = nvmlDeviceGetName(handle)
  96. gpu_info_list.append(gpu_info)
  97. nvmlShutdown()
  98. except NVMLError as error:
  99. if not self.silent:
  100. self.logger.error("Error fetching GPU information using nvml: %s", error)
  101. return None
  102. result = {"driver_version": driver_version, "devices": gpu_info_list}
  103. if "CUDA_VISIBLE_DEVICES" in environ:
  104. result["cuda_visible"] = environ["CUDA_VISIBLE_DEVICES"]
  105. return result
  106. def get_related_packages(self) -> list[str]:
  107. import pkg_resources # noqa: PLC0415
  108. installed_packages = pkg_resources.working_set
  109. related_packages = [
  110. "onnxruntime-gpu",
  111. "onnxruntime",
  112. "onnx",
  113. "transformers",
  114. "protobuf",
  115. "sympy",
  116. "torch",
  117. "tensorflow",
  118. "flatbuffers",
  119. "numpy",
  120. "onnxconverter-common",
  121. ]
  122. related_packages_list = {i.key: i.version for i in installed_packages if i.key in related_packages}
  123. return related_packages_list
  124. def get_onnxruntime_info(self) -> dict:
  125. try:
  126. import onnxruntime # noqa: PLC0415
  127. return {
  128. "version": onnxruntime.__version__,
  129. "support_gpu": "CUDAExecutionProvider" in onnxruntime.get_available_providers(),
  130. }
  131. except ImportError as error:
  132. if not self.silent:
  133. self.logger.exception(error)
  134. return None
  135. except Exception as exception:
  136. if not self.silent:
  137. self.logger.exception(exception, False)
  138. return None
  139. def get_pytorch_info(self) -> dict:
  140. try:
  141. import torch # noqa: PLC0415
  142. return {
  143. "version": torch.__version__,
  144. "support_gpu": torch.cuda.is_available(),
  145. "cuda": torch.version.cuda,
  146. }
  147. except ImportError as error:
  148. if not self.silent:
  149. self.logger.exception(error)
  150. return None
  151. except Exception as exception:
  152. if not self.silent:
  153. self.logger.exception(exception, False)
  154. return None
  155. def get_tensorflow_info(self) -> dict:
  156. try:
  157. import tensorflow as tf # noqa: PLC0415
  158. return {
  159. "version": tf.version.VERSION,
  160. "git_version": tf.version.GIT_VERSION,
  161. "support_gpu": tf.test.is_built_with_cuda(),
  162. }
  163. except ImportError as error:
  164. if not self.silent:
  165. self.logger.exception(error)
  166. return None
  167. except ModuleNotFoundError as error:
  168. if not self.silent:
  169. self.logger.exception(error)
  170. return None
  171. def parse_arguments():
  172. parser = argparse.ArgumentParser()
  173. parser.add_argument(
  174. "--silent",
  175. required=False,
  176. action="store_true",
  177. help="Do not print error message",
  178. )
  179. parser.set_defaults(silent=False)
  180. args = parser.parse_args()
  181. return args
  182. def get_machine_info(silent=True) -> str:
  183. machine = MachineInfo(silent)
  184. return json.dumps(machine.machine_info, indent=2)
  185. def get_device_info(silent=True) -> str:
  186. machine = MachineInfo(silent)
  187. info = machine.machine_info
  188. if info:
  189. info = {key: value for key, value in info.items() if key in ["gpu", "cpu", "memory"]}
  190. return json.dumps(info, indent=2)
  191. if __name__ == "__main__":
  192. args = parse_arguments()
  193. print(get_machine_info(args.silent))