install_check.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import logging
  15. import numpy as np
  16. import paddle
  17. __all__ = []
  18. def _simple_network():
  19. """
  20. Define a simple network composed by a single linear layer.
  21. """
  22. input = paddle.static.data(
  23. name="input", shape=[None, 2, 2], dtype="float32"
  24. )
  25. weight = paddle.create_parameter(
  26. shape=[2, 3],
  27. dtype="float32",
  28. attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.1)),
  29. )
  30. bias = paddle.create_parameter(shape=[3], dtype="float32")
  31. linear_out = paddle.nn.functional.linear(x=input, weight=weight, bias=bias)
  32. out = paddle.tensor.sum(linear_out)
  33. return input, out, weight
  34. def _prepare_data():
  35. """
  36. Prepare feeding data for simple network. The shape is [1, 2, 2].
  37. """
  38. # Prepare the feeding data.
  39. np_input_single = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
  40. return np_input_single.reshape(1, 2, 2)
  41. def _is_cuda_available():
  42. """
  43. Check whether CUDA is available.
  44. """
  45. try:
  46. assert len(paddle.static.cuda_places()) > 0
  47. return True
  48. except Exception as e:
  49. logging.warning(
  50. "You are using GPU version PaddlePaddle, but there is no GPU "
  51. "detected on your machine. Maybe CUDA devices is not set properly."
  52. f"\n Original Error is {e}"
  53. )
  54. return False
  55. def _is_xpu_available():
  56. """
  57. Check whether XPU is available.
  58. """
  59. try:
  60. assert len(paddle.static.xpu_places()) > 0
  61. return True
  62. except Exception as e:
  63. logging.warning(
  64. "You are using XPU version PaddlePaddle, but there is no XPU "
  65. "detected on your machine. Maybe XPU devices is not set properly."
  66. f"\n Original Error is {e}"
  67. )
  68. return False
  69. def _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name):
  70. """
  71. Testing the simple network in dygraph mode using one CPU/GPU/XPU.
  72. Args:
  73. use_cuda (bool): Whether running with CUDA.
  74. use_xpu (bool): Whether running with XPU.
  75. """
  76. paddle.disable_static()
  77. if use_cuda:
  78. paddle.set_device('gpu')
  79. elif use_xpu:
  80. paddle.set_device('xpu')
  81. elif use_custom:
  82. paddle.set_device(custom_device_name)
  83. else:
  84. paddle.set_device('cpu')
  85. weight_attr = paddle.ParamAttr(
  86. name="weight", initializer=paddle.nn.initializer.Constant(value=0.5)
  87. )
  88. bias_attr = paddle.ParamAttr(
  89. name="bias", initializer=paddle.nn.initializer.Constant(value=1.0)
  90. )
  91. linear = paddle.nn.Linear(
  92. 2, 4, weight_attr=weight_attr, bias_attr=bias_attr
  93. )
  94. input_np = _prepare_data()
  95. input_tensor = paddle.to_tensor(input_np)
  96. linear_out = linear(input_tensor)
  97. out = paddle.tensor.sum(linear_out)
  98. out.backward()
  99. opt = paddle.optimizer.Adam(
  100. learning_rate=0.001, parameters=linear.parameters()
  101. )
  102. opt.step()
  103. def _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name):
  104. """
  105. Testing the simple network with executor running directly, using one CPU/GPU/XPU.
  106. Args:
  107. use_cuda (bool): Whether running with CUDA.
  108. use_xpu (bool): Whether running with XPU.
  109. """
  110. paddle.enable_static()
  111. with paddle.static.scope_guard(paddle.static.Scope()):
  112. train_prog = paddle.static.Program()
  113. startup_prog = paddle.static.Program()
  114. startup_prog.random_seed = 1
  115. with paddle.static.program_guard(train_prog, startup_prog):
  116. input, out, weight = _simple_network()
  117. param_grads = paddle.static.append_backward(
  118. out, parameter_list=[weight.name]
  119. )[0]
  120. if use_cuda:
  121. place = paddle.CUDAPlace(0)
  122. elif use_xpu:
  123. place = paddle.XPUPlace(0)
  124. elif use_custom:
  125. place = paddle.CustomPlace(custom_device_name, 0)
  126. else:
  127. place = paddle.CPUPlace()
  128. exe = paddle.static.Executor(place)
  129. exe.run(startup_prog)
  130. exe.run(
  131. train_prog,
  132. feed={input.name: _prepare_data()},
  133. fetch_list=[out.name, param_grads[1].name],
  134. )
  135. paddle.disable_static()
  136. def train_for_run_parallel():
  137. """
  138. train script for parallel training check
  139. """
  140. # to avoid cyclic import
  141. class LinearNet(paddle.nn.Layer):
  142. """
  143. simple fc network for parallel training check
  144. """
  145. def __init__(self):
  146. super().__init__()
  147. self._linear1 = paddle.nn.Linear(10, 10)
  148. self._linear2 = paddle.nn.Linear(10, 1)
  149. def forward(self, x):
  150. """
  151. forward
  152. """
  153. return self._linear2(self._linear1(x))
  154. paddle.distributed.init_parallel_env()
  155. layer = LinearNet()
  156. dp_layer = paddle.DataParallel(layer)
  157. loss_fn = paddle.nn.MSELoss()
  158. adam = paddle.optimizer.Adam(
  159. learning_rate=0.001, parameters=dp_layer.parameters()
  160. )
  161. inputs = paddle.randn([10, 10], 'float32')
  162. outputs = dp_layer(inputs)
  163. labels = paddle.randn([10, 1], 'float32')
  164. loss = loss_fn(outputs, labels)
  165. loss.backward()
  166. adam.step()
  167. adam.clear_grad()
  168. def _run_parallel(device_list):
  169. """
  170. Testing the simple network in data parallel mode, using multiple CPU/GPU.
  171. Args:
  172. use_cuda (bool): Whether running with CUDA.
  173. use_xpu (bool): Whether running with XPU.
  174. device_list (int): The specified devices.
  175. """
  176. paddle.distributed.spawn(train_for_run_parallel, nprocs=len(device_list))
  177. def run_check():
  178. """
  179. Check whether PaddlePaddle is installed correctly and running successfully
  180. on your system.
  181. Examples:
  182. .. code-block:: python
  183. >>> import paddle
  184. >>> paddle.utils.run_check()
  185. >>> # doctest: +SKIP('the output will change in different run')
  186. Running verify PaddlePaddle program ...
  187. I0818 15:35:08.335391 30540 program_interpreter.cc:173] New Executor is Running.
  188. I0818 15:35:08.398319 30540 interpreter_util.cc:529] Standalone Executor is Used.
  189. PaddlePaddle works well on 1 CPU.
  190. PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now.
  191. """
  192. print("Running verify PaddlePaddle program ... ")
  193. use_cuda = False
  194. use_xpu = False
  195. use_custom = False
  196. custom_device_name = None
  197. if paddle.is_compiled_with_cuda():
  198. use_cuda = _is_cuda_available()
  199. elif paddle.is_compiled_with_xpu():
  200. use_xpu = _is_xpu_available()
  201. elif len(paddle.framework.core.get_all_custom_device_type()) > 0:
  202. use_custom = True
  203. if len(paddle.framework.core.get_all_custom_device_type()) > 1:
  204. logging.warning(
  205. f"More than one kind of custom devices detected, but run check would only be executed on {paddle.framework.core.get_all_custom_device_type()[0]}."
  206. )
  207. if use_cuda:
  208. device_str = "GPU"
  209. device_list = paddle.static.cuda_places()
  210. elif use_xpu:
  211. device_str = "XPU"
  212. device_list = paddle.static.xpu_places()
  213. elif use_custom:
  214. device_str = paddle.framework.core.get_all_custom_device_type()[0]
  215. custom_device_name = device_str
  216. device_list = list(
  217. range(
  218. paddle.framework.core.get_custom_device_count(
  219. custom_device_name
  220. )
  221. )
  222. )
  223. else:
  224. device_str = "CPU"
  225. device_list = paddle.static.cpu_places(device_count=1)
  226. device_count = len(device_list)
  227. _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name)
  228. _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name)
  229. print(f"PaddlePaddle works well on 1 {device_str}.")
  230. try:
  231. if len(device_list) > 1:
  232. if use_custom:
  233. import os
  234. os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"
  235. _run_parallel(device_list)
  236. print(f"PaddlePaddle works well on {device_count} {device_str}s.")
  237. print(
  238. "PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now."
  239. )
  240. except Exception as e:
  241. logging.warning(
  242. f"PaddlePaddle meets some problem with {device_count} {device_str}s. This may be caused by:"
  243. "\n 1. There is not enough GPUs visible on your system"
  244. "\n 2. Some GPUs are occupied by other process now"
  245. "\n 3. NVIDIA-NCCL2 is not installed correctly on your system. Please follow instruction on https://github.com/NVIDIA/nccl-tests "
  246. "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
  247. )
  248. logging.warning(f"\n Original Error is: {e}")
  249. print(
  250. f"PaddlePaddle is installed successfully ONLY for single {device_str}! "
  251. "Let's start deep learning with PaddlePaddle now."
  252. )
  253. raise e