memory.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. # Copyright 2022 The HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. A collection of utilities for ensuring that training can always occur. Heavily influenced by the
  16. [toma](https://github.com/BlackHC/toma) library.
  17. """
  18. import functools
  19. import gc
  20. import importlib
  21. import inspect
  22. import warnings
  23. from typing import Optional
  24. import torch
  25. from packaging import version
  26. from .imports import (
  27. is_cuda_available,
  28. is_hpu_available,
  29. is_ipex_available,
  30. is_mlu_available,
  31. is_mps_available,
  32. is_musa_available,
  33. is_npu_available,
  34. is_sdaa_available,
  35. is_xpu_available,
  36. )
  37. from .versions import compare_versions
  38. def clear_device_cache(garbage_collection=False):
  39. """
  40. Clears the device cache by calling `torch.{backend}.empty_cache`. Can also run `gc.collect()`, but do note that
  41. this is a *considerable* slowdown and should be used sparingly.
  42. """
  43. if garbage_collection:
  44. gc.collect()
  45. if is_xpu_available():
  46. torch.xpu.empty_cache()
  47. elif is_mlu_available():
  48. torch.mlu.empty_cache()
  49. elif is_sdaa_available():
  50. torch.sdaa.empty_cache()
  51. elif is_musa_available():
  52. torch.musa.empty_cache()
  53. elif is_npu_available():
  54. torch.npu.empty_cache()
  55. elif is_mps_available(min_version="2.0"):
  56. torch.mps.empty_cache()
  57. elif is_cuda_available():
  58. torch.cuda.empty_cache()
  59. elif is_hpu_available():
  60. # torch.hpu.empty_cache() # not available on hpu as it reserves all device memory for the current process
  61. pass
  62. def release_memory(*objects):
  63. """
  64. Releases memory from `objects` by setting them to `None` and calls `gc.collect()` and `torch.cuda.empty_cache()`.
  65. Returned objects should be reassigned to the same variables.
  66. Args:
  67. objects (`Iterable`):
  68. An iterable of objects
  69. Returns:
  70. A list of `None` objects to replace `objects`
  71. Example:
  72. ```python
  73. >>> import torch
  74. >>> from accelerate.utils import release_memory
  75. >>> a = torch.ones(1000, 1000).cuda()
  76. >>> b = torch.ones(1000, 1000).cuda()
  77. >>> a, b = release_memory(a, b)
  78. ```
  79. """
  80. if not isinstance(objects, list):
  81. objects = list(objects)
  82. for i in range(len(objects)):
  83. objects[i] = None
  84. clear_device_cache(garbage_collection=True)
  85. return objects
  86. def should_reduce_batch_size(exception: Exception) -> bool:
  87. """
  88. Checks if `exception` relates to CUDA out-of-memory, XPU out-of-memory, CUDNN not supported, or CPU out-of-memory
  89. Args:
  90. exception (`Exception`):
  91. An exception
  92. """
  93. _statements = [
  94. " out of memory.", # OOM for CUDA, HIP, XPU
  95. "cuDNN error: CUDNN_STATUS_NOT_SUPPORTED.", # CUDNN SNAFU
  96. "DefaultCPUAllocator: can't allocate memory", # CPU OOM
  97. "FATAL ERROR :: MODULE:PT_DEVMEM Allocation failed", # HPU OOM
  98. ]
  99. if isinstance(exception, RuntimeError) and len(exception.args) == 1:
  100. return any(err in exception.args[0] for err in _statements)
  101. return False
  102. def find_executable_batch_size(
  103. function: Optional[callable] = None,
  104. starting_batch_size: int = 128,
  105. reduce_batch_size_fn: Optional[callable] = None,
  106. ):
  107. """
  108. A basic decorator that will try to execute `function`. If it fails from exceptions related to out-of-memory or
  109. CUDNN, the batch size is multiplied by 0.9 and passed to `function`
  110. `function` must take in a `batch_size` parameter as its first argument.
  111. Args:
  112. function (`callable`, *optional*):
  113. A function to wrap
  114. starting_batch_size (`int`, *optional*):
  115. The batch size to try and fit into memory
  116. Example:
  117. ```python
  118. >>> from accelerate.utils import find_executable_batch_size
  119. >>> @find_executable_batch_size(starting_batch_size=128)
  120. ... def train(batch_size, model, optimizer):
  121. ... ...
  122. >>> train(model, optimizer)
  123. ```
  124. """
  125. if function is None:
  126. return functools.partial(find_executable_batch_size, starting_batch_size=starting_batch_size)
  127. batch_size = starting_batch_size
  128. if reduce_batch_size_fn is None:
  129. def reduce_batch_size_fn():
  130. nonlocal batch_size
  131. batch_size = int(batch_size * 0.9)
  132. return batch_size
  133. def decorator(*args, **kwargs):
  134. nonlocal batch_size
  135. clear_device_cache(garbage_collection=True)
  136. params = list(inspect.signature(function).parameters.keys())
  137. # Guard against user error
  138. if len(params) < (len(args) + 1):
  139. arg_str = ", ".join([f"{arg}={value}" for arg, value in zip(params[1:], args[1:])])
  140. raise TypeError(
  141. f"Batch size was passed into `{function.__name__}` as the first argument when called."
  142. f"Remove this as the decorator already does so: `{function.__name__}({arg_str})`"
  143. )
  144. while True:
  145. if batch_size == 0:
  146. raise RuntimeError("No executable batch size found, reached zero.")
  147. try:
  148. return function(batch_size, *args, **kwargs)
  149. except Exception as e:
  150. if should_reduce_batch_size(e):
  151. clear_device_cache(garbage_collection=True)
  152. batch_size = reduce_batch_size_fn()
  153. else:
  154. raise
  155. return decorator
  156. def get_xpu_available_memory(device_index: int):
  157. if version.parse(torch.__version__).release >= version.parse("2.6").release:
  158. # torch.xpu.mem_get_info API is available starting from PyTorch 2.6
  159. # It further requires PyTorch built with the SYCL runtime which supports API
  160. # to query available device memory. If not available, exception will be
  161. # raised. Version of SYCL runtime used to build PyTorch is being reported
  162. # with print(torch.version.xpu) and corresponds to the version of Intel DPC++
  163. # SYCL compiler. First version to support required feature is 20250001.
  164. try:
  165. return torch.xpu.mem_get_info(device_index)[0]
  166. except Exception:
  167. pass
  168. elif is_ipex_available():
  169. ipex_version = version.parse(importlib.metadata.version("intel_extension_for_pytorch"))
  170. if compare_versions(ipex_version, ">=", "2.5"):
  171. from intel_extension_for_pytorch.xpu import mem_get_info
  172. return mem_get_info(device_index)[0]
  173. warnings.warn(
  174. "The XPU `mem_get_info` API is available in IPEX version >=2.5 or PyTorch >=2.6. The current returned available memory is incorrect. Please consider upgrading your IPEX or PyTorch version."
  175. )
  176. return torch.xpu.max_memory_allocated(device_index)