utils.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  1. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Utilities for logging and serialization"""
  15. import os
  16. import random
  17. import subprocess
  18. import time
  19. import json
  20. import numpy as np
  21. import torch
  22. from megatron_util import mpu, print_rank_0
  23. from megatron_util.fp16 import FP16_Optimizer
  24. SUMMARY_WRITER_DIR_NAME = 'runs'
  25. def get_log_dir(name, base):
  26. return os.path.join(base, SUMMARY_WRITER_DIR_NAME, name)
  27. def get_hostname():
  28. hostname_cmd = ['hostname -I']
  29. result = subprocess.check_output(hostname_cmd, shell=True)
  30. master_addr = result.decode('utf-8').split()[0]
  31. return master_addr
  32. def get_spare_port(args):
  33. if torch.distributed.get_rank() == 0:
  34. port = subprocess.check_output(['shuf -n 1 -i 10000-65535'],
  35. shell=True)
  36. port = int(port.strip())
  37. if port == args.master_port:
  38. port = subprocess.check_output(['shuf -n 1 -i 10000-65535'],
  39. shell=True)
  40. port = int(port.strip())
  41. port = torch.cuda.LongTensor([port])
  42. else:
  43. port = torch.cuda.LongTensor([0])
  44. torch.distributed.broadcast(port, 0)
  45. port = port.item()
  46. return port
  47. def print_and_save_args(args, verbose=True, log_dir=None):
  48. """Print arguments."""
  49. if verbose:
  50. print('arguments:', flush=True)
  51. for arg in vars(args):
  52. dots = '.' * (29 - len(arg))
  53. print(
  54. ' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)
  55. if log_dir is not None:
  56. json_file = os.path.join(log_dir, 'config.json')
  57. with open(json_file, 'w') as output:
  58. json.dump(vars(args), output, sort_keys=True)
  59. if args.deepspeed and args.deepspeed_config is not None:
  60. with open(args.deepspeed_config, encoding='utf-8') as file:
  61. deepspeed_config = json.load(file)
  62. deepspeed_json_file = os.path.join(log_dir,
  63. 'config_gpt_large.json')
  64. with open(deepspeed_json_file, 'w') as output:
  65. json.dump(deepspeed_config, output)
  66. def print_params_min_max_norm(optimizer, iteration):
  67. """Print min, max, and norm of all parameters."""
  68. index = 0
  69. rank = torch.distributed.get_rank()
  70. string = 'iteration, rank, index, model-parallel,min, max, norm\n'
  71. optimizer_ = optimizer
  72. if isinstance(optimizer, FP16_Optimizer):
  73. optimizer_ = optimizer.optimizer
  74. for param_group in optimizer_.param_groups:
  75. for param in param_group['params']:
  76. index += 1
  77. min_ = param.data.min()
  78. max_ = param.data.max()
  79. norm = param.data.norm()
  80. string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format(
  81. iteration, rank, index, int(param.model_parallel))
  82. string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm)
  83. print(string, flush=True)
  84. class Timers:
  85. """Group of timers."""
  86. class Timer:
  87. """Timer."""
  88. def __init__(self, name):
  89. self.name_ = name
  90. self.elapsed_ = 0.0
  91. self.started_ = False
  92. self.start_time = time.time()
  93. def start(self):
  94. """Start the timer."""
  95. assert not self.started_, 'timer has already been started'
  96. torch.cuda.synchronize()
  97. self.start_time = time.time()
  98. self.started_ = True
  99. def stop(self):
  100. """Stop the timer."""
  101. assert self.started_, 'timer is not started'
  102. torch.cuda.synchronize()
  103. self.elapsed_ += (time.time() - self.start_time)
  104. self.started_ = False
  105. def reset(self):
  106. """Reset timer."""
  107. self.elapsed_ = 0.0
  108. self.started_ = False
  109. def elapsed(self, reset=True):
  110. """Calculate the elapsed time."""
  111. started_ = self.started_
  112. # If the timing in progress, end it first.
  113. if self.started_:
  114. self.stop()
  115. # Get the elapsed time.
  116. elapsed_ = self.elapsed_
  117. # Reset the elapsed time
  118. if reset:
  119. self.reset()
  120. # If timing was in progress, set it back.
  121. if started_:
  122. self.start()
  123. return elapsed_
  124. def __init__(self):
  125. self.timers = {}
  126. def __call__(self, name):
  127. if name not in self.timers:
  128. self.timers[name] = self.Timer(name)
  129. return self.timers[name]
  130. def log(self, names, normalizer=1.0, reset=True):
  131. """Log a group of timers."""
  132. assert normalizer > 0.0
  133. string = 'time (ms)'
  134. for name in names:
  135. elapsed_time = self.timers[name].elapsed(
  136. reset=reset) * 1000.0 / normalizer
  137. string += ' | {}: {:.2f}'.format(name, elapsed_time)
  138. print_rank_0(string)
  139. def report_memory(name):
  140. """Simple GPU memory report."""
  141. mega_bytes = 1024.0 * 1024.0
  142. string = name + ' memory (MB)'
  143. string += ' | allocated: {}'.format(torch.cuda.memory_allocated()
  144. / mega_bytes)
  145. string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated()
  146. / mega_bytes)
  147. string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes)
  148. string += ' | max cached: {}'.format(torch.cuda.memory_reserved()
  149. / mega_bytes)
  150. print_rank_0(string)
  151. def get_checkpoint_name(checkpoints_path,
  152. iteration,
  153. release=False,
  154. zero=False):
  155. if release:
  156. d = 'release'
  157. else:
  158. d = '{}'.format(iteration)
  159. if zero:
  160. dp_rank = mpu.get_data_parallel_rank()
  161. d += '_zero_dp_rank_{}'.format(dp_rank)
  162. return os.path.join(
  163. checkpoints_path, d,
  164. 'mp_rank_{:02d}_model_states.pt'.format(mpu.get_model_parallel_rank()))
  165. def ensure_directory_exists(filename):
  166. dirname = os.path.dirname(filename)
  167. if not os.path.exists(dirname):
  168. os.makedirs(dirname, exist_ok=True)
  169. def get_checkpoint_tracker_filename(checkpoints_path):
  170. return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt')
  171. def save_zero_checkpoint(args, iteration, optimizer):
  172. zero_sd = {
  173. 'iteration': iteration,
  174. 'optimizer_state_dict': optimizer.state_dict()
  175. }
  176. zero_checkpoint_name = get_checkpoint_name(args.save, iteration, zero=True)
  177. ensure_directory_exists(zero_checkpoint_name)
  178. torch.save(zero_sd, zero_checkpoint_name)
  179. print(' successfully saved {}'.format(zero_checkpoint_name))
  180. def save_checkpoint(iteration,
  181. model,
  182. optimizer,
  183. lr_scheduler,
  184. args,
  185. tag=None,
  186. barrier=True,
  187. only_changed_parameters=False,
  188. no_deepspeed=False,
  189. no_save_optim=False):
  190. """Save a model checkpoint."""
  191. if tag is None:
  192. tag = str(iteration)
  193. if args.deepspeed and not no_deepspeed:
  194. save_ds_checkpoint(iteration, model, lr_scheduler, args, tag=tag)
  195. else:
  196. # Only rank zer0 of the data parallel writes to the disk.
  197. if mpu.get_data_parallel_rank() == 0:
  198. checkpoint_name = get_checkpoint_name(args.save, tag)
  199. print(
  200. 'global rank {} is saving checkpoint at iteration {:7d} to {}'.
  201. format(torch.distributed.get_rank(), iteration,
  202. checkpoint_name))
  203. sd = {'iteration': iteration}
  204. if args.deepspeed:
  205. model = model.module
  206. state_dict = model.state_dict()
  207. if only_changed_parameters:
  208. requires_grad_dict = {}
  209. for name, parameter in model.named_parameters():
  210. requires_grad_dict[name] = parameter.requires_grad
  211. state_dict = {
  212. key: value
  213. for key, value in state_dict.items()
  214. if requires_grad_dict[key]
  215. }
  216. sd['module'] = state_dict
  217. # Optimizer stuff.
  218. if not args.no_save_optim and not no_save_optim:
  219. if optimizer is not None:
  220. sd['optimizer'] = optimizer.state_dict()
  221. if lr_scheduler is not None:
  222. sd['lr_scheduler'] = lr_scheduler.state_dict()
  223. # rng states.
  224. if not args.no_save_rng:
  225. sd['random_rng_state'] = random.getstate()
  226. sd['np_rng_state'] = np.random.get_state()
  227. sd['torch_rng_state'] = torch.get_rng_state()
  228. sd['cuda_rng_state'] = torch.cuda.get_rng_state()
  229. sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker(
  230. ).get_states()
  231. ensure_directory_exists(checkpoint_name)
  232. torch.save(sd, checkpoint_name)
  233. print(' successfully saved {}'.format(checkpoint_name))
  234. # Wait so everyone is done (necessary)
  235. if barrier:
  236. torch.distributed.barrier()
  237. # And update the latest iteration
  238. if torch.distributed.get_rank() == 0:
  239. tracker_filename = get_checkpoint_tracker_filename(args.save)
  240. with open(tracker_filename, 'w') as f:
  241. f.write(tag)
  242. def save_ds_checkpoint(iteration, model, lr_scheduler, args, tag):
  243. """Save a model checkpoint."""
  244. sd = {}
  245. sd['iteration'] = iteration
  246. if lr_scheduler is not None:
  247. sd['client_lr_scheduler'] = lr_scheduler.state_dict()
  248. # rng states.
  249. if not args.no_save_rng:
  250. sd['random_rng_state'] = random.getstate()
  251. sd['np_rng_state'] = np.random.get_state()
  252. sd['torch_rng_state'] = torch.get_rng_state()
  253. sd['cuda_rng_state'] = torch.cuda.get_rng_state()
  254. sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()
  255. model.save_checkpoint(args.save, tag, client_state=sd)
  256. def get_checkpoint_iteration(load_path):
  257. # Read the tracker file and set the iteration.
  258. tracker_filename = get_checkpoint_tracker_filename(load_path)
  259. if not os.path.isfile(tracker_filename):
  260. print_rank_0('WARNING: could not find the metadata file {} '.format(
  261. tracker_filename))
  262. if os.path.isdir(load_path):
  263. path = os.path.normpath(load_path)
  264. load_dir, tag = os.path.split(path)
  265. print_rank_0(
  266. 'Try to directly load the checkpoint from the directory')
  267. return load_dir, tag, False, True
  268. print_rank_0(' will not load any checkpoints and will start from '
  269. 'random')
  270. return load_path, 0, False, False
  271. with open(tracker_filename, 'r', encoding='utf-8') as f:
  272. metastring = f.read().strip()
  273. release = metastring == 'release'
  274. # try:
  275. # iteration = int(metastring)
  276. # except ValueError:
  277. # release = metastring == 'release'
  278. # if not release:
  279. # print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
  280. # tracker_filename))
  281. # exit()
  282. # assert iteration > 0 or release, 'error parsing metadata file {}'.format(
  283. # tracker_filename)
  284. return load_path, metastring, release, True
  285. def load_checkpoint(model,
  286. optimizer,
  287. lr_scheduler,
  288. args,
  289. no_deepspeed=False,
  290. no_load_optim=False):
  291. """Load a model checkpoint."""
  292. load_dir, tag, release, success = get_checkpoint_iteration(args.load)
  293. if not success:
  294. return 0
  295. if args.deepspeed and not no_deepspeed:
  296. checkpoint_name, sd = model.load_checkpoint(
  297. load_dir,
  298. tag,
  299. load_optimizer_states=not args.no_load_optim and not no_load_optim,
  300. load_lr_scheduler_states=not args.no_load_lr_scheduler)
  301. if not args.no_load_lr_scheduler and 'client_lr_scheduler' in sd:
  302. lr_scheduler.load_state_dict(sd['client_lr_scheduler'])
  303. print_rank_0('Load lr scheduler state')
  304. if checkpoint_name is None:
  305. if mpu.get_data_parallel_rank() == 0:
  306. print('Unable to load checkpoint.')
  307. return tag
  308. else:
  309. # Checkpoint.
  310. checkpoint_name = get_checkpoint_name(load_dir, tag, release)
  311. if mpu.get_data_parallel_rank() == 0:
  312. print('global rank {} is loading checkpoint {}'.format(
  313. torch.distributed.get_rank(), checkpoint_name))
  314. # Load the checkpoint.
  315. sd = torch.load(checkpoint_name, map_location='cpu')
  316. # Model.
  317. if args.deepspeed:
  318. model = model.module
  319. missing_keys, unexpected_keys = model.load_state_dict(
  320. sd['module'], strict=False)
  321. if missing_keys or unexpected_keys:
  322. print_rank_0(
  323. f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}'
  324. )
  325. # Optimizer.
  326. if not release and not args.finetune and not args.no_load_optim and not no_load_optim:
  327. try:
  328. if optimizer is not None:
  329. optimizer.load_state_dict(sd['optimizer'])
  330. if lr_scheduler is not None:
  331. lr_scheduler.load_state_dict(sd['lr_scheduler'])
  332. except KeyError:
  333. print_rank_0(
  334. 'Unable to load optimizer from checkpoint {}, exiting. '
  335. 'Specify --no-load-optim or --finetune to prevent '
  336. 'attempting to load the optimizer '
  337. 'state.'.format(checkpoint_name))
  338. # Iterations.
  339. if args.finetune or release:
  340. iteration = 0
  341. else:
  342. try:
  343. iteration = sd['iteration']
  344. except KeyError:
  345. try: # Backward compatible with older checkpoints
  346. iteration = sd['total_iters']
  347. except KeyError:
  348. print_rank_0(
  349. 'A metadata file exists but Unable to load iteration '
  350. ' from checkpoint {}, starting from 0 iteration'.format(
  351. checkpoint_name))
  352. iteration = 0
  353. # rng states.
  354. if not release and not args.finetune and not args.no_load_rng:
  355. try:
  356. random.setstate(sd['random_rng_state'])
  357. np.random.set_state(sd['np_rng_state'])
  358. torch.set_rng_state(sd['torch_rng_state'])
  359. torch.cuda.set_rng_state(sd['cuda_rng_state'])
  360. mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
  361. except KeyError:
  362. print_rank_0(
  363. 'Unable to load random state from checkpoint {}, exiting. '
  364. 'Specify --no-load-rng or --finetune to prevent '
  365. 'attempting to load the random '
  366. 'state.'.format(checkpoint_name))
  367. if mpu.get_data_parallel_rank() == 0:
  368. print(' successfully loaded {}'.format(checkpoint_name))
  369. return iteration
  370. def load_weights(src, dst, dst2src=False):
  371. """
  372. Loads weights from src to dst via in place copy.
  373. src is a huggingface gpt2model, while dst is one of our models.
  374. dst2src=True loads parameters from our models into huggingface's.
  375. ^dst2src is still untested
  376. """
  377. conv_layer = 'Conv1D' in str(type(src))
  378. for n, p in src.named_parameters():
  379. if dst2src:
  380. data = dst._parameters[n].data
  381. load = p.data
  382. else:
  383. data = p.data
  384. load = dst._parameters[n].data
  385. if conv_layer and 'weight' in n:
  386. data = data.t().contiguous()
  387. load.copy_(data)
  388. # dst._parameters[n].data.copy_(data)
  389. def load_mlp(our, oai, dst2src=False):
  390. load_weights(oai.c_fc, our.dense_h_to_4h, dst2src)
  391. load_weights(oai.c_proj, our.dense_4h_to_h, dst2src)
  392. def load_attention(our, oai, dst2src=False):
  393. load_weights(oai.c_attn, our.query_key_value, dst2src)
  394. load_weights(oai.c_proj, our.dense, dst2src)
  395. def load_transformer_layer(our, oai, dst2src=False):
  396. load_weights(oai.ln_1, our.input_layernorm, dst2src)
  397. load_weights(oai.ln_2, our.post_attention_layernorm, dst2src)
  398. load_mlp(our.mlp, oai.mlp, dst2src)
  399. load_attention(our.attention, oai.attn, dst2src)
  400. def move_weights(our, oai, dst2src=False):
  401. """
  402. Loads weights from `oai` to `our` via in place copy.
  403. `oai` is a huggingface gpt2model, while `our` is one of our models.
  404. dst2src=True loads parameters from our models into huggingface's.
  405. ^dst2src=True is still untested
  406. """
  407. # while isinstance(our, (torchDDP, model.distributed.DistributedDataParallel, FP16_Module)):
  408. # our=our.module
  409. transformer_model = oai.transformer
  410. load_weights(transformer_model.ln_f, our.transformer.final_layernorm,
  411. dst2src)
  412. load_weights(transformer_model.wte, our.word_embeddings, dst2src)
  413. load_weights(transformer_model.wpe, our.position_embeddings, dst2src)
  414. for our_layer, oai_layer in zip(our.transformer.layers, oai.transformer.h):
  415. load_transformer_layer(our_layer, oai_layer, dst2src)
  416. def debug_finetune_data(local_vars, batch_id, tokenizer):
  417. tokens, target_ids = local_vars['tokens'], local_vars['target_ids']
  418. attention_mask, logit_mask, position_ids = local_vars[
  419. 'attention_mask'], local_vars['logit_mask'], local_vars['position_ids']
  420. output_tokens = []
  421. sep = attention_mask[batch_id].item()
  422. for i, token in enumerate(tokens[batch_id][:sep].tolist()):
  423. token = tokenizer.IdToToken(token)
  424. if token == '[MASK]':
  425. token = f'[{position_ids[batch_id][0, i].item()}]'
  426. output_tokens.append(token)
  427. print(' '.join(output_tokens))
  428. target_positions = []
  429. for i in range(sep, tokens.size(-1)):
  430. if logit_mask[batch_id][i]:
  431. target_positions.append(i)
  432. print(target_positions)
  433. print(tokenizer.DecodeIds(tokens[batch_id][target_positions].tolist()))
  434. if len(target_ids.shape) > 2:
  435. print(
  436. tokenizer.DecodeIds(
  437. target_ids[batch_id][target_positions].tolist()))
  438. else:
  439. print(tokenizer.DecodeIds(target_ids[batch_id].tolist()))
  440. print(position_ids[batch_id][:, target_positions])