elastic.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import argparse
  15. import os
  16. class Command:
  17. def __init__(self, server, name):
  18. import etcd3
  19. srv, port = server.split(':')
  20. self.etcd = etcd3.client(host=srv, port=port)
  21. self.prefix = "/paddle/" + name
  22. self.node_prefix = self.prefix + '/nodes'
  23. self.np_path = self.prefix + '/np'
  24. def set_np(self, np):
  25. self.etcd.put(self.np_path, f'{np}'.encode('latin-1'))
  26. def scale_np(self, np):
  27. if self.etcd.get(self.np_path)[0] is not None:
  28. self.set_np(np)
  29. return True
  30. return False
  31. def clean(self):
  32. self.etcd.delete_prefix(self.prefix)
  33. def close(self):
  34. self.etcd.close()
  35. if __name__ == '__main__':
  36. parser = argparse.ArgumentParser(description='Elastic Command')
  37. parser.add_argument(
  38. "--elastic_server", type=str, help="etcd server host:port"
  39. )
  40. parser.add_argument("--job_id", type=str, help="job unique id")
  41. parser.add_argument(
  42. "--np",
  43. type=str,
  44. help="job pod/node number, need to be 'MIN' or 'MIN:MAX' format",
  45. )
  46. parser.add_argument("action", type=str, help="action to take")
  47. args = parser.parse_args()
  48. server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER')
  49. name = args.job_id or os.getenv('PADDLE_ELASTIC_JOB_ID')
  50. np = int(args.np.split(":")[0]) or int(os.getenv('PADDLE_ELASTIC_NP', 0))
  51. cmd = Command(server, name)
  52. if args.action == "scale":
  53. cmd.scale_np(np)
  54. if args.action == "clean":
  55. cmd.clean()
  56. print(f"action {args.action} done")
  57. cmd.close()