#!/bin/bash
# train.sh
# Run script: settings of training & test for different tasks.
#
# Usage: train.sh <method> <devices>
#   method   Run name; passed to train.py as `--ckpt_dir ckpts/<method>`.
#   devices  Comma-separated GPU ids, exported to the training command as
#            CUDA_VISIBLE_DEVICES (e.g. "0" or "0,1,2,3").
#
# The task name is obtained from `python3 config.py --print_task` and selects
# the number of epochs. More than one listed device launches training through
# torchrun; exactly one device launches it with plain `python`.
#
# Exit status:
#   2  missing/empty arguments
#   1  the task could not be determined or is not in the table below
#   otherwise, the exit status of the training command

# No `set -e` on purpose: the training command's status is captured explicitly
# so the "finished" timestamp is always printed before the script exits.
set -u

# Print a short usage line to stderr.
usage() {
  printf 'Usage: %s <method> <devices>\n' "${0##*/}" >&2
}

#######################################
# Select the training schedule for a task.
# Arguments: $1 - task name as printed by config.py
# Globals:   epochs, val_last, step (written)
# Returns:   0 if the task is known, 1 otherwise (variables left untouched)
#######################################
set_task_schedule() {
  # val_last and step are assigned per task but are not read anywhere in this
  # script; they are kept so the per-task table stays complete.
  case "$1" in
    'DIS5K')
      epochs=500; val_last=50; step=5 ;;
    'COD')
      epochs=150; val_last=50; step=5 ;;
    'HRSOD')
      epochs=150; val_last=50; step=5 ;;
    'General')
      epochs=200; val_last=50; step=5 ;;
    'General-2K')
      epochs=250; val_last=30; step=2 ;;
    'Matting')
      epochs=150; val_last=50; step=5 ;;
    *)
      return 1 ;;
  esac
}

#######################################
# Count the entries of a comma-separated device list.
# Arguments: $1 - device list, e.g. "0,1,2" (one trailing comma is ignored)
# Outputs:   number of entries on stdout ("0" -> 1, "0,1,2" -> 3)
#######################################
count_devices() {
  local list="${1%,}"
  # Delete everything that is not a comma; what remains is one char per comma.
  local commas="${list//[^,]/}"
  printf '%s\n' "$(( ${#commas} + 1 ))"
}

#######################################
# Entry point: validate arguments, resolve the task schedule, run training.
# Arguments: $1 - method, $2 - devices (see file header)
# Returns:   see "Exit status" in the file header
#######################################
main() {
  if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
    usage
    return 2
  fi
  local method="$1"
  local devices="$2"

  # Settings of training & test for different tasks.
  local task
  if ! task="$(python3 config.py --print_task)"; then
    echo "error: 'python3 config.py --print_task' failed" >&2
    return 1
  fi

  local epochs val_last step
  if ! set_task_schedule "${task}"; then
    # Fail here instead of launching training with an empty --epochs value.
    echo "error: unsupported task '${task}'" >&2
    return 1
  fi

  # Train
  local nproc_per_node
  nproc_per_node="$(count_devices "${devices}")"
  local to_be_distributed="False"
  if (( nproc_per_node > 1 )); then
    to_be_distributed="True"
  fi

  echo "Training started at $(date)"

  # Placeholder checkpoint path; it is always forwarded as --resume.
  local resume_weights_path='path_to_a_pth'

  # Arguments shared by both launch modes.
  local -a train_args=(
    --ckpt_dir "ckpts/${method}"
    --epochs "${epochs}"
    --dist "${to_be_distributed}"
    --resume "${resume_weights_path}"
    --use_accelerate
  )

  local status=0
  if [[ "${to_be_distributed}" == "True" ]]; then
    # Adapt the nproc_per_node by the number of GPUs.
    echo "Multi-GPU mode received..."
    CUDA_VISIBLE_DEVICES="${devices}" \
      torchrun --standalone --nproc_per_node "${nproc_per_node}" \
      train.py "${train_args[@]}" || status=$?
  else
    echo "Single-GPU mode received..."
    # NOTE(review): config.py is queried with `python3` while training is
    # launched with `python`; confirm both resolve to the same interpreter.
    CUDA_VISIBLE_DEVICES="${devices}" \
      python train.py "${train_args[@]}" || status=$?
  fi

  echo "Training finished at $(date)"
  return "${status}"
}

main "$@"