-
Notifications
You must be signed in to change notification settings - Fork 244
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add multinode support via slurm trainer
- Loading branch information
Showing
2 changed files
with
53 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/bin/bash
#SBATCH --job-name=torchtrain_multi_node
#SBATCH --ntasks=2
#SBATCH --nodes=2
#SBATCH --gpus-per-task=8
#SBATCH --cpus-per-task=96
#SBATCH --partition=train

# Multi-node training launcher for Slurm.
#
# Submits one torchrun agent per node (via srun) and points every agent at a
# c10d rendezvous endpoint hosted on the first node of the allocation.
#
# Usage: sbatch <this script>
# Requires: scontrol, srun, torchrun and dcgmi on the compute nodes, and
#           ./train.py relative to the submission directory.

# Fail early with a clear message when run outside a Slurm allocation.
: "${SLURM_JOB_NODELIST:?must be run inside a Slurm allocation (sbatch/salloc)}"

# Expand the compact node list (e.g. "node[1-2]") into one hostname per entry.
mapfile -t nodes_array < <(scontrol show hostnames "$SLURM_JOB_NODELIST")
head_node=${nodes_array[0]}

# Ask the head node for its own address; it becomes the rendezvous endpoint.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
# hostname may print several space-separated addresses; keep only the first so
# the "<ip>:<port>" endpoint below stays well-formed.
head_node_ip=${head_node_ip%% *}

if [[ -z "$head_node_ip" ]]; then
  echo "error: could not determine IP address of head node '$head_node'" >&2
  exit 1
fi

echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO
# Enable for A100
export FI_PROVIDER="efa"
# Ensure that P2P is available
# export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1

# debugging flags (optional)
export NCCL_DEBUG=WARN
export PYTHONFAULTHANDLER=1
# optional debug settings
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV

# Prepend the EFA and /usr/local library directories. The ${VAR:+...} form
# avoids a trailing ':' (an empty entry, which the loader treats as the
# current directory) when LD_LIBRARY_PATH was previously unset.
export LD_LIBRARY_PATH="/usr/local/lib/:/opt/amazon/efa/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export CUDA_LAUNCH_BLOCKING=0

# on your cluster you might need these:
# set the network interface
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
export NCCL_BUFFSIZE=2097152
#export TORCH_DIST_INIT_BARRIER=1
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0

# Pause DCGM profiling for the duration of the training run.
dcgmi profile --pause

# adjust sbatch --ntasks and sbatch --nodes above and --nnodes below
# to your specific node count, and update target launch file.
srun torchrun --nnodes 2 --nproc_per_node 8 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" ./train.py --steps 10
train_status=$?

# Always resume profiling, then report the training run's status as the job's
# exit code so a failed run is not masked by a successful `dcgmi` call.
dcgmi profile --resume
exit "$train_status"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters