add multinode support via slurm trainer
lessw2020 committed Feb 16, 2024
1 parent 196d56e commit 6ee8941
Showing 2 changed files with 53 additions and 2 deletions.
51 changes: 51 additions & 0 deletions multinode_trainer.slurm
@@ -0,0 +1,51 @@
#!/bin/bash
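# Requests 2 nodes with 8 GPUs each (16 ranks total) on the "train" partition.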

#SBATCH --job-name=torchtrain_multi_node

#SBATCH --ntasks=2

#SBATCH --nodes=2

#SBATCH --gpus-per-task=8

#SBATCH --cpus-per-task=96

#SBATCH --partition=train


# Resolve the hostnames allocated to this job; the first node acts as the rendezvous head.
nodes=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )
head_node=${nodes[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO
# Use the EFA libfabric provider (e.g. on AWS A100 instances with EFA networking)
export FI_PROVIDER="efa"
# P2P is left enabled; uncomment to disable GPU peer-to-peer transfers
# export NCCL_P2P_DISABLE=1
# EFA traffic goes over libfabric rather than InfiniBand verbs, so disable NCCL's IB transport
export NCCL_IB_DISABLE=1

# debugging flags (optional)
export NCCL_DEBUG=WARN
export PYTHONFAULTHANDLER=1
# optional debug settings
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV

export LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
export CUDA_LAUNCH_BLOCKING=0

# on your cluster you might need these:
# set the network interface
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
export NCCL_BUFFSIZE=2097152
#export TORCH_DIST_INIT_BARRIER=1
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0

dcgmi profile --pause
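# (DCGM profiling counters are paused for the run and resumed below, so they do not contend with the job's own profiling)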
# adjust the sbatch --ntasks and --nodes values above and the --nnodes value below
# to match your node count, and point the launch command at your training script.
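# torchrun spawns 8 workers per node; all nodes rendezvous via c10d at the head node on port 29500.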
srun torchrun --nnodes 2 --nproc_per_node 8 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" ./train.py --steps 10
dcgmi profile --resume
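
As a usage sketch (assuming this file sits at the repository root and the cluster exposes a matching "train" partition), the job would be submitted and checked with:

  sbatch multinode_trainer.slurm
  squeue -u $USER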
4 changes: 2 additions & 2 deletions torchtrain/profiling.py
@@ -47,15 +47,15 @@ def trace_handler(prof):
         curr_trace_dir_name = "iteration_" + str(_global_iter_count)
         curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name)
         if not os.path.exists(curr_trace_dir):
-            os.makedirs(curr_trace_dir)
+            os.makedirs(curr_trace_dir, exist_ok=True)
         rank0_log(f"exporting profile traces to {curr_trace_dir}")

         prof.export_chrome_trace(f"{curr_trace_dir}/rank{rank}_trace.json")

     rank0_log(f"Profiling active. Traces will be saved at {trace_dir}")

     if not os.path.exists(trace_dir):
-        os.makedirs(trace_dir)
+        os.makedirs(trace_dir, exist_ok=True)

     with torch.profiler.profile(
         activities=[
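
The switch to exist_ok=True makes directory creation idempotent: when several ranks enter trace_handler at the same step, one rank can create the directory between another rank's os.path.exists check and its os.makedirs call, and the unguarded makedirs would raise FileExistsError.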
