add multinode support via slurm trainer
lessw2020 committed Feb 16, 2024
1 parent 196d56e commit 6ee8941
Showing 2 changed files with 53 additions and 2 deletions.
51 changes: 51 additions & 0 deletions multinode_trainer.slurm
@@ -0,0 +1,51 @@
#!/bin/bash
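# Requests 2 nodes with 8 GPUs each (16 ranks total) on the "train" partition.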

#SBATCH --job-name=torchtrain_multi_node

#SBATCH --ntasks=2

#SBATCH --nodes=2

#SBATCH --gpus-per-task=8

#SBATCH --cpus-per-task=96

#SBATCH --partition=train


# Resolve the hostnames allocated to this job; the first node acts as the rendezvous head.
nodes=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )
head_node=${nodes[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO
# Use the EFA libfabric provider (e.g. on AWS A100 instances with EFA networking)
export FI_PROVIDER="efa"
# P2P is left enabled; uncomment to disable GPU peer-to-peer transfers
# export NCCL_P2P_DISABLE=1
# EFA traffic goes over libfabric rather than InfiniBand verbs, so disable NCCL's IB transport
export NCCL_IB_DISABLE=1

# debugging flags (optional)
export NCCL_DEBUG=WARN
export PYTHONFAULTHANDLER=1
# optional debug settings
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV

export LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
export CUDA_LAUNCH_BLOCKING=0

# on your cluster you might need these:
# set the network interface
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
export NCCL_BUFFSIZE=2097152
#export TORCH_DIST_INIT_BARRIER=1
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0

dcgmi profile --pause
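# (DCGM profiling counters are paused for the run and resumed below, so they do not contend with the job's own profiling)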
# adjust the sbatch --ntasks and --nodes values above and the --nnodes value below
# to match your node count, and point the launch command at your training script.
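# torchrun spawns 8 workers per node; all nodes rendezvous via c10d at the head node on port 29500.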
srun torchrun --nnodes 2 --nproc_per_node 8 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" ./train.py --steps 10
dcgmi profile --resume
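
As a usage sketch (assuming this file sits at the repository root and the cluster exposes a matching "train" partition), the job would be submitted and checked with:

  sbatch multinode_trainer.slurm
  squeue -u $USER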
4 changes: 2 additions & 2 deletions torchtrain/profiling.py
@@ -47,15 +47,15 @@ def trace_handler(prof):
         curr_trace_dir_name = "iteration_" + str(_global_iter_count)
         curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name)
         if not os.path.exists(curr_trace_dir):
-            os.makedirs(curr_trace_dir)
+            os.makedirs(curr_trace_dir, exist_ok=True)
         rank0_log(f"exporting profile traces to {curr_trace_dir}")

         prof.export_chrome_trace(f"{curr_trace_dir}/rank{rank}_trace.json")

     rank0_log(f"Profiling active. Traces will be saved at {trace_dir}")

     if not os.path.exists(trace_dir):
-        os.makedirs(trace_dir)
+        os.makedirs(trace_dir, exist_ok=True)

     with torch.profiler.profile(
         activities=[
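
The switch to exist_ok=True makes directory creation idempotent: when several ranks enter trace_handler at the same step, one rank can create the directory between another rank's os.path.exists check and its os.makedirs call, and the unguarded makedirs would raise FileExistsError.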
