Pcc int poc #163

Closed
wants to merge 10 commits
Changes from all commits

3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.
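
The diff view above does not show the three deleted lines. If needed, they can be recovered from git history; a rough sketch, assuming a local checkout of the PR branch (the HEAD~10 offset is hypothetical and only matches this PR's commit count if history is linear):

# Print .gitmodules as it existed before this branch deleted it (offset is an assumption)
git show HEAD~10:.gitmodules
# List any submodules still registered after the deletion
git submodule status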

4 changes: 2 additions & 2 deletions configs/debug/diloco.toml
@@ -8,8 +8,8 @@ micro_bs = 8

[optim]
batch_size = 16
warmup_steps = 10
total_steps = 4
warmup_steps = 100
total_steps = 1500

[data]
fake = true
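
The longer warmup/total-step schedule can be exercised end to end with the simulation script modified further down in this diff; a usage sketch, assuming two simulated nodes with one GPU each (the invocation pattern follows the script's own example):

./scripts/simulate_multi_node_diloco.sh 2 1 src/zeroband/train.py @configs/debug/diloco.toml
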
1 change: 1 addition & 0 deletions pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
"pyarrow",
"toposolve",
"psutil",
"pccl @ git+ssh://[email protected]/PrimeIntellect-ai/pccl.git@16110e15#egg=pccl&subdirectory=bindings/python", #todo move to https once open source
]

[project.optional-dependencies]
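
Because the new pccl dependency is pinned to a commit over SSH (private until open-sourced, per the TODO), resolving it requires SSH access to the PrimeIntellect-ai/pccl repository. A minimal install sketch using uv, assuming a GitHub-registered SSH key is loaded:

# Optional sanity check that SSH auth to GitHub works before resolving the pin
ssh -T [email protected]
# Resolve and install the project's dependencies, including the pccl git+ssh pin
uv sync
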
94 changes: 71 additions & 23 deletions scripts/simulate_multi_node_diloco.sh
@@ -1,23 +1,41 @@
#!/bin/bash

#
# simulate multi nodes on one gpu. start N torchrun on X gpu locally.
# example how to run ./scripts/simulate_multi_node.sh 2 1 src/zeroband/train.py @configs/debug/normal.toml
# Simulate multi-node on a single GPU or multiple GPUs.
# Start N torchrun instances on X GPUs locally.
# Example usage:
# ./scripts/simulate_multi_node.sh 2 1 src/zeroband/train.py @configs/debug/normal.toml

# Function to get the total number of available GPUs
get_total_gpus() {
nvidia-smi --query-gpu=name --format=csv,noheader | wc -l
}

# Function to get CUDA devices based on the number of GPUs and index
function get_cuda_devices() {
get_cuda_devices() {
local num_gpu=$1
local index=$2
local start_gpu=$((num_gpu * index))
local end_gpu=$((start_gpu + num_gpu - 1))

if [ "$num_gpu" -eq 1 ]; then
echo $start_gpu
if [ "$TOTAL_GPU" -eq 1 ]; then
echo "0"
elif [ "$num_gpu" -eq 1 ]; then
echo "$start_gpu"
else
echo $(seq -s ',' $start_gpu $end_gpu)
echo "$(seq -s ',' $start_gpu $end_gpu)"
fi
}

# Function to find an available port
find_available_port() {
local port=$1
while ss -tuln | grep -q ":$port "; do
port=$((port + 1))
done
echo $port
}

# Array to store PIDs of child processes
child_pids=()

@@ -35,37 +53,67 @@ cleanup() {
exit
}

# Check if at least three arguments were passed
# Register the cleanup function to be called on SIGINT (Ctrl+C) and SIGTERM
trap cleanup SIGINT SIGTERM

if [ "$#" -lt 3 ]; then
echo "Usage: $0 <N> <initial_peer> <num_gpu> [additional_python_args]"
echo "Usage: $0 <N> <num_gpu_per_node> <python_script> [additional_python_args...]"
echo "Example: $0 2 1 src/zeroband/train.py @configs/debug/normal.toml"
exit 1
fi

N=$1 # Number of ranks/nodes
NUM_GPU=$2 # Number of GPUs per node
shift 2 # Shift the first two arguments so that $@ contains only additional Python arguments

N=$1 # Set N from the first argument
NUM_GPU=$2
shift 2 # Remove the first three arguments so $@ contains only additional Python arguments

# Register the cleanup function to be called on SIGINT (Ctrl+C)
trap cleanup SIGINT
TOTAL_GPU=$(get_total_gpus)

if [ "$NUM_GPU" -gt "$TOTAL_GPU" ]; then
echo "Requested NUM_GPU ($NUM_GPU) exceeds the total available GPUs ($TOTAL_GPU)."
echo "Setting NUM_GPU to $TOTAL_GPU."
NUM_GPU=$TOTAL_GPU
fi

mkdir -p logs

export GLOBAL_ADDR=localhost
export GLOBAL_PORT=${GLOBAL_PORT:-5565}
export GLOBAL_WORLD_SIZE=$N
export BASE_PORT=${BASE_PORT:-10001}
export GLOO_SOCKET_IFNAME=lo

for i in $(seq 0 $(($N - 1 )))
do
> logs/log$i.log
WANDB_MODE=$([ $i -eq 0 ] && echo "online" || echo "online") GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((BASE_PORT + $i)) --nnodes=1 $@ --data.data_rank $i --data.data_world_size $N > logs/log$i.log 2>&1 &
BASE_PORT=${BASE_PORT:-10001}

for i in $(seq 0 $((N - 1))); do
LOG_FILE="logs/log$i.log"
> "$LOG_FILE"

CUDA_DEVICES=$(get_cuda_devices "$NUM_GPU" "$i")

# Find an available port
PORT=$(find_available_port $((BASE_PORT + i)))

echo "Starting rank $i with CUDA_VISIBLE_DEVICES=$CUDA_DEVICES on port $PORT"

WANDB_MODE=$([ "$i" -eq 0 ] && echo "online" || echo "online") \
GLOBAL_UNIQUE_ID=$i \
GLOBAL_RANK=$i \
CUDA_VISIBLE_DEVICES="$CUDA_DEVICES" \
torchrun --nproc_per_node="$NUM_GPU" \
--node_rank=0 \
--rdzv_endpoint=localhost:$PORT \
--rdzv_id=simulate_multi_node \
--rdzv_backend=c10d \
--nnodes=1 \
"$@" \
--data.data_rank "$i" \
--data.data_world_size "$N" \
> "$LOG_FILE" 2>&1 &

child_pids+=($!)
done

tail -f logs/log0.log &
child_pids+=($!)
if [ "$TOTAL_GPU" -ge 1 ]; then
tail -f "logs/log0.log" &
child_pids+=($!)
fi

wait
wait
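
To make the per-rank device arithmetic in the updated get_cuda_devices easy to check, here is an illustrative standalone restatement (not part of the diff; the TOTAL_GPU single-GPU special case is omitted for brevity):

# Illustrative sketch: how ranks map to GPU index ranges (start = num_gpu * index)
get_cuda_devices_sketch() {
    local num_gpu=$1 index=$2
    local start_gpu=$((num_gpu * index))
    local end_gpu=$((start_gpu + num_gpu - 1))
    if [ "$num_gpu" -eq 1 ]; then
        echo "$start_gpu"
    else
        seq -s ',' "$start_gpu" "$end_gpu"
    fi
}
get_cuda_devices_sketch 2 0   # prints 0,1 (rank 0 with two GPUs per node)
get_cuda_devices_sketch 2 1   # prints 2,3 (rank 1 with two GPUs per node)

Each launched rank then receives its own rendezvous port via find_available_port, which starts from BASE_PORT (default 10001) plus the rank index and skips any port that ss reports as already in use.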