Pcc int poc #163

Closed
wants to merge 10 commits
Changes from all commits

3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.
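
The diff view above does not show the three deleted lines. If needed, they can be recovered from git history; a rough sketch, assuming a local checkout of the PR branch (the HEAD~10 offset is hypothetical and only matches this PR's commit count if history is linear):

# Print .gitmodules as it existed before this branch deleted it (offset is an assumption)
git show HEAD~10:.gitmodules
# List any submodules still registered after the deletion
git submodule status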

4 changes: 2 additions & 2 deletions configs/debug/diloco.toml
@@ -8,8 +8,8 @@ micro_bs = 8

[optim]
batch_size = 16
warmup_steps = 10
total_steps = 4
warmup_steps = 100
total_steps = 1500

[data]
fake = true
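
The longer warmup/total-step schedule can be exercised end to end with the simulation script modified further down in this diff; a usage sketch, assuming two simulated nodes with one GPU each (the invocation pattern follows the script's own example):

./scripts/simulate_multi_node_diloco.sh 2 1 src/zeroband/train.py @configs/debug/diloco.toml
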
1 change: 1 addition & 0 deletions pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
"pyarrow",
"toposolve",
"psutil",
"pccl @ git+ssh://[email protected]/PrimeIntellect-ai/pccl.git@16110e15#egg=pccl&subdirectory=bindings/python", #todo move to https once open source
]

[project.optional-dependencies]
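
Because the new pccl dependency is pinned to a commit over SSH (private until open-sourced, per the TODO), resolving it requires SSH access to the PrimeIntellect-ai/pccl repository. A minimal install sketch using uv, assuming a GitHub-registered SSH key is loaded:

# Optional sanity check that SSH auth to GitHub works before resolving the pin
ssh -T [email protected]
# Resolve and install the project's dependencies, including the pccl git+ssh pin
uv sync
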
94 changes: 71 additions & 23 deletions scripts/simulate_multi_node_diloco.sh
@@ -1,23 +1,41 @@
#!/bin/bash

#
# simulate multi nodes on one gpu. start N torchrun on X gpu locally.
# example how to run ./scripts/simulate_multi_node.sh 2 1 src/zeroband/train.py @configs/debug/normal.toml
# Simulate multi-node on a single GPU or multiple GPUs.
# Start N torchrun instances on X GPUs locally.
# Example usage:
# ./scripts/simulate_multi_node.sh 2 1 src/zeroband/train.py @configs/debug/normal.toml

# Function to get the total number of available GPUs
get_total_gpus() {
nvidia-smi --query-gpu=name --format=csv,noheader | wc -l
}

# Function to get CUDA devices based on the number of GPUs and index
function get_cuda_devices() {
get_cuda_devices() {
local num_gpu=$1
local index=$2
local start_gpu=$((num_gpu * index))
local end_gpu=$((start_gpu + num_gpu - 1))

if [ "$num_gpu" -eq 1 ]; then
echo $start_gpu
if [ "$TOTAL_GPU" -eq 1 ]; then
echo "0"
elif [ "$num_gpu" -eq 1 ]; then
echo "$start_gpu"
else
echo $(seq -s ',' $start_gpu $end_gpu)
echo "$(seq -s ',' $start_gpu $end_gpu)"
fi
}

# Function to find an available port
find_available_port() {
local port=$1
while ss -tuln | grep -q ":$port "; do
port=$((port + 1))
done
echo $port
}

# Array to store PIDs of child processes
child_pids=()

@@ -35,37 +53,67 @@ cleanup() {
exit
}

# Check if at least three arguments were passed
# Register the cleanup function to be called on SIGINT (Ctrl+C) and SIGTERM
trap cleanup SIGINT SIGTERM

if [ "$#" -lt 3 ]; then
echo "Usage: $0 <N> <initial_peer> <num_gpu> [additional_python_args]"
echo "Usage: $0 <N> <num_gpu_per_node> <python_script> [additional_python_args...]"
echo "Example: $0 2 1 src/zeroband/train.py @configs/debug/normal.toml"
exit 1
fi

N=$1 # Number of ranks/nodes
NUM_GPU=$2 # Number of GPUs per node
shift 2 # Shift the first two arguments so that $@ contains only additional Python arguments

N=$1 # Set N from the first argument
NUM_GPU=$2
shift 2 # Remove the first three arguments so $@ contains only additional Python arguments

# Register the cleanup function to be called on SIGINT (Ctrl+C)
trap cleanup SIGINT
TOTAL_GPU=$(get_total_gpus)

if [ "$NUM_GPU" -gt "$TOTAL_GPU" ]; then
echo "Requested NUM_GPU ($NUM_GPU) exceeds the total available GPUs ($TOTAL_GPU)."
echo "Setting NUM_GPU to $TOTAL_GPU."
NUM_GPU=$TOTAL_GPU
fi

mkdir -p logs

export GLOBAL_ADDR=localhost
export GLOBAL_PORT=${GLOBAL_PORT:-5565}
export GLOBAL_WORLD_SIZE=$N
export BASE_PORT=${BASE_PORT:-10001}
export GLOO_SOCKET_IFNAME=lo

for i in $(seq 0 $(($N - 1 )))
do
> logs/log$i.log
WANDB_MODE=$([ $i -eq 0 ] && echo "online" || echo "online") GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((BASE_PORT + $i)) --nnodes=1 $@ --data.data_rank $i --data.data_world_size $N > logs/log$i.log 2>&1 &
BASE_PORT=${BASE_PORT:-10001}

for i in $(seq 0 $((N - 1))); do
LOG_FILE="logs/log$i.log"
> "$LOG_FILE"

CUDA_DEVICES=$(get_cuda_devices "$NUM_GPU" "$i")

# Find an available port
PORT=$(find_available_port $((BASE_PORT + i)))

echo "Starting rank $i with CUDA_VISIBLE_DEVICES=$CUDA_DEVICES on port $PORT"

WANDB_MODE=$([ "$i" -eq 0 ] && echo "online" || echo "online") \
GLOBAL_UNIQUE_ID=$i \
GLOBAL_RANK=$i \
CUDA_VISIBLE_DEVICES="$CUDA_DEVICES" \
torchrun --nproc_per_node="$NUM_GPU" \
--node_rank=0 \
--rdzv_endpoint=localhost:$PORT \
--rdzv_id=simulate_multi_node \
--rdzv_backend=c10d \
--nnodes=1 \
"$@" \
--data.data_rank "$i" \
--data.data_world_size "$N" \
> "$LOG_FILE" 2>&1 &

child_pids+=($!)
done

tail -f logs/log0.log &
child_pids+=($!)
if [ "$TOTAL_GPU" -ge 1 ]; then
tail -f "logs/log0.log" &
child_pids+=($!)
fi

wait
wait
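
To make the per-rank device arithmetic in the updated get_cuda_devices easy to check, here is an illustrative standalone restatement (not part of the diff; the TOTAL_GPU single-GPU special case is omitted for brevity):

# Illustrative sketch: how ranks map to GPU index ranges (start = num_gpu * index)
get_cuda_devices_sketch() {
    local num_gpu=$1 index=$2
    local start_gpu=$((num_gpu * index))
    local end_gpu=$((start_gpu + num_gpu - 1))
    if [ "$num_gpu" -eq 1 ]; then
        echo "$start_gpu"
    else
        seq -s ',' "$start_gpu" "$end_gpu"
    fi
}
get_cuda_devices_sketch 2 0   # prints 0,1 (rank 0 with two GPUs per node)
get_cuda_devices_sketch 2 1   # prints 2,3 (rank 1 with two GPUs per node)

Each launched rank then receives its own rendezvous port via find_available_port, which starts from BASE_PORT (default 10001) plus the rank index and skips any port that ss reports as already in use.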