Skip to content

Commit

Permalink
scripts
Browse files: browse the repository at this point in the history
  • Loading branch information
Panlichen committed Apr 11, 2023
1 parent 0ae238e commit cf708ad
Showing 1 changed file with 7 additions and 95 deletions.
102 changes: 7 additions & 95 deletions tools/train.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -41,103 +41,15 @@ export DEV_TRY_ROUND=10
# Settings exported to child processes. Their meaning is defined by the
# consumer of these variables, which is not visible in this script.
export CHECK_REMAINING_SQE_INTERVAL=10000
# NOTE(review): machine-specific absolute path under one user's home
# directory. The trailing "oneflow_cpu_rank_" looks like a file-name prefix
# that the consumer completes (presumably with a rank id) -- confirm.
export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_"

# Only the value that actually takes effect is kept: a preceding
# `export NUM_ITER_ENV=20` was overwritten immediately and had no effect.
export NUM_ITER_ENV=200
# Quoted so the value is printed verbatim (no word splitting or globbing).
echo "NUM_ITER_ENV=$NUM_ITER_ENV"

#######################################
# Export the tuning preset that matches a GPU count.
#
# The selector is quoted, so an unset or empty GPUS no longer makes the test
# command fail with "unary operator expected"; unknown counts are a silent
# no-op, exactly as before.
#
# The section labels (dp / tp / pp / 3d) are kept from the original script;
# presumably they denote data / tensor / pipeline parallel layouts -- confirm.
# Commented-out groups are alternative presets kept for manual switching.
#
# Globals:   GPUS (read when no argument is given); the variables exported
#            in each branch (written)
# Arguments: $1 - GPU count (optional, defaults to $GPUS)
# Returns:   0
#######################################
_apply_gpu_preset() {
  case "${1:-${GPUS:-}}" in
    2)
      export CUDA_VISIBLE_DEVICES=4,5

      #pure dp
      # export RECV_SUCCESS_FACTOR=5
      # export RECV_SUCCESS_THRESHOLD=10000
      # export BASE_CTX_SWITCH_THRESHOLD=100
      # export TOLERANT_UNPROGRESSED_CNT=2000
      # export NUM_TRY_TASKQ_HEAD=40

      #pure tp
      export RECV_SUCCESS_FACTOR=20
      export RECV_SUCCESS_THRESHOLD=10000
      export BASE_CTX_SWITCH_THRESHOLD=120
      export TOLERANT_UNPROGRESSED_CNT=10000
      export NUM_TRY_TASKQ_HEAD=100
      ;;
    4)
      export CUDA_VISIBLE_DEVICES=0,1,4,5

      #pure dp
      # export ONEFLOW_OFCCL_SKIP_NEGO=0
      # export RECV_SUCCESS_FACTOR=5
      # export RECV_SUCCESS_THRESHOLD=10000
      # export BASE_CTX_SWITCH_THRESHOLD=80
      # export TOLERANT_UNPROGRESSED_CNT=10000
      # export NUM_TRY_TASKQ_HEAD=50

      #pure tp
      # ONEFLOW_OFCCL_SKIP_NEGO was exported twice with the same value in
      # this branch; one assignment is sufficient.
      export ONEFLOW_OFCCL_SKIP_NEGO=0
      export RECV_SUCCESS_FACTOR=40
      export RECV_SUCCESS_THRESHOLD=10000
      export BASE_CTX_SWITCH_THRESHOLD=3000
      export TOLERANT_UNPROGRESSED_CNT=16000
      export NUM_TRY_TASKQ_HEAD=200
      ;;
    8)
      # This branch does not set CUDA_VISIBLE_DEVICES.

      #pure dp
      export ONEFLOW_OFCCL_SKIP_NEGO=0
      export RECV_SUCCESS_FACTOR=10
      export RECV_SUCCESS_THRESHOLD=10000
      export BASE_CTX_SWITCH_THRESHOLD=100000
      export TOLERANT_UNPROGRESSED_CNT=88000
      export NUM_TRY_TASKQ_HEAD=240

      #pure tp
      # export ONEFLOW_OFCCL_SKIP_NEGO=1
      # export RECV_SUCCESS_FACTOR=5
      # export RECV_SUCCESS_THRESHOLD=10000
      # export BASE_CTX_SWITCH_THRESHOLD=4000
      # export TOLERANT_UNPROGRESSED_CNT=8000
      # export NUM_TRY_TASKQ_HEAD=10

      #3d
      # export ONEFLOW_OFCCL_SKIP_NEGO=0
      # export RECV_SUCCESS_FACTOR=5
      # export RECV_SUCCESS_THRESHOLD=10000
      # export BASE_CTX_SWITCH_THRESHOLD=8000
      # export TOLERANT_UNPROGRESSED_CNT=80000
      # export NUM_TRY_TASKQ_HEAD=10

      #2dp4pp
      # export ONEFLOW_OFCCL_SKIP_NEGO=0
      # export RECV_SUCCESS_FACTOR=5
      # export RECV_SUCCESS_THRESHOLD=10000
      # export BASE_CTX_SWITCH_THRESHOLD=8000
      # export TOLERANT_UNPROGRESSED_CNT=80000
      # export NUM_TRY_TASKQ_HEAD=10

      #2tp4pp
      # export ONEFLOW_OFCCL_SKIP_NEGO=1
      # export RECV_SUCCESS_FACTOR=10
      # export RECV_SUCCESS_THRESHOLD=10000
      # export BASE_CTX_SWITCH_THRESHOLD=12000
      # export TOLERANT_UNPROGRESSED_CNT=8000
      # export NUM_TRY_TASKQ_HEAD=10

      #4tp2pp
      # export ONEFLOW_OFCCL_SKIP_NEGO=1
      # export RECV_SUCCESS_FACTOR=10
      # export RECV_SUCCESS_THRESHOLD=10000
      # export BASE_CTX_SWITCH_THRESHOLD=14000
      # export TOLERANT_UNPROGRESSED_CNT=8000
      # export NUM_TRY_TASKQ_HEAD=10

      #4tp2dp
      # export ONEFLOW_OFCCL_SKIP_NEGO=0
      # export RECV_SUCCESS_FACTOR=5
      # export RECV_SUCCESS_THRESHOLD=10000
      # export BASE_CTX_SWITCH_THRESHOLD=8000
      # export TOLERANT_UNPROGRESSED_CNT=9000
      # export NUM_TRY_TASKQ_HEAD=10
      ;;
    *)
      # No preset for this GPU count: leave the environment untouched.
      ;;
  esac
}

# Apply the preset for the GPU count requested by the caller's environment.
_apply_gpu_preset "${GPUS:-}"
# Unconditional tuning values. Being plain assignments, they replace whatever
# these variables held before this point in the script.
ONEFLOW_OFCCL_SKIP_NEGO=0
RECV_SUCCESS_FACTOR=5
RECV_SUCCESS_THRESHOLD=10000000
BASE_CTX_SWITCH_THRESHOLD=20000
TOLERANT_UNPROGRESSED_CNT=80000
NUM_TRY_TASKQ_HEAD=10
# Publish all six to the environment of child processes in one statement.
export ONEFLOW_OFCCL_SKIP_NEGO RECV_SUCCESS_FACTOR RECV_SUCCESS_THRESHOLD \
  BASE_CTX_SWITCH_THRESHOLD TOLERANT_UNPROGRESSED_CNT NUM_TRY_TASKQ_HEAD

# Log the settings in effect. The expansions are quoted and printed through
# printf so values containing whitespace or glob characters appear verbatim.
printf '%s\n' "GPUS=$GPUS"
printf '%s\n' "ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL"
Expand Down

0 comments on commit cf708ad

Please sign in to comment.