From 2c04df94cfd34b37b9ce2c27f937afdba8e79ecc Mon Sep 17 00:00:00 2001
From: snordmann
Date: Thu, 24 Oct 2024 02:08:07 +0300
Subject: [PATCH 01/20] working simple benchmark

---
 csrc/multidevice/communicator.cpp      |   2 +
 tests/cpp/test_multidevice_overlap.cpp | 103 +++++++++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 8197ea224f4..ae6fc1fd9b4 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -196,6 +196,8 @@ Communicator::Communicator(
     return;
   }

+  cudaSetDevice(local_rank_);
+
 #ifdef NVFUSER_DISTRIBUTED
   c10d::TCPStoreOptions store_opts;
   {
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 39cab67cd13..5def14c8045 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include

 namespace nvfuser {

@@ -40,6 +41,108 @@ void synchronizeStreams(const std::vector<c10::cuda::CUDAStream>& streams) {
 }

 } // namespace

using OverlapBenchmarkParams = std::tuple<
    CommunicatorBackend,
    /*S=*/int64_t,
    /*M=*/int64_t,
    /*K=*/int64_t,
    /*N=*/int64_t,
    /*number_of_streams=*/int64_t>;

class OverlapBenchmark : public MultiDeviceTest,
                         public testing::WithParamInterface<OverlapBenchmarkParams> {
 protected:
  static std::unordered_map<std::string, float> times;

  static void TearDownTestSuite() {
    auto rank = Communicator::getInstance().deviceId();
    for (auto it : times) {
      std::cout << "rank " << rank << ": " << it.first << ": " << it.second
                << std::endl;
    }
  }
};

std::unordered_map<std::string, float> OverlapBenchmark::times = {};

TEST_P(OverlapBenchmark, DummyBenchmark) {
  constexpr int64_t number_of_warmups = 120;
  constexpr int64_t number_of_iterations = 500;
  const int64_t D = communicator_->size();
  auto [backend,
        S,
        M,
        K,
        N,
        number_of_streams] = GetParam();

  GTEST_ASSERT_EQ(M % S, 0);

  auto world = communicator_->getWorld(backend);

  std::vector<c10::cuda::CUDAStream> streams =
      createStreams(number_of_streams, communicator_->deviceId());

  auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
  auto ta = at::randn({S, M/S,K}, options);
  auto ta_unsharded = at::empty({S, D, M/S,K}, options);
  auto tb = at::randn({K,N}, options);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  for (const auto& iteration :
       c10::irange(number_of_warmups + number_of_iterations)) {
    if (iteration == number_of_warmups) {
      cudaEventRecord(start);
    }
    for (auto j : c10::irange(S)) {
      int64_t stream_index = j % streams.size();
      setCurrentCUDAStream(streams.at(stream_index));

      auto ta_j = ta.select(0, j);
      auto ta_unsharded_j = ta_unsharded.select(0, j);

      // communication
      world->_allgather_base(ta_unsharded_j, ta_j)->wait();
      // compute
      auto tc_j = torch::matmul(ta_unsharded_j,tb);
    }
    setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId()));
    synchronizeStreams(streams);
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  milliseconds /= number_of_iterations;

  std::string test_name =
      ::testing::UnitTest::GetInstance()->current_test_info()->name();
  times.insert({test_name, milliseconds});
}

INSTANTIATE_TEST_SUITE_P(
    ,
    OverlapBenchmark,
    testing::Combine(
        testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
        /*S=*/testing::Values(1,2,4,8),
        /*M=*/testing::Values(pow(2,10), pow(2,15)),
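        // Grid note: every (S, M) pair in this sweep satisfies M % S == 0,
        // which the test asserts before slicing ta into S chunks of M/S rows.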
/*K=*/testing::Values(pow(2,10), pow(2,15)), + /*N=*/testing::Values(pow(2,10)), + /*number_of_streams=*/testing::Values(3, 8)), + [](const testing::TestParamInfo& info) + -> std::string { + std::ostringstream os; + os << /*backend*/std::get<0>(info.param) << "_" + << "S" << std::get<1>(info.param) << "_" + << "M" << std::get<2>(info.param) << "_" + << "K" << std::get<3>(info.param) << "_" + << "N" << std::get<4>(info.param) << "_" + << "Streams" << std::get<5>(info.param); + return os.str(); + }); + + struct OverlapTestParams { // Tensors sizes int64_t M = std::pow(2, 6); From af36cf14ac7622945ba6ec6ff8ce68434cc94230 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 25 Oct 2024 03:54:42 +0300 Subject: [PATCH 02/20] minor --- tests/cpp/test_multidevice_overlap.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 5def14c8045..76e8192d1fd 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,7 @@ using OverlapBenchmarkParams = std::tuple< class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: - static std::unordered_map times; + static std::map times; static void TearDownTestSuite() { auto rank = Communicator::getInstance().deviceId(); @@ -61,11 +61,13 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf } }; -std::unordered_map OverlapBenchmark::times = {}; +std::map OverlapBenchmark::times = {}; TEST_P(OverlapBenchmark, DummyBenchmark) { - constexpr int64_t number_of_warmups = 120; - constexpr int64_t number_of_iterations = 500; + int64_t number_of_warmups = 50; + constexpr int64_t number_of_iterations = 100; + + const int64_t D = communicator_->size(); auto [backend, S, @@ -118,6 +120,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); + std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; } INSTANTIATE_TEST_SUITE_P( From 68b858a7fcd16e0f79fd62cafe6401496c924c60 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 25 Oct 2024 07:22:06 -0700 Subject: [PATCH 03/20] test script --- bench/process_outputs | 2 ++ bench/test | 35 ++++++++++++++++++++++++++ tests/cpp/test_multidevice_overlap.cpp | 4 +-- 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 bench/process_outputs create mode 100755 bench/test diff --git a/bench/process_outputs b/bench/process_outputs new file mode 100644 index 00000000000..139597f9cb0 --- /dev/null +++ b/bench/process_outputs @@ -0,0 +1,2 @@ + + diff --git a/bench/test b/bench/test new file mode 100755 index 00000000000..f0d5728fb4b --- /dev/null +++ b/bench/test @@ -0,0 +1,35 @@ +#!/bin/bash +EXPERIMENT=tl_nccl +DATE=$(date +%Y%m%d-%H%M) +LOG_BASE="/opt/pytorch/Fuser/bench/logs" + +export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" + +mkdir -p $LOGS +LOG_FILE_INFO="${LOGS}/info" +echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO + +NP=8 +BACKEND=UCC +S=* +M=* +K=* +N=* +Streams=* +export GTEST_FILTER="OverlapBenchmark.DummyBenchmark/${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO + +MPIFLAGS=" -np $NP" +MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x 
UCC_CL_BASIC_TLS=nccl" +echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO + +TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" +echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO + +CMD="mpirun $MPIFLAGS $TEST_CMD" +echo $CMD | tee -a $LOG_FILE_INFO +$CMD | tee -a $LOG_FILE_INFO + diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 76e8192d1fd..b8c998618b4 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -128,11 +128,11 @@ INSTANTIATE_TEST_SUITE_P( OverlapBenchmark, testing::Combine( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), - /*S=*/testing::Values(1,2,4,8), + /*S=*/testing::Values(1,2,4,8, 16, 32), /*M=*/testing::Values(pow(2,10), pow(2,15)), /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), - /*number_of_streams=*/testing::Values(3, 8)), + /*number_of_streams=*/testing::Values(3, 8, 32)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; From 0c3493b6c1782d27b5f417d2237751a0b37bf8df Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 28 Oct 2024 13:13:09 +0200 Subject: [PATCH 04/20] minor --- bench/process_outputs | 5 +++++ bench/test | 2 +- tests/cpp/test_multidevice_overlap.cpp | 5 ++++- 3 files changed, 10 insertions(+), 2 deletions(-) mode change 100644 => 100755 bench/process_outputs diff --git a/bench/process_outputs b/bench/process_outputs old mode 100644 new mode 100755 index 139597f9cb0..c1781394dbc --- a/bench/process_outputs +++ b/bench/process_outputs @@ -1,2 +1,7 @@ +#!/bin/bash +FILE="/opt/pytorch/Fuser/bench/logs/${1}/info" +cat $FILE | grep "rank 0: " #| awk '{print $4}' + +# | grep -E 'Streams32\b' \ No newline at end of file diff --git a/bench/test b/bench/test index f0d5728fb4b..b6375719387 100755 --- a/bench/test +++ b/bench/test @@ -10,7 +10,7 @@ LOG_FILE_INFO="${LOGS}/info" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=* M=* K=* diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index b8c998618b4..2febd097b62 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -55,8 +55,11 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf static void TearDownTestSuite() { auto rank = Communicator::getInstance().deviceId(); + if (rank != 0) { + return; + } for (auto it: times) { - std::cout << "rank " << rank << ": " << it.first << ": " << it.second << std::endl; + std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl; } } }; From b30b44bb897c0ec290f37f0e0e02d82ceea3421f Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 29 Oct 2024 09:46:09 -0700 Subject: [PATCH 05/20] add nsight profiling --- bench/test | 36 ++++++++++++++++---------- tests/cpp/test_multidevice_overlap.cpp | 7 +++++ 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/bench/test b/bench/test index b6375719387..8ce85c8ff0f 100755 --- a/bench/test +++ b/bench/test @@ -1,35 +1,45 @@ #!/bin/bash -EXPERIMENT=tl_nccl +EXPERIMENT=profile DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" mkdir -p $LOGS -LOG_FILE_INFO="${LOGS}/info" +LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=* -M=* -K=* -N=* -Streams=* -export 
GTEST_FILTER="OverlapBenchmark.DummyBenchmark/${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +S=4 +M=32768 +K=32768 +N=1024 +Streams=8 +GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO - +`` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO +MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO -CMD="mpirun $MPIFLAGS $TEST_CMD" -echo $CMD | tee -a $LOG_FILE_INFO -$CMD | tee -a $LOG_FILE_INFO +MPICMD="mpirun $MPIFLAGS $TEST_CMD" +echo $MPICMD | tee -a $LOG_FILE_INFO + +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" + +CMD="${NSYSCMD} ${MPICMD}" +sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" +echo $CMD | tee -a ${LOG_FILE_INFO} +$CMD | tee -a ${LOG_FILE_INFO} diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 2febd097b62..189a0da732c 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace nvfuser { @@ -97,6 +98,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { + if (iteration == 10) { + cudaProfilerStart();; + } if (iteration == number_of_warmups) { cudaEventRecord(start); } @@ -114,6 +118,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { } setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); synchronizeStreams(streams); + if (iteration == 15) { + cudaProfilerStop();; + } } cudaEventRecord(stop); cudaEventSynchronize(stop); From 0592a139918072d66c13790741310ecc195abe45 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 31 Oct 2024 06:39:08 -0700 Subject: [PATCH 06/20] nsight and tl/nccl/ sync mode --- bench/test | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bench/test b/bench/test index 8ce85c8ff0f..b51daa63ebd 100755 --- a/bench/test +++ b/bench/test @@ -1,16 +1,16 @@ #!/bin/bash -EXPERIMENT=profile +EXPERIMENT=profile_driver DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" mkdir -p $LOGS -LOG_FILE_INFO="${LOGS}/info.txt" +export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL +BACKEND=UCC S=4 M=32768 K=32768 @@ -28,6 +28,7 @@ MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=driver" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" From 0037b1e9b1398b9518a80011b1601f7e4f6cda5a Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 05:12:10 -0800 Subject: [PATCH 07/20] add cuStreamWriteValue but 
linkage error --- bench/test | 7 ++++--- tests/cpp/test_multidevice_overlap.cpp | 29 ++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/bench/test b/bench/test index b51daa63ebd..2856cff9074 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_driver +EXPERIMENT=profile_ncc_max_connection2 DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,7 +10,7 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=4 M=32768 K=32768 @@ -28,7 +28,8 @@ MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=driver" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 189a0da732c..8fdaf8afdd9 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,8 @@ using OverlapBenchmarkParams = std::tuple< /*M=*/int64_t, /*K=*/int64_t, /*N=*/int64_t, - /*number_of_streams=*/int64_t>; + /*number_of_streams=*/int64_t, + /*add_cuStreamWriteValue32=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -78,7 +80,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { M, K, N, - number_of_streams] = GetParam(); + number_of_streams, + add_cuStreamWriteValue32] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -96,6 +99,13 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { cudaEventCreate(&start); cudaEventCreate(&stop); + // CUdeviceptr pDevice; + // void* ptr; + // if (add_cuStreamWriteValue32) { + // cudaMallocHost(&ptr, 32); + // cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + // } + for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { if (iteration == 10) { @@ -113,6 +123,11 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); + + // if (add_cuStreamWriteValue32) { + // cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + // } + // compute auto tc_j = torch::matmul(ta_unsharded_j,tb); } @@ -131,6 +146,10 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + + // if (add_cuStreamWriteValue32) { + // cudaFree(ptr); + // } } INSTANTIATE_TEST_SUITE_P( @@ -142,7 +161,8 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15)), /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), - /*number_of_streams=*/testing::Values(3, 8, 32)), + /*number_of_streams=*/testing::Values(3, 8, 32), + /*add_cuStreamWriteValue32*/testing::Values(false)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -151,7 +171,8 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<2>(info.param) << "_" << "K" << std::get<3>(info.param) << "_" << "N" << 
std::get<4>(info.param) << "_" - << "Streams" << std::get<5>(info.param); + << "Streams" << std::get<5>(info.param) << "_" + << ((std::get<6>(info.param))? "With" : "Without") << "cuStreamWriteValue32"; return os.str(); }); From ec71e233de02deefb609bd81d2bd7dd6b3f2451f Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 06:27:56 -0800 Subject: [PATCH 08/20] multiple pgs --- bench/test | 13 +++++++------ tests/cpp/test_multidevice_overlap.cpp | 17 ++++++++++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/bench/test b/bench/test index 2856cff9074..5433bbee9ce 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_ncc_max_connection2 +EXPERIMENT=profile_baseline_NCCL DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,25 +11,26 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=4 +S=1 M=32768 K=32768 N=1024 Streams=8 +Pgs=1 GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -MPIFLAGS+=" -x UCC_COLL_TRACE=info" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" -MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +# MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 8fdaf8afdd9..ff79bb45609 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -50,7 +50,8 @@ using OverlapBenchmarkParams = std::tuple< /*K=*/int64_t, /*N=*/int64_t, /*number_of_streams=*/int64_t, - /*add_cuStreamWriteValue32=*/bool>; + /*add_cuStreamWriteValue32=*/bool, + /*number_of_pgs=*/int64_t>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -81,11 +82,13 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { K, N, number_of_streams, - add_cuStreamWriteValue32] = GetParam(); + add_cuStreamWriteValue32, + number_of_pgs] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); - auto world = communicator_->getWorld(backend); + std::vector all_ranks(communicator_->size()); + std::iota(all_ranks.begin(), all_ranks.end(), 0); std::vector streams = createStreams(number_of_streams, communicator_->deviceId()); @@ -118,6 +121,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); + auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); @@ -162,7 +167,8 @@ INSTANTIATE_TEST_SUITE_P( /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), - /*add_cuStreamWriteValue32*/testing::Values(false)), + /*add_cuStreamWriteValue32*/testing::Values(false), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), 
[](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -172,7 +178,8 @@ INSTANTIATE_TEST_SUITE_P( << "K" << std::get<3>(info.param) << "_" << "N" << std::get<4>(info.param) << "_" << "Streams" << std::get<5>(info.param) << "_" - << ((std::get<6>(info.param))? "With" : "Without") << "cuStreamWriteValue32"; + << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") + << "Pgs" << std::get<7>(info.param); return os.str(); }); From a15fdfc9d84258d38442a78110d57be1a121598c Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 06:39:54 -0800 Subject: [PATCH 09/20] reenable cuStreamValue32 --- bench/test | 9 +++++---- csrc/driver_api.h | 1 + tests/cpp/test_multidevice_overlap.cpp | 26 +++++++++++++------------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/bench/test b/bench/test index 5433bbee9ce..4f3559e283a 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_baseline_NCCL +EXPERIMENT=profile_cuStreamWrite_NCCL DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,14 +11,15 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=1 +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 +cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_Pgs${Pgs}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` @@ -39,7 +40,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/csrc/driver_api.h b/csrc/driver_api.h index b8c413a4054..8105cf855c2 100644 --- a/csrc/driver_api.h +++ b/csrc/driver_api.h @@ -32,6 +32,7 @@ namespace nvfuser { fn(cuModuleGetFunction); \ fn(cuModuleLoadDataEx); \ fn(cuModuleUnload); \ + fn(cuStreamWriteValue32); \ fn(cuOccupancyMaxActiveBlocksPerMultiprocessor) #if (CUDA_VERSION >= 12000) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index ff79bb45609..fef6e9bf468 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -102,12 +102,12 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { cudaEventCreate(&start); cudaEventCreate(&stop); - // CUdeviceptr pDevice; - // void* ptr; - // if (add_cuStreamWriteValue32) { - // cudaMallocHost(&ptr, 32); - // cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); - // } + CUdeviceptr pDevice; + void* ptr; + if (add_cuStreamWriteValue32) { + cudaMallocHost(&ptr, 32); + cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + } for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { @@ -129,9 +129,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); - // if (add_cuStreamWriteValue32) { - // 
cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); - // } + if (add_cuStreamWriteValue32) { + cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } // compute auto tc_j = torch::matmul(ta_unsharded_j,tb); @@ -152,9 +152,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { times.insert({test_name, milliseconds}); std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; - // if (add_cuStreamWriteValue32) { - // cudaFree(ptr); - // } + if (add_cuStreamWriteValue32) { + cudaFree(ptr); + } } INSTANTIATE_TEST_SUITE_P( @@ -167,7 +167,7 @@ INSTANTIATE_TEST_SUITE_P( /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), - /*add_cuStreamWriteValue32*/testing::Values(false), + /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), [](const testing::TestParamInfo& info) -> std::string { From 6682a33b366b3f21a1ced568106e8a3b475c8567 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 07:57:44 -0800 Subject: [PATCH 10/20] add tl/cuda and ec/cuda flags in bash test script --- bench/test | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bench/test b/bench/test index 4f3559e283a..5ad427b4876 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_cuStreamWrite_NCCL +EXPERIMENT=profile_UCC_TL_CUDA DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,14 +10,14 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=8 +BACKEND=UCC +S=4 M=32768 K=32768 N=1024 Streams=8 Pgs=1 -cuStreamWrite=WithcuStreamWriteValue32_ +# cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" @@ -25,11 +25,21 @@ echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" +# MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" +# MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" +# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" + # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO From b01f1f4fe236be4144182cac5cbdcef15c559337 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 08:40:14 -0800 Subject: [PATCH 11/20] add option to unfuse loops --- bench/test | 5 +++-- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/bench/test b/bench/test 
index 5ad427b4876..2102c1eb743 100755 --- a/bench/test +++ b/bench/test @@ -11,15 +11,16 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=UCC -S=4 +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 +UNFUSE="_unfused" # cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index fef6e9bf468..d4b9c757f7a 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,8 @@ using OverlapBenchmarkParams = std::tuple< /*N=*/int64_t, /*number_of_streams=*/int64_t, /*add_cuStreamWriteValue32=*/bool, - /*number_of_pgs=*/int64_t>; + /*number_of_pgs=*/int64_t, + /*unfuse_loops=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -83,7 +84,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { N, number_of_streams, add_cuStreamWriteValue32, - number_of_pgs] = GetParam(); + number_of_pgs, + unfuse_loops] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -132,9 +134,20 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { if (add_cuStreamWriteValue32) { cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); } + if (unfuse_loops == false) { + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } + } + if (unfuse_loops) { + for (auto j : c10::irange(S)) { + int64_t stream_index = j % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + auto ta_unsharded_j = ta_unsharded.select(0, j); - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } } setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); synchronizeStreams(streams); @@ -168,7 +181,8 @@ INSTANTIATE_TEST_SUITE_P( /*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), /*add_cuStreamWriteValue32*/testing::Values(false, true), - /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8), + /*unfuse_loops=*/testing::Values(false, true)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -179,7 +193,8 @@ INSTANTIATE_TEST_SUITE_P( << "N" << std::get<4>(info.param) << "_" << "Streams" << std::get<5>(info.param) << "_" << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") - << "Pgs" << std::get<7>(info.param); + << "Pgs" << std::get<7>(info.param) + << ((std::get<8>(info.param))? "_unfused" : ""); return os.str(); }); From ea7fd37d61ad310c5dcb2d8ca599d8212003ff44 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 5 Nov 2024 02:53:36 -0800 Subject: [PATCH 12/20] add cuda graphs. 
Only working for NCCL and S1 bc there is a syncStream in nccl --- bench/test | 13 ++-- tests/cpp/test_multidevice_overlap.cpp | 84 ++++++++++++++++---------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/bench/test b/bench/test index 2102c1eb743..8a64225d9e9 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_UCC_TL_CUDA +EXPERIMENT=profile_cudaGraph_NCCL_S1 DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,17 +10,18 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC -S=8 +BACKEND=NCCL +S=1 M=32768 K=32768 N=1024 Streams=8 Pgs=1 -UNFUSE="_unfused" +# UNFUSE="_unfused" +GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` @@ -51,7 +52,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index d4b9c757f7a..c93987890b4 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -6,6 +6,7 @@ */ // clang-format on #include +#include #include #include #include @@ -52,7 +53,8 @@ using OverlapBenchmarkParams = std::tuple< /*number_of_streams=*/int64_t, /*add_cuStreamWriteValue32=*/bool, /*number_of_pgs=*/int64_t, - /*unfuse_loops=*/bool>; + /*unfuse_loops=*/bool, + /*use_cuda_graph=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -72,8 +74,11 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf std::map OverlapBenchmark::times = {}; TEST_P(OverlapBenchmark, DummyBenchmark) { - int64_t number_of_warmups = 50; + constexpr int64_t number_of_warmups = 50; constexpr int64_t number_of_iterations = 100; + constexpr int64_t iteration_profiler_start = 10; + constexpr int64_t iteration_profiler_end = 15; + constexpr int64_t iteration_cuda_graph_capture = 5; const int64_t D = communicator_->size(); @@ -85,7 +90,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { number_of_streams, add_cuStreamWriteValue32, number_of_pgs, - unfuse_loops] = GetParam(); + unfuse_loops, + use_cuda_graph] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -94,6 +100,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::vector streams = createStreams(number_of_streams, communicator_->deviceId()); + setCurrentCUDAStream(streams.at(0)); auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); auto ta = at::randn({S, M/S,K}, options); @@ -104,6 +111,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { 
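  // Timing note for the pattern below: cudaEventElapsedTime measures the
  // GPU-side interval between the two cudaEventRecord calls, so host launch
  // overhead and the warmup iterations are excluded from the reported average.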
cudaEventCreate(&start); cudaEventCreate(&stop); + at::cuda::CUDAGraph cuda_graph; + CUdeviceptr pDevice; void* ptr; if (add_cuStreamWriteValue32) { @@ -113,45 +122,56 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { - if (iteration == 10) { + if (iteration == iteration_profiler_start) { cudaProfilerStart();; } if (iteration == number_of_warmups) { cudaEventRecord(start); } - for (auto j : c10::irange(S)) { - int64_t stream_index = j % streams.size(); - setCurrentCUDAStream(streams.at(stream_index)); - - auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); - - auto ta_j = ta.select(0, j); - auto ta_unsharded_j = ta_unsharded.select(0, j); - - // communication - world->_allgather_base(ta_unsharded_j, ta_j)->wait(); - - if (add_cuStreamWriteValue32) { - cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + if (iteration <= iteration_cuda_graph_capture) { + if (iteration == iteration_cuda_graph_capture) { + cuda_graph.capture_begin(); } - if (unfuse_loops == false) { - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); - } - } - if (unfuse_loops) { for (auto j : c10::irange(S)) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); + + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); + + auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + // communication + world->_allgather_base(ta_unsharded_j, ta_j)->wait(); + + if (add_cuStreamWriteValue32) { + cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } + if (unfuse_loops == false) { + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } + } + if (unfuse_loops) { + for (auto j : c10::irange(S)) { + int64_t stream_index = j % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + auto ta_unsharded_j = ta_unsharded.select(0, j); + + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } } + if (iteration == iteration_cuda_graph_capture) { + cuda_graph.capture_end(); + } else { + setCurrentCUDAStream(streams.at(0)); + synchronizeStreams(streams); + } + } else { + cuda_graph.replay(); } - setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); - synchronizeStreams(streams); - if (iteration == 15) { + if (iteration == iteration_profiler_end) { cudaProfilerStop();; } } @@ -182,7 +202,8 @@ INSTANTIATE_TEST_SUITE_P( /*number_of_streams=*/testing::Values(3, 8, 32), /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8), - /*unfuse_loops=*/testing::Values(false, true)), + /*unfuse_loops=*/testing::Values(false, true), + /*use_cuda_graph=*/testing::Values(false)), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -194,7 +215,8 @@ INSTANTIATE_TEST_SUITE_P( << "Streams" << std::get<5>(info.param) << "_" << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") << "Pgs" << std::get<7>(info.param) - << ((std::get<8>(info.param))? "_unfused" : ""); + << ((std::get<8>(info.param))? "_unfused" : "") + << ((std::get<9>(info.param))? 
"_WithCudaGraph" : ""); return os.str(); }); From 9dddac2a6320e315f1300febc624a03e084aa54f Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 25 Nov 2024 16:51:59 -0800 Subject: [PATCH 13/20] write matmul to sliced output --- tests/cpp/test_multidevice_overlap.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index c93987890b4..5600041dc7d 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -73,7 +73,7 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf std::map OverlapBenchmark::times = {}; -TEST_P(OverlapBenchmark, DummyBenchmark) { +TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { constexpr int64_t number_of_warmups = 50; constexpr int64_t number_of_iterations = 100; constexpr int64_t iteration_profiler_start = 10; @@ -106,6 +106,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { auto ta = at::randn({S, M/S,K}, options); auto ta_unsharded = at::empty({S, D, M/S,K}, options); auto tb = at::randn({K,N}, options); + auto tc = at::empty({S, D, M/S, N}, options); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -140,6 +141,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); + auto tc_j = ta_unsharded.select(0, j); // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); @@ -149,7 +151,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { } if (unfuse_loops == false) { // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + torch::matmul_out(tc_j, ta_unsharded_j,tb); } } if (unfuse_loops) { @@ -157,9 +159,10 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); auto ta_unsharded_j = ta_unsharded.select(0, j); + auto tc_j = ta_unsharded.select(0, j); // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + torch::matmul_out(tc_j, ta_unsharded_j,tb); } } if (iteration == iteration_cuda_graph_capture) { From faf8bbe6b9c1ddf19b31069d66387b060481e9bf Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 28 Nov 2024 08:12:18 -0800 Subject: [PATCH 14/20] wip cuStreamWriteValue not working --- bench/test | 14 +- tests/cpp/test_multidevice_overlap.cpp | 219 +++++++++++++++++++++++-- 2 files changed, 216 insertions(+), 17 deletions(-) diff --git a/bench/test b/bench/test index 8a64225d9e9..c27cb9ce74b 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_cudaGraph_NCCL_S1 +EXPERIMENT=profile_NCCL_with_cuStreamValue DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,17 +10,17 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=1 +BACKEND=UCC +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 # UNFUSE="_unfused" -GRAPH="_WithCudaGraph" +# GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ -GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" +GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -42,7 +42,7 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -# 
MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO @@ -52,7 +52,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 5600041dc7d..0d55580a11a 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -20,6 +20,8 @@ #include #include +#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 1 + namespace nvfuser { namespace { @@ -44,6 +46,190 @@ void synchronizeStreams(const std::vector& streams) { } // namespace +TEST_F(NVFuserTest, cuStreamWriteValue32) { + constexpr cuuint32_t value = 3; + cudaError_t error; + CUdeviceptr pDevice; + volatile cuuint32_t* ptr; + error = cudaSetDevice(0); + ASSERT_EQ(error, 0); + error = cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); + ASSERT_EQ(error, 0); + error = cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); + ASSERT_EQ(error, 0); + + at::cuda::CUDAStream c10_stream = at::cuda::getStreamFromPool( + /*isHighPriority=*/true, /*device_index*/0); + CUstream stream = c10_stream.stream(); + CUresult st; + st = cuStreamWriteValue32(stream, pDevice, value, /*flag=*/0); + ASSERT_EQ(st, 0); + + torch::cuda::synchronize(); + cuuint32_t ptr2; + error = cudaMemcpy(&ptr2, (void*)pDevice, sizeof(cuuint32_t), cudaMemcpyDeviceToHost); + ASSERT_EQ(error, 0); + ASSERT_EQ(ptr2, value); + + + int i = 0; + while (i < 10000000) { + if (*ptr == value) { + std::cout << " BREAK " << *ptr < DummyOverlapBenchmark::times = {}; + +TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { + constexpr int64_t number_of_warmups = 50; + constexpr int64_t number_of_iterations = 100; + constexpr int64_t iteration_profiler_start = 10; + constexpr int64_t iteration_profiler_end = 15; + + + auto [backend, + M, + K, + N, + L, + number_of_streams, + add_cuStreamWriteValue32, + number_of_pgs] = GetParam(); + + std::vector all_ranks(communicator_->size()); + std::iota(all_ranks.begin(), all_ranks.end(), 0); + + std::vector streams = + createStreams(number_of_streams, communicator_->deviceId()); + setCurrentCUDAStream(streams.at(0)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); + auto ta = at::randn({M, K}, options); + auto tb = at::randn({K, N}, options); + auto tc = at::empty({M, N}, options); + auto src = at::randn({L}, options); + auto dst = at::empty({L * communicator_->size()}, options); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + CUdeviceptr pDevice; + void* ptr; + if (add_cuStreamWriteValue32) { + cudaMallocHost(&ptr, 32); + cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + } + + for (const auto& iteration : + c10::irange(number_of_warmups + number_of_iterations)) { + if (iteration == iteration_profiler_start) { + cudaProfilerStart();; + } + if (iteration == 
number_of_warmups) { + cudaEventRecord(start); + } + int64_t stream_index = iteration % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(iteration % number_of_pgs)); + + // communication + world->_allgather_base(dst, src)->wait(); + + // compute + torch::matmul_out(tc, ta, tb); + + if (add_cuStreamWriteValue32) { + + cuStreamWriteValue32( +#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS + (CUstream)world->getCudaStream(communicator_->device()).stream(), +#else + (CUstream)streams.at(stream_index).stream(), +#endif + (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } + + setCurrentCUDAStream(streams.at(0)); + synchronizeStreams(streams); + if (iteration == iteration_profiler_end) { + cudaProfilerStop();; + } + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + milliseconds /= number_of_iterations; + + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + times.insert({test_name, milliseconds}); + std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + + if (add_cuStreamWriteValue32) { + cudaFree(ptr); + } +} + +INSTANTIATE_TEST_SUITE_P( + , + DummyOverlapBenchmark, + testing::Combine( + testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), + /*M=*/testing::Values(pow(2,10), pow(2,15)), + /*K=*/testing::Values(pow(2,10), pow(2,15)), + /*N=*/testing::Values(pow(2,10)), + /*L=*/testing::Values(pow(2,15)), + /*number_of_streams=*/testing::Values(1, 8), + /*add_cuStreamWriteValue32*/testing::Values(false, true), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + [](const testing::TestParamInfo& info) + -> std::string { + std::ostringstream os; + os << /*backend*/std::get<0>(info.param) << "_" + << "M" << std::get<1>(info.param) << "_" + << "K" << std::get<2>(info.param) << "_" + << "N" << std::get<3>(info.param) << "_" + << "L" << std::get<4>(info.param) << "_" + << "Streams" << std::get<5>(info.param) << "_" + << ((std::get<6>(info.param))? 
"WithcuStreamWriteValue32_" : "") + << "Pgs" << std::get<7>(info.param); + return os.str(); + }); + using OverlapBenchmarkParams = std::tuple< CommunicatorBackend, /*S=*/int64_t, @@ -115,10 +301,10 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { at::cuda::CUDAGraph cuda_graph; CUdeviceptr pDevice; - void* ptr; + cuuint32_t* ptr; if (add_cuStreamWriteValue32) { - cudaMallocHost(&ptr, 32); - cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); + cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); } for (const auto& iteration : @@ -129,8 +315,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { if (iteration == number_of_warmups) { cudaEventRecord(start); } - if (iteration <= iteration_cuda_graph_capture) { - if (iteration == iteration_cuda_graph_capture) { + if (!use_cuda_graph || (iteration <= iteration_cuda_graph_capture)) { + if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) { cuda_graph.capture_begin(); } for (auto j : c10::irange(S)) { @@ -141,13 +327,22 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); - auto tc_j = ta_unsharded.select(0, j); + auto tc_j = tc.select(0, j); // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); if (add_cuStreamWriteValue32) { - cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + if (!communicator_->deviceId()){ + std::cout << "writing to stream " << world->getCudaStream(communicator_->device()).stream() << " the value " << (cuuint32_t)(iteration * S + j) << ", communicator_->device()=" << communicator_->device() << ", world=" << world << ", number_of_pgs=" << number_of_pgs << " with MACRO=" << CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS << std::endl; + } + cuStreamWriteValue32( +#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS + (CUstream)world->getCudaStream(communicator_->device()).stream(), +#else + // (CUstream)streams.at(stream_index).stream(), +#endif + (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0); } if (unfuse_loops == false) { // compute @@ -159,13 +354,13 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); auto ta_unsharded_j = ta_unsharded.select(0, j); - auto tc_j = ta_unsharded.select(0, j); + auto tc_j = tc.select(0, j); // compute torch::matmul_out(tc_j, ta_unsharded_j,tb); } } - if (iteration == iteration_cuda_graph_capture) { + if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) { cuda_graph.capture_end(); } else { setCurrentCUDAStream(streams.at(0)); @@ -189,7 +384,11 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; if (add_cuStreamWriteValue32) { - cudaFree(ptr); + std::cout << "RANK " << communicator_->device() << " entering while loop. 
Max index=" << (number_of_warmups + number_of_iterations)*S + S << std::endl; + while (*ptr < (cuuint32_t)(number_of_warmups + number_of_iterations)*S + S - 1) { + std::cout << "RANK " << communicator_->device() << " waiting at index=" << *ptr << std::endl; + } + cudaFree((void*)ptr); } } From a6b5fd75896d26a15fc0e2b6a8a66e9e81e60016 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 05:45:31 -0800 Subject: [PATCH 15/20] dummy benchmark --- bench/test | 30 ++++--- tests/cpp/test_multidevice_overlap.cpp | 110 ++++--------------------- 2 files changed, 35 insertions(+), 105 deletions(-) diff --git a/bench/test b/bench/test index c27cb9ce74b..cff8d8b34bb 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_NCCL_with_cuStreamValue +EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,27 +10,33 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=8 -M=32768 +M=131072 #32768 K=32768 -N=1024 +N=32768 #1024 +L=32768 Streams=8 Pgs=1 # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ -GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +# GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" +GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" +# GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO -`` + MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" +# MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" @@ -39,10 +45,12 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" +MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO @@ -52,7 +60,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi 
--cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 0d55580a11a..85059b89a31 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -20,7 +20,7 @@ #include #include -#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 1 +#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 0 namespace nvfuser { @@ -46,54 +46,12 @@ void synchronizeStreams(const std::vector& streams) { } // namespace -TEST_F(NVFuserTest, cuStreamWriteValue32) { - constexpr cuuint32_t value = 3; - cudaError_t error; - CUdeviceptr pDevice; - volatile cuuint32_t* ptr; - error = cudaSetDevice(0); - ASSERT_EQ(error, 0); - error = cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); - ASSERT_EQ(error, 0); - error = cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); - ASSERT_EQ(error, 0); - - at::cuda::CUDAStream c10_stream = at::cuda::getStreamFromPool( - /*isHighPriority=*/true, /*device_index*/0); - CUstream stream = c10_stream.stream(); - CUresult st; - st = cuStreamWriteValue32(stream, pDevice, value, /*flag=*/0); - ASSERT_EQ(st, 0); - - torch::cuda::synchronize(); - cuuint32_t ptr2; - error = cudaMemcpy(&ptr2, (void*)pDevice, sizeof(cuuint32_t), cudaMemcpyDeviceToHost); - ASSERT_EQ(error, 0); - ASSERT_EQ(ptr2, value); - - - int i = 0; - while (i < 10000000) { - if (*ptr == value) { - std::cout << " BREAK " << *ptr <deviceId() << ", " << test_name << " : " << milliseconds << std::endl; - - if (add_cuStreamWriteValue32) { - cudaFree(ptr); - } } INSTANTIATE_TEST_SUITE_P( @@ -209,13 +140,10 @@ INSTANTIATE_TEST_SUITE_P( DummyOverlapBenchmark, testing::Combine( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), - /*M=*/testing::Values(pow(2,10), pow(2,15)), - /*K=*/testing::Values(pow(2,10), pow(2,15)), - /*N=*/testing::Values(pow(2,10)), - /*L=*/testing::Values(pow(2,15)), - /*number_of_streams=*/testing::Values(1, 8), - /*add_cuStreamWriteValue32*/testing::Values(false, true), - /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -223,10 +151,7 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<1>(info.param) << "_" << "K" << std::get<2>(info.param) << "_" << "N" << std::get<3>(info.param) << "_" - << "L" << std::get<4>(info.param) << "_" - << "Streams" << std::get<5>(info.param) << "_" - << ((std::get<6>(info.param))? 
"WithcuStreamWriteValue32_" : "") - << "Pgs" << std::get<7>(info.param); + << "L" << std::get<4>(info.param); return os.str(); }); @@ -333,14 +258,11 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { world->_allgather_base(ta_unsharded_j, ta_j)->wait(); if (add_cuStreamWriteValue32) { - if (!communicator_->deviceId()){ - std::cout << "writing to stream " << world->getCudaStream(communicator_->device()).stream() << " the value " << (cuuint32_t)(iteration * S + j) << ", communicator_->device()=" << communicator_->device() << ", world=" << world << ", number_of_pgs=" << number_of_pgs << " with MACRO=" << CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS << std::endl; - } cuStreamWriteValue32( #if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS (CUstream)world->getCudaStream(communicator_->device()).stream(), #else - // (CUstream)streams.at(stream_index).stream(), + (CUstream)streams.at(stream_index).stream(), #endif (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0); } From 8d927bf4d7537b2ae2450efd775c039c68ebffbe Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 06:45:27 -0800 Subject: [PATCH 16/20] add pre post comms option --- bench/test | 22 ++++++++++++--------- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/bench/test b/bench/test index cff8d8b34bb..28532970124 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE +EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,21 +10,25 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=8 +BACKEND=UCC M=131072 #32768 K=32768 N=32768 #1024 -L=32768 + +S=8 Streams=8 Pgs=1 + +L=32768 +# PRE_COMM="_pre_comm" +POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" # GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,8 +36,8 @@ MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" @@ -45,10 +49,10 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x 
CUDA_DEVICE_MAX_CONNECTIONS=2"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 85059b89a31..9898df02ac8 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -51,7 +51,9 @@ using DummyOverlapBenchmarkParams = std::tuple<
     /*M=*/int64_t,
     /*K=*/int64_t,
     /*N=*/int64_t,
-    /*L(communication msgsize)=*/int64_t>;
+    /*L(communication msgsize)=*/int64_t,
+    /*pre_comm=*/bool,
+    /*post_comm=*/bool>;
 
 class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<DummyOverlapBenchmarkParams> {
  protected:
@@ -81,7 +83,9 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
       M,
       K,
       N,
-      L] = GetParam();
+      L,
+      pre_comm,
+      post_comm] = GetParam();
 
   std::vector<int64_t> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
@@ -112,13 +116,20 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
       cudaEventRecord(start);
     }
 
-    setCurrentCUDAStream(communication_stream);
-    world->_allgather_base(dst, src)->wait();
+    if (pre_comm) {
+      setCurrentCUDAStream(communication_stream);
+      world->_allgather_base(dst, src)->wait();
+    }
 
     // compute
     setCurrentCUDAStream(compute_stream);
     torch::matmul_out(tc, ta, tb);
 
+    if (post_comm) {
+      setCurrentCUDAStream(communication_stream);
+      world->_allgather_base(dst, src)->wait();
+    }
+
     if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
@@ -143,7 +154,9 @@ INSTANTIATE_TEST_SUITE_P(
     /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
-    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))),
+    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
+    /*pre-comm=*/testing::Bool(),
+    /*post-comm=*/testing::Bool()),
     [](const testing::TestParamInfo<DummyOverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -151,7 +164,9 @@ INSTANTIATE_TEST_SUITE_P(
          << "M" << std::get<1>(info.param) << "_"
          << "K" << std::get<2>(info.param) << "_"
          << "N" << std::get<3>(info.param) << "_"
-         << "L" << std::get<4>(info.param);
+         << "L" << std::get<4>(info.param)
+         << ((std::get<5>(info.param))? "_pre_comm" : "")
+         << ((std::get<6>(info.param))? 
"_post_comm" : ""); return os.str(); }); From d9c581c13a9742b3896baf1bd37bc8bcd0acb923 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 06:45:27 -0800 Subject: [PATCH 17/20] add pre post comms option --- bench/test | 20 +++++++++++-------- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/bench/test b/bench/test index cff8d8b34bb..72c22480714 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE +EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,20 +11,24 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=8 M=131072 #32768 K=32768 N=32768 #1024 -L=32768 + +S=8 Streams=8 Pgs=1 + +L=32768 +# PRE_COMM="_pre_comm" +POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" # GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,8 +36,8 @@ MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" @@ -45,10 +49,10 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 85059b89a31..9898df02ac8 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,9 @@ using DummyOverlapBenchmarkParams = std::tuple< /*M=*/int64_t, /*K=*/int64_t, /*N=*/int64_t, - /*L(communication msgsize)=*/int64_t>; + /*L(communication msgsize)=*/int64_t, + /*pre_comm=*/bool, + /*post_comm=*/bool>; class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -81,7 +83,9 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { M, K, N, - L] = GetParam(); + L, + pre_comm, + post_comm] = GetParam(); std::vector all_ranks(communicator_->size()); std::iota(all_ranks.begin(), all_ranks.end(), 0); @@ -112,13 +116,20 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { cudaEventRecord(start); } - setCurrentCUDAStream(communication_stream); - world->_allgather_base(dst, 
src)->wait(); + if (pre_comm) { + setCurrentCUDAStream(communication_stream); + world->_allgather_base(dst, src)->wait(); + } // compute setCurrentCUDAStream(compute_stream); torch::matmul_out(tc, ta, tb); + if (post_comm) { + setCurrentCUDAStream(communication_stream); + world->_allgather_base(dst, src)->wait(); + } + if (iteration == iteration_profiler_end) { cudaProfilerStop();; } @@ -143,7 +154,9 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))), + /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*pre-comm=*/testing::Bool(), + /*post-comm=*/testing::Bool()), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -151,7 +164,9 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<1>(info.param) << "_" << "K" << std::get<2>(info.param) << "_" << "N" << std::get<3>(info.param) << "_" - << "L" << std::get<4>(info.param); + << "L" << std::get<4>(info.param) + << ((std::get<5>(info.param))? "_pre_comm" : "") + << ((std::get<6>(info.param))? "_post_comm" : ""); return os.str(); }); From bfc7fa6ac9e81d6b4a2552733cc9a76dc1c66635 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 6 Dec 2024 17:26:44 +0200 Subject: [PATCH 18/20] cleanup test script --- bench/test | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/bench/test b/bench/test index 72c22480714..969b8da00e2 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE +EXPERIMENT=Dummy_profile_UCC_TL_CUDA DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,7 +10,7 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL +BACKEND=UCC M=131072 #32768 K=32768 N=32768 #1024 @@ -20,8 +20,8 @@ Streams=8 Pgs=1 L=32768 -# PRE_COMM="_pre_comm" -POST_COMM="_post_comm" +PRE_COMM="_pre_comm" +# POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ @@ -33,28 +33,37 @@ export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO MPIFLAGS=" -np $NP" -MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" + +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO +# MPIFLAGS+=" -x NCCL_MAX_NCHANNELS=1" + # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" + +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" + # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" -# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" -# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x 
UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" + + +MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" -# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO From 1a1138cbb5629fd47c9d0c056ac21db68af2f77b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 8 Jan 2025 18:28:45 +0200 Subject: [PATCH 19/20] update --- bench/test | 27 +++++++++-------- csrc/multidevice/utils.cpp | 6 ++-- tests/cpp/test_multidevice_overlap.cpp | 42 ++++++++++++++++++-------- 3 files changed, 46 insertions(+), 29 deletions(-) diff --git a/bench/test b/bench/test index 969b8da00e2..1b5d6f41c5a 100755 --- a/bench/test +++ b/bench/test @@ -1,25 +1,19 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_UCC_TL_CUDA +EXPERIMENT=Dummy_profile_msgsize256m_float16_M128k_K128k_N32k_UCC_IB DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" -export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" - -mkdir -p $LOGS -export LOG_FILE_INFO="${LOGS}/info.txt" -echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO - NP=8 BACKEND=UCC M=131072 #32768 -K=32768 +K=131072 N=32768 #1024 S=8 Streams=8 Pgs=1 -L=32768 +L=1048576 #268435456 #67108864 #131072 PRE_COMM="_pre_comm" # POST_COMM="_post_comm" # UNFUSE="_unfused" @@ -41,7 +35,7 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" # MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" @@ -53,10 +47,10 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" -# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" +MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" @@ -65,6 +59,13 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" + + +export LOGS="${LOG_BASE}/${EXPERIMENT}_${BACKEND}_${DATE}" +mkdir -p $LOGS +export LOG_FILE_INFO="${LOGS}/info.txt" +echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO + echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp index d2117b222da..5eb4a8a21b9 100644 --- a/csrc/multidevice/utils.cpp +++ b/csrc/multidevice/utils.cpp @@ -43,11 +43,11 @@ std::unordered_set getShardedIterDomains(TensorView* tv) { // Returns whether a IterDomain in a TensorView is the outermost // allocated IterDomain in the TensorView. 
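// For instance, given a hypothetical loop domain [DIDx(d), iM, iK]:
// iM is outermost-allocated, because only a device-parallel axis precedes
// it, while iK is not, because the allocated axis iM comes before it.
// Device-parallel, reduction, and broadcast axes are skipped during the
// outside-in scan below.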
bool isOutermostAllocatedId(TensorView* tv, IterDomain* id) {
-  for (auto i : tv->getLoopDomain()) {
-    if (i == id) {
+  for (auto* loop_id : tv->getLoopDomain()) {
+    if (loop_id == id) {
       return true;
     }
-    if (!i->isDeviceDim() && !i->isReduction() && !i->isBroadcast()) {
+    if (!loop_id->isDeviceDim() && !loop_id->isReduction() && !loop_id->isBroadcast()) {
       return false;
     }
   }
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 9898df02ac8..a3999b477ba 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -73,10 +73,11 @@ class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamI
 std::map<std::string, float> DummyOverlapBenchmark::times = {};
 
 TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
-  constexpr int64_t number_of_warmups = 50;
-  constexpr int64_t number_of_iterations = 100;
-  constexpr int64_t iteration_profiler_start = 10;
-  constexpr int64_t iteration_profiler_end = 15;
+  constexpr int64_t number_of_warmups = 20;
+  constexpr int64_t number_of_iterations = 80;
+  constexpr int64_t total_number_of_iterations = number_of_warmups + number_of_iterations;
+  constexpr int64_t iteration_profiler_start = 5;
+  constexpr int64_t iteration_profiler_end = 10;
 
 
   const int64_t D = communicator_->size();
   auto [backend,
@@ -90,27 +91,36 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
   std::vector<int64_t> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
   auto world = communicator_->getBackendForTeam(all_ranks, backend);
+  auto nccl_world = communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl);
 
   std::vector<c10::cuda::CUDAStream> streams =
       createStreams(2, communicator_->deviceId());
   auto& compute_stream = streams.at(0);
   auto& communication_stream = streams.at(1);
 
-  auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
-  auto ta = at::randn({M, K}, options);
-  auto tb = at::randn({K, N}, options);
-  auto tc = at::empty({M, N}, options);
-  auto src = at::randn({L}, options);
-  auto dst = at::empty({L * communicator_->size()}, options);
+  auto options_matmul = at::TensorOptions().dtype(torch::kFloat16).device(communicator_->device());
+  auto ta = at::randn({M, K}, options_matmul);
+  auto tb = at::randn({K, N}, options_matmul);
+  auto tc = at::empty({M, N}, options_matmul);
+
+  auto options_comms = at::TensorOptions().dtype(torch::kFloat32).device(communicator_->device());
+  auto src = at::randn({L}, options_comms);
+  auto dst = at::empty({L * communicator_->size()}, options_comms);
+  std::vector<at::Tensor> barrier_scratch_buffer = {at::randn({1}, options_comms)};
 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
+  nccl_world->allreduce(barrier_scratch_buffer)->wait();
+
   for (const auto& iteration :
-       c10::irange(number_of_warmups + number_of_iterations)) {
+       c10::irange(total_number_of_iterations)) {
+    if (iteration % 10 == 0 && communicator_->deviceId() == 0) {
+      std::cout << "iteration " << iteration << "/" << total_number_of_iterations << std::endl;
+    }
     if (iteration == iteration_profiler_start) {
-      cudaProfilerStart();;
+      cudaProfilerStart();
     }
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
@@ -133,8 +143,14 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
     if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
+    if (!pre_comm && !post_comm) {
+      nccl_world->allreduce(barrier_scratch_buffer)->wait();
+    }
     synchronizeStreams(streams);
   }
+  if (pre_comm || post_comm) {
+    nccl_world->allreduce(barrier_scratch_buffer)->wait();
+  }
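// The one-element allreduce above doubles as a device-side barrier: the
// collective can only complete once every rank has joined it, so GPU work
// ordered after it is aligned across ranks, and the timed window measures
// overlap rather than rank-to-rank skew. A minimal sketch of the same idiom,
// assuming a c10d backend `pg` and CUDA tensor options `opts` (both names
// illustrative, not from this patch):
//
//   std::vector<at::Tensor> flag = {at::ones({1}, opts)};
//   pg->allreduce(flag)->wait();  // no rank's stream proceeds past this
//                                 // point until all ranks have contributed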
cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; @@ -154,7 +170,7 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*L=*/testing::Values(1, pow(2,10), pow(2,15), pow(2,17), pow(2,20), pow(2,24), pow(2,26), pow(2,28)), /*pre-comm=*/testing::Bool(), /*post-comm=*/testing::Bool()), [](const testing::TestParamInfo& info) From e037ee5b62418632055fe5f32f8659b0b4bc49d9 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 16 Jan 2025 02:38:52 -0800 Subject: [PATCH 20/20] test with stream parallel type and host IR --- bench/test | 34 ++++---- tests/cpp/test_multidevice_overlap.cpp | 111 ++++++++++++++++++++++++- 2 files changed, 127 insertions(+), 18 deletions(-) diff --git a/bench/test b/bench/test index 1b5d6f41c5a..6777835f7b4 100755 --- a/bench/test +++ b/bench/test @@ -1,28 +1,32 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_msgsize256m_float16_M128k_K128k_N32k_UCC_IB +EXPERIMENT=StreamParallelType_tests DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" NP=8 BACKEND=UCC -M=131072 #32768 -K=131072 -N=32768 #1024 +M=32768 +K=32768 +N=1024 S=8 -Streams=8 +Streams=3 Pgs=1 -L=1048576 #268435456 #67108864 #131072 -PRE_COMM="_pre_comm" +# M=131072 #32768 +# K=131072 +# N=32768 #1024 +# L=1048576 #268435456 #67108864 #131072 +# PRE_COMM="_pre_comm" # POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" -GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" -# GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" +# GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" +GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmarkStreamParallelType/" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +# GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,7 +36,7 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x NCCL_MAX_NCHANNELS=1" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" @@ -47,15 +51,15 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" -MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 
272d785e2a1..7cf3cd288a4 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -345,15 +345,120 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   }
 }
 
+TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
+  constexpr int64_t number_of_warmups = 50;
+  constexpr int64_t number_of_iterations = 200;
+  constexpr int64_t iteration_profiler_start = 10;
+  constexpr int64_t iteration_profiler_end = 15;
+
+  const int64_t D = communicator_->size();
+  auto [backend,
+        S,
+        M,
+        K,
+        N,
+        number_of_streams,
+        add_cuStreamWriteValue32,
+        number_of_pgs,
+        unfuse_loops,
+        use_cuda_graph] = GetParam();
+
+  if (M % (D * S) != 0) {
+    GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M
+                 << ", D = " << D << ", S = " << S;
+  }
+  if (add_cuStreamWriteValue32) {
+    GTEST_SKIP() << "cuStreamWriteValue32 not supported with StreamParallelType";
+  }
+  if (number_of_pgs > 1) {
+    GTEST_SKIP() << "StreamParallelType not supported with multiple process groups";
+  }
+  if (unfuse_loops) {
+    GTEST_SKIP() << "StreamParallelType not supported with unfused loops";
+  }
+  if (use_cuda_graph) {
+    GTEST_SKIP() << "StreamParallelType not supported with cuda graphs";
+  }
+
+
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* a = makeContigTensor(4); //[S, DIDx(D), M/(S*D), K]
+  TensorView* b = makeContigTensor(2); //[K, N]
+  TensorView* c = matmul(a, b); //[S, D, M/(S*D), N]
+
+  fusion->addInput(a);
+  fusion->addInput(b);
+  fusion->addOutput(c);
+
+  auto mesh = DeviceMesh::createForNumDevices(D);
+  a->setDeviceMesh(mesh);
+  b->setDeviceMesh(mesh);
+  c->setDeviceMesh(mesh);
+
+  a->axis(1)->parallelize(ParallelType::DIDx);
+  c->axis(0)->parallelize(ParallelType::Stream);
+
+  communicator_->setDefaultBackend(backend);
+
+  hir::HostIrEvaluatorParams params;
+  params.number_of_streams = number_of_streams;
+  MultiDeviceExecutor executor(std::move(fusion), *communicator_, params);
+
+
+  auto tensor_options =
+      at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
+  at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options);
+  at::Tensor ta = ta_unsharded.slice(
+      1, communicator_->deviceId(), communicator_->deviceId() + 1);
+  at::Tensor tb = at::randn({K, N}, tensor_options);
+  at::Tensor tc_ref = at::matmul(ta_unsharded, tb);
+
+  std::vector<c10::IValue> inputs = {ta, tb};
+  at::Tensor tc;
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  for (const auto& iteration :
+       c10::irange(number_of_warmups + number_of_iterations)) {
+    if (iteration == iteration_profiler_start) {
+      cudaProfilerStart();
+    }
+    if (iteration == number_of_warmups) {
+      cudaEventRecord(start);
+    }
+
+    tc = executor.runWithInput(inputs).at(0);
+
+    if (iteration == iteration_profiler_end) {
+      cudaProfilerStop();
+    }
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  milliseconds /= number_of_iterations;
+
+  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  times.insert({test_name, milliseconds});
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+
+  EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     ,
     OverlapBenchmark,
     testing::Combine(
         testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
         /*S=*/testing::Values(1,2,4,8, 16, 32),
-        /*M=*/testing::Values(pow(2,10), pow(2,15)),
-        /*K=*/testing::Values(pow(2,10), pow(2,15)),
-        /*N=*/testing::Values(pow(2,10)),
+        /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)),
+        /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)),
+        /*N=*/testing::Values(pow(2,10), pow(2,15)),
         /*number_of_streams=*/testing::Values(3, 8, 32),
         /*add_cuStreamWriteValue32*/testing::Values(false, true),
         /*number_of_pgs=*/testing::Values(1, 2, 4, 8),
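Every benchmark in this series shares the same timing skeleton: warm-up iterations first, a CUDA event pair bracketing the steady-state iterations, and a cudaProfilerStart()/cudaProfilerStop() pair around a few early iterations so that the nsys invocation in bench/test (--capture-range=cudaProfilerApi --capture-range-end stop) records only that short window. A condensed sketch of the skeleton, with run_one_iteration standing in for the benchmarked allgather/matmul body (the helper name and the exact constants are illustrative, not fixed by the patches):

#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

#include <functional>

// Condensed timing skeleton shared by the benchmarks above. Events are
// recorded on the current (default) stream; the tests synchronize their
// worker streams each iteration, so the events observe all timed work.
float timeSteadyStateMs(const std::function<void()>& run_one_iteration) {
  constexpr int kWarmups = 50, kIterations = 200;
  constexpr int kProfilerStart = 10, kProfilerEnd = 15;  // nsys capture window

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  for (int i = 0; i < kWarmups + kIterations; ++i) {
    if (i == kProfilerStart) {
      cudaProfilerStart();  // opens the --capture-range=cudaProfilerApi window
    }
    if (i == kWarmups) {
      cudaEventRecord(start);  // timing starts only after warm-up
    }
    run_one_iteration();
    if (i == kProfilerEnd) {
      cudaProfilerStop();  // nsys stops here (--capture-range-end stop)
    }
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);  // block until all timed GPU work has drained

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms / kIterations;  // average milliseconds per iteration
}

Note that the capture window (iterations 10-15 here) falls inside the warm-up phase, so profiler overhead never lands in the timed region; the event pair brackets only the iterations after warm-up.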