From 2c04df94cfd34b37b9ce2c27f937afdba8e79ecc Mon Sep 17 00:00:00 2001
From: snordmann
Date: Thu, 24 Oct 2024 02:08:07 +0300
Subject: [PATCH 01/20] working simple benchmark

---
 csrc/multidevice/communicator.cpp      |   2 +
 tests/cpp/test_multidevice_overlap.cpp | 103 +++++++++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 8197ea224f4..ae6fc1fd9b4 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -196,6 +196,8 @@ Communicator::Communicator(
     return;
   }

+  cudaSetDevice(local_rank_);
+
 #ifdef NVFUSER_DISTRIBUTED
   c10d::TCPStoreOptions store_opts;
   {
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 39cab67cd13..5def14c8045 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include

 namespace nvfuser {

@@ -40,6 +41,108 @@ void synchronizeStreams(const std::vector<c10::cuda::CUDAStream>& streams) {
 }

 } // namespace

using OverlapBenchmarkParams = std::tuple<
    CommunicatorBackend,
    /*S=*/int64_t,
    /*M=*/int64_t,
    /*K=*/int64_t,
    /*N=*/int64_t,
    /*number_of_streams=*/int64_t>;

class OverlapBenchmark : public MultiDeviceTest,
                         public testing::WithParamInterface<OverlapBenchmarkParams> {
 protected:
  static std::unordered_map<std::string, float> times;

  static void TearDownTestSuite() {
    auto rank = Communicator::getInstance().deviceId();
    for (auto it : times) {
      std::cout << "rank " << rank << ": " << it.first << ": " << it.second
                << std::endl;
    }
  }
};

std::unordered_map<std::string, float> OverlapBenchmark::times = {};

TEST_P(OverlapBenchmark, DummyBenchmark) {
  constexpr int64_t number_of_warmups = 120;
  constexpr int64_t number_of_iterations = 500;
  const int64_t D = communicator_->size();
  auto [backend,
        S,
        M,
        K,
        N,
        number_of_streams] = GetParam();

  GTEST_ASSERT_EQ(M % S, 0);

  auto world = communicator_->getWorld(backend);

  std::vector<c10::cuda::CUDAStream> streams =
      createStreams(number_of_streams, communicator_->deviceId());

  auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
  auto ta = at::randn({S, M/S,K}, options);
  auto ta_unsharded = at::empty({S, D, M/S,K}, options);
  auto tb = at::randn({K,N}, options);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  for (const auto& iteration :
       c10::irange(number_of_warmups + number_of_iterations)) {
    if (iteration == number_of_warmups) {
      cudaEventRecord(start);
    }
    for (auto j : c10::irange(S)) {
      int64_t stream_index = j % streams.size();
      setCurrentCUDAStream(streams.at(stream_index));

      auto ta_j = ta.select(0, j);
      auto ta_unsharded_j = ta_unsharded.select(0, j);

      // communication
      world->_allgather_base(ta_unsharded_j, ta_j)->wait();
      // compute
      auto tc_j = torch::matmul(ta_unsharded_j,tb);
    }
    setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId()));
    synchronizeStreams(streams);
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  milliseconds /= number_of_iterations;

  std::string test_name =
      ::testing::UnitTest::GetInstance()->current_test_info()->name();
  times.insert({test_name, milliseconds});
}

INSTANTIATE_TEST_SUITE_P(
    ,
    OverlapBenchmark,
    testing::Combine(
        testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
        /*S=*/testing::Values(1,2,4,8),
        /*M=*/testing::Values(pow(2,10), pow(2,15)),
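        // Grid note: every (S, M) pair in this sweep satisfies M % S == 0,
        // which the test asserts before slicing ta into S chunks of M/S rows.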
/*K=*/testing::Values(pow(2,10), pow(2,15)), + /*N=*/testing::Values(pow(2,10)), + /*number_of_streams=*/testing::Values(3, 8)), + [](const testing::TestParamInfo& info) + -> std::string { + std::ostringstream os; + os << /*backend*/std::get<0>(info.param) << "_" + << "S" << std::get<1>(info.param) << "_" + << "M" << std::get<2>(info.param) << "_" + << "K" << std::get<3>(info.param) << "_" + << "N" << std::get<4>(info.param) << "_" + << "Streams" << std::get<5>(info.param); + return os.str(); + }); + + struct OverlapTestParams { // Tensors sizes int64_t M = std::pow(2, 6); From af36cf14ac7622945ba6ec6ff8ce68434cc94230 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 25 Oct 2024 03:54:42 +0300 Subject: [PATCH 02/20] minor --- tests/cpp/test_multidevice_overlap.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 5def14c8045..76e8192d1fd 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,7 @@ using OverlapBenchmarkParams = std::tuple< class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: - static std::unordered_map times; + static std::map times; static void TearDownTestSuite() { auto rank = Communicator::getInstance().deviceId(); @@ -61,11 +61,13 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf } }; -std::unordered_map OverlapBenchmark::times = {}; +std::map OverlapBenchmark::times = {}; TEST_P(OverlapBenchmark, DummyBenchmark) { - constexpr int64_t number_of_warmups = 120; - constexpr int64_t number_of_iterations = 500; + int64_t number_of_warmups = 50; + constexpr int64_t number_of_iterations = 100; + + const int64_t D = communicator_->size(); auto [backend, S, @@ -118,6 +120,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); + std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; } INSTANTIATE_TEST_SUITE_P( From 68b858a7fcd16e0f79fd62cafe6401496c924c60 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 25 Oct 2024 07:22:06 -0700 Subject: [PATCH 03/20] test script --- bench/process_outputs | 2 ++ bench/test | 35 ++++++++++++++++++++++++++ tests/cpp/test_multidevice_overlap.cpp | 4 +-- 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 bench/process_outputs create mode 100755 bench/test diff --git a/bench/process_outputs b/bench/process_outputs new file mode 100644 index 00000000000..139597f9cb0 --- /dev/null +++ b/bench/process_outputs @@ -0,0 +1,2 @@ + + diff --git a/bench/test b/bench/test new file mode 100755 index 00000000000..f0d5728fb4b --- /dev/null +++ b/bench/test @@ -0,0 +1,35 @@ +#!/bin/bash +EXPERIMENT=tl_nccl +DATE=$(date +%Y%m%d-%H%M) +LOG_BASE="/opt/pytorch/Fuser/bench/logs" + +export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" + +mkdir -p $LOGS +LOG_FILE_INFO="${LOGS}/info" +echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO + +NP=8 +BACKEND=UCC +S=* +M=* +K=* +N=* +Streams=* +export GTEST_FILTER="OverlapBenchmark.DummyBenchmark/${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO + +MPIFLAGS=" -np $NP" +MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x 
UCC_CL_BASIC_TLS=nccl" +echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO + +TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" +echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO + +CMD="mpirun $MPIFLAGS $TEST_CMD" +echo $CMD | tee -a $LOG_FILE_INFO +$CMD | tee -a $LOG_FILE_INFO + diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 76e8192d1fd..b8c998618b4 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -128,11 +128,11 @@ INSTANTIATE_TEST_SUITE_P( OverlapBenchmark, testing::Combine( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), - /*S=*/testing::Values(1,2,4,8), + /*S=*/testing::Values(1,2,4,8, 16, 32), /*M=*/testing::Values(pow(2,10), pow(2,15)), /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), - /*number_of_streams=*/testing::Values(3, 8)), + /*number_of_streams=*/testing::Values(3, 8, 32)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; From 0c3493b6c1782d27b5f417d2237751a0b37bf8df Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 28 Oct 2024 13:13:09 +0200 Subject: [PATCH 04/20] minor --- bench/process_outputs | 5 +++++ bench/test | 2 +- tests/cpp/test_multidevice_overlap.cpp | 5 ++++- 3 files changed, 10 insertions(+), 2 deletions(-) mode change 100644 => 100755 bench/process_outputs diff --git a/bench/process_outputs b/bench/process_outputs old mode 100644 new mode 100755 index 139597f9cb0..c1781394dbc --- a/bench/process_outputs +++ b/bench/process_outputs @@ -1,2 +1,7 @@ +#!/bin/bash +FILE="/opt/pytorch/Fuser/bench/logs/${1}/info" +cat $FILE | grep "rank 0: " #| awk '{print $4}' + +# | grep -E 'Streams32\b' \ No newline at end of file diff --git a/bench/test b/bench/test index f0d5728fb4b..b6375719387 100755 --- a/bench/test +++ b/bench/test @@ -10,7 +10,7 @@ LOG_FILE_INFO="${LOGS}/info" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=* M=* K=* diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index b8c998618b4..2febd097b62 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -55,8 +55,11 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf static void TearDownTestSuite() { auto rank = Communicator::getInstance().deviceId(); + if (rank != 0) { + return; + } for (auto it: times) { - std::cout << "rank " << rank << ": " << it.first << ": " << it.second << std::endl; + std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl; } } }; From b30b44bb897c0ec290f37f0e0e02d82ceea3421f Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 29 Oct 2024 09:46:09 -0700 Subject: [PATCH 05/20] add nsight profiling --- bench/test | 36 ++++++++++++++++---------- tests/cpp/test_multidevice_overlap.cpp | 7 +++++ 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/bench/test b/bench/test index b6375719387..8ce85c8ff0f 100755 --- a/bench/test +++ b/bench/test @@ -1,35 +1,45 @@ #!/bin/bash -EXPERIMENT=tl_nccl +EXPERIMENT=profile DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" mkdir -p $LOGS -LOG_FILE_INFO="${LOGS}/info" +LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=* -M=* -K=* -N=* -Streams=* -export 
GTEST_FILTER="OverlapBenchmark.DummyBenchmark/${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +S=4 +M=32768 +K=32768 +N=1024 +Streams=8 +GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO - +`` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO +MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO -CMD="mpirun $MPIFLAGS $TEST_CMD" -echo $CMD | tee -a $LOG_FILE_INFO -$CMD | tee -a $LOG_FILE_INFO +MPICMD="mpirun $MPIFLAGS $TEST_CMD" +echo $MPICMD | tee -a $LOG_FILE_INFO + +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" + +CMD="${NSYSCMD} ${MPICMD}" +sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" +echo $CMD | tee -a ${LOG_FILE_INFO} +$CMD | tee -a ${LOG_FILE_INFO} diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 2febd097b62..189a0da732c 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace nvfuser { @@ -97,6 +98,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { + if (iteration == 10) { + cudaProfilerStart();; + } if (iteration == number_of_warmups) { cudaEventRecord(start); } @@ -114,6 +118,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { } setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); synchronizeStreams(streams); + if (iteration == 15) { + cudaProfilerStop();; + } } cudaEventRecord(stop); cudaEventSynchronize(stop); From 0592a139918072d66c13790741310ecc195abe45 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 31 Oct 2024 06:39:08 -0700 Subject: [PATCH 06/20] nsight and tl/nccl/ sync mode --- bench/test | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bench/test b/bench/test index 8ce85c8ff0f..b51daa63ebd 100755 --- a/bench/test +++ b/bench/test @@ -1,16 +1,16 @@ #!/bin/bash -EXPERIMENT=profile +EXPERIMENT=profile_driver DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" mkdir -p $LOGS -LOG_FILE_INFO="${LOGS}/info.txt" +export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL +BACKEND=UCC S=4 M=32768 K=32768 @@ -28,6 +28,7 @@ MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=driver" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" From 0037b1e9b1398b9518a80011b1601f7e4f6cda5a Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 05:12:10 -0800 Subject: [PATCH 07/20] add cuStreamWriteValue but 
linkage error --- bench/test | 7 ++++--- tests/cpp/test_multidevice_overlap.cpp | 29 ++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/bench/test b/bench/test index b51daa63ebd..2856cff9074 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_driver +EXPERIMENT=profile_ncc_max_connection2 DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,7 +10,7 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=4 M=32768 K=32768 @@ -28,7 +28,8 @@ MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=driver" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 189a0da732c..8fdaf8afdd9 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,8 @@ using OverlapBenchmarkParams = std::tuple< /*M=*/int64_t, /*K=*/int64_t, /*N=*/int64_t, - /*number_of_streams=*/int64_t>; + /*number_of_streams=*/int64_t, + /*add_cuStreamWriteValue32=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -78,7 +80,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { M, K, N, - number_of_streams] = GetParam(); + number_of_streams, + add_cuStreamWriteValue32] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -96,6 +99,13 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { cudaEventCreate(&start); cudaEventCreate(&stop); + // CUdeviceptr pDevice; + // void* ptr; + // if (add_cuStreamWriteValue32) { + // cudaMallocHost(&ptr, 32); + // cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + // } + for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { if (iteration == 10) { @@ -113,6 +123,11 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); + + // if (add_cuStreamWriteValue32) { + // cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + // } + // compute auto tc_j = torch::matmul(ta_unsharded_j,tb); } @@ -131,6 +146,10 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + + // if (add_cuStreamWriteValue32) { + // cudaFree(ptr); + // } } INSTANTIATE_TEST_SUITE_P( @@ -142,7 +161,8 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15)), /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), - /*number_of_streams=*/testing::Values(3, 8, 32)), + /*number_of_streams=*/testing::Values(3, 8, 32), + /*add_cuStreamWriteValue32*/testing::Values(false)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -151,7 +171,8 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<2>(info.param) << "_" << "K" << std::get<3>(info.param) << "_" << "N" << 
std::get<4>(info.param) << "_" - << "Streams" << std::get<5>(info.param); + << "Streams" << std::get<5>(info.param) << "_" + << ((std::get<6>(info.param))? "With" : "Without") << "cuStreamWriteValue32"; return os.str(); }); From ec71e233de02deefb609bd81d2bd7dd6b3f2451f Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 06:27:56 -0800 Subject: [PATCH 08/20] multiple pgs --- bench/test | 13 +++++++------ tests/cpp/test_multidevice_overlap.cpp | 17 ++++++++++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/bench/test b/bench/test index 2856cff9074..5433bbee9ce 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_ncc_max_connection2 +EXPERIMENT=profile_baseline_NCCL DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,25 +11,26 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=4 +S=1 M=32768 K=32768 N=1024 Streams=8 +Pgs=1 GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -MPIFLAGS+=" -x UCC_COLL_TRACE=info" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" -MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +# MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 8fdaf8afdd9..ff79bb45609 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -50,7 +50,8 @@ using OverlapBenchmarkParams = std::tuple< /*K=*/int64_t, /*N=*/int64_t, /*number_of_streams=*/int64_t, - /*add_cuStreamWriteValue32=*/bool>; + /*add_cuStreamWriteValue32=*/bool, + /*number_of_pgs=*/int64_t>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -81,11 +82,13 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { K, N, number_of_streams, - add_cuStreamWriteValue32] = GetParam(); + add_cuStreamWriteValue32, + number_of_pgs] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); - auto world = communicator_->getWorld(backend); + std::vector all_ranks(communicator_->size()); + std::iota(all_ranks.begin(), all_ranks.end(), 0); std::vector streams = createStreams(number_of_streams, communicator_->deviceId()); @@ -118,6 +121,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); + auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); @@ -162,7 +167,8 @@ INSTANTIATE_TEST_SUITE_P( /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), - /*add_cuStreamWriteValue32*/testing::Values(false)), + /*add_cuStreamWriteValue32*/testing::Values(false), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), 
[](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -172,7 +178,8 @@ INSTANTIATE_TEST_SUITE_P( << "K" << std::get<3>(info.param) << "_" << "N" << std::get<4>(info.param) << "_" << "Streams" << std::get<5>(info.param) << "_" - << ((std::get<6>(info.param))? "With" : "Without") << "cuStreamWriteValue32"; + << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") + << "Pgs" << std::get<7>(info.param); return os.str(); }); From a15fdfc9d84258d38442a78110d57be1a121598c Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 06:39:54 -0800 Subject: [PATCH 09/20] reenable cuStreamValue32 --- bench/test | 9 +++++---- csrc/driver_api.h | 1 + tests/cpp/test_multidevice_overlap.cpp | 26 +++++++++++++------------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/bench/test b/bench/test index 5433bbee9ce..4f3559e283a 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_baseline_NCCL +EXPERIMENT=profile_cuStreamWrite_NCCL DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,14 +11,15 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=1 +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 +cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_Pgs${Pgs}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` @@ -39,7 +40,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/csrc/driver_api.h b/csrc/driver_api.h index b8c413a4054..8105cf855c2 100644 --- a/csrc/driver_api.h +++ b/csrc/driver_api.h @@ -32,6 +32,7 @@ namespace nvfuser { fn(cuModuleGetFunction); \ fn(cuModuleLoadDataEx); \ fn(cuModuleUnload); \ + fn(cuStreamWriteValue32); \ fn(cuOccupancyMaxActiveBlocksPerMultiprocessor) #if (CUDA_VERSION >= 12000) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index ff79bb45609..fef6e9bf468 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -102,12 +102,12 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { cudaEventCreate(&start); cudaEventCreate(&stop); - // CUdeviceptr pDevice; - // void* ptr; - // if (add_cuStreamWriteValue32) { - // cudaMallocHost(&ptr, 32); - // cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); - // } + CUdeviceptr pDevice; + void* ptr; + if (add_cuStreamWriteValue32) { + cudaMallocHost(&ptr, 32); + cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + } for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { @@ -129,9 +129,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); - // if (add_cuStreamWriteValue32) { - // 
cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); - // } + if (add_cuStreamWriteValue32) { + cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } // compute auto tc_j = torch::matmul(ta_unsharded_j,tb); @@ -152,9 +152,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { times.insert({test_name, milliseconds}); std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; - // if (add_cuStreamWriteValue32) { - // cudaFree(ptr); - // } + if (add_cuStreamWriteValue32) { + cudaFree(ptr); + } } INSTANTIATE_TEST_SUITE_P( @@ -167,7 +167,7 @@ INSTANTIATE_TEST_SUITE_P( /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), - /*add_cuStreamWriteValue32*/testing::Values(false), + /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), [](const testing::TestParamInfo& info) -> std::string { From 6682a33b366b3f21a1ced568106e8a3b475c8567 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 07:57:44 -0800 Subject: [PATCH 10/20] add tl/cuda and ec/cuda flags in bash test script --- bench/test | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bench/test b/bench/test index 4f3559e283a..5ad427b4876 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_cuStreamWrite_NCCL +EXPERIMENT=profile_UCC_TL_CUDA DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,14 +10,14 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=8 +BACKEND=UCC +S=4 M=32768 K=32768 N=1024 Streams=8 Pgs=1 -cuStreamWrite=WithcuStreamWriteValue32_ +# cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" @@ -25,11 +25,21 @@ echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" +# MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" +# MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" +# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" + # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO From b01f1f4fe236be4144182cac5cbdcef15c559337 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 08:40:14 -0800 Subject: [PATCH 11/20] add option to unfuse loops --- bench/test | 5 +++-- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/bench/test b/bench/test 
index 5ad427b4876..2102c1eb743 100755 --- a/bench/test +++ b/bench/test @@ -11,15 +11,16 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=UCC -S=4 +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 +UNFUSE="_unfused" # cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index fef6e9bf468..d4b9c757f7a 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,8 @@ using OverlapBenchmarkParams = std::tuple< /*N=*/int64_t, /*number_of_streams=*/int64_t, /*add_cuStreamWriteValue32=*/bool, - /*number_of_pgs=*/int64_t>; + /*number_of_pgs=*/int64_t, + /*unfuse_loops=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -83,7 +84,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { N, number_of_streams, add_cuStreamWriteValue32, - number_of_pgs] = GetParam(); + number_of_pgs, + unfuse_loops] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -132,9 +134,20 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { if (add_cuStreamWriteValue32) { cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); } + if (unfuse_loops == false) { + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } + } + if (unfuse_loops) { + for (auto j : c10::irange(S)) { + int64_t stream_index = j % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + auto ta_unsharded_j = ta_unsharded.select(0, j); - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } } setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); synchronizeStreams(streams); @@ -168,7 +181,8 @@ INSTANTIATE_TEST_SUITE_P( /*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), /*add_cuStreamWriteValue32*/testing::Values(false, true), - /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8), + /*unfuse_loops=*/testing::Values(false, true)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -179,7 +193,8 @@ INSTANTIATE_TEST_SUITE_P( << "N" << std::get<4>(info.param) << "_" << "Streams" << std::get<5>(info.param) << "_" << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") - << "Pgs" << std::get<7>(info.param); + << "Pgs" << std::get<7>(info.param) + << ((std::get<8>(info.param))? "_unfused" : ""); return os.str(); }); From ea7fd37d61ad310c5dcb2d8ca599d8212003ff44 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 5 Nov 2024 02:53:36 -0800 Subject: [PATCH 12/20] add cuda graphs. 
Only working for NCCL and S1 bc there is a syncStream in nccl --- bench/test | 13 ++-- tests/cpp/test_multidevice_overlap.cpp | 84 ++++++++++++++++---------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/bench/test b/bench/test index 2102c1eb743..8a64225d9e9 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_UCC_TL_CUDA +EXPERIMENT=profile_cudaGraph_NCCL_S1 DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,17 +10,18 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC -S=8 +BACKEND=NCCL +S=1 M=32768 K=32768 N=1024 Streams=8 Pgs=1 -UNFUSE="_unfused" +# UNFUSE="_unfused" +GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` @@ -51,7 +52,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index d4b9c757f7a..c93987890b4 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -6,6 +6,7 @@ */ // clang-format on #include +#include #include #include #include @@ -52,7 +53,8 @@ using OverlapBenchmarkParams = std::tuple< /*number_of_streams=*/int64_t, /*add_cuStreamWriteValue32=*/bool, /*number_of_pgs=*/int64_t, - /*unfuse_loops=*/bool>; + /*unfuse_loops=*/bool, + /*use_cuda_graph=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -72,8 +74,11 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf std::map OverlapBenchmark::times = {}; TEST_P(OverlapBenchmark, DummyBenchmark) { - int64_t number_of_warmups = 50; + constexpr int64_t number_of_warmups = 50; constexpr int64_t number_of_iterations = 100; + constexpr int64_t iteration_profiler_start = 10; + constexpr int64_t iteration_profiler_end = 15; + constexpr int64_t iteration_cuda_graph_capture = 5; const int64_t D = communicator_->size(); @@ -85,7 +90,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { number_of_streams, add_cuStreamWriteValue32, number_of_pgs, - unfuse_loops] = GetParam(); + unfuse_loops, + use_cuda_graph] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -94,6 +100,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::vector streams = createStreams(number_of_streams, communicator_->deviceId()); + setCurrentCUDAStream(streams.at(0)); auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); auto ta = at::randn({S, M/S,K}, options); @@ -104,6 +111,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { 
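  // Timing note for the pattern below: cudaEventElapsedTime measures the
  // GPU-side interval between the two cudaEventRecord calls, so host launch
  // overhead and the warmup iterations are excluded from the reported average.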
cudaEventCreate(&start); cudaEventCreate(&stop); + at::cuda::CUDAGraph cuda_graph; + CUdeviceptr pDevice; void* ptr; if (add_cuStreamWriteValue32) { @@ -113,45 +122,56 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { - if (iteration == 10) { + if (iteration == iteration_profiler_start) { cudaProfilerStart();; } if (iteration == number_of_warmups) { cudaEventRecord(start); } - for (auto j : c10::irange(S)) { - int64_t stream_index = j % streams.size(); - setCurrentCUDAStream(streams.at(stream_index)); - - auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); - - auto ta_j = ta.select(0, j); - auto ta_unsharded_j = ta_unsharded.select(0, j); - - // communication - world->_allgather_base(ta_unsharded_j, ta_j)->wait(); - - if (add_cuStreamWriteValue32) { - cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + if (iteration <= iteration_cuda_graph_capture) { + if (iteration == iteration_cuda_graph_capture) { + cuda_graph.capture_begin(); } - if (unfuse_loops == false) { - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); - } - } - if (unfuse_loops) { for (auto j : c10::irange(S)) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); + + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); + + auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + // communication + world->_allgather_base(ta_unsharded_j, ta_j)->wait(); + + if (add_cuStreamWriteValue32) { + cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } + if (unfuse_loops == false) { + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } + } + if (unfuse_loops) { + for (auto j : c10::irange(S)) { + int64_t stream_index = j % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + auto ta_unsharded_j = ta_unsharded.select(0, j); + + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } } + if (iteration == iteration_cuda_graph_capture) { + cuda_graph.capture_end(); + } else { + setCurrentCUDAStream(streams.at(0)); + synchronizeStreams(streams); + } + } else { + cuda_graph.replay(); } - setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); - synchronizeStreams(streams); - if (iteration == 15) { + if (iteration == iteration_profiler_end) { cudaProfilerStop();; } } @@ -182,7 +202,8 @@ INSTANTIATE_TEST_SUITE_P( /*number_of_streams=*/testing::Values(3, 8, 32), /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8), - /*unfuse_loops=*/testing::Values(false, true)), + /*unfuse_loops=*/testing::Values(false, true), + /*use_cuda_graph=*/testing::Values(false)), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -194,7 +215,8 @@ INSTANTIATE_TEST_SUITE_P( << "Streams" << std::get<5>(info.param) << "_" << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") << "Pgs" << std::get<7>(info.param) - << ((std::get<8>(info.param))? "_unfused" : ""); + << ((std::get<8>(info.param))? "_unfused" : "") + << ((std::get<9>(info.param))? 
"_WithCudaGraph" : ""); return os.str(); }); From 9dddac2a6320e315f1300febc624a03e084aa54f Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 25 Nov 2024 16:51:59 -0800 Subject: [PATCH 13/20] write matmul to sliced output --- tests/cpp/test_multidevice_overlap.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index c93987890b4..5600041dc7d 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -73,7 +73,7 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf std::map OverlapBenchmark::times = {}; -TEST_P(OverlapBenchmark, DummyBenchmark) { +TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { constexpr int64_t number_of_warmups = 50; constexpr int64_t number_of_iterations = 100; constexpr int64_t iteration_profiler_start = 10; @@ -106,6 +106,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { auto ta = at::randn({S, M/S,K}, options); auto ta_unsharded = at::empty({S, D, M/S,K}, options); auto tb = at::randn({K,N}, options); + auto tc = at::empty({S, D, M/S, N}, options); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -140,6 +141,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); + auto tc_j = ta_unsharded.select(0, j); // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); @@ -149,7 +151,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { } if (unfuse_loops == false) { // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + torch::matmul_out(tc_j, ta_unsharded_j,tb); } } if (unfuse_loops) { @@ -157,9 +159,10 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); auto ta_unsharded_j = ta_unsharded.select(0, j); + auto tc_j = ta_unsharded.select(0, j); // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + torch::matmul_out(tc_j, ta_unsharded_j,tb); } } if (iteration == iteration_cuda_graph_capture) { From faf8bbe6b9c1ddf19b31069d66387b060481e9bf Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 28 Nov 2024 08:12:18 -0800 Subject: [PATCH 14/20] wip cuStreamWriteValue not working --- bench/test | 14 +- tests/cpp/test_multidevice_overlap.cpp | 219 +++++++++++++++++++++++-- 2 files changed, 216 insertions(+), 17 deletions(-) diff --git a/bench/test b/bench/test index 8a64225d9e9..c27cb9ce74b 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_cudaGraph_NCCL_S1 +EXPERIMENT=profile_NCCL_with_cuStreamValue DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,17 +10,17 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=1 +BACKEND=UCC +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 # UNFUSE="_unfused" -GRAPH="_WithCudaGraph" +# GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ -GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" +GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -42,7 +42,7 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -# 
MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO @@ -52,7 +52,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 5600041dc7d..0d55580a11a 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -20,6 +20,8 @@ #include #include +#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 1 + namespace nvfuser { namespace { @@ -44,6 +46,190 @@ void synchronizeStreams(const std::vector& streams) { } // namespace +TEST_F(NVFuserTest, cuStreamWriteValue32) { + constexpr cuuint32_t value = 3; + cudaError_t error; + CUdeviceptr pDevice; + volatile cuuint32_t* ptr; + error = cudaSetDevice(0); + ASSERT_EQ(error, 0); + error = cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); + ASSERT_EQ(error, 0); + error = cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); + ASSERT_EQ(error, 0); + + at::cuda::CUDAStream c10_stream = at::cuda::getStreamFromPool( + /*isHighPriority=*/true, /*device_index*/0); + CUstream stream = c10_stream.stream(); + CUresult st; + st = cuStreamWriteValue32(stream, pDevice, value, /*flag=*/0); + ASSERT_EQ(st, 0); + + torch::cuda::synchronize(); + cuuint32_t ptr2; + error = cudaMemcpy(&ptr2, (void*)pDevice, sizeof(cuuint32_t), cudaMemcpyDeviceToHost); + ASSERT_EQ(error, 0); + ASSERT_EQ(ptr2, value); + + + int i = 0; + while (i < 10000000) { + if (*ptr == value) { + std::cout << " BREAK " << *ptr < DummyOverlapBenchmark::times = {}; + +TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { + constexpr int64_t number_of_warmups = 50; + constexpr int64_t number_of_iterations = 100; + constexpr int64_t iteration_profiler_start = 10; + constexpr int64_t iteration_profiler_end = 15; + + + auto [backend, + M, + K, + N, + L, + number_of_streams, + add_cuStreamWriteValue32, + number_of_pgs] = GetParam(); + + std::vector all_ranks(communicator_->size()); + std::iota(all_ranks.begin(), all_ranks.end(), 0); + + std::vector streams = + createStreams(number_of_streams, communicator_->deviceId()); + setCurrentCUDAStream(streams.at(0)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); + auto ta = at::randn({M, K}, options); + auto tb = at::randn({K, N}, options); + auto tc = at::empty({M, N}, options); + auto src = at::randn({L}, options); + auto dst = at::empty({L * communicator_->size()}, options); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + CUdeviceptr pDevice; + void* ptr; + if (add_cuStreamWriteValue32) { + cudaMallocHost(&ptr, 32); + cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + } + + for (const auto& iteration : + c10::irange(number_of_warmups + number_of_iterations)) { + if (iteration == iteration_profiler_start) { + cudaProfilerStart();; + } + if (iteration == 
number_of_warmups) { + cudaEventRecord(start); + } + int64_t stream_index = iteration % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(iteration % number_of_pgs)); + + // communication + world->_allgather_base(dst, src)->wait(); + + // compute + torch::matmul_out(tc, ta, tb); + + if (add_cuStreamWriteValue32) { + + cuStreamWriteValue32( +#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS + (CUstream)world->getCudaStream(communicator_->device()).stream(), +#else + (CUstream)streams.at(stream_index).stream(), +#endif + (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } + + setCurrentCUDAStream(streams.at(0)); + synchronizeStreams(streams); + if (iteration == iteration_profiler_end) { + cudaProfilerStop();; + } + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + milliseconds /= number_of_iterations; + + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + times.insert({test_name, milliseconds}); + std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + + if (add_cuStreamWriteValue32) { + cudaFree(ptr); + } +} + +INSTANTIATE_TEST_SUITE_P( + , + DummyOverlapBenchmark, + testing::Combine( + testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), + /*M=*/testing::Values(pow(2,10), pow(2,15)), + /*K=*/testing::Values(pow(2,10), pow(2,15)), + /*N=*/testing::Values(pow(2,10)), + /*L=*/testing::Values(pow(2,15)), + /*number_of_streams=*/testing::Values(1, 8), + /*add_cuStreamWriteValue32*/testing::Values(false, true), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + [](const testing::TestParamInfo& info) + -> std::string { + std::ostringstream os; + os << /*backend*/std::get<0>(info.param) << "_" + << "M" << std::get<1>(info.param) << "_" + << "K" << std::get<2>(info.param) << "_" + << "N" << std::get<3>(info.param) << "_" + << "L" << std::get<4>(info.param) << "_" + << "Streams" << std::get<5>(info.param) << "_" + << ((std::get<6>(info.param))? 
"WithcuStreamWriteValue32_" : "") + << "Pgs" << std::get<7>(info.param); + return os.str(); + }); + using OverlapBenchmarkParams = std::tuple< CommunicatorBackend, /*S=*/int64_t, @@ -115,10 +301,10 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { at::cuda::CUDAGraph cuda_graph; CUdeviceptr pDevice; - void* ptr; + cuuint32_t* ptr; if (add_cuStreamWriteValue32) { - cudaMallocHost(&ptr, 32); - cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); + cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); } for (const auto& iteration : @@ -129,8 +315,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { if (iteration == number_of_warmups) { cudaEventRecord(start); } - if (iteration <= iteration_cuda_graph_capture) { - if (iteration == iteration_cuda_graph_capture) { + if (!use_cuda_graph || (iteration <= iteration_cuda_graph_capture)) { + if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) { cuda_graph.capture_begin(); } for (auto j : c10::irange(S)) { @@ -141,13 +327,22 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); - auto tc_j = ta_unsharded.select(0, j); + auto tc_j = tc.select(0, j); // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); if (add_cuStreamWriteValue32) { - cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + if (!communicator_->deviceId()){ + std::cout << "writing to stream " << world->getCudaStream(communicator_->device()).stream() << " the value " << (cuuint32_t)(iteration * S + j) << ", communicator_->device()=" << communicator_->device() << ", world=" << world << ", number_of_pgs=" << number_of_pgs << " with MACRO=" << CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS << std::endl; + } + cuStreamWriteValue32( +#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS + (CUstream)world->getCudaStream(communicator_->device()).stream(), +#else + // (CUstream)streams.at(stream_index).stream(), +#endif + (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0); } if (unfuse_loops == false) { // compute @@ -159,13 +354,13 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); auto ta_unsharded_j = ta_unsharded.select(0, j); - auto tc_j = ta_unsharded.select(0, j); + auto tc_j = tc.select(0, j); // compute torch::matmul_out(tc_j, ta_unsharded_j,tb); } } - if (iteration == iteration_cuda_graph_capture) { + if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) { cuda_graph.capture_end(); } else { setCurrentCUDAStream(streams.at(0)); @@ -189,7 +384,11 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; if (add_cuStreamWriteValue32) { - cudaFree(ptr); + std::cout << "RANK " << communicator_->device() << " entering while loop. 
Max index=" << (number_of_warmups + number_of_iterations)*S + S << std::endl; + while (*ptr < (cuuint32_t)(number_of_warmups + number_of_iterations)*S + S - 1) { + std::cout << "RANK " << communicator_->device() << " waiting at index=" << *ptr << std::endl; + } + cudaFree((void*)ptr); } } From a6b5fd75896d26a15fc0e2b6a8a66e9e81e60016 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 05:45:31 -0800 Subject: [PATCH 15/20] dummy benchmark --- bench/test | 30 ++++--- tests/cpp/test_multidevice_overlap.cpp | 110 ++++--------------------- 2 files changed, 35 insertions(+), 105 deletions(-) diff --git a/bench/test b/bench/test index c27cb9ce74b..cff8d8b34bb 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_NCCL_with_cuStreamValue +EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,27 +10,33 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=8 -M=32768 +M=131072 #32768 K=32768 -N=1024 +N=32768 #1024 +L=32768 Streams=8 Pgs=1 # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ -GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +# GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" +GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" +# GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO -`` + MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" +# MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" @@ -39,10 +45,12 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" +MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO @@ -52,7 +60,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi 
--cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 0d55580a11a..85059b89a31 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -20,7 +20,7 @@ #include #include -#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 1 +#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 0 namespace nvfuser { @@ -46,54 +46,12 @@ void synchronizeStreams(const std::vector& streams) { } // namespace -TEST_F(NVFuserTest, cuStreamWriteValue32) { - constexpr cuuint32_t value = 3; - cudaError_t error; - CUdeviceptr pDevice; - volatile cuuint32_t* ptr; - error = cudaSetDevice(0); - ASSERT_EQ(error, 0); - error = cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); - ASSERT_EQ(error, 0); - error = cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); - ASSERT_EQ(error, 0); - - at::cuda::CUDAStream c10_stream = at::cuda::getStreamFromPool( - /*isHighPriority=*/true, /*device_index*/0); - CUstream stream = c10_stream.stream(); - CUresult st; - st = cuStreamWriteValue32(stream, pDevice, value, /*flag=*/0); - ASSERT_EQ(st, 0); - - torch::cuda::synchronize(); - cuuint32_t ptr2; - error = cudaMemcpy(&ptr2, (void*)pDevice, sizeof(cuuint32_t), cudaMemcpyDeviceToHost); - ASSERT_EQ(error, 0); - ASSERT_EQ(ptr2, value); - - - int i = 0; - while (i < 10000000) { - if (*ptr == value) { - std::cout << " BREAK " << *ptr <deviceId() << ", " << test_name << " : " << milliseconds << std::endl; - - if (add_cuStreamWriteValue32) { - cudaFree(ptr); - } } INSTANTIATE_TEST_SUITE_P( @@ -209,13 +140,10 @@ INSTANTIATE_TEST_SUITE_P( DummyOverlapBenchmark, testing::Combine( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), - /*M=*/testing::Values(pow(2,10), pow(2,15)), - /*K=*/testing::Values(pow(2,10), pow(2,15)), - /*N=*/testing::Values(pow(2,10)), - /*L=*/testing::Values(pow(2,15)), - /*number_of_streams=*/testing::Values(1, 8), - /*add_cuStreamWriteValue32*/testing::Values(false, true), - /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -223,10 +151,7 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<1>(info.param) << "_" << "K" << std::get<2>(info.param) << "_" << "N" << std::get<3>(info.param) << "_" - << "L" << std::get<4>(info.param) << "_" - << "Streams" << std::get<5>(info.param) << "_" - << ((std::get<6>(info.param))? 
"WithcuStreamWriteValue32_" : "") - << "Pgs" << std::get<7>(info.param); + << "L" << std::get<4>(info.param); return os.str(); }); @@ -333,14 +258,11 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { world->_allgather_base(ta_unsharded_j, ta_j)->wait(); if (add_cuStreamWriteValue32) { - if (!communicator_->deviceId()){ - std::cout << "writing to stream " << world->getCudaStream(communicator_->device()).stream() << " the value " << (cuuint32_t)(iteration * S + j) << ", communicator_->device()=" << communicator_->device() << ", world=" << world << ", number_of_pgs=" << number_of_pgs << " with MACRO=" << CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS << std::endl; - } cuStreamWriteValue32( #if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS (CUstream)world->getCudaStream(communicator_->device()).stream(), #else - // (CUstream)streams.at(stream_index).stream(), + (CUstream)streams.at(stream_index).stream(), #endif (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0); } From 8d927bf4d7537b2ae2450efd775c039c68ebffbe Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 06:45:27 -0800 Subject: [PATCH 16/20] add pre post comms option --- bench/test | 22 ++++++++++++--------- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/bench/test b/bench/test index cff8d8b34bb..28532970124 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE +EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,21 +10,25 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=8 +BACKEND=UCC M=131072 #32768 K=32768 N=32768 #1024 -L=32768 + +S=8 Streams=8 Pgs=1 + +L=32768 +# PRE_COMM="_pre_comm" +POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" # GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,8 +36,8 @@ MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" @@ -45,10 +49,10 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x 
CUDA_DEVICE_MAX_CONNECTIONS=2"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 85059b89a31..9898df02ac8 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -51,7 +51,9 @@ using DummyOverlapBenchmarkParams = std::tuple<
     /*M=*/int64_t,
     /*K=*/int64_t,
     /*N=*/int64_t,
-    /*L(communication msgsize)=*/int64_t>;
+    /*L(communication msgsize)=*/int64_t,
+    /*pre_comm=*/bool,
+    /*post_comm=*/bool>;
 
 class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<DummyOverlapBenchmarkParams> {
  protected:
@@ -81,7 +83,9 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
       M,
       K,
       N,
-      L] = GetParam();
+      L,
+      pre_comm,
+      post_comm] = GetParam();
 
   std::vector<int64_t> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
@@ -112,13 +116,20 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
       cudaEventRecord(start);
     }
 
-    setCurrentCUDAStream(communication_stream);
-    world->_allgather_base(dst, src)->wait();
+    if (pre_comm) {
+      setCurrentCUDAStream(communication_stream);
+      world->_allgather_base(dst, src)->wait();
+    }
 
     // compute
     setCurrentCUDAStream(compute_stream);
     torch::matmul_out(tc, ta, tb);
 
+    if (post_comm) {
+      setCurrentCUDAStream(communication_stream);
+      world->_allgather_base(dst, src)->wait();
+    }
+
     if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
@@ -143,7 +154,9 @@ INSTANTIATE_TEST_SUITE_P(
     /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
-    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))),
+    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
+    /*pre-comm=*/testing::Bool(),
+    /*post-comm=*/testing::Bool()),
     [](const testing::TestParamInfo<DummyOverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -151,7 +164,9 @@ INSTANTIATE_TEST_SUITE_P(
          << "M" << std::get<1>(info.param) << "_"
          << "K" << std::get<2>(info.param) << "_"
          << "N" << std::get<3>(info.param) << "_"
-         << "L" << std::get<4>(info.param);
+         << "L" << std::get<4>(info.param)
+         << ((std::get<5>(info.param))? "_pre_comm" : "")
+         << ((std::get<6>(info.param))? 
"_post_comm" : ""); return os.str(); }); From d9c581c13a9742b3896baf1bd37bc8bcd0acb923 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 06:45:27 -0800 Subject: [PATCH 17/20] add pre post comms option --- bench/test | 20 +++++++++++-------- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/bench/test b/bench/test index cff8d8b34bb..72c22480714 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE +EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,20 +11,24 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=8 M=131072 #32768 K=32768 N=32768 #1024 -L=32768 + +S=8 Streams=8 Pgs=1 + +L=32768 +# PRE_COMM="_pre_comm" +POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" # GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,8 +36,8 @@ MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" @@ -45,10 +49,10 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 85059b89a31..9898df02ac8 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,9 @@ using DummyOverlapBenchmarkParams = std::tuple< /*M=*/int64_t, /*K=*/int64_t, /*N=*/int64_t, - /*L(communication msgsize)=*/int64_t>; + /*L(communication msgsize)=*/int64_t, + /*pre_comm=*/bool, + /*post_comm=*/bool>; class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -81,7 +83,9 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { M, K, N, - L] = GetParam(); + L, + pre_comm, + post_comm] = GetParam(); std::vector all_ranks(communicator_->size()); std::iota(all_ranks.begin(), all_ranks.end(), 0); @@ -112,13 +116,20 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { cudaEventRecord(start); } - setCurrentCUDAStream(communication_stream); - world->_allgather_base(dst, 
src)->wait(); + if (pre_comm) { + setCurrentCUDAStream(communication_stream); + world->_allgather_base(dst, src)->wait(); + } // compute setCurrentCUDAStream(compute_stream); torch::matmul_out(tc, ta, tb); + if (post_comm) { + setCurrentCUDAStream(communication_stream); + world->_allgather_base(dst, src)->wait(); + } + if (iteration == iteration_profiler_end) { cudaProfilerStop();; } @@ -143,7 +154,9 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))), + /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*pre-comm=*/testing::Bool(), + /*post-comm=*/testing::Bool()), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -151,7 +164,9 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<1>(info.param) << "_" << "K" << std::get<2>(info.param) << "_" << "N" << std::get<3>(info.param) << "_" - << "L" << std::get<4>(info.param); + << "L" << std::get<4>(info.param) + << ((std::get<5>(info.param))? "_pre_comm" : "") + << ((std::get<6>(info.param))? "_post_comm" : ""); return os.str(); }); From bfc7fa6ac9e81d6b4a2552733cc9a76dc1c66635 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 6 Dec 2024 17:26:44 +0200 Subject: [PATCH 18/20] cleanup test script --- bench/test | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/bench/test b/bench/test index 72c22480714..969b8da00e2 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE +EXPERIMENT=Dummy_profile_UCC_TL_CUDA DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,7 +10,7 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL +BACKEND=UCC M=131072 #32768 K=32768 N=32768 #1024 @@ -20,8 +20,8 @@ Streams=8 Pgs=1 L=32768 -# PRE_COMM="_pre_comm" -POST_COMM="_post_comm" +PRE_COMM="_pre_comm" +# POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ @@ -33,28 +33,37 @@ export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO MPIFLAGS=" -np $NP" -MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" + +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO +# MPIFLAGS+=" -x NCCL_MAX_NCHANNELS=1" + # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" + +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" + # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" -# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" -# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x 
UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" + + +MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" -# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO From 1a1138cbb5629fd47c9d0c056ac21db68af2f77b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 8 Jan 2025 18:28:45 +0200 Subject: [PATCH 19/20] update --- bench/test | 27 +++++++++-------- csrc/multidevice/utils.cpp | 6 ++-- tests/cpp/test_multidevice_overlap.cpp | 42 ++++++++++++++++++-------- 3 files changed, 46 insertions(+), 29 deletions(-) diff --git a/bench/test b/bench/test index 969b8da00e2..1b5d6f41c5a 100755 --- a/bench/test +++ b/bench/test @@ -1,25 +1,19 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_UCC_TL_CUDA +EXPERIMENT=Dummy_profile_msgsize256m_float16_M128k_K128k_N32k_UCC_IB DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" -export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" - -mkdir -p $LOGS -export LOG_FILE_INFO="${LOGS}/info.txt" -echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO - NP=8 BACKEND=UCC M=131072 #32768 -K=32768 +K=131072 N=32768 #1024 S=8 Streams=8 Pgs=1 -L=32768 +L=1048576 #268435456 #67108864 #131072 PRE_COMM="_pre_comm" # POST_COMM="_post_comm" # UNFUSE="_unfused" @@ -41,7 +35,7 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" # MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" @@ -53,10 +47,10 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" -# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" +MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" @@ -65,6 +59,13 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" + + +export LOGS="${LOG_BASE}/${EXPERIMENT}_${BACKEND}_${DATE}" +mkdir -p $LOGS +export LOG_FILE_INFO="${LOGS}/info.txt" +echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO + echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp index d2117b222da..5eb4a8a21b9 100644 --- a/csrc/multidevice/utils.cpp +++ b/csrc/multidevice/utils.cpp @@ -43,11 +43,11 @@ std::unordered_set getShardedIterDomains(TensorView* tv) { // Returns whether a IterDomain in a TensorView is the outermost // allocated IterDomain in the TensorView. 
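// For instance, given a hypothetical loop domain [DIDx(d), iM, iK]:
// iM is outermost-allocated, because only a device-parallel axis precedes
// it, while iK is not, because the allocated axis iM comes before it.
// Device-parallel, reduction, and broadcast axes are skipped during the
// outside-in scan below.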
bool isOutermostAllocatedId(TensorView* tv, IterDomain* id) {
-  for (auto i : tv->getLoopDomain()) {
-    if (i == id) {
+  for (auto* loop_id : tv->getLoopDomain()) {
+    if (loop_id == id) {
       return true;
     }
-    if (!i->isDeviceDim() && !i->isReduction() && !i->isBroadcast()) {
+    if (!loop_id->isDeviceDim() && !loop_id->isReduction() && !loop_id->isBroadcast()) {
       return false;
     }
   }
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 9898df02ac8..a3999b477ba 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -73,10 +73,11 @@ class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamI
 std::map<std::string, float> DummyOverlapBenchmark::times = {};
 
 TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
-  constexpr int64_t number_of_warmups = 50;
-  constexpr int64_t number_of_iterations = 100;
-  constexpr int64_t iteration_profiler_start = 10;
-  constexpr int64_t iteration_profiler_end = 15;
+  constexpr int64_t number_of_warmups = 20;
+  constexpr int64_t number_of_iterations = 80;
+  constexpr int64_t total_number_of_iterations = number_of_warmups + number_of_iterations;
+  constexpr int64_t iteration_profiler_start = 5;
+  constexpr int64_t iteration_profiler_end = 10;
 
 
   const int64_t D = communicator_->size();
   auto [backend,
@@ -90,27 +91,36 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
   std::vector<int64_t> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
   auto world = communicator_->getBackendForTeam(all_ranks, backend);
+  auto nccl_world = communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl);
 
   std::vector<c10::cuda::CUDAStream> streams =
       createStreams(2, communicator_->deviceId());
   auto& compute_stream = streams.at(0);
   auto& communication_stream = streams.at(1);
 
-  auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
-  auto ta = at::randn({M, K}, options);
-  auto tb = at::randn({K, N}, options);
-  auto tc = at::empty({M, N}, options);
-  auto src = at::randn({L}, options);
-  auto dst = at::empty({L * communicator_->size()}, options);
+  auto options_matmul = at::TensorOptions().dtype(torch::kFloat16).device(communicator_->device());
+  auto ta = at::randn({M, K}, options_matmul);
+  auto tb = at::randn({K, N}, options_matmul);
+  auto tc = at::empty({M, N}, options_matmul);
+
+  auto options_comms = at::TensorOptions().dtype(torch::kFloat32).device(communicator_->device());
+  auto src = at::randn({L}, options_comms);
+  auto dst = at::empty({L * communicator_->size()}, options_comms);
+  std::vector<at::Tensor> barrier_scratch_buffer = {at::randn({1}, options_comms)};
 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
+  nccl_world->allreduce(barrier_scratch_buffer)->wait();
+
   for (const auto& iteration :
-       c10::irange(number_of_warmups + number_of_iterations)) {
+       c10::irange(total_number_of_iterations)) {
+    if (iteration % 10 == 0 && communicator_->deviceId() == 0) {
+      std::cout << "iteration " << iteration << "/" << total_number_of_iterations << std::endl;
+    }
     if (iteration == iteration_profiler_start) {
-      cudaProfilerStart();;
+      cudaProfilerStart();
     }
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
@@ -133,8 +143,14 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
     if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
+    if (!pre_comm && !post_comm) {
+      nccl_world->allreduce(barrier_scratch_buffer)->wait();
+    }
     synchronizeStreams(streams);
   }
+  if (pre_comm || post_comm) {
+    nccl_world->allreduce(barrier_scratch_buffer)->wait();
+  }
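// The one-element allreduce above doubles as a device-side barrier: the
// collective can only complete once every rank has joined it, so GPU work
// ordered after it is aligned across ranks, and the timed window measures
// overlap rather than rank-to-rank skew. A minimal sketch of the same idiom,
// assuming a c10d backend `pg` and CUDA tensor options `opts` (both names
// illustrative, not from this patch):
//
//   std::vector<at::Tensor> flag = {at::ones({1}, opts)};
//   pg->allreduce(flag)->wait();  // no rank's stream proceeds past this
//                                 // point until all ranks have contributed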
cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; @@ -154,7 +170,7 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*L=*/testing::Values(1, pow(2,10), pow(2,15), pow(2,17), pow(2,20), pow(2,24), pow(2,26), pow(2,28)), /*pre-comm=*/testing::Bool(), /*post-comm=*/testing::Bool()), [](const testing::TestParamInfo& info) From e037ee5b62418632055fe5f32f8659b0b4bc49d9 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 16 Jan 2025 02:38:52 -0800 Subject: [PATCH 20/20] test with stream parallel type and host IR --- bench/test | 34 ++++---- tests/cpp/test_multidevice_overlap.cpp | 111 ++++++++++++++++++++++++- 2 files changed, 127 insertions(+), 18 deletions(-) diff --git a/bench/test b/bench/test index 1b5d6f41c5a..6777835f7b4 100755 --- a/bench/test +++ b/bench/test @@ -1,28 +1,32 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_msgsize256m_float16_M128k_K128k_N32k_UCC_IB +EXPERIMENT=StreamParallelType_tests DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" NP=8 BACKEND=UCC -M=131072 #32768 -K=131072 -N=32768 #1024 +M=32768 +K=32768 +N=1024 S=8 -Streams=8 +Streams=3 Pgs=1 -L=1048576 #268435456 #67108864 #131072 -PRE_COMM="_pre_comm" +# M=131072 #32768 +# K=131072 +# N=32768 #1024 +# L=1048576 #268435456 #67108864 #131072 +# PRE_COMM="_pre_comm" # POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" -GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" -# GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" +# GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" +GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmarkStreamParallelType/" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +# GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,7 +36,7 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x NCCL_MAX_NCHANNELS=1" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" @@ -47,15 +51,15 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" -MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 
272d785e2a1..7cf3cd288a4 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -345,15 +345,120 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   }
 }
 
+TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
+  constexpr int64_t number_of_warmups = 50;
+  constexpr int64_t number_of_iterations = 200;
+  constexpr int64_t iteration_profiler_start = 10;
+  constexpr int64_t iteration_profiler_end = 15;
+
+  const int64_t D = communicator_->size();
+  auto [backend,
+        S,
+        M,
+        K,
+        N,
+        number_of_streams,
+        add_cuStreamWriteValue32,
+        number_of_pgs,
+        unfuse_loops,
+        use_cuda_graph] = GetParam();
+
+  if (M % (D * S) != 0) {
+    GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M
+                 << ", D = " << D << ", S = " << S;
+  }
+  if (add_cuStreamWriteValue32) {
+    GTEST_SKIP() << "cuStreamWriteValue32 not supported with StreamParallelType";
+  }
+  if (number_of_pgs > 1) {
+    GTEST_SKIP() << "StreamParallelType not supported with multiple process groups";
+  }
+  if (unfuse_loops) {
+    GTEST_SKIP() << "StreamParallelType not supported with unfused loops";
+  }
+  if (use_cuda_graph) {
+    GTEST_SKIP() << "StreamParallelType not supported with cuda graphs";
+  }
+
+
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* a = makeContigTensor(4); //[S, DIDx(D), M/(S*D), K]
+  TensorView* b = makeContigTensor(2); //[K, N]
+  TensorView* c = matmul(a, b); //[S, D, M/(S*D), N]
+
+  fusion->addInput(a);
+  fusion->addInput(b);
+  fusion->addOutput(c);
+
+  auto mesh = DeviceMesh::createForNumDevices(D);
+  a->setDeviceMesh(mesh);
+  b->setDeviceMesh(mesh);
+  c->setDeviceMesh(mesh);
+
+  a->axis(1)->parallelize(ParallelType::DIDx);
+  c->axis(0)->parallelize(ParallelType::Stream);
+
+  communicator_->setDefaultBackend(backend);
+
+  hir::HostIrEvaluatorParams params;
+  params.number_of_streams = number_of_streams;
+  MultiDeviceExecutor executor(std::move(fusion), *communicator_, params);
+
+
+  auto tensor_options =
+      at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
+  at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options);
+  at::Tensor ta = ta_unsharded.slice(
+      1, communicator_->deviceId(), communicator_->deviceId() + 1);
+  at::Tensor tb = at::randn({K, N}, tensor_options);
+  at::Tensor tc_ref = at::matmul(ta_unsharded, tb);
+
+  std::vector<c10::IValue> inputs = {ta, tb};
+  at::Tensor tc;
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  for (const auto& iteration :
+       c10::irange(number_of_warmups + number_of_iterations)) {
+    if (iteration == iteration_profiler_start) {
+      cudaProfilerStart();
+    }
+    if (iteration == number_of_warmups) {
+      cudaEventRecord(start);
+    }
+
+    tc = executor.runWithInput(inputs).at(0);
+
+    if (iteration == iteration_profiler_end) {
+      cudaProfilerStop();
+    }
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  milliseconds /= number_of_iterations;
+
+  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  times.insert({test_name, milliseconds});
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+
+  EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     ,
     OverlapBenchmark,
     testing::Combine(
         testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
         /*S=*/testing::Values(1,2,4,8, 16, 32),
-        /*M=*/testing::Values(pow(2,10), pow(2,15)),
-        /*K=*/testing::Values(pow(2,10), pow(2,15)),
-        /*N=*/testing::Values(pow(2,10)),
+        /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)),
+        /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)),
+        /*N=*/testing::Values(pow(2,10), pow(2,15)),
         /*number_of_streams=*/testing::Values(3, 8, 32),
         /*add_cuStreamWriteValue32*/testing::Values(false, true),
         /*number_of_pgs=*/testing::Values(1, 2, 4, 8),
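Every benchmark in this series shares the same timing skeleton: warm-up iterations first, a CUDA event pair bracketing the steady-state iterations, and a cudaProfilerStart()/cudaProfilerStop() pair around a few early iterations so that the nsys invocation in bench/test (--capture-range=cudaProfilerApi --capture-range-end stop) records only that short window. A condensed sketch of the skeleton, with run_one_iteration standing in for the benchmarked allgather/matmul body (the helper name and the exact constants are illustrative, not fixed by the patches):

#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

#include <functional>

// Condensed timing skeleton shared by the benchmarks above. Events are
// recorded on the current (default) stream; the tests synchronize their
// worker streams each iteration, so the events observe all timed work.
float timeSteadyStateMs(const std::function<void()>& run_one_iteration) {
  constexpr int kWarmups = 50, kIterations = 200;
  constexpr int kProfilerStart = 10, kProfilerEnd = 15;  // nsys capture window

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  for (int i = 0; i < kWarmups + kIterations; ++i) {
    if (i == kProfilerStart) {
      cudaProfilerStart();  // opens the --capture-range=cudaProfilerApi window
    }
    if (i == kWarmups) {
      cudaEventRecord(start);  // timing starts only after warm-up
    }
    run_one_iteration();
    if (i == kProfilerEnd) {
      cudaProfilerStop();  // nsys stops here (--capture-range-end stop)
    }
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);  // block until all timed GPU work has drained

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms / kIterations;  // average milliseconds per iteration
}

Note that the capture window (iterations 10-15 here) falls inside the warm-up phase, so profiler overhead never lands in the timed region; the event pair brackets only the iterations after warm-up.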