From 6ff775aefda74705d099f8d415605502e7b82b3f Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Fri, 20 Dec 2024 11:59:32 -0800 Subject: [PATCH 01/19] Add transformer benchmark skeleton based off of a bert test --- CMakeLists.txt | 1 + benchmarks/cpp/transformer.cpp | 111 +++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 benchmarks/cpp/transformer.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 91c3076d4ba..7733fa6a5aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -755,6 +755,7 @@ if(BUILD_NVFUSER_BENCHMARK) ${NVFUSER_ROOT}/benchmarks/cpp/softmax_backward.cpp ${NVFUSER_ROOT}/benchmarks/cpp/softmax_dropout.cpp ${NVFUSER_ROOT}/benchmarks/cpp/timm.cpp + ${NVFUSER_ROOT}/benchmarks/cpp/transformer.cpp ${NVFUSER_ROOT}/benchmarks/cpp/transpose.cpp ${NVFUSER_ROOT}/benchmarks/cpp/utils.cpp ${NVFUSER_ROOT}/tests/cpp/utils.cpp diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp new file mode 100644 index 00000000000..f17bb20a3fc --- /dev/null +++ b/benchmarks/cpp/transformer.cpp @@ -0,0 +1,111 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include + +using namespace nvfuser; + +// Return reduction tensor view and output of reduction +static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { + FusionGuard fg(fusion); + + bool is_fp16 = dtype == DataType::Half; + + TensorView* tv0 = TensorViewBuilder() + .ndims(4) + .dtype(dtype) + .contiguity({true, std::nullopt, std::nullopt, true}) + .shape({-1, 1, 1, -1}) + .build(); + TensorView* tv1 = makeContigTensor(4, dtype); + + fusion->addInput(tv0); + fusion->addInput(tv1); + + // TODO: should be input + auto d16 = IrBuilder::create(1.0); + + if (is_fp16) { + tv0 = castOp(DataType::Float, tv0); + tv1 = castOp(DataType::Float, tv1); + } + + auto tv2 = div(tv1, d16); + auto tv3 = add(tv2, tv0); + + auto tv10 = softmax(tv3, 3); + auto dropout_tvs = dropout(tv10, IrBuilder::create(0.9)); + auto tv12 = dropout_tvs.mask; + auto tv14 = dropout_tvs.output; + + if (is_fp16) { + tv14 = castOp(DataType::Half, tv14); + tv10 = castOp(DataType::Half, tv10); + tv3 = castOp(DataType::Half, tv3); + } + + fusion->addOutput(tv14); + fusion->addOutput(tv12); + fusion->addOutput(tv10); + fusion->addOutput(tv3); +} + +static void NvFuserScheduler_DivMaxSoftDropFwd( + benchmark::State& benchmark_state, + FusionExecutorCache* executor_cache, + DataType dtype) { + auto w = benchmark_state.range(0); + auto x = benchmark_state.range(1); + auto y = benchmark_state.range(2); + auto z = benchmark_state.range(3); + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({w, 1, 1, z}, options); + at::Tensor t1 = at::randn({w, x, y, z}, options); + + std::vector at_inputs = {t0, t1}; + + auto bytes = + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); + + benchmark_state.SetBytesProcessed( + bytes * int64_t(benchmark_state.iterations())); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + nick_transformer, + setupDivMaxSoftmaxDropoutForward, + NvFuserScheduler_DivMaxSoftDropFwd, + DataType::Float); + 
+NVFUSER_BENCHMARK_RUN(nick_transformer) + // ->RangeMultiplier(2) + ->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); From c66336be6cf80b13b2ee5ae9667fc7ac0d48a0f8 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Fri, 20 Dec 2024 13:10:45 -0800 Subject: [PATCH 02/19] savE --- benchmarks/cpp/transformer.cpp | 73 +++++++++------------- tests/cpp/test_multidevice_transformer.cpp | 1 + 2 files changed, 31 insertions(+), 43 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index f17bb20a3fc..4854178f15f 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -23,55 +23,42 @@ #include #include +#include using namespace nvfuser; +namespace { +// Note: We test on smaller model and input sizes to avoid high error +// accumulation for validation. +static constexpr int64_t B = 2, E = 768, H = 16, S = 128; +// Note: Dropout probabilities are set to 0. Since the dropout mask is sharded +// it throws off the seed offset between the sharded nvFuser program and the +// unsharded reference. +static constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; +// Note parameters scaled by kParamScale following weight initialization +// recommendations: +// https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range +static constexpr double kParamScale = 0.02; +} // namespace + // Return reduction tensor view and output of reduction -static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { +static void setupTransformerForward(Fusion* fusion, DataType dtype) { + FusionGuard fg(fusion); - bool is_fp16 = dtype == DataType::Half; - - TensorView* tv0 = TensorViewBuilder() - .ndims(4) - .dtype(dtype) - .contiguity({true, std::nullopt, std::nullopt, true}) - .shape({-1, 1, 1, -1}) - .build(); - TensorView* tv1 = makeContigTensor(4, dtype); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // TODO: should be input - auto d16 = IrBuilder::create(1.0); - - if (is_fp16) { - tv0 = castOp(DataType::Float, tv0); - tv1 = castOp(DataType::Float, tv1); - } - - auto tv2 = div(tv1, d16); - auto tv3 = add(tv2, tv0); - - auto tv10 = softmax(tv3, 3); - auto dropout_tvs = dropout(tv10, IrBuilder::create(0.9)); - auto tv12 = dropout_tvs.mask; - auto tv14 = dropout_tvs.output; - - if (is_fp16) { - tv14 = castOp(DataType::Half, tv14); - tv10 = castOp(DataType::Half, tv10); - tv3 = castOp(DataType::Half, tv3); - } - - fusion->addOutput(tv14); - fusion->addOutput(tv12); - fusion->addOutput(tv10); - fusion->addOutput(tv3); + auto* communicator_ = Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit + + const int64_t D = communicator_->size(); // number of devices + + NVF_ERROR((4 * E) % D == 0, "Requires number of devices ", D, "evenly divide 4*E=", 4*E); + + std::unique_ptr model = std::make_unique( + D, B, E, H, S, kDropoutProb, kSdpaProb); + + const auto mesh = DeviceMesh::createForNumDevices(D); } -static void NvFuserScheduler_DivMaxSoftDropFwd( +static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { @@ -100,8 +87,8 @@ static void NvFuserScheduler_DivMaxSoftDropFwd( NVFUSER_BENCHMARK_DEFINE( nick_transformer, - setupDivMaxSoftmaxDropoutForward, - NvFuserScheduler_DivMaxSoftDropFwd, + setupTransformerForward, + NvFuserScheduler_TransformerFwd, DataType::Float); 
NVFUSER_BENCHMARK_RUN(nick_transformer) diff --git a/tests/cpp/test_multidevice_transformer.cpp b/tests/cpp/test_multidevice_transformer.cpp index 6ccb217137f..8db0db5915a 100644 --- a/tests/cpp/test_multidevice_transformer.cpp +++ b/tests/cpp/test_multidevice_transformer.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include From 4005cc379798ceefbaa554a4ee938fc6a65ad045 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Fri, 20 Dec 2024 14:04:15 -0800 Subject: [PATCH 03/19] fusion definition --- CMakeLists.txt | 1 + benchmarks/cpp/transformer.cpp | 17 ++++++----------- tests/cpp/multidevice_transformer.cpp | 14 +++++++++++--- tests/cpp/multidevice_transformer.h | 4 ++++ 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7733fa6a5aa..47e5c019399 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -759,6 +759,7 @@ if(BUILD_NVFUSER_BENCHMARK) ${NVFUSER_ROOT}/benchmarks/cpp/transpose.cpp ${NVFUSER_ROOT}/benchmarks/cpp/utils.cpp ${NVFUSER_ROOT}/tests/cpp/utils.cpp + ${NVFUSER_ROOT}/tests/cpp/multidevice_transformer.cpp ) add_executable(nvfuser_bench ${BENCHMARK_SRCS}) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 4854178f15f..e9e3b463e53 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -43,25 +43,20 @@ static constexpr double kParamScale = 0.02; // Return reduction tensor view and output of reduction static void setupTransformerForward(Fusion* fusion, DataType dtype) { - - FusionGuard fg(fusion); - - auto* communicator_ = Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit + Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit const int64_t D = communicator_->size(); // number of devices - NVF_ERROR((4 * E) % D == 0, "Requires number of devices ", D, "evenly divide 4*E=", 4*E); - std::unique_ptr model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); - const auto mesh = DeviceMesh::createForNumDevices(D); + model->setupForward(fusion, dtype, /*sequence_parallel*/false); } static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, - DataType dtype) { + DataType dtype) { /* auto w = benchmark_state.range(0); auto x = benchmark_state.range(1); auto y = benchmark_state.range(2); @@ -80,18 +75,18 @@ static void NvFuserScheduler_TransformerFwd( runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( - bytes * int64_t(benchmark_state.iterations())); + bytes * int64_t(benchmark_state.iterations()));*/ } //------------------------------------------------------------------------------ NVFUSER_BENCHMARK_DEFINE( - nick_transformer, + TransformerForward, setupTransformerForward, NvFuserScheduler_TransformerFwd, DataType::Float); -NVFUSER_BENCHMARK_RUN(nick_transformer) +NVFUSER_BENCHMARK_RUN(TransformerForward) // ->RangeMultiplier(2) ->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}}) ->Unit(benchmark::kMicrosecond) diff --git a/tests/cpp/multidevice_transformer.cpp b/tests/cpp/multidevice_transformer.cpp index fe552b6606a..67b3e48aedc 100644 --- a/tests/cpp/multidevice_transformer.cpp +++ b/tests/cpp/multidevice_transformer.cpp @@ -392,11 +392,13 @@ std::vector DistributedTransformer::mha_backwards( linear0_grads.grad_x}; } -std::unique_ptr DistributedTransformer::forward( +/* NVFuser 
benchmark manages the unique_ptr for Fusion and FusionExecutorCache, + so update the raw pointer with this setupForward function */ +void DistributedTransformer::setupForward( + Fusion *fusion, DataType dtype, bool sequence_parallel) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); + FusionGuard fg(fusion); const auto mesh = DeviceMesh::createForNumDevices(D); TensorView* x = sequence_parallel @@ -478,7 +480,13 @@ std::unique_ptr DistributedTransformer::forward( shardBetween({mha_in}, {mha_tvs.output}, mha_w0); shardBetween({mlp_in}, {mlp_tvs.output}, mlp_w0); } +} +std::unique_ptr DistributedTransformer::forward( + DataType dtype, + bool sequence_parallel) { + auto fusion = std::make_unique(); + setupForward(fusion.get(), dtype, sequence_parallel); return std::make_unique(std::move(fusion)); } diff --git a/tests/cpp/multidevice_transformer.h b/tests/cpp/multidevice_transformer.h index 33a3f759926..43c0b06bf80 100644 --- a/tests/cpp/multidevice_transformer.h +++ b/tests/cpp/multidevice_transformer.h @@ -45,6 +45,10 @@ class DistributedTransformer { kDropoutProb(dropout_prob), kSdpaProb(sdpa_dropout_prob) {} + void setupForward( + Fusion *fusion, + DataType dtype, + bool sequence_parallel = false); std::unique_ptr forward( DataType dtype, bool sequence_parallel = false); From 83b51bc74b3c3907a990b718f00409d8e1191118 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Mon, 23 Dec 2024 15:21:19 -0800 Subject: [PATCH 04/19] add at_inputs, compiling --- benchmarks/cpp/transformer.cpp | 117 +++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index e9e3b463e53..65672bcfd43 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -53,6 +54,61 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { model->setupForward(fusion, dtype, /*sequence_parallel*/false); } +static std::vector reference_mlp( + at::Tensor x, + at::Tensor w0, + at::Tensor b0, + at::Tensor w1, + at::Tensor b1) { + auto at_dtype = w0.dtype(); + auto linear0 = at::linear(x, w0, b0); + auto gelu = at::gelu(linear0.to(at::kFloat), "tanh").to(at_dtype); + auto linear1 = at::linear(gelu, w1, b1).to(at::kFloat); + auto [dropout, mask] = at::native_dropout(linear1, kDropoutProb, true); + return {linear0, gelu, linear1, dropout, mask}; +} + +static std::vector reference_mha( + at::Tensor x, + at::Tensor w0, + at::Tensor b0, + at::Tensor w1, + at::Tensor b1) { + auto linear0 = at::linear(x, w0, b0); + auto qkv = linear0.view({B, S, 3 * E}).split(E, 2); + for (auto i = 0; i < 3; i++) { + qkv[i] = qkv[i].reshape({B, S, H, E / H}).transpose(1, 2); + } + auto sdpa_out = at::_scaled_dot_product_flash_attention( + qkv[0], qkv[1], qkv[2], kSdpaProb, true, false, kSdpaScale); + auto sdpa = std::get<0>(sdpa_out); + // Reassemble heads (B, H, S, E/H) to (B, S, H, E/H) to (B, S, E) + auto y = sdpa.transpose(1, 2).reshape({B * S, E}); + auto linear1 = at::linear(y, w1, b1).to(at::kFloat); + auto [dropout, mask] = at::native_dropout(linear1, kDropoutProb, true); + return {linear0, sdpa, linear1, dropout, mask}; +} + +static at::Tensor transformerShardTensor_Mesh( + at::Tensor tensor, + const int64_t axis, + const DeviceMesh& mesh, + Communicator* communicator_) { + const auto device_id = communicator_->deviceId(); + return nvfuser::shardTensor(tensor, axis, mesh, device_id); +} + +static at::Tensor 
transformerShardTensor(at::Tensor tensor, TensorView* tv, Communicator* communicator_) { + if (!isSharded(tv)) { + return tensor; + } + NVF_ERROR(tv->hasDeviceMesh(), "`tv` has no DeviceMesh: ", tv); + return transformerShardTensor_Mesh( + tensor, + getShardedLogicalAxis(tv, ParallelType::DIDx), + tv->getDeviceMesh(), communicator_); +} + static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, @@ -76,6 +132,67 @@ static void NvFuserScheduler_TransformerFwd( benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations()));*/ + + Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit + const int64_t D = communicator_->size(); // number of devices + + at::ScalarType at_dtype = data_type_to_aten(dtype); + const auto mesh = DeviceMesh::createForNumDevices(D); + constexpr float kEps = 1e-5; + std::vector norm_shape{E}; + + const auto options = + at::TensorOptions().dtype(at_dtype).device(communicator_->device()); + auto x_ = at::randn({B * S, E}, options); + auto ln0_w_ = at::randn(E, options).to(at::kFloat); + auto ln0_b_ = at::randn(E, options).to(at::kFloat); + auto mha_w0_ = at::randn({3 * E, E}, options) * kParamScale; + auto mha_b0_ = at::randn({3 * E}, options) * kParamScale; + auto mha_w1_ = at::randn({E, E}, options) * kParamScale; + auto mha_b1_ = at::randn({E}, options) * kParamScale; + auto ln1_w_ = at::randn(E, options).to(at::kFloat); + auto ln1_b_ = at::randn(E, options).to(at::kFloat); + auto mlp_w0_ = at::randn({4 * E, E}, options) * kParamScale; + auto mlp_b0_ = at::randn({4 * E}, options) * kParamScale; + auto mlp_w1_ = at::randn({E, 4 * E}, options) * kParamScale; + auto mlp_b1_ = at::randn({E}, options) * kParamScale; + + at::manual_seed(getATenRandomSeed()); + auto x_float_ = x_.to(at::kFloat); + auto ln0_ = at::native_layer_norm(x_float_, norm_shape, ln0_w_, ln0_b_, kEps); + auto ln0_out_ = std::get<0>(ln0_); + + auto mha_out_ = reference_mha( + ln0_out_.to(at_dtype), mha_w0_, mha_b0_, mha_w1_, mha_b1_)[3]; + + auto resid0_ = mha_out_ + x_float_; + auto ln1_ = at::native_layer_norm(resid0_, norm_shape, ln1_w_, ln1_b_, kEps); + auto ln1_out_ = std::get<0>(ln1_); + + auto mlp_out_ = reference_mlp( + ln1_out_.to(at_dtype), mlp_w0_, mlp_b0_, mlp_w1_, mlp_b1_)[3]; + auto at_out = (resid0_ + mlp_out_).to(at_dtype); + + std::vector at_inputs = { + x_, + ln0_w_, + ln0_b_, + transformerShardTensor_Mesh(mha_w0_.view({3, E, E}), 1, mesh, communicator_).view({1, 3 * E / D, E}), + transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_).view({1, 3 * E / D}), + transformerShardTensor_Mesh(mha_w1_, 1, mesh, communicator_).unsqueeze(0), + mha_b1_, + ln1_w_, + ln1_b_, + transformerShardTensor_Mesh(mlp_w0_, 0, mesh, communicator_).unsqueeze(0), + transformerShardTensor_Mesh(mlp_b0_, 0, mesh, communicator_).unsqueeze(0), + transformerShardTensor_Mesh(mlp_w1_, 1, mesh, communicator_).unsqueeze(0), + mlp_b1_}; + + auto bytes = + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); + + benchmark_state.SetBytesProcessed( + bytes * int64_t(benchmark_state.iterations())); } //------------------------------------------------------------------------------ From 528caa024dfb798f8136b8327726658b018e56b0 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Mon, 23 Dec 2024 15:30:07 -0800 Subject: [PATCH 05/19] forward working --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 65672bcfd43..9eb2843d2f5 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -201,7 +201,7 @@ NVFUSER_BENCHMARK_DEFINE( TransformerForward, setupTransformerForward, NvFuserScheduler_TransformerFwd, - DataType::Float); + DataType::BFloat16); NVFUSER_BENCHMARK_RUN(TransformerForward) // ->RangeMultiplier(2) From aad41ebd4cffcd897fe4b72c7089d33f444a3901 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Mon, 23 Dec 2024 15:44:52 -0800 Subject: [PATCH 06/19] remove unused code --- benchmarks/cpp/transformer.cpp | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 9eb2843d2f5..d74c07fcd80 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -112,27 +112,7 @@ static at::Tensor transformerShardTensor(at::Tensor tensor, TensorView* tv, Comm static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, - DataType dtype) { /* - auto w = benchmark_state.range(0); - auto x = benchmark_state.range(1); - auto y = benchmark_state.range(2); - auto z = benchmark_state.range(3); - - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({w, 1, 1, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - - std::vector at_inputs = {t0, t1}; - - auto bytes = - runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); - - benchmark_state.SetBytesProcessed( - bytes * int64_t(benchmark_state.iterations()));*/ - + DataType dtype) { Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit const int64_t D = communicator_->size(); // number of devices From 8b46e4d3358b903c88227d65ed3871139cf1220f Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Wed, 8 Jan 2025 14:05:57 -0800 Subject: [PATCH 07/19] add debug prints and comm cleanup --- benchmarks/cpp/main.cpp | 8 ++++++++ benchmarks/cpp/transformer.cpp | 4 ++++ csrc/multidevice/communicator.cpp | 8 ++++++++ 3 files changed, 20 insertions(+) diff --git a/benchmarks/cpp/main.cpp b/benchmarks/cpp/main.cpp index 30a159bc661..69cbfad271c 100644 --- a/benchmarks/cpp/main.cpp +++ b/benchmarks/cpp/main.cpp @@ -69,6 +69,7 @@ void addGPUBenchmarkContext() { // Copied from BENCHMARK_MAIN with extra custom settings int main(int argc, char** argv) { + Communicator* communicator_ = &Communicator::getInstance(); ::benchmark::Initialize(&argc, argv); if (::benchmark::ReportUnrecognizedArguments(argc, argv)) { return 1; @@ -90,6 +91,13 @@ int main(int argc, char** argv) { ::benchmark::RunSpecifiedBenchmarks(); + printf("calling comm cleanup, size=%ld, did=%ld\n", communicator_->size(), communicator_->deviceId()); + Communicator::getInstance().cleanup(); + printf("done calling comm cleanup, size=%ld, did=%ld\n", communicator_->size(), communicator_->deviceId()); + ::benchmark::Shutdown(); + + + return 0; } diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index d74c07fcd80..66384fb7e0c 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -116,6 +116,10 @@ static void NvFuserScheduler_TransformerFwd( Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call 
Communicator::getInstance().cleanup() somewhere before program exit const int64_t D = communicator_->size(); // number of devices + printf("did=%ld in fwd before barrier\n", communicator_->deviceId());fflush(0); + communicator_->barrier(); + printf("did=%ld in fwd after barrier\n", communicator_->deviceId());fflush(0); + at::ScalarType at_dtype = data_type_to_aten(dtype); const auto mesh = DeviceMesh::createForNumDevices(D); constexpr float kEps = 1e-5; diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 6cf1a499bb9..f4e4eb9f136 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -193,6 +193,8 @@ Communicator::Communicator( is_available_ = parseEnv( rank_, size_, local_rank_, local_size_, master_addr_, master_port_); + printf("rank=%ld, size=%ld, local_rank_=%ld, local_size_=%ld\n", rank_, size_, local_rank_, local_size_); + if (!is_available_) { return; } @@ -232,11 +234,15 @@ void Communicator::cleanup() { "likely because Communicator::cleanup was called more than once"); cleaned_up = true; + printf("entered cleanup on rank %ld\n", rank_); + // Without this, the TCPStore server can be cleaned up before TCPStore // clients are created, causing an hang. This happened with // test_multidevice.py::test_sizes_and_ranks. if (is_available()) { + printf("calling barrier on rank %ld\n", rank_); barrier(); + printf("done calling barrier on rank %ld\n", rank_); } store_ = nullptr; @@ -251,7 +257,9 @@ void Communicator::cleanup() { // Call shutdown before destructing a ProcessGroupNCCL as instructed by // https://github.com/pytorch/pytorch/blob/e62073d7997c9e63896cb5289ffd0874a8cc1838/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1164-L1170. if (auto* pg_nccl = dynamic_cast(backend.get())) { + printf("pg shutdown on rank %ld\n", rank_); pg_nccl->shutdown(); + printf("done calling pg shutdown on rank %ld\n", rank_); } } #endif From 569c24c0dad7107bc13a6cf4702c62884f872e6e Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 9 Jan 2025 08:39:46 -0500 Subject: [PATCH 08/19] Update benchmarks/cpp/transformer.cpp Co-authored-by: Jingyue Wu --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 66384fb7e0c..855215513dd 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -51,7 +51,7 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { std::unique_ptr model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); - model->setupForward(fusion, dtype, /*sequence_parallel*/false); + model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } static std::vector reference_mlp( From 403367eed5fdc712dd14d83741d0fbae27efeabc Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 09:02:53 -0800 Subject: [PATCH 09/19] working with multiple ranks, disabled cupti profiling and set iters to 1 --- benchmarks/cpp/transformer.cpp | 64 +++++----------------------------- 1 file changed, 9 insertions(+), 55 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 855215513dd..6c512968063 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -31,7 +31,7 @@ using namespace nvfuser; namespace { // Note: We test on smaller model and input sizes to avoid high error // accumulation for validation. 
-static constexpr int64_t B = 2, E = 768, H = 16, S = 128; +static constexpr int64_t B = 2, E = 32/*768*/, H = 2/*16*/, S = 32/*128*/; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. @@ -48,47 +48,14 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { const int64_t D = communicator_->size(); // number of devices + ProfilerOptionsGuard::getCurOptions().set(ProfilerOption::EnableNocupti); + std::unique_ptr model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } -static std::vector reference_mlp( - at::Tensor x, - at::Tensor w0, - at::Tensor b0, - at::Tensor w1, - at::Tensor b1) { - auto at_dtype = w0.dtype(); - auto linear0 = at::linear(x, w0, b0); - auto gelu = at::gelu(linear0.to(at::kFloat), "tanh").to(at_dtype); - auto linear1 = at::linear(gelu, w1, b1).to(at::kFloat); - auto [dropout, mask] = at::native_dropout(linear1, kDropoutProb, true); - return {linear0, gelu, linear1, dropout, mask}; -} - -static std::vector reference_mha( - at::Tensor x, - at::Tensor w0, - at::Tensor b0, - at::Tensor w1, - at::Tensor b1) { - auto linear0 = at::linear(x, w0, b0); - auto qkv = linear0.view({B, S, 3 * E}).split(E, 2); - for (auto i = 0; i < 3; i++) { - qkv[i] = qkv[i].reshape({B, S, H, E / H}).transpose(1, 2); - } - auto sdpa_out = at::_scaled_dot_product_flash_attention( - qkv[0], qkv[1], qkv[2], kSdpaProb, true, false, kSdpaScale); - auto sdpa = std::get<0>(sdpa_out); - // Reassemble heads (B, H, S, E/H) to (B, S, H, E/H) to (B, S, E) - auto y = sdpa.transpose(1, 2).reshape({B * S, E}); - auto linear1 = at::linear(y, w1, b1).to(at::kFloat); - auto [dropout, mask] = at::native_dropout(linear1, kDropoutProb, true); - return {linear0, sdpa, linear1, dropout, mask}; -} - static at::Tensor transformerShardTensor_Mesh( at::Tensor tensor, const int64_t axis, @@ -116,13 +83,13 @@ static void NvFuserScheduler_TransformerFwd( Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit const int64_t D = communicator_->size(); // number of devices - printf("did=%ld in fwd before barrier\n", communicator_->deviceId());fflush(0); - communicator_->barrier(); - printf("did=%ld in fwd after barrier\n", communicator_->deviceId());fflush(0); + // printf("did=%ld in fwd before barrier\n", communicator_->deviceId());fflush(0); + // communicator_->barrier(); + // printf("did=%ld in fwd after barrier\n", communicator_->deviceId());fflush(0); + printf("did=%ld in fwd\n", communicator_->deviceId());fflush(0); at::ScalarType at_dtype = data_type_to_aten(dtype); const auto mesh = DeviceMesh::createForNumDevices(D); - constexpr float kEps = 1e-5; std::vector norm_shape{E}; const auto options = @@ -142,20 +109,6 @@ static void NvFuserScheduler_TransformerFwd( auto mlp_b1_ = at::randn({E}, options) * kParamScale; at::manual_seed(getATenRandomSeed()); - auto x_float_ = x_.to(at::kFloat); - auto ln0_ = at::native_layer_norm(x_float_, norm_shape, ln0_w_, ln0_b_, kEps); - auto ln0_out_ = std::get<0>(ln0_); - - auto mha_out_ = reference_mha( - ln0_out_.to(at_dtype), mha_w0_, mha_b0_, mha_w1_, mha_b1_)[3]; - - auto resid0_ = mha_out_ + x_float_; - auto ln1_ = at::native_layer_norm(resid0_, norm_shape, ln1_w_, ln1_b_, kEps); - auto ln1_out_ = std::get<0>(ln1_); - - auto mlp_out_ = reference_mlp( - ln1_out_.to(at_dtype), 
mlp_w0_, mlp_b0_, mlp_w1_, mlp_b1_)[3]; - auto at_out = (resid0_ + mlp_out_).to(at_dtype); std::vector at_inputs = { x_, @@ -189,6 +142,7 @@ NVFUSER_BENCHMARK_DEFINE( NVFUSER_BENCHMARK_RUN(TransformerForward) // ->RangeMultiplier(2) - ->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}}) + ->Ranges({{8, 8}}) + ->Iterations(1) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); From 68cf0dbebb0499de50b9978b615e23cf9a3a911a Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 09:29:42 -0800 Subject: [PATCH 10/19] remove --- benchmarks/cpp/transformer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 6c512968063..7b7f77de852 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -48,8 +48,6 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { const int64_t D = communicator_->size(); // number of devices - ProfilerOptionsGuard::getCurOptions().set(ProfilerOption::EnableNocupti); - std::unique_ptr model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); From dadeecc4bbc504aae67348b011d2707050d2da41 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 09:39:40 -0800 Subject: [PATCH 11/19] remove debug prints --- benchmarks/cpp/main.cpp | 5 ----- benchmarks/cpp/transformer.cpp | 13 +++---------- csrc/multidevice/communicator.cpp | 8 -------- 3 files changed, 3 insertions(+), 23 deletions(-) diff --git a/benchmarks/cpp/main.cpp b/benchmarks/cpp/main.cpp index 69cbfad271c..3d78e6cac67 100644 --- a/benchmarks/cpp/main.cpp +++ b/benchmarks/cpp/main.cpp @@ -69,7 +69,6 @@ void addGPUBenchmarkContext() { // Copied from BENCHMARK_MAIN with extra custom settings int main(int argc, char** argv) { - Communicator* communicator_ = &Communicator::getInstance(); ::benchmark::Initialize(&argc, argv); if (::benchmark::ReportUnrecognizedArguments(argc, argv)) { return 1; @@ -91,13 +90,9 @@ int main(int argc, char** argv) { ::benchmark::RunSpecifiedBenchmarks(); - printf("calling comm cleanup, size=%ld, did=%ld\n", communicator_->size(), communicator_->deviceId()); Communicator::getInstance().cleanup(); - printf("done calling comm cleanup, size=%ld, did=%ld\n", communicator_->size(), communicator_->deviceId()); ::benchmark::Shutdown(); - - return 0; } diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 7b7f77de852..b223f265099 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -31,7 +31,7 @@ using namespace nvfuser; namespace { // Note: We test on smaller model and input sizes to avoid high error // accumulation for validation. -static constexpr int64_t B = 2, E = 32/*768*/, H = 2/*16*/, S = 32/*128*/; +static constexpr int64_t B = 2, E = 768, H = 16, S = 128; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. 
@@ -78,14 +78,9 @@ static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { - Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit + Communicator* communicator_ = &Communicator::getInstance(); const int64_t D = communicator_->size(); // number of devices - // printf("did=%ld in fwd before barrier\n", communicator_->deviceId());fflush(0); - // communicator_->barrier(); - // printf("did=%ld in fwd after barrier\n", communicator_->deviceId());fflush(0); - printf("did=%ld in fwd\n", communicator_->deviceId());fflush(0); - at::ScalarType at_dtype = data_type_to_aten(dtype); const auto mesh = DeviceMesh::createForNumDevices(D); std::vector norm_shape{E}; @@ -139,8 +134,6 @@ NVFUSER_BENCHMARK_DEFINE( DataType::BFloat16); NVFUSER_BENCHMARK_RUN(TransformerForward) - // ->RangeMultiplier(2) - ->Ranges({{8, 8}}) - ->Iterations(1) + ->Iterations(10) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index f4e4eb9f136..6cf1a499bb9 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -193,8 +193,6 @@ Communicator::Communicator( is_available_ = parseEnv( rank_, size_, local_rank_, local_size_, master_addr_, master_port_); - printf("rank=%ld, size=%ld, local_rank_=%ld, local_size_=%ld\n", rank_, size_, local_rank_, local_size_); - if (!is_available_) { return; } @@ -234,15 +232,11 @@ void Communicator::cleanup() { "likely because Communicator::cleanup was called more than once"); cleaned_up = true; - printf("entered cleanup on rank %ld\n", rank_); - // Without this, the TCPStore server can be cleaned up before TCPStore // clients are created, causing an hang. This happened with // test_multidevice.py::test_sizes_and_ranks. if (is_available()) { - printf("calling barrier on rank %ld\n", rank_); barrier(); - printf("done calling barrier on rank %ld\n", rank_); } store_ = nullptr; @@ -257,9 +251,7 @@ void Communicator::cleanup() { // Call shutdown before destructing a ProcessGroupNCCL as instructed by // https://github.com/pytorch/pytorch/blob/e62073d7997c9e63896cb5289ffd0874a8cc1838/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1164-L1170. 
if (auto* pg_nccl = dynamic_cast(backend.get())) { - printf("pg shutdown on rank %ld\n", rank_); pg_nccl->shutdown(); - printf("done calling pg shutdown on rank %ld\n", rank_); } } #endif From 86e55691b47383f3d40c43900516a5ad5e4c1f60 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 9 Jan 2025 13:11:36 -0500 Subject: [PATCH 12/19] Update benchmarks/cpp/transformer.cpp Co-authored-by: Jingyue Wu --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index b223f265099..cf2516804bc 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -48,7 +48,7 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { const int64_t D = communicator_->size(); // number of devices - std::unique_ptr model = std::make_unique( + auto model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); model->setupForward(fusion, dtype, /*sequence_parallel=*/false); From 8604cce7c9a826eac4dd8cd7df674f404fc954f3 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 9 Jan 2025 13:12:10 -0500 Subject: [PATCH 13/19] Update benchmarks/cpp/transformer.cpp Co-authored-by: Jingyue Wu --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index cf2516804bc..3926e1e5429 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -74,7 +74,7 @@ static at::Tensor transformerShardTensor(at::Tensor tensor, TensorView* tv, Comm tv->getDeviceMesh(), communicator_); } -static void NvFuserScheduler_TransformerFwd( +static void transformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { From a3c228c868ed1690f4fd78e81c1adeb81df8d855 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 10:16:11 -0800 Subject: [PATCH 14/19] review feedback --- benchmarks/cpp/transformer.cpp | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 3926e1e5429..330f6eeb566 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -31,20 +31,20 @@ using namespace nvfuser; namespace { // Note: We test on smaller model and input sizes to avoid high error // accumulation for validation. -static constexpr int64_t B = 2, E = 768, H = 16, S = 128; + constexpr int64_t B = 2, E = 768, H = 16, S = 128; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. 
-static constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; + constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; // Note parameters scaled by kParamScale following weight initialization // recommendations: // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range -static constexpr double kParamScale = 0.02; + constexpr double kParamScale = 0.02; } // namespace // Return reduction tensor view and output of reduction -static void setupTransformerForward(Fusion* fusion, DataType dtype) { - Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit +void setupTransformerForward(Fusion* fusion, DataType dtype) { + Communicator* communicator_ = &Communicator::getInstance(); const int64_t D = communicator_->size(); // number of devices @@ -63,17 +63,6 @@ static at::Tensor transformerShardTensor_Mesh( return nvfuser::shardTensor(tensor, axis, mesh, device_id); } -static at::Tensor transformerShardTensor(at::Tensor tensor, TensorView* tv, Communicator* communicator_) { - if (!isSharded(tv)) { - return tensor; - } - NVF_ERROR(tv->hasDeviceMesh(), "`tv` has no DeviceMesh: ", tv); - return transformerShardTensor_Mesh( - tensor, - getShardedLogicalAxis(tv, ParallelType::DIDx), - tv->getDeviceMesh(), communicator_); -} - static void transformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, From b0c65bc8dc8af0e60742c404237bf37e94ac248c Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 10:19:04 -0800 Subject: [PATCH 15/19] update --- benchmarks/cpp/transformer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 330f6eeb566..2e6ef6d9ff0 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -54,7 +54,7 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) { model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } -static at::Tensor transformerShardTensor_Mesh( +at::Tensor transformerShardTensor_Mesh( at::Tensor tensor, const int64_t axis, const DeviceMesh& mesh, @@ -63,7 +63,7 @@ static at::Tensor transformerShardTensor_Mesh( return nvfuser::shardTensor(tensor, axis, mesh, device_id); } -static void transformerFwd( +void transformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { @@ -119,7 +119,7 @@ static void transformerFwd( NVFUSER_BENCHMARK_DEFINE( TransformerForward, setupTransformerForward, - NvFuserScheduler_TransformerFwd, + transformerFwd, DataType::BFloat16); NVFUSER_BENCHMARK_RUN(TransformerForward) From d34a9023fd181c88a01ca4a32aa0711804ad1e74 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 10:33:49 -0800 Subject: [PATCH 16/19] linter --- benchmarks/cpp/transformer.cpp | 17 ++++++++++------- tests/cpp/multidevice_transformer.cpp | 2 +- tests/cpp/multidevice_transformer.h | 2 +- tests/cpp/test_multidevice_transformer.cpp | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 2e6ef6d9ff0..4728d6c114f 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -23,23 +23,23 @@ #include #include -#include #include +#include using namespace nvfuser; namespace { // Note: We test on smaller model and input sizes to avoid high error // accumulation for 
validation. - constexpr int64_t B = 2, E = 768, H = 16, S = 128; +constexpr int64_t B = 2, E = 768, H = 16, S = 128; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. - constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; +constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; // Note parameters scaled by kParamScale following weight initialization // recommendations: // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range - constexpr double kParamScale = 0.02; +constexpr double kParamScale = 0.02; } // namespace // Return reduction tensor view and output of reduction @@ -49,7 +49,7 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) { const int64_t D = communicator_->size(); // number of devices auto model = std::make_unique( - D, B, E, H, S, kDropoutProb, kSdpaProb); + D, B, E, H, S, kDropoutProb, kSdpaProb); model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } @@ -96,8 +96,11 @@ void transformerFwd( x_, ln0_w_, ln0_b_, - transformerShardTensor_Mesh(mha_w0_.view({3, E, E}), 1, mesh, communicator_).view({1, 3 * E / D, E}), - transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_).view({1, 3 * E / D}), + transformerShardTensor_Mesh( + mha_w0_.view({3, E, E}), 1, mesh, communicator_) + .view({1, 3 * E / D, E}), + transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_) + .view({1, 3 * E / D}), transformerShardTensor_Mesh(mha_w1_, 1, mesh, communicator_).unsqueeze(0), mha_b1_, ln1_w_, diff --git a/tests/cpp/multidevice_transformer.cpp b/tests/cpp/multidevice_transformer.cpp index 67b3e48aedc..38917151004 100644 --- a/tests/cpp/multidevice_transformer.cpp +++ b/tests/cpp/multidevice_transformer.cpp @@ -395,7 +395,7 @@ std::vector DistributedTransformer::mha_backwards( /* NVFuser benchmark manages the unique_ptr for Fusion and FusionExecutorCache, so update the raw pointer with this setupForward function */ void DistributedTransformer::setupForward( - Fusion *fusion, + Fusion* fusion, DataType dtype, bool sequence_parallel) { FusionGuard fg(fusion); diff --git a/tests/cpp/multidevice_transformer.h b/tests/cpp/multidevice_transformer.h index 43c0b06bf80..caf2a250ced 100644 --- a/tests/cpp/multidevice_transformer.h +++ b/tests/cpp/multidevice_transformer.h @@ -46,7 +46,7 @@ class DistributedTransformer { kSdpaProb(sdpa_dropout_prob) {} void setupForward( - Fusion *fusion, + Fusion* fusion, DataType dtype, bool sequence_parallel = false); std::unique_ptr forward( diff --git a/tests/cpp/test_multidevice_transformer.cpp b/tests/cpp/test_multidevice_transformer.cpp index 8db0db5915a..1b90f1d0b65 100644 --- a/tests/cpp/test_multidevice_transformer.cpp +++ b/tests/cpp/test_multidevice_transformer.cpp @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include #include #include From 437fdd1c2f376ed7f9e92d0b493a662aba63a152 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 12:15:52 -0800 Subject: [PATCH 17/19] review feedback --- benchmarks/cpp/transformer.cpp | 40 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 4728d6c114f..fab2bd4cdcd 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -28,7 +28,6 @@ using namespace nvfuser; -namespace { // 
Note: We test on smaller model and input sizes to avoid high error // accumulation for validation. constexpr int64_t B = 2, E = 768, H = 16, S = 128; @@ -40,13 +39,12 @@ constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; // recommendations: // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range constexpr double kParamScale = 0.02; -} // namespace // Return reduction tensor view and output of reduction void setupTransformerForward(Fusion* fusion, DataType dtype) { - Communicator* communicator_ = &Communicator::getInstance(); + Communicator* communicator = &Communicator::getInstance(); - const int64_t D = communicator_->size(); // number of devices + const int64_t D = communicator->size(); // number of devices auto model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); @@ -54,28 +52,30 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) { model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } -at::Tensor transformerShardTensor_Mesh( - at::Tensor tensor, - const int64_t axis, - const DeviceMesh& mesh, - Communicator* communicator_) { - const auto device_id = communicator_->deviceId(); +namespace { + at::Tensor shardTensor( + at::Tensor tensor, + const int64_t axis, + const DeviceMesh& mesh, + Communicator* communicator) { + const auto device_id = communicator->deviceId(); return nvfuser::shardTensor(tensor, axis, mesh, device_id); } +} void transformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { - Communicator* communicator_ = &Communicator::getInstance(); - const int64_t D = communicator_->size(); // number of devices + Communicator* communicator = &Communicator::getInstance(); + const int64_t D = communicator->size(); // number of devices at::ScalarType at_dtype = data_type_to_aten(dtype); const auto mesh = DeviceMesh::createForNumDevices(D); std::vector norm_shape{E}; const auto options = - at::TensorOptions().dtype(at_dtype).device(communicator_->device()); + at::TensorOptions().dtype(at_dtype).device(communicator->device()); auto x_ = at::randn({B * S, E}, options); auto ln0_w_ = at::randn(E, options).to(at::kFloat); auto ln0_b_ = at::randn(E, options).to(at::kFloat); @@ -96,18 +96,18 @@ void transformerFwd( x_, ln0_w_, ln0_b_, - transformerShardTensor_Mesh( - mha_w0_.view({3, E, E}), 1, mesh, communicator_) + shardTensor( + mha_w0_.view({3, E, E}), 1, mesh, communicator) .view({1, 3 * E / D, E}), - transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_) + shardTensor(mha_b0_.view({3, E}), 1, mesh, communicator) .view({1, 3 * E / D}), - transformerShardTensor_Mesh(mha_w1_, 1, mesh, communicator_).unsqueeze(0), + shardTensor(mha_w1_, 1, mesh, communicator).unsqueeze(0), mha_b1_, ln1_w_, ln1_b_, - transformerShardTensor_Mesh(mlp_w0_, 0, mesh, communicator_).unsqueeze(0), - transformerShardTensor_Mesh(mlp_b0_, 0, mesh, communicator_).unsqueeze(0), - transformerShardTensor_Mesh(mlp_w1_, 1, mesh, communicator_).unsqueeze(0), + shardTensor(mlp_w0_, 0, mesh, communicator).unsqueeze(0), + shardTensor(mlp_b0_, 0, mesh, communicator).unsqueeze(0), + shardTensor(mlp_w1_, 1, mesh, communicator).unsqueeze(0), mlp_b1_}; auto bytes = From 9742a9015a4ed604521438183a6136316898b3ee Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 12:20:18 -0800 Subject: [PATCH 18/19] lint --- benchmarks/cpp/transformer.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git 
a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index fab2bd4cdcd..66bb1759b13 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -53,15 +53,15 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) { } namespace { - at::Tensor shardTensor( - at::Tensor tensor, - const int64_t axis, - const DeviceMesh& mesh, - Communicator* communicator) { +at::Tensor shardTensor( + at::Tensor tensor, + const int64_t axis, + const DeviceMesh& mesh, + Communicator* communicator) { const auto device_id = communicator->deviceId(); return nvfuser::shardTensor(tensor, axis, mesh, device_id); } -} +} // namespace void transformerFwd( benchmark::State& benchmark_state, @@ -96,8 +96,7 @@ void transformerFwd( x_, ln0_w_, ln0_b_, - shardTensor( - mha_w0_.view({3, E, E}), 1, mesh, communicator) + shardTensor(mha_w0_.view({3, E, E}), 1, mesh, communicator) .view({1, 3 * E / D, E}), shardTensor(mha_b0_.view({3, E}), 1, mesh, communicator) .view({1, 3 * E / D}), From 450596e27bbb9640b4f392c3f79ffa000afb98c5 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Fri, 10 Jan 2025 10:15:20 -0500 Subject: [PATCH 19/19] remove unused variable --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 66bb1759b13..e21e81be987 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -34,7 +34,7 @@ constexpr int64_t B = 2, E = 768, H = 16, S = 128; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. -constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; +constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0; // Note parameters scaled by kParamScale following weight initialization // recommendations: // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range
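
Note on running the new benchmark (not part of the patches above): a minimal sketch of a launch command, assuming the usual multi-GPU setup in which nvFuser's Communicator reads its rank/size from the launcher's environment; the launcher, process count, and binary path are assumptions rather than something stated in this series:

    mpirun -np 8 ./nvfuser_bench --benchmark_filter=TransformerForward

--benchmark_filter is the standard Google Benchmark selection flag, and TransformerForward is the name registered by NVFUSER_BENCHMARK_RUN in benchmarks/cpp/transformer.cpp; each rank runs the same binary, shards its own inputs with the shardTensor helper, and then times the fusion via runBenchmarkIterations.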