From 557797d0c201e292ef749439a9b32de703a55600 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 18 Jan 2025 13:17:52 -0800 Subject: [PATCH] Testing rng. --- CMakeLists.txt | 2 + csrc/codegen.cpp | 8 ++- csrc/device_lower/lower2device.cpp | 2 + csrc/device_lower/pass/rng.cpp | 111 +++++++++++++++++++++++++++++ csrc/device_lower/pass/rng.h | 16 +++++ tests/cpp/test_gpu4.cpp | 107 +++++++++++++++++++++++++++ 6 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 csrc/device_lower/pass/rng.cpp create mode 100644 csrc/device_lower/pass/rng.h create mode 100644 tests/cpp/test_gpu4.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d0c5db76b9..9441b8a7737 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/device_lower/pass/misaligned_vectorization.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/predicate.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/replace_size.cpp + ${NVFUSER_SRCS_DIR}/device_lower/pass/rng.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/scalar_hoist.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/unroll.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/vectorize_welford.cpp @@ -555,6 +556,7 @@ list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_gpu1.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu2.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu3.cpp + ${NVFUSER_ROOT}/tests/cpp/test_gpu4.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_compute_with.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_fused_reduction.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_indexing_ops.cpp diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index fe00c1f8105..efefd6d32e9 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -655,10 +655,15 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } void handle(const NamedScalar* ns) final { + std::cout << "Named scalar?" 
<< std::endl; if (ns->definition() != nullptr && alloc_set_.find(ns) == alloc_set_.end()) { + std::cout << "Def: " << ns->definition()->toString() << std::endl; code_ << genInline(ns->definition()); } else { + if (ns->definition()) { + std::cout << "Else: " << ns->definition()->toString() << std::endl; + } code_ << genVariableName(ns); } } @@ -811,7 +816,8 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } code_ << " = "; } - + // NVF_ERROR(uop->out()->isA(), "Cannot inline a named + // scalar."); if (auto op = inline_op_str(op_type)) { code_ << *op << gen(uop->in()); } else { diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 2ce917e6842..75ea2f4d284 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -282,6 +283,7 @@ GpuLower::GpuLower(Fusion* fusion, const CompileParams& cparams) generateConditionalFromPredicate}, {"vectorizeWelford", vectorizeWelford}, {"allocateCommonScalars", allocateCommonScalars}, + {"addRNG", addRNG}, {"insertMagicZero", insertMagicZero}, {"KIRCleaner", KIRCleaner::cleanUp}, {"instrumentKernel", instrumentKernel}, diff --git a/csrc/device_lower/pass/rng.cpp b/csrc/device_lower/pass/rng.cpp new file mode 100644 index 00000000000..4b1e4d45822 --- /dev/null +++ b/csrc/device_lower/pass/rng.cpp @@ -0,0 +1,111 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include + +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +namespace { + +class RNGInserter : public kir::ExprMutator { + public: + static std::vector insert(const std::vector& exprs) { + RNGInserter inserter(exprs); + return inserter.exprs_; + } + + private: + Val* rng_subseq; + Val* rng_offset; + struct InsertionInfo { + Scope* scope = nullptr; + ForLoop* fl = nullptr; + }; + + RNGInserter(const std::vector& exprs) { + NVF_ERROR(!exprs.empty()); + auto neg_1 = IrBuilder::create(-1, DataType::Index); + auto rng_subseq = + IrBuilder::create("rng_subseq", DataType::Index); + auto rng_offset = + IrBuilder::create("rng_offset", DataType::Index); + kir::ExprMutator::registerInsertBefore( + exprs.front(), + IrBuilder::create( + LoadStoreOpType::Set, rng_subseq, neg_1)); + kir::ExprMutator::registerInsertBefore( + exprs.front(), + IrBuilder::create( + LoadStoreOpType::Set, rng_offset, neg_1)); + kir::ExprMutator::traverseAndInsert(exprs); + } + + void handle(RNGOp* rng_op) final { + std::cout << rng_op->toString() << std::endl; + // auto linear_index = rng_op->getPhiloxIndex(); + // auto multiple = rng_op->getPhiloxMultiple(); + // auto rng_subseq = SimplifyingIrBuilder::div(linear_index, multiple); + // auto rng_component = SimplifyingIrBuilder::mod(linear_index, multiple); + // auto rng_offset = rng_op->getRNGOffsetVal(); + + // nvfuser_index_t rng_offset215 = (((ptr2 == nullptr) ? i3 : ((*ptr2) + + // i3)) / 4LL); + // if (rng_subseq != rng_subseq215 || rng_offset != rng_offset215) { + // rng_result = philox(((ptr0 == nullptr) ? 
i1 : (*ptr0)), + // rng_subseq215, rng_offset215); rng_subseq = rng_subseq215; rng_offset + // = rng_offset215; + // } + // T1[i5] = rng_uniformf(rng_result, rng_component215); + // } + + // if (fl->isUnrolled()) { + // if (scope_.empty()) { + // kir::ExprMutator::registerInsertAfter( + // fl, IrBuilder::create()); + // } else { + // NVF_ERROR( + // !scope_.back()->exprs().empty(), "Not expecting an empty loop."); + // kir::ExprMutator::registerInsertAfter( + // fl, IrBuilder::create(), scope_.back()); + // } + // } else { + // kir::ExprMutator::handle(fl); + // } + // NVF_THROW("TEST"); + } + + std::vector insertion_list_; +}; + +} // namespace + +std::vector addRNG(const std::vector& exprs) { + FUSER_PERF_SCOPE("GpuLower::Lower::addRNG"); + // Check if any RNG op is even used, if not we don't have to define or + // update RNG state. + const auto gpu_lower = GpuLower::current(); + auto kernel = gpu_lower->kernel(); + const bool has_rng = std::any_of( + kernel->exprs().begin(), kernel->exprs().end(), [](Expr* expr) { + return expr->isA(); + }); + + if (!has_rng) { + return exprs; + } + + return RNGInserter::insert(exprs); +} + +} // namespace nvfuser diff --git a/csrc/device_lower/pass/rng.h b/csrc/device_lower/pass/rng.h new file mode 100644 index 00000000000..c0cb8c3aef1 --- /dev/null +++ b/csrc/device_lower/pass/rng.h @@ -0,0 +1,16 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include +#include +#include + +namespace nvfuser { +std::vector addRNG(const std::vector& exprs); +} // namespace nvfuser diff --git a/tests/cpp/test_gpu4.cpp b/tests/cpp/test_gpu4.cpp new file mode 100644 index 00000000000..c3a79862aad --- /dev/null +++ b/tests/cpp/test_gpu4.cpp @@ -0,0 +1,107 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. 
+ * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include "parallel_dimension_map.h" + +namespace nvfuser { + +using namespace at::indexing; + +TEST_F(NVFuserTest, IntRNG_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto input_tv = makeContigConcreteTensor({4 * 128 * 4}); + fusion.addInput(input_tv); + + constexpr float kDropoutProbability = 0.9; + constexpr float kScale = 1.0f / kDropoutProbability; + + auto prob = IrBuilder::create(kDropoutProbability); + auto scale = IrBuilder::create(kScale); + + // dropout start + auto rand_vals = rand_like(input_tv); + auto mask = lt(rand_vals, prob); + auto apply_mask = mul(input_tv, mask); + auto output_tv = mul(apply_mask, scale); + // dropout end + // fusion.addOutput(mask); + fusion.addOutput(output_tv); + + auto inp_cache = input_tv->cacheAfter(); + output_tv->cacheBefore(); + + output_tv->split(0, 4); + output_tv->split(0, 128); + output_tv->axis(0)->parallelize(ParallelType::BIDx); + + TransformPropagator propagator(output_tv); + MaxLogicalDomainInfoSpanningTree spanning_tree(output_tv); + spanning_tree.traverse(&propagator); + scheduler_utils::parallelizeAllLike(output_tv); + + inp_cache->axis(-1)->parallelize(ParallelType::Vectorize); + rand_vals->axis(-1)->parallelize(ParallelType::Unroll); + output_tv->axis(-1)->parallelize(ParallelType::Vectorize); + + inlineMost(); + + fusion.printMath(); + fusion.printKernel(); +} + +} // namespace nvfuser