From 557797d0c201e292ef749439a9b32de703a55600 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 18 Jan 2025 13:17:52 -0800 Subject: [PATCH] Testing rng. --- CMakeLists.txt | 2 + csrc/codegen.cpp | 8 ++- csrc/device_lower/lower2device.cpp | 2 + csrc/device_lower/pass/rng.cpp | 111 +++++++++++++++++++++++++++++ csrc/device_lower/pass/rng.h | 16 +++++ tests/cpp/test_gpu4.cpp | 107 +++++++++++++++++++++++++++ 6 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 csrc/device_lower/pass/rng.cpp create mode 100644 csrc/device_lower/pass/rng.h create mode 100644 tests/cpp/test_gpu4.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d0c5db76b9..9441b8a7737 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/device_lower/pass/misaligned_vectorization.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/predicate.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/replace_size.cpp + ${NVFUSER_SRCS_DIR}/device_lower/pass/rng.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/scalar_hoist.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/unroll.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/vectorize_welford.cpp @@ -555,6 +556,7 @@ list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_gpu1.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu2.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu3.cpp + ${NVFUSER_ROOT}/tests/cpp/test_gpu4.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_compute_with.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_fused_reduction.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_indexing_ops.cpp diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index fe00c1f8105..efefd6d32e9 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -655,10 +655,15 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } void handle(const NamedScalar* ns) final { + std::cout << "Named scalar?" 
<< std::endl; if (ns->definition() != nullptr && alloc_set_.find(ns) == alloc_set_.end()) { + std::cout << "Def: " << ns->definition()->toString() << std::endl; code_ << genInline(ns->definition()); } else { + if (ns->definition()) { + std::cout << "Else: " << ns->definition()->toString() << std::endl; + } code_ << genVariableName(ns); } } @@ -811,7 +816,8 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } code_ << " = "; } - + // NVF_ERROR(uop->out()->isA(), "Cannot inline a named + // scalar."); if (auto op = inline_op_str(op_type)) { code_ << *op << gen(uop->in()); } else { diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 2ce917e6842..75ea2f4d284 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -282,6 +283,7 @@ GpuLower::GpuLower(Fusion* fusion, const CompileParams& cparams) generateConditionalFromPredicate}, {"vectorizeWelford", vectorizeWelford}, {"allocateCommonScalars", allocateCommonScalars}, + {"addRNG", addRNG}, {"insertMagicZero", insertMagicZero}, {"KIRCleaner", KIRCleaner::cleanUp}, {"instrumentKernel", instrumentKernel}, diff --git a/csrc/device_lower/pass/rng.cpp b/csrc/device_lower/pass/rng.cpp new file mode 100644 index 00000000000..4b1e4d45822 --- /dev/null +++ b/csrc/device_lower/pass/rng.cpp @@ -0,0 +1,111 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include + +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +namespace { + +class RNGInserter : public kir::ExprMutator { + public: + static std::vector insert(const std::vector& exprs) { + RNGInserter inserter(exprs); + return inserter.exprs_; + } + + private: + Val* rng_subseq; + Val* rng_offset; + struct InsertionInfo { + Scope* scope = nullptr; + ForLoop* fl = nullptr; + }; + + RNGInserter(const std::vector& exprs) { + NVF_ERROR(!exprs.empty()); + auto neg_1 = IrBuilder::create(-1, DataType::Index); + auto rng_subseq = + IrBuilder::create("rng_subseq", DataType::Index); + auto rng_offset = + IrBuilder::create("rng_offset", DataType::Index); + kir::ExprMutator::registerInsertBefore( + exprs.front(), + IrBuilder::create( + LoadStoreOpType::Set, rng_subseq, neg_1)); + kir::ExprMutator::registerInsertBefore( + exprs.front(), + IrBuilder::create( + LoadStoreOpType::Set, rng_offset, neg_1)); + kir::ExprMutator::traverseAndInsert(exprs); + } + + void handle(RNGOp* rng_op) final { + std::cout << rng_op->toString() << std::endl; + // auto linear_index = rng_op->getPhiloxIndex(); + // auto multiple = rng_op->getPhiloxMultiple(); + // auto rng_subseq = SimplifyingIrBuilder::div(linear_index, multiple); + // auto rng_component = SimplifyingIrBuilder::mod(linear_index, multiple); + // auto rng_offset = rng_op->getRNGOffsetVal(); + + // nvfuser_index_t rng_offset215 = (((ptr2 == nullptr) ? i3 : ((*ptr2) + + // i3)) / 4LL); + // if (rng_subseq != rng_subseq215 || rng_offset != rng_offset215) { + // rng_result = philox(((ptr0 == nullptr) ? 
i1 : (*ptr0)), + // rng_subseq215, rng_offset215); rng_subseq = rng_subseq215; rng_offset + // = rng_offset215; + // } + // T1[i5] = rng_uniformf(rng_result, rng_component215); + // } + + // if (fl->isUnrolled()) { + // if (scope_.empty()) { + // kir::ExprMutator::registerInsertAfter( + // fl, IrBuilder::create()); + // } else { + // NVF_ERROR( + // !scope_.back()->exprs().empty(), "Not expecting an empty loop."); + // kir::ExprMutator::registerInsertAfter( + // fl, IrBuilder::create(), scope_.back()); + // } + // } else { + // kir::ExprMutator::handle(fl); + // } + // NVF_THROW("TEST"); + } + + std::vector insertion_list_; +}; + +} // namespace + +std::vector addRNG(const std::vector& exprs) { + FUSER_PERF_SCOPE("GpuLower::Lower::addRNG"); + // Check if any RNG op is even used, if not we don't have to define or + // update RNG state. + const auto gpu_lower = GpuLower::current(); + auto kernel = gpu_lower->kernel(); + const bool has_rng = std::any_of( + kernel->exprs().begin(), kernel->exprs().end(), [](Expr* expr) { + return expr->isA(); + }); + + if (!has_rng) { + return exprs; + } + + return RNGInserter::insert(exprs); +} + +} // namespace nvfuser diff --git a/csrc/device_lower/pass/rng.h b/csrc/device_lower/pass/rng.h new file mode 100644 index 00000000000..c0cb8c3aef1 --- /dev/null +++ b/csrc/device_lower/pass/rng.h @@ -0,0 +1,16 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include +#include +#include + +namespace nvfuser { +std::vector addRNG(const std::vector& exprs); +} // namespace nvfuser diff --git a/tests/cpp/test_gpu4.cpp b/tests/cpp/test_gpu4.cpp new file mode 100644 index 00000000000..c3a79862aad --- /dev/null +++ b/tests/cpp/test_gpu4.cpp @@ -0,0 +1,107 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. 
+ * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include "parallel_dimension_map.h" + +namespace nvfuser { + +using namespace at::indexing; + +TEST_F(NVFuserTest, IntRNG_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto input_tv = makeContigConcreteTensor({4 * 128 * 4}); + fusion.addInput(input_tv); + + constexpr float kDropoutProbability = 0.9; + constexpr float kScale = 1.0f / kDropoutProbability; + + auto prob = IrBuilder::create(kDropoutProbability); + auto scale = IrBuilder::create(kScale); + + // dropout start + auto rand_vals = rand_like(input_tv); + auto mask = lt(rand_vals, prob); + auto apply_mask = mul(input_tv, mask); + auto output_tv = mul(apply_mask, scale); + // dropout end + // fusion.addOutput(mask); + fusion.addOutput(output_tv); + + auto inp_cache = input_tv->cacheAfter(); + output_tv->cacheBefore(); + + output_tv->split(0, 4); + output_tv->split(0, 128); + output_tv->axis(0)->parallelize(ParallelType::BIDx); + + TransformPropagator propagator(output_tv); + MaxLogicalDomainInfoSpanningTree spanning_tree(output_tv); + spanning_tree.traverse(&propagator); + scheduler_utils::parallelizeAllLike(output_tv); + + inp_cache->axis(-1)->parallelize(ParallelType::Vectorize); + rand_vals->axis(-1)->parallelize(ParallelType::Unroll); + output_tv->axis(-1)->parallelize(ParallelType::Vectorize); + + inlineMost(); + + fusion.printMath(); + fusion.printKernel(); +} + +} // namespace nvfuser