diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d0c5db76b9..9441b8a7737 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,6 +115,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/device_lower/pass/misaligned_vectorization.cpp
   ${NVFUSER_SRCS_DIR}/device_lower/pass/predicate.cpp
   ${NVFUSER_SRCS_DIR}/device_lower/pass/replace_size.cpp
+  ${NVFUSER_SRCS_DIR}/device_lower/pass/rng.cpp
   ${NVFUSER_SRCS_DIR}/device_lower/pass/scalar_hoist.cpp
   ${NVFUSER_SRCS_DIR}/device_lower/pass/unroll.cpp
   ${NVFUSER_SRCS_DIR}/device_lower/pass/vectorize_welford.cpp
@@ -555,6 +556,7 @@ list(APPEND JIT_TEST_SRCS
   ${NVFUSER_ROOT}/tests/cpp/test_gpu1.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_gpu2.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_gpu3.cpp
+  ${NVFUSER_ROOT}/tests/cpp/test_gpu4.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_gpu_compute_with.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_gpu_fused_reduction.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_gpu_indexing_ops.cpp
diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp
index 9571425b7e7..20567e6a36c 100644
--- a/csrc/codegen.cpp
+++ b/csrc/codegen.cpp
@@ -265,7 +265,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
     } else if (v->isA<TensorView>()) {
       tv = v->as<TensorView>();
     }
-    if (tv && aligned_array_of_regs_.count(tv)) {
+    if (tv &&
+        (aligned_array_of_regs_.count(tv) ||
+         tv->getMemoryType() == MemoryType::Local)) {
       return genVariableName(tv).append(".array");
     } else {
       return genVariableName(v);
@@ -358,7 +360,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
     const auto& kernel_summary = kernel_->summary();

     if (kernel_summary.has_philox_op) {
-      indent() << "uint4 rng_result;\n";
+      indent() << "Array<unsigned int, 4> rng_result;\n";
       indent() << "nvfuser_index_t rng_subseq = -1;\n";
       indent() << "nvfuser_index_t rng_offset = -1;\n";
     }
@@ -3169,14 +3171,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
         break;
       case MemoryType::Local: {
         auto va = kernel_->summary().vectorized_accesses;
+        indent() << "Array<" << buffer_dtype << ", " << genInline(size)
+                 << ", " << (va.find(tv) != va.end() ? va.at(tv) : 1) << "> "
+                 << genVariableName(tv) << ";\n";
         if (va.find(tv) != va.end()) {
-          indent() << "Array<" << buffer_dtype << ", " << genInline(size)
-                   << ", " << va.at(tv) << "> " << genVariableName(tv)
-                   << ";\n";
           aligned_array_of_regs_.insert(tv);
-        } else {
-          indent() << buffer_dtype << " " << genVariableName(tv) << "["
-                   << genInline(size) << "];\n";
         }
       } break;
       default:
diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp
index 2ce917e6842..75ea2f4d284 100644
--- a/csrc/device_lower/lower2device.cpp
+++ b/csrc/device_lower/lower2device.cpp
@@ -27,6 +27,7 @@
 #include <device_lower/pass/misaligned_vectorization.h>
 #include <device_lower/pass/predicate.h>
 #include <device_lower/pass/replace_size.h>
+#include <device_lower/pass/rng.h>
 #include <device_lower/pass/scalar_hoist.h>
 #include <device_lower/pass/unroll.h>
 #include <device_lower/pass/vectorize_welford.h>
@@ -282,6 +283,7 @@ GpuLower::GpuLower(Fusion* fusion, const CompileParams& cparams)
        generateConditionalFromPredicate},
       {"vectorizeWelford", vectorizeWelford},
       {"allocateCommonScalars", allocateCommonScalars},
+      {"addRNG", addRNG},
       {"insertMagicZero", insertMagicZero},
       {"KIRCleaner", KIRCleaner::cleanUp},
       {"instrumentKernel", instrumentKernel},
diff --git a/csrc/device_lower/pass/rng.cpp b/csrc/device_lower/pass/rng.cpp
new file mode 100644
index 00000000000..4eb96fbe163
--- /dev/null
+++ b/csrc/device_lower/pass/rng.cpp
@@ -0,0 +1,190 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#include <device_lower/pass/rng.h>
+
+#include <device_lower/lower2device.h>
+#include <instrumentation.h>
+#include <ir/builder.h>
+#include <ir/utils.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+
+namespace nvfuser {
+
+namespace {
+
+std::tuple<Val*, kir::Allocate*> createAndAllocNS(
+    std::string name,
+    DataType dtype = DataType::Index) {
+  Val* val = IrBuilder::create<NamedScalar>(name, dtype);
+  auto alloc = IrBuilder::create<kir::Allocate>(
+      val, MemoryType::Local, GpuLower::current()->kernel()->oneVal());
+  return std::make_tuple(val, alloc);
+}
+
+class RNGInserter : public kir::ExprMutator {
+ public:
+  static std::vector<Expr*> insert(const std::vector<Expr*>& exprs) {
+    RNGInserter inserter(exprs);
+    return inserter.exprs_;
+  }
+
+ private:
+  Val* rng_subseq = nullptr;
+  Val* rng_offset = nullptr;
+  TensorView* rng_result = nullptr;
+  const std::vector<Expr*>& exprs;
+
+  struct InsertionInfo {
+    Scope* scope = nullptr;
+    ForLoop* fl = nullptr;
+  };
+
+  RNGInserter(const std::vector<Expr*>& _exprs) : exprs(_exprs) {
+    kir::ExprMutator::traverseAndInsert(exprs);
+  }
+
+  void handle(RNGOp* rop) final {
+    // Emit the prologue (rng_subseq/rng_offset/rng_result declarations and
+    // initializations) once, before the first RNG op.
+    if (rng_subseq == nullptr) {
+      NVF_ERROR(!exprs.empty());
+      auto neg_1 = IrBuilder::create<Val>(-1, DataType::Index);
+      auto subseq_tuple = createAndAllocNS("rng_subseq");
+      kir::ExprMutator::registerInsertBefore(
+          exprs.front(), std::get<1>(subseq_tuple), nullptr);
+      kir::ExprMutator::registerInsertBefore(
+          exprs.front(),
+          IrBuilder::create<LoadStoreOp>(
+              LoadStoreOpType::Set, std::get<0>(subseq_tuple), neg_1),
+          nullptr);
+
+      rng_subseq = std::get<0>(subseq_tuple);
+
+      auto offset_tuple = createAndAllocNS("rng_offset");
+      kir::ExprMutator::registerInsertBefore(
+          exprs.front(), std::get<1>(offset_tuple), nullptr);
+      kir::ExprMutator::registerInsertBefore(
+          exprs.front(),
+          IrBuilder::create<LoadStoreOp>(
+              LoadStoreOpType::Set, std::get<0>(offset_tuple), neg_1),
+          nullptr);
+
+      rng_offset = std::get<0>(offset_tuple);
+
+      rng_result = TensorViewBuilder()
+                       .shape(std::vector<int64_t>{4})
+                       .dtype(DataType::UInt64)
+                       .contiguity(true)
+                       .build();
+      rng_result->setMemoryType(MemoryType::Local);
+
+      auto rng_result_alloc =
+          IrBuilder::create<kir::Allocate>(rng_result, MemoryType::Local);
+      kir::ExprMutator::registerInsertBefore(
+          exprs.front(), rng_result_alloc, nullptr);
+    }
+
+    auto index_tuple =
+        createAndAllocNS("linear_index" + std::to_string(rop->name()));
+    kir::ExprMutator::registerInsertBefore(rop, std::get<1>(index_tuple));
+    kir::ExprMutator::registerInsertBefore(
+        rop,
+        IrBuilder::create<LoadStoreOp>(
+            LoadStoreOpType::Set,
+            std::get<0>(index_tuple),
+            rop->getPhiloxIndex()));
+
+    auto multiple =
+        IrBuilder::create<Val>(rop->getPhiloxMultiple(), DataType::Index);
+
+    auto rop_subseq_tuple =
+        createAndAllocNS("rng_subseq" + std::to_string(rop->name()));
+    kir::ExprMutator::registerInsertBefore(rop, std::get<1>(rop_subseq_tuple));
+    kir::ExprMutator::registerInsertBefore(
+        rop,
+        IrBuilder::create<BinaryOp>(
+            BinaryOpType::Div,
+            std::get<0>(rop_subseq_tuple),
+            std::get<0>(index_tuple),
+            multiple));
+
+    auto rop_component_tuple =
+        createAndAllocNS("rng_component" + std::to_string(rop->name()));
+    kir::ExprMutator::registerInsertBefore(
+        rop, std::get<1>(rop_component_tuple));
+    kir::ExprMutator::registerInsertBefore(
+        rop,
+        IrBuilder::create<BinaryOp>(
+            BinaryOpType::Mod,
+            std::get<0>(rop_component_tuple),
+            std::get<0>(index_tuple),
+            multiple));
+
+    auto rop_offset_tuple =
+        createAndAllocNS("rng_offset" + std::to_string(rop->name()));
+    kir::ExprMutator::registerInsertBefore(rop, std::get<1>(rop_offset_tuple));
+    kir::ExprMutator::registerInsertBefore(
+        rop,
+        IrBuilder::create<LoadStoreOp>(
+            LoadStoreOpType::Set,
+            std::get<0>(rop_offset_tuple),
+            rop->getRNGOffsetVal()));
+
+    kir::IfThenElse* ite = IrBuilder::create<kir::IfThenElse>(
+        IrBuilder::create<kir::Predicate>(SimplifyingIrBuilder::logicalOrExpr(
+            SimplifyingIrBuilder::neExpr(
+                rng_subseq, std::get<0>(rop_subseq_tuple)),
+            SimplifyingIrBuilder::neExpr(
+                rng_offset, std::get<0>(rop_offset_tuple)))));
+
+    // Recompute the Philox result with this op's subsequence/offset, then
+    // cache those values so subsequent RNG ops that match can skip the call.
+    ite->thenBody().push_back(IrBuilder::create<TernaryOp>(
+        TernaryOpType::Philox,
+        rng_result,
+        rop->getRNGSeedVal(),
+        std::get<0>(rop_subseq_tuple),
+        std::get<0>(rop_offset_tuple)));
+
+    ite->thenBody().push_back(IrBuilder::create<LoadStoreOp>(
+        LoadStoreOpType::Set, rng_subseq, std::get<0>(rop_subseq_tuple)));
+
+    ite->thenBody().push_back(IrBuilder::create<LoadStoreOp>(
+        LoadStoreOpType::Set, rng_offset, std::get<0>(rop_offset_tuple)));
+
+    kir::ExprMutator::registerInsertBefore(rop, ite);
+  }
+
+  std::vector<InsertionInfo> insertion_list_;
+};
+
+} // namespace
+
+std::vector<Expr*> addRNG(const std::vector<Expr*>& exprs) {
+  FUSER_PERF_SCOPE("GpuLower::Lower::addRNG");
+  // Check if RNG is even used; if not, we don't have to define or update
+  // anything.
+  const auto gpu_lower = GpuLower::current();
+  auto kernel = gpu_lower->kernel();
+  const bool has_rng = std::any_of(
+      kernel->exprs().begin(), kernel->exprs().end(), [](Expr* expr) {
+        return expr->isA<RNGOp>();
+      });
+
+  if (!has_rng) {
+    return exprs;
+  }
+  auto exprs_ = RNGInserter::insert(exprs);
+  std::cout << "====================" << std::endl;
+  for (auto expr : exprs_) {
+    std::cout << expr->toString() << std::endl;
+  }
+  std::cout << "====================" << std::endl;
+  // NVF_THROW("throw");
+  return exprs_;
+}
+
+} // namespace nvfuser
diff --git a/csrc/device_lower/pass/rng.h b/csrc/device_lower/pass/rng.h
new file mode 100644
index 00000000000..c0cb8c3aef1
--- /dev/null
+++ b/csrc/device_lower/pass/rng.h
@@ -0,0 +1,16 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+
+#include <ir/all_nodes.h>
+#include <kernel_ir.h>
+#include <vector>
+
+namespace nvfuser {
+std::vector<Expr*> addRNG(const std::vector<Expr*>& exprs);
+} // namespace nvfuser
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 6aebcb3c457..eae3c0d186c 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2001,6 +2001,10 @@ class NVF_API NamedScalar : public Val {
         p == ParallelType::BIDz);
   }

+  bool isParallelScalar() const {
+    return isGridDim() || isBlockDim() || isBlockIdx() || isThreadIdx();
+  }
+
   //! Return the named scalar extent of a parallel dimension (e.g. blockDim.x)
   //! WARNING: Only works with Fusion container at the moment
   static NamedScalar* getParallelDim(ParallelType p_type);
diff --git a/csrc/kernel_ir_dispatch.cpp b/csrc/kernel_ir_dispatch.cpp
index 3761cc5faed..03e14bad080 100644
--- a/csrc/kernel_ir_dispatch.cpp
+++ b/csrc/kernel_ir_dispatch.cpp
@@ -91,6 +91,7 @@ void ConstIrVisitor::handle(const IfThenElse* ite) {

 std::vector<Expr*> ExprMutator::mutate(bool reverse_order) {
   if (insertions_.empty() && replacements_.empty() && removal_.empty()) {
+    std::cout << "ExprMutator::Empty" << std::endl;
     return exprs_;
   }

@@ -104,7 +105,7 @@ std::vector<Expr*> ExprMutator::mutate(bool reverse_order) {
     }
     auto pos_it = std::find(exprs_.begin(), exprs_.end(), info.reference);
     NVF_ERROR(
-        pos_it != exprs_.end(),
+        pos_it >= exprs_.begin() && pos_it != exprs_.end(),
         "Issue finding reference expression for insertion.");
     if (info.mode == MutationMode::BEFORE) {
       exprs_.insert(pos_it, info.new_expr);
@@ -132,6 +133,7 @@ std::vector<Expr*> ExprMutator::mutate(bool reverse_order) {
     }
   } else {
     for (auto insertion_info : insertions_) {
+      std::cout << "ExprMutator::run_insertion" << std::endl;
       run_insertion(insertion_info);
     }
   }
@@ -173,6 +175,12 @@ std::vector<Expr*> ExprMutator::mutate(bool reverse_order) {
   insertions_.clear();
   replacements_.clear();

+  std::cout << "------------" << std::endl;
+  for (auto expr : exprs_) {
+    std::cout << expr->toString() << std::endl;
+  }
+  std::cout << "------------" << std::endl;
+
   return exprs_;
 }

@@ -208,6 +216,7 @@ void ExprMutator::registerInsertBefore(
     Expr* reference,
     Expr* new_expr,
     Scope* scope) {
+  std::cout << "Register insert before" << std::endl;
   registerMutation(reference, new_expr, scope, MutationMode::BEFORE);
 }
diff --git a/csrc/type.cpp b/csrc/type.cpp
index c3e07fa1d67..ac748807894 100644
--- a/csrc/type.cpp
+++ b/csrc/type.cpp
@@ -684,6 +684,8 @@ static const char* ternary_op_type2string(TernaryOpType t) {
       return "threshold";
     case TernaryOpType::Where:
       return "where";
+    case TernaryOpType::Philox:
+      return "philox";
     default:
       NVF_THROW("Unexpected TernaryOpType");
   }
diff --git a/csrc/type.h b/csrc/type.h
index 13e84ca1d98..74c4e17fee2 100644
--- a/csrc/type.h
+++ b/csrc/type.h
@@ -697,7 +697,7 @@ bool isIntegerOp(const BinaryOpType bopt);
 // Return if output of operator should be a boolean
 bool isLogicalOp(const BinaryOpType bopt);

-enum class TernaryOpType { Clamp, Lerp, Threshold, Where };
+enum class TernaryOpType { Clamp, Lerp, Threshold, Where, Philox };

 enum class ParallelType {
   DIDx,
diff --git a/runtime/random_numbers.cu b/runtime/random_numbers.cu
index a9fc07b30da..c99fe957574 100644
--- a/runtime/random_numbers.cu
+++ b/runtime/random_numbers.cu
@@ -13,39 +13,39 @@ __device__ unsigned int mulhilo32(
   return a * b;
 }

-__device__ uint4 single_round(uint4 ctr, uint2 key) {
+__device__ Array<unsigned int, 4> single_round(
+    Array<unsigned int, 4> ctr,
+    Array<unsigned int, 2> key) {
   constexpr unsigned long kPhiloxSA = 0xD2511F53;
   constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
   unsigned int hi0;
   unsigned int hi1;
-  unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
-  unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
-  uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
+  unsigned int lo0 = mulhilo32(kPhiloxSA, ctr[0], &hi0);
+  unsigned int lo1 = mulhilo32(kPhiloxSB, ctr[2], &hi1);
+  Array<unsigned int, 4> ret = {
+      hi1 ^ ctr[1] ^ key[0], lo1, hi0 ^ ctr[3] ^ key[1], lo0};
   return ret;
 }

-__device__ uint4 philox(
+__device__ Array<unsigned int, 4> philox(
     unsigned long long seed,
     unsigned long long subsequence,
     unsigned long long offset) {
   constexpr unsigned long kPhilox10A = 0x9E3779B9;
   constexpr unsigned long kPhilox10B = 0xBB67AE85;
-  uint2 key = {};
-  key.x = (unsigned int)seed;
-  key.y = (unsigned int)(seed >> 32);
-  uint4 counter = make_uint4(0, 0, 0, 0);
-  counter.x = (unsigned int)(offset);
-  counter.y = (unsigned int)(offset >> 32);
-  counter.z = (unsigned int)(subsequence);
-  counter.w = (unsigned int)(subsequence >> 32);
-
-  uint4 output = {};
-  uint2 key_ = key;
-  uint4 counter_ = counter;
+  Array<unsigned int, 2> key;
+  key[0] = (unsigned int)seed;
+  key[1] = (unsigned int)(seed >> 32);
+  Array<unsigned int, 4> counter;
+  counter[0] = (unsigned int)(offset);
+  counter[1] = (unsigned int)(offset >> 32);
+  counter[2] = (unsigned int)(subsequence);
+  counter[3] = (unsigned int)(subsequence >> 32);
+
+  Array<unsigned int, 4> output = {};
+  Array<unsigned int, 2> key_ = key;
+  Array<unsigned int, 4> counter_ = counter;
   for (int i = 0; i < 9; i++) {
     counter_ = single_round(counter_, key_);
-    key_.x += (kPhilox10A);
-    key_.y += (kPhilox10B);
+    key_[0] += (kPhilox10A);
+    key_[1] += (kPhilox10B);
   }
   output = single_round(counter_, key_);
   return output;
@@ -85,27 +85,27 @@ __device__ double uniform(unsigned int x, unsigned int y) {
   return result == 1.0 ? 0.0 : result;
 }

-__device__ double rng_uniform(const uint4& rng_result, int rng_component) {
+__device__ double rng_uniform(
+    const Array<unsigned int, 4>& rng_result,
+    int rng_component) {
   return uniform(
-      (&rng_result.x)[rng_component * 2],
-      (&rng_result.x)[rng_component * 2 + 1]);
+      rng_result[rng_component * 2],
+      rng_result[rng_component * 2 + 1]);
 }

-__device__ float rng_uniformf(const uint4& rng_result, int rng_component) {
-  return uniformf((&rng_result.x)[rng_component]);
+__device__ float rng_uniformf(
+    const Array<unsigned int, 4>& rng_result,
+    int rng_component) {
+  return uniformf(rng_result[rng_component]);
 }

-__device__ __half rng_uniform_half(const uint4& rng_result, int rng_component) {
-  return uniform_half((&rng_result.x)[rng_component]);
+__device__ __half rng_uniform_half(
+    const Array<unsigned int, 4>& rng_result,
+    int rng_component) {
+  return uniform_half(rng_result[rng_component]);
 }

 __device__ __bfloat
-rng_uniform_bfloat(const uint4& rng_result, int rng_component) {
-  return uniform_bfloat((&rng_result.x)[rng_component]);
+rng_uniform_bfloat(const Array<unsigned int, 4>& rng_result, int rng_component) {
+  return uniform_bfloat(rng_result[rng_component]);
 }

 __device__ double rng_uniform_range(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component,
     double from,
     double to) {
@@ -115,7 +115,7 @@ __device__ double rng_uniform_range(
 }

 __device__ float rng_uniform_rangef(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component,
     float from,
     float to) {
@@ -125,23 +125,23 @@ __device__ float rng_uniform_rangef(
 }

 __device__ __half rng_uniform_range_half(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component,
     float from,
     float to) {
   auto range = to - from;
-  float uniform01 = raw_uniform_float((&rng_result.x)[rng_component]);
+  float uniform01 = raw_uniform_float(rng_result[rng_component]);
   __half result = __float2half(from + range * uniform01);
   return __heq(result, __float2half(to)) ? __float2half(from) : result;
 }

 __device__ __bfloat rng_uniform_range_bfloat(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component,
     float from,
     float to) {
   auto range = to - from;
-  float uniform01 = raw_uniform_float((&rng_result.x)[rng_component]);
+  float uniform01 = raw_uniform_float(rng_result[rng_component]);
   __bfloat result = __float2bfloat(from + range * uniform01);
   return __heq(result, __float2bfloat(to)) ? __float2bfloat(from) : result;
 }
@@ -174,39 +174,39 @@ __device__ double normal(
 }

 __device__ double rng_normal_standard(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component) {
   return normal(
-      rng_result.x, rng_result.y, rng_result.z, rng_result.w, rng_component);
+      rng_result[0], rng_result[1], rng_result[2], rng_result[3], rng_component);
 }

 __device__ float rng_normal_standardf(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component) {
   return normalf(
-      (&rng_result.x)[rng_component / 2 * 2],
-      (&rng_result.y)[rng_component / 2 * 2],
+      rng_result[rng_component / 2 * 2],
+      rng_result[1 + rng_component / 2 * 2],
       rng_component);
 }

 __device__ __half
-rng_normal_standard_half(const uint4& rng_result, int rng_component) {
+rng_normal_standard_half(const Array<unsigned int, 4>& rng_result, int rng_component) {
   return __float2half(normalf(
-      (&rng_result.x)[rng_component / 2 * 2],
-      (&rng_result.y)[rng_component / 2 * 2],
+      rng_result[rng_component / 2 * 2],
+      rng_result[1 + rng_component / 2 * 2],
       rng_component));
 }

 __device__ __bfloat
-rng_normal_standard_bfloat(const uint4& rng_result, int rng_component) {
+rng_normal_standard_bfloat(const Array<unsigned int, 4>& rng_result, int rng_component) {
   return __float2bfloat(normalf(
-      (&rng_result.x)[rng_component / 2 * 2],
-      (&rng_result.y)[rng_component / 2 * 2],
+      rng_result[rng_component / 2 * 2],
+      rng_result[1 + rng_component / 2 * 2],
       rng_component));
 }

 __device__ double rng_normal_general(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component,
     double mean,
     double std) {
@@ -215,7 +215,7 @@ __device__ double rng_normal_general(
 }

 __device__ float rng_normal_generalf(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component,
     float mean,
     float std) {
@@ -224,25 +224,25 @@ __device__ float rng_normal_generalf(
 }

 __device__ __half rng_normal_general_half(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component,
     float mean,
     float std) {
   auto normal01 = normalf(
-      (&rng_result.x)[rng_component / 2 * 2],
-      (&rng_result.y)[rng_component / 2 * 2],
+      rng_result[rng_component / 2 * 2],
+      rng_result[1 + rng_component / 2 * 2],
       rng_component);
   return __float2half(normal01 * std + mean);
 }

 __device__ __bfloat rng_normal_general_bfloat(
-    const uint4& rng_result,
+    const Array<unsigned int, 4>& rng_result,
     int rng_component,
     float mean,
     float std) {
   auto normal01 = normalf(
-      (&rng_result.x)[rng_component / 2 * 2],
-      (&rng_result.y)[rng_component / 2 * 2],
+      rng_result[rng_component / 2 * 2],
+      rng_result[1 + rng_component / 2 * 2],
       rng_component);
   return __float2bfloat(normal01 * std + mean);
 }
diff --git a/tests/cpp/test_gpu4.cpp b/tests/cpp/test_gpu4.cpp
new file mode 100644
index 00000000000..c3a79862aad
--- /dev/null
+++ b/tests/cpp/test_gpu4.cpp
@@ -0,0 +1,107 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#include <csrc/exceptions.h>
+#include <gmock/gmock-matchers.h>
+#include <gtest/gtest.h>
+
+#include <codegen.h>
+#include <device_lower/lower2device.h>
+#include <disjoint_set.h>
+#include <executor.h>
+#include <executor_params.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <grouped_reduction.h>
+#include <inlining.h>
+#include <ir/all_nodes.h>
+#include <ir/builder.h>
+#include <ir/graphviz.h>
+#include <ir/iostream.h>
+#include <ir/utils.h>
+#include <iter_visitor.h>
+#include <kernel_cache.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <logical_domain_map.h>
+#include <ops/all_ops.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/utils.h>
+#include <tests/cpp/utils.h>
+#include <tests/cpp/validator.h>
+#include <transform_replay.h>
+#include <transform_rfactor.h>
+
+#include <torch/csrc/jit/api/function_impl.h>
+#include <torch/csrc/jit/codegen/cuda/interface.h>
+#include <torch/torch.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include "parallel_dimension_map.h"
+
+namespace nvfuser {
+
+using namespace at::indexing;
+
+TEST_F(NVFuserTest, IntRNG_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto input_tv = makeContigConcreteTensor({4 * 128 * 4});
+  fusion.addInput(input_tv);
+
+  constexpr float kDropoutProbability = 0.9f;
+  constexpr float kScale = 1.0f / kDropoutProbability;
+
+  auto prob = IrBuilder::create<Val>(kDropoutProbability);
+  auto scale = IrBuilder::create<Val>(kScale);
+
+  // dropout start
+  auto rand_vals = rand_like(input_tv);
+  auto mask = lt(rand_vals, prob);
+  auto apply_mask = mul(input_tv, mask);
+  auto output_tv = mul(apply_mask, scale);
+  // dropout end
+  // fusion.addOutput(mask);
+  fusion.addOutput(output_tv);
+
+  auto inp_cache = input_tv->cacheAfter();
+  output_tv->cacheBefore();
+
+  output_tv->split(0, 4);
+  output_tv->split(0, 128);
+  output_tv->axis(0)->parallelize(ParallelType::BIDx);
+
+  TransformPropagator propagator(output_tv);
+  MaxLogicalDomainInfoSpanningTree spanning_tree(output_tv);
+  spanning_tree.traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(output_tv);
+
+  inp_cache->axis(-1)->parallelize(ParallelType::Vectorize);
+  rand_vals->axis(-1)->parallelize(ParallelType::Unroll);
+  output_tv->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  inlineMost();
+
+  fusion.printMath();
+  fusion.printKernel();
+}
+
+} // namespace nvfuser
diff --git a/tests/cpp/test_loop_rotation.cpp b/tests/cpp/test_loop_rotation.cpp
index 4b5122a66fd..d8ae9d49bda 100644
--- a/tests/cpp/test_loop_rotation.cpp
+++ b/tests/cpp/test_loop_rotation.cpp
@@ -41,8 +41,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
   i1 = T0.alloc_stride[0LL] * i0;
   nvfuser_index_t i2;
   i2 = 3LL * i0;
-  float T1[1LL];
-  float T2[1LL];
+  Array<float, 1LL, 1> T1;
+  Array<float, 1LL, 1> T2;
   T1[0LL] = 0LL;
   T1[0LL]
      = T0[i1];
@@ -53,7 +53,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
   for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
     nvfuser_index_t i4;
     i4 = (1LL + i3) + nvfuser_zero;
-    float T3[1LL];
+    Array<float, 1LL, 1> T3;
     T3[0LL]
        = T2[0LL];
     T4[(i2 + (i3 + nvfuser_zero))]
       = T3[0LL];
@@ -101,8 +101,8 @@ TEST_F(LoopRotationTest, RotateOuter) {
   const std::string expected_kernel = R"(
 __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T4) {
   NVFUSER_DEFINE_MAGIC_ZERO;
-  float T1[3LL];
-  float T2[3LL];
+  Array<float, 3LL, 1> T1;
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i0 = 0LL; i0 < 3LL; ++i0) {
     T1[i0] = 0LL;
@@ -202,8 +202,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
   i0 = T0.logical_size[0LL] * T0.logical_size[1LL];
   nvfuser_index_t i1;
   i1 = ceilDiv(i0, 5LL);
-  float T1[5LL];
-  float T2[5LL];
+  Array<float, 5LL, 1> T1;
+  Array<float, 5LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i2 = 0LL; i2 < 5LL; ++i2) {
     T1[i2] = 0LL;
@@ -306,7 +306,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
   NVFUSER_DEFINE_MAGIC_ZERO;
   nvfuser_index_t i0;
   i0 = 4LL * T0.alloc_stride[0LL];
-  float T1[15LL];
+  Array<float, 15LL, 1> T1;
   #pragma unroll 4
   for(nvfuser_index_t i1 = 0LL; i1 < 4LL; ++i1) {
     nvfuser_index_t i2;
@@ -328,7 +328,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[3LL];
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i6 = 0LL; i6 < 3LL; ++i6) {
     T2[i6]
@@ -362,7 +362,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T3[3LL];
+  Array<float, 3LL, 1> T3;
   #pragma unroll
   for(nvfuser_index_t i14 = 0LL; i14 < 3LL; ++i14) {
     T3[i14]
@@ -421,7 +421,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
   i1 = 5LL * T0.alloc_stride[0LL];
   bool b2;
   b2 = 4LL < T0.logical_size[0LL];
-  float T1[15LL];
+  Array<float, 15LL, 1> T1;
   #pragma unroll
   for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
     T1[i3] = 0LL;
@@ -454,7 +454,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[3LL];
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
     T1[(12LL + i3)] = 0LL;
@@ -486,7 +486,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
   i13 = 3LL * ((1LL + i9) % 5LL);
   bool b14;
   b14 = (5LL + i9) < T0.logical_size[0LL];
-  float T3[3LL];
+  Array<float, 3LL, 1> T3;
   #pragma unroll
   for(nvfuser_index_t i15 = 0LL; i15 < 3LL; ++i15) {
     T3[i15]
@@ -599,7 +599,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
   asm volatile("cp.async.wait_group %0;\n"::"n"(3LL));
-  float T1[2LL];
+  Array<float, 2LL, 1> T1;
   T1[0LL]
      = T4[0LL];
   #pragma unroll 4
@@ -637,14 +637,14 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor
   for(nvfuser_index_t i14 = 0LL; i14 < 2LL; ++i14) {
     T1[((1LL + i14) % 2LL)]
        = T4[(i11 + i14)];
-    float T2[1LL];
+    Array<float, 1LL, 1> T2;
     T2[0LL]
        = T1[i14];
     T3[(i12 + (i14 + nvfuser_zero))]
       = T2[0LL];
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[1LL];
+  Array<float, 1LL, 1> T2;
   T2[0LL]
      = T1[0LL];
   T3[(2LL + i12)]
diff --git a/tests/cpp/test_scalar_hoisting.cpp b/tests/cpp/test_scalar_hoisting.cpp
index d0295aa20f3..ae23b3e5593 100644
--- a/tests/cpp/test_scalar_hoisting.cpp
+++ b/tests/cpp/test_scalar_hoisting.cpp
@@ -316,7 +316,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 1, 1> T0, Tensor
   b7 = i0 < i6;
   float f8;
   f8 = (float)(i6);
-  float T1[1LL];
+  Array<float, 1LL, 1> T1;
   if (b7) {
     T1[0LL]
        = sinf(T0[i0]);