From 7086d52e16aef80509ee710c40bfddb14adab90c Mon Sep 17 00:00:00 2001
From: Ryan Spring
Date: Sun, 3 Nov 2024 09:26:26 -0800
Subject: [PATCH 01/27] Add TT, TN, NT, NN tests for
 HopperMultipleMatmulScheduler (#3310)

This PR creates four tests for the `HopperMultipleMatmulScheduler`. Each
test covers a different matmul layout - TT, TN, NT, and NN - where the
input arguments are already broadcasted.

---
 tests/cpp/test_matmul_scheduler.cpp | 300 ++++++++++++++++++++++++++++
 1 file changed, 300 insertions(+)

diff --git a/tests/cpp/test_matmul_scheduler.cpp b/tests/cpp/test_matmul_scheduler.cpp
index e532f32ef57..18256e18892 100644
--- a/tests/cpp/test_matmul_scheduler.cpp
+++ b/tests/cpp/test_matmul_scheduler.cpp
@@ -3075,4 +3075,304 @@ TEST_F(MatmulSchedulerTest, OperandOrderIssue2434) {
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
 
+TEST_F(MatmulSchedulerTest, HSH_TT) {
+  NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0);
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const auto dtype = DataType::Half;
+  constexpr auto layout = MmaLayout::TT;
+
+  auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype); // A [M, K, b]
+  auto tv1 = makeContigConcreteTensor({1, -1, -1}, dtype); // B [b, K, N]
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  auto tv2 = fusedMultiplySum(tv0, tv1, {1});
+
+  // Reorder the accumulator as [M, N, K]
+  // [M, rK, N] -> [M, N, K]
+  tv2->reorder({{-2, -1}, {-1, -2}});
+  tv2->commitLeafToLogical();
+
+  auto tv3 = castOp(DataType::Half, tv2);
+  fusion->addOutput(tv3);
+
+  NVF_CHECK(
+      1 == ir_utils::getOpsOfType<MmaOp>(fusion.get()).size(),
+      "matmul fusion must have at least one MmaOp");
+
+  // Create custom Matmul Params
+  MatMulTileOptions gemm_tile;
+  // TODO cta tile is a multiple of mma macro for hopper.
+  gemm_tile.cta_tile = GemmTile(128, 128, 32);
+
+  // TODO warp tile is (macroM, macroN, macroK) for hopper.
+  gemm_tile.warp_tile = GemmTile(64, 64, 32);
+
+  // TODO instruction tile is not used for hopper.
+  gemm_tile.instruction_tile = GemmTile(16, 8, 16);
+
+  MatmulParams mparams;
+  mparams.supported_vec_size = {8, 8, 4};
+
+  // TODO use hopper macro
+  // mparams.mma_macro = MmaMacro::Hopper_64_256_16;
+  mparams.mma_macro = MmaMacro::Ampere_16_8_16;
+
+  mparams.tile_sizes = gemm_tile;
+  mparams.async_gmem_load_operands = true;
+  mparams.circular_buffer_options.circular_buffer_smem_write = true;
+  mparams.circular_buffer_options.circular_buffer_smem_read = true;
+  mparams.circular_buffer_options.smem_circular_buffer_stage = 4;
+
+  // TODO Create prefetch parameter
+  // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3;
+
+  // Schedule matmul fusion using custom parameters
+  SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul)
+      ->schedule(fusion.get(), &mparams);
+
+  const int M = 32, N = 32, K = 256;
+  auto inputs =
+      matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype));
+
+  //! TODO Disabled because hopper multiple matmul scheduler is currently a copy
+  //! of ampere scheduler.
+ /* + FusionExecutor fe; + fe.compileFusion( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); + */ +} + +TEST_F(MatmulSchedulerTest, HSH_TN) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::TN; + + auto tv0 = makeContigConcreteTensor({-1, 1, -1}, dtype); + auto tv1 = makeContigConcreteTensor({1, -1, -1}, dtype); + fusion->addInput(tv0); + fusion->addInput(tv1); + + // [M, b, K] x [b, N, K] -> [M, N, rK] + auto tv2 = fusedMultiplySum(tv0, tv1, {-1}); + + // [M, N] + auto tv3 = castOp(DataType::Half, tv2); + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. + gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + FusionExecutor fe; + fe.compileFusion( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + + auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); +} + +TEST_F(MatmulSchedulerTest, HSH_NT) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::NT; // [K, M] x [K, N] -> [M, N] + + auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype); + auto tv1 = makeContigConcreteTensor({-1, 1, -1}, dtype); + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = fusedMultiplySum(tv0, tv1, {0}); + + // Reorder the accumulator as [M, N, K] + // [K, M, N] -> [M, N, K] + tv2->reorder({{-3, -1}}); + tv2->commitLeafToLogical(); + + auto tv3 = castOp(DataType::Half, tv2); + + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma 
macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. + gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + FusionExecutor fe; + fe.compileFusion( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + + auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); +} + +TEST_F(MatmulSchedulerTest, HSH_NN) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::NN; + + auto tv0 = makeContigConcreteTensor({1, -1, -1}, dtype); // A [b, K, M] + auto tv1 = makeContigConcreteTensor({-1, -1, 1}, dtype); // B [N, K, 1] + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = fusedMultiplySum(tv0, tv1, {1}); + + // Reorder the accumulator as [M, N, K] + // [N, rK, M] -> [M, N, K] + tv2->reorder({{-1, -3}}); + tv2->commitLeafToLogical(); + + auto tv3 = castOp(DataType::Half, tv2); + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. 
+ gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + // TODO Disabled because hopper multiple matmul scheduler is currently a copy + // of ampere scheduler. + /* + FusionExecutor fe; + fe.compileFusion( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); + */ +} + } // namespace nvfuser From aabe7539f840d20159d2812ddb772de2bd2ea6f2 Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Mon, 4 Nov 2024 12:50:40 -0800 Subject: [PATCH 02/27] removing obsolete code (#2982) Fixes #2910 removes code logic where alias source is propagated across cast. This was necessary in the old TorchScript use case. --- csrc/fusion.cpp | 40 +---------------------------- csrc/fusion.h | 5 ---- nvfuser/contrib/nn/normalization.py | 6 ----- 3 files changed, 1 insertion(+), 50 deletions(-) diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index d9d9cc08003..ab1f44b4571 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -752,27 +752,6 @@ std::vector Fusion::getTerminatingOutputs() const { return terminating_outputs; } -bool Fusion::isAliasCompatible(Val* left, Val* right) { - // Nullptr check - if (left == nullptr || right == nullptr) { - return false; - } - - // DataType check - if (!left->getDataType().has_value() || !right->getDataType().has_value() || - left->getDataType().value() != right->getDataType().value()) { - return false; - } - - // ValType check - if (!left->getValType().has_value() || !right->getValType().has_value() || - left->getValType().value() != right->getValType().value()) { - return false; - } - - return true; -} - void Fusion::aliasOutputToInput( Val* output, Val* input, @@ -791,33 +770,16 @@ void Fusion::aliasOutputToInput( } NVF_ERROR(type == AllocationType::ReuseBuffer); - // `input` can be a cast of a fusion input. 
- if (!input->isFusionInput()) { - auto input_expr = input->definition(); - NVF_ERROR( - input_expr->isA(), "expected unary op for aliased input"); - auto input_uop = input_expr->as(); - NVF_ERROR( - input_uop->getUnaryOpType() == UnaryOpType::Cast, - "expected aliased input to be output of cast op"); - input = input_uop->in(); - } + NVF_ERROR(input->isFusionInput(), "alias source can only be a fusion input"); NVF_ERROR( input->getDataType().has_value() && output->getDataType().has_value(), "requires DataType to be available for aliased output to input"); - if (input->getDataType().value() != output->getDataType().value()) { - output = castOp(input->getDataType().value(), output); - } - if (output->isFusionInput()) { // ensure that codegen produce a write operation on the buffer. output = set(output); } - NVF_ERROR( - isAliasCompatible(input, output), - "The input and output values are not alias-compatible."); // Let integration hide any output that wasn't a fusion output when // `aliasOutputToInput` was called. For example, running mean and var for // batch norm. diff --git a/csrc/fusion.h b/csrc/fusion.h index 7b72aef0414..3bc97d957a4 100644 --- a/csrc/fusion.h +++ b/csrc/fusion.h @@ -464,11 +464,6 @@ class NVF_API Fusion : public IrContainer { all_tvs_ptr_.reset(); } - private: - // Determine if the two values are compatible for aliasing - // Same DataType, ValType, and number of dimensions - bool isAliasCompatible(Val* left, Val* right); - private: // Fusion inputs and outputs std::vector inputs_; diff --git a/nvfuser/contrib/nn/normalization.py b/nvfuser/contrib/nn/normalization.py index 4d05eb538ec..c01faf86cdb 100644 --- a/nvfuser/contrib/nn/normalization.py +++ b/nvfuser/contrib/nn/normalization.py @@ -401,12 +401,6 @@ def forward( tv_running_mean = partially_contig_tensor(fd, running_mean) tv_running_var = partially_contig_tensor(fd, running_var) inputs.extend([running_mean, running_var]) - if running_mean.dtype in [torch.half, torch.bfloat16]: - tv_running_mean = fd.ops.cast( - tv_running_mean, nvfuser.DataType.Float - ) - if running_var.dtype in [torch.half, torch.bfloat16]: - tv_running_var = fd.ops.cast(tv_running_var, nvfuser.DataType.Float) s_momentum = fd.define_scalar(nvfuser.DataType.Double) s_eps = fd.define_scalar(nvfuser.DataType.Double) From df888ac915a3d8a37be80e1d60d50ccb4e671faa Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Mon, 4 Nov 2024 13:37:47 -0800 Subject: [PATCH 03/27] Use IterDomain::split and IterDomain::merge (#3327) Instead of manually creating output IDs and directly constructing Split and Merge nodes, IterDomain::split and IterDomain::merge are used. This would reduce duplication of logic. There's some slight change in iteration types of IDs generated when `TensorView::rFactor` is used. See the inline comment. As of [8321cfd](https://github.com/NVIDIA/Fuser/pull/3327/commits/8321cfdd27ac15e9464f551e0c2ea0e5159357db), everything seems fine. 
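For illustration, a replay can now delegate to these helpers instead of
constructing `Split`/`Merge` nodes by hand. A minimal sketch based on the
new signatures in this patch (variable names are made up; every optional
argument may be left as `std::nullopt` to keep the default behavior):

```cpp
// Replay a split while explicitly overriding the outputs' properties.
auto [outer_id, inner_id] = IterDomain::split(
    in_id,
    split_factor,
    /*inner_split=*/true,
    /*rfactor_domain=*/true,
    /*outer_iter_type=*/IterType::Iteration,
    /*inner_iter_type=*/IterType::Reduction);

// Merge with the default inference of rfactor-ness and IterType.
IterDomain* merged_id = IterDomain::merge(outer_id, inner_id);
```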
Closes #2520 --- csrc/ir/internal_base_nodes.h | 13 ++++-- csrc/ir/nodes.cpp | 70 +++++++++++++++++++--------- csrc/transform_replay.cpp | 44 ++++++------------ csrc/transform_rfactor.cpp | 86 +++++++++++++++++------------------ tests/cpp/test_gpu3.cpp | 28 ++++++++++++ 5 files changed, 144 insertions(+), 97 deletions(-) diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h index 6ac52ba0564..f9f422cd994 100644 --- a/csrc/ir/internal_base_nodes.h +++ b/csrc/ir/internal_base_nodes.h @@ -120,18 +120,23 @@ class NVF_API IterDomain : public Val { static std::vector clone( const std::vector& domains); - //! When `rfactor_domain` is true, also set the `is_rfactor_domain_` flag of - //! the result IterDomain. + //! The optional parameters of rfactor_domain and iter_type can be + //! used to override the default behavior. static IterDomain* merge( IterDomain* outer, IterDomain* inner, - bool rfactor_domain = false); + std::optional rfactor_domain = std::nullopt, + std::optional iter_type = std::nullopt); + //! The optional parameters of rfactor_domain, outer_iter_type and + //! inner_iter_type can be used to override the default behavior. static std::pair split( IterDomain* in, Val* factor, bool inner_split, - bool rfactor_domain = false); + std::optional rfactor_domain = std::nullopt, + std::optional outer_iter_type = std::nullopt, + std::optional inner_iter_type = std::nullopt); //! Resize an IterDomain by expanding both the left and right sides //! by given widths. The resulting IterDomain has an extent of diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index ca39d51684a..d647f66b566 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -2550,7 +2550,8 @@ IterDomain* IterDomain::cloneWithoutRFactor(bool map_with_original) { IterDomain* IterDomain::merge( IterDomain* outer, IterDomain* inner, - bool rfactor_domain) { + std::optional rfactor_domain, + std::optional iter_type) { NVF_CHECK( outer->isReduction() == inner->isReduction(), "Merging IterDomains requires that their iteration types match. ", @@ -2563,24 +2564,33 @@ IterDomain* IterDomain::merge( !outer->isStride() && !inner->isStride(), "No support for merging stride domains"); + // By default, if not specified, don't create rfactor + // outputs. 
Reshape transformations should propagate the flag, which + // should explicitly specify the flag + if (!rfactor_domain.has_value()) { + rfactor_domain = false; + } + Val* merged_id_size = mul(outer->extent(), inner->extent()); - IterType itype = outer->getIterType(); + if (!iter_type.has_value()) { + iter_type = outer->getIterType(); - if (outer->isBroadcast() && inner->isBroadcast()) { - itype = IterType::Broadcast; - } + if (outer->isBroadcast() && inner->isBroadcast()) { + iter_type = IterType::Broadcast; + } - if ((outer->isBroadcast() || inner->isBroadcast()) && - (outer->getIterType() == IterType::Iteration || - inner->getIterType() == IterType::Iteration)) { - itype = IterType::Iteration; - } + if ((outer->isBroadcast() || inner->isBroadcast()) && + (outer->getIterType() == IterType::Iteration || + inner->getIterType() == IterType::Iteration)) { + iter_type = IterType::Iteration; + } - if ((outer->isBroadcast() || inner->isBroadcast()) && - (outer->getIterType() == IterType::GatherScatter || - inner->getIterType() == IterType::GatherScatter)) { - itype = IterType::GatherScatter; + if ((outer->isBroadcast() || inner->isBroadcast()) && + (outer->getIterType() == IterType::GatherScatter || + inner->getIterType() == IterType::GatherScatter)) { + iter_type = IterType::GatherScatter; + } } Val* expanded_extent = nullptr; @@ -2606,8 +2616,8 @@ IterDomain* IterDomain::merge( IterDomainBuilder(outer->container()->zeroVal(), merged_id_size) .parallel_type(outer->getParallelType()) .expanded_extent(expanded_extent) - .iter_type(itype) - .is_rfactor_domain(rfactor_domain) + .iter_type(*iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); IrBuilder::createInContainer( @@ -2620,7 +2630,9 @@ std::pair IterDomain::split( IterDomain* in, Val* factor, bool inner_split, - bool rfactor_domain) { + std::optional rfactor_domain, + std::optional outer_iter_type, + std::optional inner_iter_type) { NVF_CHECK( factor->isIntegralScalar(), "Cannot split by non-integer value ", factor); @@ -2631,6 +2643,22 @@ std::pair IterDomain::split( expanded_remainder = ceilDiv(in->expandedExtent(), factor); } + // By default, if not specified, don't create rfactor + // outputs. Reshape transformations should propagate the flag, which + // should explicitly specify the flag + if (!rfactor_domain.has_value()) { + rfactor_domain = false; + } + + // If not specified, inherit these properties from the input iter domain + if (!outer_iter_type.has_value()) { + outer_iter_type = in->getIterType(); + } + + if (!inner_iter_type.has_value()) { + inner_iter_type = in->getIterType(); + } + // outer loop IterDomain IterDomain* ido = IterDomainBuilder( @@ -2639,8 +2667,8 @@ std::pair IterDomain::split( in->hasExpandedExtent() && inner_split ? expanded_remainder : nullptr) .parallel_type(in->getParallelType()) - .iter_type(in->getIterType()) - .is_rfactor_domain(rfactor_domain) + .iter_type(*outer_iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); // inner loop IterDomain @@ -2651,8 +2679,8 @@ std::pair IterDomain::split( in->hasExpandedExtent() && !inner_split ? 
expanded_remainder : nullptr) .parallel_type(in->getParallelType()) - .iter_type(in->getIterType()) - .is_rfactor_domain(rfactor_domain) + .iter_type(*inner_iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); IrBuilder::createInContainer( diff --git a/csrc/transform_replay.cpp b/csrc/transform_replay.cpp index 093715f92a8..06e15929aa9 100644 --- a/csrc/transform_replay.cpp +++ b/csrc/transform_replay.cpp @@ -52,25 +52,20 @@ class ReplaySelf : public ReplayTransformations { loop_ids_.find(mapped) != loop_ids_.end(), "Transform traversal failed, modified a node but it was not a loop node."); - // outer loop size - Val* remainder = ceilDiv(mapped->extent(), s->factor()); - - // Manually replay the split, following the output of the operations. - // This is so rfactor ops are replayed correctly. - IterDomain* ido = IterDomainBuilder(s->outer()) - .start(s->container()->zeroVal()) - .extent(s->innerSplit() ? remainder : s->factor()) - .build(); - - // inner IterDomain - IterDomain* idi = IterDomainBuilder(s->inner()) - .start(s->container()->zeroVal()) - .extent(s->innerSplit() ? s->factor() : remainder) - .build(); - - // Generate the split node - IrBuilder::createInContainer( - s->container(), ido, idi, mapped, s->factor(), s->innerSplit()); + NVF_ERROR(s->outer()->isRFactorProduct() == s->inner()->isRFactorProduct()); + + // Due to rfactor transformations, the iter types of the outputs + // may not follow the default rule. For example, even if the input + // is a reduction iter domain, the outputs may not. To replay the + // original split expression, the output iter types need to be + // specified explicitly. + auto [ido, idi] = IterDomain::split( + mapped, + s->factor(), + s->innerSplit(), + s->outer()->isRFactorProduct(), + s->outer()->getIterType(), + s->inner()->getIterType()); // Remove mapped id from loop IDs loop_ids_.erase(mapped); @@ -107,16 +102,7 @@ class ReplaySelf : public ReplayTransformations { id_inner_mapped, " however one or both are not loop nodes."); - Val* merged_id_size = - mul(id_outer_mapped->extent(), id_inner_mapped->extent()); - - IterDomain* merged_id = IterDomainBuilder(m->out()) - .start(m->container()->zeroVal()) - .extent(merged_id_size) - .build(); - - IrBuilder::createInContainer( - m->container(), merged_id, id_outer_mapped, id_inner_mapped); + IterDomain* merged_id = IterDomain::merge(id_outer_mapped, id_inner_mapped); // Remove inputs from the loop IDs loop_ids_.erase(id_outer_mapped); diff --git a/csrc/transform_rfactor.cpp b/csrc/transform_rfactor.cpp index 311bec23796..07799487eb0 100644 --- a/csrc/transform_rfactor.cpp +++ b/csrc/transform_rfactor.cpp @@ -108,9 +108,6 @@ class ReplayRFactor : public ReplayTransformations { loop_ids_.find(mapped) != loop_ids_.end(), "Transform traversal failed, modified a node but it was not a loop node."); - // outer loop size - Val* remainder = ceilDiv(mapped->extent(), s->factor()); - // Check if we need to mark the outputs as an logical domain meaning this // transformation must be present in replays otherwise it breaks the compute // definition of the fusion. Iter domains are actually not static, its the @@ -119,32 +116,27 @@ class ReplayRFactor : public ReplayTransformations { bool static_logical_outputs = static_logical_ids_.count(s->outer()) || static_logical_ids_.count(s->inner()); - // Manually replay the split, making reduction = false and rfactor = true - // outer IterDomain - IterDomain* ido = - IterDomainBuilder( - s->container()->zeroVal(), - s->innerSplit() ? 
remainder : s->factor())
-            .iter_type(
-                rfactor_axes_.count(s->outer()) ? IterType::Reduction
-                                                : IterType::Iteration)
-            .is_rfactor_domain(static_logical_outputs)
-            .build();
+    // Let IterDomain::split determine the correct IterType, except
+    // when the output is a reduction domain but not part of the
+    // rfactored domains. If it isn't involved in the rfactor, it's no
+    // longer a reduction domain
+    std::optional<IterType> outer_iter_type;
+    if (s->outer()->isReduction() && !rfactor_dep_ids_.count(s->outer())) {
+      outer_iter_type = IterType::Iteration;
+    }
 
-    // inner IterDomain
-    IterDomain* idi =
-        IterDomainBuilder(
-            s->container()->zeroVal(),
-            s->innerSplit() ? s->factor() : remainder)
-            .iter_type(
-                rfactor_axes_.count(s->inner()) ? IterType::Reduction
-                                                : IterType::Iteration)
-            .is_rfactor_domain(static_logical_outputs)
-            .build();
+    std::optional<IterType> inner_iter_type;
+    if (s->inner()->isReduction() && !rfactor_dep_ids_.count(s->inner())) {
+      inner_iter_type = IterType::Iteration;
+    }
 
-    // Generate the split node
-    IrBuilder::createInContainer<Split>(
-        s->container(), ido, idi, mapped, s->factor(), s->innerSplit());
+    auto [ido, idi] = IterDomain::split(
+        mapped,
+        s->factor(),
+        s->innerSplit(),
+        static_logical_outputs,
+        outer_iter_type,
+        inner_iter_type);
 
     // Remove mapped id from loop IDs
     loop_ids_.erase(mapped);
@@ -182,23 +174,20 @@ class ReplayRFactor : public ReplayTransformations {
         id_inner_mapped,
         " however one or both are not loop nodes.");
 
-    Val* merged_id_size =
-        mul(id_outer_mapped->extent(), id_inner_mapped->extent());
-
-    bool is_bcast =
-        id_outer_mapped->isBroadcast() && id_inner_mapped->isBroadcast();
-    auto iter_type = rfactor_axes_.count(m->out())
-        ? IterType::Reduction
-        : (is_bcast ? IterType::Broadcast : IterType::Iteration);
-
-    IterDomain* merged_id =
-        IterDomainBuilder(m->container()->zeroVal(), merged_id_size)
-            .iter_type(iter_type)
-            .is_rfactor_domain(static_logical_ids_.count(m->out()))
-            .build();
+    // Let IterDomain::merge determine the correct IterType, except
+    // when the output is a reduction domain but not part of the
+    // rfactored domains. If it isn't involved in the rfactor, it's no
+    // longer a reduction domain
+    std::optional<IterType> iter_type;
+    if (m->out()->isReduction() && !rfactor_dep_ids_.count(m->out())) {
+      iter_type = IterType::Iteration;
+    }
 
-    IrBuilder::createInContainer<Merge>(
-        m->container(), merged_id, id_outer_mapped, id_inner_mapped);
+    IterDomain* merged_id = IterDomain::merge(
+        id_outer_mapped,
+        id_inner_mapped,
+        static_logical_ids_.count(m->out()),
+        iter_type);
 
     // Remove inputs from the loop IDs
     loop_ids_.erase(id_outer_mapped);
@@ -236,6 +225,9 @@ class ReplayRFactor : public ReplayTransformations {
   // The IterDomains in the original_domain that are being factored into the
   // first stage of the two stage reduction (the producer).
   std::unordered_set<IterDomain*> rfactor_axes_;
+  // All iter domains between the logical and the loop that the
+  // rfactor_axes_ depend on
+  std::unordered_set<IterDomain*> rfactor_dep_ids_;
   // Iter domains whose history cannot be changed as it would break rfactor
   // dependencies.
  std::unordered_set<IterDomain*> static_logical_ids_;
@@ -262,6 +254,14 @@ class ReplayRFactor : public ReplayTransformations {
         rfactor_axes_(std::move(rfactor_axes)),
         static_logical_ids_(std::move(static_logical_ids)),
         logical_domain_(original_domain->logical()) {
+    const auto all_dep_vals = DependencyCheck::getAllValsBetween(
+        {original_domain->maybeRoot().begin(),
+         original_domain->maybeRoot().end()},
+        {rfactor_axes_.begin(), rfactor_axes_.end()});
+
+    auto all_dep_ids = ir_utils::filterByType<IterDomain>(all_dep_vals);
+    rfactor_dep_ids_.insert(all_dep_ids.begin(), all_dep_ids.end());
+
     setErrorOnFailure(false);
   }
 };
diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
index 9862dcb8b07..cd8a96e9ba9 100644
--- a/tests/cpp/test_gpu3.cpp
+++ b/tests/cpp/test_gpu3.cpp
@@ -8874,6 +8874,34 @@ TEST_F(NVFuserTest, CpAsyncDataTypeBool) {
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
+
+// Intermediate IDs generated by rFactor should also remain
+// reductions. See #3327 for more info.
+TEST_F(NVFuserTest, RfactorIntermediateIDs) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(3);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1, 2});
+  fusion.addOutput(tv1);
+
+  tv1->merge(1, 2);
+  tv1->split(1, 4);
+
+  auto tv2 = tv1->rFactor({-1});
+
+  EXPECT_TRUE(tv2->axis(-1)->isReduction());
+  EXPECT_FALSE(tv2->axis(-2)->isReduction());
+
+  auto split = dynamic_cast<Split*>(tv2->axis(-1)->definition());
+  ASSERT_NE(split, nullptr);
+
+  auto merge_out = split->in();
+  EXPECT_TRUE(merge_out->isReduction());
+}
+
 // Test file size should be up to 10K LoC. Create a new file for more tests.
 
 } // namespace nvfuser

From 43f8147f55fce0fbc320fa2345e58d1c039faa5a Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 4 Nov 2024 16:44:06 -0800
Subject: [PATCH 04/27] Make prepareInputs private. (#3340)

---
 csrc/runtime/fusion_executor_cache.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/csrc/runtime/fusion_executor_cache.h b/csrc/runtime/fusion_executor_cache.h
index 6ec0435bbd0..57cf6ce9c15 100644
--- a/csrc/runtime/fusion_executor_cache.h
+++ b/csrc/runtime/fusion_executor_cache.h
@@ -140,12 +140,6 @@ class FusionExecutorCache {
       std::optional<PrimDataType> forced_index_type = std::nullopt,
       std::optional<int8_t> selected_device = std::nullopt);
 
-  //! Converts inputs from IValue to KernelArgumentHolder, also handles cache
-  //! lookup
-  KernelArgumentHolder prepareInputs(
-      const at::ArrayRef<c10::IValue>& inputs,
-      std::optional<int8_t> selected_device = std::nullopt);
-
   //! query if there's a kernel ready to go for given inputs
   NVF_API bool isCompiled(
       const at::ArrayRef<c10::IValue>& inputs,
@@ -241,6 +235,12 @@ class FusionExecutorCache {
   void deserialize(const serde::FusionExecutorCache* buffer, int64_t fusion_id);
 
  private:
+  //! Converts inputs from IValue to KernelArgumentHolder, also handles cache
+  //! lookup
+  KernelArgumentHolder prepareInputs(
+      const at::ArrayRef<c10::IValue>& inputs,
+      std::optional<int8_t> selected_device = std::nullopt);
+
   //! evict cached short cut entry in `code_to_fe_lookup_` as well as cached
   //! entry in `FusionExecutor`
   void evictCache(size_t cache_id);

From 162a13b0851d8b6ac8e8332e76668e20db8eac79 Mon Sep 17 00:00:00 2001
From: Ryan Spring
Date: Mon, 4 Nov 2024 17:47:06 -0800
Subject: [PATCH 05/27] Fix autotune_pointwise.py script (#3339)

Fix the `autotune_pointwise` script which was broken by
https://github.com/NVIDIA/Fuser/pull/3275. The earlier PR changed the
pointwise setting from `unroll_factor` to `unroll_factor_inner`.
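In other words, the script just has to assign the renamed field. A
before/after sketch of the one-line fix (`schedule_params` is the
pointwise parameter object the script already holds):

```python
# Broken since #3275: the pointwise params no longer expose this field.
schedule_params.unroll_factor = unroll_factor
# Fixed: use the renamed inner unroll factor.
schedule_params.unroll_factor_inner = unroll_factor
```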
--- doc/dev/python_scheduling/autotune_pointwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/dev/python_scheduling/autotune_pointwise.py b/doc/dev/python_scheduling/autotune_pointwise.py index 5034ebfd5c6..014ae8197a2 100644 --- a/doc/dev/python_scheduling/autotune_pointwise.py +++ b/doc/dev/python_scheduling/autotune_pointwise.py @@ -89,7 +89,7 @@ def inner_fn(): if config is not None: vectorization_factor, unroll_factor = config schedule_params.vectorization_factor = vectorization_factor - schedule_params.unroll_factor = unroll_factor + schedule_params.unroll_factor_inner = unroll_factor # Schedule fusion fd.sched.schedule() From 2a2e562323621cfe565e23b37de04768e699298e Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 5 Nov 2024 08:21:19 -0800 Subject: [PATCH 06/27] Avoid replacing a Val with a dependent Val (#3343) It would result in a recursive definition. --- csrc/ir/utils.cpp | 34 ++++++++++++++++++++++++++++++++++ csrc/ir/utils.h | 5 +++++ csrc/mutator.cpp | 12 ++++++++++++ tests/cpp/test_gpu3.cpp | 26 ++++++++++++++++++++++++++ 4 files changed, 77 insertions(+) diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp index 91a1170bf38..ccde32b5378 100644 --- a/csrc/ir/utils.cpp +++ b/csrc/ir/utils.cpp @@ -1204,6 +1204,40 @@ bool isFunctional(const Val* v) { return std::all_of(def->inputs().begin(), def->inputs().end(), isFunctional); } +bool isRecursivelyDefined(Val* val) { + NVF_ERROR(val != nullptr); + + std::deque vals_to_visit; + vals_to_visit.push_back(val); + + std::unordered_set visited_vals; + + while (!vals_to_visit.empty()) { + auto v = vals_to_visit.front(); + vals_to_visit.pop_front(); + + visited_vals.insert(v); + + auto v_def = v->definition(); + if (v_def == nullptr) { + continue; + } + + for (const auto inp : v_def->inputs()) { + if (inp == val) { + // Recursive dependency detected + return true; + } + // Don't visit the same multiple times + if (!visited_vals.count(inp)) { + vals_to_visit.push_back(inp); + } + } + } + + return false; +} + } // namespace nvfuser::ir_utils namespace nvfuser::MmaOpUtils { diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index b02fb2fbe3e..ad5a7279bc4 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -728,4 +728,9 @@ std::string nullOrToInlineString(const Statement* stmt); //! always returns the same result when called with the same inputs. 
bool isFunctional(const Val* v); +// Check if the given val is recursively defined, which is invalid in +// the Fusion IR but may not be necessarily the case in other IRs +// such as the Kernel IR +bool isRecursivelyDefined(Val* val); + } // namespace nvfuser::ir_utils diff --git a/csrc/mutator.cpp b/csrc/mutator.cpp index 87f44c797fe..5f183bf4839 100644 --- a/csrc/mutator.cpp +++ b/csrc/mutator.cpp @@ -77,6 +77,18 @@ void OptOutMutator::registerMutation(Val* val, Val* mutation) { ", ", mutation->dtype(), ")"); + + NVF_ERROR( + !DependencyCheck::isDependencyOf(val, mutation), + "Attempted to replace a val, ", + val->toString(), + ", with a dependent val, ", + mutation->toString(), + " (", + mutation->toInlineString(), + "), which is not allowed as it would result in a recursive definition of ", + mutation->toString()); + mutations_[val] = mutation; } diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index cd8a96e9ba9..faf18593549 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -8902,6 +8902,32 @@ TEST_F(NVFuserTest, RfactorIntermediateIDs) { EXPECT_TRUE(merge_out->isReduction()); } +// Simple test to make sure replacement with a dependent val is +// detected as an error +TEST_F(NVFuserTest, AvoidReplacingWithDependentVal) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto i0 = IrBuilder::create(DataType::Int); + fusion.addInput(i0); + + auto i1 = mul(i0, IrBuilder::create(1, DataType::Int)); + + auto tv0 = TensorViewBuilder().shape({i1}).build(); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + std::unordered_map replacement_map; + replacement_map.emplace(i0, i1); + + EXPECT_THAT( + [&]() { ir_utils::replaceValue(&fusion, replacement_map); }, + testing::ThrowsMessage(testing::HasSubstr( + "not allowed as it would result in a recursive definition"))); +} + // Test file size should be up to 10K LoC. Create a new file for more tests. } // namespace nvfuser From 62bd3b569dd6a6709faf0de3dbd2c91ce0dec16e Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Tue, 5 Nov 2024 08:51:55 -0800 Subject: [PATCH 07/27] Lowering vectorized pad (#3261) Added support for lowering TernaryOp:where with vectorization factor. i.e. ``` predicate ? loadGlobalToLocal<...>(&dst[0], &src[i_src]) : dst.set(0.0f) ``` Currently this can only be done via manual scheduling. 
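For example, a padded output can be given a vectorized innermost loop as
follows (condensed from the `VectorizePadLowering` test added in this
patch):

```cpp
// Pad by 4 on both sides, then expose a x4 vectorized dimension.
auto tv1 = pad(tv0, {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L)});
fusion.addOutput(tv1);

tv1->split(0, 4);
tv1->split(0, 128);
tv1->axis(0)->parallelize(ParallelType::BIDx);
tv1->axis(1)->parallelize(ParallelType::TIDx);
tv1->axis(2)->parallelize(ParallelType::Vectorize);
```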
The follow up PR on vectorization analysis will make this automatically applied in PR #3321 --- csrc/codegen.cpp | 162 +++++++++++++++++++-------- csrc/device_lower/lower2device.h | 4 - csrc/device_lower/pass/predicate.cpp | 3 +- csrc/device_lower/validation.cpp | 38 +++++-- tests/cpp/test_resize.cpp | 70 ++++++++++++ 5 files changed, 215 insertions(+), 62 deletions(-) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 4cec43b2c92..727894caebb 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -402,6 +402,55 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } } + void generateVectorizedLdSt( + Val* in, + Val* out, + CacheOp cache_op, + int64_t vector_word_size) { + auto out_tv = out->as()->view(); + auto in_tv = in->as()->view(); + + bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local; + + bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local && + in_tv->getMemoryType() == MemoryType::Global; + + bool globalToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Global; + + bool is_volatile_to = out_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map->needsRawSync(out_tv).hasBID(); + + bool is_volatile_from = in_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map->needsRawSync(in_tv).hasBID(); + + if (localToGlobal) { + code_ << "loadLocalToGlobal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile=*/" + << (is_volatile_to ? "true" : "false") << ">("; + code_ << " &" << gen(out) << ", &" << gen(in) << ")"; + } else if (globalToLocal) { + code_ << "loadGlobalToLocal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile=*/" + << (is_volatile_from ? "true" : "false") << ", " + << "CacheOp::" << cache_op << ">(&" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } else if (globalToGlobal) { + code_ << "loadGlobalToGlobal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile_to=*/" + << (is_volatile_to ? "true" : "false") << ", /*is_volatile_from=*/" + << (is_volatile_from ? "true" : "false") << ">("; + code_ << " &" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } else { + code_ << "loadGeneric<" << out->dtype() << ", " << vector_word_size + << ">("; + code_ << " &" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } + } + // Cannot just use ConstIrVisitor::handle as it expects a vector of // const Expr*, whereas most of the IR API returns a vector of // non-const Expr*. @@ -1001,6 +1050,68 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } void handle(const TernaryOp* top) final { + // Note: vectorized TernaryOp looks something like: + // ``` + // predicate + // ? LoadGlobalToLocal(&dst[0], &in2[index]) + // : arraySet(&dst[0], in3); + // ``` + // + // Current limitation: + // 1. only TernaryOpType::Where is supported; + // 2. predicate needs to be a scalar; + // 3. output needs to be a TensorView; + // 4. one and only one of the inputs needs to be a TensorView. (This is + // coming from validation analysis.) 
+ if (top->out()->isA()) { + // Get vectorization information + auto out_tv = top->out()->as()->view(); + int64_t vector_word_size = ir_utils::getVectorizeSize(out_tv); + bool is_vector_op = vectorize_scope_ && vector_word_size != 1; + + if (is_vector_op) { + NVF_CHECK( + top->in1()->isScalar(), + "predicate should be a scalar for vectorized TernaryOp::where"); + NVF_CHECK( + !top->out()->isScalar(), + "scalar output in vectorization isn't supported"); + NVF_CHECK( + top->getTernaryOpType() == TernaryOpType::Where, + "vectorization only works on TernaryOp::where"); + indent() << gen(top->in1()) << "\n"; + indent() << kTab << "? "; + auto vec_load = [&out_tv, &top, &vector_word_size, this](Val* in) { + if (in->isScalar()) { + if (out_tv->getMemoryType() == MemoryType::Local && + !out_tv->isCircularBuffered()) { + // Vectorized initialization, explicit type conversion is needed + // for complex numbers + code_ << genVariableName(out_tv) << ".set(" + << genCall(out_tv->dtype(), gen(in)) << ")"; + } else { + // Note: currently arraySet option is not vectorized, so it will + // rely on auto vectorization pass of cuda compiler. + code_ << "arraySet<" << out_tv->getDataType().value() << ", " + << vector_word_size << ">(&" << gen(top->out()) << ", (" + << out_tv->getDataType().value() << ")" << gen(in) << ")"; + } + } else { + generateVectorizedLdSt( + in, top->out(), CacheOp::AllLevels, vector_word_size); + } + }; + + // TODO: should we have the option to specify cache level? + vec_load(top->in2()); + code_ << "\n"; + indent() << kTab << ": "; + vec_load(top->in3()); + code_ << ";\n"; + return; + } + } + if (!print_inline_) { indent() << gen(top->out()); if (!top->out()->isScalar()) { @@ -1338,53 +1449,10 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { "Invalid input to unary op with tensor output, found: ", ldst->in()->toString()); - auto in_tv = ldst->in()->as()->view(); - bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Local; - - bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local && - in_tv->getMemoryType() == MemoryType::Global; - - bool globalToGlobal = out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Global; - - bool is_volatile_to = out_tv->getMemoryType() == MemoryType::Global && - kernel_->summary().sync_map->needsRawSync(out_tv).hasBID(); - - bool is_volatile_from = - in_tv->getMemoryType() == MemoryType::Global && - kernel_->summary().sync_map->needsRawSync(in_tv).hasBID(); - - if (localToGlobal) { - indent() << "loadLocalToGlobal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile=*/" - << (is_volatile_to ? "true" : "false") << ">("; - code_ << " &" << gen(ldst->out()) << ", &" << gen(ldst->in()) - << ");\n"; - } else if (globalToLocal) { - indent() << "loadGlobalToLocal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile=*/" - << (is_volatile_from ? "true" : "false") << ", " - << "CacheOp::" << ldst->cacheOp() << ">(&" - << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } else if (globalToGlobal) { - indent() << "loadGlobalToGlobal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile_to=*/" - << (is_volatile_to ? "true" : "false") - << ", /*is_volatile_from=*/" - << (is_volatile_from ? 
"true" : "false") << ">("; - code_ << " &" << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } else { - indent() << "loadGeneric<" << ldst->out()->dtype() << ", " - << vector_word_size << ">("; - code_ << " &" << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } + indent(); + generateVectorizedLdSt( + ldst->in(), ldst->out(), ldst->cacheOp(), vector_word_size); + code_ << ";\n"; } return; } diff --git a/csrc/device_lower/lower2device.h b/csrc/device_lower/lower2device.h index 38f914b92ab..16d2177ca33 100644 --- a/csrc/device_lower/lower2device.h +++ b/csrc/device_lower/lower2device.h @@ -45,10 +45,6 @@ namespace nvfuser { -// TODO: we frequently use pairwise root mapping from consumers to producers. -// This information is implicitly in the computeAtMaps, but there's no isolated -// container for this information that we can reuse. Would be nice to generate -// such a structure and propagate it through lowering. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) class GpuLower : public NonCopyable { class KernelIrMapper; diff --git a/csrc/device_lower/pass/predicate.cpp b/csrc/device_lower/pass/predicate.cpp index 034534be7be..18e632d9e9e 100644 --- a/csrc/device_lower/pass/predicate.cpp +++ b/csrc/device_lower/pass/predicate.cpp @@ -103,7 +103,8 @@ class ConditionalFromPredicateModifier : public kir::ExprMutator { "Expecting predicated body to only have one vectorized expression."); auto vec_expr = ite->thenBody()[0]; NVF_ERROR( - vec_expr->isA() || vec_expr->isA(), + vec_expr->isA() || vec_expr->isA() || + vec_expr->isA(), "Vectorize predicate exprs only supported on set operations."); NVF_ERROR( ir_utils::isTvOp(vec_expr), diff --git a/csrc/device_lower/validation.cpp b/csrc/device_lower/validation.cpp index ef10cdb6bc1..cc1b2dec53a 100644 --- a/csrc/device_lower/validation.cpp +++ b/csrc/device_lower/validation.cpp @@ -668,17 +668,31 @@ class VectorizeValidator : public OptInDispatch { tv_def != nullptr, "Tv has no definition, cannot validate vectorization:", tv); - auto producer_tv = tv_def->inputs().at(0)->as(); - auto producer_word_size_it = - GpuLower::current()->vectorizedAccesses().find(producer_tv); - if (producer_word_size_it != - GpuLower::current()->vectorizedAccesses().end()) { - producer_word_size_it->second = - std::max(vector_word_size, producer_word_size_it->second); - } else { - GpuLower::current()->vectorizedAccesses().emplace( - producer_tv, vector_word_size); + // TernaryOp(where) is a could have multiple inputs. But we only support + // single TensorView input for vectorization. 
+ TensorView* producer_tv = nullptr; + for (auto input : tv_def->inputs()) { + if (!input->isA()) { + continue; + } + NVF_ERROR( + producer_tv == nullptr, + "Vectorization validation only support op with a single TensorView input"); + producer_tv = input->as(); + auto producer_word_size_it = + GpuLower::current()->vectorizedAccesses().find(producer_tv); + if (producer_word_size_it != + GpuLower::current()->vectorizedAccesses().end()) { + producer_word_size_it->second = + std::max(vector_word_size, producer_word_size_it->second); + } else { + GpuLower::current()->vectorizedAccesses().emplace( + producer_tv, vector_word_size); + } } + NVF_ERROR( + producer_tv != nullptr, + "Vectorization validation requires a TensorView input"); VectorizedSetInfo vectorized_set_info; vectorized_set_info.consumer_tv = tv; @@ -798,6 +812,10 @@ void validateAndCollectVectorizeInfo(Fusion* fusion) { Expr* def = tv->definition(); NVF_ERROR( def == nullptr || def->isA() || def->isA() || + def->isA() || + (def->isA() && + def->as()->getTernaryOpType() == + TernaryOpType::Where) || (def->isA() && def->as()->serialGridReductionRequested()), "Vectorized accesses cannot be inline with computation: ", diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index cc7fc96d8cd..e794559216e 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -4041,4 +4041,74 @@ TEST_F(ResizeTest, SliceSliceConcatConcat) { NVF_CHECK(ref.equal(cg_outputs[0])); } +// manual scheduling that should have vectorized load on padded inputs. +TEST_F(ResizeTest, VectorizePadLowering) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + const std::vector shape({1024L * 1024L}); + + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + auto tv1 = pad(tv0, {IrBuilder::create(4L), IrBuilder::create(4L)}); + fusion.addOutput(tv1); + + tv1->split(0, 4); + tv1->split(0, 128); + + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::TIDx); + tv1->axis(2)->parallelize(ParallelType::Vectorize); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); + auto cg_outputs = fe.runFusion(aten_inputs); + + auto ref = at::pad(t0, {4, 4}); + ASSERT_TRUE(ref.equal(cg_outputs[0])); +} + +// manual scheduling that should have vectorized load. +TEST_F(ResizeTest, VectorizeWhereLowering) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + const std::vector shape({1024L * 1024L}); + + // Note: nvfuser currently only supports vectorization with a single + // TensorView input. 
+ auto s0 = IrBuilder::create(DataType::Bool); + fusion.addInput(s0); + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + auto tv1 = where(s0, IrBuilder::create(2.0), tv0); + fusion.addOutput(tv1); + + tv1->split(0, 4); + tv1->split(0, 128); + + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::TIDx); + tv1->axis(2)->parallelize(ParallelType::Vectorize); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({at::Scalar(false), t0}); + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); + auto cg_outputs = fe.runFusion(aten_inputs); + + // Note: we cannot use at::where, because aten only support tensor as + // predicate. + ASSERT_TRUE(t0.equal(cg_outputs[0])); +} + } // namespace nvfuser From 6b3ee4fe2ab46c3d03cbfe3e9ed767065492161b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:05:26 -0500 Subject: [PATCH 08/27] Remove MatmulParams::rotate_ldmatrix_out_of_main_loop (#3337) I can't find any commit in which this option was ever actually used. This is the commit where the option was originally introduced: https://github.com/csarofeen/pytorch/pull/2488/files#diff-e7a5a84a2cfeddeb15669f07105bdb3722a796600ea9e1f2eb25afb29283457eR22 We've gone this long without the ability to disable loop rotation, so either we should change the condition in the schedulers to respect it, or just remove it. --- csrc/python_frontend/python_bindings.cpp | 1 - csrc/scheduler/matmul_heuristic.h | 12 ++---------- csrc/scheduler/matmul_heuristic_plugin.cpp | 4 ---- csrc/scheduler/matmul_heuristic_plugin_api.h | 1 - 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/csrc/python_frontend/python_bindings.cpp b/csrc/python_frontend/python_bindings.cpp index bf092240b06..f17ea228ad0 100644 --- a/csrc/python_frontend/python_bindings.cpp +++ b/csrc/python_frontend/python_bindings.cpp @@ -651,7 +651,6 @@ void defineHeuristicParamBindings(py::module& nvfuser) { .PARAM(MatmulParams, circular_buffer_options) .PARAM(MatmulParams, supported_vec_size) .PARAM(MatmulParams, async_gmem_load_operands) - .PARAM(MatmulParams, rotate_ldmatrix_out_of_main_loop) .PARAM(MatmulParams, grid_swizzle_factor) .PARAM(MatmulParams, use_smem_epilogue) .PARAM(MatmulParams, promote_prologue_smem_reuse) diff --git a/csrc/scheduler/matmul_heuristic.h b/csrc/scheduler/matmul_heuristic.h index b97d8515011..ae7e7ff476e 100644 --- a/csrc/scheduler/matmul_heuristic.h +++ b/csrc/scheduler/matmul_heuristic.h @@ -138,9 +138,6 @@ class MatmulParams : public HeuristicParams { } } supported_vec_size; - //! Whether to rotate the ldmatrix out of the main loop - bool rotate_ldmatrix_out_of_main_loop = true; - //! (Ampere+) Use cp.async to load operands. bool async_gmem_load_operands = false; @@ -191,8 +188,6 @@ class MatmulParams : public HeuristicParams { << circular_buffer_options.toString() << "\n" << supported_vec_size.toString() << "\n" << nvfuser::toString(tile_sizes) << "\n" - << "Rotate ldmatrix out of main loop: " - << (rotate_ldmatrix_out_of_main_loop ? "true" : "false") << "\n" << "Async global mem load: " << (async_gmem_load_operands ? 
"true" : "false") << "\n" << "Indexing mode: " @@ -216,9 +211,8 @@ class MatmulParams : public HeuristicParams { size_t hash() const override { // combine boolean flags for hashing - size_t attr_hash = (static_cast(promote_prologue_smem_reuse) << 3) | - (static_cast(use_smem_epilogue) << 2) | - (static_cast(rotate_ldmatrix_out_of_main_loop) << 1) | + size_t attr_hash = (static_cast(promote_prologue_smem_reuse) << 2) | + (static_cast(use_smem_epilogue) << 1) | (static_cast(async_gmem_load_operands)); // combined hash @@ -240,8 +234,6 @@ class MatmulParams : public HeuristicParams { return other->cparams == cparams && other->mma_macro == mma_macro && other->async_gmem_load_operands == async_gmem_load_operands && - other->rotate_ldmatrix_out_of_main_loop == - rotate_ldmatrix_out_of_main_loop && other->tile_sizes == tile_sizes && other->circular_buffer_options == circular_buffer_options && other->supported_vec_size == supported_vec_size && diff --git a/csrc/scheduler/matmul_heuristic_plugin.cpp b/csrc/scheduler/matmul_heuristic_plugin.cpp index c1b7acf00c4..658ad2b07f7 100644 --- a/csrc/scheduler/matmul_heuristic_plugin.cpp +++ b/csrc/scheduler/matmul_heuristic_plugin.cpp @@ -146,8 +146,6 @@ void copyParamsToConfig(KernelConfig* config, const MatmulParams* mparams) { : 1; config->circular_buffer_smem_read = mparams->circular_buffer_options.circular_buffer_smem_read; - config->rotate_ldmatrix_out_of_main_loop = - mparams->rotate_ldmatrix_out_of_main_loop; config->problem.supported_vec_size.a = (uint8_t)mparams->supported_vec_size.a; config->problem.supported_vec_size.b = (uint8_t)mparams->supported_vec_size.b; config->problem.supported_vec_size.epilogue = @@ -190,8 +188,6 @@ void copyConfigToParams(MatmulParams* mparams, const KernelConfig* config) { } mparams->circular_buffer_options.circular_buffer_smem_read = config->circular_buffer_smem_read; - mparams->rotate_ldmatrix_out_of_main_loop = - config->rotate_ldmatrix_out_of_main_loop; // enable circular buffering if configured mparams->circular_buffer_options.circular_buffer_smem_write = diff --git a/csrc/scheduler/matmul_heuristic_plugin_api.h b/csrc/scheduler/matmul_heuristic_plugin_api.h index 65094cb9d8c..224705530e5 100644 --- a/csrc/scheduler/matmul_heuristic_plugin_api.h +++ b/csrc/scheduler/matmul_heuristic_plugin_api.h @@ -77,7 +77,6 @@ struct KernelConfig { uint8_t grid_swizzle_factor = 0; uint8_t cta_order = 0; bool circular_buffer_smem_read = true; - bool rotate_ldmatrix_out_of_main_loop = true; bool async_gmem_load_operands = true; public: From ea9c9135dc15fb118e660a9d28480c2942f89859 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 5 Nov 2024 10:51:14 -0800 Subject: [PATCH 09/27] Remove use of RECORD_FUNCTION. (#3348) Seems to be legacy code that nobody cares about. 
--- csrc/runtime/fusion_executor_cache.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/csrc/runtime/fusion_executor_cache.cpp b/csrc/runtime/fusion_executor_cache.cpp index a3b8de148e2..24830ba9bd1 100644 --- a/csrc/runtime/fusion_executor_cache.cpp +++ b/csrc/runtime/fusion_executor_cache.cpp @@ -33,9 +33,6 @@ #include #include -#include -#include - namespace nvfuser { FusionExecutorCache::FusionExecutorCache( @@ -81,15 +78,7 @@ std::vector FusionExecutorCache::runFusionWithInputs( " failed"); } - int seq_id = 0; - // Record kernel input and output tensors so profiler can construct - // the data flow graph - RECORD_FUNCTION( - "run_fused_kernel", - std::vector(inputs.begin(), inputs.end()), - seq_id); auto outputs = kernel_runtime->runWithInputs(args); - RECORD_OUTPUTS(outputs); // Kernel time measurement is off by default kernel_runtime->disableKernelTimeMeasurement(); From 86a6a802005d6d722ae84c6f44de69ebe4ca736e Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 5 Nov 2024 16:19:45 -0800 Subject: [PATCH 10/27] replaceSymbolicSizes (#3346) can't repro but this seems to be the right thing to do Update: added a standalone repro --- csrc/device_lower/pass/replace_size.cpp | 13 ++++--- tests/cpp/test_gpu3.cpp | 45 +++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/csrc/device_lower/pass/replace_size.cpp b/csrc/device_lower/pass/replace_size.cpp index 1e1bc8b9738..6758fd75415 100644 --- a/csrc/device_lower/pass/replace_size.cpp +++ b/csrc/device_lower/pass/replace_size.cpp @@ -70,17 +70,20 @@ std::unordered_map getSimplificationMap(Fusion* fusion) { NVF_ERROR( id != nullptr, "Expected only IterDomains in exact graph ValGroups"); bool is_input_id = fusion_input_ids.count(id) > 0; - if (rep == nullptr) { - rep = id; - rep_is_input_id = is_input_id; - continue; - } Val* ext = id->extent(); bool ext_is_const = ext->isConstInt(); if (!ext_is_const) { dynamic_scalars.insert(ext); } + // Initializing rep with the first ID + if (rep == nullptr) { + rep = id; + rep_is_input_id = is_input_id; + group_is_const = ext_is_const; + continue; + } + if (ext_is_const) { if (!group_is_const || id->name() < rep->name()) { rep = id; diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index faf18593549..fe9e73847c0 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -8928,6 +8928,51 @@ TEST_F(NVFuserTest, AvoidReplacingWithDependentVal) { "not allowed as it would result in a recursive definition"))); } +// Repro of issue #3347 +TEST_F(NVFuserTest, ReplaceSymbolicSizesRepro3347) { + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + auto tv0 = makeSymbolicTensor(3); + fusion.addInput(tv0); + auto tv1 = makeSymbolicTensor(2); + fusion.addInput(tv1); + auto i0 = IrBuilder::create(DataType::Index); + fusion.addInput(i0); + + auto tv2 = reshape(tv0, {i0}); + auto tv3 = reshape(tv1, {i0}); + auto tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + ExpressionEvaluator expr_eval; + + expr_eval.bind(tv0->axis(0)->extent(), 2L); + expr_eval.bind(tv0->axis(1)->extent(), 4L); + expr_eval.bind(tv0->axis(2)->extent(), 8L); + expr_eval.bind(tv1->axis(0)->extent(), 8L); + expr_eval.bind(tv1->axis(1)->extent(), 8L); + expr_eval.bind(i0, 64L); + + auto initial_info = DynamicTransform::getInitialInfo(&fusion); + auto info = DynamicTransformConcretizationInfo(&initial_info, &expr_eval); + + DynamicTransform::concretizeFusion(&fusion, &info); + + replaceSymbolicSizes(&fusion); + + // All 
expr output tensors should use the same extent. + auto ref_ext = fusion.outputs().at(0)->as()->axis(0)->extent(); + for (auto expr : fusion.exprs()) { + auto tv_output = ir_utils::getTvOutput(expr); + ASSERT_EQ(tv_output->nDims(), 1); + auto ext = tv_output->axis(0)->extent(); + EXPECT_EQ(ref_ext, ext) << "Reference: " << ref_ext->toString() + << ", actual: " << ext->toString(); + } +} + // Test file size should be up to 10K LoC. Create a new file for more tests. } // namespace nvfuser From 69124356c055e4fc8fc5f0b767b4412a539dc00d Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 5 Nov 2024 16:35:08 -0800 Subject: [PATCH 11/27] Move allocOutputSpace to multidevice/executor.cpp (#3352) --- csrc/multidevice/executor.cpp | 20 +++++++++++++++++++- csrc/runtime/allocations.cpp | 14 -------------- csrc/runtime/allocations.h | 8 -------- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp index 6de05e48d76..6546e4555cd 100644 --- a/csrc/multidevice/executor.cpp +++ b/csrc/multidevice/executor.cpp @@ -6,10 +6,12 @@ */ // clang-format on #include + #include #include #include #include +#include #include #include #include @@ -52,6 +54,22 @@ std::unique_ptr copyFusionAndChangeOutputs( return fusion_copy; } +// Used in distributed setting where we only want to allocate output space and +// receive output data from a different rank instead of computing them. +std::vector allocateOutputSpace( + const at::ArrayRef& inputs, + Fusion* fusion, + const c10::Device& device) { + FUSER_PERF_SCOPE("multidevice::executor::allocateOutputSpace"); + auto fusion_inputs = KernelArgumentHolder::createKernelArgumentHolder(inputs); + auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion); + + auto output_info = + getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs()); + + return allocateOutputs(fusion, output_info, device, expr_eval); +} + } // namespace MultiDeviceExecutor::MultiDeviceExecutor( @@ -186,7 +204,7 @@ std::vector MultiDeviceExecutor::runWithInput( } auto allocations = - allocOutputSpace(inputs, allocator_fusion_.get(), comm()->device()); + allocateOutputSpace(inputs, allocator_fusion_.get(), comm()->device()); NVF_ERROR(vals_to_allocate_.size() == allocations.size()); for (auto i : c10::irange(allocations.size())) { val_to_IValue[vals_to_allocate_.at(i)] = allocations.at(i); diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index f482bf6dfb4..bc7bb5cceaf 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -366,20 +366,6 @@ std::vector allocateOutputs( return out_tensors; } -std::vector allocOutputSpace( - const at::ArrayRef& inputs, - Fusion* fusion, - const c10::Device& device) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocOutputSpace"); - auto fusion_inputs = KernelArgumentHolder::createKernelArgumentHolder(inputs); - auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion); - - auto output_info = - getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs()); - - return allocateOutputs(fusion, output_info, device, expr_eval); -} - namespace { GlobalBufferInfo getBufferInfo( ExpressionEvaluator& expr_eval, diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 294013f4e1a..1ec77eb3ce2 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -56,14 +56,6 @@ NVF_API void setFillAllocationWithNan(bool value); void fillTensorWithNan(at::Tensor& t); -//! 
From 967c2824cfdf967901706d7aee1f57a9ab91f261 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Wed, 6 Nov 2024 08:01:52 -0800
Subject: [PATCH 12/27] Prefer simpler vals in replace sizes (#3344)

Noticed while working on #3309 that `i0` is replaced with `ceilDiv(i0, 1)`.
While that isn't incorrect, the generated code would look simpler if
`ceilDiv(i0, 1)` were replaced with `i0` instead.

This PR only changes which iter domains are used as representatives when
replacing extents. In addition to the existing priority rules, the iter
domain with the simplest extent is preferred as the representative ID of a
given ID group. The simplicity of an extent is defined as the number of
expressions that define the extent val, so, for example, an iter domain with
extent `i0` is used as the representative ID instead of one with extent
`ceilDiv(i0, 1)`.

There should be no logic change.
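As an illustration of the metric, a standalone sketch that mirrors the new
`ir_utils::getOperationCount` on a toy expression tree (hypothetical
`Node`/`countOps`, not nvFuser code; leaves such as `i0` have no definition):

```
#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

struct Node {
  std::vector<std::shared_ptr<Node>> inputs; // empty => leaf, no definition
};

// Count the expressions defining a value: each defined node contributes one
// operation plus whatever defines its inputs (the real getOperationCount
// performs the same traversal iteratively with a deque).
int64_t countOps(const std::shared_ptr<Node>& v) {
  if (v->inputs.empty()) {
    return 0; // a leaf like `i0` has no defining expression
  }
  int64_t n = 1;
  for (const auto& inp : v->inputs) {
    n += countOps(inp);
  }
  return n;
}

int main() {
  auto i0 = std::make_shared<Node>();
  auto one = std::make_shared<Node>();
  auto ceil_div = std::make_shared<Node>(); // models ceilDiv(i0, 1)
  ceil_div->inputs = {i0, one};
  assert(countOps(i0) == 0); // i0 is simpler than ceilDiv(i0, 1) ...
  assert(countOps(ceil_div) == 1); // ... so i0 becomes the representative
  return 0;
}
```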
---
 csrc/device_lower/pass/replace_size.cpp | 15 +++++++++++---
 csrc/ir/utils.cpp                       | 26 +++++++++++++++++++++++++
 csrc/ir/utils.h                         |  4 ++++
 tests/cpp/test_gpu3.cpp                 | 17 ++++++++++++++--
 4 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/csrc/device_lower/pass/replace_size.cpp b/csrc/device_lower/pass/replace_size.cpp
index 6758fd75415..1ace266794c 100644
--- a/csrc/device_lower/pass/replace_size.cpp
+++ b/csrc/device_lower/pass/replace_size.cpp
@@ -59,11 +59,14 @@ std::unordered_map<Val*, Val*> getSimplificationMap(Fusion* fusion) {
     // 1. Constant ints. These might be non-immediate constants
     // 2. Extents of input TVs.
     // 3. Extents of non-input TVs.
-    // Within these three classes, we find the IterDomain with the smallest
-    // name().
+    // Within these three classes, we find the IterDomain with the
+    // smallest name(). For case 3, we also prefer the IterDomain with
+    // the simplest extent, which has the smallest number of defining
+    // expressions.
     bool group_is_const = false;
     IterDomain* rep = nullptr;
     bool rep_is_input_id = false;
+    int64_t rep_num_defs = 0;
     std::unordered_set<Val*> dynamic_scalars;
     for (Val* v : *group) {
       auto* id = dynamic_cast<IterDomain*>(v);
@@ -81,6 +84,10 @@ std::unordered_map<Val*, Val*> getSimplificationMap(Fusion* fusion) {
         rep = id;
         rep_is_input_id = is_input_id;
         group_is_const = ext_is_const;
+        // If neither const nor input, record the number of exprs
+        if (!ext_is_const && !is_input_id) {
+          rep_num_defs = ir_utils::getOperationCount(id->extent());
+        }
         continue;
       }

@@ -106,9 +113,11 @@ std::unordered_map<Val*, Val*> getSimplificationMap(Fusion* fusion) {
       if (group_is_const || rep_is_input_id) {
         continue;
       }
-      if (id->name() < rep->name()) {
+      auto num_defs = ir_utils::getOperationCount(id->extent());
+      if (num_defs < rep_num_defs || id->name() < rep->name()) {
         rep = id;
         rep_is_input_id = is_input_id;
+        rep_num_defs = num_defs;
         continue;
       }
     }

diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp
index ccde32b5378..868ba36144d 100644
--- a/csrc/ir/utils.cpp
+++ b/csrc/ir/utils.cpp
@@ -1238,6 +1238,32 @@ bool isRecursivelyDefined(Val* val) {
   return false;
 }

+int64_t getOperationCount(Val* val) {
+  int64_t num_ops = 0;
+
+  // Start with the given val and recursively count the number of ops
+  // by traversing inputs
+  std::deque<Val*> vals;
+  vals.push_back(val);
+
+  while (!vals.empty()) {
+    auto v = vals.front();
+    vals.pop_front();
+
+    auto def = v->definition();
+    if (def == nullptr) {
+      continue;
+    }
+    ++num_ops;
+
+    for (auto inp : def->inputs()) {
+      vals.push_back(inp);
+    }
+  }
+
+  return num_ops;
+}
+
 } // namespace nvfuser::ir_utils

 namespace nvfuser::MmaOpUtils {

diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h
index ad5a7279bc4..60062b0e440 100644
--- a/csrc/ir/utils.h
+++ b/csrc/ir/utils.h
@@ -733,4 +733,8 @@ bool isFunctional(const Val* v);
 // such as the Kernel IR
 bool isRecursivelyDefined(Val* val);

+// Return the number of operations that are used to define val. One
+// instance of Expr is counted as a single operation.
+int64_t getOperationCount(Val* val);
+
 } // namespace nvfuser::ir_utils

diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
index fe9e73847c0..c9e7b7fdb3a 100644
--- a/tests/cpp/test_gpu3.cpp
+++ b/tests/cpp/test_gpu3.cpp
@@ -8928,8 +8928,8 @@ TEST_F(NVFuserTest, AvoidReplacingWithDependentVal) {
       "not allowed as it would result in a recursive definition")));
 }

-// Repro of issue #3347
-TEST_F(NVFuserTest, ReplaceSymbolicSizesRepro3347) {
+// Was also a repro of issue #3347
+TEST_F(NVFuserTest, ReplaceSymbolicSizesPreferSimplerExtents) {
   auto fusion_ptr = std::make_unique<Fusion>();
   Fusion& fusion = *fusion_ptr;
   FusionGuard fg(fusion_ptr.get());
@@ -8962,8 +8962,21 @@

   replaceSymbolicSizes(&fusion);

+  // All expr output tensors should use the extent of tv3 since it
+  // has only one merge, whereas tv2 has two merges
   // All expr output tensors should use the same extent.
auto ref_ext = fusion.outputs().at(0)->as()->axis(0)->extent(); + + // ref_ext should look like getMetaData(T1).logical_size[0] * + // getMetaData(T1).logical_size[1] + auto ext_def = dynamic_cast(ref_ext->definition()); + ASSERT_NE(ext_def, nullptr); + ASSERT_EQ(ext_def->getBinaryOpType(), BinaryOpType::Mul); + auto lhs = ext_def->input(0); + auto rhs = ext_def->input(1); + ASSERT_NE(dynamic_cast(lhs->definition()), nullptr); + ASSERT_NE(dynamic_cast(rhs->definition()), nullptr); + for (auto expr : fusion.exprs()) { auto tv_output = ir_utils::getTvOutput(expr); ASSERT_EQ(tv_output->nDims(), 1); From 9cc07d7342b701aab1b02263582da7c7e237cf26 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Wed, 6 Nov 2024 11:18:11 -0800 Subject: [PATCH 13/27] Clean uses of unique_ptr. (#3326) --- tests/cpp/test_allocation_domain.cpp | 140 +++++++++++---------------- 1 file changed, 59 insertions(+), 81 deletions(-) diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index 167374c3799..42e1c48df8b 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -29,8 +29,7 @@ using ::testing::ElementsAre; // A global->shared->global copy kernel, shared memory allocated transposed to // avoid bank conflict. TEST_F(AllocationDomainTest, TransposedIntermediate) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigConcreteTensor({32, 32}); @@ -59,7 +58,7 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { at::Tensor t0 = at::randn({32, 32}, options); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -67,8 +66,7 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 4d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -97,7 +95,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { at::Tensor t0 = at::randn({n, c, h, w}, options); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); @@ -109,8 +107,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 1d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -136,7 +133,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { at::Tensor t0 = at::randn({n, c, h, w}, options); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); @@ -148,8 +145,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 2d allocation domain in output. 
TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -176,7 +172,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { at::Tensor t0 = at::randn({n, c, h, w}, options); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); @@ -188,8 +184,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { // Reshape and transpose a 3d tensor into an NHWC tensor with a 3d allocation // domain in fusion output. TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -223,7 +218,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { at::Tensor t0 = at::randn({n1, n2, h * w * c}, options); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); @@ -242,8 +237,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { // output. The allocation domain is on both the producer and the consumer side // of the rFactor domain. TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -283,7 +277,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { at::Tensor t0 = at::randn({n1, n2, c * h * w}, options); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); @@ -301,8 +295,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { // A global->global copy kernel where both inputs and outputs are NHWC memory // format TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -339,7 +332,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -356,8 +349,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view the input as a 1d tensor. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -398,7 +390,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -415,8 +407,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain of the output view the output as a 1d tensor. 
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -454,7 +445,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -471,8 +462,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view both the input and the output as a 1d tensors. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -515,7 +505,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -533,8 +523,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { // allocation domain view the input as a 2d tensor of shape [N*H/8, 8*W*C], and // view the output as a 2d tensor of shape [N*H*W*C/4, 4] TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -583,7 +572,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -599,8 +588,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { // Similar to NHWC4d_To_NHWC4d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -648,7 +636,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -664,8 +652,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { // Similar to NHWC2d_To_NHWC2d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -725,7 +712,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -741,8 +728,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { // Similar to NHWC4d_To_NHWC4d, but does a cacheAfter TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = 
makeContigTensor(4); @@ -790,7 +776,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -808,8 +794,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { // allocation tensor to be between rFactor domain and loop domain, which is not // the case for NHWC2d_To_NHWC2d TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -861,7 +846,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -877,8 +862,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { // Similar to NHWC4d_To_NHWC4d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -933,7 +917,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -949,8 +933,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { // Similar to NHWC2d_To_NHWC2d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -1023,7 +1006,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { fe.runFusion({t0_wrong_format}); }, @@ -1038,30 +1021,29 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { } TEST_F(AllocationDomainTest, VectorizationIssue902) { - auto fusion_ptr = std::make_unique(); - auto& fusion = *fusion_ptr; - FusionGuard fg(fusion_ptr.get()); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); const std::vector shape({16, 16, 512, 64}); auto tv0 = makeContigTensor(4); - fusion.addInput(tv0); + fusion->addInput(tv0); auto tv1 = set(tv0); - fusion.addOutput(tv1); + fusion->addOutput(tv1); - std::vector aloc_domain; - aloc_domain.push_back(tv1->axis(0)); - aloc_domain.push_back(tv1->axis(2)); - aloc_domain.push_back(tv1->axis(3)); - aloc_domain.push_back(tv1->axis(1)); - tv1->setAllocationDomain(aloc_domain, true); + std::vector alloc_domain; + alloc_domain.push_back(tv1->axis(0)); + alloc_domain.push_back(tv1->axis(2)); + alloc_domain.push_back(tv1->axis(3)); + alloc_domain.push_back(tv1->axis(1)); + tv1->setAllocationDomain(alloc_domain, true); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache 
executor_cache(std::move(fusion)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); @@ -1101,9 +1083,8 @@ TEST_F(AllocationDomainTest, TransposeMatrix) { } TEST_F(AllocationDomainTest, ContiguityIssue1021) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1119,17 +1100,16 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache fec(std::move(fusion)); auto outputs = fec.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(fec.fusion(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForBroadcast) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1145,17 +1125,16 @@ TEST_F(AllocationDomainTest, ContiguityForBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({1, 1}, options).as_strided({1, 1}, {0, 3}); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache fec(std::move(fusion)); auto outputs = fec.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(fec.fusion(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(3) @@ -1172,11 +1151,11 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4, 8}, options).as_strided({3, 8, 4}, {0, 1, 8}); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache fec(std::move(fusion)); auto outputs = fec.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(fec.fusion(), outputs, {t0}, __LINE__, __FILE__); } // Test that allocation domain can be used to vectorize overlapping tensors, @@ -1189,8 +1168,7 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { // automatically supports all kinds of use cases, even those that we don't have // an active plan to support on). 
TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(3); @@ -1226,7 +1204,7 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1}); FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); From 7074f87691737c55cd968ddbe12a495b65f4d42d Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Wed, 6 Nov 2024 12:10:50 -0800 Subject: [PATCH 14/27] Fix typecast error in SelectOpRecord (#3358) The `index` argument does not have to be a `TensorView` for the `select` operation. Reference: `NVF_API TensorView* select(TensorView* tv, int64_t dim, Val* index);` --- csrc/python_frontend/fusion_record.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/python_frontend/fusion_record.h b/csrc/python_frontend/fusion_record.h index 154f8d28805..8d9f77c80bf 100644 --- a/csrc/python_frontend/fusion_record.h +++ b/csrc/python_frontend/fusion_record.h @@ -1823,7 +1823,7 @@ struct SelectOpRecord : RecordFunctor { void operator()(FusionState& fd) final { auto arg1 = fd.getFusionState(args_.at(0).index)->template as(); - auto arg3 = fd.getFusionState(args_.at(1).index)->template as(); + auto arg3 = fd.getFusionState(args_.at(1).index); Val* output = select(arg1, dim_, arg3); fd.setFusionState(outputs_.at(0).index, output); From e4ec3aa5fde89cb261d386f52314c0dc5c2cfa81 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Wed, 6 Nov 2024 12:38:36 -0800 Subject: [PATCH 15/27] Rename transformOutputFromAllocationToLogical (#3341) --- csrc/runtime/allocations.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index bc7bb5cceaf..29fa52461e6 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -671,12 +671,11 @@ class BackwardTraverseFromAllocToLogical { // Another example, if the logical domain is [I1*I2] and the allocation domain // is [I1, I2], then we will allocate as [I1, I2] and do a tensor.view(I1*I2) to // get a tensor whose semantics is [I1*I2] but memory is [I1,I2] -at::Tensor transformOutputFromAllocationToLogical( +at::Tensor transformFromAllocationToLogical( at::Tensor tensor, TensorView* tv, ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE( - "fusion_executor::allocations::transformOutputFromAllocationToLogical"); + FUSER_PERF_SCOPE("allocations::transformFromAllocationToLogical"); // Ignore reductions because reductions does not exist in tensor's definition auto logical = TensorDomain::noReductions(tv->getLogicalDomain()); auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain()); @@ -751,9 +750,8 @@ std::pair, std::vector> inferShapeOfOutput( at::empty_strided(size_stride.first, size_stride.second, options); // TODO(jiej): we should refactor it here, there's no need to use // meta_tensor at all, size + stride should be used directly in the - // `transformOutputFromAllocationToLogical` - meta_tensor = - transformOutputFromAllocationToLogical(meta_tensor, tv, expr_eval); + // `transformFromAllocationToLogical` + meta_tensor = transformFromAllocationToLogical(meta_tensor, tv, expr_eval); return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } From 1836ed0755c31d1f3d4903b10b00aacc55f20101 Mon Sep 17 
00:00:00 2001 From: Naoya Maruyama Date: Wed, 6 Nov 2024 13:21:19 -0800 Subject: [PATCH 16/27] LoadStoreOp can have scalar inputs, so type check is required (#3359) Type error was detected with #3263 while I was testing it with a Debug build. ``` pytest -v tests/python/test_python_frontend.py -k test_pad_dynamic ``` It has a fusion of: ``` Inputs: T0_g_float[ bS0{1}, iS1{i1}, iS2{i2} ] Outputs: T1_g_float[ bS11{1}, iS12{( ( i1 + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) ) + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) )}, iS13{( ( i2 + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) ) + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) )} ] %kernel_math { f7 = (float)(7); f9 = float(2.5) * f7; i11 = (int64_t)(f9); i14 = (nvfuser_index_t)(i11); i16 = (nvfuser_index_t)(i11); i18 = (nvfuser_index_t)(i11); i20 = (nvfuser_index_t)(i11); T2_l_float[ bS3{1}, iS5{( ( i1 + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) ) + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) )}rf, iS7{( ( i2 + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) ) + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) )}rf ] = pad( T0_g_float[ bS0{1}, iS1{i1}, iS2{i2} ], {0, 0, i14, i16, i18, i20} ) T1_g_float[ bS11{1}, iS12{( ( i1 + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) ) + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) )}, iS13{( ( i2 + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) ) + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) )} ] = Set( T2_l_float[ bS3{1}, iS5{( ( i1 + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) ) + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) )}rf, iS7{( ( i2 + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) ) + ( (nvfuser_index_t)(( (int64_t)(( float(2.5) * ( (float)(7) ) )) )) ) )}rf ], cache_op=Streaming ) } // %kernel_math ``` Stack trace: ``` #0 __cxxabiv1::__cxa_throw (obj=0xabc6ea0, tinfo=0x7ffeba81c248 , dest=0x7ffeba059370 ) at ../../../../libstdc++-v3/libsupc++/eh_throw.cc:80 #1 0x00007ffeba058665 in nvfuser::nvfCheckFail (func=0x7ffeb9927c33 "as", file=0x7ffeb99d9a1b "/raid/nmaruyama/debug1/csrc/utils.h", line=119, msg=0x7ffeb99c36e6 " INTERNAL ASSERT FAILED at \"/raid/nmaruyama/debug1/csrc/utils.h\":119, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. ") at /raid/nmaruyama/debug1/csrc/exceptions.cpp:283 #2 0x00007ffeb9c1be4b in nvfuser::nvfErrorFail (func=0x7ffeb9927c33 "as", file=0x7ffeb99d9a1b "/raid/nmaruyama/debug1/csrc/utils.h", line=119, condMsg=0x7ffeb99c36e6 " INTERNAL ASSERT FAILED at \"/raid/nmaruyama/debug1/csrc/utils.h\":119, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. 
") at /raid/nmaruyama/debug1/csrc/exceptions.h:229 #3 0x00007ffeb9c1bbe4 in nvfuser::PolymorphicBase::as (this=0xac07490) at /raid/nmaruyama/debug1/csrc/utils.h:119 #4 0x00007ffeba54c67f in nvfuser::(anonymous namespace)::isLoadGlobalToLocal (expr=0xabd7c50) at /raid/nmaruyama/debug1/csrc/scheduler/cache_policy_refiner.cpp:61 #5 0x00007ffeba54c599 in nvfuser::refineCachePolicy (fusion=0xabef940) at /raid/nmaruyama/debug1/csrc/scheduler/cache_policy_refiner.cpp:153 ``` --- csrc/scheduler/cache_policy_refiner.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/csrc/scheduler/cache_policy_refiner.cpp b/csrc/scheduler/cache_policy_refiner.cpp index 7e6eab7eb18..2159c93c969 100644 --- a/csrc/scheduler/cache_policy_refiner.cpp +++ b/csrc/scheduler/cache_policy_refiner.cpp @@ -58,6 +58,12 @@ bool isLoadGlobalToLocal(const Expr* expr) { if (ldst->opType() != LoadStoreOpType::Set) { return false; } + // It should not be necessary to check the output since it should be + // always a TensorView as long as the input is a TensorView, but + // just in case. + if (!ldst->in()->isA() || !ldst->out()->isA()) { + return false; + } if (ldst->in()->as()->getMemoryType() != MemoryType::Global) { return false; } From ba4f7d4cbab7f6b6544e54026134a27d3cdb8a02 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 6 Nov 2024 17:19:23 -0800 Subject: [PATCH 17/27] Rename FusionExector to KernelExecutor, fe to ke, fec to executor_cache (#3349) This is just mechanical name change only. Intended to simplify #3263. --- benchmarks/cpp/batch_norm_channels_first.cpp | 4 +- .../batch_norm_channels_first_backward.cpp | 4 +- benchmarks/cpp/batch_norm_channels_last.cpp | 4 +- .../cpp/batch_norm_channels_last_backward.cpp | 4 +- benchmarks/cpp/bert.cpp | 24 +- benchmarks/cpp/broadcast.cpp | 4 +- benchmarks/cpp/gelu_backward.cpp | 24 +- benchmarks/cpp/gelu_backward_reduction.cpp | 4 +- benchmarks/cpp/heuristic_cache.cpp | 24 +- benchmarks/cpp/heuristic_lookup.cpp | 24 +- benchmarks/cpp/indexselect.cpp | 18 +- benchmarks/cpp/instance_norm.cpp | 8 +- benchmarks/cpp/layer_norm.cpp | 8 +- benchmarks/cpp/layer_norm_backward.cpp | 4 +- benchmarks/cpp/layer_norm_fused.cpp | 4 +- benchmarks/cpp/lstm_cell.cpp | 24 +- benchmarks/cpp/matmul.cpp | 30 +- benchmarks/cpp/reduction.cpp | 4 +- benchmarks/cpp/rms_norm.cpp | 4 +- benchmarks/cpp/rms_norm_backward.cpp | 4 +- benchmarks/cpp/scale_bias_relu.cpp | 8 +- benchmarks/cpp/shape_inference.cpp | 44 +- benchmarks/cpp/softmax.cpp | 16 +- benchmarks/cpp/softmax_backward.cpp | 4 +- benchmarks/cpp/softmax_dropout.cpp | 4 +- benchmarks/cpp/timm.cpp | 32 +- benchmarks/cpp/transpose.cpp | 4 +- benchmarks/cpp/utils.cpp | 18 +- benchmarks/cpp/utils.h | 4 +- csrc/fusion.h | 2 +- csrc/host_ir/executor.cpp | 10 +- csrc/host_ir/executor.h | 4 +- csrc/kernel_ir.h | 10 +- csrc/options.h | 2 +- csrc/python_frontend/fusion_cache.cpp | 6 +- csrc/python_frontend/fusion_cache.h | 4 +- csrc/runtime/executor.cpp | 120 +-- csrc/runtime/executor.h | 20 +- csrc/runtime/executor_utils.cpp | 2 +- csrc/runtime/executor_utils.h | 8 +- csrc/runtime/fusion_cache_utils.h | 4 +- csrc/runtime/fusion_executor_cache.cpp | 8 +- csrc/runtime/fusion_executor_cache.h | 6 +- csrc/runtime/fusion_kernel_runtime.cpp | 14 +- csrc/runtime/fusion_kernel_runtime.h | 6 +- csrc/scheduler/compile_time_info.h | 2 +- csrc/scheduler/matmul_utils.cpp | 2 +- csrc/scheduler/utils.cpp | 2 +- csrc/serde/Serde.md | 8 +- csrc/serde/fusion_cache.fbs | 10 +- csrc/serde/polymorphic_value.h | 2 +- examples/sinh_extension/main.cpp | 6 +- 
examples/sinh_libtorch/main.cpp | 6 +- tests/cpp/test_alias.cpp | 437 ++++++---- tests/cpp/test_alias_analysis.cpp | 6 +- tests/cpp/test_allocation_domain.cpp | 256 +++--- tests/cpp/test_allocation_order_inference.cpp | 4 +- tests/cpp/test_circular_buffering.cpp | 182 ++-- .../test_combined_inner_outer_reduction.cpp | 44 +- tests/cpp/test_dynamic_transform.cpp | 121 ++- tests/cpp/test_external_src.cpp | 12 +- tests/cpp/test_gpu1.cpp | 582 +++++++------ tests/cpp/test_gpu2.cpp | 814 +++++++++--------- tests/cpp/test_gpu3.cpp | 677 ++++++++------- tests/cpp/test_gpu_compute_with.cpp | 36 +- tests/cpp/test_gpu_fused_reduction.cpp | 244 +++--- tests/cpp/test_gpu_indexing_ops.cpp | 6 +- tests/cpp/test_gpu_outer_reduction.cpp | 84 +- tests/cpp/test_gpu_transpose.cpp | 12 +- tests/cpp/test_gpu_view.cpp | 72 +- tests/cpp/test_host_irs.cpp | 2 +- tests/cpp/test_indexing.cpp | 102 +-- tests/cpp/test_indexing_advanced.cpp | 128 +-- tests/cpp/test_inlining.cpp | 24 +- tests/cpp/test_loop_domain_scheduling.cpp | 18 +- tests/cpp/test_loop_rotation.cpp | 36 +- tests/cpp/test_matmul.cpp | 362 ++++---- tests/cpp/test_matmul_aten_evaluation.cpp | 28 +- tests/cpp/test_matmul_sass.cpp | 16 +- tests/cpp/test_matmul_scheduler.cpp | 70 +- tests/cpp/test_mbarrier.cpp | 10 +- tests/cpp/test_memory.cpp | 434 +++++----- tests/cpp/test_mma.cpp | 36 +- tests/cpp/test_move_pad.cpp | 102 ++- tests/cpp/test_move_split_cat.cpp | 130 +-- .../test_multidevice_lower_communication.cpp | 66 +- tests/cpp/test_multidevice_matmul.cpp | 78 +- tests/cpp/test_multidevice_sharding.cpp | 69 +- tests/cpp/test_multidevice_transformer.cpp | 28 +- tests/cpp/test_no_op.cpp | 16 +- tests/cpp/test_persistent_buffer.cpp | 41 +- tests/cpp/test_pointwise.cpp | 67 +- tests/cpp/test_predicate_elimination.cpp | 58 +- tests/cpp/test_preseg_passes.cpp | 7 +- tests/cpp/test_replay.cpp | 10 +- tests/cpp/test_resize.cpp | 338 ++++---- tests/cpp/test_rng.cpp | 68 +- tests/cpp/test_scalar_hoisting.cpp | 24 +- tests/cpp/test_scatter_gather.cpp | 99 +-- tests/cpp/test_sdpa_node.cpp | 52 +- tests/cpp/test_segmentation.cpp | 145 ++-- tests/cpp/test_serial_gridreduce.cpp | 6 +- tests/cpp/test_sharding.cpp | 6 +- tests/cpp/test_smem_reuse.cpp | 18 +- tests/cpp/test_swizzle.cpp | 62 +- tests/cpp/test_tensor_factories.cpp | 12 +- tests/cpp/test_translate_mma.cpp | 8 +- tests/cpp/test_tutorial.cpp | 112 +-- tests/cpp/test_unary.cpp | 11 +- tests/cpp/test_utils.cpp | 10 +- tests/cpp/utils.cpp | 10 +- tests/cpp/utils.h | 4 +- tools/examples/repro.cpp | 4 +- 113 files changed, 3684 insertions(+), 3458 deletions(-) diff --git a/benchmarks/cpp/batch_norm_channels_first.cpp b/benchmarks/cpp/batch_norm_channels_first.cpp index 1bc1845d912..22098787766 100644 --- a/benchmarks/cpp/batch_norm_channels_first.cpp +++ b/benchmarks/cpp/batch_norm_channels_first.cpp @@ -78,7 +78,7 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BatchNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -102,7 +102,7 @@ static void NvFuserScheduler_BatchNorm( std::vector aten_inputs( {at_x, at_weight, at_bias, at_run_mean, at_run_var}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git 
a/benchmarks/cpp/batch_norm_channels_first_backward.cpp b/benchmarks/cpp/batch_norm_channels_first_backward.cpp index 271d04eece9..0edd2d3e52d 100644 --- a/benchmarks/cpp/batch_norm_channels_first_backward.cpp +++ b/benchmarks/cpp/batch_norm_channels_first_backward.cpp @@ -89,7 +89,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BatchNorm_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -115,7 +115,7 @@ static void NvFuserScheduler_BatchNorm_BWD( std::vector aten_inputs( {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/batch_norm_channels_last.cpp b/benchmarks/cpp/batch_norm_channels_last.cpp index dc21fede5f6..bbdd5c82e63 100644 --- a/benchmarks/cpp/batch_norm_channels_last.cpp +++ b/benchmarks/cpp/batch_norm_channels_last.cpp @@ -79,7 +79,7 @@ static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BatchNorm_nhwc( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -103,7 +103,7 @@ static void NvFuserScheduler_BatchNorm_nhwc( std::vector aten_inputs( {at_x, at_weight, at_bias, at_run_mean, at_run_var}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/batch_norm_channels_last_backward.cpp b/benchmarks/cpp/batch_norm_channels_last_backward.cpp index a11627139d4..e40508fffe3 100644 --- a/benchmarks/cpp/batch_norm_channels_last_backward.cpp +++ b/benchmarks/cpp/batch_norm_channels_last_backward.cpp @@ -90,7 +90,7 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BatchNorm_nhwc_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -116,7 +116,7 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD( std::vector aten_inputs( {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/bert.cpp b/benchmarks/cpp/bert.cpp index d22e234cde1..94edab48479 100644 --- a/benchmarks/cpp/bert.cpp +++ b/benchmarks/cpp/bert.cpp @@ -118,7 +118,7 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_DivMaxSoftDropFwd( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto w = benchmark_state.range(0); auto x = benchmark_state.range(1); @@ -135,7 +135,7 @@ static void NvFuserScheduler_DivMaxSoftDropFwd( std::vector at_inputs = {t0, t1}; auto 
bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); @@ -143,7 +143,7 @@ static void NvFuserScheduler_DivMaxSoftDropFwd( static void NvFuserScheduler_DivMaxSoftDropBwd( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto w = benchmark_state.range(0); auto x = benchmark_state.range(1); @@ -162,7 +162,7 @@ static void NvFuserScheduler_DivMaxSoftDropBwd( std::vector at_inputs = {t0, t1, t2, t3}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); // Some reason t1 isn't used, ignore it. bytes -= @@ -228,7 +228,7 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BiasDropoutAddLayernormFwd( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto x = benchmark_state.range(0); auto y = benchmark_state.range(1); @@ -247,7 +247,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd( std::vector at_inputs = {t0, t1, t2, t3, t4}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); @@ -304,7 +304,7 @@ static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BiasDropoutAddLayernormBwd1( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto x = benchmark_state.range(0); auto y = benchmark_state.range(1); @@ -322,7 +322,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1( std::vector at_inputs = {t0, t1, t2, t3}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); @@ -380,7 +380,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BiasDropoutAddLayernormBwd2( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto x = benchmark_state.range(0); auto y = benchmark_state.range(1); @@ -398,7 +398,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2( std::vector at_inputs = {t4, t5, t1, t8}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); @@ -438,7 +438,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BiasDropoutAddLayernormBwd3( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto x = benchmark_state.range(0); auto y = benchmark_state.range(1); @@ -454,7 +454,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3( std::vector at_inputs = {t0, t21}; auto 
bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); diff --git a/benchmarks/cpp/broadcast.cpp b/benchmarks/cpp/broadcast.cpp index c3accd47d2e..6ef7564a6e0 100644 --- a/benchmarks/cpp/broadcast.cpp +++ b/benchmarks/cpp/broadcast.cpp @@ -56,7 +56,7 @@ static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) { static void NvFuserScheduler_Broadcast( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int bcast_dim) { auto bcast_size = benchmark_state.range(0); @@ -74,7 +74,7 @@ static void NvFuserScheduler_Broadcast( std::vector aten_inputs({t0, t1}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/gelu_backward.cpp b/benchmarks/cpp/gelu_backward.cpp index 512ea915ae2..24cb5fa72f6 100644 --- a/benchmarks/cpp/gelu_backward.cpp +++ b/benchmarks/cpp/gelu_backward.cpp @@ -162,8 +162,8 @@ static void NvFuserScheduler_GeluBackward_Compile( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs, heuristic_params->lparams); } } @@ -187,13 +187,13 @@ static void NvFuserScheduler_GeluBackward_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs, heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); for (auto _ : benchmark_state) { - outputs = executor.runFusion( + outputs = ke.runFusion( c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); clearL2Cache(); @@ -218,11 +218,11 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs, heuristic_params->lparams); runBenchmarkIterations( - benchmark_state, &executor, inputs, heuristic_params->lparams); + benchmark_state, &ke, inputs, heuristic_params->lparams); } BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_GpuOnly) @@ -247,12 +247,12 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.setExecuteKernelFlag(false); - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.setExecuteKernelFlag(false); + ke.compileFusion(&fusion, inputs, heuristic_params->lparams); for (auto _ : benchmark_state) { - outputs = executor.runFusion( + outputs = ke.runFusion( c10::ArrayRef(inputs), heuristic_params->lparams); } } diff --git a/benchmarks/cpp/gelu_backward_reduction.cpp b/benchmarks/cpp/gelu_backward_reduction.cpp index 60ea8ed2b29..c4e97fc6d3b 100644 
--- a/benchmarks/cpp/gelu_backward_reduction.cpp +++ b/benchmarks/cpp/gelu_backward_reduction.cpp @@ -93,7 +93,7 @@ static void setupGeluBackwardReduction( static void NvFuserScheduler_GeluBackwardReduction( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int reduction_dim) { auto reduction_size = benchmark_state.range(0); @@ -112,7 +112,7 @@ static void NvFuserScheduler_GeluBackwardReduction( std::vector aten_inputs = {aten_input_grad, aten_input_x}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // inputs: gradient tensor + input tensor // outputs: output, output_of_reduction diff --git a/benchmarks/cpp/heuristic_cache.cpp b/benchmarks/cpp/heuristic_cache.cpp index 29d5b13bef2..5ab923bd6d0 100644 --- a/benchmarks/cpp/heuristic_cache.cpp +++ b/benchmarks/cpp/heuristic_cache.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -84,12 +84,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormBackward_HeuristicCache( @@ -98,14 +98,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); @@ -120,7 +120,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -141,11 +141,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormForward_HeuristicCache( @@ -154,14 +154,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicCache( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 
100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); diff --git a/benchmarks/cpp/heuristic_lookup.cpp b/benchmarks/cpp/heuristic_lookup.cpp index aecc7dc824f..16be106b728 100644 --- a/benchmarks/cpp/heuristic_lookup.cpp +++ b/benchmarks/cpp/heuristic_lookup.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -86,12 +86,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( @@ -100,14 +100,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); @@ -122,7 +122,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -143,11 +143,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormForward_HeuristicLookup( @@ -156,14 +156,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicLookup( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); diff --git a/benchmarks/cpp/indexselect.cpp b/benchmarks/cpp/indexselect.cpp index ba5c7054cab..24eeb31679e 100644 --- 
a/benchmarks/cpp/indexselect.cpp +++ b/benchmarks/cpp/indexselect.cpp @@ -132,8 +132,8 @@ static void NvFuserScheduler_IndexSelect_Compile( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion( + KernelExecutor ke; + ke.compileFusion( &fusion, c10::ArrayRef(inputs), heuristic_params->lparams); } } @@ -155,8 +155,8 @@ static void NvFuserScheduler_IndexSelect_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion( + KernelExecutor ke; + ke.compileFusion( &fusion, c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); @@ -164,7 +164,7 @@ static void NvFuserScheduler_IndexSelect_RunFusion( at::Tensor output = at::empty_like(inputs[0].toTensor()); for (auto _ : benchmark_state) { - executor.runFusion( + ke.runFusion( c10::ArrayRef(inputs), {output}, heuristic_params->lparams); @@ -235,7 +235,7 @@ static void setupIndexSelect(Fusion* fusion, DataType dtype, int select_dim) { static void NvFuserScheduler_IndexSelectSimple( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int select_dim) { auto elem_size = benchmark_state.range(0); @@ -257,7 +257,7 @@ static void NvFuserScheduler_IndexSelectSimple( std::vector aten_inputs = {t0, t1}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -267,7 +267,7 @@ static void NvFuserScheduler_IndexSelectSimple( static void NvFuserScheduler_IndexSelect( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int select_dim) { auto elem_size = benchmark_state.range(0); @@ -289,7 +289,7 @@ static void NvFuserScheduler_IndexSelect( std::vector aten_inputs = {t2, t0, t1}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/instance_norm.cpp b/benchmarks/cpp/instance_norm.cpp index d4c6707e912..2f3a832d8db 100644 --- a/benchmarks/cpp/instance_norm.cpp +++ b/benchmarks/cpp/instance_norm.cpp @@ -81,7 +81,7 @@ static void setupInstanceNorm( static void NvFuserScheduler_InstanceNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, bool channels_last_3d = false) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -116,7 +116,7 @@ static void NvFuserScheduler_InstanceNorm( at_x, at_weight, at_bias, at_mean, at_var}; std::vector outputs; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t kChannels = benchmark_state.range(2); @@ -165,7 +165,7 @@ static void setupInstanceNormNHWC(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_InstanceNormNHWC( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -186,7 +186,7 @@ 
static void NvFuserScheduler_InstanceNormNHWC( std::vector aten_inputs = {at_x, at_weight, at_bias}; std::vector outputs; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t kChannels = benchmark_state.range(2); diff --git a/benchmarks/cpp/layer_norm.cpp b/benchmarks/cpp/layer_norm.cpp index 445b1637274..706f1e8fa84 100644 --- a/benchmarks/cpp/layer_norm.cpp +++ b/benchmarks/cpp/layer_norm.cpp @@ -67,7 +67,7 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_LayerNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -84,7 +84,7 @@ static void NvFuserScheduler_LayerNorm( std::vector aten_inputs({input, weight, bias}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -142,7 +142,7 @@ static void Baseline_LayerNorm_fp16(benchmark::State& benchmark_state) { static void NvFuserScheduler_TIMM_LayerNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -162,7 +162,7 @@ static void NvFuserScheduler_TIMM_LayerNorm( std::vector aten_inputs({input, weight, bias}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/layer_norm_backward.cpp b/benchmarks/cpp/layer_norm_backward.cpp index a14e89fce4f..3da9de991dc 100644 --- a/benchmarks/cpp/layer_norm_backward.cpp +++ b/benchmarks/cpp/layer_norm_backward.cpp @@ -80,7 +80,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_LayerNorm_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -103,7 +103,7 @@ static void NvFuserScheduler_LayerNorm_BWD( std::vector aten_inputs( {grad_out, input, weight, bias, mean, rstd}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/layer_norm_fused.cpp b/benchmarks/cpp/layer_norm_fused.cpp index 823b571aa1c..12cba780d4b 100644 --- a/benchmarks/cpp/layer_norm_fused.cpp +++ b/benchmarks/cpp/layer_norm_fused.cpp @@ -84,7 +84,7 @@ static void setupLayerNormFused(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_LayerNormFused( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Half); @@ -104,7 +104,7 @@ static void NvFuserScheduler_LayerNormFused( std::vector aten_inputs({tv0, tv1, tv2, tv3, tv4}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); 
benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/lstm_cell.cpp b/benchmarks/cpp/lstm_cell.cpp index 3c7b98a4c84..71a2dbc6cba 100644 --- a/benchmarks/cpp/lstm_cell.cpp +++ b/benchmarks/cpp/lstm_cell.cpp @@ -155,8 +155,8 @@ static void NvFuserScheduler_LstmCell_Compile( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); } } @@ -182,13 +182,13 @@ static void NvFuserScheduler_LstmCell_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); C10_CUDA_CHECK(cudaDeviceSynchronize()); for (auto _ : benchmark_state) { - outputs = executor.runFusion( + outputs = ke.runFusion( c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); } @@ -220,11 +220,11 @@ static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); runBenchmarkIterations( - benchmark_state, &executor, inputs, heuristic_params->lparams); + benchmark_state, &ke, inputs, heuristic_params->lparams); } BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_GpuOnly, Small, 512, 64) @@ -259,12 +259,12 @@ static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.setExecuteKernelFlag(false); - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.setExecuteKernelFlag(false); + ke.compileFusion(&fusion, inputs); for (auto _ : benchmark_state) { - outputs = executor.runFusion( + outputs = ke.runFusion( c10::ArrayRef(inputs), heuristic_params->lparams); } } diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp index 4f93dfbaf62..d48bc250ea6 100644 --- a/benchmarks/cpp/matmul.cpp +++ b/benchmarks/cpp/matmul.cpp @@ -175,19 +175,19 @@ static void SingleMatmulBase( // Compile kernel auto launch_constraints = LaunchParams(); - FusionExecutor fe; - fe.compileFusion(fusion, args, launch_constraints, cparams); + KernelExecutor ke; + ke.compileFusion(fusion, args, launch_constraints, cparams); NVF_CHECK( - getBankConflictInfo(fe.kernel(), launch_constraints).empty(), + getBankConflictInfo(ke.kernel(), launch_constraints).empty(), "Shared memory bank conflict not removed."); std::vector aten_inputs({inputs.first, inputs.second}); // Warm up run - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.runFusion(aten_inputs); checkMatch(expected_output, outputs.at(0).to(at::kDouble), k); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); // TODO: FLOPS calculation } @@ -355,19 +355,19 @@ static void SingleMatmulPartitionedK( cparams.index_type = computeIndexType(M, N, K); // Compile kernel - FusionExecutor fe; + KernelExecutor ke; auto lparams = LaunchParams(); - fe.compileFusion(fusion, args, lparams, cparams); + ke.compileFusion(fusion, args, lparams, cparams); NVF_CHECK( - getBankConflictInfo(fe.kernel(), lparams).empty(), + 
getBankConflictInfo(ke.kernel(), lparams).empty(), "Shared memory bank conflict not removed."); // Warm up run - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.runFusion(aten_inputs); checkMatch(expected_output, outputs.at(0).to(at::kDouble), Ki); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); // TODO: FLOPS calculation } @@ -461,21 +461,21 @@ static void NvFuserScheduler_MatmulSplitKReduction( KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); // Compile kernel - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( fusion, args, heuristic_params->lparams, heuristic_params->cparams); NVF_CHECK( - getBankConflictInfo(fe.kernel(), heuristic_params->lparams).empty(), + getBankConflictInfo(ke.kernel(), heuristic_params->lparams).empty(), "Shared memory bank conflict not removed."); // Warm up run - auto outputs = fe.runFusion(aten_inputs, heuristic_params->lparams); + auto outputs = ke.runFusion(aten_inputs, heuristic_params->lparams); checkMatch(expected_output, outputs.at(0).to(at::kDouble), splitk_factor); runBenchmarkIterations( - benchmark_state, &fe, aten_inputs, heuristic_params->lparams); + benchmark_state, &ke, aten_inputs, heuristic_params->lparams); // TODO: FLOPS calculation } diff --git a/benchmarks/cpp/reduction.cpp b/benchmarks/cpp/reduction.cpp index f70fb931e84..84f378967ca 100644 --- a/benchmarks/cpp/reduction.cpp +++ b/benchmarks/cpp/reduction.cpp @@ -50,7 +50,7 @@ static void setupReduction(Fusion* fusion, DataType dtype, int red_axis) { static void NvFuserScheduler_Reduction( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int reduction_dim) { auto reduction_size = benchmark_state.range(0); @@ -65,7 +65,7 @@ static void NvFuserScheduler_Reduction( std::vector aten_inputs({aten_input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/rms_norm.cpp b/benchmarks/cpp/rms_norm.cpp index 1c29c66a631..6085929c179 100644 --- a/benchmarks/cpp/rms_norm.cpp +++ b/benchmarks/cpp/rms_norm.cpp @@ -62,7 +62,7 @@ static void setupRMSNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_RMSNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR( dtype == DataType::Float || dtype == DataType::Half || @@ -80,7 +80,7 @@ static void NvFuserScheduler_RMSNorm( std::vector aten_inputs({input, weight}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/rms_norm_backward.cpp b/benchmarks/cpp/rms_norm_backward.cpp index 357499f2be4..c9422846b19 100644 --- a/benchmarks/cpp/rms_norm_backward.cpp +++ b/benchmarks/cpp/rms_norm_backward.cpp @@ -69,7 +69,7 @@ static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_RMSNorm_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR( dtype == DataType::Float || dtype == DataType::Half || @@ -89,7 +89,7 @@ 
static void NvFuserScheduler_RMSNorm_BWD( std::vector aten_inputs({grad_out, input, weight, rstd}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/scale_bias_relu.cpp b/benchmarks/cpp/scale_bias_relu.cpp index e68c0b9140d..ed1505a2884 100644 --- a/benchmarks/cpp/scale_bias_relu.cpp +++ b/benchmarks/cpp/scale_bias_relu.cpp @@ -114,7 +114,7 @@ static void setupSBRNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_SBR( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { // N, H, W, C format std::vector input_shape{ @@ -136,7 +136,7 @@ static void NvFuserScheduler_SBR( // inputs std::vector aten_inputs = {at_x, at_scale, at_bias}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t size = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; @@ -191,7 +191,7 @@ static void Baseline_SBR(benchmark::State& benchmark_state, DataType dtype) { static void NvFuserScheduler_SBR_Norm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { // N, H, W, C format std::vector input_shape{ @@ -215,7 +215,7 @@ static void NvFuserScheduler_SBR_Norm( std::vector aten_inputs = { at_x, at_weight, at_bias, at_mean, at_var}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t size = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; diff --git a/benchmarks/cpp/shape_inference.cpp b/benchmarks/cpp/shape_inference.cpp index 801759e2c03..3e580b4e6b4 100644 --- a/benchmarks/cpp/shape_inference.cpp +++ b/benchmarks/cpp/shape_inference.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -86,12 +86,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } void LayerNormBackward_ShapeInference_Base( @@ -101,30 +101,30 @@ void LayerNormBackward_ShapeInference_Base( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); NVF_ERROR(runtime->getMaybeHeuristicsFor(args).has_value()); - 
fec->profile(true); - fec->disableKernelLaunch(); - fec->runFusionWithInputs(aten_inputs); + executor_cache->profile(true); + executor_cache->disableKernelLaunch(); + executor_cache->runFusionWithInputs(aten_inputs); if (disable_launch_parameter_cache) { - fec->disableLaunchParamCache(); + executor_cache->disableLaunchParamCache(); } for (auto _ : benchmark_state) { // Setup (not included in the measurement) - fec->runFusionWithInputs(aten_inputs); + executor_cache->runFusionWithInputs(aten_inputs); } } @@ -140,7 +140,7 @@ static void NvFuserScheduler_LayerNormBackward_NoShapeInferenceCachedBaseline( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -161,11 +161,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } void LayerNormForward_ShapeInferenceBase( @@ -175,31 +175,31 @@ void LayerNormForward_ShapeInferenceBase( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); NVF_ERROR(runtime->getMaybeHeuristicsFor(args).has_value()); - fec->profile(true); - fec->disableKernelLaunch(); - fec->runFusionWithInputs(aten_inputs); + executor_cache->profile(true); + executor_cache->disableKernelLaunch(); + executor_cache->runFusionWithInputs(aten_inputs); if (disable_launch_param_cache) { - fec->disableLaunchParamCache(); + executor_cache->disableLaunchParamCache(); } for (auto _ : benchmark_state) { // Setup (not included in the measurement) - fec->runFusionWithInputs(aten_inputs); + executor_cache->runFusionWithInputs(aten_inputs); } } diff --git a/benchmarks/cpp/softmax.cpp b/benchmarks/cpp/softmax.cpp index f1dca672349..ba6b707dd33 100644 --- a/benchmarks/cpp/softmax.cpp +++ b/benchmarks/cpp/softmax.cpp @@ -52,7 +52,7 @@ static void setupSoftmax( static void NvFuserScheduler_Softmax( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int reduction_axis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -70,7 +70,7 @@ static void NvFuserScheduler_Softmax( std::vector aten_inputs({aten_input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -105,10 +105,10 @@ static void NvFuserScheduler_Softmax_WarpReduceReference( auto heuristic_params = scheduler->computeHeuristics(fusion, runtime_info); scheduler->schedule(fusion, heuristic_params.get()); - FusionExecutor fe; - 
fe.compileFusion(fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(fusion, aten_inputs); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -152,10 +152,10 @@ static void NvFuserScheduler_Softmax_WarpReduce( } } - FusionExecutor fe; - fe.compileFusion(fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(fusion, aten_inputs); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/softmax_backward.cpp b/benchmarks/cpp/softmax_backward.cpp index 8c4a84562cc..364a5246016 100644 --- a/benchmarks/cpp/softmax_backward.cpp +++ b/benchmarks/cpp/softmax_backward.cpp @@ -57,7 +57,7 @@ static void setupSoftmaxBWD( static void NvFuserScheduler_Softmax_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int reduction_axis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -83,7 +83,7 @@ static void NvFuserScheduler_Softmax_BWD( std::vector aten_inputs({grad_output, output, input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/softmax_dropout.cpp b/benchmarks/cpp/softmax_dropout.cpp index f43fa24da81..2999d6442f1 100644 --- a/benchmarks/cpp/softmax_dropout.cpp +++ b/benchmarks/cpp/softmax_dropout.cpp @@ -75,7 +75,7 @@ static void setupSoftmaxDropout( static void NvFuserScheduler_SoftmaxDropout( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int kReductionAxis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -96,7 +96,7 @@ static void NvFuserScheduler_SoftmaxDropout( std::vector aten_inputs( {at_scores, at_mask, sqrt(kAttentionHeadSize)}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // 5 dtype: attention_scores + attention_mask + attention_scores_out + // attention_probs_out + output diff --git a/benchmarks/cpp/timm.cpp b/benchmarks/cpp/timm.cpp index 8bffc0bd1ef..cac01eadcca 100644 --- a/benchmarks/cpp/timm.cpp +++ b/benchmarks/cpp/timm.cpp @@ -56,7 +56,7 @@ static void setup_vit_base_patch16_224_bcast7(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -74,7 +74,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7( auto t7 = at::randn(input_shape, fp16_options); std::vector aten_inputs({t2, t3, t4, t7}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensor - float + halfx2 - t2, t7, t39 // Inner most dimension only - floatx2 - t36, t37 @@ -170,7 +170,7 @@ static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) { static void 
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -189,7 +189,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5( auto t1 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t2, t5, t3, t0, t1}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensor - floatx2, halfx2, bool - t2, t16, t3, t34, t16 // Inner most dim only - floatx5 - t5, t0, t1, t7, t17 @@ -236,7 +236,7 @@ static void setup_vit_base_patch16_224_bcast_outer2( static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -252,7 +252,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2( auto t2 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensor - halfx2 - t0, t6 // inner dimension only - halfx2 - t2, t7 @@ -314,7 +314,7 @@ static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -328,7 +328,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3( auto t0 = at::randn(input_shape, fp16_options); std::vector aten_inputs({t0, 0.125}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensors - floatx2, half x2, bool - t12, t4, t0, t19, t14 benchmark_state.SetBytesProcessed( @@ -391,7 +391,7 @@ static void setup_vit_base_patch16_224_bcast_outer6( static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -407,7 +407,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6( auto t2 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensors - float, halfx2, bool - t6, t0, t18, t13 // inner dimension only - float, half - t2, t19 benchmark_state.SetBytesProcessed( @@ -480,7 +480,7 @@ static void setup_vit_base_patch16_224_bcast_inner6( static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -496,7 +496,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6( auto t2 = at::randn({input_shape[0], input_shape[1]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, 
fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensors - float, halfx2, bool - t6, t0, t18, t13 // outer two dimensions only - float, half - t2, t19 @@ -620,7 +620,7 @@ static void setup_vit_base_patch16_224_LN_BWD(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -641,7 +641,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD( auto t9 = at::randn({input_shape[2]}, fp16_options); std::vector aten_inputs({t0, t1, t3, t5, t6, t7, t9, 1.0}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensors - bool, halfx4 - t0, t1, t3, t34, t35 // Outer two dimensions - floatx2 - t5, t6 @@ -701,7 +701,7 @@ static void nhwc_seresnet152d_transpose65(Fusion* fusion, void* null) { static void NvFuserScheduler_nhwc_seresnet152d_transpose65( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -721,7 +721,7 @@ static void NvFuserScheduler_nhwc_seresnet152d_transpose65( auto t4 = at::randn({2}, fp16_options).sum(); std::vector aten_inputs({t2, t5, t7, t9, t4}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensors - halfx6 - t2, t5, t7, t9, t29, t30 benchmark_state.SetBytesProcessed( diff --git a/benchmarks/cpp/transpose.cpp b/benchmarks/cpp/transpose.cpp index b24a2fdbfbe..21389f49f36 100644 --- a/benchmarks/cpp/transpose.cpp +++ b/benchmarks/cpp/transpose.cpp @@ -108,7 +108,7 @@ static void setupTranspose( static void NvFuserScheduler_Transpose( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int num_dims, std::pair axes, @@ -125,7 +125,7 @@ static void NvFuserScheduler_Transpose( auto at_input2 = aten_inputs[1]; std::vector fuser_inputs = {at_input1, at_input2}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, fuser_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/utils.cpp b/benchmarks/cpp/utils.cpp index e171badd9ae..54fa56a063d 100644 --- a/benchmarks/cpp/utils.cpp +++ b/benchmarks/cpp/utils.cpp @@ -170,27 +170,27 @@ int64_t getSizeOfOutputs(const std::vector& outputs) { int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, std::vector& aten_inputs) { c10::cuda::CUDACachingAllocator::emptyCache(); - fusion_executor_cache->profile(true); + executor_cache->profile(true); int64_t io_bytes = getSizeOfInputs(aten_inputs); // Segment and compile the fusion { - auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); io_bytes += getSizeOfOutputs(cg_outputs); } bool segmented = - fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented() && - fusion_executor_cache->getMostRecentKernelRuntime() + 
executor_cache->getMostRecentKernelRuntime()->isSegmented() && + executor_cache->getMostRecentKernelRuntime() ->fusionSegments() ->groups() .size() > 1; - const auto& compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); + const auto& compile_log = executor_cache->getMostRecentExecutorInfo(); auto params = toString(compile_log.params); auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); // Only set if not segmented. In the case of segmented fusions, @@ -200,7 +200,7 @@ int64_t runBenchmarkIterations( benchmark_state.SetLabel(params + lparams); } - fusion_executor_cache->profile(false); + executor_cache->profile(false); // Sync everything up before we start NVFUSER_CUDA_RT_SAFE_CALL(cudaDeviceSynchronize()); @@ -208,7 +208,7 @@ int64_t runBenchmarkIterations( for (auto _ : benchmark_state) { clearL2Cache(); - auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); benchmark_state.SetIterationTime( FusionProfiler::profile().kernel_time_ms / 1000.0); } @@ -223,7 +223,7 @@ int64_t runBenchmarkIterations( int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutor* fusion_executor, + KernelExecutor* fusion_executor, std::vector& aten_inputs, const LaunchParams& launch_constraints, CompileParams compile_params) { diff --git a/benchmarks/cpp/utils.h b/benchmarks/cpp/utils.h index 61c5e556af3..67beb1ca7d5 100644 --- a/benchmarks/cpp/utils.h +++ b/benchmarks/cpp/utils.h @@ -40,7 +40,7 @@ std::string toString(LaunchParams lparams); //! if not segmented. int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, std::vector& aten_inputs); //! Run benchmark iterations with a fusion executor and @@ -48,7 +48,7 @@ int64_t runBenchmarkIterations( //! kernel time is added to benchmark_state. int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutor* fusion_executor, + KernelExecutor* fusion_executor, std::vector& aten_inputs, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams()); diff --git a/csrc/fusion.h b/csrc/fusion.h index 3bc97d957a4..871dda53811 100644 --- a/csrc/fusion.h +++ b/csrc/fusion.h @@ -403,7 +403,7 @@ class NVF_API Fusion : public IrContainer { static IrCloner copy(const Fusion* from, Fusion* to); //! During scheduling, this can be set to a non-negative value. If done, then - //! during execution by FusionExecutor, we will check that this value matches + //! during execution by KernelExecutor, we will check that this value matches //! the corresponding value in LaunchParams. int64_t expectedDynamicSmemBytes() const { return expected_dynamic_smem_bytes_; diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 402784153a0..1ef0e81b3e3 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -141,7 +141,7 @@ void HostIrExecutor::handle(PostOnStream* post_ir) { "op must be a HostUnit: ", post_ir->hostOpToPost()); auto hu = post_ir->hostOpToPost()->as(); - // Compile the fusion and execute it with FusionExecutor(Cache) + // Compile the fusion and execute it with KernelExecutor(Cache) // Check if the executor has been cached. 
If not, create and cache it if (params_.use_fusion_executor_cache) { if (!fec_.count(hu)) { @@ -153,13 +153,13 @@ void HostIrExecutor::handle(PostOnStream* post_ir) { } outputs = fec_.at(hu).runFusionWithInputs(input_IValues); } else { - FusionExecutor& fe = fe_[hu]; - if (!fe.isCompiled()) { + KernelExecutor& ke = fe_[hu]; + if (!ke.isCompiled()) { Fusion* fusion = hu->fusion_to_execute(); DynamicTransform::concretizeFusion(fusion, input_IValues); - fe.compileFusion(fusion, input_IValues); + ke.compileFusion(fusion, input_IValues); } - outputs = fe.runFusion(input_IValues); + outputs = ke.runFusion(input_IValues); if (!params_.cache_fusion_executor) { fe_.erase(hu); } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 2dcec129cc8..96bcc725d7b 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -36,7 +36,7 @@ duplication will be resolved in the future. // Set of parameters that control the behavior of HostIrExecutor struct HostIrExecutorParams { // Experimental: whether to use FusionExecutorCache rather than - // FusionExecutor. + // KernelExecutor. bool use_fusion_executor_cache = false; // Experimental: whether to apply auto-scheduling in FusionExecutorCache if // use_fusion_executor_cache=true. WAR: temporary hack mainly used for @@ -95,7 +95,7 @@ class HostIrExecutor final : public OptOutDispatch { // Stores concrete computed values ExpressionEvaluator expr_evaluator_; // Cache Fusions, FusionExecutors - std::unordered_map<HostUnit*, FusionExecutor> fe_; + std::unordered_map<HostUnit*, KernelExecutor> fe_; std::unordered_map<HostUnit*, FusionExecutorCache> fec_; using StreamKey = std::variant; std::unordered_map streams_; diff --git a/csrc/kernel_ir.h b/csrc/kernel_ir.h index 8be502233a5..f5f062cdbb7 100644 --- a/csrc/kernel_ir.h +++ b/csrc/kernel_ir.h @@ -332,12 +332,12 @@ class NVF_API Allocate final : public Expr { //! hold counters starting at zero. Typically, each participating thread would //! increment the counter and the last thread would leave the counter in a //! non-zeroed state. The next time that kernel is run, it can no longer - //! re-use the non-zero semaphore buffer, so FusionExecutor will launch + //! re-use the non-zero semaphore buffer, so KernelExecutor will launch //! at::zeros to allocate a new buffer, resulting in a memset kernel launch. //! //! Instead, if the last thread resets the counter to zero, then the buffer //! can be re-used, and at::zeros need only be run at the first kernel - //! launch. If resetsToZero() is true, then FusionExecutor will use + //! launch. If resetsToZero() is true, then KernelExecutor will use //! contigZeroedTensor() and releaseZeroedMemory() from global_allocator.h to //! reuse zeroed memory avoiding the additional kernel launch. //! @@ -840,7 +840,7 @@ class NVF_API IfThenElse final : public Expr { //! This node is used only after lowering a fusion to explicitly mark a grid //! reduction and the buffer allocation needed to do it. //! -//! This node provides FusionExecutor the information it needs to allocate the +//!
This node provides KernelExecutor the information it needs to allocate the //! broadcast and sync buffers. class NVF_API GridBroadcast final : public Expr { public: @@ -1043,7 +1043,7 @@ class NVF_API GridBroadcast final : public Expr { //! This node is used only after lowering a fusion to explicitly mark a grid //! reduction and the buffer allocation needed to do it. //! -//! This node provides FusionExecutor the information it needs to allocate the +//! This node provides KernelExecutor the information it needs to allocate the //! reduction and sync buffers. //! //! TODO: Make this a subclass of WelfordOp diff --git a/csrc/options.h b/csrc/options.h index 76522b93a9d..f256f9860cc 100644 --- a/csrc/options.h +++ b/csrc/options.h @@ -39,7 +39,7 @@ enum class DebugDumpOption { FusionIrPresched, //!< Dump the segmented Fusion IR before it is scheduled // TODO(wujingyue): name the following FusionIrSched FusionIr, //!< Dump the Fusion IR before lowering. This is the Fusion IR fed - //!< to `FusionExecutor::compileFusion`. + //!< to `KernelExecutor::compileFusion`. FusionIrMath, //!< Dump just the compute (math) part of the above `FusionIr` //!< for conciseness KernelIr, //!< Dump the compiler Kernel IR diff --git a/csrc/python_frontend/fusion_cache.cpp b/csrc/python_frontend/fusion_cache.cpp index e95ee6820da..c896592af5a 100644 --- a/csrc/python_frontend/fusion_cache.cpp +++ b/csrc/python_frontend/fusion_cache.cpp @@ -188,7 +188,7 @@ FusionCache* FusionCache::singleton_ = nullptr; UserSchedule::UserSchedule() : scheduled_fusion(nullptr), executor(nullptr) { scheduled_fusion = std::make_unique(); - executor = std::make_unique(); + executor = std::make_unique(); } bool UserSchedule::canSchedule(const SchedulerType& scheduler_type) { @@ -688,7 +688,7 @@ void FusionCache::serialize(std::string filename) const { &fb_nodes, &terminal_node_idx, &fb_auto_gen_schedules, - FusionExecutor::getGlobalFusionCount(), + KernelExecutor::getGlobalFusionCount(), device_prop->major, device_prop->minor, cuda_major, @@ -722,7 +722,7 @@ void FusionCache::deserialize(std::string filename) { NVF_CHECK(fusion_cache_buffer != nullptr, "Fusion Cache buffer is invalid."); // 0. Set static fusion count in Fusion Executor - FusionExecutor::setGlobalFusionCount( + KernelExecutor::setGlobalFusionCount( fusion_cache_buffer->global_fusion_count()); // 1. Deserialize max_fusions field diff --git a/csrc/python_frontend/fusion_cache.h b/csrc/python_frontend/fusion_cache.h index 2d4f2533ba5..ffe8088b82c 100644 --- a/csrc/python_frontend/fusion_cache.h +++ b/csrc/python_frontend/fusion_cache.h @@ -41,7 +41,7 @@ struct UserSchedule { std::unique_ptr scheduled_fusion; //! Generated kernel container - std::unique_ptr executor; + std::unique_ptr executor; //! ID of fusion in python frontend fusion cache int64_t fusion_id_ = -1; @@ -102,7 +102,7 @@ struct FusionSchedules { //! Keeps a pointer to the last scheduled Fusion IR for printing Fusion* last_user_def_scheduled_ir; //! Keeps a pointer to the last executed executor for printing its cuda kernel - FusionExecutor* last_user_def_executor; + KernelExecutor* last_user_def_executor; //! For thread-Safe locking of Fusion Schedules std::mutex scheds_lock; //! 
ID of fusion in python frontend fusion cache diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 2acc3001822..90eb036073c 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -138,10 +138,10 @@ std::string getStructuredCodeFromExternalFiles(const int64_t fusion_id) { } } // namespace -FusionExecutor::FusionExecutor() +KernelExecutor::KernelExecutor() : communicator_(&Communicator::getInstance()) {} -std::unique_ptr& FusionExecutor:: +std::unique_ptr& KernelExecutor:: evaluatorPrecomputedValues() { if (!evaluator_precomputed_values_) { evaluator_precomputed_values_ = @@ -150,7 +150,7 @@ std::unique_ptr& FusionExecutor:: return evaluator_precomputed_values_; } -std::string FusionExecutor::getStructuredCode( +std::string KernelExecutor::getStructuredCode( const std::string& kernel_str, PrimDataType index_type) const { // generating cuda code; @@ -181,11 +181,11 @@ std::string FusionExecutor::getStructuredCode( return code; } -std::string FusionExecutor::getStructuredCode() const { +std::string KernelExecutor::getStructuredCode() const { return getStructuredCode(kernelString(), kernel()->indexType()); } -void FusionExecutor::compileFusion( +void KernelExecutor::compileFusion( Fusion* fusion, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, @@ -195,7 +195,7 @@ void FusionExecutor::compileFusion( int64_t concrete_id, int64_t runtime_id, int64_t group_id) { - FUSER_PERF_SCOPE("FusionExecutor::compileFusion"); + FUSER_PERF_SCOPE("KernelExecutor::compileFusion"); NVF_ERROR( !fusion->outputs().empty(), "No output found for this kernel, aborting."); @@ -456,7 +456,7 @@ void FusionExecutor::compileFusion( kernel_id_, compile_params, block_size); - NVF_ERROR(validKernelId(), "Invalid kernel id for FusionExecutor."); + NVF_ERROR(validKernelId(), "Invalid kernel id for KernelExecutor."); // These should be nullopt at this point, but reset just in case resetCompiledKernelProperties(); @@ -475,12 +475,12 @@ void FusionExecutor::compileFusion( } } -LaunchParams FusionExecutor::computeLaunchParams( +LaunchParams KernelExecutor::computeLaunchParams( const LaunchParams& launch_constraints, ExpressionEvaluator& expr_eval, const int64_t warp_size, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::computeLaunchParams"); + FUSER_PERF_SCOPE("KernelExecutor::computeLaunchParams"); NVF_ERROR(warp_size > 0, "WARP_SIZE should be larger than 0"); LaunchParams launch_params; @@ -555,7 +555,7 @@ LaunchParams FusionExecutor::computeLaunchParams( // Run through the rest of the parallel IterDomains and infer their size for (auto [p_type, extent] : simplified_parallel_iter_extents) { - FUSER_PERF_SCOPE("FusionExecutor::ParallelBindingResolution"); + FUSER_PERF_SCOPE("KernelExecutor::ParallelBindingResolution"); auto val = expr_eval.evaluate(extent); NVF_ERROR( val.hasValue(), @@ -635,10 +635,10 @@ LaunchParams FusionExecutor::computeLaunchParams( return launch_params; } -std::vector FusionExecutor::getIntermediateBufferInfo( +std::vector KernelExecutor::getIntermediateBufferInfo( ExpressionEvaluator& expr_eval, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::getIntermediateBufferInfo"); + FUSER_PERF_SCOPE("KernelExecutor::getIntermediateBufferInfo"); std::vector global_buffers; const auto kernel = lowered_->kernel(); @@ -685,7 +685,7 @@ std::vector FusionExecutor::getIntermediateBufferInfo( return global_buffers; } -void FusionExecutor::setUsedTVs() { +void KernelExecutor::setUsedTVs() { auto used_vals = fusion()->usedMathVals(); auto 
used_tvs = ir_utils::filterByType(used_vals); used_tvs_.clear(); @@ -744,7 +744,7 @@ void validateCooperativeLaunch( // Dump fusion inputs and outputs as well as some useful fusion // information. Note that inputs and outputs are those that are passed -// to FusionExecutor::runFusion, so outputs may not be given. +// to KernelExecutor::runFusion, so outputs may not be given. void dumpFusionArgs( int64_t fusion_id, const KernelArgumentHolder& args, @@ -768,7 +768,7 @@ void dumpFusionArgs( // Dump arguments that are passed to a CUDA kernel call, which include // the inputs and outputs of the fusion as well as temporary // global-memory buffers. Unlike dumpFusionArgs, which dumps inputs -// and outputs passed to FusionExecutor::runFusion, this function +// and outputs passed to KernelExecutor::runFusion, this function // dumps those that are passed to a CUDA kernel. void dumpKernelArgs( int64_t fusion_id, @@ -803,14 +803,14 @@ void dumpKernelArgs( } // namespace -void FusionExecutor::initializeExecutorEntry( +void KernelExecutor::initializeExecutorEntry( ExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& compile_params, const std::vector& outputs, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::initializeExecutorEntry"); + FUSER_PERF_SCOPE("KernelExecutor::initializeExecutorEntry"); ExpressionEvaluator expr_eval; evaluatorPrecomputedValues()->bindInputs(args); @@ -882,7 +882,7 @@ void FusionExecutor::initializeExecutorEntry( /// @param idx_type_size generally sizeof(int32_t) or sizeof(int64_t); used for /// computing how large the arrays to copy are. static void fillTensorArgMetadata( - FusionExecutor::ExecutorEntry& entry, + KernelExecutor::ExecutorEntry& entry, const PolymorphicValue& tensor_metadata, size_t idx, size_t idx_type_size) { @@ -943,11 +943,11 @@ static void fillTensorArgMetadata( // when we change the rank of a tensor or the number of arguments to a kernel. // It does not need to happen when only shapes change---use recomputeArgs for // that. -void FusionExecutor::computeArgs( +void KernelExecutor::computeArgs( ExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("FusionExecutor::computeArgs"); + FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); const std::vector& params = kernel->parameters(); entry.args.resize(params.size()); @@ -961,11 +961,11 @@ void FusionExecutor::computeArgs( // Reset the arguments that we'll pass to cuLaunchKernel. This needs to be // invoked on every shape change. 
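The computeArgs/recomputeArgs comments above encode a split that is easy to miss amid the rename: one rebuilds the kernel-argument buffers, the other only refreshes them. A hypothetical helper (the `updateArguments` name is for illustration only; the real control flow lives in runFusion and initializeExecutorEntry) makes the intent concrete before the recomputeArgs hunk that follows:

```cpp
// Illustrative sketch, not actual nvFuser code: when each path applies.
void KernelExecutor::updateArguments(
    ExecutorEntry& entry,
    ExpressionEvaluator& expr_eval,
    const kir::Kernel* kernel) const {
  if (entry.args.size() != kernel->parameters().size()) {
    // Tensor rank or argument count changed: rebuild the buffers from
    // scratch (computeArgs resizes entry.args to kernel->parameters()).
    computeArgs(entry, expr_eval, kernel);
  } else {
    // Only shapes changed (the common case): refresh sizes and strides in
    // the existing buffers; this must run on every shape change.
    recomputeArgs(entry, expr_eval, kernel);
  }
}
```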
-void FusionExecutor::recomputeArgs( +void KernelExecutor::recomputeArgs( ExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("FusionExecutor::recomputeArgs"); + FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); // assert(entry.init && "entry was never initialized"); const std::vector& params = kernel->parameters(); @@ -996,10 +996,10 @@ void FusionExecutor::recomputeArgs( } } -void FusionExecutor::recompileKernel( +void KernelExecutor::recompileKernel( const LaunchParams& new_launch_params, const CompileParams& new_compile_params) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::recompileKernel"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::recompileKernel"); const auto structured_code = getStructuredCode(); block_size_high_water_mark_ = new_launch_params.nThreads(); @@ -1026,7 +1026,7 @@ void FusionExecutor::recompileKernel( } } -int64_t FusionExecutor::getAvailableDynamicSmemSize() { +int64_t KernelExecutor::getAvailableDynamicSmemSize() { NVF_ERROR( hasCompiledKernel(), "Cannot get dynamic smem size unless kernel is compiled"); @@ -1041,7 +1041,7 @@ int64_t FusionExecutor::getAvailableDynamicSmemSize() { return available_dynamic_smem_size_.value(); } -int64_t FusionExecutor::getStaticSmemSize() { +int64_t KernelExecutor::getStaticSmemSize() { NVF_ERROR( hasCompiledKernel(), "Cannot get static smem size unless kernel is compiled"); @@ -1057,7 +1057,7 @@ int64_t FusionExecutor::getStaticSmemSize() { return static_smem_size_.value(); } -void FusionExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { +void KernelExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { // If specified, check that dynamic smem size matches what the scheduler // expects int64_t expected_dynamic_smem_size = fusion()->expectedDynamicSmemBytes(); @@ -1082,7 +1082,7 @@ void FusionExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { device_smem_limit_); } -int64_t FusionExecutor::ensureAvailableDynamicSmemSize( +int64_t KernelExecutor::ensureAvailableDynamicSmemSize( int64_t dynamic_smem_size) { NVF_ERROR( hasCompiledKernel(), @@ -1098,15 +1098,15 @@ int64_t FusionExecutor::ensureAvailableDynamicSmemSize( return getAvailableDynamicSmemSize(); } -void FusionExecutor::resetCompiledKernelProperties() { +void KernelExecutor::resetCompiledKernelProperties() { available_dynamic_smem_size_.reset(); static_smem_size_.reset(); } -std::vector FusionExecutor::evaluateFusionOutputs( +std::vector KernelExecutor::evaluateFusionOutputs( std::vector outputs, ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluateFusionOutputs"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::evaluateFusionOutputs"); NVF_ERROR( outputs.empty(), "Fusion executor is using expression evaluator,", @@ -1137,12 +1137,12 @@ at::Tensor findBufferForFusionOutput( } } // namespace -std::vector FusionExecutor::runFusion( +std::vector KernelExecutor::runFusion( KernelArgumentHolder& args, const LaunchParams& launch_constraints, CompileParams compile_params, std::vector outputs) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion"); if (isProfilerEnabled()) { NVF_CHECK( @@ -1165,7 +1165,7 @@ std::vector FusionExecutor::runFusion( auto expr_eval = executor_utils::bindInputs(args, fusion()); if (isExpressionEvaluated(fusion())) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluate_with_ExprEval"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::evaluate_with_ExprEval"); outputs = 
evaluateFusionOutputs(outputs, expr_eval); if (isProfilerEnabled()) { auto& sprof = FusionProfiler::segment(group_id_); @@ -1176,7 +1176,7 @@ std::vector FusionExecutor::runFusion( } if (host_ir_container_ != nullptr) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::host_ir_evaluate"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::host_ir_evaluate"); if (outputs.empty()) { std::vector output_info = getBufferInfos( expr_eval, PrimDataType::Int, host_ir_container_->outputs()); @@ -1204,7 +1204,7 @@ std::vector FusionExecutor::runFusion( return outputs; } - NVF_ERROR(validKernelId(), "Invalid kernel id for FusionExecutor."); + NVF_ERROR(validKernelId(), "Invalid kernel id for KernelExecutor."); NVF_ERROR( !args.getCacheId().has_value() || outputs.empty(), "short cut input cache is not compatible with pre-allocated output"); @@ -1276,7 +1276,7 @@ std::vector FusionExecutor::runFusion( std::vector intermediates; at::Tensor profile_buffer; { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::intermediates"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::intermediates"); for (const auto i : c10::irange(executor_entry->intermediates.size())) { const auto& buf_info = executor_entry->intermediates.at(i); bool has_expansion = false; @@ -1356,7 +1356,7 @@ std::vector FusionExecutor::runFusion( executor_utils::CudaKernelTimer timer(stream); if (execute_kernel_ && !kernel()->topLevelExprs().empty()) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::execute_kernel"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::execute_kernel"); ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); recomputeArgs(*executor_entry, expr_eval, kernel()); @@ -1435,7 +1435,7 @@ std::vector FusionExecutor::runFusion( return outputs; } -int64_t FusionExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { +int64_t KernelExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers for (auto i : c10::irange(args.size())) { @@ -1447,7 +1447,7 @@ int64_t FusionExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { return num_bytes; } -int64_t FusionExecutor::outputBytesProcessed( +int64_t KernelExecutor::outputBytesProcessed( const std::vector& outputs) { int64_t num_bytes = 0; for (auto i : c10::irange(outputs.size())) { @@ -1459,12 +1459,12 @@ int64_t FusionExecutor::outputBytesProcessed( return num_bytes; } -void FusionExecutor::compileRtc( +void KernelExecutor::compileRtc( const std::string& code, const std::string& name, bool structured, PrimDataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::compileRtc"); + FUSER_PERF_SCOPE("KernelExecutor::compileRtc"); NVF_ERROR( index_type == PrimDataType::Int || index_type == PrimDataType::Int32 || "Invalid index type: ", @@ -1482,11 +1482,11 @@ void FusionExecutor::compileRtc( executor_utils::getCompiledKernel(std::nullopt, scode, name, kernel_id_); } -float FusionExecutor::runRtc( +float KernelExecutor::runRtc( const LaunchParams& launch_params, const std::vector& args, PrimDataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::runRtc"); + FUSER_PERF_SCOPE("KernelExecutor::runRtc"); c10::DeviceGuard dg(options_.device); auto stream = at::cuda::getCurrentCUDAStream(); @@ -1547,9 +1547,9 @@ float FusionExecutor::runRtc( return kernel_time_ms; } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder) const { - // See table definition for FusionExecutor in 
serde/fusion_cache.fbs + // See table definition for KernelExecutor in serde/fusion_cache.fbs using fb_executor_entry = flatbuffers::Offset; // Separate unordered_map for executor_entry_lookup into key and value @@ -1564,10 +1564,10 @@ flatbuffers::Offset FusionExecutor::serialize( // When compilation is skipped, avoid serializing cubin because it doesn't // exist. The remaining fields are also not necessary in this case. if (!hasCompiledKernel()) { - return serde::CreateFusionExecutorDirect(builder); + return serde::CreateKernelExecutorDirect(builder); } - return serde::CreateFusionExecutorDirect( + return serde::CreateKernelExecutorDirect( builder, device_smem_limit_, block_size_high_water_mark_, @@ -1585,13 +1585,13 @@ flatbuffers::Offset FusionExecutor::serialize( serialize(builder, compiled_kernel_.get())); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const executor_utils::CompiledKernel* compiled_kernel) const { NVF_ERROR( compiled_kernel_ != nullptr && (!compiled_kernel->cubin.empty() || !compiled_kernel->ptx.empty()), - "Expected compiled cuda kernel before serializing FusionExecutor."); + "Expected compiled cuda kernel before serializing KernelExecutor."); auto fb_kernel_name = builder.CreateString(compiled_kernel->kernel_name); auto fb_compile_args = builder.CreateString(compiled_kernel->compile_args); @@ -1631,7 +1631,7 @@ flatbuffers::Offset FusionExecutor::serialize( return ckb.Finish(); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const ExecutorEntry& data) const { // See table definition for ExecutorEntry in serde/fusion_cache.fbs @@ -1683,7 +1683,7 @@ flatbuffers::Offset FusionExecutor::serialize( &intermediates_fb); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, @@ -1701,8 +1701,8 @@ flatbuffers::Offset FusionExecutor::serialize( is_fusion_output); } -void FusionExecutor::deserialize( - const serde::FusionExecutor* buffer, +void KernelExecutor::deserialize( + const serde::KernelExecutor* buffer, Fusion* fusion, int8_t device_index, CompileParams compile_params, @@ -1711,15 +1711,15 @@ void FusionExecutor::deserialize( int64_t concrete_id, int64_t runtime_id, int64_t group_id) { - // See table definition for FusionExecutor in serde/fusion_cache.fbs + // See table definition for KernelExecutor in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::FusionExecutor is nullptr."); + NVF_ERROR(buffer != nullptr, "serde::KernelExecutor is nullptr."); // TODO Should we set fusion_id, concrete_id, runtime_id, and group_id when we // skip compilation? 
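Because both the C++ class and its flatbuffers table are renamed in lockstep, the save path now reads as follows (a sketch assuming a compiled `KernelExecutor ke`; the `saveExecutor` wrapper is illustrative, not part of the patch):

```cpp
// Sketch: serializing a KernelExecutor to the serde::KernelExecutor table
// declared in serde/fusion_cache.fbs.
void saveExecutor(
    const KernelExecutor& ke,
    flatbuffers::FlatBufferBuilder& builder) {
  // serialize() emits an empty table when no kernel was compiled (see
  // hasCompiledKernel() above), so this is safe either way.
  flatbuffers::Offset<serde::KernelExecutor> fb_exec = ke.serialize(builder);
  builder.Finish(fb_exec);
}
```

Restoring mirrors this: KernelExecutor::deserialize() consumes a `const serde::KernelExecutor*` plus the fusion, device, compile-parameter, and id arguments listed in csrc/runtime/executor.h.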
if (isExpressionEvaluated(fusion)) { fusion_ = std::make_unique(*fusion); - NVF_ERROR(!hasCompiledKernel(), "Failed to deserialize FusionExecutor"); + NVF_ERROR(!hasCompiledKernel(), "Failed to deserialize KernelExecutor"); return; } @@ -1781,10 +1781,10 @@ void FusionExecutor::deserialize( compiled_kernel_ = executor_utils::getCompiledKernel( buffer->compiled_kernel(), compile_params); - NVF_ERROR(hasCompiledKernel(), "Failed to deserialize FusionExecutor"); + NVF_ERROR(hasCompiledKernel(), "Failed to deserialize KernelExecutor"); } -FusionExecutor::ExecutorEntry FusionExecutor::deserialize( +KernelExecutor::ExecutorEntry KernelExecutor::deserialize( const serde::ExecutorEntry* buffer) { // See table definition for ExecutorEntry in serde/fusion_cache.fbs @@ -1807,7 +1807,7 @@ FusionExecutor::ExecutorEntry FusionExecutor::deserialize( return entry; } -GlobalBufferInfo FusionExecutor::deserialize( +GlobalBufferInfo KernelExecutor::deserialize( const serde::GlobalBufferInfo* buffer) { // See table definition for GlobalBufferInfo in serde/fusion_cache.fbs diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index fd68acc3baf..28742b3e9f3 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -34,10 +34,10 @@ struct CompileOptions { c10::Device device = c10::Device(c10::DeviceType::CUDA, 0); }; -class FusionExecutor : public NonCopyable { +class KernelExecutor : public NonCopyable { public: // NVF_API was added for nvfuser_extension. See examples/sinh_extension. - NVF_API FusionExecutor(); + NVF_API KernelExecutor(); //! To compile a fusion with the 32-bit index type, CompileParams //! must be passed in. There used to be an index type associated @@ -135,7 +135,7 @@ class FusionExecutor : public NonCopyable { post_lowering_hooks_.push_back(std::move(hook)); } - // Function to query whether compilation was attempted for a `FusionExecutor` + // Function to query whether compilation was attempted for a `KernelExecutor` bool isCompiled() const { int num_compiled_artifacts = (fusion_ != nullptr) + (lowered_ != nullptr) + (host_ir_container_ != nullptr); @@ -143,7 +143,7 @@ class FusionExecutor : public NonCopyable { return num_compiled_artifacts == 1; }; - // function to query whether a `FusionExecutor` has a compiled kernel to + // function to query whether a `KernelExecutor` has a compiled kernel to // execute bool hasCompiledKernel() const { if (compiled_kernel_ != nullptr) { @@ -355,12 +355,12 @@ class FusionExecutor : public NonCopyable { } //! Serialize Fusion Executor using flatbuffers - flatbuffers::Offset serialize( + flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder) const; //! Deserialize Fusion Executor using flatbuffers void deserialize( - const serde::FusionExecutor* buffer, + const serde::KernelExecutor* buffer, Fusion* fusion, int8_t device_index, CompileParams compile_params, @@ -428,9 +428,9 @@ class FusionExecutor : public NonCopyable { flatbuffers::FlatBufferBuilder& builder, const executor_utils::CompiledKernel* kernel) const; - // ExecutorEntry is an internal POD struct for the FusionExecutor class. + // ExecutorEntry is an internal POD struct for the KernelExecutor class. // We define ExecutorEntry's serialize and deserialize as private methods in - // FusionExecutor. + // KernelExecutor. flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const ExecutorEntry& data) const; @@ -438,9 +438,9 @@ class FusionExecutor : public NonCopyable { //! 
Deserialize ExecutorEntry using flatbuffers ExecutorEntry deserialize(const serde::ExecutorEntry* buffer); - // GlobalBufferInfo is an internal POD struct for the FusionExecutor class. + // GlobalBufferInfo is an internal POD struct for the KernelExecutor class. // We define GlobalBufferInfo's serialize and deserialize as private methods - // in FusionExecutor. + // in KernelExecutor. flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index bb1c9b59f63..565f823d263 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -689,7 +689,7 @@ void validateVectorizedTensors( const std::vector& outputs, caching::ExecutorCompileTimeInfoCache* data_cache, ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("FusionExecutor::validateVectorizedTensors"); + FUSER_PERF_SCOPE("KernelExecutor::validateVectorizedTensors"); validateAlignedVectorizedTensors( kernel, args, outputs, data_cache, expr_eval); diff --git a/csrc/runtime/executor_utils.h b/csrc/runtime/executor_utils.h index 8e99129356c..6c8418dc200 100644 --- a/csrc/runtime/executor_utils.h +++ b/csrc/runtime/executor_utils.h @@ -77,7 +77,7 @@ namespace caching { // the logic in the common space and re-use //! List of all the possible entry types in -//! `FusionExecutor` compile-time data cache. +//! `KernelExecutor` compile-time data cache. enum class CompileTimeEntryType { PARALLEL_BINDING_ITERDOMAINS, PARALLEL_ITER_EXTENT_MAP, @@ -91,7 +91,7 @@ enum class CompileTimeEntryType { //! Entry class definitions for each entry type: //! each class defines the data type for each entry type -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! ParallelBindingIterDomains: //! Stores all the iterdomains that are parallelized //! on the scheduled Fusion graph. They will be used @@ -104,7 +104,7 @@ class ParallelBindingIterDomains { CompileTimeEntryType::PARALLEL_BINDING_ITERDOMAINS; }; -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! ParallelIterExtentMap //! Stores the symbolic extents of all the parallelized //! iterdomains corresponding to each used parallel type. @@ -132,7 +132,7 @@ struct VectorizedTensorInfo { std::vector out_misaligned_tensors_pos; }; -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! VectorizedTensorValidation //! Stores position info and vector word sizes of //! vectorized input/output tensors, to be used diff --git a/csrc/runtime/fusion_cache_utils.h b/csrc/runtime/fusion_cache_utils.h index a61f1844343..452192b11b8 100644 --- a/csrc/runtime/fusion_cache_utils.h +++ b/csrc/runtime/fusion_cache_utils.h @@ -28,7 +28,7 @@ class SegmentedFusion; // Utilities for benchmarking and profiling struct ExecutorLog { std::unique_ptr params = nullptr; - FusionExecutor* fusion_executor = nullptr; + KernelExecutor* fusion_executor = nullptr; }; struct RuntimeWorkSpace { @@ -153,7 +153,7 @@ class InputsIdLookup : public NonCopyable { //! Encode each input sets to with an unique id; //! The returned data structure also indicates whether eviction has happened //! within the lookup cache. This is needed because lookup shortcut is also - //! cached in nested `FusionExecutorCache` and `FusionExecutor`. + //! cached in nested `FusionExecutorCache` and `KernelExecutor`. //! 
see [ Note -- Post-definition cache implementation ] and [ Note -- 2 level //! cache implementation ]. //! diff --git a/csrc/runtime/fusion_executor_cache.cpp b/csrc/runtime/fusion_executor_cache.cpp index 24830ba9bd1..af9c83da2b6 100644 --- a/csrc/runtime/fusion_executor_cache.cpp +++ b/csrc/runtime/fusion_executor_cache.cpp @@ -183,8 +183,8 @@ std::string FusionExecutorCache::getCode( if (intrinsic_code) { const auto& execs = kernel_runtime->executors(); - const FusionExecutor& fe = execs[0]; - auto index_type = fe.kernel()->indexType(); + const KernelExecutor& ke = execs[0]; + auto index_type = ke.kernel()->indexType(); // Make sure all the segment index types match. All segments currently // use the same index type but this code change in the future. for (const auto& exec : execs) { @@ -195,7 +195,7 @@ std::string FusionExecutorCache::getCode( " ", exec.kernel()->indexType()); } - std::string full_code = fe.getStructuredCode(kernel_code, index_type); + std::string full_code = ke.getStructuredCode(kernel_code, index_type); return full_code; } else { return kernel_code; @@ -481,7 +481,7 @@ void FusionExecutorCache::deserialize( device_runtimes.size())); // 3. For FusionKernelRuntime, we have a separate deserialize function - // to create the FusionExecutor objects. + // to create the KernelExecutor objects. device_runtimes.back()->deserialize( fb_fusion_kernel_runtime, args.getDeviceIndex()); diff --git a/csrc/runtime/fusion_executor_cache.h b/csrc/runtime/fusion_executor_cache.h index 57cf6ce9c15..0a0e0520cf8 100644 --- a/csrc/runtime/fusion_executor_cache.h +++ b/csrc/runtime/fusion_executor_cache.h @@ -63,7 +63,7 @@ enum class PrimDataType; //! properties might: rank, DataType, contiguity, stride order, size (whether a //! dimension has size=1). When all of these properties are repeated, there is //! an opportunity to reduce the latency of producing a compiled Fusion and -//! launch params (a FusionExecutor). Given inputs, we first compute an ID using +//! launch params (a KernelExecutor). Given inputs, we first compute an ID using //! InputsIdLookup::lookupId that encodes tensor properties along with values of //! any integer-valued input scalars that might affect concretization. This ID //! is guaranteed not to conflict unless the inputs can be executed by the same @@ -124,7 +124,7 @@ class FusionExecutorCache { int64_t fusion_id = 0, bool auto_schedule = true); - //! Execute fusion graph with given inputs, create `FusionExecutor` as needed + //! Execute fusion graph with given inputs, create `KernelExecutor` as needed //! Note this function also handles permutation & input update outside of //! codegen. //! @@ -242,7 +242,7 @@ class FusionExecutorCache { std::optional selected_device = std::nullopt); //! evict cached short cut entry in `code_to_fe_lookup_` as well as cached - //! entry in `FusionExecutor` + //! entry in `KernelExecutor` void evictCache(size_t cache_id); //! The index type of forced_index_type is used to get a kernel diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp index 7a2768e0cbf..99cb881a2c4 100644 --- a/csrc/runtime/fusion_kernel_runtime.cpp +++ b/csrc/runtime/fusion_kernel_runtime.cpp @@ -125,7 +125,7 @@ FusionKernelRuntime::FusionKernelRuntime( // would go directly to kernel launch. 
   prepareRuntimeOrder(segmented_fusion_.get(), runtime_workspace_);
-  executors_ = std::vector<FusionExecutor>(segmented_fusion_->groups().size());
+  executors_ = std::vector<KernelExecutor>(segmented_fusion_->groups().size());
   if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) {
     segmented_fusion_->print();
   }
@@ -142,8 +142,8 @@ FusionKernelRuntime::FusionKernelRuntime(
 }
 void FusionKernelRuntime::evictCache(size_t input_id) {
-  for (auto& fe : executors_) {
-    fe.evictCache(input_id);
+  for (auto& ke : executors_) {
+    ke.evictCache(input_id);
   }
 }
@@ -159,8 +159,8 @@ flatbuffers::Offset<serde::FusionKernelRuntime> FusionKernelRuntime::serialize(
     flatbuffers::FlatBufferBuilder& builder) const {
   // See table definition for FusionKernelRuntime in serde/fusion_cache.fbs
-  // 1. Serialize FusionExecutor objects
-  std::vector<flatbuffers::Offset<serde::FusionExecutor>> executors_fb;
+  // 1. Serialize KernelExecutor objects
+  std::vector<flatbuffers::Offset<serde::KernelExecutor>> executors_fb;
   executors_fb.reserve(executors_.size());
   for (auto& executor : executors_) {
     executors_fb.push_back(executor.serialize(builder));
@@ -198,7 +198,7 @@ void FusionKernelRuntime::deserialize(
       runtime_id_ == buffer->runtime_id(),
       "Expected FusionKernelRuntime runtime_id to match serde runtime_id.");
-  // 1. Deserialize FusionExecutor objects
+  // 1. Deserialize KernelExecutor objects
   for (auto idx : c10::irange(buffer->executors()->size())) {
     auto sg = runtime_workspace_.group_run_order.at(idx);
@@ -497,7 +497,7 @@ void FusionKernelRuntime::updateHeuristicsLaunchParams(
   }
 }
-const std::vector<FusionExecutor>& FusionKernelRuntime::executors() const {
+const std::vector<KernelExecutor>& FusionKernelRuntime::executors() const {
   return executors_;
 }
diff --git a/csrc/runtime/fusion_kernel_runtime.h b/csrc/runtime/fusion_kernel_runtime.h
index 7e34da833ce..5bf20883203 100644
--- a/csrc/runtime/fusion_kernel_runtime.h
+++ b/csrc/runtime/fusion_kernel_runtime.h
@@ -35,7 +35,7 @@ struct FusionKernelRuntime;
 //!
 //! Two types of instance can be created, one for complete/single-kernel fusion
 //! and one for segmented/multi-kernel fusion.
-//! Conceptually this is a generalization of FusionExecutor that supports both
+//! Conceptually this is a generalization of KernelExecutor that supports both
 //! single-kernel and multi-kernel caching/compiling/launching
 //!
 //! When serde_buffer argument is a nullptr, we run the
@@ -143,7 +143,7 @@ class FusionKernelRuntime {
   //! for kernel launch for a new input dimension but same heuristics
   void updateHeuristicsLaunchParams(HeuristicParamsList* update_heuristics);
-  const std::vector<FusionExecutor>& executors() const;
+  const std::vector<KernelExecutor>& executors() const;
  private:
   //! Runs each fusion segment given arguments. The outputs for a fusion are
@@ -176,7 +176,7 @@ class FusionKernelRuntime {
  private:
   //! Entries indexed by groupID:
   //! Executors holding compiled kernels
-  std::vector<FusionExecutor> executors_;
+  std::vector<KernelExecutor> executors_;
   // A metadata copy of initial arguments used to contruct this
   // FusionKernelRuntime. Used during deserialization to schedule the fusion
diff --git a/csrc/scheduler/compile_time_info.h b/csrc/scheduler/compile_time_info.h
index d413c99ae81..f7ec9d4a97f 100644
--- a/csrc/scheduler/compile_time_info.h
+++ b/csrc/scheduler/compile_time_info.h
@@ -234,7 +234,7 @@ class CompileTimeInfoBase : public PolymorphicBase {
 //! Compile-time information cache for `canSchedule` and `getHeuristics`
 //! interfaces. Each cache instance stores information that could be inferred at
 //! compile time in a fusion and therefore corresponds to an instance of
-//! FusionExecutor.
+//! KernelExecutor.
 class HeuristicDataCache {
   using EntryOwningPtr = std::unique_ptr<CompileTimeInfoBase>;
diff --git a/csrc/scheduler/matmul_utils.cpp b/csrc/scheduler/matmul_utils.cpp
index 799d519a8d2..86ad7e5144d 100644
--- a/csrc/scheduler/matmul_utils.cpp
+++ b/csrc/scheduler/matmul_utils.cpp
@@ -411,7 +411,7 @@ class VectorizationCalculator {
   //! To analyze vectorization, we need to know pointer alignment, sizes, and
   //! strides. SchedulerRuntimeInfo contains all this info about fusion
-  //! inputs, but fusion outputs are allocated by FusionExecutor so they are
+  //! inputs, but fusion outputs are allocated by KernelExecutor so they are
   //! absent from SchedulerRuntimeInfo.
   //!
   //! This function just extracts sizes and strides from runtime_info_ when
diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp
index 750cdb43597..260f5813be7 100644
--- a/csrc/scheduler/utils.cpp
+++ b/csrc/scheduler/utils.cpp
@@ -2565,7 +2565,7 @@ int64_t getSharedMemoryOverheadPerBlock(
     dtype_size = std::max(dtype_size, dataTypeSize(tv->getDataType().value()));
   }
   // for welford, three arrays of type nvfuser_index_t are used to store var,
-  // avg, and n. see FusionExecutor::computeLaunchParams. Here index type is
+  // avg, and n. see KernelExecutor::computeLaunchParams. Here index type is
   // assumed as int64_t
   int64_t welford_factor = ir_utils::hasOpsOfType<WelfordOp>(fusion) ? 3l : 1l;
   if (welford_factor == 3l) {
diff --git a/csrc/serde/Serde.md b/csrc/serde/Serde.md
index c0528950d6b..5f5bcbc18bb 100644
--- a/csrc/serde/Serde.md
+++ b/csrc/serde/Serde.md
@@ -27,19 +27,19 @@ The string's position in the cache becomes the input's cache id.
 This table represents a key-value pair in the unordered_map.
 ### FusionKernelRuntime
-* `FusionKernelRuntime` contains the segments for a Fusion. Each segment is represented by a `FusionExecutor` object.
+* `FusionKernelRuntime` contains the segments for a Fusion. Each segment is represented by a `KernelExecutor` object.
 #### Serialization:
 * We save a metadata copy of the arguments used to construct the `FusionKernelRuntime`. During deserialization,
-we call the constructor using the saved metadata arguments. Afterwards, we regenerate the `FusionExecutor` objects,
+we call the constructor using the saved metadata arguments. Afterwards, we regenerate the `KernelExecutor` objects,
 which are normally built by calling `compileFusionParallel` outside the constructor.
 ### KernelArgumentHolder
 * A collection of `PolymorphicValue` objects representing Scalars [`int, double, bool, complex`], Cpu Scalars, and Gpu Tensors.
 * **Note:** Pointer address of meta aten tensors is zero. The pointer address is used to specify vectorization during schedule.
-### FusionExecutor
-* `FusionExecutor` defines two data structs: `ExecutorEntry` and `GlobalBufferInfo`
+### KernelExecutor
+* `KernelExecutor` defines two data structs: `ExecutorEntry` and `GlobalBufferInfo`
 * `ExecutorEntry` contains information to launch a kernel for a set of input arguments. It contains the launch parameters, output-to-input alias map, and global buffer configurations.
 * `GlobalBufferInfo` specifies the buffer's tensor properties [`shape, stride, dtype`] and its corresponding TensorView.
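To make the `ExecutorEntry` and `GlobalBufferInfo` descriptions above concrete, here is a minimal, self-contained C++ sketch of the two PODs. The field names and types are simplified stand-ins chosen for illustration only; the real definitions live in `csrc/runtime/executor.h` and the flatbuffers schema in `csrc/serde/fusion_cache.fbs`.

#include <array>
#include <cstdint>
#include <unordered_map>
#include <vector>

// Sketch of GlobalBufferInfo: the tensor properties of one global buffer.
struct GlobalBufferInfoSketch {
  std::vector<int64_t> shape;   // sizes of the buffer's tensor
  std::vector<int64_t> stride;  // strides of the buffer's tensor
  int dtype = 0;                // element type, stored as an enum index
  int tv_position = -1;         // hypothetical: position of the corresponding TensorView
};

// Sketch of ExecutorEntry: what is needed to relaunch a compiled kernel
// for one previously seen set of input arguments.
struct ExecutorEntrySketch {
  std::array<int64_t, 6> launch_params{};       // gdim.{x,y,z} and bdim.{x,y,z}
  std::unordered_map<size_t, size_t> aliases;   // output index -> aliased input index
  std::vector<GlobalBufferInfoSketch> outputs;  // output buffer configurations
  std::vector<GlobalBufferInfoSketch> buffers;  // intermediate global buffers
};

The intent, as described above, is that a deserialized `KernelExecutor` can relaunch a kernel without re-deriving launch parameters or buffer configurations for input sets it has already seen.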
diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 0cc499416b6..b21e4ea4f82 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -156,7 +156,7 @@ table Scalar { } // ===================================================================================== -// Tables for PolymorphicValue, ScalarCpu, TensorArg, KernelArgumentHolder used in FusionExecutor. +// Tables for PolymorphicValue, ScalarCpu, TensorArg, KernelArgumentHolder used in KernelExecutor. // The ScalarCpu is represented by a fixed size array of raw bytes. table ScalarCpu { @@ -188,7 +188,7 @@ table KernelArgumentHolder { // // ===================================================================================== -// Tables for LaunchParams, GlobalBufferInfo, ExecutorEntry, and TensorShape used in FusionExecutor +// Tables for LaunchParams, GlobalBufferInfo, ExecutorEntry, and TensorShape used in KernelExecutor // Data representing a tensor shape used in LaunchParam table TensorShape { @@ -355,7 +355,7 @@ table CudaKernel { } // Each Fusion Executor maps to a lowered and compiled kernel. -table FusionExecutor { +table KernelExecutor { device_smem_limit: long; block_size_high_water_mark: long; maxrregcount_high_water_mark: long; @@ -415,14 +415,14 @@ table SegmentedFusion { // Each FusionKernelRuntime represents a concretized, segmented Fusion. // We store the metadata for the original arguments to segment, schedule, and compile the Fusion at deserialization. -// Each fusion segment is given a FusionExecutor. +// Each fusion segment is given a KernelExecutor. // The unscheduled fusion is defined by traversing Trie in FusionCache. table FusionKernelRuntime { fusion_id: long; concrete_id: long; runtime_id: long; args: KernelArgumentHolder; - executors: [FusionExecutor]; + executors: [KernelExecutor]; segmented_fusion: SegmentedFusion; } diff --git a/csrc/serde/polymorphic_value.h b/csrc/serde/polymorphic_value.h index 6ca56a1c69a..5c0245303f9 100644 --- a/csrc/serde/polymorphic_value.h +++ b/csrc/serde/polymorphic_value.h @@ -21,7 +21,7 @@ namespace nvfuser::serde { //! PolymorphicValue table. This factory creates Bool, ComplexDouble, Double, //! Long, CPU Scalar, and CUDA Tensor objects. These arguments are stored in //! KernelArgumentHolder, which is used to schedule the fusion in -//! FusionKernelRuntime and to run a kernel in FusionExecutor. +//! FusionKernelRuntime and to run a kernel in KernelExecutor. 
class PolymorphicValueFactory : public Factory { public: diff --git a/examples/sinh_extension/main.cpp b/examples/sinh_extension/main.cpp index e44086dbbe3..f011b51c786 100644 --- a/examples/sinh_extension/main.cpp +++ b/examples/sinh_extension/main.cpp @@ -34,9 +34,9 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) { auto heuristic_params = SchedulerEntry::scheduleWith(&fusion, SchedulerType::PointWise, {input}); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, heuristic_params->lparams); - auto outputs = fe.runFusion({input}, heuristic_params->lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}, heuristic_params->lparams); + auto outputs = ke.runFusion({input}, heuristic_params->lparams); return outputs[0]; } diff --git a/examples/sinh_libtorch/main.cpp b/examples/sinh_libtorch/main.cpp index 8c83f6d0e23..4487011b249 100644 --- a/examples/sinh_libtorch/main.cpp +++ b/examples/sinh_libtorch/main.cpp @@ -31,9 +31,9 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) { auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, {input}); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, heuristic_params->lparams); - auto outputs = fe.runFusion({input}, heuristic_params->lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}, heuristic_params->lparams); + auto outputs = ke.runFusion({input}, heuristic_params->lparams); return outputs[0]; } diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp index 68337688656..efb6a464fd5 100644 --- a/tests/cpp/test_alias.cpp +++ b/tests/cpp/test_alias.cpp @@ -50,10 +50,11 @@ TEST_F(AliasTest, View) { TensorView* out = reshape(in, in_shape, out_shape); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -61,7 +62,8 @@ TEST_F(AliasTest, View) { EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); // Verify output values. 
- testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, View_AliasForSameLayout) { @@ -80,13 +82,15 @@ TEST_F(AliasTest, View_AliasForSameLayout) { {in->axis(1), in->axis(2), in->axis(0)}, {true, false, false}); out->setAllocationDomain({out->axis(1), out->axis(0)}, false); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({60}).cuda().as_strided({2, 3, 4}, {2, 20, 5}); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); } @@ -105,12 +109,14 @@ TEST_F(AliasTest, View_AliasForCompliantLayout) { out->setAllocationDomain({out->axis(0), out->axis(1)}, {false, false}); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); } @@ -131,10 +137,11 @@ TEST_F(AliasTest, View_NoAliasForIncompliantLayout) { // alias. out->setAllocationDomain({out->axis(1), out->axis(0)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -142,7 +149,8 @@ TEST_F(AliasTest, View_NoAliasForIncompliantLayout) { EXPECT_FALSE(out_tensor.is_alias_of(in_tensor)); // Verify output values. - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, ViewPermute) { @@ -158,10 +166,11 @@ TEST_F(AliasTest, ViewPermute) { out = permute(out, {1, 0}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -169,7 +178,8 @@ TEST_F(AliasTest, ViewPermute) { EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); // Verify output values. 
- testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, DuplicateOutputs) { @@ -185,10 +195,11 @@ TEST_F(AliasTest, DuplicateOutputs) { fusion->addOutput(out); fusion->addOutput(out); // duplicated outputs - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn(in_shape, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 2); at::Tensor out_tensor_0 = out_tensors[0]; at::Tensor out_tensor_1 = out_tensors[1]; @@ -196,12 +207,13 @@ TEST_F(AliasTest, DuplicateOutputs) { // Verify aliasing among duplicated outputs EXPECT_TRUE(out_tensor_0.is_alias_of(out_tensor_1)); // Verify no segmentation - EXPECT_FALSE(fec.getMostRecentKernelRuntime()->isSegmented()) + EXPECT_FALSE(executor_cache.getMostRecentKernelRuntime()->isSegmented()) << "segmentation is not supposed to happen"; at::Tensor expected_out_tensor = in_tensor.add(3.141); // Verify output values. - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, SliceToSizeOne_Issue1353) { @@ -213,14 +225,14 @@ TEST_F(AliasTest, SliceToSizeOne_Issue1353) { TensorView* out = slice(in, {0, 0, 0}, {4, 6, 1}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({4, 6, 7}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); EXPECT_THAT(out_tensor.strides(), ElementsAre(42, 7, _)); testValidate( - fec.fusion(), + executor_cache.fusion(), {in_tensor.slice(/*dim=*/2, /*start=*/c10::nullopt, /*end=*/1)}, {in_tensor}, __LINE__, @@ -236,14 +248,14 @@ TEST_F(AliasTest, SliceRightOfBroadcast) { TensorView* out = slice(in, {0, 0, 0}, {4, 1, 5}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({4, 1, 7}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); EXPECT_THAT(out_tensor.strides(), ElementsAre(7, _, 1)); testValidate( - fec.fusion(), + executor_cache.fusion(), {in_tensor.slice(/*dim=*/2, /*start=*/c10::nullopt, /*end=*/5)}, {in_tensor}, __LINE__, @@ -274,9 +286,10 @@ TEST_F(AliasTest, SliceViewPermute) { fusion->addOutput(split); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({batches, seq_length, features * 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); EXPECT_EQ(out_tensors.size(), 3); for (const auto& out_tensor : out_tensors) { @@ -292,7 +305,7 @@ TEST_F(AliasTest, SliceViewPermute) { } testValidate( - fec.fusion(), + executor_cache.fusion(), out_tensors, {in_tensor}, expected_out_tensors, @@ -317,11 +330,13 @@ TEST_F(AliasTest, 
DuplicateOutputsSegmentedFusion) { fusion->addOutput(out); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn(in_shape, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); // Verify aliasing among duplicated outputs EXPECT_TRUE(out_tensors[0].is_alias_of(out_tensors[1])); @@ -329,22 +344,26 @@ TEST_F(AliasTest, DuplicateOutputsSegmentedFusion) { // Verify segmentation EXPECT_EQ( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups().size(), 2) + executor_cache.getMostRecentKernelRuntime() + ->fusionSegments() + ->groups() + .size(), + 2) << "segmentation didn't happen as expected"; } namespace { // Returns the only executor in the most recent runtime. -const FusionExecutor& onlyExecutorInMostRecentRuntime( - const FusionExecutorCache& fec) { - const std::vector& executors = - fec.getMostRecentKernelRuntime()->executors(); +const KernelExecutor& onlyExecutorInMostRecentRuntime( + const FusionExecutorCache& executor_cache) { + const std::vector& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); return executors.front(); } -bool storesToOutput(const FusionExecutor& executor, const int64_t out_index) { +bool storesToOutput(const KernelExecutor& executor, const int64_t out_index) { // Get the variable name from the `kir::Kernel` not the input fusion, because // they are not always the same. std::string var_name = @@ -371,10 +390,12 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { fusion->addOutput(broadcast_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); @@ -385,7 +406,7 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { // that stores only to the output of the add. // // - broadcast & expand. This segment is meta-op only. 
- FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -394,17 +415,17 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { for (SegmentedGroup* group : runtime->fusionSegments()->groups()) { if (group->schedulerType() == SchedulerType::PointWise) { - const FusionExecutor& fe = runtime->executors().at(group->groupId()); + const KernelExecutor& ke = runtime->executors().at(group->groupId()); int num_stores = 0; for (auto i : c10::irange(group->outputs().size())) { - if (storesToOutput(fe, i)) { + if (storesToOutput(ke, i)) { num_stores++; } } EXPECT_EQ(num_stores, 1) << "The generated CUDA kernel is expected to store data to one output:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } } } @@ -427,13 +448,15 @@ TEST_F(AliasTest, NotAllOutputsAlias_Reduction) { fusion->addOutput(view_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({16 * 12 * 128 * 192}) .cuda() .as_strided({16, 12, 128, 192}, {128 * 12 * 192, 192, 12 * 192, 1}); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); EXPECT_TRUE(out_tensors[2].is_alias_of(in_tensor)); @@ -452,15 +475,17 @@ TEST_F(AliasTest, Issue1452) { fusion->addOutput(set_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({1024, 1024}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor set_out_tensor = out_tensors[0]; EXPECT_TRUE(set_out_tensor.is_alias_of(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -469,17 +494,17 @@ TEST_F(AliasTest, Issue1452) { for (SegmentedGroup* group : runtime->fusionSegments()->groups()) { if (group->schedulerType() == SchedulerType::PointWise) { - const FusionExecutor& fe = runtime->executors().at(group->groupId()); + const KernelExecutor& ke = runtime->executors().at(group->groupId()); int num_stores = 0; for (auto i : c10::irange(group->outputs().size())) { - if (storesToOutput(fe, i)) { + if (storesToOutput(ke, i)) { num_stores++; } } EXPECT_EQ(num_stores, 1) << "The generated CUDA kernel is expected to store data to one output:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } } } @@ -495,20 +520,22 @@ TEST_F(AliasTest, AliasOutputBeforeNonAliasOutput) { fusion->addOutput(slice_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - 
testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor slice_out_tensor = out_tensors[0]; EXPECT_TRUE(slice_out_tensor.is_alias_of(in_tensor)); - const FusionExecutor& fe = onlyExecutorInMostRecentRuntime(fec); - EXPECT_FALSE(storesToOutput(fe, /*out_index=*/0)) + const KernelExecutor& ke = onlyExecutorInMostRecentRuntime(executor_cache); + EXPECT_FALSE(storesToOutput(ke, /*out_index=*/0)) << "The generated CUDA kernel shouldn't store data to output 0:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } TEST_F(AliasTest, Set_NoAliasForIncompatibleLayout) { @@ -523,9 +550,10 @@ TEST_F(AliasTest, Set_NoAliasForIncompatibleLayout) { // I intentionally set the allocation order to be different to block aliasing. out->setAllocationDomain({out->axis(1), out->axis(2), out->axis(0)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -549,10 +577,11 @@ TEST_F(AliasTest, DuplicateOutputsComplex) { // duplicated output fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 4); // Verify aliases among outputs. @@ -561,7 +590,8 @@ TEST_F(AliasTest, DuplicateOutputsComplex) { EXPECT_TRUE(out_tensors[0].is_alias_of(out_tensors[3])); // Verify output values. 
- testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } // test verifying that duplicated input is not allowed in nvfuser @@ -593,10 +623,12 @@ TEST_F(AliasTest, AliasInSegment) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); } @@ -617,17 +649,20 @@ TEST_F(AliasTest, TrivialInputForwarding) { at::Tensor t0 = at::randn({10, 4}).cuda(); at::Tensor t1 = at::randn({10, 4}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - std::vector cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector cg_outputs = + executor_cache.runFusionWithInputs({t0, t1}); EXPECT_EQ(cg_outputs[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs, {t0, t1}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, {t0, t1}, __LINE__, __FILE__); // Second run to ensure cache hit handles trivial forwarding properly - EXPECT_TRUE(fec.isCompiled({t0, t1})); - auto cg_outputs2 = fec.runFusionWithInputs({t0, t1}); + EXPECT_TRUE(executor_cache.isCompiled({t0, t1})); + auto cg_outputs2 = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_EQ(cg_outputs2[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs2, {t0, t1}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs2, {t0, t1}, __LINE__, __FILE__); } TEST_F(AliasTest, TrivialInputForwarding_ScalarTensor) { @@ -640,16 +675,16 @@ TEST_F(AliasTest, TrivialInputForwarding_ScalarTensor) { at::Tensor t0 = at::randn({}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - auto cg_outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); EXPECT_EQ(cg_outputs[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__); // Second run to ensure cache hit handles trivial forwarding properly - EXPECT_TRUE(fec.isCompiled({t0})); - auto cg_outputs2 = fec.runFusionWithInputs({t0}); + EXPECT_TRUE(executor_cache.isCompiled({t0})); + auto cg_outputs2 = executor_cache.runFusionWithInputs({t0}); EXPECT_EQ(cg_outputs2[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs2, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs2, {t0}, __LINE__, __FILE__); } TEST_F(AliasTest, OutputAliasesAnotherOutput) { @@ -665,10 +700,12 @@ TEST_F(AliasTest, OutputAliasesAnotherOutput) { fusion->addOutput(reshape_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + 
executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); ASSERT_EQ(out_tensors.size(), 2); EXPECT_TRUE(out_tensors[1].is_alias_of(out_tensors[0])); @@ -689,12 +726,14 @@ TEST_F(AliasTest, OutputNotAliasedByAnotherOutputShouldNotBeSegmented) { fusion->addOutput(reshape_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_FALSE(runtime->isSegmented()); } @@ -716,10 +755,12 @@ TEST_F(AliasTest, ManyAliasesBetweenOutputs) { fusion->addOutput(permute_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); ASSERT_EQ(out_tensors.size(), 4); at::Tensor slice_out_tensor = out_tensors[0]; at::Tensor reshape_out_tensor = out_tensors[1]; @@ -732,7 +773,7 @@ TEST_F(AliasTest, ManyAliasesBetweenOutputs) { // Segment 1: in -> add_out // Segment 2: add_out -> its output aliases - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); } @@ -750,12 +791,14 @@ TEST_F(AliasTest, DoNotOverSegment_Straightline) { fusion->addOutput(permute_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_FALSE(runtime->isSegmented()); // permute_out should be recognized as an alias of add_out. 
However, the @@ -781,12 +824,14 @@ TEST_F(AliasTest, DoNotOverSegment_WithForks) { fusion->addOutput(out1); fusion->addOutput(out2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::PointWise)).Times(1)); @@ -804,10 +849,11 @@ TEST_F(AliasTest, Broadcast) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -826,10 +872,11 @@ TEST_F(AliasTest, Expand) { broadcast_tv->axis(2)->extent()}); fusion->addOutput(expanded_tv); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -848,10 +895,11 @@ TEST_F(AliasTest, MergeTwoExpandedBroadcasts) { TensorView* out = reshape(in, {4, 5, 6}, {20, -1}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({1}).cuda().as_strided({4, 5, 6}, {0, 0, 0}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); // TODO(#1126): This should become an alias when #1126 is fixed. 
// EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); @@ -872,11 +920,12 @@ TEST_F(AliasTest, MergeBroadcastsBetweenConcretes) { out = reshape(out, {2, 15, 7}, {30, 7}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2 * 7}).cuda().as_strided({2, 3, 5, 7}, {7, 0, 0, 1}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, Squeeze) { @@ -888,10 +937,11 @@ TEST_F(AliasTest, Squeeze) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 1, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -906,10 +956,12 @@ TEST_F(AliasTest, SourceIsBothInputAndOutput) { fusion->addOutput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(in_tensor.data_ptr(), out_tensors[0].data_ptr()); EXPECT_EQ(in_tensor.data_ptr(), out_tensors[1].data_ptr()); @@ -929,12 +981,13 @@ TEST_F(AliasTest, SegmentBoundary) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -955,12 +1008,12 @@ TEST_F(AliasTest, ReuseBuffer) { auto tensor = at::randn({10}, options); auto expected_tensor = tensor + 1.0; - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({tensor}); EXPECT_TRUE(tensor.allclose(expected_tensor)); } -TEST_F(AliasTest, ReuseBuffer_FusionExecutor) { +TEST_F(AliasTest, ReuseBuffer_KernelExecutor) { Fusion fusion; FusionGuard fg(&fusion); TensorView* in = makeContigTensor(1); @@ -972,9 +1025,9 @@ TEST_F(AliasTest, ReuseBuffer_FusionExecutor) { auto tensor = at::randn({10}, options); auto expected_tensor = tensor + 1.0; - FusionExecutor fe; - 
fe.compileFusion(&fusion, {tensor}); - fe.runFusion({tensor}, {tensor}); + KernelExecutor ke; + ke.compileFusion(&fusion, {tensor}); + ke.runFusion({tensor}, {tensor}); EXPECT_TRUE(tensor.allclose(expected_tensor)); } @@ -1010,18 +1063,27 @@ TEST_F(AliasTest, ReuseBuffer_AliasAcrossSegments) { at::Tensor t1 = at::randn({65}, options); at::Tensor t2 = at::randn({128, 65}, options); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); // Make a copy of `t0` because `t0` will be in-place updated. at::Tensor original_t0 = t0.clone(); - std::vector outputs = fec.runFusionWithInputs({t0, t1, t2}); + std::vector outputs = + executor_cache.runFusionWithInputs({t0, t1, t2}); testValidate( - fec.fusion(), outputs, {original_t0, t1, t2}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + {original_t0, t1, t2}, + __LINE__, + __FILE__); // https://github.com/NVIDIA/Fuser/pull/2999 will cause 3 segments instead of // the optimal 2 segments. Change back to 2 segments once // https://github.com/NVIDIA/Fuser/issues/3251 is resolved. EXPECT_EQ( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups().size(), 3) + executor_cache.getMostRecentKernelRuntime() + ->fusionSegments() + ->groups() + .size(), + 3) << "segmentation didn't happen as expected"; auto t3 = original_t0.add(1.0); @@ -1055,16 +1117,17 @@ TEST_F(AliasTest, AliasOnlyKernelsAreNotLaunched) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); if (ProfilerState::Running == FusionProfiler::state()) { FusionProfiler::stop(); } ProfilerOptionsGuard::getCurOptions().unset(ProfilerOption::Enable); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); const FusionProfile& profile = FusionProfiler::profile(); // Expect a kernel launched for one of the two segments but not the @@ -1094,13 +1157,14 @@ TEST_F(AliasTest, PerfDebugVerboseWhenSomeKernelsNotLaunched) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1127,10 +1191,10 @@ TEST_F(AliasTest, NoKernelsAreLaunched) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - fec.runFusionWithInputs({in_tensor}); + executor_cache.runFusionWithInputs({in_tensor}); if 
(ProfilerState::Running == FusionProfiler::state()) { FusionProfiler::stop(); @@ -1146,8 +1210,8 @@ TEST_F(AliasTest, NoKernelsAreLaunched) { } // While most use cases go through FusionExecutorCache, nvFuser also supports -// evaluating an alias via FusionExecutor. -TEST_F(AliasTest, FusionExecutor) { +// evaluating an alias via KernelExecutor. +TEST_F(AliasTest, KernelExecutor) { Fusion fusion; FusionGuard fg(&fusion); @@ -1160,15 +1224,15 @@ TEST_F(AliasTest, FusionExecutor) { AliasAnalysisResult analysis = findAliases(&fusion); EXPECT_EQ(analysis.getRoot(out), in); - // Mark them alias so FusionExecutor::runFusion expression-evaluates the + // Mark them alias so KernelExecutor::runFusion expression-evaluates the // output on the host instead of launching a CUDA kernel. fusion.aliasOutputToInput(out, in, AllocationType::Evaluate); - FusionExecutor fe; + KernelExecutor ke; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({10, 10}, options); - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + ke.compileFusion(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.runFusion({in_tensor})[0]; EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -1182,13 +1246,13 @@ TEST_F(AliasTest, InplaceUpdate) { fusion->addInput(out); fusion->aliasOutputToInput(out, in, AllocationType::ReuseBuffer); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); at::Tensor out_tensor = in_tensor + 1; - fec.runFusionWithInputs({in_tensor, out_tensor}); + executor_cache.runFusionWithInputs({in_tensor, out_tensor}); EXPECT_TRUE(out_tensor.equal(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre(HeuristicIs(SchedulerType::PointWise))); @@ -1209,10 +1273,12 @@ TEST_F(AliasTest, Bookend_SegmentSetPreservesAllocation) { permute_out->setAllocationDomain( {permute_out->axis(0), permute_out->axis(1)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({3, 2}).cuda().transpose(0, 1); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor permute_out_tensor = out_tensors[0]; EXPECT_TRUE(permute_out_tensor.is_alias_of(in_tensor)); @@ -1230,15 +1296,17 @@ TEST_F(AliasTest, Bookend_InputsAndOutputs) { fusion->addOutput(permute_out); fusion->addOutput(compute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor permute_out_tensor = out_tensors[0]; EXPECT_TRUE(permute_out_tensor.is_alias_of(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + 
FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); // MarkAliasesPrepare adds a `segment_set` between `in` and `permute`, which // leads to three segments: // 1. segment_set`, a no-op segment, @@ -1269,12 +1337,14 @@ TEST_F(AliasTest, Bookend_IntermediateTensors) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1303,15 +1373,17 @@ TEST_F(AliasTest, Bookend_AliasesOfSameTensor) { fusion->addOutput(out1); fusion->addOutput(out2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[1].data_ptr()); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[2].data_ptr()); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::PointWise)).Times(1)); @@ -1338,14 +1410,16 @@ TEST_F(AliasTest, Bookend_ReuseSegmentSet) { fusion->addOutput(out0); fusion->addOutput(out1); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[1].data_ptr()); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1384,13 +1458,15 @@ TEST_F(AliasTest, QKVSplitBackprop) { fusion->addOutput(view_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector in_tensors; for (int i = 0; i < 3; i++) { in_tensors.push_back(at::randn({b, s, h * f}).cuda()); } - std::vector out_tensors = fec.runFusionWithInputs(in_tensors); - testValidate(fec.fusion(), out_tensors, in_tensors, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs(in_tensors); + testValidate( + executor_cache.fusion(), out_tensors, in_tensors, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[2].is_alias_of(out_tensors[1])); 
} @@ -1419,12 +1495,12 @@ TEST_F(AliasTest, Bookend_Issue2375) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(input_shape, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({t0}); - testValidate(fec.fusion(), out_tensors, {t0}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({t0}); + testValidate(executor_cache.fusion(), out_tensors, {t0}, __LINE__, __FILE__); EXPECT_THAT( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(), + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(), UnorderedElementsAre( HeuristicIs(SchedulerType::NoOp), HeuristicIs(SchedulerType::InnerPersistent))); @@ -1458,10 +1534,15 @@ TEST_F(AliasTest, Issue2664) { auto t2 = at::randn({}, options); auto aten_out = (t2 + 1.0) * t1; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({t1, t2}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({t1, t2}); testValidate( - fec.fusion(), out_tensors, {t1, t2}, {aten_out}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {t1, t2}, + {aten_out}, + __LINE__, + __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_alias_analysis.cpp b/tests/cpp/test_alias_analysis.cpp index ef72282f1b2..d79f9d1405e 100644 --- a/tests/cpp/test_alias_analysis.cpp +++ b/tests/cpp/test_alias_analysis.cpp @@ -182,11 +182,11 @@ TEST_F(AliasAnalysisTest, View_ForwardExpandedBroadcast) { EXPECT_EQ(analysis.getRoot(out), in); // Verify the last dimension isn't expanded physically. - FusionExecutor fe; + KernelExecutor ke; at::Tensor in_tensor = at::randn({4, 5}).cuda().as_strided({4, 5, 6}, {5, 1, 0}); - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + ke.compileFusion(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.runFusion({in_tensor})[0]; EXPECT_THAT(out_tensor.strides(), ElementsAre(1, 0)); } diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index 42e1c48df8b..c825263aed2 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -29,7 +29,8 @@ using ::testing::ElementsAre; // A global->shared->global copy kernel, shared memory allocated transposed to // avoid bank conflict. TEST_F(AllocationDomainTest, TransposedIntermediate) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigConcreteTensor({32, 32}); @@ -57,16 +58,17 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { at::Tensor t0 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } // A global->global copy kernel converting NCHW memory format into NHWC, with a // 4d allocation domain in output. 
TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -94,10 +96,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -107,7 +109,8 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 1d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -132,10 +135,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -145,7 +148,8 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 2d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -171,10 +175,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -184,7 +188,8 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { // Reshape and transpose a 3d tensor into an NHWC tensor with a 3d allocation // domain in fusion output. TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -217,10 +222,10 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { at::Tensor t0 = at::randn({n1, n2, h * w * c}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -237,7 +242,8 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { // output. The allocation domain is on both the producer and the consumer side // of the rFactor domain. 
TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -276,10 +282,10 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { at::Tensor t0 = at::randn({n1, n2, c * h * w}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -295,7 +301,8 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { // A global->global copy kernel where both inputs and outputs are NHWC memory // format TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -331,15 +338,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -349,7 +356,8 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view the input as a 1d tensor. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -389,15 +397,15 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -407,7 +415,8 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain of the output view the output as a 1d tensor. 
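// The "wrong format" checks below share one setup: a plain contiguous buffer
// (presumably of n * c * h * w elements), re-strided so that the logical
// [n, c, h, w] tensor is channels-last in memory:
//
//   at::Tensor t0 = t0_wrong_format.as_strided(
//       {n, c, h, w}, {h * w * c, 1, w * c, c});
//
// Passing t0_wrong_format itself is expected to throw ("Stride mismatch with
// contiguity info"), since its strides contradict the contiguity recorded in
// the fusion's allocation domain.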
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -444,15 +453,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -462,7 +471,8 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view both the input and the output as a 1d tensors. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -504,15 +514,15 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -523,7 +533,8 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { // allocation domain view the input as a 2d tensor of shape [N*H/8, 8*W*C], and // view the output as a 2d tensor of shape [N*H*W*C/4, 4] TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -571,15 +582,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -588,7 +599,8 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { // Similar to NHWC4d_To_NHWC4d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -635,15 +647,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, 
{t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -652,7 +664,8 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { // Similar to NHWC2d_To_NHWC2d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -711,15 +724,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -728,7 +741,8 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { // Similar to NHWC4d_To_NHWC4d, but does a cacheAfter TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -775,15 +789,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -794,7 +808,8 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { // allocation tensor to be between rFactor domain and loop domain, which is not // the case for NHWC2d_To_NHWC2d TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -845,15 +860,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "merging of discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -862,7 +877,8 @@ 
TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { // Similar to NHWC4d_To_NHWC4d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { - Fusion fusion; + auto fusion_ptr = std::make_unique<Fusion>(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -916,15 +932,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -933,7 +949,8 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { // Similar to NHWC2d_To_NHWC2d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { - Fusion fusion; + auto fusion_ptr = std::make_unique<Fusion>(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -1005,15 +1022,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.runFusion({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -1021,29 +1038,30 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { } TEST_F(AllocationDomainTest, VectorizationIssue902) { - auto fusion = std::make_unique<Fusion>(); - FusionGuard fg(fusion.get()); + auto fusion_ptr = std::make_unique<Fusion>(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); const std::vector<int64_t> shape({16, 16, 512, 64}); auto tv0 = makeContigTensor(4); - fusion->addInput(tv0); + fusion.addInput(tv0); auto tv1 = set(tv0); - fusion->addOutput(tv1); + fusion.addOutput(tv1); std::vector<IterDomain*> alloc_domain; alloc_domain.push_back(tv1->axis(0)); alloc_domain.push_back(tv1->axis(2)); alloc_domain.push_back(tv1->axis(3)); alloc_domain.push_back(tv1->axis(1)); tv1->setAllocationDomain(alloc_domain, true); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector<c10::IValue> aten_inputs({t0}); - FusionExecutorCache executor_cache(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); @@ -1083,8 +1101,9 @@ TEST_F(AllocationDomainTest, TransposeMatrix) { } TEST_F(AllocationDomainTest, ContiguityIssue1021) { - auto fusion = std::make_unique<Fusion>(); - FusionGuard fg(fusion.get()); + std::unique_ptr<Fusion>
fusion_ptr = std::make_unique(); + Fusion* fusion = fusion_ptr.get(); + FusionGuard fg(fusion); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1100,16 +1119,17 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fec.fusion(), outputs, {t0}, __LINE__, __FILE__); + testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForBroadcast) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion* fusion = fusion_ptr.get(); + FusionGuard fg(fusion); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1125,16 +1145,17 @@ TEST_F(AllocationDomainTest, ContiguityForBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({1, 1}, options).as_strided({1, 1}, {0, 3}); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fec.fusion(), outputs, {t0}, __LINE__, __FILE__); + testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion* fusion = fusion_ptr.get(); + FusionGuard fg(fusion); auto tv0 = TensorViewBuilder() .ndims(3) @@ -1151,11 +1172,11 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4, 8}, options).as_strided({3, 8, 4}, {0, 1, 8}); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fec.fusion(), outputs, {t0}, __LINE__, __FILE__); + testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); } // Test that allocation domain can be used to vectorize overlapping tensors, @@ -1168,7 +1189,8 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { // automatically supports all kinds of use cases, even those that we don't have // an active plan to support on). 
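// "Overlapping" means the strides alias storage: the test below uses sizes
// {4, 5, 7} with strides {7, 4, 1}, so element (i, j, k) sits at offset
// 7*i + 4*j + k and, e.g., (0, 1, 0) and (0, 0, 4) both map to offset 4.
// The test checks that an explicitly specified allocation domain lets the
// copy be vectorized anyway.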
TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { - Fusion fusion; + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); auto tv0 = makeContigTensor(3); @@ -1203,9 +1225,9 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { at::Tensor t0 = at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -1228,14 +1250,14 @@ TEST_F(AllocationDomainTest, Issue1290_ContiguityWasMissing) { at::Tensor in_tensor = at::randn({2 * 4}).cuda().as_strided({2, 3}, {4, 1}); - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({in_tensor}); // The initial issue was detected in the pointwise scheduler, so I added these // checks to make sure it's a valid regression test. The transpose scheduler // could accept this but decided not to because of a small problem size. const std::vector& groups = - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); ASSERT_EQ(groups.size(), 1); SegmentedGroup* group = groups[0]; EXPECT_EQ(group->schedulerType(), SchedulerType::PointWise); @@ -1253,9 +1275,9 @@ TEST_F(AllocationDomainTest, Issue1290_ReplayCasPFailedDueToDifferentRanks) { out->cacheBefore(); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - FusionExecutor fe; - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + KernelExecutor ke; + ke.compileFusion(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.runFusion({in_tensor})[0]; EXPECT_THAT(out_tensor.sizes(), ElementsAre(2)); } @@ -1289,8 +1311,8 @@ TEST_F(AllocationDomainTest, Issue1524) { {permute_out->axis(1), permute_out->axis(0)}, true); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({in_tensor}); } TEST_F(AllocationDomainTest, EmptyAllocationDomainApi) { diff --git a/tests/cpp/test_allocation_order_inference.cpp b/tests/cpp/test_allocation_order_inference.cpp index b936d2252d0..c24d679bfbb 100644 --- a/tests/cpp/test_allocation_order_inference.cpp +++ b/tests/cpp/test_allocation_order_inference.cpp @@ -315,9 +315,9 @@ TEST_F(AllocationOrderInferenceTest, EnableInRuntime) { at::Tensor in_tensor = at::randn({2, 4, 8, 8}, options); at::Tensor in_nhwc = in_tensor.as_strided({2, 4, 8, 8}, {4 * 8 * 8, 1, 4 * 8, 4}); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); - auto cg_outputs = fec.runFusionWithInputs({in_nhwc}); + auto cg_outputs = executor_cache.runFusionWithInputs({in_nhwc}); auto ref_out = in_nhwc.relu(); EXPECT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); diff --git a/tests/cpp/test_circular_buffering.cpp b/tests/cpp/test_circular_buffering.cpp index d607579196f..78b79a31e83 100644 --- a/tests/cpp/test_circular_buffering.cpp +++ b/tests/cpp/test_circular_buffering.cpp @@ -64,17 +64,17 @@ TEST_P(CircularBufferingTest, SingleDim1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = 
at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Given computeAt axis 1, the axis_extent is I0/128. constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -112,17 +112,17 @@ TEST_P(CircularBufferingTest, SingleDim2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Given computeAt axis 1, the axis_extent is I0/128. constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -167,17 +167,17 @@ TEST_P(CircularBufferingTest, SingleDim3) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Given computeAt axis 2, the axis_extent is 128/32. constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -219,18 +219,18 @@ TEST_P(CircularBufferingTest, SingleDimUnswitch1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Given computeAt axis -1 and axis 3 is parallelized with TIDx, the axis // extent is 4. constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -271,18 +271,18 @@ TEST_P(CircularBufferingTest, SingleDimUnswitch2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Given computeAt axis -1 and axis 3 is parallelized with TIDx, the axis // extent is 4. 
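// Circular buffering needs at least `number_of_stages` iterations in the
// buffered loop; when the extent is smaller than the stage count, runFusion
// is expected to throw, which is what the guard below exercises.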
constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -325,18 +325,18 @@ TEST_P(CircularBufferingTest, SingleDimUnroll) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({199}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Given computeAt axis -1 and axis 4 is parallelized with TIDx, the axis // extent is 2. constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -372,18 +372,18 @@ TEST_P(CircularBufferingTest, SingleDimVectorize) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Given computeAt axis 2 and axis 1 is parallelized with TIDx, the axis // extent is I0/128. constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -424,17 +424,17 @@ TEST_P(CircularBufferingTest, MultipleTensors) { auto t0 = at::randn({500}, options); auto t1 = at::randn({500}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); // Given computeAt axis 1, the axis extent is I0/32/4. constexpr int64_t axis_extent = 1; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto ref = t0 + t1; testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); } @@ -475,19 +475,19 @@ TEST_P(CircularBufferingTest, NestedTensors) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1001}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Given computeAt axis 1 for tv2, the axis extent is I0/32/4 = 8. // Given computeAt axis 3 for tv3 and axis 3 is parallelized with TIDx, // the axis extent is 4. 
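// Two tensors are circular buffered here with extents 8 and 4; the guard
// below presumably keys off the smaller extent, since that loop is the first
// to fall short of the requested stage count.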
constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -569,16 +569,16 @@ TEST_P(CircularBufferingTest, SmemBlockGemmCache) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); // The smem cache write in this test case is redundant predicated, @@ -586,7 +586,7 @@ TEST_P(CircularBufferingTest, SmemBlockGemmCache) { // insertion to ensure ordering of circular buffered tensor access. // The check below makes sure that the sync is inserted so that the // test isn't running on a race condition. - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count > 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count > 0); } // Vectorized reset test for circular buffered registers @@ -623,16 +623,16 @@ TEST_P(CircularBufferingTest, Vector) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = (t0 + 1).sum({0}); testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -678,14 +678,14 @@ TEST_P(CircularBufferingTest, CpAsync1) { at::Tensor t0 = at::randn({m, n}, options); at::Tensor t1 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); + ASSERT_ANY_THROW(ke.compileFusion(&fusion, {t0, t1})); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + ke.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto ref = t0 + t1; @@ -731,14 +731,14 @@ TEST_P(CircularBufferingTest, CpAsync2) { at::Tensor t0 = at::randn({m, n}, options); at::Tensor t1 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); + ASSERT_ANY_THROW(ke.compileFusion(&fusion, {t0, t1})); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + ke.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto ref = t0 + t1; @@ -794,9 +794,9 @@ TEST_P(CircularBufferingTest, NoSync) { }); NVF_ERROR(!sync_inserted, "Un-expected block sync inserted"); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + auto 
cg_outputs = ke.runFusion({t0, t1}); auto ref = t0 + t1; @@ -971,9 +971,9 @@ TEST_F(NVFuserTest, ElectSyncCompatibility) { // (threadIdx.x < 4) predicate. This thread predicate is incompatible with // circular buffering because we generate an ElectSync predicate that uses // a single thread. - FusionExecutor fe; + KernelExecutor ke; try { - fe.compileFusion(fusion.get(), {t0}); + ke.compileFusion(fusion.get(), {t0}); } catch (const std::exception& e) { const char* reference = R"(This thread-parallelized TensorView T2_s_float[ iblockIdx.x15{( ceilDiv(( ceilDiv(( ceilDiv(( ( ( (( (( getMetaData(T0) )).logical_size ))[0] ) * ( (( (( getMetaData(T0) )).logical_size ))[1] ) ) * ( (( (( getMetaData(T0) )).logical_size ))[2] ) ), 256) ), 4) ), 2) )}, iS16{2}, ithreadIdx.x14{4}, iB12{256} ] ca_pos( 2 ) is incorrectly contained within a If-Then-Else with the ElectSync predicate.)"; @@ -1023,10 +1023,10 @@ TEST_P(TmaCircularBufferingTest, SingleDim) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.runFusion({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1076,17 +1076,17 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnroll) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0}); int64_t axis_extent = ceilDiv(ceilDiv(tensor_inner_dim, bulk_inner_dim), unroll_dim); if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.runFusion({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1136,17 +1136,17 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0}); int64_t axis_extent = ceilDiv(ceilDiv(tensor_inner_dim, bulk_inner_dim), unroll_dim); if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); return; } - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.runFusion({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1206,10 +1206,10 @@ TEST_P(TmaCircularBufferingTest, MultiDim) { at::Tensor t0 = at::ones({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.runFusion({t0}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1268,10 +1268,10 @@ TEST_P(TmaCircularBufferingTest, Pointwise) { at::Tensor t1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t2 = t0 + t1; - FusionExecutor fe; - 
fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.runFusion({t0, t1}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t2); testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); } @@ -1335,10 +1335,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) { at::Tensor t1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t2 = t0 + t1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.runFusion({t0, t1}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t2); testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); } @@ -1393,10 +1393,10 @@ TEST_P(TmaCircularBufferingTest, Reduction) { at::Tensor t0 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t1 = sum(t0, {-1}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.runFusion({t0}); compare(tensor_outer_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1518,10 +1518,10 @@ TEST_P(TmaCircularBufferingTest, Persistent) { at::Tensor at_tv0 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor at_tv1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); - // Compile with FusionExecutor directly to avoid scheduling - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_tv0}); - std::vector cg_outputs = fe.runFusion({at_tv0}); + // Compile with KernelExecutor directly to avoid scheduling + KernelExecutor ke; + ke.compileFusion(fusion.get(), {at_tv0}); + std::vector cg_outputs = ke.runFusion({at_tv0}); std::tuple at_var_mean = at::var_mean(at_tv0, {-1}, correction, keepdim); @@ -1640,10 +1640,10 @@ TEST_P(TmaCircularBufferingTest, Matmul) { at::Tensor aten_output = (t0.unsqueeze(/*dim=*/-1) * t1.unsqueeze(/*dim=*/0)).sum(/*dim=*/1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.runFusion({t0, t1}); compare( tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), aten_output); testValidate( @@ -1754,10 +1754,10 @@ TEST_P(TmaCircularBufferingTest, MatmulWithBroadcastedInput) { at::Tensor t1 = at::randn({1, K, tensor_inner_dim}, options); at::Tensor aten_output = (t0 * t1).sum(/*dim=*/1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.runFusion({t0, t1}); compare( tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), aten_output); testValidate( diff --git a/tests/cpp/test_combined_inner_outer_reduction.cpp b/tests/cpp/test_combined_inner_outer_reduction.cpp index 95eaadd4ad7..cdb2f39c3db 100644 --- a/tests/cpp/test_combined_inner_outer_reduction.cpp +++ b/tests/cpp/test_combined_inner_outer_reduction.cpp @@ -104,10 +104,10 @@ TEST_P(CombinedSchedulerTest, LayerNormBackward) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache 
fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto aten_gradients = at::native_layer_norm_backward( aten_grad_out, @@ -120,7 +120,7 @@ TEST_P(CombinedSchedulerTest, LayerNormBackward) { {true, true, true}); testValidate( - fec.fusion(), + executor_cache.fusion(), {cg_outputs[0], cg_outputs[1], cg_outputs[2]}, aten_inputs, {std::get<0>(aten_gradients), @@ -261,7 +261,7 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, @@ -269,7 +269,7 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto aten_gradients = at::native_layer_norm_backward( aten_grad_out.to(at::kDouble), @@ -287,7 +287,8 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { if (!link_inner_outer) { aten_out_linked = aten_out_linked.mul(0.5); } - bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + bool is_segmented = + executor_cache.getMostRecentKernelRuntime()->isSegmented(); NVF_CHECK(is_segmented, "Fusion is not segmented"); testValidate( @@ -443,7 +444,7 @@ TEST_F(CombinedSchedulerTest, SharedProducer) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, @@ -451,9 +452,9 @@ TEST_F(CombinedSchedulerTest, SharedProducer) { aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); switch (case_id) { case 0: case 1: @@ -634,9 +635,9 @@ TEST_F(CombinedSchedulerTest, CombinedReduction) { at::Tensor qv_cg_output = at::empty({dim1}, options); auto qv_aten_output = tv_input.to(at::kFloat).sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); - fe.runFusion( + KernelExecutor ke; + ke.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); + ke.runFusion( {tv_input}, {tv_cg_output, qv_cg_output}, launch_constraints, @@ -811,9 +812,9 @@ TEST_F(CombinedSchedulerTest, CombinedReductionMultiPerBlock) { at::Tensor qv_cg_output = at::empty({dim1}, options); at::Tensor tv_input2 = at::ones({dim0, dim1}, options); auto qv_aten_output = tv_input2.to(at::kFloat).sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); - fe.runFusion( + KernelExecutor ke; + ke.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); + ke.runFusion( {tv_input}, {tv_cg_output, qv_cg_output}, launch_constraints, @@ -850,10 +851,11 @@ TEST_F(CombinedSchedulerTest, InnerOuterMismatch) { at::Tensor t0 = at::randn({x, y, z}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache 
fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + bool is_segmented = + executor_cache.getMostRecentKernelRuntime()->isSegmented(); if (outer_reduction_axis.size() == 2) { NVF_ERROR(!is_segmented, "Fusion should NOT be segmented!"); } else { @@ -980,8 +982,8 @@ TEST_F(CombinedSchedulerTest, SharedMemoryPersistentVectFactor) { heuristic_params->as()->smem_persistent_buffers = std::vector{tv1}; scheduler->schedule(&fusion, heuristic_params.get()); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); for (auto tv : fusion.allTvs()) { if (tv->getMemoryType() == MemoryType::Shared) { @@ -990,7 +992,7 @@ TEST_F(CombinedSchedulerTest, SharedMemoryPersistentVectFactor) { } } } - auto cg_outputs = fe.runFusion( + auto cg_outputs = ke.runFusion( aten_inputs, heuristic_params->as()->lparams); testValidate(&fusion_copy, cg_outputs, aten_inputs, __LINE__, __FILE__); } diff --git a/tests/cpp/test_dynamic_transform.cpp b/tests/cpp/test_dynamic_transform.cpp index ca29bf0825b..c7178597652 100644 --- a/tests/cpp/test_dynamic_transform.cpp +++ b/tests/cpp/test_dynamic_transform.cpp @@ -209,10 +209,10 @@ TEST_F(NVFuserTest, DynamicTransform3_CUDA) { at::Tensor t1 = at::randn(shape_after, options); std::vector inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__); } // Test multiple patterns of reshape @@ -777,13 +777,13 @@ void reductionDynamicViewAddFusion( : add(x_reshape, bias); fusion.addOutput(y); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); - size_t num_concretizations = fusion_executor_cache.countConcretizations(); + size_t num_concretizations = executor_cache.countConcretizations(); // Check that concretizations and runtimes are cache misses only when they // should be auto checkCache = [&](bool expect_miss) { - auto current = fusion_executor_cache.countConcretizations(); + auto current = executor_cache.countConcretizations(); ASSERT_EQ(current, num_concretizations + (size_t)expect_miss); num_concretizations = current; }; @@ -830,7 +830,7 @@ void reductionDynamicViewAddFusion( aten_inputs.emplace_back(output_shape[i]); } - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); checkCache(expect_miss); auto at_tv1 = (reshape_before_reduction) ? (at_x + at_bias) @@ -902,22 +902,22 @@ void reductionDynamicPadAddFusion( auto y = sum(x_pad, {kReductionAxis}); fusion.addOutput(y); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); // Check that concretizations and runtimes are cache misses only when they // should be - size_t num_concretizations = fusion_executor_cache.getKernelRuntimes().size(); -#define CHECK_CACHE(expect_miss, ...) 
\ - auto current = fusion_executor_cache.getKernelRuntimes().size(); \ - auto expected = num_concretizations + (size_t)expect_miss; \ - NVF_CHECK( \ - current == expected, \ - "Expected cache size ", \ - expected, \ - " but found ", \ - current, \ - ". ", \ - __VA_ARGS__); \ + size_t num_concretizations = executor_cache.getKernelRuntimes().size(); +#define CHECK_CACHE(expect_miss, ...) \ + auto current = executor_cache.getKernelRuntimes().size(); \ + auto expected = num_concretizations + (size_t)expect_miss; \ + NVF_CHECK( \ + current == expected, \ + "Expected cache size ", \ + expected, \ + " but found ", \ + current, \ + ". ", \ + __VA_ARGS__); \ num_concretizations = current; for (auto& inv : invocations) { @@ -943,7 +943,7 @@ void reductionDynamicPadAddFusion( aten_inputs.emplace_back(pad_widths[i]); } - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); CHECK_CACHE( expect_miss, "Input shape=", input_shape, " pad_widths=", pad_widths); @@ -1011,11 +1011,11 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { // concretized to Iteration, it does not wind up overwriting the Broadcast // logical. - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); std::vector aten_inputs = {at0}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1037,13 +1037,13 @@ TEST_F(NVFuserTest, FusionDynamicEmptyCat1_CUDA) { fusion.addOutput(tv3); // Check correctness - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); at::Tensor at1 = at::randn({0}, options); at::Tensor at2 = at::randn({3}, options); std::vector aten_inputs = {at0, at1, at2}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1063,16 +1063,16 @@ TEST_F(NVFuserTest, FusionDynamicEmptyCat2_CUDA) { fusion.addOutput(tv2); // Check correctness - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); at::Tensor at1 = at::randn({0}, options); std::vector aten_inputs = {at0, at1}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); // Check that fusion consists only of tv2 = set(tv0) - auto fkr = fusion_executor_cache.getMostRecentKernelRuntime(); + auto fkr = executor_cache.getMostRecentKernelRuntime(); auto seg_fusion = fkr->fusionSegments(); auto output_def = seg_fusion->outputs()[0]->definition(); EXPECT_TRUE(output_def->isA()); @@ -1098,15 +1098,15 @@ TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) { fusion->addOutput(vm.mean); fusion->addOutput(vm.var); - FusionExecutorCache fusion_executor_cache(std::move(fusion)); + 
FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({256, 128, 28, 28}, options); std::vector aten_inputs = {at0, 32}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate( - fusion_executor_cache.fusion(), outputs, aten_inputs, __LINE__, __FILE__); + executor_cache.fusion(), outputs, aten_inputs, __LINE__, __FILE__); } TEST_F(NVFuserTest, Issue249_CUDA) { @@ -1126,15 +1126,14 @@ TEST_F(NVFuserTest, Issue249_CUDA) { auto tv3 = add(tv2, tv2); fusion.addOutput(tv3); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({2, 3, 4, 5}, options); - auto outputs = fusion_executor_cache.runFusionWithInputs({at_x}); + auto outputs = executor_cache.runFusionWithInputs({at_x}); - testValidate( - fusion_executor_cache.fusion(), outputs, {at_x}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {at_x}, __LINE__, __FILE__); } // This is just like the test above, but uses an input scalar with value -1 @@ -1158,7 +1157,7 @@ TEST_F(NVFuserTest, Issue249InputNegative1_CUDA) { auto tv3 = add(tv2, tv2); fusion.addOutput(tv3); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({2, 3, 4, 5}, options); @@ -1166,18 +1165,13 @@ TEST_F(NVFuserTest, Issue249InputNegative1_CUDA) { // Dynamic reshape sizes that are not constant at definition must be explicit: // no -1 allowed EXPECT_THROW( - fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, -1}), - std::exception); + executor_cache.runFusionWithInputs({at_x, 2, 4, -1}), std::exception); // Passing explicit sizes works fine - auto outputs = fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, 15}); + auto outputs = executor_cache.runFusionWithInputs({at_x, 2, 4, 15}); testValidate( - fusion_executor_cache.fusion(), - outputs, - {at_x, 2, 4, 15}, - __LINE__, - __FILE__); + executor_cache.fusion(), outputs, {at_x, 2, 4, 15}, __LINE__, __FILE__); } // Test that OptOutMutator mutates expressions in a predictable way @@ -1215,10 +1209,10 @@ TEST_F(NVFuserTest, OptOutMutatorMutatedOutput) { inlineMost(); - FusionExecutor fe; - fe.compileFusion(fusion); + KernelExecutor ke; + ke.compileFusion(fusion); - auto outputs = fe.runFusion({t0}); + auto outputs = ke.runFusion({t0}); testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -1252,10 +1246,10 @@ TEST_F(NVFuserTest, OptOutMutatorRedefinedConstant) { inlineMost(); - FusionExecutor fe; - fe.compileFusion(fusion); + KernelExecutor ke; + ke.compileFusion(fusion); - auto outputs = fe.runFusion({3L}); + auto outputs = ke.runFusion({3L}); testValidate(fusion, outputs, {3L}, __LINE__, __FILE__); } @@ -1281,7 +1275,7 @@ TEST_F(NVFuserTest, SymbolicSqueeze) { tv1, std::vector({false, true})); // Squeeze second dimension fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3, 2}, options); @@ -1289,14 +1283,14 @@ TEST_F(NVFuserTest, SymbolicSqueeze) { // An invalid 
input has a second dimension that cannot be squeezed std::vector invalid_inputs = {t0, 2, 3}; - auto outputs = fec.runFusionWithInputs(valid_inputs); + auto outputs = executor_cache.runFusionWithInputs(valid_inputs); testValidate(fusion, outputs, valid_inputs, __LINE__, __FILE__); // An informative error message should be given by // SqueezeOp::checkConcretization EXPECT_THAT( - [&]() { fec.runFusionWithInputs(invalid_inputs); }, + [&]() { executor_cache.runFusionWithInputs(invalid_inputs); }, ::testing::ThrowsMessage(::testing::HasSubstr( " must concretize to IterType::Broadcast but found"))); } @@ -1325,7 +1319,7 @@ TEST_F(NVFuserTest, SymbolicExpand) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3, 2}, options); @@ -1333,13 +1327,14 @@ TEST_F(NVFuserTest, SymbolicExpand) { // An invalid input has a second dimension that cannot be expanded std::vector invalid_inputs = {t0, 2, 3, 2, 5}; - auto outputs = fec.runFusionWithInputs(valid_inputs); + auto outputs = executor_cache.runFusionWithInputs(valid_inputs); - testValidate(fec.fusion(), outputs, valid_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), outputs, valid_inputs, __LINE__, __FILE__); // An informative error message should be given during concretization EXPECT_THAT( - [&]() { fec.runFusionWithInputs(invalid_inputs); }, + [&]() { executor_cache.runFusionWithInputs(invalid_inputs); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Mismatch in sizes when concretizing expand."))); } @@ -1380,13 +1375,13 @@ TEST_F(NVFuserTest, ConcretizeConstantExtents) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4096, 12288}, options); std::vector inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } @@ -1417,13 +1412,13 @@ TEST_F(NVFuserTest, DynamicSqueezeTrivialReduction) { auto tv2 = sum(tv1, {0, 2, 3, 4}); fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 9}, options); std::vector inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } @@ -1455,13 +1450,13 @@ TEST_F(NVFuserTest, DynamicSqueezeTrivialWelford) { fusion->addOutput(res.mean); fusion->addOutput(res.var); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 9}, options); std::vector inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } diff --git a/tests/cpp/test_external_src.cpp b/tests/cpp/test_external_src.cpp index f0623ade609..21d487f17b4 100644 --- a/tests/cpp/test_external_src.cpp +++ b/tests/cpp/test_external_src.cpp @@ -28,7 +28,7 
@@ class ExternalSrcExample : public NVFuserTest {}; TEST_F(ExternalSrcExample, Reduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - FusionExecutor fe; + KernelExecutor ke; // By default, this env var should not be defined. To test using an // external source file, set it to the path to the external source @@ -44,7 +44,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { buffer << cuda_src.rdbuf(); std::string cuda_src_str = buffer.str(); - fe.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); + ke.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); // The following is a sample launch pattern of the compiled // kernel. It must be adapted for each particular source file. @@ -82,7 +82,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { clearL2Cache(); std::cout << "Launching the kernel" << std::endl; float elapsed_time_ms = - fe.runRtc(lp, {t0, t7, t14, t15, t16, t17}, PrimDataType::Int32); + ke.runRtc(lp, {t0, t7, t14, t15, t16, t17}, PrimDataType::Int32); std::cout << "kernel run in " << elapsed_time_ms << " ms, achieved " << (read_write_bytes / elapsed_time_ms / 1000.0 / 1000.0) << " GB/s" << std::endl; @@ -99,7 +99,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { TEST_F(ExternalSrcExample, Matmul_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - FusionExecutor fe; + KernelExecutor ke; // By default, this env var should not be defined. To test using an // external source file, set it to the path to the external source @@ -115,7 +115,7 @@ TEST_F(ExternalSrcExample, Matmul_CUDA) { buffer << cuda_src.rdbuf(); std::string cuda_src_str = buffer.str(); - fe.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); + ke.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); int M = 2048, N = 3456, K = 2048; MmaLayout layout = MmaLayout::TN; @@ -129,7 +129,7 @@ TEST_F(ExternalSrcExample, Matmul_CUDA) { auto output = at::zeros_like(at_output); clearL2Cache(); std::cout << "Launching the kernel" << std::endl; - float elapsed_time_ms = fe.runRtc( + float elapsed_time_ms = ke.runRtc( lp, {inputs.first, inputs.second, output}, PrimDataType::Int32); std::cout << "kernel run in " << elapsed_time_ms << " ms." 
<< std::endl; diff --git a/tests/cpp/test_gpu1.cpp b/tests/cpp/test_gpu1.cpp index f5ee88d0936..2d626a271b4 100644 --- a/tests/cpp/test_gpu1.cpp +++ b/tests/cpp/test_gpu1.cpp @@ -207,9 +207,9 @@ TEST_F(NVFuserTest, FusionClear_CUDA) { at::Tensor input1 = at::randn({16, 8, 8}, options); at::Tensor input2 = at::randn_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + auto outputs = ke.runFusion({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -813,9 +813,9 @@ TEST_F(NVFuserTest, FusionOuterSplit_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({}); + KernelExecutor ke; + ke.compileFusion(&fusion); + auto outputs = ke.runFusion({}); const auto& output = outputs.at(0); at::Tensor output_ref = at::ones_like(output, options); @@ -855,9 +855,9 @@ TEST_F(NVFuserTest, FusionCodeGen_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({}); + KernelExecutor ke; + ke.compileFusion(&fusion); + auto outputs = ke.runFusion({}); const auto& output = outputs.at(0); at::Tensor output_ref = at::ones_like(output, options); @@ -899,9 +899,9 @@ TEST_F(NVFuserTest, FusionCodeGen2_CUDA) { at::Tensor input1 = at::randn({16, 8, 8}, options); at::Tensor input2 = at::randn_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + auto outputs = ke.runFusion({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -955,9 +955,9 @@ TEST_F(NVFuserTest, FusionSimplePWise_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + ke.runFusion({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -1013,9 +1013,9 @@ TEST_F(NVFuserTest, FusionSimplePWiseDtypeComplex_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + ke.runFusion({input1, input2}, {output}); at::Tensor tv2_ref = input2 + static_cast>(scalar1); at::Tensor output_ref = input1 + tv2_ref; @@ -1063,9 +1063,9 @@ TEST_F(NVFuserTest, FusionExecKernel_CUDA) { at::Tensor input1 = at::ones({1, 128}, options); at::Tensor input2 = at::ones_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + auto outputs = ke.runFusion({input1, input2}); at::Tensor check = at::full({1, 128}, 4, options); ; @@ -1145,9 +1145,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - 
fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1199,9 +1199,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -1253,9 +1253,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { at::Tensor cg_output = at::empty_like(t0, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + ke.runFusion(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -1317,9 +1317,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1353,9 +1353,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1388,9 +1388,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1449,9 +1449,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { std::vector aten_inputs = {t0, t2, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1505,9 +1505,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { std::vector aten_inputs = {t0, t2, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1574,9 +1574,9 @@ TEST_F(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, 
{aten_input}, __LINE__, __FILE__); } @@ -1644,9 +1644,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1719,9 +1719,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -1800,9 +1800,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1864,9 +1864,9 @@ TEST_F(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1972,9 +1972,9 @@ TEST_F(NVFuserTest, FusionScalarInputs_CUDA) { at::Scalar(fl2), at::Scalar(fl3)}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + ke.runFusion(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -2024,9 +2024,9 @@ TEST_F(NVFuserTest, FusionLoopUnroll_CUDA) { at::Tensor input0 = at::randn({129, 13, 3}, options); at::Tensor input1 = at::randn({129, 13, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}); - auto outputs = fe.runFusion({input0, input1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input0, input1}); + auto outputs = ke.runFusion({input0, input1}); NVF_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); } @@ -2173,9 +2173,9 @@ void test_op( std::vector output_vect = {cg_output}; cudaDeviceSynchronize(); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs_ivalues); - fe.runFusion(aten_inputs_ivalues, output_vect); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs_ivalues); + ke.runFusion(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); at::Tensor aten_output = af(aten_inputs); @@ -2710,17 +2710,17 @@ TEST_F(NVFuserTest, FusionFp8CastOps_CUDA) { // const at::ArrayRef input_ivalues(inputs); std::vector inputs = {input1}; - FusionExecutor fe; + KernelExecutor ke; if (!deviceMajorMinorCheck(9)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, inputs); }, + [&]() { ke.compileFusion(&fusion, inputs); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: Fusion contains Float8_xxx values which was introduced in Hopper (9.0)"))); GTEST_SKIP() << "skipping 
tests on pre-HOPPER GPUs"; } else { - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); at::Tensor ref_output = input1.to(at_fp8_type).to(at_src_type); @@ -2790,9 +2790,9 @@ TEST_F(NVFuserTest, FusionReduction1_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -2862,9 +2862,9 @@ TEST_F(NVFuserTest, FusionReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -2913,9 +2913,9 @@ TEST_F(NVFuserTest, FusionReduction3_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, {cg_output}); auto aten_output = aten_input.to(at::kDouble).sum({1}); @@ -2979,9 +2979,9 @@ TEST_F(NVFuserTest, FusionReduction4_CUDA) { at::Tensor t1 = at::randn({numel_x, numel_y}, options); at::Tensor t4 = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t4}); - auto cg_outputs = fe.runFusion({t0, t1, t4}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1, t4}); + auto cg_outputs = ke.runFusion({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); @@ -3033,9 +3033,9 @@ TEST_F(NVFuserTest, FusionReduction5_CUDA) { at::Tensor cg_output = at::empty({bidy, tidx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -3098,9 +3098,9 @@ TEST_F(NVFuserTest, FusionReduction6_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1, 2}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -3130,9 +3130,9 @@ TEST_F(NVFuserTest, FusionMultiGridReduction_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -3154,9 +3154,9 @@ 
TEST_F(NVFuserTest, FusionMultiGridReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({4, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_output = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_output = ke.runFusion({input}); testValidate(&fusion, cg_output, {input}, __LINE__, __FILE__); } @@ -3207,9 +3207,9 @@ TEST_F(NVFuserTest, FusionReductionTFT_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -3271,9 +3271,9 @@ TEST_F(NVFuserTest, FusionReductionOuterSplit_CUDA) { at::Tensor t1 = at::randn({numel_x, numel_y}, options); at::Tensor t4 = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t4}); - auto cg_outputs = fe.runFusion({t0, t1, t4}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1, t4}); + auto cg_outputs = ke.runFusion({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); @@ -3310,7 +3310,7 @@ TEST_F(NVFuserTest, FusionBranches_CUDA) { at::Tensor t1 = at::randn({x, y}, options); at::Tensor t2 = at::randn({x, y}, options); - FusionExecutor fe; + KernelExecutor ke; tv6->merge(0); tv6->split(0, 128); tv6->split(0, 4); @@ -3331,8 +3331,8 @@ TEST_F(NVFuserTest, FusionBranches_CUDA) { std::vector aten_inputs = {t0, t1, t2}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3377,9 +3377,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast1_CUDA) { std::vector aten_inputs = {t0, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3429,9 +3429,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast2_CUDA) { std::vector aten_inputs = {t0, t1, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + ke.runFusion(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3471,9 +3471,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast3_CUDA) { std::vector aten_inputs = {t0, t2}; at::Tensor cg_output = at::empty({x, y, z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + ke.runFusion(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3516,9 +3516,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast4_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + ke.runFusion(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, 
__FILE__); } @@ -3556,9 +3556,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + ke.runFusion(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3608,9 +3608,9 @@ TEST_F(NVFuserTest, FusionComplexBCast1_CUDA) { std::vector aten_inputs = {t0, t3, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3652,9 +3652,9 @@ TEST_F(NVFuserTest, FusionComplexBCast2_CUDA) { at::Tensor t0 = at::randn({y, z}, options); at::Tensor t4 = at::randn({x, y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t4}); - auto cg_outputs = fe.runFusion({t0, t4}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t4}); + auto cg_outputs = ke.runFusion({t0, t4}); testValidate(&fusion, {cg_outputs}, {t0, t4}, __LINE__, __FILE__); } @@ -3726,18 +3726,18 @@ TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) { at::Tensor t0 = at::randn({M, K}, options); at::Tensor t1 = at::randn({K, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Let's specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + ke.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); + // ASSERT_ANY_THROW(ke.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble)); @@ -3791,9 +3791,9 @@ TEST_F(NVFuserTest, FusionSoftmax1D_CUDA) { at::Tensor cg_output = at::empty({dimx}, options); at::Tensor t3_output = at::empty_like(cg_output, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - fe.runFusion({t0}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + ke.runFusion({t0}, {cg_output}); auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); @@ -3860,9 +3860,9 @@ TEST_F(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { at::Tensor input = at::randn({dimx}, options); at::Tensor t3_output = at::empty({dimx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -3920,9 +3920,9 @@ TEST_F(NVFuserTest, FusionSoftmax3D_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -3995,9 +3995,9 @@ TEST_F(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { at::Tensor input = at::randn({dimx, dimy, dimz}, options); at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -4081,9 +4081,9 @@ TEST_F(NVFuserTest, FusionGridReduction1_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4141,9 +4141,9 @@ TEST_F(NVFuserTest, FusionGridReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4203,9 +4203,9 @@ TEST_F(NVFuserTest, FusionGridReduction3dim1_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -4262,9 +4262,9 @@ TEST_F(NVFuserTest, FusionGridReduction3dim0_CUDA) { auto 
options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -4328,9 +4328,9 @@ TEST_F(NVFuserTest, FusionGridReduction4_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -4385,9 +4385,9 @@ TEST_F(NVFuserTest, FusionGridReduction5_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -4450,9 +4450,9 @@ TEST_F(NVFuserTest, FusionGridReduction6_CUDA) { at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1, 2}); @@ -4482,9 +4482,9 @@ TEST_F(NVFuserTest, FusionGridReduction7_CUDA) { at::Tensor input = at::randn({numel_x}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto out = ke.runFusion({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -4508,9 +4508,9 @@ TEST_F(NVFuserTest, FusionGridReduction8_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto out = ke.runFusion({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -4545,9 +4545,9 @@ TEST_F(NVFuserTest, FusionGridReduction9_CUDA) { std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_output = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_output = ke.runFusion(aten_inputs); testValidate(&fusion, cg_output, {t0, t2}, __LINE__, __FILE__); } @@ -4586,9 +4586,9 @@ TEST_F(NVFuserTest, FusionGridReduction10_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_output = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_output = ke.runFusion({t0}); testValidate(&fusion, cg_output, {t0}, __LINE__, __FILE__); } @@ 
-4616,9 +4616,9 @@ TEST_F(NVFuserTest, FusionNonRedAxisBind_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({16, bid_x * tid_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -4666,9 +4666,9 @@ TEST_F(NVFuserTest, FusionSplitBCast_CUDA) { at::Tensor t1 = at::randn({32, 32, 128}, options); at::Tensor cg_output = at::empty({32, 32, 128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - fe.runFusion({t0, t1}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + ke.runFusion({t0, t1}, {cg_output}); } TEST_F(NVFuserTest, FusionBCastInnerDim_CUDA) { @@ -4747,9 +4747,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4778,9 +4778,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -4808,9 +4808,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({dimx, dimy}, options); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4831,9 +4831,9 @@ TEST_F(NVFuserTest, FusionZeroDimComputeAt_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4866,9 +4866,9 @@ TEST_F(NVFuserTest, FusionZeroDimBroadcast_CUDA) { std::vector aten_inputs = {t0, t1}; at::Tensor cg_output = at::empty({}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + ke.runFusion(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -4901,9 +4901,9 @@ TEST_F(NVFuserTest, FusionZeroDimReduction_CUDA) { at::Tensor cg_output = at::empty({}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, 
{aten_input}); + ke.runFusion({aten_input}, {cg_output}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); @@ -4953,9 +4953,9 @@ TEST_F(NVFuserTest, FusionBCastAfterReduce_CUDA) { auto aten_output = t3.add(t4); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t4}); - auto cg_outputs = fe.runFusion({t0, t4}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t4}); + auto cg_outputs = ke.runFusion({t0, t4}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -4977,9 +4977,9 @@ TEST_F(NVFuserTest, FusionOutputBroadcast_CUDA) { at::Tensor aten_input = at::randn({2, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5000,9 +5000,9 @@ TEST_F(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5076,9 +5076,9 @@ TEST_F(NVFuserTest, FusionSumTo_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); NVF_CHECK( cg_outputs[0].dim() == static_cast(sum_to_shape.size()), @@ -5118,9 +5118,9 @@ TEST_F(NVFuserTest, FusionSumToNoop_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); NVF_CHECK( cg_outputs[0].dim() == static_cast(sum_to_shape.size()), @@ -5265,9 +5265,9 @@ TEST_F(NVFuserTest, FusionSymbolicReduction_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}, lparams); - auto cg_outputs = fe.runFusion({aten_input}, lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}, lparams); + auto cg_outputs = ke.runFusion({aten_input}, lparams); testValidate( &fusion, @@ -5307,11 +5307,9 @@ TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::Reduction, {aten_input}); - FusionExecutor fusion_executor; - fusion_executor.compileFusion( - &fusion, {aten_input}, heuristic_params->lparams); - fusion_executor.runFusion( - {aten_input}, {cg_output}, heuristic_params->lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}, heuristic_params->lparams); + ke.runFusion({aten_input}, {cg_output}, heuristic_params->lparams); testValidate( &fusion, @@ -5538,9 +5536,9 @@ TEST_F(NVFuserTest, FusionCacheBefore_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = 
fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5574,9 +5572,9 @@ TEST_F(NVFuserTest, FusionCacheAfter_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5616,9 +5614,9 @@ TEST_F(NVFuserTest, FusionCacheFork_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5663,9 +5661,9 @@ TEST_F(NVFuserTest, FusionCacheIndirect_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5719,9 +5717,9 @@ TEST_F(NVFuserTest, FusionCacheBcast_CUDA) { at::Tensor t1 = at::randn({N}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5756,9 +5754,9 @@ TEST_F(NVFuserTest, FusionCacheMultiConsumer_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5808,13 +5806,13 @@ TEST_F(NVFuserTest, FusionSmem_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { @@ -5856,13 +5854,13 @@ TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { at::Tensor aten_input = at::randn({M, K, N}, options); at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1}); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, 
__LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { @@ -5926,14 +5924,14 @@ TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { std::vector aten_inputs = {t0, t1}; at::Tensor aten_output = at::matmul(t0.to(at::kDouble), t1.to(at::kDouble)); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { @@ -6015,14 +6013,14 @@ TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { @@ -6087,9 +6085,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { at::Tensor aten_input = at::randn({dimx, dimy}, options); auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input, 128}); - auto cg_outputs = fe.runFusion({aten_input, 128}); + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input, 128}); + auto cg_outputs = ke.runFusion({aten_input, 128}); testValidate( &fusion, @@ -6265,10 +6263,10 @@ TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -6321,10 +6319,10 @@ TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormBackward_CUDA) { auto var = at::mul(sum, 1.0 / NORM_SIZE); auto aten_rstd = at::pow(at::add(var, kEps), -0.5); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_rstd, aten_weight}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto in_mul_rstd = at::mul(aten_input, aten_rstd); auto grad_out_mul = at::mul(aten_grad_out, in_mul_rstd); @@ -6383,9 +6381,9 @@ TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { // tv11 and tv17 should not be predicated. 
See issue #496 ASSERT_FALSE(PredicatedChecker::isPredicated( - 11, cg_results.fusion_executor->kernel())); + 11, cg_results.kernel_executor->kernel())); ASSERT_FALSE(PredicatedChecker::isPredicated( - 17, cg_results.fusion_executor->kernel())); + 17, cg_results.kernel_executor->kernel())); } TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormalization_CUDA) { @@ -6842,9 +6840,9 @@ TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalShared_CUDA) { at::Tensor aten_dynamic_out = aten_output.narrow(1, static_size, dimy - static_size); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_static_in, aten_dynamic_in}); - fe.runFusion( + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, {aten_static_in, aten_dynamic_in}); + ke.runFusion( {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); testValidate( @@ -7031,10 +7029,10 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { std::vector aten_inputs = { aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); + ke.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1); @@ -7155,9 +7153,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { std::vector aten_inputs = { aten_input, kGamma, kBeta, kEps, dimy, TIDX}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -7201,9 +7199,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}, lparams); - auto cg_outputs = fe.runFusion({aten_input}, lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}, lparams); + auto cg_outputs = ke.runFusion({aten_input}, lparams); testValidate( &fusion, @@ -7214,7 +7212,7 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { __FILE__, "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { @@ -7264,9 +7262,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); - auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); + auto cg_outputs = ke.runFusion({aten_input, runtime_threadIdx_dim}, lparams); testValidate( &fusion, @@ -7278,7 +7276,7 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { @@ -7328,14 +7326,14 @@ TEST_F(NVFuserTest, 
FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { LaunchParams lparams(-1, -1, -1, BSX, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.runFusion(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, __LINE__, __FILE__, "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 1); } TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { @@ -7453,15 +7451,15 @@ TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - FusionExecutor fe; + KernelExecutor ke; // Generate CUDA and compile with nvRTC - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 1); } } // namespace nvfuser diff --git a/tests/cpp/test_gpu2.cpp b/tests/cpp/test_gpu2.cpp index 5dae411ee12..2b8d696f934 100644 --- a/tests/cpp/test_gpu2.cpp +++ b/tests/cpp/test_gpu2.cpp @@ -94,9 +94,9 @@ TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, lparams); - auto cg_outputs = fe.runFusion({input}, lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}, lparams); + auto cg_outputs = ke.runFusion({input}, lparams); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -141,9 +141,9 @@ TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t2, t3}); - auto cg_outputs = fe.runFusion({t0, t1, t2, t3}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1, t2, t3}); + auto cg_outputs = ke.runFusion({t0, t1, t2, t3}); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -199,9 +199,9 @@ TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) { tv1->computeAt(tv2_rf, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto aten_output = (input + 0).to(at::kDouble).sum(1); @@ -276,9 +276,9 @@ TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { auto t3 = t1 + 3; auto t4 = t3 + 4; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -310,9 +310,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -347,9 +347,9 
@@ TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -399,9 +399,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -443,9 +443,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) { at::empty_like(t0, options), at::empty_like(t0, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + ke.runFusion(aten_inputs, cg_outputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -476,9 +476,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); auto t1 = aten_input + 1; auto t2 = t1 + 2; @@ -518,9 +518,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -558,9 +558,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -619,9 +619,9 @@ TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty({numel_x}, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); @@ -739,10 +739,10 @@ TEST_F(NVFuserTest, FusionReduceSingle_CUDA) { at::Tensor aten_input = at::randn({100, 1}, options); // Grab only tensor views, though there shouldn't be any other type - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -871,9 +871,9 @@ TEST_F(NVFuserTest, 
FusionTrivialReduction_CUDA) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 20, 1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1281,9 +1281,9 @@ TEST_F(NVFuserTest, FusionIssue459_CUDA) { std::vector aten_inputs = {t0, t1}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1311,9 +1311,9 @@ TEST_F(NVFuserTest, FusionSmemIndexingSimple_CUDA) { auto aten_input = at::randn({12, 34}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1422,9 +1422,9 @@ TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) { // A, B, m_tile_dim, split_k, intra_cta_tile std::vector aten_inputs = {t0, t1, 3, 4, 5}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -1457,9 +1457,9 @@ TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + ke.runFusion({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -1494,9 +1494,9 @@ TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1600,9 +1600,9 @@ TEST_F(NVFuserTest, FusionIssue367_CUDA) { at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -1626,9 +1626,9 @@ TEST_F(NVFuserTest, FusionIssue468_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 100}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); 
testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1678,9 +1678,9 @@ TEST_F(NVFuserTest, FusionIssue363_CUDA) { std::vector aten_inputs = {t0, t1}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1704,9 +1704,9 @@ TEST_F(NVFuserTest, FusionIssue484_CUDA) { at::Tensor aten_input = at::randn({M, M}, options); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + nvfuser::KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1730,9 +1730,9 @@ TEST_F(NVFuserTest, FusionIssue329_CUDA) { std::vector t0_shape{17, 19}; auto aten_input = at::randn(t0_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1771,9 +1771,9 @@ TEST_F(NVFuserTest, FusionIssue382_CUDA) { std::vector aten_inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1800,9 +1800,9 @@ TEST_F(NVFuserTest, FusionIssue507_CUDA) { std::vector t0_shape{17, 19}; auto aten_input = at::randn(t0_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1838,9 +1838,9 @@ TEST_F(NVFuserTest, FusionIssue532_CUDA) { at::Tensor t0 = at::randn({M}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1867,9 +1867,9 @@ TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) { at::Tensor t0 = at::randn({M}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1945,17 +1945,17 @@ TEST_F(NVFuserTest, FusionIssue549_CUDA) { // Let's specify a few bounds in launch params to make sure it works LaunchParams lparams(1, -1, -1, 32, 4, 4); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, lparams); - fe.runFusion({t0, t1}, lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}, lparams); + ke.runFusion({t0, t1}, lparams); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); + // ASSERT_ANY_THROW(ke.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble)); @@ -1964,7 +1964,7 @@ TEST_F(NVFuserTest, FusionIssue549_CUDA) { } TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) { - FusionExecutor fe; + KernelExecutor ke; std::string kernel = R"( __global__ void kernel1(Tensor T0, Tensor T1) { if(threadIdx.x==0){ @@ -1974,7 +1974,7 @@ __global__ void kernel1(Tensor T0, Tensor T1) { } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 256, // gdimx 1, // gdimy @@ -1989,14 +1989,14 @@ __global__ void kernel1(Tensor T0, Tensor T1) { const std::vector tensor_dims = {8}; auto in0 = at::randn(tensor_dims, options); auto out0 = at::empty_like(in0); - fe.runRtc(lp, {in0, out0}, PrimDataType::Int); + ke.runRtc(lp, {in0, out0}, PrimDataType::Int); auto out_ref = in0 * 2; NVF_CHECK(out_ref.allclose(out0)); } TEST_F(NVFuserTest, FusionSerialWelford_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 128, y = 64, z = 64; std::string kernel = R"( @@ -2030,7 +2030,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2046,14 +2046,14 @@ __global__ void kernel1( auto in0 = at::randn(tensor_dims, options); auto out_var = at::empty({x}, options); auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_var, out_avg}, PrimDataType::Int); + ke.runRtc(lp, {in0, out_var, out_avg}, PrimDataType::Int); NVF_CHECK(in0.var({1, 2}, false).allclose(out_var)); NVF_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST_F(NVFuserTest, FusionBlockWelford_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 7, y = 8, z = 9; std::string kernel = R"( @@ -2102,7 +2102,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2129,7 +2129,7 @@ __global__ void kernel1( // run kernel auto out_var = at::zeros({x}, options); auto out_avg = at::zeros({x}, options); - fe.runRtc( + ke.runRtc( lp, {in0, out_avg, out_var, init_avg, init_var, init_N}, PrimDataType::Int); @@ -2142,7 +2142,7 @@ __global__ void kernel1( } TEST_F(NVFuserTest, FusionBlockWelfordNoInit_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 7, y = 8, z = 9; // need support IValue for integer input as initial count @@ -2183,7 +2183,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2199,14 +2199,14 @@ __global__ void kernel1( auto in0 = at::randn(tensor_dims, options); auto out_var = at::empty({x}, options); auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_avg, out_var}, PrimDataType::Int); + ke.runRtc(lp, {in0, out_avg, out_var}, PrimDataType::Int); NVF_CHECK(in0.var({1, 2}, false).allclose(out_var)); NVF_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST_F(NVFuserTest, 
FusionGridWelfordNoInit_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 128, y = 64, z = 128; std::string kernel = R"( @@ -2258,7 +2258,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( x, // gdimx y, // gdimy @@ -2282,7 +2282,7 @@ __global__ void kernel1( auto work_buf_var = at::empty({x * y * z}, options); auto work_buf_N = at::empty({x * y * z}, options_int); auto sync_flag = at::zeros({1}, options_int); - fe.runRtc( + ke.runRtc( lp, {in0, out_avg, @@ -2325,15 +2325,15 @@ TEST_F(NVFuserTest, FusionWelfordOp_CUDA) { auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2370,15 +2370,15 @@ TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) { at::Tensor t_avg = at::empty({M}, options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2415,15 +2415,15 @@ TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) { at::Tensor t_var = at::empty({M}, options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2459,15 +2459,15 @@ TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) { at::Tensor t_var = at::empty({M}, options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2588,7 +2588,7 @@ TEST_P(WelfordReduction, Test) { auto lparams = reduction_params->lparams; auto cparams = reduction_params->cparams; - FusionExecutor fe; + KernelExecutor ke; // Needs to pass compile params to use the correct index type, otherwise the // lowering pass will use int64 as the index type, since this test saves // `tv_N` as index type, it may cause a vectorization size validation error.
For @@ -2597,8 +2597,8 @@ TEST_P(WelfordReduction, Test) { // pass uses int64 as index type, so the max vectorization factor is 16 bytes / sizeof(int64) = 2, which is wrong since the actual index type is int32 // and the max vectorization factor is 4. - fe.compileFusion(&fusion, {aten_input}, lparams, cparams); - auto outputs = fe.runFusion({aten_input}, lparams); + ke.compileFusion(&fusion, {aten_input}, lparams, cparams); + auto outputs = ke.runFusion({aten_input}, lparams); // by default Welford outputs sum of square diff so need to divide to // get var @@ -2613,7 +2613,7 @@ TEST_P(WelfordReduction, Test) { at_n = at_n.sum({axis}); testValidate( - fe.kernel(), + ke.kernel(), outputs, {aten_input}, {at_avg, at_var, at_n}, @@ -2755,12 +2755,12 @@ TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { // Let's specify a few bounds in launch params to make sure it works LaunchParams lparams(1, -1, -1, 32, 4, 4); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, lparams); - fe.runFusion({t0, t1}, lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}, lparams); + ke.runFusion({t0, t1}, lparams); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble)); @@ -2820,9 +2820,9 @@ TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_input_t = at::transpose(input, 1, 2); auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false); @@ -2894,9 +2894,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { at::Tensor aten_input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); at::Tensor aten_input_t = aten_input.t(); @@ -2963,9 +2963,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); auto input_t = input.t(); auto t1 = input_t.mul({-1.0}); @@ -3029,9 +3029,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); auto t1_t = t1.permute({3, 0, 1, 2}); @@ -3107,9 +3107,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); auto t1_t = t1.permute({3, 0, 1, 2}); @@ -3155,9 +3155,9 @@ TEST_F(NVFuserTest,
FusionAdvancedComputeAtTransposed5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto t2 = t0.t().add(2.0); auto aten_output = t1.t().mul(t2); @@ -3197,9 +3197,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto t2 = t0.t().add(2.0); auto aten_output = t1.t().mul(t2); @@ -3348,9 +3348,9 @@ TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) { at::Tensor aten_input = at::empty({2, 6, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); at::Tensor aten_output = aten_input.sin(); @@ -3423,9 +3423,9 @@ TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + ke.runFusion({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -3503,9 +3503,9 @@ TEST_F(NVFuserTest, FusionGridPersistence_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto out = ke.runFusion({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -3536,9 +3536,9 @@ TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto out = ke.runFusion({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -3570,9 +3570,9 @@ TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto out = ke.runFusion({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) .unsqueeze(-1) @@ -3610,9 +3610,9 @@ TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto out = ke.runFusion({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) .unsqueeze(0) @@ -3648,9 +3648,9 @@ TEST_F(NVFuserTest, FusionIssue633_CUDA) 
{ at::Tensor t1 = at::randn({dx, dy, 1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3681,9 +3681,9 @@ TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { at::Tensor t1 = at::randn(shape, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3730,9 +3730,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3786,9 +3786,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3847,9 +3847,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3908,11 +3908,11 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; + KernelExecutor ke; // TODO: throw assertion - cannot merge non-contiguous vectorization axes // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compileFusion(&fusion)); } TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { @@ -3964,9 +3964,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( @@ -4006,10 +4006,10 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize); } - FusionExecutor fe; + KernelExecutor ke; // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compileFusion(&fusion)); } TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { @@ -4056,9 +4056,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { at::randn({bx, by}, 
options).index({"...", at::indexing::Slice(3)}); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4110,12 +4110,12 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { at::randn({bx, by}, options).index({"...", at::indexing::Slice(3)}); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); // Failure because the input + output tensors do not have the same stride // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.runFusion(aten_inputs)); } TEST_F(NVFuserTest, FusionVectorization1_CUDA) { @@ -4157,9 +4157,9 @@ TEST_F(NVFuserTest, FusionVectorization1_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4197,10 +4197,10 @@ TEST_F(NVFuserTest, FusionVectorization2_CUDA) { tv->axis(-2)->parallelize(ParallelType::Vectorize); } - FusionExecutor fe; + KernelExecutor ke; // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compileFusion(&fusion)); } // TODO: Re-enable once vectorization validation is fixed @@ -4244,20 +4244,20 @@ TEST_F(NVFuserTest, FusionVectorization3_CUDA) { at::Tensor t1 = at::randn({bx, by}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.runFusion(aten_inputs)); aten_inputs[0] = t0.index({"...", at::indexing::Slice(1)}); aten_inputs[1] = t1.index({"...", at::indexing::Slice(1)}); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.runFusion(aten_inputs)); t0 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)}); t1 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)}); aten_inputs = {t0, t1}; - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4309,9 +4309,9 @@ TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( @@ -4372,9 +4372,9 @@ TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) { at::Tensor t2 = at::randn({z, x, y}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, 
aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4406,9 +4406,9 @@ TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) { at::Tensor t0 = at::randn({x}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4428,9 +4428,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) { tv2->axis(-1)->parallelize(ParallelType::TIDy); // Invalid as tv1 and tv2 do not have the same ParallelType - FusionExecutor fe; + KernelExecutor ke; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compileFusion(&fusion)); } TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { @@ -4450,8 +4450,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { // tv1 and tv2 do not have the same ParallelType, but tv1 is on shared // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { @@ -4473,8 +4473,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { tv1->setMemoryType(MemoryType::Global); // tv1 and tv2 have the same shape and ParallelType - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { @@ -4496,8 +4496,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { tv1->setMemoryType(MemoryType::Global); // tv1 and tv2 do not have the same shape but global memory comm is supported.
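// Read together, the ValidateParallelize tests exercise one rule: a
// producer and its consumer may be parallelized or shaped differently only
// if the producer lives in a memory space the consumer's threads can
// actually see. A minimal sketch of the failing and passing variants,
// assuming tv1 (the producer of tv2) is parallelized with TIDx; that line
// is not visible in these hunks:
//
//   tv1->axis(-1)->parallelize(ParallelType::TIDx);
//   tv2->axis(-1)->parallelize(ParallelType::TIDy);
//   // tv1 in default local memory: compileFusion must throw.
//   tv1->setMemoryType(MemoryType::Shared); // or MemoryType::Global
//   // The same mismatch is now legal; the lowered kernel can synchronize
//   // through shared or global memory before tv2 reads tv1.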
- FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { @@ -4520,8 +4520,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { // tv1 and tv2 do not have the same shape, but tv1 is on shared // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); } // See issue #995 @@ -4648,9 +4648,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize8_CUDA) { at::Tensor input0 = at::arange(64, options).view({32, 2}); at::Tensor input1 = at::arange(32, options) * 0.01; - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}); - auto outputs = fe.runFusion({input0, input1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input0, input1}); + auto outputs = ke.runFusion({input0, input1}); testValidate(&fusion, outputs, {input0, input1}, __LINE__, __FILE__); } @@ -4737,9 +4737,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize10_CUDA) { at::Tensor t1 = at::randn({s0, s1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4783,9 +4783,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize11_CUDA) { at::Tensor t1 = at::randn({s0, s1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4897,9 +4897,9 @@ TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { at::Tensor t0 = at::randn({M, N, K}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4926,9 +4926,9 @@ TEST_F(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { at::Tensor t0 = at::randn({M, N, K}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); at::Tensor aten_avg = t0.mean({1, 2}); at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K; testValidate( @@ -4965,9 +4965,9 @@ TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + ke.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -5062,9 +5062,9 @@ TEST_F(NVFuserTest, FusionIssue757_CUDA) { at::Tensor t3 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, 
inputs, __LINE__, __FILE__); } @@ -5100,9 +5100,9 @@ TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { at::Tensor t3 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -5364,10 +5364,10 @@ TEST_F(NVFuserTest, FusionBNBackwardRepro_CUDA) { at::Tensor input6 = at::randn_like(input0); at::Tensor input7 = at::randn_like(input0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = { input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); } // TODO: We only changed inputs, merge this with the test above. @@ -5432,10 +5432,10 @@ TEST_F(NVFuserTest, FusionBNBackwardRepro2_CUDA) { at::Tensor input6 = at::randn_like(input0); at::Tensor input7 = at::randn_like(input0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = { input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); } TEST_F(NVFuserTest, FusionBNRepro_CUDA) { @@ -5494,10 +5494,10 @@ TEST_F(NVFuserTest, FusionBNRepro_CUDA) { auto input4_ref = input4.clone(); auto input5_ref = input5.clone(); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { input1, input2, input3, input4, input5}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto at_results = at::native_batch_norm( input1_ref, @@ -5563,9 +5563,9 @@ TEST_F(NVFuserTest, FusionBNRepro2_CUDA) { at::Tensor weight; at::Tensor bias; - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = {input1}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5894,8 +5894,8 @@ TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) { at::Tensor t0 = at::randn({16, 16}, options); at::Tensor t1 = at::randn({16, 16}, options); - FusionExecutorCache fusion_executor_cache(std::move(fusion)); - fusion_executor_cache.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({t0, t1}); } TEST_F(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) { @@ -6004,9 +6004,9 @@ TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); @@ -6053,9 +6053,9 @@ TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor 
ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6098,9 +6098,9 @@ TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6140,9 +6140,9 @@ TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6185,9 +6185,9 @@ TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2, 3}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6240,9 +6240,9 @@ TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1, input2}); + auto outputs = ke.runFusion({input1, input2}); testValidate( fusion.get(), outputs, @@ -6278,9 +6278,9 @@ TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({16, 31}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6313,9 +6313,9 @@ TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { auto at_output = (input1 + 1).sum({1}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6364,9 +6364,9 @@ TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6410,9 +6410,9 @@ TEST_F(NVFuserTest, FusionWarpReducePredication_CUDA) { auto t0 = at::randn(shape1, options); auto t2 = at::randn(shape2, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t2}); - auto cg_outputs = fe.runFusion({t0, t2}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t2}); + auto cg_outputs = ke.runFusion({t0, t2}); auto t1 = 
t0.sum({0}); auto t4 = (t2 + 1).sum({0}) + 1; @@ -6458,9 +6458,9 @@ TEST_F(NVFuserTest, FusionSegfaultReduction_CUDA) { at::Tensor input0 = at::randn({batch, c, h, w}, options); at::Tensor input1 = at::randn({batch, c, h, w}, options); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = {input0, input1}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -6491,9 +6491,9 @@ TEST_F(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compileFusion(fusion, {in0, in1}); + auto outputs = ke.runFusion({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6535,10 +6535,10 @@ TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); + KernelExecutor ke; + ke.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + auto outputs = ke.runFusion({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6567,9 +6567,9 @@ TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({256, 512}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compileFusion(fusion, {in0}); + auto outputs = ke.runFusion({in0}); testValidate(fusion, outputs, {in0}, __LINE__, __FILE__); } @@ -6599,9 +6599,9 @@ TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compileFusion(fusion, {in0, in1}); + auto outputs = ke.runFusion({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6633,9 +6633,9 @@ TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 3, 3}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compileFusion(fusion, {in0}); + auto outputs = ke.runFusion({in0}); testValidate(fusion, outputs, {in0}, __LINE__, __FILE__); } @@ -6662,9 +6662,9 @@ TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({16, 16}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto cg_outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compileFusion(fusion, {in0}); + auto cg_outputs = ke.runFusion({in0}); testValidate(fusion, cg_outputs, {in0}, __LINE__, __FILE__); } @@ -6696,9 +6696,9 @@ TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - 
fe.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compileFusion(fusion, {in0, in1}); + auto outputs = ke.runFusion({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6722,9 +6722,9 @@ TEST_F(NVFuserTest, FusionIssue970_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({nelm, nelm}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -6753,9 +6753,9 @@ TEST_F(NVFuserTest, FusionIssue1016_CUDA) { at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -6784,9 +6784,9 @@ TEST_F(NVFuserTest, FusionIssue1021_CUDA) { at::Tensor t0 = at::randn({10}, options); std::vector inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -6819,9 +6819,9 @@ TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { auto at_tv1 = (input1).sum({0}); auto at_tv2 = input1 + 1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__); } @@ -6856,9 +6856,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6893,9 +6893,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) { at::Tensor input1 = at::randn({11}, options); at::Tensor input2 = at::randn({11, 13}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1, input2}); + auto outputs = ke.runFusion({input1, input2}); testValidate(fusion.get(), outputs, {input1, input2}, __LINE__, __FILE__); } @@ -6941,9 +6941,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({13}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {input1}); + auto outputs = ke.runFusion({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6987,9 +6987,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) { at::Tensor input1 = at::randn({13}, options); at::Tensor input2 = at::randn({15, 13}, options); - 
FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + auto outputs = ke.runFusion({input1, input2}); testValidate(&fusion, outputs, {input1, input2}, __LINE__, __FILE__); } @@ -7031,9 +7031,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) { at::Tensor input1 = at::randn({13}, options); at::Tensor input2 = at::randn({13, 15}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + auto outputs = ke.runFusion({input1, input2}); testValidate(&fusion, outputs, {input1, input2}, __LINE__, __FILE__); } @@ -7185,9 +7185,9 @@ TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { at::Tensor t0 = at::randn({nx}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7238,9 +7238,9 @@ TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { at::Tensor t0 = at::randn({17}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7289,9 +7289,9 @@ TEST_F(NVFuserTest, FusionIssue1099_CUDA) { at::Tensor t3 = at::randn({19}, options); std::vector aten_inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7331,9 +7331,9 @@ TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) { at::Tensor t0 = at::randn({nx, ny}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7370,9 +7370,9 @@ TEST_F(NVFuserTest, FusionIssue1189_CUDA) { at::Tensor t0 = at::randn({16, 16, 1}, options); at::Tensor t1 = at::randn({16, 16, 1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + auto outputs = ke.runFusion({t0, t1}); testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__); } @@ -7403,9 +7403,9 @@ TEST_F(NVFuserTest, FusionIssue1052_CUDA) { at::Tensor t1 = at::randn({100}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7509,9 +7509,9 @@ TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) { at::Tensor t4 = at::randn({1024}, options); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - 
fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7539,9 +7539,9 @@ TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { at::Tensor t2 = at::randn({19}, options); std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7569,9 +7569,9 @@ TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { at::Tensor t2 = at::randn({19}, options); std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto ref1 = t0 + 1; auto ref2 = mean(t2, {0}); @@ -7610,15 +7610,15 @@ TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { tv5->axis(1)->parallelize(ParallelType::BIDy); tv5->axis(2)->parallelize(ParallelType::BIDz); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 3}, options); at::Tensor t2 = at::randn({5, 6, 7}, options); at::Tensor t4 = at::randn({8, 9, 10}, options); std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7654,15 +7654,15 @@ TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { tv5->axis(1)->parallelize(ParallelType::BIDy); tv5->axis(2)->parallelize(ParallelType::BIDz); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 3}, options); at::Tensor t2 = at::randn({5, 6, 7}, options); at::Tensor t4 = at::randn({8, 9, 10}, options); std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto ref1 = t0.mean(at::IntArrayRef{0, 1}); auto ref2 = t2 + 1; @@ -7723,9 +7723,9 @@ TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { at::Tensor t4 = at::randn({19}, options); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto ref1 = t0 + 3; auto ref2 = sum(t4 + 4); @@ -7785,9 +7785,9 @@ TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { at::Tensor t1 = at::randn({19}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7834,9 +7834,9 @@ TEST_F(NVFuserTest, FusionFloatPow_CUDA) { t0 = abs(t0); std::vector aten_inputs = {t0}; - FusionExecutor fe; - 
fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto p4 = at::pow(t0, 4); auto p2 = at::pow(t0, 2); @@ -7903,9 +7903,9 @@ TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { at::Tensor t0 = at::randn({10, 1024}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7926,9 +7926,9 @@ TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) { at::Tensor at_input = at::randn({10}, options); at::Tensor at_output = at::empty_strided({10}, {2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_input}); - auto returned_outputs = fe.runFusion({at_input}, {at_output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {at_input}); + auto returned_outputs = ke.runFusion({at_input}, {at_output}); // Returned outputs should only contain one tensor that is the same // as the output tensor given to runFusion @@ -7974,9 +7974,9 @@ TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) { } // Test result - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto ref_output = at::_softmax(aten_input, 1, false); testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__); } @@ -8048,9 +8048,9 @@ TEST_F(NVFuserTest, FusionIssue1133_CUDA) { at::Tensor t0 = at::randn({99, 101}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto ref = (t0 + 1).sum({1}) + 1; @@ -8082,9 +8082,9 @@ TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) { at::Tensor t0 = at::randn({99, 101}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto ref = t0.sum({1}); @@ -8137,9 +8137,9 @@ TEST_F(NVFuserTest, FusionIssue1223_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_t0 = at::ones({11, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0}); - auto cg_outputs = fe.runFusion({at_t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {at_t0}); + auto cg_outputs = ke.runFusion({at_t0}); auto at_t1 = (at_t0 + 1).sum(); @@ -8181,9 +8181,9 @@ TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) { at_t0 = at::abs(at_t0); at::Tensor at_t3 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t3}); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); + KernelExecutor ke; + ke.compileFusion(&fusion, {at_t0, at_t3}); + auto cg_outputs = ke.runFusion({at_t0, at_t3}); auto at_t2 = (at_t0 + 1).min(); auto at_t4 = at_t3 + 1; @@ -8233,9 +8233,9 @@ TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) { at_t0 = at::abs(at_t0); at::Tensor at_t3 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t3}); - auto cg_outputs = 
fe.runFusion({at_t0, at_t3}); + KernelExecutor ke; + ke.compileFusion(&fusion, {at_t0, at_t3}); + auto cg_outputs = ke.runFusion({at_t0, at_t3}); auto at_t2 = std::get<0>(at_t0.min(0)); auto at_t4 = at_t3 + 1; @@ -8270,9 +8270,9 @@ TEST_F(NVFuserTest, FusionRfactorIndirectRoot_CUDA) { auto at_in = at::randn({6, 6, 6}, options); auto at_out = at_in.sum({1, 2}); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_in}); - auto cg_outputs = fe.runFusion({at_in}); + KernelExecutor ke; + ke.compileFusion(&fusion, {at_in}); + auto cg_outputs = ke.runFusion({at_in}); testValidate(&fusion, cg_outputs, {at_in}, {at_out}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index c9e7b7fdb3a..c2fdd8e7d33 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -107,9 +107,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0.sum(); @@ -161,9 +161,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({13, 17}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -210,9 +210,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = (t0 + 1).sum(); @@ -260,9 +260,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24, 2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = (t0 + 1).sum(); @@ -314,9 +314,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = (t0 + 1).sum(); @@ -357,9 +357,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); @@ -367,7 +367,7 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is // illegal. 
The run-time validation of vectorization should throw an error. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible})); + ASSERT_ANY_THROW(ke.runFusion({t0_non_divisible})); } // If a split is validated at run time, it's not necessary to predicate. @@ -412,9 +412,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { auto t0 = at::randn({1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = (t0 + 1).sum(); @@ -474,16 +474,16 @@ TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({15}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); // This should throw an exception as the extent of t0 is not // divisible by the vector width // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.runFusion({t0})); auto t1 = at::randn({16}, options); - auto cg_outputs = fe.runFusion({t1}); + auto cg_outputs = ke.runFusion({t1}); testValidate(&fusion, cg_outputs, {t1}, __LINE__, __FILE__); } @@ -529,9 +529,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) { auto t2 = at::randn({10, 10}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -572,9 +572,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) { auto t0 = at::randn({10, 11}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1); @@ -617,9 +617,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) { auto t0 = at::randn(input_shape, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -790,9 +790,9 @@ TEST_F(NVFuserTest, FusionIssue1430_CUDA) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::randn({V, W, X, Y, Z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1)); + KernelExecutor ke; + ke.compileFusion(&fusion); + auto cg_outputs = ke.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1)); auto t0_double = t0.to(at::kDouble); @@ -944,9 +944,9 @@ TEST_F(NVFuserTest, FusionTestGridComm_CUDA) { auto t0 = at::randn({X, Y, Z}, options); auto t1 = at::randn({X, Y, Z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } 
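The substitution being applied here is mechanical and identical at every call site. A minimal before/after sketch, with the fusion setup elided and names taken from the surrounding hunks:

  // before this patch
  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t1});
  auto cg_outputs = fe.runFusion({t0, t1});

  // after this patch
  KernelExecutor ke;
  ke.compileFusion(&fusion, {t0, t1});
  auto cg_outputs = ke.runFusion({t0, t1});

Only the class and the conventional variable name change: the compileFusion/runFusion and compileRtc/runRtc signatures, the kernel() accessor handed to testValidate, and the LaunchParams arguments all carry over unchanged, and FusionExecutorCache variables are renamed to executor_cache without touching runFusionWithInputs.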
@@ -988,9 +988,9 @@ TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) { auto t0 = at::randn({X}, options); auto t1 = at::randn({W, X}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1021,9 +1021,9 @@ TEST_F(NVFuserTest, FusionLargeSmem_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({(int)(12288 * 4)}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0 + 1 + 2; testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); @@ -1057,10 +1057,10 @@ TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({(int)(12288 * 4)}, options); - FusionExecutor fe; + KernelExecutor ke; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0})); + ASSERT_ANY_THROW(ke.compileFusion(&fusion, {t0})); } // Try to test alignment when multiple tensors are @@ -1097,10 +1097,10 @@ TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({3, 4, 7, 2, 5}, options); - FusionExecutor fe; + KernelExecutor ke; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -1126,8 +1126,8 @@ TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) { fusion.addOutput(tv1); // Make sure the kernel is compiled. - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); } // Repro of #1506 @@ -1157,9 +1157,9 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); } @@ -1192,10 +1192,10 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; + KernelExecutor ke; // This should fail at compile time as we're trying to merge in a // non-contiguous dimension, then split and vectorize it. - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0})); + ASSERT_ANY_THROW(ke.compileFusion(&fusion, {t0})); } // Make sure the same fusion as FusionVectorizeContigIndex fails if @@ -1227,14 +1227,14 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // This should fail at the launch time as 14 is not divisible by the // vector word size. The two domains are merged, but they are not // contiguous, so contig indexing is not involved in this case. 
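// Concretely, assuming the vector word size in this test is 4 (the schedule
// itself is not shown in this hunk): the extent involved is 14, and
// 14 % 4 == 2 != 0, so the run-time extent check has to reject the launch
// even though compilation succeeded.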
 // Make sure the same fusion as FusionVectorizeContigIndex fails if
@@ -1227,14 +1227,14 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail2_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
 
   // This should fail at the launch time as 14 is not divisible by the
   // vector word size. The two domains are merged, but they are not
   // contiguous, so contig indexing is not involved in this case.
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0}));
+  ASSERT_ANY_THROW(ke.runFusion({t0}));
 }
 
 TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) {
@@ -1260,18 +1260,18 @@ TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) {
   auto t1_misaligned =
       at::empty({n + 1}, options).index({at::indexing::Slice(1)});
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
   NVF_CHECK(t0.equal(cg_outputs[0]));
 
   // Pass misaligned input. This must fail.
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0_misaligned}));
+  ASSERT_ANY_THROW(ke.runFusion({t0_misaligned}));
 
   // Pass misaligned output. This must fail too.
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0}, {t1_misaligned}));
+  ASSERT_ANY_THROW(ke.runFusion({t0}, {t1_misaligned}));
 }
 
 // Repro of issue #1530
@@ -1300,11 +1300,11 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
 
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0}));
+  ASSERT_ANY_THROW(ke.runFusion({t0}));
 }
 
 TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) {
@@ -1331,9 +1331,9 @@ TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) {
   auto t1 = at::randn({3, 4}, options);
 
   {
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t0, t1});
-    auto cg_outputs = fe.runFusion({t0, t1});
+    KernelExecutor ke;
+    ke.compileFusion(&fusion, {t0, t1});
+    auto cg_outputs = ke.runFusion({t0, t1});
 
     testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
   }
@@ -1341,9 +1341,9 @@ TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) {
   // Make sure tv2 indexing also works when it's stored in global memory
   tv2->setMemoryType(MemoryType::Global);
   {
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t0, t1});
-    auto cg_outputs = fe.runFusion({t0, t1});
+    KernelExecutor ke;
+    ke.compileFusion(&fusion, {t0, t1});
+    auto cg_outputs = ke.runFusion({t0, t1});
 
     testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
   }
@@ -1384,12 +1384,12 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) {
   auto t0 = at::randn(shape1, options);
   auto t1 = at::randn(shape2, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
 
   // Vectorization of tv2 should be detected as invalid.
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0, t1}));
+  ASSERT_ANY_THROW(ke.runFusion({t0, t1}));
 }
 
 TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) {
@@ -1433,9 +1433,9 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) {
   auto t0 = at::randn(shape1, options);
   auto t1 = at::randn(shape2, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -1467,7 +1467,7 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) {
   // vector word size should be 4. Broadcasting of tv1 should not
   // matter.
   for (const auto& vec_info :
-       cg_results.fusion_executor->kernel()->summary().vectorized_set_info) {
+       cg_results.kernel_executor->kernel()->summary().vectorized_set_info) {
     NVF_CHECK(
         vec_info.word_size == 4,
         "Invalid vector word size: ",
@@ -1512,9 +1512,9 @@ TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) {
   auto t0 = at::randn({111}, options);
   auto t1 = at::randn({123, 111}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   auto t2 = t0.unsqueeze(0);
   auto t3 = t1 + t2;
@@ -1563,9 +1563,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) {
   auto t0 = at::randn({10, 64}, options);
   auto t1 = at::randn({10, 64}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -1608,9 +1608,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) {
   auto t0 = at::randn({10, 64}, options);
   auto t1 = at::randn({10, 64}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -1651,9 +1651,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) {
   auto t0 = at::randn({50, 64}, options);
   auto t1 = at::randn({50, 64}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -1756,9 +1756,9 @@ TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) {
   at::Tensor t1 = at::randn({128, 6}, options);
   at::Tensor t2 = at::randn({128, 6}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1, t2});
-  auto cg_outputs = fe.runFusion({t0, t1, t2});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1, t2});
+  auto cg_outputs = ke.runFusion({t0, t1, t2});
 
   testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__);
 }
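// The PointwiseSchedule hunk above also renames the executor handle held by
// scheduling results. A sketch of that summary query, assuming cg_results
// comes from a scheduler helper as elsewhere in this suite:
//
//   for (const auto& vec_info :
//        cg_results.kernel_executor->kernel()->summary().vectorized_set_info) {
//     NVF_CHECK(vec_info.word_size == 4, "Invalid vector word size");
//   }
//
// The summary is produced at lowering time, so the check needs no extra run.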
@@ -1796,9 +1796,9 @@ TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) {
   at::Tensor t1 = at::randn({128, 6}, options);
   at::Tensor t2 = at::randn({128, 6}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1, t2});
-  auto cg_outputs = fe.runFusion({t0, t1, t2});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1, t2});
+  auto cg_outputs = ke.runFusion({t0, t1, t2});
 
   testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__);
 }
@@ -1831,20 +1831,20 @@ TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) {
   at::Tensor t0 = at::randn({m, n}, options);
   at::Tensor t1 = at::randn({m, n}, options);
 
-  FusionExecutor fe;
+  KernelExecutor ke;
 
   // requires ampere+ GPU
   if (!deviceMajorMinorCheck(8)) {
     ASSERT_THAT(
-        [&]() { fe.compileFusion(&fusion, {t0, t1}); },
+        [&]() { ke.compileFusion(&fusion, {t0, t1}); },
         testing::ThrowsMessage(testing::HasSubstr(
             "Reason: LoadStoreOpType::CpAsync requires Ampere")));
     GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
   } else {
-    fe.compileFusion(&fusion, {t0, t1});
+    ke.compileFusion(&fusion, {t0, t1});
   }
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -1877,19 +1877,19 @@ TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({m, n}, options);
 
-  FusionExecutor fe;
+  KernelExecutor ke;
   if (!deviceMajorMinorCheck(8)) {
     ASSERT_THAT(
-        [&]() { fe.compileFusion(&fusion, {t0}); },
+        [&]() { ke.compileFusion(&fusion, {t0}); },
         testing::ThrowsMessage(testing::HasSubstr(
             "Reason: LoadStoreOpType::CpAsync requires Ampere")));
     GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
   } else {
-    fe.compileFusion(&fusion, {t0});
+    ke.compileFusion(&fusion, {t0});
   }
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref = t0.sum({1});
 
@@ -2006,11 +2006,11 @@ TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({9999}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
-  testValidate(fe.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__);
 }
 
 // Test ExactLogicalDomainMap
@@ -2211,13 +2211,13 @@ TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) {
   GpuLower gpulw(&fusion);
   checker.handle(gpulw.run()->topLevelExprs());
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {}, LaunchParams());
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {}, LaunchParams());
 
   auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({X, Y, Y, Z}, options);
 
-  auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1));
+  auto cg_outputs = ke.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1));
 
   // by default Welford outputs sum of square diff so need to divide to get var
   cg_outputs[1] = cg_outputs[1].div((float)(X * Y * Y));
@@ -2280,9 +2280,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) {
   at::Tensor t0 = at::randn({32}, options);
   at::Tensor t1 = at::randn({32, 32}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
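// The cp.async tests gate compilation on the device architecture. The guard
// pattern used by the hunks above, sketched with the renamed executor (the
// fusion and t0 are placeholders for each test's own setup):
//
//   KernelExecutor ke;
//   if (!deviceMajorMinorCheck(8)) {
//     ASSERT_THAT(
//         [&]() { ke.compileFusion(&fusion, {t0}); },
//         testing::ThrowsMessage(testing::HasSubstr(
//             "Reason: LoadStoreOpType::CpAsync requires Ampere")));
//     GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
//   } else {
//     ke.compileFusion(&fusion, {t0});
//   }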
@@ -2345,9 +2345,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync2_CUDA) {
   at::Tensor t0 = at::randn({32}, options);
   at::Tensor t1 = at::randn({32, 32}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -2427,9 +2427,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) {
   at::Tensor t0 = at::randn({32}, options);
   at::Tensor t1 = at::randn({32, 32}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -2532,9 +2532,9 @@ TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) {
   at::Tensor t0 = at::randn({10, 11}, options);
   std::vector<c10::IValue> aten_inputs = {t0};
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = ke.runFusion(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -2567,9 +2567,9 @@ TEST_F(NVFuserTest, FusionSqueeze1_CUDA) {
   at::Tensor t0 = at::randn({10, 11}, options);
   std::vector<c10::IValue> aten_inputs = {t0};
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = ke.runFusion(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -2596,11 +2596,11 @@ TEST_F(NVFuserTest, FusionContigPredicate_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({3, 4}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
-  testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
 // Repro of https://github.com/csarofeen/pytorch/issues/1777
@@ -2620,9 +2620,9 @@ TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) {
   auto aten_output = at::div(
       at::native::wrapped_scalar_tensor(at::Scalar(2.0), options.device()), t0);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__);
 }
@@ -3242,9 +3242,9 @@ TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) {
   at::Tensor in1 = at::randn({16}, options);
   at::Tensor in2 = at::randn({12, 16}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {in1, in2});
-  auto cg_outputs = fe.runFusion({in1, in2});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {in1, in2});
+  auto cg_outputs = ke.runFusion({in1, in2});
 
   testValidate(&fusion, cg_outputs, {in1, in2}, __LINE__, __FILE__);
 }
@@ -3516,9 +3516,9 @@ TEST_F(NVFuserTest, FusionVectorComponentReduce_CUDA) {
   auto options =
       at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({1024}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(fusion.get(), {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__, "");
 }
@@ -3799,9 +3799,9 @@ TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({5, 5}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
   auto out = cg_outputs[0];
 
   testValidate(fusion, {out}, {t0}, __LINE__, __FILE__);
@@ -3879,9 +3879,9 @@ TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) {
   at::Tensor t0 = at::randn({1, 1}, options);
   at::Tensor t1 = at::randn({10}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
   auto out = cg_outputs[0];
 
   testValidate(
@@ -3923,9 +3923,9 @@ TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
   at::Tensor t0 = at::randn({1, 1}, options);
   at::Tensor t1 = at::randn({2, 1, 1}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
   auto out = cg_outputs[0];
 
   testValidate(fusion, {out}, {t0, t1}, __LINE__, __FILE__);
@@ -3947,9 +3947,9 @@ TEST_F(NVFuserTest, FusionInlineAt_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({100, 2}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
   auto out = cg_outputs[0];
 
   testValidate(fusion, {out}, {t0}, __LINE__, __FILE__);
@@ -3981,9 +3981,9 @@ TEST_F(NVFuserTest, FusionReplayTrivialReductionAndBroadcast2_CUDA) {
   at::Tensor t0 = at::randn(shape, options);
   std::vector<c10::IValue> aten_inputs({t0});
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion_ptr.get(), aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
+  KernelExecutor ke;
+  ke.compileFusion(fusion_ptr.get(), aten_inputs);
+  auto outputs = ke.runFusion(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -4043,19 +4043,19 @@ TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) {
   GpuLower gpulw(&fusion);
   pred_checker.handle(gpulw.run()->topLevelExprs());
 
-  FusionExecutor fe;
+  KernelExecutor ke;
 
   // requires ampere+ GPU
   if (!deviceMajorMinorCheck(8)) {
     ASSERT_THAT(
-        [&]() { fe.compileFusion(&fusion, {input1}); },
+        [&]() { ke.compileFusion(&fusion, {input1}); },
         testing::ThrowsMessage(testing::HasSubstr(
             "Reason: LoadStoreOpType::CpAsync requires Ampere")));
     GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
   } else {
-    fe.compileFusion(&fusion, {input1});
+    ke.compileFusion(&fusion, {input1});
   }
 
-  auto cg_outputs = fe.runFusion({input1});
+  auto cg_outputs = ke.runFusion({input1});
 
   testValidate(&fusion, cg_outputs, {input1}, __LINE__, __FILE__);
 }
@@ -4078,8 +4078,8 @@ TEST_F(NVFuserTest, FusionExpandedInput_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({4096, 1, 4}, options).expand({-1, 7, -1});
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs({t0});
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
 
   testValidate(fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -4112,8 +4112,8 @@ TEST_F(NVFuserTest, FusionVectorizeRepro1843_CUDA) {
   at::Tensor t0 =
       at::empty_strided({4096, 32128}, {32128, 1}, options).random_();
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs({t1, t0});
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs({t1, t0});
 
   testValidate(fusion, cg_outputs, {t1, t0}, __LINE__, __FILE__);
 }
@@ -4137,8 +4137,8 @@ TEST_F(NVFuserTest, FusionBroadcastPersistentReduction_CUDA) {
   auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
   auto t0 = at::randn({1024, 768}, options);
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs({t0});
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
 
   testValidate(fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -4284,8 +4284,8 @@ TEST_F(NVFuserTest, FusionRepro2094_CUDA) {
     outputs.push_back(t32);
   }
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
 
   testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__);
 }
@@ -4428,9 +4428,9 @@ TEST_F(NVFuserTest, FusionSqueezeTransformPropagation_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({5, 1, 1, 1, 1}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -4482,9 +4482,9 @@ TEST_F(NVFuserTest, FusionSqueezeInlining_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({1, 1024}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -4885,9 +4885,9 @@ TEST_F(NVFuserTest, FusionPropagateVectorizePredicate_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({32}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   NVF_CHECK(t0.equal(cg_outputs[0]));
 }
@@ -4992,16 +4992,16 @@ TEST_F(NVFuserTest, FusionIssue2163ReproInvalidAlias_CUDA) {
   std::vector<c10::IValue> aten_inputs({at_input, at_weight});
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion_ptr.get(), aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
+  KernelExecutor ke;
+  ke.compileFusion(fusion_ptr.get(), aten_inputs);
+  auto cg_outputs = ke.runFusion(aten_inputs);
   auto cg_output = cg_outputs.at(0);
 
   auto ref_x_sub_mean = at_input - at_input.sum({0}).unsqueeze(0);
   auto ref_y = ref_x_sub_mean * at_weight.unsqueeze(0);
 
   testValidate(
-      fe.kernel(), {cg_output}, aten_inputs, {ref_y}, __LINE__, __FILE__, "");
+      ke.kernel(), {cg_output}, aten_inputs, {ref_y}, __LINE__, __FILE__, "");
 }
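// Alongside the executor rename, the hunks above expand the cache handle's
// name from fec to executor_cache. A sketch of the cached entry point, which
// segments and schedules internally rather than requiring an explicit
// compile step like the direct KernelExecutor path:
//
//   FusionExecutorCache executor_cache(std::move(fusion_ptr));
//   auto cg_outputs = executor_cache.runFusionWithInputs({t0});
//   testValidate(
//       executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__);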
 // Testing scalar FP types
@@ -5080,9 +5080,9 @@ TEST_F(NVFuserTest, FusionFloatingPointType_CUDA) {
   std::vector<c10::IValue> inputs({t0});
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto cg_outputs = fe.runFusion(inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, inputs);
+  auto cg_outputs = ke.runFusion(inputs);
 
   testValidate(&fusion, cg_outputs, inputs, __LINE__, __FILE__);
 }
@@ -5146,9 +5146,9 @@ TEST_F(NVFuserTest, FusionIntegerType_CUDA) {
   std::vector<c10::IValue> inputs({t0});
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto cg_outputs = fe.runFusion(inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, inputs);
+  auto cg_outputs = ke.runFusion(inputs);
 
   auto i2 = int64_val;
   auto i3 = int_val;
@@ -5209,16 +5209,16 @@ TEST_F(NVFuserTest, FusionVectorizeWelford1_CUDA) {
   at::Tensor t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref_avg = t0.mean({0});
   auto ref_var = t0.var({0}, false) * shape[0];
   auto ref_N = at::ones({shape[1]}, options_int) * shape[0];
 
   testValidate(
-      fe.kernel(),
+      ke.kernel(),
       cg_outputs,
       {t0},
       {ref_avg, ref_var, ref_N},
@@ -5282,16 +5282,16 @@ TEST_F(NVFuserTest, FusionVectorizeWelford2_CUDA) {
   at::Tensor t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref_avg = t0.to(at::kDouble).mean({0});
   auto ref_var = t0.to(at::kDouble).var({0}, false) * shape[0];
   auto ref_N = at::ones({shape[1]}, options_int) * shape[0];
 
   testValidate(
-      fe.kernel(),
+      ke.kernel(),
       cg_outputs,
       {t0},
       {ref_avg, ref_var, ref_N},
@@ -5320,7 +5320,7 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) {
     fusion->addOutput(t7);
   }
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
 
   auto options = at::TensorOptions().device(at::kCUDA, 0);
   at::Tensor t6 = at::tensor({15}, options.dtype(at::kLong));
@@ -5328,7 +5328,7 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) {
   at::Tensor t20 =
       at::tensor({12}, options.dtype(at::kLong)).expand({1, 1, 1, 1});
 
-  auto cg_outputs = fec.runFusionWithInputs({t6, t15, t20});
+  auto cg_outputs = executor_cache.runFusionWithInputs({t6, t15, t20});
 
   auto sample_total = at::sum(t15, {0, 1, 2, 3}, true);
   auto sample_mean = at::div(sample_total, t20);
@@ -5338,7 +5338,12 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) {
   auto t7 = at::div(total, t6);
 
   testValidate(
-      fec.fusion(), cg_outputs, {t6, t15, t20}, {t7}, __LINE__, __FILE__);
+      executor_cache.fusion(),
+      cg_outputs,
+      {t6, t15, t20},
+      {t7},
+      __LINE__,
+      __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionExprSortMatmulLikeSchedule_CUDA) {
@@ -5379,11 +5384,11 @@ TEST_F(NVFuserTest, FusionExprSortMatmulLikeSchedule_CUDA) {
   at::Tensor t0 = at::randn({M1, M2, K1, K2}, options);
   at::Tensor t1 = at::randn({N1, N2, K1, K2}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
-  testValidate(fe.kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionFloatConstantWhere_CUDA) {
@@ -5439,19 +5444,19 @@ TEST_F(NVFuserTest, FusionCpAsyncCommitWait_CUDA) {
   at::Tensor t0 = at::randn({12800, 8, 8, 8}, options);
 
-  FusionExecutor fe;
+  KernelExecutor ke;
   if (!deviceMajorMinorCheck(8)) {
     ASSERT_THAT(
-        [&]() { fe.compileFusion(&fusion, {t0}); },
+        [&]() { ke.compileFusion(&fusion, {t0}); },
         testing::ThrowsMessage(testing::HasSubstr(
             "Reason: LoadStoreOpType::CpAsync requires Ampere")));
     GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
   } else {
-    fe.compileFusion(&fusion, {t0});
+    ke.compileFusion(&fusion, {t0});
   }
 
-  auto cg_outputs = fe.runFusion({t0});
-  testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
+  auto cg_outputs = ke.runFusion({t0});
+  testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
 // Repro of issue #2459
@@ -5514,14 +5519,14 @@ TEST_F(NVFuserTest, FusionClearThreadPredicateByRAWSync_CUDA) {
   std::vector<c10::IValue> inputs = {t0};
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto cg_outputs = fe.runFusion(inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, inputs);
+  auto cg_outputs = ke.runFusion(inputs);
 
   auto t3 = t0.sum({1}).sum({0});
   auto t6 = t0.sum({1});
 
-  testValidate(fe.kernel(), cg_outputs, inputs, {t3, t6}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), cg_outputs, inputs, {t3, t6}, __LINE__, __FILE__);
 }
 
 namespace {
@@ -5636,15 +5641,15 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitShared_CUDA) {
   std::vector<c10::IValue> inputs = {t0, t1};
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto cg_outputs = fe.runFusion(inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, inputs);
+  auto cg_outputs = ke.runFusion(inputs);
 
   auto ref_t1 = t0.sum({0});
   auto ref_t4 = t1.exp();
 
   testValidate(
-      fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t4}, __LINE__, __FILE__);
+      ke.kernel(), cg_outputs, inputs, {ref_t1, ref_t4}, __LINE__, __FILE__);
 }
 
 // Repro of issue #2487
@@ -5690,15 +5695,15 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) {
   std::vector<c10::IValue> inputs = {t0, t1};
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto cg_outputs = fe.runFusion(inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, inputs);
+  auto cg_outputs = ke.runFusion(inputs);
 
   auto ref_t1 = t0.sum({0});
   auto ref_t3 = t1.exp();
 
   testValidate(
-      fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__);
+      ke.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionTypePromotionATenConsistency_CUDA) {
@@ -5763,73 +5768,73 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) {
           .getSmallestIndexTypeOfArguments() == PrimDataType::Int32);
 
   {
-    FusionExecutor fe;
+    KernelExecutor ke;
     // Lower the kernel with large inputs and int64 index type.
     CompileParams compile_opts = {.index_type = PrimDataType::Int};
-    fe.compileFusion(&fusion, large_inputs, LaunchParams(), compile_opts);
+    ke.compileFusion(&fusion, large_inputs, LaunchParams(), compile_opts);
     NVF_CHECK(
-        fe.kernel()->indexType() == PrimDataType::Int,
+        ke.kernel()->indexType() == PrimDataType::Int,
         "Unexpected kernel index type: ",
-        fe.kernel()->indexType());
+        ke.kernel()->indexType());
     // Since the index type is int64, both small and large inputs
     // should work fine
-    fe.runFusion(small_inputs);
-    fe.runFusion(large_inputs);
+    ke.runFusion(small_inputs);
+    ke.runFusion(large_inputs);
   }
 
   {
-    FusionExecutor fe;
+    KernelExecutor ke;
     // Lower the kernel with small inputs and int64 index type.
     CompileParams compile_opts = {.index_type = PrimDataType::Int};
-    fe.compileFusion(&fusion, small_inputs, LaunchParams(), compile_opts);
+    ke.compileFusion(&fusion, small_inputs, LaunchParams(), compile_opts);
     NVF_CHECK(
-        fe.kernel()->indexType() == PrimDataType::Int,
+        ke.kernel()->indexType() == PrimDataType::Int,
         "Unexpected kernel index type: ",
-        fe.kernel()->indexType());
+        ke.kernel()->indexType());
     // Since the index type is int64, both small and large inputs
     // should work fine
-    fe.runFusion(small_inputs);
-    fe.runFusion(large_inputs);
+    ke.runFusion(small_inputs);
+    ke.runFusion(large_inputs);
   }
 
   {
-    FusionExecutor fe;
+    KernelExecutor ke;
     LaunchParams launch_params;
     CompileParams compile_opts = {.index_type = PrimDataType::Int32};
-    fe.compileFusion(&fusion, small_inputs, launch_params, compile_opts);
+    ke.compileFusion(&fusion, small_inputs, launch_params, compile_opts);
     NVF_CHECK(
-        fe.kernel()->indexType() == PrimDataType::Int32,
+        ke.kernel()->indexType() == PrimDataType::Int32,
         "Unexpected kernel index type: ",
-        fe.kernel()->indexType());
+        ke.kernel()->indexType());
     // This should complete successfully as the arguments are small
     // enough to use the int32 index type
-    fe.runFusion(small_inputs);
+    ke.runFusion(small_inputs);
     // This should fail as the Kernel is already compiled for Int32, but
     // the arguments are too large
     CompileParams compile_opts_large = {.index_type = PrimDataType::Int};
     EXPECT_THAT(
         [&]() {
-          fe.runFusion(large_inputs, launch_params, compile_opts_large);
+          ke.runFusion(large_inputs, launch_params, compile_opts_large);
         },
         testing::ThrowsMessage(testing::HasSubstr(
             "Kernel index type and compilation index type don't match")));
   }
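// The index-type blocks above pin the kernel's index width at compile time
// and then check how runs with mismatched arguments behave. A condensed
// sketch of the two outcomes, using only types from the surrounding hunks
// (small_inputs/large_inputs are the test's own argument sets):
//
//   KernelExecutor ke;
//   CompileParams opts32 = {.index_type = PrimDataType::Int32};
//   ke.compileFusion(&fusion, small_inputs, LaunchParams(), opts32);
//   ke.runFusion(small_inputs);  // fits in 32-bit indexing, succeeds
//   // Arguments that require 64-bit indexing are rejected at run time
//   // rather than silently overflowing indices.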
   {
-    FusionExecutor fe;
+    KernelExecutor ke;
     // Lower the kernel with large inputs and int32 index type.
     CompileParams compile_opts = {.index_type = PrimDataType::Int32};
     // This should fail due to the conflict
     EXPECT_THAT(
         [&]() {
-          fe.compileFusion(
+          ke.compileFusion(
               &fusion, large_inputs, LaunchParams(), compile_opts);
         },
         testing::ThrowsMessage(testing::HasSubstr(
@@ -6034,13 +6039,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteBroadcastedSoftmaxInput_CUDA) {
   at::Tensor t1 = at::ones(shape1, options);
   std::vector<c10::IValue> inputs = {t0, t1};
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
 
   // check thread_pred and write_stride
-  const auto& fe = fec.getMostRecentKernelRuntime()->executors().at(0);
-  auto kernel = fe.kernel();
-  const auto& thread_pred_map = fe.threadPredMap();
+  const auto& ke =
+      executor_cache.getMostRecentKernelRuntime()->executors().at(0);
+  auto kernel = ke.kernel();
+  const auto& thread_pred_map = ke.threadPredMap();
   for (const auto expr : kernel->exprs()) {
     auto tv = ir_utils::getTvOutput(expr);
     if (tv && tv->name() == 15 && tv->getMemoryType() == MemoryType::Global) {
@@ -6054,7 +6060,7 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteBroadcastedSoftmaxInput_CUDA) {
     }
   }
 
-  testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__);
+  testValidate(executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) {
@@ -6089,13 +6095,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) {
   at::Tensor t1 = at::randn(shape1, options);
   std::vector<c10::IValue> inputs = {t0, t1};
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
 
   // check thread_pred and write_stride
-  const auto& fe = fec.getMostRecentKernelRuntime()->executors().at(0);
-  auto kernel = fe.kernel();
-  const auto& thread_pred_map = fe.threadPredMap();
+  const auto& ke =
+      executor_cache.getMostRecentKernelRuntime()->executors().at(0);
+  auto kernel = ke.kernel();
+  const auto& thread_pred_map = ke.threadPredMap();
 
   for (const auto expr : kernel->exprs()) {
     auto tv = ir_utils::getTvOutput(expr);
@@ -6110,7 +6117,8 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) {
     }
   }
 
-  testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__);
 };
 
 // Test case where [B1,I2,I3] is merged to [B1I2I3]
@@ -6189,13 +6197,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteDifferentConcretizedDomains_CUDA) {
         testing::ThrowsMessage(testing::HasSubstr(
            "Producer is required to be in Global Memory based on parallelization strategy. RAW flags: (blockIdx.x)")));
   } else {
-    FusionExecutorCache fec(std::move(fusion_ptr));
-    auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
+    FusionExecutorCache executor_cache(std::move(fusion_ptr));
+    auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
 
-    auto optimized_fusion = fec.getMostRecentKernelRuntime();
+    auto optimized_fusion = executor_cache.getMostRecentKernelRuntime();
     NVF_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen!");
 
-    testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__);
+    testValidate(
+        executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__);
   }
 };
 runTest(true);
@@ -6239,13 +6248,13 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonOutput_CUDA) {
   at::Tensor t1 = at::randn({32, 64}, options);
   std::vector<c10::IValue> inputs = {t0, t1};
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion_ptr.get(), inputs);
-  auto cg_outputs = fe.runFusion(inputs);
+  KernelExecutor ke;
+  ke.compileFusion(fusion_ptr.get(), inputs);
+  auto cg_outputs = ke.runFusion(inputs);
 
   // check thread_pred
-  auto kernel = fe.kernel();
-  const auto& thread_pred_map = fe.threadPredMap();
+  auto kernel = ke.kernel();
+  const auto& thread_pred_map = ke.threadPredMap();
 
   for (const auto expr : kernel->exprs()) {
     auto tv = ir_utils::getTvOutput(expr);
@@ -6303,13 +6312,13 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonNeighbor_CUDA) {
   at::Tensor t1 = at::randn({8, 7, 10, 12, 9}, options);
   std::vector<c10::IValue> inputs = {t0, t1};
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion_ptr.get(), inputs);
-  auto cg_outputs = fe.runFusion(inputs);
+  KernelExecutor ke;
+  ke.compileFusion(fusion_ptr.get(), inputs);
+  auto cg_outputs = ke.runFusion(inputs);
 
   // check thread_pred
-  auto kernel = fe.kernel();
-  const auto& thread_pred_map = fe.threadPredMap();
+  auto kernel = ke.kernel();
+  const auto& thread_pred_map = ke.threadPredMap();
 
   for (const auto expr : kernel->exprs()) {
     auto tv = ir_utils::getTvOutput(expr);
@@ -6759,9 +6768,9 @@ TEST_F(ExpandedBroadcastGlobalIntermediateTest, TheTest_CUDA) {
   at::Tensor t0 = at::randn({2, 1, 2}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion_ptr.get(), {t0});
-  auto cg_output = fe.runFusion({t0}).at(0);
+  KernelExecutor ke;
+  ke.compileFusion(fusion_ptr.get(), {t0});
+  auto cg_output = ke.runFusion({t0}).at(0);
 
   ASSERT_EQ(cg_output.size(0), 2);
   ASSERT_EQ(cg_output.size(1), (1L << 60L));
@@ -6808,10 +6817,10 @@ TEST_F(NVFuserTest, FusionTestWarnRegisterSpill_CUDA) {
   auto compile_opts = heuristic_params->cparams;
   compile_opts.maxrregcount = 32;
   compile_opts.enable_ptxas_verbose = true;
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       &fusion, {aten_input}, heuristic_params->lparams, compile_opts);
-  auto cg_outputs = fe.runFusion({aten_input});
+  auto cg_outputs = ke.runFusion({aten_input});
 
   // validate results
   testValidate(
@@ -6926,9 +6935,9 @@ TEST_F(NVFuserTest, IsFinite_CUDA) {
   std::array<double, 3> data{1.0, INFINITY, NAN};
   const auto input = at::from_blob(data.data(), {3}, {1}).to(options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {input});
-  const auto output = fe.runFusion({input});
+  KernelExecutor ke;
+  ke.compileFusion(fusion, {input});
+  const auto output = ke.runFusion({input});
 
   testValidate(fusion, output, {input}, __LINE__, __FILE__);
 }
@@ -7026,8 +7035,8 @@ TEST_F(NVFuserTest, FusionOptionsGuard_CUDA) {
   // capture stdout and check stdout contains register spill warning
   captureStdout();
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       &fusion,
      {aten_input},
      heuristic_params->lparams,
@@ -7070,18 +7079,18 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) {
   auto tv1 = add(tv0, tv0);
   fusion->addOutput(tv1);
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto a5 = at::zeros({5}, options);
   auto a6 = at::zeros({6}, options);
   auto a7 = at::zeros({7}, options);
 
-  fec.runFusionWithInputs({a5});
+  executor_cache.runFusionWithInputs({a5});
 
-  auto numRuntimes = [&fec]() -> size_t {
+  auto numRuntimes = [&executor_cache]() -> size_t {
     // this is a map from device/concretization pairs to vectors of runtimes
-    const auto& runtime_map = fec.getKernelRuntimes();
+    const auto& runtime_map = executor_cache.getKernelRuntimes();
     return runtime_map
         .begin() // There should be only one device/concretization pair
         ->second.size();
@@ -7091,7 +7100,7 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) {
     DisableOptionsGuard og;
     DisableOptionsGuard::getCurOptions().unset(DisableOption::KernelReuse);
 
-    fec.runFusionWithInputs({a6});
+    executor_cache.runFusionWithInputs({a6});
 
     // Since kernel reuse is enabled, we should not generate a new runtime
    EXPECT_EQ(numRuntimes(), 1);
@@ -7101,7 +7110,7 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) {
     DisableOptionsGuard og;
     DisableOptionsGuard::getCurOptions().set(DisableOption::KernelReuse);
 
-    fec.runFusionWithInputs({a7});
+    executor_cache.runFusionWithInputs({a7});
 
     // Disabling reuse means we should get a new runtime
     EXPECT_EQ(numRuntimes(), 2);
@@ -7186,9 +7195,9 @@ TEST_F(NVFuserTest, FusionLayerNormSharedMemoryBuffer_CUDA) {
         "Shouldn't use shared memory buffer!");
   }
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs =
-      fec.runFusionWithInputs({aten_input, aten_weight, aten_bias});
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs(
+      {aten_input, aten_weight, aten_bias});
 
   testValidate(
       &fusion_copy,
@@ -7260,8 +7269,8 @@ TEST_F(NVFuserTest, FusionInstanceNormNHWC_CUDA) {
     outputs.push_back(t4);
   }
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
 
   testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__);
 }
@@ -7401,9 +7410,9 @@ TEST_F(NVFuserTest, AllInputDtypes) {
   CompileParams opt{.index_type = index_type};
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), args, LaunchParams{}, opt);
-  auto outputs = fe.runFusion(args, LaunchParams{}, opt);
+  KernelExecutor ke;
+  ke.compileFusion(fusion.get(), args, LaunchParams{}, opt);
+  auto outputs = ke.runFusion(args, LaunchParams{}, opt);
 
   auto kernel_result = outputs.at(0).item();
   auto expect = ee.evaluate(output).as<at::Tensor>().item();
@@ -7521,9 +7530,9 @@ TEST_F(NVFuserTest, OpaqueTupleAsComplex) {
   KernelArgumentHolder args;
   args.push(Opaque(std::array<double, 2>{1.2, 3.4}));
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-  auto outputs = fe.runFusion(args);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion);
+  auto outputs = ke.runFusion(args);
 
   EXPECT_EQ(
       outputs.at(0).item<c10::complex<double>>(),
      c10::complex<double>(1.2, 3.4));
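// The reuse test above counts runtimes per concretization to observe the
// KernelReuse option. A sketch of the counting helper, restating the lambda
// from the hunk (the single-entry assumption comes from the test itself):
//
//   auto numRuntimes = [&executor_cache]() -> size_t {
//     const auto& runtime_map = executor_cache.getKernelRuntimes();
//     // The test only ever exercises one device/concretization pair.
//     return runtime_map.begin()->second.size();
//   };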
@@ -7548,9 +7557,9 @@ TEST_F(NVFuserTest, StructConstruct) {
   fusion.addOutput(tv);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-  auto outputs = fe.runFusion({1.2, 3.4});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion);
+  auto outputs = ke.runFusion({1.2, 3.4});
 
   EXPECT_EQ(
       outputs.at(0).item<c10::complex<double>>(),
       c10::complex<double>(1.2, 3.4));
@@ -7586,12 +7595,12 @@ TEST_F(NVFuserTest, VectorizationStrideValidation) {
   auto t0 = at::randn(shape, options).expand({-1, 5, -1});
   std::vector<c10::IValue> aten_inputs({t0});
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, aten_inputs);
 
   // This previously triggered a false positive error with the stride
   // validation
-  auto cg_outputs = fe.runFusion(aten_inputs);
+  auto cg_outputs = ke.runFusion(aten_inputs);
 
   ASSERT_TRUE(cg_outputs[0].equal(t0));
 }
@@ -7615,10 +7624,10 @@ TEST_F(NVFuserTest, ConstLongExpressions) {
   auto tv0 = full({}, s1, DataType::Int);
   fusion->addOutput(tv0);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion);
+  KernelExecutor ke;
+  ke.compileFusion(fusion);
 
-  auto outputs = fe.runFusion({});
+  auto outputs = ke.runFusion({});
 
   testValidate(fusion, outputs, {}, __LINE__, __FILE__);
 }
@@ -7687,10 +7696,10 @@ TEST_F(NVFuserTest, PredicateRNGOps) {
   auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
   at::Tensor t0 = at::zeros({2048, size}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {t0});
+  KernelExecutor ke;
+  ke.compileFusion(fusion, {t0});
 
-  auto cg_outputs = fe.runFusion({t0});
+  auto cg_outputs = ke.runFusion({t0});
 }
 
 TEST_F(NVFuserTest, LoweringHook) {
@@ -7849,8 +7858,8 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) {
   NVF_CHECK(kernel_runtime->isSegmented(), "segmentation didn't happen");
   const auto num_segments = kernel_runtime->fusionSegments()->groups().size();
   NVF_CHECK(num_segments == 3, "Expect 3 segments, got: ", num_segments);
-  for (const auto& fe : kernel_runtime->executors()) {
-    for (auto expr : fe.fusion()->exprs()) {
+  for (const auto& ke : kernel_runtime->executors()) {
+    for (auto expr : ke.fusion()->exprs()) {
       if (expr->isA<SliceOp>()) {
         auto slice = expr->as<SliceOp>();
         NVF_CHECK(
@@ -7877,9 +7886,9 @@ TEST_F(NVFuserTest, UnsupportedBFloat) {
   fusion.addInput(tv0);
   fusion.addOutput(tv1);
 
-  FusionExecutor fe;
+  KernelExecutor ke;
   EXPECT_THAT(
-      [&]() { fe.compileFusion(&fusion); },
+      [&]() { ke.compileFusion(&fusion); },
       testing::ThrowsMessage(
          testing::HasSubstr("Reason: Fusion contains BFloat16")));
 }
@@ -7943,9 +7952,9 @@ TEST_F(NVFuserTest, BlockReduction3D) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
   auto ref = t0.sum(0).sum(-1);
   testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
 };
@@ -7986,9 +7995,9 @@ TEST_F(NVFuserTest, ReverseMerge) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({11, 12}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   ASSERT_TRUE(t0.equal(cg_outputs.at(0)));
 }
@@ -8016,9 +8025,9 @@ TEST_F(NVFuserTest, FusionCpAsyncPredicateAvoidIllegalMemoryAccess) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({m, n}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   ASSERT_TRUE(t0.equal(cg_outputs.at(0)));
 }
@@ -8350,9 +8359,9 @@ TEST_F(NVFuserTest, BroadcastFromNowhereFusion) {
   // TODO: use larger tensor size
   at::Tensor t0 = at::randn({4}, options);
   at::Tensor t1 = at::randn({2, 4}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = ke.runFusion({t0, t1});
 
   testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -8399,8 +8408,8 @@ TEST_F(NVFuserTest, ReplayRFactorMergeBcast) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor at_x = at::ones(input_shape, options);
   std::vector<c10::IValue> aten_inputs = {at_x};
-  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
-  auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -8434,9 +8443,9 @@ TEST_F(NVFuserTest, MultipleDifferentSizeGridReduction) {
   const at::Tensor t1 = at::randn({192}, options);
   const std::vector<c10::IValue> inputs = {t0, t1};
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto cg_outputs = fe.runFusion(inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, inputs);
+  auto cg_outputs = ke.runFusion(inputs);
 
   testValidate(&fusion, cg_outputs, inputs, __LINE__, __FILE__);
 }
@@ -8869,9 +8878,9 @@ TEST_F(NVFuserTest, CpAsyncDataTypeBool) {
   //       "r"((uint32_t)((!b3)))
   //     );
   // If not correctly lowered, would trigger error in compile
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
diff --git a/tests/cpp/test_gpu_compute_with.cpp b/tests/cpp/test_gpu_compute_with.cpp
index 9ef2cfced37..1a505e495f0 100644
--- a/tests/cpp/test_gpu_compute_with.cpp
+++ b/tests/cpp/test_gpu_compute_with.cpp
@@ -164,9 +164,9 @@ TEST_F(NVFuserTest, FusionComputeWith1_CUDA) {
   at::Tensor t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -219,9 +219,9 @@ TEST_F(NVFuserTest, FusionComputeWith2_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({dimx}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false);
 
@@ -261,9 +261,9 @@ TEST_F(NVFuserTest, FusionComputeWith3_CUDA) {
   at::Tensor t0 = at::randn({123}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -307,9 +307,9 @@ TEST_F(NVFuserTest, FusionComputeWith4_CUDA) {
   at::Tensor t0 = at::randn({345, 10}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -346,9 +346,9 @@ TEST_F(NVFuserTest, FusionComputeWith5_CUDA) {
   at::Tensor t0 = at::randn({345, 10}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -449,9 +449,9 @@ TEST_F(NVFuserTest, FusionComputeWith6_CUDA) {
   const std::vector<int64_t> input_shape{N, H, W, C};
   auto t0 = at::randn(input_shape, options_half);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, LaunchParams());
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, LaunchParams());
+  auto cg_outputs = ke.runFusion({t0});
 
   auto t1 = t0.to(at::kFloat);
   auto t2 = t1.mean({0, 1, 2});
diff --git a/tests/cpp/test_gpu_fused_reduction.cpp b/tests/cpp/test_gpu_fused_reduction.cpp
index c29460a4241..0fa61a66397 100644
--- a/tests/cpp/test_gpu_fused_reduction.cpp
+++ b/tests/cpp/test_gpu_fused_reduction.cpp
@@ -115,9 +115,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce1_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({nx}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref = sum(t0).unsqueeze(0) + t0;
 
@@ -164,9 +164,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce2_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({nx}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref = sum(t0).unsqueeze(0) + t0;
 
@@ -212,9 +212,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce3_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({nx, ny}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref = sum(t0, {1}).unsqueeze(-1) + t0;
 
@@ -257,9 +257,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce4_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({nx}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref = (sum(t0) + 1).unsqueeze(0) + t0;
 
@@ -319,9 +319,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce5_CUDA) {
   auto t0 = at::randn({iter, nx}, options);
   auto t5 = at::randn({bdimy, bdimx}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t5});
-  auto cg_outputs = fe.runFusion({t0, t5});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0, t5});
+  auto cg_outputs = ke.runFusion({t0, t5});
 
   auto ref = (sum(t0, {1}) + 1).unsqueeze(-1) + t0;
 
@@ -371,14 +371,14 @@ TEST_F(NVFuserTest, FusionGridAllreduce6_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto t0_double = t0.to(at::kDouble);
   auto ref = t0_double + t0_double.sum({0}).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionGridAllreduceWelford1_CUDA) {
@@ -417,9 +417,9 @@ TEST_F(NVFuserTest, FusionGridAllreduceWelford1_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({nx}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref = (t0.mean({0}).unsqueeze(0) + t0) +
       t0.var({0}, false).unsqueeze(0) * nx;
 
@@ -467,9 +467,9 @@ TEST_F(NVFuserTest, FusionGridAllreduceWelford2_CUDA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({nx, ny}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   auto ref = (sum(t0, {1}) / ny).unsqueeze(-1) + t0;
 
@@ -586,10 +586,10 @@ TEST_F(NVFuserTest, FusionFusedReductionBatchnorm_CUDA) {
   GpuLower gpulw(&fusion);
   validateNoParallelBroadcastExist(gpulw.run());
 
-  FusionExecutor fe;
+  KernelExecutor ke;
   LaunchParams launch_params(2, 2, -1, -1, -1, -1);
-  fe.compileFusion(&fusion, aten_inputs, launch_params);
-  auto cg_outputs = fe.runFusion(aten_inputs, launch_params);
+  ke.compileFusion(&fusion, aten_inputs, launch_params);
+  auto cg_outputs = ke.runFusion(aten_inputs, launch_params);
 
   auto t5 = t0.to(at::kFloat);
   auto t6 = t1.to(at::kFloat);
@@ -653,13 +653,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction1_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto ref = t0.sum({1}) * 2;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Grouping reductions with different ops
@@ -698,13 +698,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction2_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto ref = (t0 + 1).sum({1}) + std::get<0>((t0 + 2).max(1));
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Grouped reduction with different types
@@ -741,13 +741,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction3_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto ref = t0.sum({1}) + t0.to(c10::kDouble).sum({1}).to(c10::kFloat);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Testing validation
@@ -829,11 +829,11 @@ TEST_F(NVFuserTest, FusionGroupedReduction6_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
-  testValidate(fe.kernel(), outputs, {t0}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionGroupedReduction7_CUDA) {
@@ -892,13 +892,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor1_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto ref = t0.sum({0}) * 2;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Rfactoring grouped reductions
@@ -937,13 +937,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor2_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto ref = t0.sum({0}) * 2;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Group reductions of tensors that have computeAt positions set
@@ -983,13 +983,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionAfterComputeAt_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto ref = (t0 + 1).sum({1}) * 2;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) {
@@ -1023,14 +1023,14 @@ TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto t3 = t0.sum({0}).unsqueeze(-1);
   auto ref = t0 + t3 + t3;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Grid reductions of different types
@@ -1076,15 +1076,15 @@ TEST_F(NVFuserTest, FusionGroupAllreduce2_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto t2 = t0.sum({1}).unsqueeze(-1);
   auto t6 = t0.to(c10::kDouble).sum({1}).unsqueeze(-1).to(c10::kFloat);
   auto ref = t0 + t2 + t6;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Grouping 3 grid allreduces
@@ -1124,15 +1124,15 @@ TEST_F(NVFuserTest, FusionGroupAllreduce3_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto t3 = t0 / t0.sum({0}).unsqueeze(0);
   auto t6 = t0 / std::get<0>(t0.max(0)).unsqueeze(0);
   auto t9 = t0 - std::get<0>(t0.min(0)).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {t3, t6, t9}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {t3, t6, t9}, __LINE__, __FILE__);
 }
 // Grouping 8 grid allreduces
@@ -1177,9 +1177,9 @@ TEST_F(NVFuserTest, FusionGroupAllreduce4_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   at::Tensor ref = t0;
   for (int i = 0; i < num_reductions; ++i) {
@@ -1189,7 +1189,7 @@ TEST_F(NVFuserTest, FusionGroupAllreduce4_CUDA) {
     ref = add(ref, bc);
   }
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Variation of FusionGroupAllreduce5_CUDA but with different
@@ -1265,9 +1265,9 @@ TEST_F(NVFuserTest, FusionGroupAllreduce5_CUDA) {
   std::vector<at::indexing::TensorIndex> indices({at::indexing::Slice(0, 10)});
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, aten_inputs);
+  auto outputs = ke.runFusion(aten_inputs);
 
   auto t3 = t0 / t0.sum({0}).unsqueeze(0).to(at::kComplexDouble);
   auto t7 = t4 / t4.sum({0}).unsqueeze(0).to(at::kComplexDouble);
@@ -1275,7 +1275,7 @@ TEST_F(NVFuserTest, FusionGroupAllreduce5_CUDA) {
   auto t15 = t12 / t12.sum({0}).unsqueeze(0).to(at::kComplexDouble);
   auto t19 = t16 / t16.sum({0}).unsqueeze(0).to(at::kComplexDouble);
   auto ref = t3 + t7 + t11 + t15 + t19;
-  testValidate(fe.kernel(), outputs, aten_inputs, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, aten_inputs, {ref}, __LINE__, __FILE__);
 }
 
 // Persistent batchnorm backward with grouped allreduce
@@ -1428,14 +1428,14 @@ TEST_F(NVFuserTest, FusionPersistentBNBackwardAllreduce_CUDA) {
   GpuLower gpulw(&fusion);
   validateNoParallelBroadcastExist(gpulw.run());
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, aten_inputs);
 
   if (bidx * bidy > deviceSMCount()) {
     GTEST_SKIP() << "Not enough SMs to run this test";
   }
 
-  auto outputs = fe.runFusion(aten_inputs);
+  auto outputs = ke.runFusion(aten_inputs);
 
   std::vector<int64_t> at_reduction_axes;
   std::copy(
@@ -1483,7 +1483,7 @@ TEST_F(NVFuserTest, FusionPersistentBNBackwardAllreduce_CUDA) {
   }
 
   testValidate(
-      fe.kernel(), outputs, aten_inputs, {at_grad_input}, __LINE__, __FILE__);
+      ke.kernel(), outputs, aten_inputs, {at_grad_input}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) {
@@ -1534,14 +1534,14 @@ TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) {
   auto t0 = at::randn(shape, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0});
+  auto outputs = ke.runFusion({t0});
 
   auto t0_double = t0.to(at::kDouble);
   auto ref = (t0_double + 1).sum({0}) + (t0_double + 2).sum({0});
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
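// The grouped allreduce tests above and below all validate against the same
// ATen recipe: reduce, broadcast back, and combine with the input. A sketch
// of the reference computation shared by these hunks (double precision
// absorbs the reordering that grouped reductions introduce):
//
//   auto t0_double = t0.to(at::kDouble);
//   auto ref = t0_double + t0_double.sum({0}).unsqueeze(0);
//   testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);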
Relies on re-entrant @@ -1649,9 +1649,9 @@ TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) { auto t2 = at::randn({shape.back()}, options_float); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto t0_double = t0.to(at::kDouble); auto t1_double = t1.to(at::kDouble); @@ -1664,7 +1664,7 @@ TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) { (t1_double - t2_double.unsqueeze(0).unsqueeze(0).unsqueeze(0)); auto t9 = t8.sum(at_reduction_axes); - testValidate(fe.kernel(), outputs, aten_inputs, {t5, t9}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, aten_inputs, {t5, t9}, __LINE__, __FILE__); } // Test the grouped grid allreduce with BN-like outer reductions @@ -1780,9 +1780,9 @@ TEST_F( auto t2 = at::randn({shape.back()}, options_float); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto t0_double = t0.to(at::kDouble); auto t1_double = t1.to(at::kDouble); @@ -1801,7 +1801,7 @@ TEST_F( auto t13 = t1_double + t12; testValidate( - fe.kernel(), outputs, aten_inputs, {t11, t13}, __LINE__, __FILE__); + ke.kernel(), outputs, aten_inputs, {t11, t13}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) { @@ -1868,14 +1868,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Test grouping of two domains @@ -1946,14 +1946,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Group both expressions and iterations @@ -2030,16 +2030,16 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); auto t0_double = t0.to(at::kDouble); auto t4 = t0_double + 1 + (t0_double + 1).sum({0}).unsqueeze(0); auto t8 = t0_double + 2 + (t0_double + 2).sum({0}).unsqueeze(0); auto ref = t4 + t8; - 
testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // ParallelType::Group with computeAt @@ -2122,14 +2122,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) { @@ -2183,14 +2183,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.mean({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Test grouping of two domains @@ -2248,14 +2248,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto outputs = ke.runFusion({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.mean({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Follows the pattern of persistent outer grid welford in batchnorm @@ -2385,8 +2385,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) { params.N, params.H, params.W, params.C}; auto t0 = at::randn(input_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); // Skip the rest of this test size if the required number of SMs // exceeds the available SM count @@ -2397,7 +2397,7 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) { return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto t1 = t0.to(at::kDouble); auto t2 = t1.mean({0, 1, 2}).unsqueeze(0).unsqueeze(0).unsqueeze(0); @@ -2541,9 +2541,9 @@ TEST_F(NVFuserTest, FusionCrossEntropyGatherPattern_CUDA) { at::randint(0, num_classes, {batch_size}, options.dtype(at::kLong)); std::vector inputs = {at_log_probs, at_labels}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto cg_outputs = ke.runFusion(inputs); auto ref = at::gather(at_log_probs, 1, at_labels.unsqueeze(1)).squeeze(); diff --git a/tests/cpp/test_gpu_indexing_ops.cpp b/tests/cpp/test_gpu_indexing_ops.cpp index 456fe0c2e46..95feb635954 100644 --- 
a/tests/cpp/test_gpu_indexing_ops.cpp +++ b/tests/cpp/test_gpu_indexing_ops.cpp @@ -396,9 +396,9 @@ TEST_F(NVFuserTest, FusionIndexSelect_Sum_CUDA) { std::vector aten_inputs = {input1, input0, input_idx}; auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::Reduction, aten_inputs); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, heuristic_params->lparams); - fe.runFusion(aten_inputs, {cg_output}, heuristic_params->lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs, heuristic_params->lparams); + ke.runFusion(aten_inputs, {cg_output}, heuristic_params->lparams); auto tv0_ref = at::index_select(input0, 0, input_idx); at::Tensor tv2_ref = tv0_ref * input1; diff --git a/tests/cpp/test_gpu_outer_reduction.cpp b/tests/cpp/test_gpu_outer_reduction.cpp index afaf89b7aeb..a0e73a69b5c 100644 --- a/tests/cpp/test_gpu_outer_reduction.cpp +++ b/tests/cpp/test_gpu_outer_reduction.cpp @@ -115,11 +115,11 @@ TEST_F(OuterReductionTest, GroupedGridWelfordOuterOpt) { auto t0 = at::randn(input_shape, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); NVF_CHECK( - fe.kernel()->summary().has_outer_grouped_grid_welford == + ke.kernel()->summary().has_outer_grouped_grid_welford == params.should_use_opt, (params.should_use_opt ? "Failed to use the optimized implementation" : "Should not use the optimized implementation"), @@ -132,7 +132,7 @@ TEST_F(OuterReductionTest, GroupedGridWelfordOuterOpt) { ", ", params.bidx); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto t1 = t0; auto t2 = params.dtype == DataType::Half ? t1.to(at::kFloat) : t1; @@ -638,8 +638,8 @@ void grid_persistent_reduction_outer_norm_like( const std::vector input_shape{N, HW, HW, C}; auto t0 = at::randn(input_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -648,12 +648,12 @@ void grid_persistent_reduction_outer_norm_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion({t0}); + cg_outputs = ke.runFusion({t0}); } } @@ -737,8 +737,8 @@ void grid_persistent_welford_outer_norm_like( const std::vector input_shape{N, HW, HW, C}; auto t0 = at::randn(input_shape, options_half); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -747,12 +747,12 @@ void grid_persistent_welford_outer_norm_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion({t0}); + cg_outputs = ke.runFusion({t0}); } } @@ -898,8 +898,8 @@ void grid_persistent_batchnorm_manual( std::vector aten_inputs( {at_input_nvfuser, at_weight, at_bias, at_running_mean, at_running_var}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -908,7 +908,7 @@ void 
grid_persistent_batchnorm_manual( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); cg_outputs.at(2) = cg_outputs.at(2).permute({0, 3, 1, 2}); auto at_output = at::batch_norm( @@ -923,7 +923,7 @@ void grid_persistent_batchnorm_manual( true); testValidate( - fe.kernel(), + ke.kernel(), {cg_outputs.at(2)}, aten_inputs, {at_output}, @@ -934,7 +934,7 @@ void grid_persistent_batchnorm_manual( if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.runFusion(aten_inputs); } } } @@ -1037,8 +1037,8 @@ void grid_persistent_reduction_outer_norm_bwd_like( auto t1 = at::randn(input_shape, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -1047,12 +1047,12 @@ void grid_persistent_reduction_outer_norm_bwd_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.runFusion(aten_inputs); } } @@ -1224,8 +1224,8 @@ void grid_persistent_batchnorm_bwd_manual( std::vector cg_outputs; - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); + KernelExecutor ke; + ke.compileFusion(fusion_ptr.get(), aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -1234,7 +1234,7 @@ void grid_persistent_batchnorm_bwd_manual( << params.bidx * bidy << ", available: " << deviceSMCount(); } - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.runFusion(aten_inputs); // Permute grad_input output cg_outputs.at(0) = cg_outputs.at(0).permute({0, 3, 1, 2}); @@ -1251,7 +1251,7 @@ void grid_persistent_batchnorm_bwd_manual( {true, true, true}); testValidate( - fe.kernel(), + ke.kernel(), cg_outputs, aten_inputs, {std::get<0>(at_output), std::get<1>(at_output), std::get<2>(at_output)}, @@ -1262,7 +1262,7 @@ void grid_persistent_batchnorm_bwd_manual( if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.runFusion(aten_inputs); } } } @@ -2181,22 +2181,20 @@ TEST_F(OuterReductionTest, IterGroupedBlockReduction) { rparams->unroll_factor_iter_dom = vect_factor; scheduler->schedule(&fusion, rparams); - FusionExecutor fusion_executor; - fusion_executor.compileFusion( - &fusion, aten_inputs, heuristic_params->lparams); - auto cg_outputs = - fusion_executor.runFusion(aten_inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs, heuristic_params->lparams); + auto cg_outputs = ke.runFusion(aten_inputs, heuristic_params->lparams); // lowering & check iteration grouped reductions NVF_CHECK( - fusion_executor.kernel()->summary().has_iter_grouped_reductions, + ke.kernel()->summary().has_iter_grouped_reductions, "There must be iter domain grouped reductions."); NVF_CHECK( - fusion_executor.kernel()->summary().num_grouped_iterations == vect_factor, + ke.kernel()->summary().num_grouped_iterations == vect_factor, "Expected ", vect_factor, " grouped iterations, found ", - fusion_executor.kernel()->summary().num_grouped_iterations); + ke.kernel()->summary().num_grouped_iterations); testValidate( 
&fusion, @@ -2292,9 +2290,9 @@ void shmooTestsOfIterGroupedBlockOrGridReduction( auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.runFusion(aten_inputs, lparams); testValidate( &fusion, @@ -2543,15 +2541,15 @@ TEST_F(OuterReductionTest, IterGroupedMultipleReductions) { << "Expect 2 Iteration domain grouped grid reductions, got: " << num_iter_grouped_reductions; - FusionExecutor fe; + KernelExecutor ke; std::vector shape({redu_dim, iter_dim}); auto options = at::TensorOptions().device(at::kCUDA, 0); auto t0 = at::randn(shape, options); auto t1 = at::randn(shape, options); std::vector aten_inputs({t0, t1}); - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + ke.compileFusion(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.runFusion(aten_inputs, lparams); testValidate( &fusion, @@ -2595,10 +2593,10 @@ TEST_F(NVFuserTest, SmallOuterBlockReductionIssue2766) { auto t0 = at::randn({shape[0] * shape[1], shape[2]}, options); std::vector inputs({t0}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_gpu_transpose.cpp b/tests/cpp/test_gpu_transpose.cpp index 11c326878fc..947283d4975 100644 --- a/tests/cpp/test_gpu_transpose.cpp +++ b/tests/cpp/test_gpu_transpose.cpp @@ -547,9 +547,9 @@ TEST_F(TransposeTest, FusionManualScheduleTransposeComplexDAG1) { at::Tensor input1 = at::randn({1024, 512, 256}, options); at::Tensor input2 = at::randn({512, 256, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1, input2}); - auto outputs = fe.runFusion({input0, input1, input2}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input0, input1, input2}); + auto outputs = ke.runFusion({input0, input1, input2}); testValidate(&fusion, outputs, {input0, input1, input2}, __LINE__, __FILE__); } @@ -987,9 +987,9 @@ TEST_F(TransposeTest, FusionTransposeBankConflict9) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({32, 32, 2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion); + auto outputs = ke.runFusion({input}); testValidate(&fusion, outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu_view.cpp b/tests/cpp/test_gpu_view.cpp index 725469b2647..e4a322860f7 100644 --- a/tests/cpp/test_gpu_view.cpp +++ b/tests/cpp/test_gpu_view.cpp @@ -134,9 +134,9 @@ TEST_F(GpuViewTest, FusionViewAsRealOutput) { at::Tensor at_y = at::randn(output_shape, out_options); std::vector aten_inputs = {at_x, at_bias, at_y}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -279,8 +279,8 @@ void reductionViewAddFusion( at::Tensor at_bias = 
at::randn(bias_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -445,8 +445,8 @@ void persistentViewAddFusion( at::Tensor at_bias = at::randn(bias_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -637,9 +637,9 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain) { auto t0 = at::randn({2, 3}, options); auto t1 = at::randn({1, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -668,8 +668,8 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain2) { at::Tensor at_bias = at::randn(output_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -704,8 +704,8 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain3) { at::Tensor at_z = at::randn(other_shape, options); std::vector aten_inputs = {at_x, at_y, at_z}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -850,9 +850,9 @@ TEST_F(GpuViewTest, FusionFlattenAfterUnsqueezeOutput) { x_add_bias->computeAt(x_reshape, 1); x_reshape->axis(0)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -914,15 +914,15 @@ TEST_F(GpuViewTest, FusionExpandRepro) { at::Tensor at_y = at::randn(input_shape2, options); std::vector aten_inputs = {at_x, at_y}; - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); LaunchParams l_params; - auto outputs = fe.runFusion(aten_inputs, {}, l_params, {}); + auto outputs = ke.runFusion(aten_inputs, {}, l_params, {}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); // second run to verify cached output allocation - outputs = fe.runFusion(aten_inputs, {}, l_params, {}); + outputs = ke.runFusion(aten_inputs, {}, l_params, {}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1349,9 +1349,9 @@ TEST_F(GpuViewTest, FusionPwiseViewSchedule) { at::Tensor t0 = 
at::randn({x, y, z}, options); at::Tensor t3 = at::randn({x, y, z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t3}); + auto cg_outputs = ke.runFusion({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, __LINE__, __FILE__); } @@ -1415,9 +1415,9 @@ TEST_F(GpuViewTest, FusionSumViewSchedule) { auto t5 = t4.sum({1}); auto t6 = t0 + t3; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t3}); + auto cg_outputs = ke.runFusion({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t5, t6}, __LINE__, __FILE__); } @@ -1944,9 +1944,9 @@ TEST_F(GpuViewTest, FusionReshapeMapping) { at::Tensor t0 = at::randn({w, x, y * z}, options); at::Tensor t3 = at::randn({w, x * y, z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t3}); + auto cg_outputs = ke.runFusion({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, __LINE__, __FILE__); } @@ -2318,9 +2318,9 @@ TEST_F(GpuViewTest, ExpandedBroadcast) { at::Tensor in_tensor = at::randn({4, 5}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - FusionExecutor fe; - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor actual_out_tensor = fe.runFusion({in_tensor})[0]; + KernelExecutor ke; + ke.compileFusion(&fusion, {in_tensor}); + at::Tensor actual_out_tensor = ke.runFusion({in_tensor})[0]; testValidate(&fusion, {actual_out_tensor}, {in_tensor}, __LINE__, __FILE__); } @@ -2697,9 +2697,9 @@ TEST_F(GpuViewTest, FusionMismatchingReshape) { // TODO: use larger tensor size once we are able to successfully parallelize // this fusion. 
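// [Editorial sketch; not part of the patch] The test_gpu_view.cpp hunks above
// exercise both execution paths that this rename distinguishes. A minimal
// sketch of the two idioms, assuming the renamed KernelExecutor API otherwise
// mirrors the old FusionExecutor one:
//
// Manually scheduled fusion: compile once, then launch the single kernel.
KernelExecutor ke;
ke.compileFusion(&fusion, {t0});
auto manual_outputs = ke.runFusion({t0});
//
// Automatically scheduled fusion: the cache owns scheduling and reuses
// compiled kernels across calls (the "two modes" named in the HostIrExecutor
// comment earlier in this patch).
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cached_outputs = executor_cache.runFusionWithInputs({t0});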
at::Tensor t0 = at::randn({2, 3, 5}).to(options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index a651fb5778d..bee0c2979dc 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -346,7 +346,7 @@ TEST_P(HostIrTest, ThreeFusions) { // [Step 8)] Execute the Host program HostIrExecutorParams params; - // we test two different modes of the HostIrExecutor: using FusionExecutor or + // we test two different modes of the HostIrExecutor: using KernelExecutor or // FusionExecutorCache auto [use_fusion_executor_cache] = GetParam(); params.use_fusion_executor_cache = use_fusion_executor_cache; diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 23c48dfc0b7..4289006ec27 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -1773,9 +1773,9 @@ TEST_F(IndexingTest, SmemAllocationDomainForTranspose) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({256, 256}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0}); - auto outputs = fe.runFusion({input0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input0}); + auto outputs = ke.runFusion({input0}); testValidate(&fusion, outputs, {input0}, __LINE__, __FILE__); } @@ -3040,9 +3040,9 @@ TEST_F(PredicateIndexingTest, DoubleBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3139,9 +3139,9 @@ TEST_F(PredicateIndexingTest, CircularBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3306,9 +3306,9 @@ TEST_F(PredicateIndexingTest, UnrolledCircularBuffering) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3387,9 +3387,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3476,9 +3476,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering2) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; 
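// [Editorial sketch; not part of the patch] Every PredicateIndexingTest hunk
// in this file follows the same shape: pin the IdModel-based indexing path
// with an EnableOptionsGuard, then compile and run through the renamed
// executor. Condensed post-rename form, using only calls that appear in the
// surrounding hunks:
EnableOptionsGuard enable_options_guard;
EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
KernelExecutor ke;                    // was: FusionExecutor fe;
ke.compileFusion(&fusion, inputs);    // was: fe.compileFusion(&fusion, inputs);
auto outputs = ke.runFusion(inputs);  // was: fe.runFusion(inputs);
testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);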
- fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3582,9 +3582,9 @@ TEST_P(PredicateIndexingTest, UnswitchedCircularBuffering3) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3661,9 +3661,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering4) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({16}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -3754,9 +3754,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplit1) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -3845,9 +3845,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithUnswitch) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -3940,9 +3940,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithCircularBuffering) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4051,9 +4051,9 @@ TEST_F( at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4136,9 +4136,9 @@ TEST_P(PredicateIndexingTest, UnswitchPredicateIssueRepro681) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); auto ref = t0.to(at::kDouble).sum(); @@ -4296,9 +4296,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithUnswitchAndBroadcast) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = 
fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4419,9 +4419,9 @@ TEST_F(PredicateIndexingTest, UnswitchConsolidationDifferentThreading) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4834,9 +4834,9 @@ TEST_F(ContigIndexingTest, ConcretizedBroadcastMerge) { auto t1 = at::randn({5, 6, 7}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5063,9 +5063,9 @@ TEST_F(ContigPredicateIndexingTest, NonDivisibleSplit1) { at::Tensor t0 = at::randn({10, 20}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } diff --git a/tests/cpp/test_indexing_advanced.cpp b/tests/cpp/test_indexing_advanced.cpp index 1787109a272..eb94b66f9ca 100644 --- a/tests/cpp/test_indexing_advanced.cpp +++ b/tests/cpp/test_indexing_advanced.cpp @@ -72,10 +72,10 @@ TEST_P(AdvancedIndexingTest, InlineBroadcast) { at::Tensor t0 = at::randn({123}, options); at::Tensor t1 = at::randn({3, 123}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); - auto outputs = fe.runFusion({t0, t1}); + auto outputs = ke.runFusion({t0, t1}); testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__); } @@ -117,15 +117,15 @@ TEST_P(AdvancedIndexingTest, 1) { tv2->axis(1)->parallelize(ParallelType::Unroll); tv2->axis(2)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -168,15 +168,15 @@ TEST_P(AdvancedIndexingTest, 2) { tv2->axis(1)->parallelize(ParallelType::Unroll); tv2->axis(2)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -229,9 +229,9 @@ TEST_P(AdvancedIndexingTest, 4) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = 
fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -263,9 +263,9 @@ TEST_P(AdvancedIndexingTest, 5) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -345,9 +345,9 @@ TEST_P(AdvancedIndexingTest, 7) { auto at_t0 = at::randn({numel_x}, options); auto at_t1 = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t1}); - auto cg_outputs = fe.runFusion({at_t0, at_t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {at_t0, at_t1}); + auto cg_outputs = ke.runFusion({at_t0, at_t1}); auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) .to(at::kDouble) @@ -391,9 +391,9 @@ TEST_P(AdvancedIndexingTest, 8) { auto at_t0 = at::randn({numel_x}, options); auto at_t1 = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t1}); - auto cg_outputs = fe.runFusion({at_t0, at_t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {at_t0, at_t1}); + auto cg_outputs = ke.runFusion({at_t0, at_t1}); auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) .to(at::kDouble) @@ -484,9 +484,9 @@ TEST_P(AdvancedIndexingTest, 10) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input1, input2}); + ke.runFusion({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -531,15 +531,15 @@ TEST_P(AdvancedIndexingTest, 11) { tv3->axis(-1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({w, x, y, z}, options); at::Tensor t1 = at::randn({x}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -574,9 +574,9 @@ TEST_P(AdvancedIndexingTest, 12) { std::vector aten_outputs = {t2, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {aten_input}); + auto cg_outputs = ke.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); @@ -623,9 +623,9 @@ TEST_P(AdvancedIndexingTest, 13) { std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -665,9 +665,9 @@ TEST_P(AdvancedIndexingTest, 14) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + 
auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -700,9 +700,9 @@ TEST_P(AdvancedIndexingTest, 15) { at::Tensor t3 = at::randn({bx, by, bz}, options); std::vector aten_inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -732,9 +732,9 @@ TEST_P(AdvancedIndexingTest, 16) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -767,9 +767,9 @@ TEST_P(AdvancedIndexingTest, 17) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -804,13 +804,13 @@ TEST_P(AdvancedIndexingTest, 18) { at::Tensor t1 = at::randn({5, 3}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto cg_outputs = ke.runFusion(inputs); auto ref = (t0.unsqueeze(-1) + t1).sum(); - testValidate(fe.kernel(), cg_outputs, inputs, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), cg_outputs, inputs, {ref}, __LINE__, __FILE__); } TEST_P(AdvancedIndexingTest, 19) { @@ -848,9 +848,9 @@ TEST_P(AdvancedIndexingTest, 19) { at::Tensor t1 = at::randn({5, 11}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -913,9 +913,9 @@ TEST_F(AdvancedIndexingIdModelTest, 20) { at::Tensor t2 = at::randn({7, 13}, options); std::vector inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); #endif @@ -978,9 +978,9 @@ TEST_F(AdvancedIndexingIdModelTest, 21) { auto t6 = at::randn({3, 5, 7}, options); std::vector inputs = {t0, t3, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); #endif @@ -1022,9 +1022,9 @@ TEST_F(AdvancedIndexingIdModelTest, MultiPromotion1) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1119,9 +1119,9 @@ TEST_F(AdvancedIndexingIdModelTest, IndexSplitMerge) { std::vector 
aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); diff --git a/tests/cpp/test_inlining.cpp b/tests/cpp/test_inlining.cpp index 320241e6112..767d6cfc836 100644 --- a/tests/cpp/test_inlining.cpp +++ b/tests/cpp/test_inlining.cpp @@ -48,9 +48,9 @@ TEST_F(InliningTest, InliningMismatchedDims1) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -80,9 +80,9 @@ TEST_F(InliningTest, InliningMismatchedDims2) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -113,9 +113,9 @@ TEST_F(InliningTest, InliningMismatchedDims4) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -150,9 +150,9 @@ TEST_F(InliningTest, InliningBroadcast) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_loop_domain_scheduling.cpp b/tests/cpp/test_loop_domain_scheduling.cpp index 52884529727..04445f0a3d9 100644 --- a/tests/cpp/test_loop_domain_scheduling.cpp +++ b/tests/cpp/test_loop_domain_scheduling.cpp @@ -86,9 +86,9 @@ TEST_F(LoopDomainSchedulingTest, ReshapeSplitThenMerge) { auto t0 = at::randn({10}, options); std::vector inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, inputs); + auto outputs = ke.runFusion(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -147,9 +147,9 @@ TEST_F(LoopDomainSchedulingTest, Slice) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -306,9 +306,9 @@ TEST_F(LoopDomainSchedulingTest, ManyReshape) { auto t0 = at::randn({12}, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = 
fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = t0 * 2; EXPECT_TRUE(ref.equal(cg_outputs[0])); diff --git a/tests/cpp/test_loop_rotation.cpp b/tests/cpp/test_loop_rotation.cpp index 39314552945..a41d69f6ab5 100644 --- a/tests/cpp/test_loop_rotation.cpp +++ b/tests/cpp/test_loop_rotation.cpp @@ -76,9 +76,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -169,9 +169,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -278,9 +278,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -389,9 +389,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -526,9 +526,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -662,9 +662,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp index 1c0fca0ac89..dfbc1381f4d 100644 --- a/tests/cpp/test_matmul.cpp +++ b/tests/cpp/test_matmul.cpp @@ -124,19 +124,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmul) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; 
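// [Editorial sketch; not part of the patch] The test_matmul.cpp hunks pair
// the rename with the arch-guard idiom: compilation sits inside
// NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK (presumably skipping devices below
// compute capability 8.0), and the lowered kernel is inspected before launch.
// Condensed post-rename form of the pattern repeated throughout this file:
KernelExecutor ke;
NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
    8,
    0,
    ke.compileFusion(
        &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams));
// Static checks on the compiled kernel before running it.
ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
auto cg_outputs = ke.runFusion({inputs.first, inputs.second});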
NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -185,19 +185,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBroadcastBatch) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout) @@ -243,19 +243,19 @@ TEST_P(MatmulTestWithLayout, AmperePrologueFusionBroadcast) { auto inputs = matmulAtInput2D(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -304,19 +304,19 @@ TEST_P(MatmulTestWithLayout, AmpereProloguePointwise) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.sin().to(at::kFloat), inputs.second.sin().to(at::kFloat), @@ -365,19 +365,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBFloat16) { auto inputs = matmulAtInput3DTuring(M, N, K, layout, at::kBFloat16); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, 
LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -428,19 +428,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulPipelineGmem) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -512,25 +512,25 @@ TEST_P(MatmulTestWithLayout, AmpereSwizzle) { FusionProfiler::createSegments(1); } - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.01, 0.01)); - int gdimx = fe.lastLaunchParams().gdimx(); - int gdimy = fe.lastLaunchParams().gdimy(); + int gdimx = ke.lastLaunchParams().gdimx(); + int gdimy = ke.lastLaunchParams().gdimy(); int expected_gdim_unswizzled = (dim + 128 - 1) / 128; int expected_gdimx = expected_gdim_unswizzled * swizzle; @@ -640,19 +640,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulRegCircularBuffer) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -932,16 +932,16 @@ TEST_F(MatmulTest, MatmulMatmulAmpere) { 
.matmul(t1.t().to(at::kFloat)) .matmul(t2.t().to(at::kFloat)); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); + ke.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + auto cg_outputs = ke.runFusion({t0, t1, t2}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // relaxed check for now, err accumulation is significant. NVF_CHECK(cg_outputs[0].allclose(tref, 0.1, 0.1)); } @@ -1312,16 +1312,16 @@ TEST_F(MatmulTest, MatmulSoftmaxMatmulAmpere) { auto t1 = at::randn({N1, K1}, options); auto t2 = at::randn({N2, K2}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); + ke.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + auto cg_outputs = ke.runFusion({t0, t1, t2}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto g1 = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); auto sg1 = at::_softmax(g1, -1, false); auto gsg1 = sg1.matmul(t2.t().to(at::kFloat)); @@ -1367,13 +1367,13 @@ TEST_P(MatmulTestWithLayout, TuringMatmul) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, {inputs.first, inputs.second})); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 7, 5, ke.compileFusion(&fusion, {inputs.first, inputs.second})); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -1511,15 +1511,15 @@ TEST_F(MatmulTest, AmpereMatmulTNCpAsync) { auto t0 = at::randn({M, K}, options); auto t1 = at::randn({N, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -1679,16 +1679,16 @@ TEST_F(MatmulTest, AmpereStridedBatchedMatmulTN) { auto t0 = at::randn({B0, M, B1, K}, options); auto t1 = at::randn({B0, N, B1, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, 
t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // ref implementation: auto ref_t0 = t0.permute({0, 2, 1, 3}) .contiguous() @@ -1852,16 +1852,16 @@ TEST_F(MatmulTest, AmpereViewMatmulTN) { auto t0 = at::randn({M, Ko, Ki}, options); auto t1 = at::randn({N, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = at::native::view(t0, {M, K}).to(at::kFloat).matmul(t1.t().to(at::kFloat)); @@ -2040,11 +2040,11 @@ TEST_F(MatmulTest, AmpereMatmulTNSwizzled) { auto t0 = at::randn({M, K}, options); auto t1 = at::randn({N, K}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.runFusion({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2091,19 +2091,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoad) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2147,19 +2147,19 @@ TEST_P(MatmulTestWithLayout, TuringMatmulLargeLoad) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 7, 5, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2219,19 +2219,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck4warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + 
KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK( @@ -2300,19 +2300,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck8warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2371,19 +2371,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck6warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2431,19 +2431,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoadLargeK) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.001, 0.001)); @@ -2489,15 +2489,15 @@ TEST_P(MatmulTestWithLayout, AmpereSplitKLikeStridedBatchedMatmul) { auto t0 = matmulAtInput2D(layout, TensorMatmulPos::A, at::kHalf, M, N, K, B); auto t1 = matmulAtInput2D(layout, TensorMatmulPos::B, 
at::kHalf, M, N, K, B); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({t0, t1}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({t0, t1}); auto tref = splitkLikeAtMatmul(t0.to(at::kFloat), t1.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2578,23 +2578,23 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2612,7 +2612,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) { // - !use_smem_epilogue : A + B (this test is skipped in this case) // - use_smem_epilogue && !promote_prologue_smem_reuse : A + B + C // - use_smem_epilogue && promote_prologue_smem_reuse : max(A + B, C) - auto smem_allocs = fe.kernel()->summary().dynamic_smem_allocations; + auto smem_allocs = ke.kernel()->summary().dynamic_smem_allocations; NVF_CHECK(smem_allocs.size() == 3); if (mparams.promote_prologue_smem_reuse) { // Check prologue shared memory re-use @@ -2712,29 +2712,29 @@ TEST_F(MatmulTest, AmpereMatmulSmemEpiloguePromotionRequiredA100) { SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(&fusion, &mparams); - // FusionExecutor::compileFusion would fail otherwise. + // KernelExecutor::compileFusion would fail otherwise. 
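// [Editorial aside, illustration only; not part of the patch.] The shared
// memory accounting asserted in the AmpereMatmulSmemEpilogue hunk above can
// be summarized by a small helper. A minimal sketch, assuming hypothetical
// byte counts a, b, c for the A-operand, B-operand, and epilogue buffers;
// expectedDynamicSmemBytes is an illustrative name, not an nvFuser API.

#include <algorithm>
#include <cstdint>

int64_t expectedDynamicSmemBytes(
    bool use_smem_epilogue,
    bool promote_prologue_smem_reuse,
    int64_t a, // bytes of the A-operand smem buffer
    int64_t b, // bytes of the B-operand smem buffer
    int64_t c) { // bytes of the epilogue smem buffer
  if (!use_smem_epilogue) {
    return a + b; // no epilogue buffer is allocated
  }
  // With promotion, the epilogue buffer reuses the prologue allocation, so
  // the footprint is the larger of the two regions: max(A + B, C).
  return promote_prologue_smem_reuse ? std::max(a + b, c) : a + b + c;
}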
SKIP_IF_INSUFFICIENT_SMEM(&mparams, data_types); at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2818,23 +2818,23 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueCast) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); tref = tref.to(at::kHalf); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2914,24 +2914,24 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueRelu) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto t2 = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); auto tref = at::relu(t2).to(at::kFloat); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -3003,13 +3003,13 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitK_CUDA) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, {inputs.first, inputs.second})); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + 7, 5, ke.compileFusion(&fusion, {inputs.first, inputs.second})); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = 
ke.runFusion({inputs.first, inputs.second}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); @@ -3068,13 +3068,13 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitKBias_CUDA) { at::Tensor aten_bias = at::randn({M}, aten_a.options()); std::vector inputs = {aten_a, aten_b, aten_bias}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion(inputs); + 7, 5, ke.compileFusion(&fusion, inputs)); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.runFusion(inputs); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atBiasEpilogue( atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout), aten_bias); @@ -3131,13 +3131,13 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitK) { std::vector inputs = {aten_a, aten_b}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 7, 5, ke.compileFusion(&fusion, inputs)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion(inputs); auto tref = atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout); @@ -3198,13 +3198,13 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitKBias) { std::vector inputs = {aten_a, aten_b, aten_bias}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 7, 5, ke.compileFusion(&fusion, inputs)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion(inputs); auto tref = atBiasEpilogue( atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout), aten_bias); @@ -3257,19 +3257,19 @@ TEST_F(MatmulTest, ReproIssue1808) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -3413,16 +3413,16 @@ TEST_P(MatmulTestWithLayout, MisalignedVectorization) { 
SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(fusion.get(), &mparams); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compileFusion( fusion.get(), inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto outputs = ke.runFusion(inputs); EXPECT_TRUE(outputs[0].allclose(tref, 0.001, 0.001)); } @@ -3473,13 +3473,13 @@ TEST_F(MatmulTest, MultipleConsecutiveDims) { at::Tensor B = at::randn({N1, N2, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion(inputs); auto tref = at::reshape( at::linear( at::reshape(A.to(at::kFloat), {M1 * M2, K}), @@ -3539,13 +3539,13 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveMDims) { at::Tensor B = at::randn({N, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion(inputs); auto Apermuted = A.permute({{1, 2}}).reshape({M1 * M2, K}); auto tref = at::linear(Apermuted.to(at::kFloat), B.to(at::kFloat)) .reshape({M1, M2, N}) @@ -3605,13 +3605,13 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveNDims) { at::Tensor B = at::randn({N1, K, N2}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion(inputs); auto Bpermuted = B.permute({{1, 2}}).reshape({N1 * N2, K}); auto tref = at::linear(A.to(at::kFloat), Bpermuted.to(at::kFloat)) .reshape({M, N1, N2}); @@ -3663,13 +3663,13 @@ TEST_F(MatmulTest, MultipleMDimsBatch) { at::Tensor B = at::randn({Batch, N, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), 
matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.runFusion(inputs); auto tref = at::matmul(A.to(at::kFloat), at::permute(B.to(at::kFloat), {0, 2, 1})); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -3798,10 +3798,10 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) { auto inputs = matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); } diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp index 9dd8a927009..1d078b51d96 100644 --- a/tests/cpp/test_matmul_aten_evaluation.cpp +++ b/tests/cpp/test_matmul_aten_evaluation.cpp @@ -164,8 +164,8 @@ TEST_P(MatmulNodeParametrizedTest, MatmulNodeConcrete) { at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda(); at::Tensor out_ref = at::matmul(t0, t1); - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_TRUE(at::allclose(out[0], out_ref)); } @@ -190,8 +190,8 @@ TEST_P(MatmulNodeParametrizedTest, MatmulNodeSymbolic) { at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda(); at::Tensor out_ref = at::matmul(t0, t1); - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_TRUE(at::allclose(out[0], out_ref)); } @@ -227,17 +227,17 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) { } at::Tensor out_ref = at::linear(t0, t1, bias_opt); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector<at::Tensor> out = {}; if (bias_shape.has_value()) { - out = fec.runFusionWithInputs({t0, t1, bias_opt}); + out = executor_cache.runFusionWithInputs({t0, t1, bias_opt}); } else { - out = fec.runFusionWithInputs({t0, t1}); + out = executor_cache.runFusionWithInputs({t0, t1}); } - const std::vector<FusionExecutor>& executors = - fec.getMostRecentKernelRuntime()->executors(); + const std::vector<KernelExecutor>& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); // Verify that fusion compilation was skipped.
EXPECT_FALSE(executors.front().hasCompiledKernel()); @@ -277,17 +277,17 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) { } at::Tensor out_ref = at::linear(t0, t1, bias_opt); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector<at::Tensor> out = {}; if (bias_shape.has_value()) { - out = fec.runFusionWithInputs({t0, t1, bias_opt}); + out = executor_cache.runFusionWithInputs({t0, t1, bias_opt}); } else { - out = fec.runFusionWithInputs({t0, t1}); + out = executor_cache.runFusionWithInputs({t0, t1}); } - const std::vector<FusionExecutor>& executors = - fec.getMostRecentKernelRuntime()->executors(); + const std::vector<KernelExecutor>& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); // Verify that fusion compilation was skipped. EXPECT_FALSE(executors.front().hasCompiledKernel()); diff --git a/tests/cpp/test_matmul_sass.cpp b/tests/cpp/test_matmul_sass.cpp index 974300401d2..50defa61eee 100644 --- a/tests/cpp/test_matmul_sass.cpp +++ b/tests/cpp/test_matmul_sass.cpp @@ -98,16 +98,16 @@ sass::Container getSASSFor( SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(&fusion, &mparams); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); - return sass::parse(fe.disassembledKernelSASS()); + return sass::parse(ke.disassembledKernelSASS()); } // A fusion with epilogue made of binary op (scalar multiplication) @@ -161,13 +161,13 @@ sass::Container getBinaryOpMulEpilogueSASSFor( auto inputs = matmulAtInput3DTuring(M, N, K, layout); const double alpha = 2.5; - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( &fusion, {inputs.first, inputs.second, alpha}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second, alpha}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second, alpha}); auto tref = at::mul( atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout), alpha); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); - return sass::parse(fe.disassembledKernelSASS()); + return sass::parse(ke.disassembledKernelSASS()); } } // namespace diff --git a/tests/cpp/test_matmul_scheduler.cpp b/tests/cpp/test_matmul_scheduler.cpp index 18256e18892..d4d18a27796 100644 --- a/tests/cpp/test_matmul_scheduler.cpp +++ b/tests/cpp/test_matmul_scheduler.cpp @@ -2811,10 +2811,10 @@ TEST_P(AllocationDomainTest, BasicMatmul) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2844,10 +2844,10 @@ TEST_P(AllocationDomainTest, BasicMatmulNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; -
fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2880,10 +2880,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSet) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2918,10 +2918,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSin) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2955,10 +2955,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2992,10 +2992,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinSetNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -3029,10 +3029,10 @@ TEST_P(AllocationDomainTest, MatmulWithPrologueSetCastSinTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -3069,8 +3069,8 @@ TEST_F(MatmulSchedulerTest, OperandOrderIssue2434) { auto y_ref = at::randn({N, K}, options); std::vector inputs{x_ref, y_ref}; - FusionExecutorCache 
fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); auto tref = at::linear(x_ref.to(at::kFloat), y_ref.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -3140,13 +3140,13 @@ TEST_F(MatmulSchedulerTest, HSH_TT) { //! TODO Disabled because hopper multiple matmul scheduler is currently a copy //! of ampere scheduler. /* - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( fusion.get(), {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); */ @@ -3211,14 +3211,14 @@ TEST_F(MatmulSchedulerTest, HSH_TN) { auto inputs = matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( fusion.get(), {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); } @@ -3286,14 +3286,14 @@ TEST_F(MatmulSchedulerTest, HSH_NT) { auto inputs = matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( fusion.get(), {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); } @@ -3363,13 +3363,13 @@ TEST_F(MatmulSchedulerTest, HSH_NN) { // TODO Disabled because hopper multiple matmul scheduler is currently a copy // of ampere scheduler. 
/* - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( fusion.get(), {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); */ diff --git a/tests/cpp/test_mbarrier.cpp b/tests/cpp/test_mbarrier.cpp index 84c58192271..bd1b425c70f 100644 --- a/tests/cpp/test_mbarrier.cpp +++ b/tests/cpp/test_mbarrier.cpp @@ -46,9 +46,9 @@ TEST_F(MBarrierTest, Simple) { tv2->axis(0)->parallelize(ParallelType::TIDy); tv2->axis(1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; - fe.registerPostLoweringHook([](kir::Kernel* kernel) { + ke.registerPostLoweringHook([](kir::Kernel* kernel) { // Replace block sync with mbarrier FusionGuard fg(kernel); @@ -122,7 +122,7 @@ TEST_F(MBarrierTest, Simple) { top_level_exprs.push_back(invalidate); }); - fe.compileFusion(&fusion); + ke.compileFusion(&fusion); // Make sure that the post-lowering hook successfully inserted all mbarrier // operations @@ -131,14 +131,14 @@ TEST_F(MBarrierTest, Simple) { &typeid(kir::MBarrierArrive), &typeid(kir::MBarrierWait), &typeid(kir::MBarrierInvalidate)}; - for (auto expr : fe.kernel()->topLevelExprs()) { + for (auto expr : ke.kernel()->topLevelExprs()) { remaining_mbarrier_exprs.erase(&typeid(*expr)); } EXPECT_TRUE(remaining_mbarrier_exprs.empty()); auto input = at::randn( {32, 32}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); - auto outputs = fe.runFusion({input}); + auto outputs = ke.runFusion({input}); testValidate(&fusion, outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_memory.cpp b/tests/cpp/test_memory.cpp index 5af4ecc4867..a472b7f18b1 100644 --- a/tests/cpp/test_memory.cpp +++ b/tests/cpp/test_memory.cpp @@ -78,15 +78,15 @@ TEST_P(MemoryTest, LoadCache) { {1024}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); at::Tensor expected_output = input + 1.0f; - FusionExecutor fe; + KernelExecutor ke; { DebugDumpOptionsGuard debug_dump_options_guard; DebugDumpOptionsGuard::getCurOptions().set(DebugDumpOption::Ptx); - fe.compileFusion(&fusion, {input}); + ke.compileFusion(&fusion, {input}); } // Verify PTX. - const executor_utils::CompiledKernel& compiled_kernel = fe.compiledKernel(); + const executor_utils::CompiledKernel& compiled_kernel = ke.compiledKernel(); std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end()); std::regex regex(R"(ld\.global\.)" + cache_op_str + R"(\.\S+)"); std::smatch match; @@ -98,7 +98,7 @@ TEST_P(MemoryTest, LoadCache) { std::filesystem::remove(compiled_kernel.ptx_filename); // Verify output tensors. - std::vector actual_ts = fe.runFusion({input}); + std::vector actual_ts = ke.runFusion({input}); testValidate( &fusion, actual_ts, {input}, {expected_output}, __LINE__, __FILE__); } @@ -153,15 +153,15 @@ TEST_F(MemoryTest, RefineCachePolicy) { {1024}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); at::Tensor c = a + b; - FusionExecutor fe; + KernelExecutor ke; { DebugDumpOptionsGuard debug_dump_options_guard; DebugDumpOptionsGuard::getCurOptions().set(DebugDumpOption::Ptx); - fe.compileFusion(&fusion, {a, b}); + ke.compileFusion(&fusion, {a, b}); } // Verify PTX. 
- const executor_utils::CompiledKernel& compiled_kernel = fe.compiledKernel(); + const executor_utils::CompiledKernel& compiled_kernel = ke.compiledKernel(); std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end()); expectMatchCount(ptx, R"(ld\.global\.ca\.v4\.\S+)", 1); expectMatchCount(ptx, R"(ld\.global\.cs\.v4\.\S+)", 1); @@ -170,7 +170,7 @@ TEST_F(MemoryTest, RefineCachePolicy) { debug() << "Removing " << compiled_kernel.ptx_filename << std::endl; std::filesystem::remove(compiled_kernel.ptx_filename); - std::vector actual_outputs = fe.runFusion({a, b}); + std::vector actual_outputs = ke.runFusion({a, b}); testValidate(&fusion, actual_outputs, {a, b}, {c}, __LINE__, __FILE__); } @@ -457,16 +457,16 @@ TEST_P(TMASimpleLdstTest, Load) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), dim); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), dim); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); ASSERT_EQ( - XorFinder::findXor(fe.kernel()), (swizzle != MmaInputSmemSwizzle::None)); - TMADimChecker::getDim(fe.kernel()); + XorFinder::findXor(ke.kernel()), (swizzle != MmaInputSmemSwizzle::None)); + TMADimChecker::getDim(ke.kernel()); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -533,10 +533,10 @@ TEST_P(TMALoadTestWithABroadcastDim, LoadWithBroadcast) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -577,15 +577,15 @@ TEST_P(TMASimpleLdstTest, Store) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), dim); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), dim); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); ASSERT_EQ( - XorFinder::findXor(fe.kernel()), (swizzle != MmaInputSmemSwizzle::None)); + XorFinder::findXor(ke.kernel()), (swizzle != MmaInputSmemSwizzle::None)); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -639,13 +639,13 @@ TEST_F(TMAIndexingTest, Load2DTensorWith1DTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + 
TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -676,13 +676,13 @@ TEST_F(TMAIndexingTest, Load1DTensorWith2DTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024 * 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -713,13 +713,13 @@ TEST_F(TMAIndexingTest, NonOneElementStride) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -784,13 +784,13 @@ TEST_F(TMAIndexingTest, Advanced) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({4, 32, 2, 8, 8, 8, 32, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 4); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 4); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -833,13 +833,13 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({4, 32, 2, 8, 8, 8, 32, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 4); - EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, fe.kernel())); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 4); + EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -886,13 +886,13 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32, 4, 2, 8, 8, 8, 2, 8, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 5); - EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, fe.kernel())); + 
EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 5); + EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -947,13 +947,13 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation1) { int64_t multiple_of_16B_but_not_more = 4 * 67; auto t0 = at::randn( {prime_number, prime_number, multiple_of_16B_but_not_more}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -994,18 +994,18 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); int64_t multiple_of_8_but_not_more = 8 * 997; auto t0 = at::randn({multiple_of_8_but_not_more}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); // We will be using 2D TMA instead of 1D, because a strided box cannot be // merged with other bulk axes by rotation. So, this schedule will be // interpreted as viewing the tensor as 2D (M/8, 8) and then applying 2D TMA. // The outer dim of TMA is defined by boxing and striding splits, and the // inner dim is defined as an implicit whole box. - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); // The tensor shape is not a multiple of 8, so the view should fail. @@ -1016,7 +1016,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) { .device(at::kCUDA, 0); int64_t prime_number = 997; auto t0 = at::randn({prime_number}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("must be divisible by 8"))); @@ -1056,8 +1056,8 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); int64_t multiple_of_23 = 23 * 997; auto t0 = at::randn({multiple_of_23, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); // We will be using 3D TMA instead of 2D, because split(23, 8) is indivisible, // so we cannot consider this schedule as a 2D TMA whose first dimension has box // size 23; the schedule is instead interpreted as a 3D // TMA. The dim 0 of TMA is an implicit size-one box, and the dim 1 is defined by // a boxing split whose box size is 8, and dim 2 is an implicit whole box with // size N.
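// [Editorial aside, illustration only; not part of the patch.] The
// DefineBoxByRotation2/3 comments above both hinge on the same rule: the
// implicit view the schedule performs is valid only when the viewed extent
// divides evenly, which is why the EXPECT_THAT blocks check for "must be
// divisible by" messages at run time. A minimal sketch of that rule;
// viewAs2D is an illustrative helper, not an nvFuser API.

#include <cstdint>
#include <stdexcept>
#include <string>
#include <utility>

std::pair<int64_t, int64_t> viewAs2D(int64_t extent, int64_t inner) {
  if (extent % inner != 0) {
    // Mirrors the runtime failure exercised above, e.g. viewing (997,) as
    // (?, 8) in DefineBoxByRotation2.
    throw std::runtime_error(
        "extent must be divisible by " + std::to_string(inner));
  }
  return {extent / inner, inner};
}

// e.g. viewAs2D(8 * 997, 8) yields (997, 8), while viewAs2D(997, 8) throws.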
- EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); // The tensor shape is not a multiple of 23, so the view should fail. @@ -1079,7 +1079,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { .device(at::kCUDA, 0); int64_t prime_number = 997; auto t0 = at::randn({prime_number, 8}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("must be divisible by 23"))); @@ -1118,14 +1118,14 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain1) { auto t0 = at::randn({128, 1024 * 128}, options) .transpose(0, 1) .view({128, 1024, 128}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); - ASSERT_TRUE(XorFinder::findXor(fe.kernel())); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); + ASSERT_TRUE(XorFinder::findXor(ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1173,13 +1173,13 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain2) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({2, 3, 5, 7, 11, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1221,13 +1221,13 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationLoad) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({100000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 4); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1264,13 +1264,13 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationStore) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({100000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 4); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + 
TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1300,13 +1300,13 @@ TEST_F(TMAMiscTest, DisableIndexHoisting) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1332,13 +1332,13 @@ TEST_F(TMAMiscTest, Repro1977) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1423,9 +1423,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { std::count_if(flattened_exprs.begin(), flattened_exprs.end(), is_wait), 1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } @@ -1475,9 +1475,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { // RAW sync is inserted, the WAR pass has not run yet. We should be able to // remove the RAW sync by adding a cleanup pass. 
- FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } @@ -1542,9 +1542,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { std::count_if(flattened_exprs.begin(), flattened_exprs.end(), is_wait), 2); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compileFusion(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } } @@ -1586,12 +1586,12 @@ TEST_F(TMAMiscTest, LoadStrongCorrectness) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::arange(1, 33, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto expect = at::zeros({2, 1, 2, 16}, options); expect.flatten(0, 2).select(0, 0) = at::arange(1, 17, options); @@ -1632,8 +1632,8 @@ TEST_F(TMACompileTimeInvalidTest, BulkNotInTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "ParallelType::Bulk is only supported for cp.async.bulk."))); @@ -1661,8 +1661,8 @@ TEST_F(TMACompileTimeInvalidTest, BulkBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "ParallelType::Bulk is only supported for IterType::Iteration."))); @@ -1689,8 +1689,8 @@ TEST_F(TMACompileTimeInvalidTest, InvalidParallelType) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Invalid parallel type for cp.async.bulk: V"))); @@ -1727,13 +1727,13 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0_aligned = at::randn({128 + items_of_16_bytes}, options) .narrow(0, items_of_16_bytes, 128); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0_aligned}); + auto cg_outputs = ke.runFusion({t0_aligned}); testValidate( &fusion, cg_outputs, {t0_aligned}, 
{t0_aligned}, __LINE__, __FILE__); @@ -1741,7 +1741,7 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) { [&]() { auto t0_misaligned = at::randn({128 + items_of_16_bytes / 2}, options) .narrow(0, items_of_16_bytes / 2, 128); - fe.runFusion({t0_misaligned}); + ke.runFusion({t0_misaligned}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "globalAddress, which specifies the starting address of the memory region described, " @@ -1782,13 +1782,13 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalStride) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0_aligned = at::randn({128, 128 + items_of_16_bytes}, options).narrow(1, 0, 128); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0_aligned}); + auto cg_outputs = ke.runFusion({t0_aligned}); testValidate( &fusion, cg_outputs, {t0_aligned}, {t0_aligned}, __LINE__, __FILE__); @@ -1797,7 +1797,7 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalStride) { auto t0_misaligned = at::randn({128, 128 + items_of_16_bytes / 2}, options) .narrow(1, 0, 128); - fe.runFusion({t0_misaligned}); + ke.runFusion({t0_misaligned}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "globalStrides array, which specifies tensor stride of each of the lower tensorRank - 1 dimensions in bytes, " @@ -1836,8 +1836,8 @@ TEST_F(TMACompileTimeInvalidTest, SizeOfTransfer) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The expected bytes must be a multiple of 16 bytes, but 8 is not."))); @@ -1876,18 +1876,18 @@ TEST_F(TMARuntimeInvalidTest, SizeOfTransfer) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0, items_of_16_bytes}); + auto cg_outputs = ke.runFusion({t0, items_of_16_bytes}); testValidate( &fusion, cg_outputs, {t0, items_of_16_bytes}, {t0}, __LINE__, __FILE__); EXPECT_THAT( - [&]() { fe.runFusion({t0, items_of_16_bytes / 2}); }, + [&]() { ke.runFusion({t0, items_of_16_bytes / 2}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The expected bytes must be a multiple of 16 bytes, but "))); } @@ -1929,19 +1929,19 @@ TEST_F(TMARuntimeInvalidTest, InvalidView) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); // (10240,) can be viewed as (10, 1024) auto t0_valid = at::randn({10240}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_valid}, {}, matmul_cparams); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0_valid}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); - auto cg_outputs = 
-  auto cg_outputs = fe.runFusion({t0_valid});
+  auto cg_outputs = ke.runFusion({t0_valid});
   testValidate(&fusion, cg_outputs, {t0_valid}, {t0_valid}, __LINE__, __FILE__);
 
   EXPECT_THAT(
       [&]() {
        // it is impossible to view (10249,) as (?, 1024)
        auto t0_inval = at::randn({10249}, options);
-        fe.runFusion({t0_inval});
+        ke.runFusion({t0_inval});
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Invalid view in TMA: the extent of")));
 
@@ -1975,8 +1975,8 @@ TEST_F(TMACompileTimeInvalidTest, InnermostDiscontiguous) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(::testing::HasSubstr(
          "The innermost dimension of the TMA domain must be contiguous")));
 
@@ -2016,8 +2016,8 @@ TEST_F(TMACompileTimeInvalidTest, MergeDiscontiguous) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Can not merge discontiguous dimensions, but")));
 
@@ -2052,8 +2052,8 @@ TEST_F(TMACompileTimeInvalidTest, InnermostElementStrideNotOne) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(::testing::HasSubstr(
          "When interleave is CU_TENSOR_MAP_INTERLEAVE_NONE "
 
@@ -2091,8 +2091,8 @@ TEST_F(TMACompileTimeInvalidTest, SwizzleBulkWithNonBulk) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(::testing::HasSubstr(
          "TMA domain must be a view of the allocation domain of the gmem tensor")));
 
@@ -2135,8 +2135,8 @@ TEST_F(TMADocTest, Figure13a) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Some error message")));
 
@@ -2173,13 +2173,13 @@ TEST_F(TMADocTest, Figure14a) {
       at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn({16, 200}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
+  TMAPredicateChecker::checkPredicate(ke.kernel(), 0);
 
-  auto cg_outputs = fe.runFusion({t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
@@ -2214,8 +2214,8 @@ TEST_F(TMADocTest, Figure13b) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Some error message")));
 
@@ -2249,13 +2249,13 @@ TEST_F(TMADocTest, Figure14b) {
       at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn({16, 10}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
+  TMAPredicateChecker::checkPredicate(ke.kernel(), 0);
 
-  auto cg_outputs = fe.runFusion({t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
@@ -2291,8 +2291,8 @@ TEST_F(TMADocTest, Figure13c) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Some error message")));
 
@@ -2327,13 +2327,13 @@ TEST_F(TMADocTest, Figure14c) {
       at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn({16, 200}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
+  TMAPredicateChecker::checkPredicate(ke.kernel(), 0);
 
-  auto cg_outputs = fe.runFusion({t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
@@ -2366,8 +2366,8 @@ TEST_F(TMADocTest, Figure13d) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Some error message")));
 
@@ -2398,13 +2398,13 @@ TEST_F(TMADocTest, Figure14d) {
       at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn({16, 12}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
+  TMAPredicateChecker::checkPredicate(ke.kernel(), 1);
 
-  auto cg_outputs = fe.runFusion({t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
@@ -2441,8 +2441,8 @@ TEST_F(TMADocTest, Figure13e) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Some error message")));
 
@@ -2478,13 +2478,13 @@ TEST_F(TMADocTest, Figure14e) {
       at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn({16, 10}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
+  TMAPredicateChecker::checkPredicate(ke.kernel(), 1);
 
-  auto cg_outputs = fe.runFusion({t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
@@ -2523,13 +2523,13 @@ TEST_F(TMADocTest, Figure15a) {
       at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn({16, 10}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
+  TMAPredicateChecker::checkPredicate(ke.kernel(), 0);
 
-  auto cg_outputs = fe.runFusion({t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
@@ -2565,13 +2565,13 @@ TEST_F(TMADocTest, Figure15b) {
       at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn({16, 12}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 4);
+  EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
+  TMAPredicateChecker::checkPredicate(ke.kernel(), 4);
 
-  auto cg_outputs = fe.runFusion({t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
@@ -2613,8 +2613,8 @@ TEST_F(TMADocTest, Figure15c) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Some error message")));
 
@@ -2660,8 +2660,8 @@ TEST_F(TMADocTest, Figure15d) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Some error message")));
 
@@ -2701,8 +2701,8 @@ TEST_F(TMADocTest, Figure15e) {
   EXPECT_THAT(
       [&]() {
-        FusionExecutor fe;
-        fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+        KernelExecutor ke;
+        ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
       },
       ::testing::ThrowsMessage(
          ::testing::HasSubstr("Some error message")));
 
@@ -2755,9 +2755,9 @@ TEST_P(LdMatrixTest, Regular) {
   auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
   auto t0 = at::randn({size1, getK(macro)}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams);
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
@@ -2881,9 +2881,9 @@ TEST_P(StMatrixSingleTileTest, Regular) {
   auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
   auto t0 = at::randn({sizeM, sizeN}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams);
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
@@ -2942,9 +2942,9 @@ TEST_P(StMatrixTest, Regular) {
   auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
   auto t0 = at::randn({sizeM, sizeN}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams);
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
@@ -3017,9 +3017,9 @@ TEST_P(LdMatrixTest, Transpose) {
   auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
   auto t0 = at::randn({getK(macro), size2}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = fe.runFusion({t0});
+  KernelExecutor ke;
+  ke.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams);
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
diff --git a/tests/cpp/test_mma.cpp b/tests/cpp/test_mma.cpp
index 470a1633a9e..95a3bd2e772 100644
--- a/tests/cpp/test_mma.cpp
+++ b/tests/cpp/test_mma.cpp
@@ -172,10 +172,10 @@ std::vector scheduleCompileAndRun(
     tv2->setLoopDomain(s.as());
   }
 
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
-  return fe.runFusion({inputs.first, inputs.second});
+  return ke.runFusion({inputs.first, inputs.second});
 }
 
 TEST_P(MmaTest, SingleTile) {
@@ -388,11 +388,11 @@ TEST_P(HopperRS, SingleTile) {
   auto inputs = matmulAtInput3DHopperRS(
       getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype));
 
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.squeeze().to(at::kFloat),
       inputs.second.squeeze().to(at::kFloat),
 
@@ -484,11 +484,11 @@ TEST_P(HopperRS, SingleTileWithTMALoadStore) {
   auto inputs = matmulAtInput3DHopperRS(
       getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype));
 
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.squeeze().to(at::kFloat),
       inputs.second.squeeze().to(at::kFloat),
 
@@ -650,10 +650,10 @@ TEST_P(HopperSS, SingleTile) {
   auto inputs = matmulAtInput3DHopperSS(
       getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype));
 
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.squeeze().to(at::kFloat),
       inputs.second.squeeze().to(at::kFloat),
 
@@ -779,10 +779,10 @@ TEST_P(HopperSS, SingleTileTransposed) {
   auto inputs = matmulAtInput3DHopperSS(
       getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype));
 
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.squeeze().to(at::kFloat),
       inputs.second.squeeze().to(at::kFloat),
 
@@ -958,10 +958,10 @@ TEST_P(HopperSS, MultipleTile) {
       layout,
       data_type_to_aten(dtype));
 
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.squeeze().to(at::kFloat),
       inputs.second.squeeze().to(at::kFloat),
diff --git a/tests/cpp/test_move_pad.cpp b/tests/cpp/test_move_pad.cpp
index 4499ecb5ec5..92ccaeae676 100644
--- a/tests/cpp/test_move_pad.cpp
+++ b/tests/cpp/test_move_pad.cpp
@@ -41,13 +41,14 @@ TEST_F(MovePadTest, UnaryCat) {
   at::Tensor t1 = at::randn({2, 10}, options);
   std::vector aten_inputs = {t0, t1};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1);
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, BinaryCat) {
@@ -71,13 +72,14 @@ TEST_F(MovePadTest, BinaryCat) {
   at::Tensor t2 = at::randn({2, 10}, options);
   std::vector aten_inputs = {t0, t1, t2};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1);
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, BinaryBroadcastOnNonCatDim) {
@@ -105,19 +107,20 @@ TEST_F(MovePadTest, BinaryBroadcastOnNonCatDim) {
   at::Tensor t2 = at::randn({4, 5}, options);
   std::vector aten_inputs = {t0, t1, t2};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
   // ensure that we propagate the pad across binary operation and the first
   // segment is no-op
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   EXPECT_THAT(
       runtime->fusionSegments()->groups(),
       UnorderedElementsAre(
          HeuristicIs(SchedulerType::NoOp),
          HeuristicIs(SchedulerType::PointWise)));
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, BinaryBroadcastOnCatDim) {
@@ -144,13 +147,14 @@ TEST_F(MovePadTest, BinaryBroadcastOnCatDim) {
   at::Tensor t2 = at::randn({2, 10}, options);
   std::vector aten_inputs = {t0, t1, t2};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2);
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, PadReplayOnMultipleUsesCase0) {
@@ -179,13 +183,14 @@ TEST_F(MovePadTest, PadReplayOnMultipleUsesCase0) {
   at::Tensor t1 = at::randn({1, 10}, options);
   std::vector aten_inputs = {t0, t1};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1);
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, PadReplayOnMultipleUsesCase1) {
@@ -215,10 +220,11 @@ TEST_F(MovePadTest, PadReplayOnMultipleUsesCase1) {
   at::Tensor t1 = at::randn({4, 10}, options);
   std::vector aten_inputs = {t0, t1};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, CascadePadCase0) {
@@ -264,15 +270,16 @@ TEST_F(MovePadTest, CascadePadCase0) {
   at::Tensor t0 = at::randn({4, 10}, options);
   std::vector aten_inputs = {t0};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   Fusion* complete_fusion = runtime->fusionSegments()->completeFusion();
   std::vector exprs = complete_fusion->exprs();
   EXPECT_THAT(exprs, Contains(Property(&Expr::isA, IsTrue())).Times(1));
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, CascadePadCase1) {
@@ -302,15 +309,16 @@ TEST_F(MovePadTest, CascadePadCase1) {
   at::Tensor t0 = at::randn({4, 10}, options);
   std::vector aten_inputs = {t0};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   Fusion* complete_fusion = runtime->fusionSegments()->completeFusion();
   std::vector exprs = complete_fusion->exprs();
   EXPECT_THAT(exprs, Contains(Property(&Expr::isA, IsTrue())).Times(2));
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, CascadePadCase2) {
@@ -359,10 +367,11 @@ TEST_F(MovePadTest, CascadePadCase2) {
   at::Tensor t0 = at::randn({4, 10}, options);
   std::vector aten_inputs = {t0};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, NotMergeNegativePad) {
@@ -391,10 +400,11 @@ TEST_F(MovePadTest, NotMergeNegativePad) {
   at::Tensor t0 = at::randn({4, 10}, options);
   std::vector aten_inputs = {t0};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__);
 }
 
 TEST_F(MovePadTest, BooleanCat) {
@@ -418,18 +428,24 @@ TEST_F(MovePadTest, BooleanCat) {
   at::Tensor t2 = at::randn({2, 10}, options) > 0.5;
   std::vector aten_inputs = {t0, t1, t2};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs(aten_inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1);
 
   // ExpressionEvaluator is hitting an assert with dynamic value.
-  // https://github.com/NVIDIA/Fuser/issues/2697 testValidate(fec.fusion(),
-  // out_tensors, aten_inputs, __LINE__, __FILE__);
+  // https://github.com/NVIDIA/Fuser/issues/2697
+  // testValidate(executor_cache.fusion(), out_tensors, aten_inputs, __LINE__,
+  // __FILE__);
   at::Tensor ref = at::cat({at::bitwise_and(t0, t1), t2}, 0);
 
   testValidate(
-      fec.fusion(), out_tensors, aten_inputs, {ref}, __LINE__, __FILE__);
+      executor_cache.fusion(),
+      out_tensors,
+      aten_inputs,
+      {ref},
+      __LINE__,
+      __FILE__);
 }
 
 } // namespace nvfuser
diff --git a/tests/cpp/test_move_split_cat.cpp b/tests/cpp/test_move_split_cat.cpp
index 247aa96381e..beec3172e2d 100644
--- a/tests/cpp/test_move_split_cat.cpp
+++ b/tests/cpp/test_move_split_cat.cpp
@@ -39,9 +39,10 @@ TEST_F(MoveSplitCatTest, Cancellable_SplitImmediatelyFollowedByCat) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({4, 10}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -60,9 +61,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_DifferentOrder) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({2, 6}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -83,9 +85,10 @@ TEST_F(MoveSplitCatTest, Cancellable_SetWithoutPermute) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({2, 5}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -108,9 +111,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_SliceAmountAndPaddingAmountMismatch) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({4, 10}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -132,9 +136,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_CatOnlySubsetOfSplitOutputs) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({4, 10}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -158,9 +163,10 @@ TEST_F(MoveSplitCatTest, Cancellable_PermuteInBetween) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({2, 3, 10}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -193,12 +199,13 @@ TEST_F(MoveSplitCatTest, Cancellable_IncompatibleAllocationOrder) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({2, 3, 5}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   // Check the two permutes are merged to one.
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   Fusion* complete_fusion = runtime->fusionSegments()->completeFusion();
   EXPECT_THAT(complete_fusion->exprs(), Contains(IsPermute()).Times(1));
 
@@ -232,9 +239,10 @@ TEST_F(MoveSplitCatTest, Cancellable_MultiplePermutesInBetween) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({2, 3, 10}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -258,9 +266,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_WrongAxis) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({2, 2, 4}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -283,9 +292,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_SomeButNotAllArePermuted) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({2, 2, 10}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -311,9 +321,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_PermutedDifferently) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({4, 2}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -338,9 +349,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_UnsupportedOps) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({2, 2, 4}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -364,9 +376,10 @@ TEST_F(MoveSplitCatTest, Cancellable_ReshapeInBetween) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({4, 10}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -393,9 +406,10 @@ TEST_F(MoveSplitCatTest, Cancellable_ReshapeAndPermuteInBetween) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({6, 10}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -445,9 +459,10 @@ TEST_F(MoveSplitCatTest, Cancellable_Issue1768) {
       at::randn({b * h * 3 * s * f}, options)
          .as_strided({b, h * 3, s, f}, {h * 3 * s * f, f, h * 3 * f, 1});
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor));
   EXPECT_TRUE(out_tensors[2].is_alias_of(in_tensor));
 
@@ -471,9 +486,10 @@ TEST_F(MoveSplitCatTest, OuterSplit) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({4, 6}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
 }
 
@@ -514,11 +530,12 @@ TEST_F(MoveSplitCatTest, MultiplePairs) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({4, 6}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   Fusion* complete_fusion = runtime->fusionSegments()->completeFusion();
 
   std::vector exprs = complete_fusion->exprs();
 
@@ -564,9 +581,10 @@ TEST_F(MoveSplitCatTest, MultipleCatsOnSameSplit) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({4, 2}, options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
-  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 
   EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor));
   EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor));
diff --git a/tests/cpp/test_multidevice_lower_communication.cpp b/tests/cpp/test_multidevice_lower_communication.cpp
index 3c454777f0f..47f80f77d9c 100644
--- a/tests/cpp/test_multidevice_lower_communication.cpp
+++ b/tests/cpp/test_multidevice_lower_communication.cpp
@@ -17,9 +17,10 @@ namespace nvfuser {
 
 namespace {
-void assertIsCompiledToHostIrContainer(const FusionExecutorCache& fec) {
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
-  const std::vector& executors = runtime->executors();
+void assertIsCompiledToHostIrContainer(
+    const FusionExecutorCache& executor_cache) {
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  const std::vector& executors = runtime->executors();
   EXPECT_THAT(executors, testing::SizeIs(1));
   for (const auto& executor : executors) {
     EXPECT_TRUE(executor.fusion()->isA())
 
@@ -71,9 +72,9 @@ TEST_P(LowerGatherTest, ) {
       at::randn({in_mesh.size(), kTensorSize}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
-  assertIsCompiledToHostIrContainer(fec);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+  assertIsCompiledToHostIrContainer(executor_cache);
 
   if (out_mesh.has(device_id)) {
     EXPECT_TRUE(at::equal(out_tensor, unsharded_tensor));
 
@@ -112,9 +113,10 @@ TEST_P(LowerScatterTest, ) {
   at::Tensor unsharded_tensor =
       at::randn({out_mesh.size(), kTensorSize}, tensor_options);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({unsharded_tensor})[0];
-  assertIsCompiledToHostIrContainer(fec);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor =
+      executor_cache.runFusionWithInputs({unsharded_tensor})[0];
+  assertIsCompiledToHostIrContainer(executor_cache);
 
   if (out_mesh.has(device_id)) {
     EXPECT_TRUE(at::equal(out_tensor, shardTensor(unsharded_tensor, out)));
 
@@ -155,9 +157,9 @@ TEST_P(LowerSendRecvTest, ) {
       at::randn({in_mesh.size(), kTensorSize}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
-  assertIsCompiledToHostIrContainer(fec);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+  assertIsCompiledToHostIrContainer(executor_cache);
 
   if (out_mesh.has(device_id)) {
     EXPECT_TRUE(at::equal(out_tensor, shardTensor(unsharded_tensor, out)));
 
@@ -194,9 +196,9 @@ TEST_F(LowerCollectiveTest, Allgather) {
       at::randn({num_devices, kTensorSize}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
-  assertIsCompiledToHostIrContainer(fec);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+  assertIsCompiledToHostIrContainer(executor_cache);
 
   EXPECT_TRUE(at::equal(out_tensor, unsharded_tensor));
 }
 
@@ -221,10 +223,10 @@ TEST_F(LowerCollectiveTest, Broadcast) {
   const auto device_id = communicator_->deviceId();
   at::Tensor in_tensor = unsharded_tensor.slice(0, device_id, device_id + 1);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
   if (num_devices > 1) {
-    assertIsCompiledToHostIrContainer(fec);
+    assertIsCompiledToHostIrContainer(executor_cache);
   }
 
   EXPECT_TRUE(
 
@@ -252,9 +254,9 @@ TEST_F(LowerCollectiveTest, Reduce) {
   const auto device_id = communicator_->deviceId();
   at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
-  assertIsCompiledToHostIrContainer(fec);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+  assertIsCompiledToHostIrContainer(executor_cache);
 
   if (device_id == kRoot) {
     // at::allclose instead of at::equal because addition is involved.
 
@@ -281,9 +283,9 @@ TEST_F(LowerCollectiveTest, Allreduce) {
      at::randn({num_devices, kTensorSize}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
-  assertIsCompiledToHostIrContainer(fec);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+  assertIsCompiledToHostIrContainer(executor_cache);
 
   EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0)));
 }
 
@@ -309,10 +311,10 @@ TEST_F(LowerCollectiveTest, Allreduce_Concrete) {
      at::randn({num_devices, kTensorSize}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
   if (num_devices > 1) {
-    assertIsCompiledToHostIrContainer(fec);
+    assertIsCompiledToHostIrContainer(executor_cache);
   }
 
   EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0)));
 
@@ -338,9 +340,9 @@ TEST_F(LowerCollectiveTest, ReduceScatter) {
      at::randn({num_devices, num_devices, kTensorSize}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
-  assertIsCompiledToHostIrContainer(fec);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+  assertIsCompiledToHostIrContainer(executor_cache);
 
   at::Tensor unsharded_out_tensor = unsharded_in_tensor.sum(0);
   EXPECT_TRUE(at::allclose(out_tensor, shardTensor(unsharded_out_tensor, out)));
 
@@ -371,8 +373,8 @@ TEST_F(LowerCollectiveTest, ReduceScatter_Allgather) {
       at::randn({num_devices, num_devices, kTensorSize}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
 
   EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0)));
 }
diff --git a/tests/cpp/test_multidevice_matmul.cpp b/tests/cpp/test_multidevice_matmul.cpp
index 3032db30b94..24e84f56e5e 100644
--- a/tests/cpp/test_multidevice_matmul.cpp
+++ b/tests/cpp/test_multidevice_matmul.cpp
@@ -102,12 +102,18 @@ TEST_F(DistributedMatmulTest, MulSum_LayoutTN_NoComms) {
   std::vector inputs = {shardTensor(in0, a), in1};
   auto expected_output = shardTensor(out, c);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   testValidate(
-      fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__);
+      executor_cache.fusion(),
+      outputs,
+      inputs,
+      {expected_output},
+      __LINE__,
+      __FILE__);
 
-  const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime();
+  const FusionKernelRuntime* kernel_runtime =
+      executor_cache.getMostRecentKernelRuntime();
   EXPECT_THAT(
       kernel_runtime->fusionSegments()->groups(),
       Contains(HeuristicIs(SchedulerType::Matmul)).Times(1));
 
@@ -156,13 +162,19 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutTN_NoComms) {
   std::vector inputs = {shardTensor(in0, a), in1};
   auto expected_output = shardTensor(out, c);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
 
   testValidate(
-      fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__);
+      executor_cache.fusion(),
+      outputs,
+      inputs,
+      {expected_output},
+      __LINE__,
+      __FILE__);
 
-  const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime();
+  const FusionKernelRuntime* kernel_runtime =
+      executor_cache.getMostRecentKernelRuntime();
   EXPECT_THAT(
       kernel_runtime->fusionSegments()->groups(),
       Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1));
 
@@ -208,13 +220,19 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutTN_Allgather) {
   std::vector inputs = {shardTensor(in0, a), in1};
   auto expected_output = shardTensor(out, c);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   testValidate(
-      fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__);
+      executor_cache.fusion(),
+      outputs,
+      inputs,
+      {expected_output},
+      __LINE__,
+      __FILE__);
 
-  const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime();
+  const FusionKernelRuntime* kernel_runtime =
+      executor_cache.getMostRecentKernelRuntime();
   EXPECT_THAT(
       kernel_runtime->fusionSegments()->groups(),
       Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1));
 
@@ -258,12 +276,14 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutNT_AllReduce) {
   in1 = in1.view({Ko, Ki, N});
   std::vector inputs = {shardTensor(in0, a), shardTensor(in1, b)};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
 
-  testValidate(fec.fusion(), outputs, inputs, {out}, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(), outputs, inputs, {out}, __LINE__, __FILE__);
 
-  const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime();
+  const FusionKernelRuntime* kernel_runtime =
+      executor_cache.getMostRecentKernelRuntime();
   EXPECT_THAT(
       kernel_runtime->fusionSegments()->groups(),
       Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1));
 
@@ -315,12 +335,18 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutNT_ReduceScatter) {
   std::vector inputs = {shardTensor(in0, a), shardTensor(in1, b)};
   auto expected_output = shardTensor(out, c).view({1, Mi, N});
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   testValidate(
-      fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__);
+      executor_cache.fusion(),
+      outputs,
+      inputs,
+      {expected_output},
+      __LINE__,
+      __FILE__);
 
-  const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime();
+  const FusionKernelRuntime* kernel_runtime =
+      executor_cache.getMostRecentKernelRuntime();
   EXPECT_THAT(
       kernel_runtime->fusionSegments()->groups(),
       Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1));
 
@@ -354,16 +380,16 @@ TEST_F(DistributedMatmulTest, PresegPreservesSharding) {
   auto w_tensor = at::randn({mesh.size(), 36, 48}, tensor_options);
   auto sharded_w_tensor = shardTensor(w_tensor, w);
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   std::vector inputs({x_tensor, sharded_w_tensor});
-  std::vector outputs = fec.runFusionWithInputs(inputs);
+  std::vector outputs = executor_cache.runFusionWithInputs(inputs);
 
   at::Tensor expected_mm_t_tensor =
       atMatmul(x_tensor, w_tensor.view({mesh.size() * 36, 48}), MmaLayout::TN)
          .transpose(0, 1)
         .view({mesh.size(), 36, 12});
 
   testValidate(
-      fec.fusion(),
+      executor_cache.fusion(),
       outputs,
       inputs,
       {shardTensor(expected_mm_t_tensor, mm_t)},
 
@@ -394,13 +420,13 @@ TEST_F(DistributedMatmulTest, AnnotateWeightOnly) {
   auto w_tensor = at::randn({mesh.size(), 3, 5}, tensor_options);
   auto sharded_w_tensor = shardTensor(w_tensor, w);
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   std::vector inputs({x_tensor, sharded_w_tensor});
-  std::vector outputs = fec.runFusionWithInputs(inputs);
+  std::vector outputs = executor_cache.runFusionWithInputs(inputs);
 
   at::Tensor expected_y_tensor = at::matmul(x_tensor, w_tensor);
 
   testValidate(
-      fec.fusion(),
+      executor_cache.fusion(),
       outputs,
       inputs,
       {shardTensor(expected_y_tensor, 0, mesh)},
diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index ece3a36b67a..1e1ff2eab9e 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -62,10 +62,16 @@ TEST_P(MultiDeviceReductionTest, UnshardedInput_ShardedOutput) {
   auto x1 = shardTensor(x0, tv1);
   auto x2 = x1 + x1;
   auto x3 = shardTensor(at::sum(x0 + x0, {sharded_input_dim}), tv3);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
 
-  testValidate(fec.fusion(), outputs, inputs, {x1, x2, x3}, __LINE__, __FILE__);
+  testValidate(
+      executor_cache.fusion(),
+      outputs,
+      inputs,
+      {x1, x2, x3},
+      __LINE__,
+      __FILE__);
 }
 
 // Test multidevice fusion with sharded input and replicated intermediates and
@@ -98,9 +104,10 @@ TEST_P(MultiDeviceReductionTest, ShardedInput_ReplicatedOutput) {
   auto x1 = at::randn(unsharded_input_shape, tensor_options);
   std::vector inputs = {shardTensor(x1, tv0)};
   auto x2 = x1 * 2;
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
-  testValidate(fec.fusion(), outputs, inputs, {x1, x2}, __LINE__, __FILE__);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), outputs, inputs, {x1, x2}, __LINE__, __FILE__);
 }
 
 INSTANTIATE_TEST_SUITE_P(
 
@@ -137,10 +144,10 @@ TEST_F(MultiDeviceTest, Reduction) {
   auto unsharded_in_tensor = at::randn({mesh.size(), 4}, tensor_options);
   auto in_tensor = shardTensor(unsharded_in_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
   testValidate(
-      fec.fusion(),
+      executor_cache.fusion(),
       out_tensors,
       {in_tensor},
       {unsharded_in_tensor.sum(0)},
 
@@ -172,10 +179,10 @@ TEST_F(MultiDeviceTest, Slice) {
   auto expected_out = aten_x.split(4, 2);
   std::vector inputs = {{shardTensor(aten_x, x)}};
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   testValidate(
-      fec.fusion(),
+      executor_cache.fusion(),
       outputs,
       inputs,
       {shardTensor(expected_out[0], x), shardTensor(expected_out[1], x)},
 
@@ -206,8 +213,8 @@ TEST_F(MultiDeviceTest, BackpropMeshes) {
   at::Tensor unsharded_x_tensor = at::randn({num_devices, 4}, tensor_options);
   at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor z_tensor = fec.runFusionWithInputs({x_tensor})[0];
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor z_tensor = executor_cache.runFusionWithInputs({x_tensor})[0];
   EXPECT_THAT(z_tensor.sizes(), ElementsAre(1, 4))
       << "Due to sharding propagation, z is supposed to "
       << "be sharded in the same way as x.";
 
@@ -239,11 +246,11 @@ TEST_F(MultiDeviceTest, LayerNorm) {
   auto aten_outputs =
       at::native_layer_norm(aten_x, norm_shape, aten_weight, aten_bias, kEps);
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs({aten_x});
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs({aten_x});
 
   testValidate(
-      fec.fusion(),
+      executor_cache.fusion(),
      outputs,
      {aten_x},
      {std::get<0>(aten_outputs),
 
@@ -278,14 +285,14 @@ TEST_F(MultiDeviceTest, Issue2758) {
       at::zeros({num_devices, num_devices, 4}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
 
   at::Tensor expected_out_tensor =
       shardTensor(unsharded_in_tensor.sum(0), reduce_scattered) +
       in_tensor.size(1);
   testValidate(
-      fec.fusion(),
+      executor_cache.fusion(),
       {out_tensor},
       {in_tensor},
       {expected_out_tensor},
 
@@ -314,20 +321,20 @@ TEST_F(MultiDeviceTest, Transpose) {
       at::randn({num_devices, 1024, 1024}, tensor_options);
   at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in);
 
-  FusionExecutorCache fec(std::move(fusion));
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
 
   at::Tensor expected_out_tensor =
       shardTensor(unsharded_in_tensor.transpose(1, 2), out);
   testValidate(
-      fec.fusion(),
+      executor_cache.fusion(),
       {out_tensor},
       {in_tensor},
       {expected_out_tensor},
       __LINE__,
       __FILE__);
 
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   EXPECT_THAT(
       runtime->fusionSegments()->groups(),
       UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose)));
 
@@ -365,11 +372,12 @@ TEST_P(MultiDeviceBroadcastTest, NotExpanded) {
   fusion->addInput(in);
   fusion->addOutput(out);
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({1, 8}, options);
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
-  testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+  testValidate(
+      executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
 }
 
 TEST_P(MultiDeviceBroadcastTest, Expanded) {
@@ -395,11 +403,12 @@ TEST_P(MultiDeviceBroadcastTest, Expanded) {
   fusion->addInput(in);
   fusion->addOutput(out);
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor in_tensor = at::randn({8}, options).as_strided({3, 8}, {0, 1});
-  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
-  testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+  testValidate(
+      executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
 }
 
 INSTANTIATE_TEST_SUITE_P(, MultiDeviceBroadcastTest, testing::Bool());
diff --git a/tests/cpp/test_multidevice_transformer.cpp b/tests/cpp/test_multidevice_transformer.cpp
index f9d5d96c5da..f4b999e7aec 100644
--- a/tests/cpp/test_multidevice_transformer.cpp
+++ b/tests/cpp/test_multidevice_transformer.cpp
@@ -698,9 +698,9 @@ TEST_P(DistributedTransformerTest, MLP_Layer) {
       reference_outs[2],
       reference_outs[3]};
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   at::manual_seed(getATenRandomSeed());
-  auto outputs = fec.runFusionWithInputs(inputs);
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   validate(expected_outputs, outputs, {0.01, 0.01, 0.02, 0.02});
 }
 
@@ -785,9 +785,9 @@ TEST_P(DistributedTransformerTest, Sequence_Parallel_MLP_Layer) {
       shardTensor(reference_outs[2], 0, mesh),
      shardTensor(reference_outs[3], 0, mesh)};
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   at::manual_seed(getATenRandomSeed());
-  auto outputs = fec.runFusionWithInputs(inputs);
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   validate(expected_outputs, outputs, {0.01, 0.01, 0.02, 0.02});
 }
 
@@ -846,9 +846,9 @@ TEST_P(DistributedTransformerTest, MultiheadAttention) {
       reference_outs[2],
       reference_outs[3]};
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   at::manual_seed(getATenRandomSeed());
-  auto outputs = fec.runFusionWithInputs(inputs);
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   validate(expected_outputs, outputs, {0.02, 0.02, 0.02, 0.02});
 }
 
@@ -920,8 +920,8 @@ TEST_P(DistributedTransformerTest, MLP_Backward) {
       shardTensor(outs[5], 0, mesh), // linear0 bias grad
      outs[6]}; // linear0 grad x
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   validate(expected_outputs, outputs, {1e-5, 0.2, 1e-5, 0.01, 0.2, 0.01, 0.02});
 }
 
@@ -1021,9 +1021,9 @@ TEST_P(DistributedTransformerTest, MHA_Backward) {
          .view({1, 3 * E / D}), // linear0 bias grad
      reference_outs[12]};
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   at::manual_seed(getATenRandomSeed());
-  auto out = fec.runFusionWithInputs(inputs);
+  auto out = executor_cache.runFusionWithInputs(inputs);
   validate(
      expected_outputs, out, {1e-5, 0.02, 1e-5, .01, .02, 0.2, 0.2, 0.2, 0.02});
 }
 
@@ -1146,9 +1146,9 @@ TEST_P(DistributedTransformerTest, Forward) {
   std::vector expected_outputs = {
      ln0_out_, mha_out_, ln1_out_, mlp_out_, at_out};
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   at::manual_seed(getATenRandomSeed());
-  auto outputs = fec.runFusionWithInputs(inputs);
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   validate(expected_outputs, outputs, {1e-4, 0.02, 0.04, 0.04, 0.04});
 }
 
@@ -1430,9 +1430,9 @@ TEST_P(DistributedTransformerTest, Backward) {
      shardTensor(mlp_out_[0], 1, mesh) // mlp linear1
   };
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   at::manual_seed(getATenRandomSeed());
-  auto outputs = fec.runFusionWithInputs(inputs);
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
   validate(
      expected_outputs,
      outputs,
diff --git a/tests/cpp/test_no_op.cpp b/tests/cpp/test_no_op.cpp
index a6e35e9b9ac..0b0e093767a 100644
--- a/tests/cpp/test_no_op.cpp
+++ b/tests/cpp/test_no_op.cpp
@@ -186,10 +186,11 @@ TEST_F(NoOpTest, View) {
   TensorView* out = reshape(in, in_shape, out_shape);
   fusion->addOutput(out);
 
-  FusionExecutorCache fec(std::move(fusion));
+  FusionExecutorCache executor_cache(std::move(fusion));
   at::Tensor in_tensor =
      at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0));
-  std::vector out_tensors = fec.runFusionWithInputs({in_tensor});
+  std::vector out_tensors =
+      executor_cache.runFusionWithInputs({in_tensor});
   ASSERT_EQ(out_tensors.size(), 1);
   at::Tensor out_tensor = out_tensors[0];
 
   // Verify the NoOp scheduler was kicked in.
const std::vector& groups = - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); ASSERT_EQ(groups.size(), 1); SegmentedGroup* group = groups[0]; EXPECT_EQ(group->schedulerType(), SchedulerType::NoOp); @@ -220,12 +221,13 @@ TEST_F(NoOpTest, ExpandedReduction) { out = segment_set(out); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::ones({}).cuda().as_strided({2, 3}, {0, 0}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre(HeuristicIs(SchedulerType::NoOp))); diff --git a/tests/cpp/test_persistent_buffer.cpp b/tests/cpp/test_persistent_buffer.cpp index 529423145dc..edfdd0fd082 100644 --- a/tests/cpp/test_persistent_buffer.cpp +++ b/tests/cpp/test_persistent_buffer.cpp @@ -343,8 +343,8 @@ TEST_F(PersistentBufferTest, FusionPersistentBufferProjection_CUDA) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor aten_t0 = at::randn({99, 101}, options); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({aten_t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({aten_t0}); testValidate(&fusion, cg_outputs, {aten_t0}, __LINE__, __FILE__); } @@ -611,8 +611,8 @@ TEST_F(PersistentBufferTest, FusionLayerNormFusedOpsRedundantCast_CUDA) { hidden_size * dataTypeSize(dtype), "Persistent buffer size is not correct!"); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -679,8 +679,8 @@ TEST_F(PersistentBufferTest, FusionRecomputePersistentBuffer_CUDA) { persistent_buffer_info2.persistent_buffers.size() == 1, "After project to other buffers, should have one persistent buffer!"); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -1172,10 +1172,10 @@ TEST_F(PersistentBufferTest, PostReductionBroadcastCheck) { auto t1 = at::randn({dim0, dim1}, options); auto t2 = at::sum(t0, {1}).unsqueeze(1) + t0; auto t4 = t2 + t1; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); NVF_CHECK( - !fec.getMostRecentKernelRuntime()->isSegmented(), + !executor_cache.getMostRecentKernelRuntime()->isSegmented(), "unexpected segmentation!"); testValidate(fusion, cg_outputs, {t0, t1}, {t4}, __LINE__, __FILE__); @@ -1211,10 +1211,10 @@ 
TEST_F(PersistentBufferTest, PostReductionBroadcastCheckMultiBcastDims) { auto t1 = at::randn({dim0, dim1, dim2}, options); auto t2 = at::sum(t0, {1, 2}).unsqueeze(-1).unsqueeze(-1) + t0; auto t4 = t2 + t1; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); NVF_CHECK( - !fec.getMostRecentKernelRuntime()->isSegmented(), + !executor_cache.getMostRecentKernelRuntime()->isSegmented(), "unexpected segmentation!"); testValidate(fusion, cg_outputs, {t0, t1}, {t4}, __LINE__, __FILE__); @@ -1243,15 +1243,16 @@ TEST_F(PersistentBufferTest, SmemPersistentNotSupportedIn3DReduction) { .device(at::kCUDA, 0); auto t0 = at::randn(input_shape, options); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector aten_inputs = {t0}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); // should be segmented since buffer size is larger than 32K and smem // persistent is not supported yet for 3D reduction. - EXPECT_TRUE(fec.getMostRecentKernelRuntime()->isSegmented()); + EXPECT_TRUE(executor_cache.getMostRecentKernelRuntime()->isSegmented()); - testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); } TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { @@ -1297,10 +1298,10 @@ TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { scheduler->schedule(fusion.get(), heuristic_params.get()); // Run the fusion and validate the results - FusionExecutor fe; - fe.compileFusion(fusion.get(), aten_inputs); + KernelExecutor ke; + ke.compileFusion(fusion.get(), aten_inputs); // Shared memory access should be vectorized. 
-  // getBankConflictInfo(fe.kernel()) triggers error "std::get: wrong index for
+  // getBankConflictInfo(ke.kernel()) triggers error "std::get: wrong index for
   // variant" when trying to evaluate index with:
   // `expr_eval.evaluate(ti->index()).as<int64_t>();`
   for (auto tv : fusion->allTvs()) {
@@ -1313,7 +1314,7 @@ TEST_F(PersistentBufferTest, SmemPersistent2DReduction) {
       }
     }
   }
-  auto cg_outputs = fe.runFusion(
+  auto cg_outputs = ke.runFusion(
      aten_inputs, heuristic_params->as<ReductionParams>()->lparams);
   auto t1 = t0 / t0.sum({1, 2, 3}, true);
   testValidate(fusion.get(), cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__);
diff --git a/tests/cpp/test_pointwise.cpp b/tests/cpp/test_pointwise.cpp
index 552cb18f3a8..a854ac75900 100644
--- a/tests/cpp/test_pointwise.cpp
+++ b/tests/cpp/test_pointwise.cpp
@@ -23,8 +23,8 @@ using PointwiseTest = NVFuserTest;
 namespace {

-int64_t getVecSizeForPointwise(const FusionExecutorCache& fec) {
-  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
+int64_t getVecSizeForPointwise(const FusionExecutorCache& executor_cache) {
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
   NVF_CHECK(!runtime->isSegmented());
   const PointwiseParams* params = runtime->schedulerHeuristics()
                                       ->heuristicsList()
@@ -62,7 +62,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity2D) {
   auto tv1 = add(tv0, tv0);
   fusion->addOutput(tv1);

-  FusionExecutorCache fec(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));

   std::vector<std::pair<int64_t, int64_t>> size_and_vec{{17, 1}, {18, 2}, {32, 4}};

@@ -71,9 +71,9 @@
     auto vec = pair.second;
     auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
     at::Tensor input0 = at::randn({1000000, size}, options).narrow(1, 0, 16);
-    auto cg_outputs = fec.runFusionWithInputs({input0});
+    auto cg_outputs = executor_cache.runFusionWithInputs({input0});

-    EXPECT_EQ(getVecSizeForPointwise(fec), vec);
+    EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec);

     testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__);
   }
@@ -90,7 +90,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity3D) {
   auto tv1 = add(tv0, tv0);
   fusion->addOutput(tv1);

-  FusionExecutorCache fec(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));

   std::vector<std::pair<int64_t, int64_t>> size_and_vec{{17, 1}, {10, 2}, {16, 4}};

@@ -99,9 +99,9 @@
     auto vec = pair.second;
     auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
     at::Tensor input0 = at::randn({1000000, size, 3}, options).narrow(1, 0, 8);
-    auto cg_outputs = fec.runFusionWithInputs({input0});
+    auto cg_outputs = executor_cache.runFusionWithInputs({input0});

-    EXPECT_EQ(getVecSizeForPointwise(fec), vec);
+    EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec);

     testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__);
   }
@@ -120,7 +120,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity5D) {
   auto tv1 = add(tv0, tv0);
   fusion->addOutput(tv1);

-  FusionExecutorCache fec(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));

   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

@@ -134,9 +134,9 @@
   at::Tensor input0 = at::randn({4, size1, 12345, size2, 3}, options)
                           .narrow(1, 0, 8)
                           .narrow(3, 0, 4);
-    auto cg_outputs = fec.runFusionWithInputs({input0});
+    auto cg_outputs = executor_cache.runFusionWithInputs({input0});

-    EXPECT_EQ(getVecSizeForPointwise(fec), 
vec); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -158,7 +158,7 @@ TEST_F(PointwiseTest, VectorizeStrideMisalignedBase) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -195,8 +195,8 @@ TEST_F(PointwiseTest, VectorizeStrideMisalignedBase) { at::Tensor flat = at::randn({alloc_size}, options); at::Tensor input0 = flat.as_strided(shape, stride, /*storage_offset=*/align); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } } @@ -214,7 +214,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguitySelfOverlapping) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -241,8 +241,8 @@ TEST_F(PointwiseTest, VectorizeStrideContiguitySelfOverlapping) { stride1, (int64_t)stride2 * 12345, (int64_t)stride2, 3, 1}; at::Tensor input0 = at::empty_strided(shape, stride, options); input0.random_(); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } } @@ -262,13 +262,13 @@ TEST_F(PointwiseTest, VectorizeAllocationDomain) { tv1->setAllocationDomain({tv1->axis(0), tv1->axis(2), tv1->axis(1)}, true); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::empty_strided({1024, 128, 25}, {128 * 25, 1, 128}, options); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), 4); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), 4); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -407,7 +407,7 @@ TEST_F(PointwiseTest, Issue1567VectorizationFactorAnalysisCase2) { auto tv3 = transpose(tv2, 0, 1); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1024, 1, 2}, options); @@ -444,7 +444,7 @@ TEST_F(PointwiseTest, VIssue1567ectorizationFactorAnalysisCase3) { auto tv3 = transpose(tv2, 0, 1); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1, 1024, 2}, options); @@ -549,9 +549,9 @@ TEST_F(PointwiseTest, ShardedPointwise) { unsharded_pparams->flip_grid_binding); pwise_scheduler->schedule(&sharded_fusion, sharded_params.get()); - FusionExecutor fe; - fe.compileFusion(&sharded_fusion, sharded_inputs, sharded_params->lparams); - 
auto cg_outputs = fe.runFusion(sharded_inputs, sharded_params->lparams); + KernelExecutor ke; + ke.compileFusion(&sharded_fusion, sharded_inputs, sharded_params->lparams); + auto cg_outputs = ke.runFusion(sharded_inputs, sharded_params->lparams); testValidate( &sharded_fusion, cg_outputs, sharded_inputs, __LINE__, __FILE__); } @@ -659,11 +659,12 @@ TEST_F(PointwiseTest, VectorizeWithExpandedBroadcast) { auto in_tensor = at::randn({kTensorSize}, options).as_strided({2, kTensorSize}, {0, 1}); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - EXPECT_GT(getVecSizeForPointwise(fec), 1); + EXPECT_GT(getVecSizeForPointwise(executor_cache), 1); } using VectUnrollFactors = std::tuple; @@ -705,10 +706,10 @@ TEST_P(PointwiseParamsTest, UnrollOnTopOfVectorize) { // Schedule, compile, run, validate scheduler_instance->schedule(fusion.get(), pparams); - FusionExecutor fe; - fe.compileFusion(fusion.get(), runtime_inputs, pparams->lparams); - auto cg_outputs = fe.runFusion(runtime_inputs, pparams->lparams); - const auto& lparams = fe.lastLaunchParams(); + KernelExecutor ke; + ke.compileFusion(fusion.get(), runtime_inputs, pparams->lparams); + auto cg_outputs = ke.runFusion(runtime_inputs, pparams->lparams); + const auto& lparams = ke.lastLaunchParams(); ASSERT_EQ(lparams.gdimy(), dim0 / unroll_outer); ASSERT_EQ( lparams.gdimx(), dim1 / vect_factor / lparams.bdimx() / unroll_inner); diff --git a/tests/cpp/test_predicate_elimination.cpp b/tests/cpp/test_predicate_elimination.cpp index 8b941f7e0e6..2e6f3a409a1 100644 --- a/tests/cpp/test_predicate_elimination.cpp +++ b/tests/cpp/test_predicate_elimination.cpp @@ -77,9 +77,9 @@ TEST_F(PredicateEliminationTest, 2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = (t0 + 1).sum({1}) + 1; @@ -127,9 +127,9 @@ TEST_F(PredicateEliminationTest, 3) { for (auto size : {1, 2, 999, 1001, 1234, 10000}) { auto t0 = at::randn({size}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = sum(t0) + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); @@ -180,9 +180,9 @@ TEST_F(PredicateEliminationTest, 4) { for (auto s1 : sizes) { auto t0 = at::randn({s0, s1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto t1 = t0.sum({1}); auto t3 = t1.sum({0}) + 1; @@ -228,9 +228,9 @@ TEST_F(PredicateEliminationTest, 5) { for (auto s0 : sizes) { auto t0 = at::randn({s0}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); auto ref = t0.mean({0}); @@ -277,9 +277,9 @@ TEST_F(PredicateEliminationTest, 6) { auto 
options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -313,9 +313,9 @@ TEST_F(PredicateEliminationTest, 7) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({123}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -382,12 +382,12 @@ TEST_F(PredicateEliminationTest, 8) { at::Tensor aten_t3 = at::randn(full_size, options); // tv0 - 3 at::Tensor aten_t4 = at::randn({channel_size}, options); // tv4 - 4 - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = - fec.runFusionWithInputs({aten_t0, aten_t1, aten_t2, aten_t3, aten_t4}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs( + {aten_t0, aten_t1, aten_t2, aten_t3, aten_t4}); const auto& compiled_executors = - fec.getMostRecentKernelRuntime()->executors(); + executor_cache.getMostRecentKernelRuntime()->executors(); NVF_CHECK(compiled_executors.size() == 1, "Unexpected scheduling"); NVF_CHECK( !PredicatedChecker::isPredicated(tv6, compiled_executors.at(0).kernel()), @@ -431,9 +431,9 @@ TEST_F(PredicateEliminationTest, 9) { // with TIDx in this tensor EXPECT_TRUE(PredicatedChecker::isPredicated(tv1, gpulw)); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -470,16 +470,16 @@ TEST_F(PredicateEliminationTest, ExtentEqualToMaxParallelTypeExtent) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({10 * 32}, options); - FusionExecutor fe; - fe.registerLoweringHook([&](GpuLower* lower) { + KernelExecutor ke; + ke.registerLoweringHook([&](GpuLower* lower) { lower->passes().insert( lower->passes().begin(), {"validate_smem_predicate_elimination", validate_smem_predicate_elimination}); }); - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_preseg_passes.cpp b/tests/cpp/test_preseg_passes.cpp index ca554ca0532..1a34d693c53 100644 --- a/tests/cpp/test_preseg_passes.cpp +++ b/tests/cpp/test_preseg_passes.cpp @@ -635,11 +635,12 @@ TEST_F(PresegTest, ReplaceOutput) { TensorView* y = add(x, x); fusion->replaceOutput(x, y); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({10}, at::device(at::kCUDA)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(PresegTest, 
ExtentSubstitution) { diff --git a/tests/cpp/test_replay.cpp b/tests/cpp/test_replay.cpp index 1bb5c460bef..76f1271907a 100644 --- a/tests/cpp/test_replay.cpp +++ b/tests/cpp/test_replay.cpp @@ -46,8 +46,9 @@ TEST_F(ReplayTest, HorizontallyMergeReshapeAndPermute) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); auto out_tensor = out_tensors[0]; @@ -85,8 +86,9 @@ TEST_F(ReplayTest, HorizontallyMergeReshapeAndNeg) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); auto out_tensor = out_tensors[0]; diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index e794559216e..3a627f8a292 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -63,9 +63,9 @@ TEST_P(ResizeTest, Pad1) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -99,9 +99,9 @@ TEST_P(ResizeTest, Pad2) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -152,9 +152,9 @@ TEST_P(ResizeTest, Pad3) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -186,9 +186,9 @@ TEST_P(ResizeTest, Pad4) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -241,9 +241,9 @@ TEST_P(ResizeTest, Pad5) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -292,9 +292,9 @@ TEST_P(ResizeTest, Pad6) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto 
cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -343,9 +343,9 @@ TEST_P(ResizeTest, Pad7) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -388,9 +388,9 @@ TEST_F(ResizeTest, Pad8) { auto t0 = at::randn(999, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0, {0, 1}) + at::pad(t0, {1, 0}); @@ -613,9 +613,9 @@ TEST_F(ResizeTest, Cat1) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -645,9 +645,9 @@ TEST_F(ResizeTest, Cat2) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -686,9 +686,9 @@ TEST_F(ResizeTest, Cat3) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::cat({t0, t1}, 1); @@ -730,9 +730,9 @@ TEST_F(ResizeTest, Cat4) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::cat({t0, t1}, 1); @@ -779,9 +779,9 @@ TEST_F(ResizeTest, Cat5) { auto t2 = at::randn(shape2, options); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -823,9 +823,9 @@ TEST_F(ResizeTest, Cat6) { auto t2 = at::randn(shape2, options); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::cat({t0, t1, t2}, 0); @@ -879,9 +879,9 @@ TEST_F(ResizeTest, Cat7) { std::vector aten_inputs_ivalue( {aten_inputs.begin(), aten_inputs.end()}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs_ivalue); - auto cg_outputs = fe.runFusion(aten_inputs_ivalue); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs_ivalue); + auto cg_outputs = 
ke.runFusion(aten_inputs_ivalue); auto ref = at::cat(aten_inputs, concat_dim); @@ -1013,9 +1013,9 @@ TEST_F(ResizeTest, Slice1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -1044,9 +1044,9 @@ TEST_F(ResizeTest, Slice2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1142,9 +1142,9 @@ TEST_F(ResizeTest, Slice4) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = (t0 + 1).to(at::kDouble).sum({1}); @@ -1197,9 +1197,9 @@ TEST_F(ResizeTest, Slice5) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto t1 = t0.index( {at::indexing::Slice(0, at::indexing::None), @@ -1249,9 +1249,9 @@ TEST_F(ResizeTest, SliceConstantShmoo) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1294,13 +1294,13 @@ TEST_F(ResizeTest, SliceInputShmoo) { !fusion.hasDynamicTransform(), "Expected to have no dynamic transform"); } - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); auto t0 = at::randn(shape, options); for (auto [start, stop] : slice_cases) { std::vector aten_inputs({t0, start, stop}); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1328,14 +1328,15 @@ TEST_F(ResizeTest, SliceInputShmooFusionExecutorCache) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto t0 = at::randn(shape, options); for (auto [start, stop] : slice_cases) { std::vector aten_inputs({t0, start, stop}); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); } } @@ -1755,9 +1756,9 @@ TEST_P(ResizeTest, PadWithValue) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + 
KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0, {1, 1}, "constant", 2); @@ -1830,9 +1831,9 @@ TEST_P(ResizeTest, PadHalfWithDoubleValue) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0, {1, 1}, "constant", 2.5); @@ -2186,7 +2187,7 @@ TEST_F(ResizeTest, FusionSizeZeroSliceSplitSchedule) { FusionExecutorCache executor_cache(std::move(fusion)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionExecutor fe; + KernelExecutor ke; auto ref0 = t0.index({at::indexing::Slice(0, 2)}); auto ref1 = t0.index({at::indexing::Slice(2, 4)}); @@ -2228,15 +2229,15 @@ TEST_F(ResizeTest, FusionSizeZeroSliceSplit) { tv1->merge(0, 1); // size 0*5 = 0 tv1->split(0, 4); // sizes (0, 4) - FusionExecutor fe; - fe.compileFusion(fusion.get()); + KernelExecutor ke; + ke.compileFusion(fusion.get()); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref0 = t0.index({at::indexing::Slice(2, 2), at::indexing::Slice(0, 5)}); @@ -2267,7 +2268,7 @@ TEST_F(ResizeTest, FusionSqueezeSymbolic) { // tv1 is of shape {0, 5} fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); @@ -2275,14 +2276,14 @@ TEST_F(ResizeTest, FusionSqueezeSymbolic) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0, 20}); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto ref0 = t0.flatten(); NVF_CHECK(ref0.equal(cg_outputs[0])); EXPECT_THAT( - [&]() { fec.runFusionWithInputs({t0, 10}); }, + [&]() { executor_cache.runFusionWithInputs({t0, 10}); }, ThrowsMessage( HasSubstr("must concretize to IterType::Broadcast but found"))); } @@ -2680,9 +2681,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2733,9 +2734,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref_t1 = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2784,9 +2785,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual3) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto 
ref = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2823,9 +2824,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) { auto t0_unaligned = at::randn(shape, options); auto t0_aligned = t0_unaligned.index({at::indexing::Slice(3, -1)}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}); - auto cg_outputs = fe.runFusion({t0_aligned}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0_aligned}); + auto cg_outputs = ke.runFusion({t0_aligned}); auto ref_aligned = t0_aligned.index({at::indexing::Slice(1, -3)}); @@ -2867,9 +2868,9 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = t0.index( {at::indexing::Slice(slice_offset, shape[0] - slice_offset), @@ -2917,11 +2918,11 @@ TEST_F(ResizeTest, Slice3DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); EXPECT_THAT( - [&]() { fe.runFusion(aten_inputs); }, + [&]() { ke.runFusion(aten_inputs); }, ThrowsMessage( HasSubstr("with word size 2 not possible due to invalid stride"))); } @@ -2960,11 +2961,11 @@ TEST_F(ResizeTest, Slice3DVectorizeManual2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); EXPECT_THAT( - [&]() { fe.runFusion(aten_inputs); }, + [&]() { ke.runFusion(aten_inputs); }, ThrowsMessage( HasSubstr("with word size 4 not possible due to invalid stride"))); } @@ -3041,9 +3042,9 @@ TEST_F(ResizeTest, SliceAndReshapeRepro540Manual) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); for (const auto i : c10::irange(3)) { auto slice_out_ref = t0.index( @@ -3086,23 +3087,23 @@ TEST_P(ResizeTest, ReshapeToPad) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({4, 3}, options); std::vector aten_inputs = {at_x, 1, 1, 3, 4}; auto at_y = at::pad(at_x.reshape({3, 4}), {0, 1, 0, 1}); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // Assert that we segmented into two segments auto seg_fusion = - fusion_executor_cache.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); EXPECT_TRUE(seg_fusion->isSegmented()); EXPECT_EQ(seg_fusion->groups().size(), 2); testValidate( - fusion_executor_cache.fusion(), + executor_cache.fusion(), outputs, aten_inputs, {at_y}, @@ -3131,23 +3132,23 @@ TEST_F(ResizeTest, ReshapeToSlice) { auto tv2 = slice(tv1, {{fusion.zeroVal(), s0}, {fusion.zeroVal(), s1}}); fusion.addOutput(tv2); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + 
FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({4, 3}, options); std::vector aten_inputs = {at_x, 3, 2, 3, 4}; auto at_y = at::slice(at::slice(at_x.reshape({3, 4}), 0, 0, 3), 1, 0, 2); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // Assert that we segmented into two segments auto seg_fusion = - fusion_executor_cache.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); EXPECT_TRUE(seg_fusion->isSegmented()); EXPECT_EQ(seg_fusion->groups().size(), 2); testValidate( - fusion_executor_cache.fusion(), + executor_cache.fusion(), outputs, aten_inputs, {at_y}, @@ -3179,9 +3180,9 @@ TEST_F(ResizeTest, CatOfBroadcast) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -3216,9 +3217,9 @@ TEST_F(ResizeTest, CatOfExpandedBroadcast) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::cat({at::expand_copy(t0, shape0e), t1}, 0); @@ -3302,9 +3303,9 @@ TEST_P(ResizeTest, PadOfBroadcast) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3338,9 +3339,9 @@ TEST_P(ResizeTest, PadOfExpandedBroadcast) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3374,7 +3375,7 @@ TEST_F(ResizeTest, DynamicReshapeIssue1393) { auto tv4 = expand(tv3, {s0, s1, s3}); fusion->addOutput(tv4); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3}, options).as_strided({3, 4}, {1, 0}); @@ -3382,7 +3383,7 @@ TEST_F(ResizeTest, DynamicReshapeIssue1393) { auto ref = t0.add(t1).as_strided({3, 4, 5}, {4, 1, 0}); std::vector aten_inputs({t0, t1}); - auto outputs = fec.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__); } @@ -3424,13 +3425,18 @@ TEST_F(ResizeTest, SqueezeSlicedExpand) { auto t0 = at::randn(shape0, options); std::vector aten_inputs({t0}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = 
executor_cache.runFusionWithInputs(aten_inputs); auto ref = at::squeeze(at::slice(t0, 1, 2, 3), 1); testValidate( - fec.fusion(), cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__); + executor_cache.fusion(), + cg_outputs, + aten_inputs, + {ref}, + __LINE__, + __FILE__); } // Vectorization through resize is not supported yet. Make sure @@ -3602,14 +3608,18 @@ TEST_F(ResizeTest, Issue2552) { TensorView* z = add(x, y); fusion->addOutput(z); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor x_tensor = at::randn({1, 3}, options); at::Tensor y_tensor = at::randn({1, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({x_tensor, y_tensor}); + executor_cache.runFusionWithInputs({x_tensor, y_tensor}); testValidate( - fec.fusion(), out_tensors, {x_tensor, y_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {x_tensor, y_tensor}, + __LINE__, + __FILE__); } TEST_F(ResizeTest, Chunk_NegativeSize) { @@ -3623,11 +3633,11 @@ TEST_F(ResizeTest, Chunk_NegativeSize) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); EXPECT_THAT( [&]() { auto in_tensor = at::randn({13}).cuda(); - fec.runFusionWithInputs({in_tensor}); + executor_cache.runFusionWithInputs({in_tensor}); }, ThrowsMessage(HasSubstr("Invalid resized domain extent"))); } @@ -3643,10 +3653,11 @@ TEST_F(ResizeTest, Chunk_SizeZero) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto in_tensor = at::randn({15}).cuda(); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors.back().numel(), 0); } @@ -3662,10 +3673,11 @@ TEST_F(ResizeTest, Chunk_Uneven) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto in_tensor = at::randn({16}).cuda(); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors.back().numel(), 1); } @@ -3715,9 +3727,9 @@ TEST_F(ResizeTest, SliceScheduledLikeProducer) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -3763,9 +3775,9 @@ TEST_F(ResizeTest, PadScheduledLikeConsumer) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0 + 1, {1, 1}) + 1; @@ -3815,9 +3827,9 @@ TEST_F(ResizeTest, SliceThenPadLeftHalf) { auto t0 = at::randn(shape, options); std::vector 
aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad( t0.index({at::indexing::Slice(0, shape[0] / 2)}), {0, shape[0] / 2}); @@ -3870,9 +3882,9 @@ TEST_F(ResizeTest, SliceThenPadRightHalf) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad( t0.index({at::indexing::Slice(shape[0] / 2, shape[0])}), @@ -3934,9 +3946,9 @@ TEST_F(ResizeTest, SliceThenConcat) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); EXPECT_TRUE(t0.equal(cg_outputs[0])); } @@ -4028,9 +4040,9 @@ TEST_F(ResizeTest, SliceSliceConcatConcat) { auto t0 = at::randn({i0}, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::concat( {at::slice(t0, 0, 0, rope_size / 2) + 1, @@ -4066,9 +4078,9 @@ TEST_F(ResizeTest, VectorizePadLowering) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); auto ref = at::pad(t0, {4, 4}); ASSERT_TRUE(ref.equal(cg_outputs[0])); @@ -4102,9 +4114,9 @@ TEST_F(ResizeTest, VectorizeWhereLowering) { auto t0 = at::randn(shape, options); std::vector aten_inputs({at::Scalar(false), t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); + auto cg_outputs = ke.runFusion(aten_inputs); // Note: we cannot use at::where, because aten only support tensor as // predicate. 
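Before the RNG hunks that follow, it is worth spelling out the two execution APIs this rename sweeps across, since every remaining hunk is an instance of one or the other. Below is a minimal sketch assembled only from calls that already appear in this patch; the fusion body, the tensor shapes, and the `makeContigTensor` helper are illustrative assumptions, not code from any of the files being edited.

// Path 1: FusionExecutorCache owns the fusion, segments and schedules it
// automatically, and caches compiled kernels across runs.
TEST_F(NVFuserTest, ExecutorCacheSketch) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* tv0 = makeContigTensor(2); // assumed test-utils helper
  fusion->addInput(tv0);
  fusion->addOutput(add(tv0, tv0));

  FusionExecutorCache executor_cache(std::move(fusion));
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({8, 8}, options);
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
  testValidate(executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__);
}

// Path 2: KernelExecutor (the new name for FusionExecutor) compiles exactly
// the fusion it is handed; any scheduling must happen before compileFusion,
// and launch parameters, when needed, are passed explicitly to runFusion.
TEST_F(NVFuserTest, KernelExecutorSketch) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeContigTensor(2); // assumed test-utils helper
  fusion.addInput(tv0);
  fusion.addOutput(add(tv0, tv0));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({8, 8}, options);

  KernelExecutor ke;
  ke.compileFusion(&fusion, {t0});
  auto cg_outputs = ke.runFusion({t0});
  testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}

The cache path is what the pointwise, persistent-buffer, and resize tests above mostly exercise; the KernelExecutor path shows up wherever a test pins a particular scheduler and forwards its launch params by hand, as in `ke.runFusion(aten_inputs, lparams)`.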
diff --git a/tests/cpp/test_rng.cpp b/tests/cpp/test_rng.cpp index fb05848a86a..725da0c43de 100644 --- a/tests/cpp/test_rng.cpp +++ b/tests/cpp/test_rng.cpp @@ -80,18 +80,23 @@ TEST_F(RNGTest, ValidateWithCURand) { fusion->addOutput(tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size}); + auto cg_outputs = executor_cache.runFusionWithInputs({size}); at::manual_seed(0); auto ref0 = generate_uniform(size, at::kFloat); auto ref1 = generate_uniform(size, at::kDouble); testValidate( - fec.fusion(), cg_outputs, {size}, {ref0, ref1}, __LINE__, __FILE__); + executor_cache.fusion(), + cg_outputs, + {size}, + {ref0, ref1}, + __LINE__, + __FILE__); } } @@ -116,11 +121,11 @@ TEST_F(RNGTest, ManualScheduleValidateWithCURand) { auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({size}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(fusion, {t0}); at::manual_seed(0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -154,11 +159,11 @@ TEST_F(RNGTest, ManualScheduleValidateWithCURand2) { /*maybe_symbolic=*/false); fusion->addOutput(tv0); - FusionExecutor fe; - fe.compileFusion(fusion, {10, 10, 10, 10}); + KernelExecutor ke; + ke.compileFusion(fusion, {10, 10, 10, 10}); at::manual_seed(0); - auto cg_outputs = fe.runFusion({10, 10, 10, 10}); + auto cg_outputs = ke.runFusion({10, 10, 10, 10}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -182,13 +187,13 @@ TEST_F(RNGTest, BroadcastingRNG) { auto tv4 = add(tv0, tv3); fusion->addOutput(tv4); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({5, 1}, options); at::Tensor t1 = at::zeros({5, 5}, options); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); auto out = cg_outputs[0]; NVF_CHECK((out.select(1, 0) == out.select(1, 1)).all().item()) NVF_CHECK((out.select(1, 0) == out.select(1, 2)).all().item()) @@ -212,20 +217,21 @@ TEST_F(RNGTest, BroadcastingRNG2) { auto tv3 = add(tv1, tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({1}, options); at::Tensor t1 = at::zeros({size}, options); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); auto out = cg_outputs[0]; at::manual_seed(0); auto ref = generate_uniform(1, dtype).expand_as(t1); - testValidate(fec.fusion(), {out}, {t0, t1}, {ref}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out}, {t0, t1}, {ref}, __LINE__, __FILE__); } } } @@ -287,9 +293,9 @@ TEST_F(RNGTest, BroadcastingRNGSmemNonSquareTile) { SchedulerEntry::makeSchedulerInstance(SchedulerType::Transpose) ->schedule(fusion, &tparams); - FusionExecutor fe; - fe.compileFusion(fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, 
t1}); auto out = cg_outputs[0]; NVF_CHECK((out.select(1, 0) == out.select(1, 1)).all().item()); @@ -314,18 +320,18 @@ TEST_F(RNGTest, Uniform) { fusion->addOutput(tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size, -1.0, 1.0}); + auto cg_outputs = executor_cache.runFusionWithInputs({size, -1.0, 1.0}); at::manual_seed(0); auto ref0 = generate_uniform(size, at::kFloat) * 2 - 1; auto ref1 = generate_uniform(size, at::kDouble) * 2 - 1; testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, {size, -1.0, 1.0}, {ref0, ref1}, @@ -354,11 +360,11 @@ TEST_F(RNGTest, Normal) { fusion->addOutput(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size, 1.0, 0.5}); + auto cg_outputs = executor_cache.runFusionWithInputs({size, 1.0, 0.5}); at::manual_seed(0); auto ref0 = generate_normal(size, at::kFloat) * 0.5f + 1.0f; @@ -367,7 +373,7 @@ TEST_F(RNGTest, Normal) { auto ref3 = generate_normal(size, at::kDouble); testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, {size, 1.0, 0.5}, {ref0, ref1, ref2, ref3}, @@ -389,13 +395,13 @@ TEST_F(RNGTest, RandLikeReduction) { auto tv3 = add(tv1, tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({2, 3}, options); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({t0}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -403,7 +409,7 @@ TEST_F(RNGTest, RandLikeReduction) { auto t2 = generate_uniform(3, dtype).expand_as(t1); auto t3 = t1.add(t2); - testValidate(fec.fusion(), {out}, {t0}, {t3}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), {out}, {t0}, {t3}, __LINE__, __FILE__); } //! 
This is the same as the Uniform test, but we compare against @@ -447,7 +453,7 @@ TEST_F(RNGTest, FunctionalUniform) { fusion->addOutput(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); @@ -465,7 +471,7 @@ TEST_F(RNGTest, FunctionalUniform) { std::vector aten_inputs({size, -1.0, 1.0, 0, 0}); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); std::vector aten_outputs; if (do_stochastic) { @@ -475,7 +481,7 @@ TEST_F(RNGTest, FunctionalUniform) { } testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, aten_inputs, aten_outputs, @@ -514,7 +520,7 @@ TEST_F(RNGTest, DifferentOffsets) { fusion->addOutput(tv0); } - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::unique_ptr fusion_ptr2 = std::make_unique(); { @@ -533,7 +539,7 @@ TEST_F(RNGTest, DifferentOffsets) { for (int64_t size : {1, 4}) { at::manual_seed(0); EXPECT_TRUE(get_current_offset() == 0); - auto r1 = fec.runFusionWithInputs({size}).at(0); + auto r1 = executor_cache.runFusionWithInputs({size}).at(0); EXPECT_TRUE(get_current_offset() == 4); auto r23 = fec2.runFusionWithInputs({size}); auto r2 = r23.at(0); diff --git a/tests/cpp/test_scalar_hoisting.cpp b/tests/cpp/test_scalar_hoisting.cpp index 6aa08c52b53..54aa0ae406a 100644 --- a/tests/cpp/test_scalar_hoisting.cpp +++ b/tests/cpp/test_scalar_hoisting.cpp @@ -213,9 +213,9 @@ TEST_F(ScalarHoistTest, IndexHoist1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({15, 17}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -257,9 +257,9 @@ TEST_F(ScalarHoistTest, IndexHoist2) { auto t0 = at::randn({16}, options); auto t1 = at::randn({16}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = ke.runFusion({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -290,9 +290,9 @@ TEST_F(ScalarHoistTest, IndexHoist3) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::arange(10000, options).view({100, 100}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {t0}); + auto cg_outputs = ke.runFusion({t0}); const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T2) { @@ -369,9 +369,9 @@ TEST_F(ScalarHoistTest, ARange) { int64_t start = 0, end = 100, step = 1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {start, end, step}); - auto cg_outputs = fe.runFusion({start, end, step}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {start, end, step}); + auto cg_outputs = ke.runFusion({start, end, step}); const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(int64_t i0, int64_t i1, int64_t i2, Tensor T0, Tensor T1) { diff --git a/tests/cpp/test_scatter_gather.cpp b/tests/cpp/test_scatter_gather.cpp index 
67237e0b5e4..27fd3857ce9 100644 --- a/tests/cpp/test_scatter_gather.cpp +++ b/tests/cpp/test_scatter_gather.cpp @@ -586,10 +586,10 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorPointwise1) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -621,11 +621,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorPointwise2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -655,11 +655,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction1) { auto t1 = at::randint(0, shape[0], {2}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction, SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); @@ -695,11 +695,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise, SchedulerType::Reduction}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); @@ -734,11 +734,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction3) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -776,11 +776,11 @@ TEST_F(ScatterGatherTest, DISABLED_TakeAlongAxisIntermediateTensorReduction4) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + 
auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -814,11 +814,12 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization1) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto t0_d = t0.to(at::kDouble); auto ref = at::take_along_dim( @@ -857,11 +858,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise, SchedulerType::InnerPersistent}); auto t5 = at::take_along_dim(t0.to(at::kDouble) + 1, t1.unsqueeze(-1), 1) @@ -902,11 +903,12 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization3) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto t3 = at::take_along_dim(t0.to(at::kDouble) + 1, t1, 1); auto ref = t3 / t3.sum({1}).unsqueeze(-1); @@ -943,13 +945,13 @@ TEST_F( auto t1 = at::randint(0, shape[1], {shape[0], 1}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // The reduction patterns of the normalization and the final // reduction are different, so they are segmented out validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent, SchedulerType::Reduction}); auto t0_d = t0.to(at::kDouble); @@ -995,11 +997,12 @@ TEST_F( auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto 
t0_d = t0.to(at::kDouble); auto t6 = at::take_along_dim( @@ -1045,11 +1048,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose1) { auto t1 = at::randint(0, shape[0], {shape[1], shape[2]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Transpose}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Transpose}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1088,11 +1091,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose2) { auto t1 = at::randint(0, shape[0], {10, shape[2], shape[1]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1133,13 +1136,13 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose3) { auto t1 = at::randint(0, shape_before[2], shape_after, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // Transpose scheduler should work for this case but not currently // supported validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1188,11 +1191,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisCrossEntropyLoss) { auto t1 = at::randint(371, {128}, options).to(at::ScalarType::Long); std::vector inputs({t0, t1}); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); - auto kernel_runtime = fec.getMostRecentKernelRuntime(); + auto kernel_runtime = executor_cache.getMostRecentKernelRuntime(); validateSegmentation( kernel_runtime, @@ -1290,10 +1293,10 @@ TEST_F(ScatterGatherTest, GatherIterGoupedReduction) { " grouped iterations, found ", gpulw.kernel()->summary().num_grouped_iterations); - FusionExecutor fe; + KernelExecutor ke; auto lparams = rparams->lparams; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + ke.compileFusion(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.runFusion(aten_inputs, lparams); auto t_gather = at::gather(input, dim, input_idx); testValidate( diff --git a/tests/cpp/test_sdpa_node.cpp b/tests/cpp/test_sdpa_node.cpp index 772945f0909..b63986d5a0b 100644 --- a/tests/cpp/test_sdpa_node.cpp +++ b/tests/cpp/test_sdpa_node.cpp @@ -252,8 +252,8 @@ TEST_F(SDPATest, NonCausalAttnConcrete) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto 
nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -299,8 +299,8 @@ TEST_F(SDPATest, NonCausalAttnSymbolic) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -345,8 +345,8 @@ TEST_F(SDPATest, CausalAttn) { /*return_debug_mask=*/false, /*scale=*/1e-3); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -496,8 +496,8 @@ TEST_F(SDPATest, NonCausalAttnConcreteBwd) { std::vector sdpa_bwd_inputs = { grad_out, q, k, v, output, log_sumexp, philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = at::_scaled_dot_product_flash_attention_backward( @@ -518,7 +518,7 @@ TEST_F(SDPATest, NonCausalAttnConcreteBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query, ref_grad_key, ref_grad_value}, @@ -605,8 +605,8 @@ TEST_F(SDPATest, NonCausalAttnSymbolicBwd) { std::vector sdpa_bwd_inputs = { grad_out, q, k, v, output, log_sumexp, philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = at::_scaled_dot_product_flash_attention_backward( @@ -627,7 +627,7 @@ TEST_F(SDPATest, NonCausalAttnSymbolicBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query, ref_grad_key, ref_grad_value}, @@ -683,8 +683,8 @@ TEST_F(SDPATest, AttnProgram) { scale); auto expected_out = (std::get<0>(aten_outputs).to(at::kFloat)) * 2.0; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({q, k, v}); EXPECT_TRUE(at::allclose(out[0], expected_out)); } @@ -744,8 +744,8 @@ TEST_F(SDPATest, AttnFwdBwd) { at::Tensor v = at::randn(v_shape, options).set_requires_grad(true); at::Tensor grad_out = at::randn(attn_shape, options); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v, grad_out}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v, grad_out}); auto attn = at::scaled_dot_product_attention( q, @@ -761,7 +761,7 @@ TEST_F(SDPATest, AttnFwdBwd) { attn.backward(grad_out); testValidate( - fec.fusion(), + executor_cache.fusion(), nvf_out, {q, k, v, grad_out}, {attn, q.grad(), k.grad(), v.grad()}, @@ -824,9 +824,9 @@ TEST_F(SDPATest, Sharded_SdpaFwd) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out 
= - fec.runFusionWithInputs({q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs( + {q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -928,8 +928,8 @@ TEST_F(SDPATest, Sharded_SdpaBwd) { philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = at::_scaled_dot_product_flash_attention_backward( @@ -950,7 +950,7 @@ TEST_F(SDPATest, Sharded_SdpaBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query.unsqueeze(0), @@ -1016,9 +1016,9 @@ TEST_F(SDPATest, ComputeAt) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = - fec.runFusionWithInputs({q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs( + {q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); validateSdpaFwdOutputs(nvf_out, aten_out); } diff --git a/tests/cpp/test_segmentation.cpp b/tests/cpp/test_segmentation.cpp index c3a69b09cbb..b893c5c29ad 100644 --- a/tests/cpp/test_segmentation.cpp +++ b/tests/cpp/test_segmentation.cpp @@ -45,10 +45,10 @@ TEST_F(SegmentationTest, Issue1284_Repro1) { at::Tensor at_in_1 = at::randn(input_shape_1, options); std::vector aten_inputs = {at_in_0, at_in_1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); testValidate(&fusion, outputs, {at_in_0, at_in_1}, __LINE__, __FILE__); @@ -84,10 +84,10 @@ TEST_F(SegmentationTest, Issue1284_Repro2) { std::vector aten_inputs = {at_in_0, at_in_1, at_in_2}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); testValidate( @@ -147,12 +147,14 @@ TEST_F(SegmentationTest, SegmentHintOnNonTerminatingOutput) { fusion->addOutput(add_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); // Segment 1: in -> add_out (defined by 
segment_set) // Segment 2: add_out -> mul_out EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); @@ -195,18 +197,19 @@ TEST_F(SegmentationTest, EnforceSegmentationByCachingBeforeAndAfter) { } } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); testValidate( - fec.fusion(), + executor_cache.fusion(), out_tensors, {in_tensor}, {in_tensor / in_tensor.sum({0})}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); } @@ -225,10 +228,12 @@ TEST_F(SegmentationTest, SetAllocationDomainOnSegmentBoundary) { add_out->setAllocationDomain( {add_out->axis(0), add_out->axis(1), add_out->axis(2)}, false); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, InputForwardingUntilBinary) { @@ -254,16 +259,20 @@ TEST_F(SegmentationTest, InputForwardingUntilBinary) { z = segment_set(z); fusion->addOutput(z); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({in_tensor, in_tensor}); + executor_cache.runFusionWithInputs({in_tensor, in_tensor}); testValidate( - fec.fusion(), out_tensors, {in_tensor, in_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {in_tensor, in_tensor}, + __LINE__, + __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); } @@ -285,14 +294,18 @@ TEST_F(SegmentationTest, InputForwardingUntilOutput) { fusion->addOutput(out0); fusion->addOutput(out1); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({in_tensor, in_tensor}); + executor_cache.runFusionWithInputs({in_tensor, in_tensor}); testValidate( - fec.fusion(), out_tensors, {in_tensor, in_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {in_tensor, in_tensor}, + __LINE__, + __FILE__); } TEST_F(SegmentationTest, ForwardedExprsAreNotMergeable) { @@ -308,9 +321,10 @@ TEST_F(SegmentationTest, ForwardedExprsAreNotMergeable) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in_tensor = at::randn({10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache 
executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, ForwardedExprsAreReplicated) { @@ -328,9 +342,10 @@ TEST_F(SegmentationTest, ForwardedExprsAreReplicated) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in_tensor = at::randn({10, 20}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, ForceFp16Simple) { @@ -356,18 +371,18 @@ TEST_F(SegmentationTest, ForceFp16Simple) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{15, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); // Check the segmented edge is fp16 SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); for (SegmentedEdge* edge : segmented_fusion->edges()) { auto* edge_tv = edge->val->as(); EXPECT_EQ(edge_tv->getDataType(), DataType::Half); @@ -406,18 +421,18 @@ TEST_F(SegmentationTest, ForceBf16Simple) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{15, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); // Check the segmented edge is bf16 SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); for (SegmentedEdge* edge : segmented_fusion->edges()) { auto* edge_tv = edge->val->as(); EXPECT_EQ(edge_tv->getDataType(), DataType::BFloat16); @@ -452,17 +467,17 @@ TEST_F(SegmentationTest, ForceFp16NotAllCast) { fusion->addOutput(tv7); fusion->addOutput(tv8); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{16, 16, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); Fusion* complete_fusion = segmented_fusion->completeFusion(); // Check that the edge that wasn't fp16 is the producer of the @@ -513,17 +528,17 @@ TEST_F(SegmentationTest, ForceBf16NotAllCast) { fusion->addOutput(tv7); fusion->addOutput(tv8); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{16, 16, 16}; auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); Fusion* complete_fusion = segmented_fusion->completeFusion(); // Check that the edge that wasn't fp16 is the producer of the @@ -558,14 +573,14 @@ TEST_F(SegmentationTest, SliceSegmentCasts) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto in0 = at::randn({5}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); ASSERT_EQ(segmented_fusion->edges().size(), 1); @@ -579,7 +594,7 @@ TEST_F(SegmentationTest, SliceSegmentCasts) { // There should be no cast before the slice EXPECT_TRUE(slice_edge->val->uses().at(0)->isA()); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); } TEST_F(SegmentationTest, codeGenSupportedMergeIssue1970) { @@ -596,13 +611,13 @@ TEST_F(SegmentationTest, codeGenSupportedMergeIssue1970) { auto* tv3 = segment_set(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 4, 3}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); } // Test that Reduction axes are removed in segmentation edges @@ -622,15 +637,16 @@ TEST_F(SegmentationTest, EraseReductionsInSegmentationEdges) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 32, 17}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); - const FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* runtime = + executor_cache.getMostRecentKernelRuntime(); ASSERT_TRUE(runtime != nullptr); SegmentedFusion* segmented_fusion = runtime->fusionSegments(); @@ -662,18 +678,18 @@ TEST_F(SegmentationTest, AliasedOutputOnSegmentation) { auto* tv2 = relu(seg_out); fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 3, 4}, options); auto in0_ref = in0.clone(); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); auto in0_neg = in0_ref.neg(); EXPECT_TRUE(in0_neg.allclose(in0)); 
testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, {in0.clone()}, {in0_neg.relu()}, @@ -693,14 +709,15 @@ TEST_F(SegmentationTest, MultipleSegmentSetsInOneSegment) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({10}, options); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT(runtime->fusionSegments()->groups(), SizeIs(2)); } @@ -723,10 +740,12 @@ TEST_F(SegmentationTest, ForwardInputsToSegmenterSetIssue2658) { fusion->addOutput(permute_out); fusion->addOutput(compute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_serial_gridreduce.cpp b/tests/cpp/test_serial_gridreduce.cpp index 15d035a75d6..b62f8a02e3b 100644 --- a/tests/cpp/test_serial_gridreduce.cpp +++ b/tests/cpp/test_serial_gridreduce.cpp @@ -116,15 +116,15 @@ TEST_F(SerialGridReductionTest, Scheduling) { inlineMost(); - FusionExecutor fe; + KernelExecutor ke; if (serial) { tv3->definition()->as()->requestSerialGridReduction(); } - fe.compileFusion(fusion); + ke.compileFusion(fusion); auto input = at::randn( {H, W}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); - auto outputs = fe.runFusion({input}); + auto outputs = ke.runFusion({input}); if (serial) { // Verify that zeroed semaphore memory was reused instead of diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp index c813bde5225..5ada8115a8b 100644 --- a/tests/cpp/test_sharding.cpp +++ b/tests/cpp/test_sharding.cpp @@ -155,9 +155,9 @@ TEST_P(ShardingTest, ComputeIndex) { // Dimension 2 has size 1 because that dimension is DIDx parallelized. 
auto a_tensor = at::randn({4, 2, 1, 5}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {a_tensor}); - auto outputs = fe.runFusion({a_tensor}); + KernelExecutor ke; + ke.compileFusion(fusion.get(), {a_tensor}); + auto outputs = ke.runFusion({a_tensor}); testValidate(fusion.get(), outputs, {a_tensor}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_smem_reuse.cpp b/tests/cpp/test_smem_reuse.cpp index 295fd3c2345..03f31a79afa 100644 --- a/tests/cpp/test_smem_reuse.cpp +++ b/tests/cpp/test_smem_reuse.cpp @@ -556,9 +556,9 @@ TEST_F(SmemReuseTest, SmemReuseWithDifferentVectorizationFactor) { } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n_element}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get()); + auto cg_outputs = ke.runFusion({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -616,9 +616,9 @@ TEST_F(SmemReuseTest, RegisterReuseWithDifferentVectorizationFactor) { // run the fusion auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n_element}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get()); + auto cg_outputs = ke.runFusion({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); }; @@ -677,9 +677,9 @@ TEST_F(SmemReuseTest, ExpandInterferes) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({y}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compileFusion(fusion.get()); + auto cg_outputs = ke.runFusion({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); }; diff --git a/tests/cpp/test_swizzle.cpp b/tests/cpp/test_swizzle.cpp index f2cb00546eb..9499276732a 100644 --- a/tests/cpp/test_swizzle.cpp +++ b/tests/cpp/test_swizzle.cpp @@ -54,12 +54,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle0) { auto str = ir_utils::toString(exprs); NVF_CHECK(str.find("where") != std::string::npos); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -93,12 +93,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle1) { // Inlining a producer into a swizzled consumer is ok tv1->computeAt(tv2, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -150,12 +150,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle2) { } } - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -279,12 +279,12 @@ 
TEST_F(LegacySwizzleTest, LoopSwizzle0) { tv0->computeAt(tv2, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -314,12 +314,12 @@ TEST_F(LegacySwizzleTest, LoopSwizzle1) { tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(1)->parallelize(ParallelType::BIDy); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({45, 77}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -349,8 +349,8 @@ TEST_F(LegacySwizzleTest, LoopSwizzleCheck0) { tv0->computeAt(tv2, -1); - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + KernelExecutor ke; + ASSERT_ANY_THROW(ke.compileFusion(&fusion)); } // Test assertion in unsupported pattern: half-inlined loop swizzle. @@ -381,8 +381,8 @@ TEST_F(LegacySwizzleTest, LoopSwizzleCheck1) { // Make tv2 swizzled and partially-inlined (unsupported). tv0->computeAt(tv3, -2); - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + KernelExecutor ke; + ASSERT_ANY_THROW(ke.compileFusion(&fusion)); } TEST_F(LegacySwizzleTest, SwizzleVectorize) { @@ -528,8 +528,8 @@ at::Tensor getSwizzledTensor( fusion.addOutput(swizzle.first); fusion.addOutput(swizzle.second); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({size_x, size_y}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs({size_x, size_y}); return input.index_put({outputs[0], outputs[1]}, input); } @@ -615,9 +615,9 @@ TEST_F(LegacySwizzleTest, SwizzleIndexing170) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t = at::randn({64, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(&fusion); + auto outputs = ke.runFusion({t}); testValidate(&fusion, outputs, {t}, __LINE__, __FILE__); } @@ -678,9 +678,9 @@ TEST_F(LegacySwizzleTest, SwizzleInProducerProjection) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({32, 64}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(fusion.get()); + auto outputs = ke.runFusion({t}); auto expect = at::empty_like(t); for (auto i : c10::irange(t.size(0) / 8)) { @@ -735,10 +735,10 @@ TEST_F(SwizzleTest, Transpose1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({10240, 10240}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t}); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + std::vector outputs = ke.runFusion({t}); EXPECT_TRUE(at::equal(t.t(), outputs[0])); } diff --git a/tests/cpp/test_tensor_factories.cpp b/tests/cpp/test_tensor_factories.cpp index 8c3b2462ca7..68e8b4bc4f5 100644 --- 
a/tests/cpp/test_tensor_factories.cpp +++ b/tests/cpp/test_tensor_factories.cpp @@ -352,9 +352,9 @@ TEST_F(TensorFactoryTest, TensorConstruct) { auto output = tensor(std::vector>{{i00, i01}, {i10, i11}}); fusion->addOutput(output); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({00, 01, 10, 11}); + KernelExecutor ke; + ke.compileFusion(fusion.get()); + auto cg_outputs = ke.runFusion({00, 01, 10, 11}); testValidate(fusion.get(), cg_outputs, {00, 01, 10, 11}, __LINE__, __FILE__); } @@ -403,9 +403,9 @@ TEST_F(TensorFactoryTest, MetadataAsTensor) { auto input0 = at::randn({2, 3, 4, 5}, options); auto input1 = at::randn({6, 7, 8, 9}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({input0, input1}); + KernelExecutor ke; + ke.compileFusion(fusion.get()); + auto cg_outputs = ke.runFusion({input0, input1}); testValidate(fusion.get(), cg_outputs, {input0, input1}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_translate_mma.cpp b/tests/cpp/test_translate_mma.cpp index 420bdf1a760..dca5ccfac1e 100644 --- a/tests/cpp/test_translate_mma.cpp +++ b/tests/cpp/test_translate_mma.cpp @@ -229,11 +229,11 @@ TEST_P(CombineMulSumAsMmaTestWithLayout, AmpereMulSumToMatmul_Schedule) { auto inputs = matmulAtInput2D(M, N, K, layout); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compileFusion( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); diff --git a/tests/cpp/test_tutorial.cpp b/tests/cpp/test_tutorial.cpp index 17e93767a1f..3089c6e4df3 100644 --- a/tests/cpp/test_tutorial.cpp +++ b/tests/cpp/test_tutorial.cpp @@ -82,12 +82,12 @@ TEST_F(Tutorial, Memcpy) { std::vector aten_inputs = {t0}; // Next, lower the fusion to Kernel, generate CUDA kernel source and then - // compile it with nvrtc. All of them are done by FusionExecutor - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + // compile it with nvrtc. All of them are done by KernelExecutor + KernelExecutor ke; + ke.compileFusion(&fusion, aten_inputs); - // FusionExecutor now has a compiled kernel, which can be executed as: - std::vector outputs = fe.runFusion(aten_inputs); + // KernelExecutor now has a compiled kernel, which can be executed as: + std::vector outputs = ke.runFusion(aten_inputs); // Note that this run is done using just one thread, which will be // corrected below. @@ -158,15 +158,15 @@ TEST_F(Tutorial, Memcpy) { } // Since the fusion is modified, we need to recompile it. - FusionExecutor fe2; - fe2.compileFusion(&fusion, aten_inputs); + KernelExecutor ke2; + ke2.compileFusion(&fusion, aten_inputs); // This time, the kernel is launched with multiple threads and // thread blocks. Note that the launch configurations, i.e., the // thread block and grid shapes, are autoatically inferred from the // given inputs. 
To see how many threads are used, run this test // with NVFUSER_DUMP=launch_param - outputs = fe2.runFusion(aten_inputs); + outputs = ke2.runFusion(aten_inputs); ASSERT_TRUE(outputs[0].equal(t0)); } @@ -205,9 +205,9 @@ TEST_F(Tutorial, Reduction) { at::Tensor ref = t0.sum({1}); { - FusionExecutor fe; - fe.compileFusion(&fusion); - std::vector outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion); + std::vector outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -221,9 +221,9 @@ TEST_F(Tutorial, Reduction) { } { - FusionExecutor fe; - fe.compileFusion(&fusion); - std::vector outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compileFusion(&fusion); + std::vector outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -239,19 +239,19 @@ TEST_F(Tutorial, Reduction) { } { - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); // Running this fusion, however, should fail as it would require // thread blocks of shape 1024x10, i.e., the same shape as the // input tensor, which is too large in CUDA. // // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.runFusion(aten_inputs)); // Try again with a smaller input. This should launch a kernel // with thread blocks of shape 32x10 at::Tensor t1 = at::randn({10, 32}, options); - std::vector outputs = fe.runFusion({t1}); + std::vector outputs = ke.runFusion({t1}); testValidate( &fusion, outputs, aten_inputs, {t1.sum({1})}, __LINE__, __FILE__); } @@ -266,13 +266,13 @@ TEST_F(Tutorial, Reduction) { } { - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compileFusion(&fusion); // The original input should not fail in this case. The kernel // will be launched with 10 thread blocks, each of which has 1024 // threads. Try running this test with NVFUSER_DUMP=launch_param // to see the launch configuration of each kernel lauch - std::vector outputs = fe.runFusion(aten_inputs); + std::vector outputs = ke.runFusion(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } } @@ -380,13 +380,13 @@ TEST_F(Tutorial, ReductionRFactor) { std::vector aten_inputs = {t0}; at::Tensor ref = t0.sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion_copy); + KernelExecutor ke; + ke.compileFusion(&fusion_copy); // Since the size of the input is 10000, which is split by a // factor of 1024, the first per-thread reduction is done for // ceilDiv(10000, 1024) = 10 elements. 
- std::vector outputs = fe.runFusion(aten_inputs); + std::vector outputs = ke.runFusion(aten_inputs); testValidate(&fusion_copy, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -439,10 +439,10 @@ TEST_F(Tutorial, ReductionRFactor) { std::vector aten_inputs = {t0}; at::Tensor ref = t0.sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion_copy); + KernelExecutor ke; + ke.compileFusion(&fusion_copy); - std::vector outputs = fe.runFusion(aten_inputs); + std::vector outputs = ke.runFusion(aten_inputs); testValidate(&fusion_copy, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } } @@ -786,9 +786,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.runFusion({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -870,9 +870,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.runFusion({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -953,9 +953,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.runFusion({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1033,9 +1033,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.runFusion({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1138,9 +1138,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.runFusion({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1244,9 +1244,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.runFusion({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } } @@ -1343,10 +1343,10 @@ TEST_F(Tutorial, VectorizeStorePointwiseTMA) { at::Tensor at_tv0 = at::randn({dim0, dim1}, options); at::Tensor at_tv1 = at::randn({dim0, dim1}, options); - // Compile with FusionExecutor 
directly to avoid scheduling - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); - auto outputs = fe.runFusion({at_tv0, at_tv1}); + // Compile with KernelExecutor directly to avoid scheduling + KernelExecutor ke; + ke.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); + auto outputs = ke.runFusion({at_tv0, at_tv1}); auto at_output = at_tv0 + at_tv1; testValidate( @@ -1447,10 +1447,10 @@ TEST_F(Tutorial, PointwiseBroadcastTMA) { at::Tensor at_tv0 = at::randn({dim1, dim2, dim3}, options); at::Tensor at_tv1 = at::randn({dim0, dim1, dim2, dim3}, options); - // Compile with FusionExecutor directly to avoid scheduling - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); - auto outputs = fe.runFusion({at_tv0, at_tv1}); + // Compile with KernelExecutor directly to avoid scheduling + KernelExecutor ke; + ke.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); + auto outputs = ke.runFusion({at_tv0, at_tv1}); auto at_output = at_tv0 + at_tv1; testValidate( @@ -1551,10 +1551,10 @@ TEST_F(Tutorial, TMABankConflictFreeTranspose) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({10000, 10000}, options); - FusionExecutor fe; + KernelExecutor ke; CompileParams index32bit{DataType::Int32, 255, false}; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + ke.compileFusion(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.runFusion({t}); ASSERT_TRUE(at::equal(t.t(), outputs[0])); } diff --git a/tests/cpp/test_unary.cpp b/tests/cpp/test_unary.cpp index 9455230683b..76dfd789b39 100644 --- a/tests/cpp/test_unary.cpp +++ b/tests/cpp/test_unary.cpp @@ -57,13 +57,18 @@ TEST_P(UnaryTest, Neg) { in_tensor = at::randn(shape, options); } - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); // Calculate the reference output explicitly. Type promotion happens when // building the fusion, e.g., inside `neg`. Relying ExpresionEvaluator to // verify the result would hide type promotion errors. 
testValidate( - fec.fusion(), out_tensors, {in_tensor}, {-in_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {in_tensor}, + {-in_tensor}, + __LINE__, + __FILE__); } namespace { diff --git a/tests/cpp/test_utils.cpp b/tests/cpp/test_utils.cpp index 5215e58f19d..174c9ef367d 100644 --- a/tests/cpp/test_utils.cpp +++ b/tests/cpp/test_utils.cpp @@ -1115,16 +1115,16 @@ TEST_F(NVFuserTest, FusionSASSDumpError) { at::Tensor t0 = at::randn({8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.disassembledKernelSASS(); }, + [&]() { ke.disassembledKernelSASS(); }, ::testing::ThrowsMessage( ::testing::HasSubstr("I am fake"))); - auto cg_outputs = fe.runFusion({t0}); - testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); + auto cg_outputs = ke.runFusion({t0}); + testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); } TEST_F(NVFuserTest, ProveLinearAndGetStride) { diff --git a/tests/cpp/utils.cpp b/tests/cpp/utils.cpp index ca29c5fe1bf..dca4d80ae20 100644 --- a/tests/cpp/utils.cpp +++ b/tests/cpp/utils.cpp @@ -24,15 +24,13 @@ CGResultsPackage scheduleAndRun( bool validate_scheduler) { auto heuristic_params = SchedulerEntry::scheduleWith( fusion, scheduler_type, runtime_inputs, validate_scheduler); - auto fusion_executor = std::make_unique(); - fusion_executor->compileFusion( - fusion, runtime_inputs, heuristic_params->lparams); - auto cg_outputs = - fusion_executor->runFusion(runtime_inputs, heuristic_params->lparams); + auto ke = std::make_unique(); + ke->compileFusion(fusion, runtime_inputs, heuristic_params->lparams); + auto cg_outputs = ke->runFusion(runtime_inputs, heuristic_params->lparams); CGResultsPackage results = { .outputs = cg_outputs, .heuristic_params = std::move(heuristic_params), - .fusion_executor = std::move(fusion_executor)}; + .kernel_executor = std::move(ke)}; return results; } diff --git a/tests/cpp/utils.h b/tests/cpp/utils.h index 648b85dbe55..d2964c8b731 100644 --- a/tests/cpp/utils.h +++ b/tests/cpp/utils.h @@ -40,12 +40,12 @@ namespace nvfuser { struct CGResultsPackage { std::vector outputs; std::unique_ptr heuristic_params; - std::unique_ptr fusion_executor; + std::unique_ptr kernel_executor; }; // Grabs heuristics and schedules with the provided scheduler type, compiles and // runs with Fuion executor, returns a struct containing the outputs, -// heuristic_params, and FusionExecutor. These structures are for convenience in +// heuristic_params, and KernelExecutor. These structures are for convenience in // testing. If validate_scheduler is set to false the scheduler check will still // be run but it will be ignored. Otherwise canScheduler returning false will // throw. 
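The utils.h comment above spells out the scheduleAndRun contract that this rename touches. A minimal usage sketch, not taken from this patch (the pointwise scheduler choice, the Fusion object named fusion, the input tensor t0, and the explicit validate_scheduler argument are all illustrative assumptions), showing how a test would reach the renamed kernel_executor member:

  // Schedule with the pointwise scheduler, then compile and run the fusion.
  auto cg_results = scheduleAndRun(
      &fusion, SchedulerType::PointWise, {t0}, /*validate_scheduler=*/true);
  // The results struct now exposes the compiled executor under its new name.
  EXPECT_TRUE(cg_results.kernel_executor->isCompiled());
  // Outputs come back alongside the heuristic parameters and the executor.
  testValidate(&fusion, cg_results.outputs, {t0}, __LINE__, __FILE__);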
diff --git a/tools/examples/repro.cpp index a1e123dd3aa..53058ace4ab 100644 --- a/tools/examples/repro.cpp +++ b/tools/examples/repro.cpp @@ -103,7 +103,7 @@ TEST_F(NVFuserTest, FusionGeneratedTest_CUDA) { outputs.push_back(t32); } - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } From 951dde65fcbfc9af428c62f3a10476b07c224ad5 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 7 Nov 2024 00:11:11 -0800 Subject: [PATCH 18/27] Renaming from #3263 part2 (#3362) Follow-up to #3349 `KernelExecutor::compileFusion` -> `KernelExecutor::compile` `KernelExecutor::runFusion` -> `KernelExecutor::run` --- benchmarks/cpp/gelu_backward.cpp | 16 +- benchmarks/cpp/indexselect.cpp | 6 +- benchmarks/cpp/lstm_cell.cpp | 16 +- benchmarks/cpp/matmul.cpp | 12 +- benchmarks/cpp/softmax.cpp | 4 +- benchmarks/cpp/utils.cpp | 8 +- csrc/host_ir/executor.cpp | 4 +- csrc/python_frontend/fusion_definition.cpp | 8 +- csrc/runtime/executor.cpp | 4 +- csrc/runtime/executor.h | 22 +- csrc/runtime/fusion_kernel_runtime.cpp | 4 +- examples/sinh_extension/main.cpp | 4 +- examples/sinh_libtorch/main.cpp | 4 +- tests/cpp/test_alias.cpp | 8 +- tests/cpp/test_alias_analysis.cpp | 4 +- tests/cpp/test_allocation_domain.cpp | 98 ++-- tests/cpp/test_circular_buffering.cpp | 128 ++--- .../test_combined_inner_outer_reduction.cpp | 14 +- tests/cpp/test_dynamic_transform.cpp | 8 +- tests/cpp/test_gpu1.cpp | 369 +++++++------ tests/cpp/test_gpu2.cpp | 498 +++++++++--------- tests/cpp/test_gpu3.cpp | 370 +++++++------ tests/cpp/test_gpu_compute_with.cpp | 24 +- tests/cpp/test_gpu_fused_reduction.cpp | 132 ++--- tests/cpp/test_gpu_indexing_ops.cpp | 4 +- tests/cpp/test_gpu_outer_reduction.cpp | 46 +- tests/cpp/test_gpu_transpose.cpp | 8 +- tests/cpp/test_gpu_view.cpp | 38 +- tests/cpp/test_indexing.cpp | 68 +-- tests/cpp/test_indexing_advanced.cpp | 84 +-- tests/cpp/test_inlining.cpp | 16 +- tests/cpp/test_loop_domain_scheduling.cpp | 12 +- tests/cpp/test_loop_rotation.cpp | 24 +- tests/cpp/test_matmul.cpp | 166 +++--- tests/cpp/test_matmul_sass.cpp | 8 +- tests/cpp/test_matmul_scheduler.cpp | 44 +- tests/cpp/test_mbarrier.cpp | 4 +- tests/cpp/test_memory.cpp | 200 +++--- tests/cpp/test_mma.cpp | 24 +- tests/cpp/test_persistent_buffer.cpp | 6 +- tests/cpp/test_pointwise.cpp | 8 +- tests/cpp/test_predicate_elimination.cpp | 32 +- tests/cpp/test_resize.cpp | 176 +++--- tests/cpp/test_rng.cpp | 12 +- tests/cpp/test_scalar_hoisting.cpp | 16 +- tests/cpp/test_scatter_gather.cpp | 8 +- tests/cpp/test_serial_gridreduce.cpp | 4 +- tests/cpp/test_sharding.cpp | 4 +- tests/cpp/test_smem_reuse.cpp | 12 +- tests/cpp/test_swizzle.cpp | 36 +- tests/cpp/test_tensor_factories.cpp | 8 +- tests/cpp/test_translate_mma.cpp | 4 +- tests/cpp/test_tutorial.cpp | 70 +-- tests/cpp/test_utils.cpp | 4 +- tests/cpp/utils.cpp | 4 +- 55 files changed, 1447 insertions(+), 1468 deletions(-) diff --git a/benchmarks/cpp/gelu_backward.cpp index 24cb5fa72f6..ae1e0ce2473 100644 --- a/benchmarks/cpp/gelu_backward.cpp +++ b/benchmarks/cpp/gelu_backward.cpp @@ -163,7 +163,7 @@ static void NvFuserScheduler_GeluBackward_Compile( for (auto _ : benchmark_state) { KernelExecutor ke; - ke.compileFusion(&fusion, inputs, heuristic_params->lparams); + ke.compile(&fusion,
inputs, heuristic_params->lparams); } } @@ -188,13 +188,13 @@ static void NvFuserScheduler_GeluBackward_RunFusion( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); KernelExecutor ke; - ke.compileFusion(&fusion, inputs, heuristic_params->lparams); + ke.compile(&fusion, inputs, heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); for (auto _ : benchmark_state) { - outputs = ke.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); clearL2Cache(); } @@ -219,7 +219,7 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); KernelExecutor ke; - ke.compileFusion(&fusion, inputs, heuristic_params->lparams); + ke.compile(&fusion, inputs, heuristic_params->lparams); runBenchmarkIterations( benchmark_state, &ke, inputs, heuristic_params->lparams); @@ -249,11 +249,11 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly( KernelExecutor ke; ke.setExecuteKernelFlag(false); - ke.compileFusion(&fusion, inputs, heuristic_params->lparams); + ke.compile(&fusion, inputs, heuristic_params->lparams); for (auto _ : benchmark_state) { - outputs = ke.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); } } diff --git a/benchmarks/cpp/indexselect.cpp b/benchmarks/cpp/indexselect.cpp index 24eeb31679e..01eefc2d0a1 100644 --- a/benchmarks/cpp/indexselect.cpp +++ b/benchmarks/cpp/indexselect.cpp @@ -133,7 +133,7 @@ static void NvFuserScheduler_IndexSelect_Compile( for (auto _ : benchmark_state) { KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, c10::ArrayRef(inputs), heuristic_params->lparams); } } @@ -156,7 +156,7 @@ static void NvFuserScheduler_IndexSelect_RunFusion( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); @@ -164,7 +164,7 @@ static void NvFuserScheduler_IndexSelect_RunFusion( at::Tensor output = at::empty_like(inputs[0].toTensor()); for (auto _ : benchmark_state) { - ke.runFusion( + ke.run( c10::ArrayRef(inputs), {output}, heuristic_params->lparams); diff --git a/benchmarks/cpp/lstm_cell.cpp b/benchmarks/cpp/lstm_cell.cpp index 71a2dbc6cba..7fe205f6312 100644 --- a/benchmarks/cpp/lstm_cell.cpp +++ b/benchmarks/cpp/lstm_cell.cpp @@ -156,7 +156,7 @@ static void NvFuserScheduler_LstmCell_Compile( for (auto _ : benchmark_state) { KernelExecutor ke; - ke.compileFusion(&fusion, inputs); + ke.compile(&fusion, inputs); } } @@ -183,13 +183,13 @@ static void NvFuserScheduler_LstmCell_RunFusion( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); KernelExecutor ke; - ke.compileFusion(&fusion, inputs); + ke.compile(&fusion, inputs); C10_CUDA_CHECK(cudaDeviceSynchronize()); for (auto _ : benchmark_state) { - outputs = ke.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); } } @@ -221,7 +221,7 @@ static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); KernelExecutor ke; - ke.compileFusion(&fusion, inputs); + ke.compile(&fusion, inputs); runBenchmarkIterations( benchmark_state, &ke, inputs, heuristic_params->lparams); @@ -261,11 +261,11 @@ static void 
NvFuserScheduler_LstmCell_RunFusion_CpuOnly( KernelExecutor ke; ke.setExecuteKernelFlag(false); - ke.compileFusion(&fusion, inputs); + ke.compile(&fusion, inputs); for (auto _ : benchmark_state) { - outputs = ke.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); } } diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp index d48bc250ea6..a9b10655aa0 100644 --- a/benchmarks/cpp/matmul.cpp +++ b/benchmarks/cpp/matmul.cpp @@ -176,7 +176,7 @@ static void SingleMatmulBase( // Compile kernel auto launch_constraints = LaunchParams(); KernelExecutor ke; - ke.compileFusion(fusion, args, launch_constraints, cparams); + ke.compile(fusion, args, launch_constraints, cparams); NVF_CHECK( getBankConflictInfo(ke.kernel(), launch_constraints).empty(), "Shared memory bank conflict not removed."); @@ -184,7 +184,7 @@ static void SingleMatmulBase( std::vector aten_inputs({inputs.first, inputs.second}); // Warm up run - auto outputs = ke.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); checkMatch(expected_output, outputs.at(0).to(at::kDouble), k); runBenchmarkIterations(benchmark_state, &ke, aten_inputs); @@ -357,13 +357,13 @@ static void SingleMatmulPartitionedK( // Compile kernel KernelExecutor ke; auto lparams = LaunchParams(); - ke.compileFusion(fusion, args, lparams, cparams); + ke.compile(fusion, args, lparams, cparams); NVF_CHECK( getBankConflictInfo(ke.kernel(), lparams).empty(), "Shared memory bank conflict not removed."); // Warm up run - auto outputs = ke.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); checkMatch(expected_output, outputs.at(0).to(at::kDouble), Ki); @@ -462,7 +462,7 @@ static void NvFuserScheduler_MatmulSplitKReduction( // Compile kernel KernelExecutor ke; - ke.compileFusion( + ke.compile( fusion, args, heuristic_params->lparams, heuristic_params->cparams); NVF_CHECK( @@ -470,7 +470,7 @@ static void NvFuserScheduler_MatmulSplitKReduction( "Shared memory bank conflict not removed."); // Warm up run - auto outputs = ke.runFusion(aten_inputs, heuristic_params->lparams); + auto outputs = ke.run(aten_inputs, heuristic_params->lparams); checkMatch(expected_output, outputs.at(0).to(at::kDouble), splitk_factor); diff --git a/benchmarks/cpp/softmax.cpp b/benchmarks/cpp/softmax.cpp index ba6b707dd33..90b30c9ff54 100644 --- a/benchmarks/cpp/softmax.cpp +++ b/benchmarks/cpp/softmax.cpp @@ -106,7 +106,7 @@ static void NvFuserScheduler_Softmax_WarpReduceReference( scheduler->schedule(fusion, heuristic_params.get()); KernelExecutor ke; - ke.compileFusion(fusion, aten_inputs); + ke.compile(fusion, aten_inputs); runBenchmarkIterations(benchmark_state, &ke, aten_inputs); @@ -153,7 +153,7 @@ static void NvFuserScheduler_Softmax_WarpReduce( } KernelExecutor ke; - ke.compileFusion(fusion, aten_inputs); + ke.compile(fusion, aten_inputs); runBenchmarkIterations(benchmark_state, &ke, aten_inputs); diff --git a/benchmarks/cpp/utils.cpp b/benchmarks/cpp/utils.cpp index 54fa56a063d..613a3cbb2ef 100644 --- a/benchmarks/cpp/utils.cpp +++ b/benchmarks/cpp/utils.cpp @@ -230,8 +230,8 @@ int64_t runBenchmarkIterations( int64_t io_bytes = getSizeOfInputs(aten_inputs); { // Warm-up run - auto cg_outputs = fusion_executor->runFusion( - aten_inputs, launch_constraints, compile_params); + auto cg_outputs = + fusion_executor->run(aten_inputs, launch_constraints, compile_params); io_bytes += getSizeOfOutputs(cg_outputs); } @@ -246,8 +246,8 @@ int64_t runBenchmarkIterations( clearL2Cache(); 
FusionProfiler::start(); FusionProfiler::createSegments(1); - auto cg_outputs = fusion_executor->runFusion( - aten_inputs, launch_constraints, compile_params); + auto cg_outputs = + fusion_executor->run(aten_inputs, launch_constraints, compile_params); FusionProfiler::stop(); benchmark_state.SetIterationTime( FusionProfiler::profile().kernel_time_ms / 1000.0); diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 1ef0e81b3e3..c3132067067 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -157,9 +157,9 @@ void HostIrExecutor::handle(PostOnStream* post_ir) { if (!ke.isCompiled()) { Fusion* fusion = hu->fusion_to_execute(); DynamicTransform::concretizeFusion(fusion, input_IValues); - ke.compileFusion(fusion, input_IValues); + ke.compile(fusion, input_IValues); } - outputs = ke.runFusion(input_IValues); + outputs = ke.run(input_IValues); if (!params_.cache_fusion_executor) { fe_.erase(hu); } diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index 05f12a7c2af..dc1af7a9f5c 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -370,18 +370,18 @@ std::vector<at::Tensor> FusionDefinition::execute( if (user_sched.heuristic_params == nullptr) { // Manual schedule if (!user_sched.executor->isCompiled()) { - user_sched.executor->compileFusion( + user_sched.executor->compile( user_sched.scheduled_fusion.get(), inputs, user_sched.fusion_id_, user_sched.device_id_); } - outputs = user_sched.executor->runFusion(inputs); + outputs = user_sched.executor->run(inputs); } else { // Automatic scheduler was used for UserSchedule. - // Pass launch and compile params to compileFusion and runFusion. + // Pass launch and compile params to compile and run. if (!user_sched.executor->isCompiled()) { - user_sched.executor->compileFusion( + user_sched.executor->compile( user_sched.scheduled_fusion.get(), KernelArgumentHolder::createKernelArgumentHolder( inputs, getCommonDeviceCUDA(inputs)), @@ -391,7 +391,7 @@ std::vector<at::Tensor> FusionDefinition::execute( user_sched.fusion_id_, user_sched.device_id_); } - outputs = user_sched.executor->runFusion( + outputs = user_sched.executor->run( inputs, user_sched.heuristic_params->lparams, user_sched.heuristic_params->cparams); diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 90eb036073c..a7ded48889a 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -185,7 +185,7 @@ std::string KernelExecutor::getStructuredCode() const { return getStructuredCode(kernelString(), kernel()->indexType()); } -void KernelExecutor::compileFusion( +void KernelExecutor::compile( Fusion* fusion, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, @@ -1137,7 +1137,7 @@ at::Tensor findBufferForFusionOutput( } } // namespace -std::vector<at::Tensor> KernelExecutor::runFusion( +std::vector<at::Tensor> KernelExecutor::run( KernelArgumentHolder& args, const LaunchParams& launch_constraints, CompileParams compile_params, diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 28742b3e9f3..0b6d27fb752 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -42,7 +42,7 @@ class KernelExecutor : public NonCopyable { //! To compile a fusion with the 32-bit index type, CompileParams //! must be passed in. There used to be an index type associated //! with KernelArgumentHolder, but it is no longer the case.
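For orientation, a minimal sketch of the call pattern this rename produces (illustrative only, not part of the patch; assumes a fusion that is already built and scheduled, with includes elided):

    Fusion fusion;
    FusionGuard fg(&fusion);
    TensorView* tv0 = makeContigTensor(2); // 2-D float input
    fusion.addInput(tv0);
    fusion.addOutput(add(tv0, IrBuilder::create<Val>(1.0)));

    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor t0 = at::randn({128, 128}, options);

    KernelExecutor ke;
    ke.compile(&fusion, {t0});      // previously: ke.compileFusion(&fusion, {t0});
    auto cg_outputs = ke.run({t0}); // previously: ke.runFusion({t0});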
- NVF_API void compileFusion( + NVF_API void compile( Fusion* fusion, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, @@ -56,25 +56,25 @@ class KernelExecutor : public NonCopyable { // TODO: merge it with the overload above. //! This API is merely here so we don't have to go back and update all cpp //! tests. - void compileFusion( + void compile( Fusion* fusion, const at::ArrayRef& inputs = {}, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams()) { KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(inputs); - compileFusion(fusion, args, launch_constraints, compile_params); + compile(fusion, args, launch_constraints, compile_params); } //! Used by user defined schedules in python frontend - void compileFusion( + void compile( Fusion* fusion, const at::ArrayRef& inputs, int64_t fusion_id, int64_t concrete_id) { KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(inputs); - compileFusion( + compile( fusion, args, LaunchParams(), @@ -92,15 +92,15 @@ class KernelExecutor : public NonCopyable { // TODO: args shouldn't come in a reference here because we will append the // outputs to be able to send it to the kernel. For now none of the users are // reconsuming the args, so it is okay. It isn't done now because changing it - // from a reference makes a call as runFusion({}) ambiguous, and that is used + // from a reference makes a call as run({}) ambiguous, and that is used // in some places in the codebase. - NVF_API std::vector runFusion( + NVF_API std::vector run( KernelArgumentHolder& args, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams(), std::vector outputs = {}); - std::vector runFusion( + std::vector run( const at::ArrayRef& inputs, const std::vector& outputs, const LaunchParams& launch_constraints = LaunchParams(), @@ -111,15 +111,15 @@ class KernelExecutor : public NonCopyable { if (opt_code.has_value()) { args.setCacheId(*opt_code); } - return runFusion(args, launch_constraints, compile_params, outputs); + return run(args, launch_constraints, compile_params, outputs); } - std::vector runFusion( + std::vector run( const at::ArrayRef& inputs, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams(), const std::optional& opt_code = std::nullopt) { - return runFusion(inputs, {}, launch_constraints, compile_params, opt_code); + return run(inputs, {}, launch_constraints, compile_params, opt_code); } // Register a lowering hooks that are called to modify the GpuLower object diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp index 99cb881a2c4..5e3a4a352ad 100644 --- a/csrc/runtime/fusion_kernel_runtime.cpp +++ b/csrc/runtime/fusion_kernel_runtime.cpp @@ -595,7 +595,7 @@ std::vector FusionKernelRuntime::runKernelWithInput( if (executor.groupId() < 0) { executor.setGroupId(group_id); } - auto outputs = executor.runFusion(args, launch_params, compile_params); + auto outputs = executor.run(args, launch_params, compile_params); return outputs; } @@ -625,7 +625,7 @@ void FusionKernelRuntime::compileKernel( NVF_ERROR( heuristic_params->cparams.index_type.has_value(), "Kernel index type is not defined."); - executors_.at(group_id).compileFusion( + executors_.at(group_id).compile( fusion_to_run.get(), args, heuristic_params->lparams, diff --git a/examples/sinh_extension/main.cpp b/examples/sinh_extension/main.cpp index 
f011b51c786..c6dd6b7fe01 100644 --- a/examples/sinh_extension/main.cpp +++ b/examples/sinh_extension/main.cpp @@ -35,8 +35,8 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) { SchedulerEntry::scheduleWith(&fusion, SchedulerType::PointWise, {input}); KernelExecutor ke; - ke.compileFusion(&fusion, {input}, heuristic_params->lparams); - auto outputs = ke.runFusion({input}, heuristic_params->lparams); + ke.compile(&fusion, {input}, heuristic_params->lparams); + auto outputs = ke.run({input}, heuristic_params->lparams); return outputs[0]; } diff --git a/examples/sinh_libtorch/main.cpp b/examples/sinh_libtorch/main.cpp index 4487011b249..12f58d08c33 100644 --- a/examples/sinh_libtorch/main.cpp +++ b/examples/sinh_libtorch/main.cpp @@ -32,8 +32,8 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) { &fusion, SchedulerType::PointWise, {input}); KernelExecutor ke; - ke.compileFusion(&fusion, {input}, heuristic_params->lparams); - auto outputs = ke.runFusion({input}, heuristic_params->lparams); + ke.compile(&fusion, {input}, heuristic_params->lparams); + auto outputs = ke.run({input}, heuristic_params->lparams); return outputs[0]; } diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp index efb6a464fd5..7c1b4df46af 100644 --- a/tests/cpp/test_alias.cpp +++ b/tests/cpp/test_alias.cpp @@ -1026,8 +1026,8 @@ TEST_F(AliasTest, ReuseBuffer_KernelExecutor) { auto expected_tensor = tensor + 1.0; KernelExecutor ke; - ke.compileFusion(&fusion, {tensor}); - ke.runFusion({tensor}, {tensor}); + ke.compile(&fusion, {tensor}); + ke.run({tensor}, {tensor}); EXPECT_TRUE(tensor.allclose(expected_tensor)); } @@ -1231,8 +1231,8 @@ TEST_F(AliasTest, KernelExecutor) { KernelExecutor ke; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({10, 10}, options); - ke.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = ke.runFusion({in_tensor})[0]; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } diff --git a/tests/cpp/test_alias_analysis.cpp b/tests/cpp/test_alias_analysis.cpp index d79f9d1405e..3b50a399b0b 100644 --- a/tests/cpp/test_alias_analysis.cpp +++ b/tests/cpp/test_alias_analysis.cpp @@ -185,8 +185,8 @@ TEST_F(AliasAnalysisTest, View_ForwardExpandedBroadcast) { KernelExecutor ke; at::Tensor in_tensor = at::randn({4, 5}).cuda().as_strided({4, 5, 6}, {5, 1, 0}); - ke.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = ke.runFusion({in_tensor})[0]; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_THAT(out_tensor.strides(), ElementsAre(1, 0)); } diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index c825263aed2..aca1adea75a 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -59,8 +59,8 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { at::Tensor t0 = at::randn({32, 32}, options); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion_ptr.get(), {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -97,9 +97,9 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); 
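// The output allocation domain in this test is NHWC, so run() is expected
// to produce a channels-last tensor; the assertion below checks exactly
// that memory format.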
ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -136,9 +136,9 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -176,9 +176,9 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -223,9 +223,9 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { at::Tensor t0 = at::randn({n1, n2, h * w * c}, options); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -283,9 +283,9 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { at::Tensor t0 = at::randn({n1, n2, c * h * w}, options); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -339,14 +339,14 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -398,14 +398,14 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -454,14 +454,14 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -515,14 +515,14 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), 
{t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -583,14 +583,14 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -648,14 +648,14 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -725,14 +725,14 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -790,14 +790,14 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -861,14 +861,14 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "merging of discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -933,14 +933,14 @@ TEST_F(AllocationDomainTest, 
NHWC4d_To_NHWC4d_cacheFork) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -1023,14 +1023,14 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); + ke.compile(fusion_ptr.get(), {t0}); EXPECT_THAT( - [&]() { ke.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -1226,8 +1226,8 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion_ptr.get(), {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -1276,8 +1276,8 @@ TEST_F(AllocationDomainTest, Issue1290_ReplayCasPFailedDueToDifferentRanks) { at::Tensor in_tensor = at::randn({2, 3}).cuda(); KernelExecutor ke; - ke.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = ke.runFusion({in_tensor})[0]; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_THAT(out_tensor.sizes(), ElementsAre(2)); } diff --git a/tests/cpp/test_circular_buffering.cpp b/tests/cpp/test_circular_buffering.cpp index 78b79a31e83..cb0d27eafd2 100644 --- a/tests/cpp/test_circular_buffering.cpp +++ b/tests/cpp/test_circular_buffering.cpp @@ -65,16 +65,16 @@ TEST_P(CircularBufferingTest, SingleDim1) { auto t0 = at::randn({1000}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Given computeAt axis 1, the axis_extent is I0/128. constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -113,16 +113,16 @@ TEST_P(CircularBufferingTest, SingleDim2) { auto t0 = at::randn({1000}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Given computeAt axis 1, the axis_extent is I0/128. constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -168,16 +168,16 @@ TEST_P(CircularBufferingTest, SingleDim3) { auto t0 = at::randn({1000}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Given computeAt axis 2, the axis_extent is 128/32. 
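// Circular buffering prefetches number_of_stages iterations of the
// circular-buffered axis, so that axis must have at least as many
// iterations as stages; when axis_extent is smaller, run() is expected to
// throw instead of executing.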
constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -220,17 +220,17 @@ TEST_P(CircularBufferingTest, SingleDimUnswitch1) { auto t0 = at::randn({1000}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 3 is parallelized with TIDx, the axis // extent is 4. constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -272,17 +272,17 @@ TEST_P(CircularBufferingTest, SingleDimUnswitch2) { auto t0 = at::randn({1000}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 3 is parallelized with TIDx, the axis // extent is 4. constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -326,17 +326,17 @@ TEST_P(CircularBufferingTest, SingleDimUnroll) { auto t0 = at::randn({199}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 4 is parallelized with TIDx, the axis // extent is 2. constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -373,17 +373,17 @@ TEST_P(CircularBufferingTest, SingleDimVectorize) { auto t0 = at::randn({200}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Given computeAt axis 2 and axis 1 is parallelized with TIDx, the axis // extent is I0/128. constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -425,16 +425,16 @@ TEST_P(CircularBufferingTest, MultipleTensors) { auto t1 = at::randn({500}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); + ke.compile(&fusion, {t0, t1}); // Given computeAt axis 1, the axis extent is I0/32/4. 
constexpr int64_t axis_extent = 1; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); } @@ -476,18 +476,18 @@ TEST_P(CircularBufferingTest, NestedTensors) { auto t0 = at::randn({1001}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Given computeAt axis 1 for tv2, the axis extent is I0/32/4 = 8. // Given computeAt axis 3 for tv3 and axis 3 is parallelized with TIDx, // the axis extent is 4. constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -570,15 +570,15 @@ TEST_P(CircularBufferingTest, SmemBlockGemmCache) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); // The smem cache write in this test case is redundant predicated, @@ -624,15 +624,15 @@ TEST_P(CircularBufferingTest, Vector) { auto t0 = at::randn({200}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum({0}); testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -681,11 +681,11 @@ TEST_P(CircularBufferingTest, CpAsync1) { KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { - ASSERT_ANY_THROW(ke.compileFusion(&fusion, {t0, t1})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0, t1})); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -734,11 +734,11 @@ TEST_P(CircularBufferingTest, CpAsync2) { KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { - ASSERT_ANY_THROW(ke.compileFusion(&fusion, {t0, t1})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0, t1})); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -795,8 +795,8 @@ TEST_P(CircularBufferingTest, NoSync) { NVF_ERROR(!sync_inserted, "Un-expected block sync inserted"); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -973,7 +973,7 @@ TEST_F(NVFuserTest, ElectSyncCompatibility) { // a single thread. 
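// compile() below is expected to fail here: the TMA load is issued under
// an ElectSync predicate (a single elected thread performs the copy), so a
// thread-parallelized TensorView inside that predicated branch is rejected
// at lowering, and the test matches the error against the reference string.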
KernelExecutor ke; try { - ke.compileFusion(fusion.get(), {t0}); + ke.compile(fusion.get(), {t0}); } catch (const std::exception& e) { const char* reference = R"(This thread-parallelized TensorView T2_s_float[ iblockIdx.x15{( ceilDiv(( ceilDiv(( ceilDiv(( ( ( (( (( getMetaData(T0) )).logical_size ))[0] ) * ( (( (( getMetaData(T0) )).logical_size ))[1] ) ) * ( (( (( getMetaData(T0) )).logical_size ))[2] ) ), 256) ), 4) ), 2) )}, iS16{2}, ithreadIdx.x14{4}, iB12{256} ] ca_pos( 2 ) is incorrectly contained within a If-Then-Else with the ElectSync predicate.)"; @@ -1024,9 +1024,9 @@ TEST_P(TmaCircularBufferingTest, SingleDim) { at::Tensor t1 = at::exp(t0); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0}); + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = ke.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1077,16 +1077,16 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnroll) { at::Tensor t1 = at::exp(t0); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0}); + ke.compile(fusion.get(), {t0}); int64_t axis_extent = ceilDiv(ceilDiv(tensor_inner_dim, bulk_inner_dim), unroll_dim); if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - std::vector cg_outputs = ke.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1137,16 +1137,16 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) { at::Tensor t1 = at::exp(t0); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0}); + ke.compile(fusion.get(), {t0}); int64_t axis_extent = ceilDiv(ceilDiv(tensor_inner_dim, bulk_inner_dim), unroll_dim); if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - std::vector cg_outputs = ke.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1207,9 +1207,9 @@ TEST_P(TmaCircularBufferingTest, MultiDim) { at::Tensor t1 = at::exp(t0); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0}); + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = ke.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1269,9 +1269,9 @@ TEST_P(TmaCircularBufferingTest, Pointwise) { at::Tensor t2 = t0 + t1; KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0, t1}); + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = ke.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t2); testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); } @@ -1336,9 +1336,9 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) { at::Tensor t2 = t0 + t1; KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0, t1}); + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = ke.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t2); testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); } @@ -1394,9 +1394,9 @@ 
TEST_P(TmaCircularBufferingTest, Reduction) { at::Tensor t1 = sum(t0, {-1}); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0}); + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = ke.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_outer_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1520,8 +1520,8 @@ TEST_P(TmaCircularBufferingTest, Persistent) { // Compile with KernelExecutor directly to avoid scheduling KernelExecutor ke; - ke.compileFusion(fusion.get(), {at_tv0}); - std::vector cg_outputs = ke.runFusion({at_tv0}); + ke.compile(fusion.get(), {at_tv0}); + std::vector cg_outputs = ke.run({at_tv0}); std::tuple at_var_mean = at::var_mean(at_tv0, {-1}, correction, keepdim); @@ -1641,9 +1641,9 @@ TEST_P(TmaCircularBufferingTest, Matmul) { (t0.unsqueeze(/*dim=*/-1) * t1.unsqueeze(/*dim=*/0)).sum(/*dim=*/1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0, t1}); + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = ke.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare( tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), aten_output); testValidate( @@ -1755,9 +1755,9 @@ TEST_P(TmaCircularBufferingTest, MatmulWithBroadcastedInput) { at::Tensor aten_output = (t0 * t1).sum(/*dim=*/1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0, t1}); + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = ke.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare( tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), aten_output); testValidate( diff --git a/tests/cpp/test_combined_inner_outer_reduction.cpp b/tests/cpp/test_combined_inner_outer_reduction.cpp index cdb2f39c3db..f0a90168cc4 100644 --- a/tests/cpp/test_combined_inner_outer_reduction.cpp +++ b/tests/cpp/test_combined_inner_outer_reduction.cpp @@ -636,8 +636,8 @@ TEST_F(CombinedSchedulerTest, CombinedReduction) { at::Tensor qv_cg_output = at::empty({dim1}, options); auto qv_aten_output = tv_input.to(at::kFloat).sum({0}); KernelExecutor ke; - ke.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); - ke.runFusion( + ke.compile(&fusion, {tv_input}, launch_constraints, compile_params); + ke.run( {tv_input}, {tv_cg_output, qv_cg_output}, launch_constraints, @@ -813,8 +813,8 @@ TEST_F(CombinedSchedulerTest, CombinedReductionMultiPerBlock) { at::Tensor tv_input2 = at::ones({dim0, dim1}, options); auto qv_aten_output = tv_input2.to(at::kFloat).sum({0}); KernelExecutor ke; - ke.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); - ke.runFusion( + ke.compile(&fusion, {tv_input}, launch_constraints, compile_params); + ke.run( {tv_input}, {tv_cg_output, qv_cg_output}, launch_constraints, @@ -983,7 +983,7 @@ TEST_F(CombinedSchedulerTest, SharedMemoryPersistentVectFactor) { std::vector{tv1}; scheduler->schedule(&fusion, heuristic_params.get()); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); for (auto tv : fusion.allTvs()) { if (tv->getMemoryType() == MemoryType::Shared) { @@ -992,8 +992,8 @@ TEST_F(CombinedSchedulerTest, SharedMemoryPersistentVectFactor) { } } } - auto cg_outputs = ke.runFusion( - aten_inputs, heuristic_params->as()->lparams); + auto cg_outputs = + ke.run(aten_inputs, heuristic_params->as()->lparams); testValidate(&fusion_copy, cg_outputs, aten_inputs, __LINE__, __FILE__); } diff --git a/tests/cpp/test_dynamic_transform.cpp b/tests/cpp/test_dynamic_transform.cpp index 
c7178597652..8eb468999b7 100644 --- a/tests/cpp/test_dynamic_transform.cpp +++ b/tests/cpp/test_dynamic_transform.cpp @@ -1210,9 +1210,9 @@ TEST_F(NVFuserTest, OptOutMutatorMutatedOutput) { inlineMost(); KernelExecutor ke; - ke.compileFusion(fusion); + ke.compile(fusion); - auto outputs = ke.runFusion({t0}); + auto outputs = ke.run({t0}); testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -1247,9 +1247,9 @@ TEST_F(NVFuserTest, OptOutMutatorRedefinedConstant) { inlineMost(); KernelExecutor ke; - ke.compileFusion(fusion); + ke.compile(fusion); - auto outputs = ke.runFusion({3L}); + auto outputs = ke.run({3L}); testValidate(fusion, outputs, {3L}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu1.cpp b/tests/cpp/test_gpu1.cpp index 2d626a271b4..4ed46cddc46 100644 --- a/tests/cpp/test_gpu1.cpp +++ b/tests/cpp/test_gpu1.cpp @@ -208,8 +208,8 @@ TEST_F(NVFuserTest, FusionClear_CUDA) { at::Tensor input2 = at::randn_like(input1); KernelExecutor ke; - ke.compileFusion(&fusion, {input1, input2}); - auto outputs = ke.runFusion({input1, input2}); + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -814,8 +814,8 @@ TEST_F(NVFuserTest, FusionOuterSplit_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); KernelExecutor ke; - ke.compileFusion(&fusion); - auto outputs = ke.runFusion({}); + ke.compile(&fusion); + auto outputs = ke.run({}); const auto& output = outputs.at(0); at::Tensor output_ref = at::ones_like(output, options); @@ -856,8 +856,8 @@ TEST_F(NVFuserTest, FusionCodeGen_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); KernelExecutor ke; - ke.compileFusion(&fusion); - auto outputs = ke.runFusion({}); + ke.compile(&fusion); + auto outputs = ke.run({}); const auto& output = outputs.at(0); at::Tensor output_ref = at::ones_like(output, options); @@ -900,8 +900,8 @@ TEST_F(NVFuserTest, FusionCodeGen2_CUDA) { at::Tensor input2 = at::randn_like(input1); KernelExecutor ke; - ke.compileFusion(&fusion, {input1, input2}); - auto outputs = ke.runFusion({input1, input2}); + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -956,8 +956,8 @@ TEST_F(NVFuserTest, FusionSimplePWise_CUDA) { at::Tensor output = at::empty_like(input1); KernelExecutor ke; - ke.compileFusion(&fusion, {input1, input2}); - ke.runFusion({input1, input2}, {output}); + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -1014,8 +1014,8 @@ TEST_F(NVFuserTest, FusionSimplePWiseDtypeComplex_CUDA) { at::Tensor output = at::empty_like(input1); KernelExecutor ke; - ke.compileFusion(&fusion, {input1, input2}); - ke.runFusion({input1, input2}, {output}); + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + static_cast>(scalar1); at::Tensor output_ref = input1 + tv2_ref; @@ -1064,8 +1064,8 @@ TEST_F(NVFuserTest, FusionExecKernel_CUDA) { at::Tensor input2 = at::ones_like(input1); KernelExecutor ke; - ke.compileFusion(&fusion, {input1, input2}); - auto outputs = ke.runFusion({input1, input2}); + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor check = at::full({1, 128}, 4, options); ; @@ -1146,8 +1146,8 @@ TEST_F(NVFuserTest, 
FusionAdvancedComputeAt1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1200,8 +1200,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { at::Tensor input = at::randn({129, 127}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -1254,8 +1254,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { at::Tensor cg_output = at::empty_like(t0, options); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - ke.runFusion(aten_inputs, {cg_output}); + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -1318,8 +1318,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1354,8 +1354,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1389,8 +1389,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1450,8 +1450,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { std::vector aten_inputs = {t0, t2, t6}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1506,8 +1506,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { std::vector aten_inputs = {t0, t2, t6}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1575,8 +1575,8 @@ TEST_F(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1645,8 +1645,8 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { at::empty_like(aten_input, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, 
cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1720,8 +1720,8 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, {cg_output}); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -1801,8 +1801,8 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1865,8 +1865,8 @@ TEST_F(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { at::empty_like(aten_input, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1973,8 +1973,8 @@ TEST_F(NVFuserTest, FusionScalarInputs_CUDA) { at::Scalar(fl3)}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - ke.runFusion(aten_inputs, {cg_output}); + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -2025,8 +2025,8 @@ TEST_F(NVFuserTest, FusionLoopUnroll_CUDA) { at::Tensor input1 = at::randn({129, 13, 3}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input0, input1}); - auto outputs = ke.runFusion({input0, input1}); + ke.compile(&fusion, {input0, input1}); + auto outputs = ke.run({input0, input1}); NVF_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); } @@ -2174,8 +2174,8 @@ void test_op( cudaDeviceSynchronize(); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs_ivalues); - ke.runFusion(aten_inputs_ivalues, output_vect); + ke.compile(&fusion, aten_inputs_ivalues); + ke.run(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); at::Tensor aten_output = af(aten_inputs); @@ -2714,13 +2714,13 @@ TEST_F(NVFuserTest, FusionFp8CastOps_CUDA) { if (!deviceMajorMinorCheck(9)) { ASSERT_THAT( - [&]() { ke.compileFusion(&fusion, inputs); }, + [&]() { ke.compile(&fusion, inputs); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: Fusion contains Float8_xxx values which was introduced in Hopper (9.0)"))); GTEST_SKIP() << "skipping tests on pre-HOPPER GPUs"; } else { - ke.compileFusion(&fusion, inputs); - auto outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); at::Tensor ref_output = input1.to(at_fp8_type).to(at_src_type); @@ -2791,8 +2791,8 @@ TEST_F(NVFuserTest, FusionReduction1_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -2863,8 +2863,8 @@ TEST_F(NVFuserTest, FusionReduction2_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, 
{input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -2914,8 +2914,8 @@ TEST_F(NVFuserTest, FusionReduction3_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, {cg_output}); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); auto aten_output = aten_input.to(at::kDouble).sum({1}); @@ -2980,8 +2980,8 @@ TEST_F(NVFuserTest, FusionReduction4_CUDA) { at::Tensor t4 = at::randn({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1, t4}); - auto cg_outputs = ke.runFusion({t0, t1, t4}); + ke.compile(&fusion, {t0, t1, t4}); + auto cg_outputs = ke.run({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); @@ -3034,8 +3034,8 @@ TEST_F(NVFuserTest, FusionReduction5_CUDA) { at::Tensor cg_output = at::empty({bidy, tidx}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -3099,8 +3099,8 @@ TEST_F(NVFuserTest, FusionReduction6_CUDA) { at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1, 2}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -3131,8 +3131,8 @@ TEST_F(NVFuserTest, FusionMultiGridReduction_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -3155,8 +3155,8 @@ TEST_F(NVFuserTest, FusionMultiGridReduction2_CUDA) { at::Tensor input = at::randn({4, 8}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_output = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_output = ke.run({input}); testValidate(&fusion, cg_output, {input}, __LINE__, __FILE__); } @@ -3208,8 +3208,8 @@ TEST_F(NVFuserTest, FusionReductionTFT_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -3272,8 +3272,8 @@ TEST_F(NVFuserTest, FusionReductionOuterSplit_CUDA) { at::Tensor t4 = at::randn({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1, t4}); - auto cg_outputs = ke.runFusion({t0, t1, t4}); + ke.compile(&fusion, {t0, t1, t4}); + auto cg_outputs = ke.run({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); @@ -3331,8 +3331,8 @@ TEST_F(NVFuserTest, FusionBranches_CUDA) { std::vector aten_inputs = {t0, t1, t2}; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3378,8 +3378,8 @@ TEST_F(NVFuserTest, FusionSimpleBCast1_CUDA) { std::vector 
aten_inputs = {t0, t2, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3430,8 +3430,8 @@ TEST_F(NVFuserTest, FusionSimpleBCast2_CUDA) { std::vector aten_inputs = {t0, t1, t4}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - ke.runFusion(aten_inputs, {cg_output}); + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3472,8 +3472,8 @@ TEST_F(NVFuserTest, FusionSimpleBCast3_CUDA) { at::Tensor cg_output = at::empty({x, y, z}, options); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - ke.runFusion(aten_inputs, {cg_output}); + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3517,8 +3517,8 @@ TEST_F(NVFuserTest, FusionSimpleBCast4_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - ke.runFusion(aten_inputs, {cg_output}); + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3557,8 +3557,8 @@ TEST_F(NVFuserTest, FusionSimpleBCast5_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - ke.runFusion(aten_inputs, {cg_output}); + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3609,8 +3609,8 @@ TEST_F(NVFuserTest, FusionComplexBCast1_CUDA) { std::vector aten_inputs = {t0, t3, t6}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3653,8 +3653,8 @@ TEST_F(NVFuserTest, FusionComplexBCast2_CUDA) { at::Tensor t4 = at::randn({x, y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t4}); - auto cg_outputs = ke.runFusion({t0, t4}); + ke.compile(&fusion, {t0, t4}); + auto cg_outputs = ke.run({t0, t4}); testValidate(&fusion, {cg_outputs}, {t0, t4}, __LINE__, __FILE__); } @@ -3727,9 +3727,9 @@ TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) { at::Tensor t1 = at::randn({K, N}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + ke.compile(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Lets specify a few bounds in launch params to make sure it works - ke.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + ke.run({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in. 
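For reference, a small sketch of how explicit launch bounds flow through the renamed entry points (illustrative only; the six LaunchParams arguments mirror the call above, with -1 entries left for the executor to infer):

    LaunchParams lparams(1, -1, -1, 32, 4, 4);
    KernelExecutor ke;
    ke.compile(&fusion, {t0, t1}, lparams);       // was: ke.compileFusion(...)
    auto cg_outputs = ke.run({t0, t1}, lparams);  // was: ke.runFusion(...)
    // As the next hunk shows, launch params may also be omitted at run time.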
@@ -3737,7 +3737,7 @@ TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) { // ASSERT_ANY_THROW(ke.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params - auto cg_outputs = ke.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble)); @@ -3792,8 +3792,8 @@ TEST_F(NVFuserTest, FusionSoftmax1D_CUDA) { at::Tensor t3_output = at::empty_like(cg_output, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - ke.runFusion({t0}, {cg_output}); + ke.compile(&fusion, {t0}); + ke.run({t0}, {cg_output}); auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); @@ -3861,8 +3861,8 @@ TEST_F(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { at::Tensor t3_output = at::empty({dimx}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -3921,8 +3921,8 @@ TEST_F(NVFuserTest, FusionSoftmax3D_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -3996,8 +3996,8 @@ TEST_F(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -4082,8 +4082,8 @@ TEST_F(NVFuserTest, FusionGridReduction1_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4142,8 +4142,8 @@ TEST_F(NVFuserTest, FusionGridReduction2_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4204,8 +4204,8 @@ TEST_F(NVFuserTest, FusionGridReduction3dim1_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -4263,8 +4263,8 @@ TEST_F(NVFuserTest, FusionGridReduction3dim0_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -4329,8 +4329,8 @@ TEST_F(NVFuserTest, FusionGridReduction4_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -4386,8 +4386,8 @@ TEST_F(NVFuserTest, 
FusionGridReduction5_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -4451,8 +4451,8 @@ TEST_F(NVFuserTest, FusionGridReduction6_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1, 2}); @@ -4483,8 +4483,8 @@ TEST_F(NVFuserTest, FusionGridReduction7_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto out = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -4509,8 +4509,8 @@ TEST_F(NVFuserTest, FusionGridReduction8_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto out = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -4546,8 +4546,8 @@ TEST_F(NVFuserTest, FusionGridReduction9_CUDA) { std::vector aten_inputs = {t0, t2}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_output = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_output = ke.run(aten_inputs); testValidate(&fusion, cg_output, {t0, t2}, __LINE__, __FILE__); } @@ -4587,8 +4587,8 @@ TEST_F(NVFuserTest, FusionGridReduction10_CUDA) { at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_output = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_output = ke.run({t0}); testValidate(&fusion, cg_output, {t0}, __LINE__, __FILE__); } @@ -4617,8 +4617,8 @@ TEST_F(NVFuserTest, FusionNonRedAxisBind_CUDA) { at::Tensor input = at::randn({16, bid_x * tid_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -4667,8 +4667,8 @@ TEST_F(NVFuserTest, FusionSplitBCast_CUDA) { at::Tensor cg_output = at::empty({32, 32, 128}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - ke.runFusion({t0, t1}, {cg_output}); + ke.compile(&fusion, {t0, t1}); + ke.run({t0, t1}, {cg_output}); } TEST_F(NVFuserTest, FusionBCastInnerDim_CUDA) { @@ -4748,8 +4748,8 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { at::Tensor aten_input = at::randn({100}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4779,8 +4779,8 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, {cg_output}); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, 
{cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -4809,8 +4809,8 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { at::Tensor aten_input = at::randn({dimx, dimy}, options); nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4832,8 +4832,8 @@ TEST_F(NVFuserTest, FusionZeroDimComputeAt_CUDA) { at::Tensor aten_input = at::randn({100}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4867,8 +4867,8 @@ TEST_F(NVFuserTest, FusionZeroDimBroadcast_CUDA) { at::Tensor cg_output = at::empty({}, options); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - ke.runFusion(aten_inputs, {cg_output}); + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -4902,8 +4902,8 @@ TEST_F(NVFuserTest, FusionZeroDimReduction_CUDA) { at::Tensor cg_output = at::empty({}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, {cg_output}); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); @@ -4954,8 +4954,8 @@ TEST_F(NVFuserTest, FusionBCastAfterReduce_CUDA) { std::vector aten_inputs = {t0, t4}; KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t4}); - auto cg_outputs = ke.runFusion({t0, t4}); + ke.compile(&fusion, {t0, t4}); + auto cg_outputs = ke.run({t0, t4}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -4978,8 +4978,8 @@ TEST_F(NVFuserTest, FusionOutputBroadcast_CUDA) { at::Tensor aten_input = at::randn({2, 3}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5001,8 +5001,8 @@ TEST_F(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5077,8 +5077,8 @@ TEST_F(NVFuserTest, FusionSumTo_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); NVF_CHECK( cg_outputs[0].dim() == static_cast(sum_to_shape.size()), @@ -5119,8 +5119,8 @@ TEST_F(NVFuserTest, FusionSumToNoop_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); NVF_CHECK( cg_outputs[0].dim() == static_cast(sum_to_shape.size()), @@ -5266,8 +5266,8 @@ TEST_F(NVFuserTest, 
FusionSymbolicReduction_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}, lparams); - auto cg_outputs = ke.runFusion({aten_input}, lparams); + ke.compile(&fusion, {aten_input}, lparams); + auto cg_outputs = ke.run({aten_input}, lparams); testValidate( &fusion, @@ -5308,8 +5308,8 @@ TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::Reduction, {aten_input}); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}, heuristic_params->lparams); - ke.runFusion({aten_input}, {cg_output}, heuristic_params->lparams); + ke.compile(&fusion, {aten_input}, heuristic_params->lparams); + ke.run({aten_input}, {cg_output}, heuristic_params->lparams); testValidate( &fusion, @@ -5537,8 +5537,8 @@ TEST_F(NVFuserTest, FusionCacheBefore_CUDA) { at::Tensor aten_input = at::randn({M, N}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5573,8 +5573,8 @@ TEST_F(NVFuserTest, FusionCacheAfter_CUDA) { at::Tensor aten_input = at::randn({M, N}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5615,8 +5615,8 @@ TEST_F(NVFuserTest, FusionCacheFork_CUDA) { at::Tensor aten_input = at::randn({M, N}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5662,8 +5662,8 @@ TEST_F(NVFuserTest, FusionCacheIndirect_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5718,8 +5718,8 @@ TEST_F(NVFuserTest, FusionCacheBcast_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5755,8 +5755,8 @@ TEST_F(NVFuserTest, FusionCacheMultiConsumer_CUDA) { at::Tensor aten_input = at::randn({N}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5807,8 +5807,8 @@ TEST_F(NVFuserTest, FusionSmem_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); @@ -5855,8 +5855,8 @@ TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { at::Tensor aten_output = 
sum(aten_input.to(at::kDouble), {1}); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); @@ -5925,8 +5925,8 @@ TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { at::Tensor aten_output = at::matmul(t0.to(at::kDouble), t1.to(at::kDouble)); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -6014,8 +6014,8 @@ TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -6086,8 +6086,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input, 128}); - auto cg_outputs = ke.runFusion({aten_input, 128}); + ke.compile(&fusion, {aten_input, 128}); + auto cg_outputs = ke.run({aten_input, 128}); testValidate( &fusion, @@ -6841,9 +6841,8 @@ TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalShared_CUDA) { aten_output.narrow(1, static_size, dimy - static_size); nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, {aten_static_in, aten_dynamic_in}); - ke.runFusion( - {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); + ke.compile(&fusion, {aten_static_in, aten_dynamic_in}); + ke.run({aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); testValidate( &fusion, @@ -7030,9 +7029,9 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy}; nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); - ke.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); + ke.run(aten_inputs, {cg_static_out, cg_dynamic_out}); auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1); @@ -7154,8 +7153,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { aten_input, kGamma, kBeta, kEps, dimy, TIDX}; nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -7200,8 +7199,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}, lparams); - auto cg_outputs = ke.runFusion({aten_input}, lparams); + ke.compile(&fusion, {aten_input}, lparams); + auto cg_outputs = ke.run({aten_input}, lparams); testValidate( &fusion, @@ -7263,8 +7262,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); - auto 
cg_outputs = ke.runFusion({aten_input, runtime_threadIdx_dim}, lparams); + ke.compile(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); + auto cg_outputs = ke.run({aten_input, runtime_threadIdx_dim}, lparams); testValidate( &fusion, @@ -7327,8 +7326,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { LaunchParams lparams(-1, -1, -1, BSX, -1, -1); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = ke.runFusion(aten_inputs, lparams); + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, __LINE__, __FILE__, "", lparams); @@ -7453,8 +7452,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { KernelExecutor ke; // Generate CUDA and compile with nvRTC - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); diff --git a/tests/cpp/test_gpu2.cpp b/tests/cpp/test_gpu2.cpp index 2b8d696f934..2426d801176 100644 --- a/tests/cpp/test_gpu2.cpp +++ b/tests/cpp/test_gpu2.cpp @@ -95,8 +95,8 @@ TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); KernelExecutor ke; - ke.compileFusion(&fusion, {input}, lparams); - auto cg_outputs = ke.runFusion({input}, lparams); + ke.compile(&fusion, {input}, lparams); + auto cg_outputs = ke.run({input}, lparams); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -142,8 +142,8 @@ TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1, t2, t3}); - auto cg_outputs = ke.runFusion({t0, t1, t2, t3}); + ke.compile(&fusion, {t0, t1, t2, t3}); + auto cg_outputs = ke.run({t0, t1, t2, t3}); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -200,8 +200,8 @@ TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) { tv1->computeAt(tv2_rf, -1); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = (input + 0).to(at::kDouble).sum(1); @@ -277,8 +277,8 @@ TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { auto t4 = t3 + 4; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -311,8 +311,8 @@ TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) { at::empty_like(aten_input, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -348,8 +348,8 @@ TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) { at::empty_like(aten_input, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -400,8 +400,8 @@ TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) { at::empty_like(aten_input, options)}; 
KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -444,8 +444,8 @@ TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) { at::empty_like(t0, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - ke.runFusion(aten_inputs, cg_outputs); + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, cg_outputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -477,8 +477,8 @@ TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) { at::empty_like(aten_input, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); auto t1 = aten_input + 1; auto t2 = t1 + 2; @@ -519,8 +519,8 @@ TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, {cg_output}); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -559,8 +559,8 @@ TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, {cg_output}); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -620,8 +620,8 @@ TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) { at::empty_like(aten_input, options), at::empty({numel_x}, options)}; KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, cg_outputs); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); @@ -740,9 +740,9 @@ TEST_F(NVFuserTest, FusionReduceSingle_CUDA) { // Grab only tensor views, though there shouldn't be any other type KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); + ke.compile(&fusion, {aten_input}); // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = ke.runFusion({aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -872,8 +872,8 @@ TEST_F(NVFuserTest, FusionTrivialReduction_CUDA) { at::Tensor aten_input = at::randn({10, 20, 1}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1282,8 +1282,8 @@ TEST_F(NVFuserTest, FusionIssue459_CUDA) { std::vector aten_inputs = {t0, t1}; nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1312,8 +1312,8 @@ TEST_F(NVFuserTest, FusionSmemIndexingSimple_CUDA) { auto aten_input = at::randn({12, 34}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, 
{aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1423,8 +1423,8 @@ TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) { std::vector aten_inputs = {t0, t1, 3, 4, 5}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -1458,8 +1458,8 @@ TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - ke.runFusion({aten_input}, {cg_output}); + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -1495,8 +1495,8 @@ TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1601,8 +1601,8 @@ TEST_F(NVFuserTest, FusionIssue367_CUDA) { mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -1627,8 +1627,8 @@ TEST_F(NVFuserTest, FusionIssue468_CUDA) { at::Tensor aten_input = at::randn({10, 100}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1679,8 +1679,8 @@ TEST_F(NVFuserTest, FusionIssue363_CUDA) { std::vector aten_inputs = {t0, t1}; nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1705,8 +1705,8 @@ TEST_F(NVFuserTest, FusionIssue484_CUDA) { at::Tensor aten_input = at::randn({M, M}, options); nvfuser::KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1731,8 +1731,8 @@ TEST_F(NVFuserTest, FusionIssue329_CUDA) { auto aten_input = at::randn(t0_shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1772,8 +1772,8 @@ TEST_F(NVFuserTest, FusionIssue382_CUDA) { std::vector aten_inputs = {t0, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, 
__LINE__, __FILE__); } @@ -1801,8 +1801,8 @@ TEST_F(NVFuserTest, FusionIssue507_CUDA) { auto aten_input = at::randn(t0_shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1839,8 +1839,8 @@ TEST_F(NVFuserTest, FusionIssue532_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1868,8 +1868,8 @@ TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1946,8 +1946,8 @@ TEST_F(NVFuserTest, FusionIssue549_CUDA) { LaunchParams lparams(1, -1, -1, 32, 4, 4); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}, lparams); - ke.runFusion({t0, t1}, lparams); + ke.compile(&fusion, {t0, t1}, lparams); + ke.run({t0, t1}, lparams); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in. @@ -1955,7 +1955,7 @@ TEST_F(NVFuserTest, FusionIssue549_CUDA) { // ASSERT_ANY_THROW(ke.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params - auto cg_outputs = ke.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble)); @@ -2326,8 +2326,8 @@ TEST_F(NVFuserTest, FusionWelfordOp_CUDA) { at::Tensor t0 = at::randn({M, N}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; @@ -2371,8 +2371,8 @@ TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) { at::Tensor t_N = at::empty({M}, options_int); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; @@ -2416,8 +2416,8 @@ TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) { at::Tensor t_N = at::empty({M}, options_int); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; @@ -2460,8 +2460,8 @@ TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) { at::Tensor t_N = at::empty({M}, options_int); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; @@ -2593,12 +2593,12 @@ TEST_P(WelfordReduction, Test) { // lowering pass will use int64 as the index tpye, since this test saves // `tv_N` as index type, it may cause vectorization size validation error. 
For // example, the heuristics set index type to int32 and the max vectorization - // factor is 4, if compile para is not passed to compileFusion, the lowering + // factor is 4, if compile para is not passed to compile, the lowering // pass uses int64 as index type, so the max vectorization factor is 16 bytes // sizeof(int64) = 2, which is wrong since the actual index type is int32 // and the max vectorization factor is 4. - ke.compileFusion(&fusion, {aten_input}, lparams, cparams); - auto outputs = ke.runFusion({aten_input}, lparams); + ke.compile(&fusion, {aten_input}, lparams, cparams); + auto outputs = ke.run({aten_input}, lparams); // by default Welford outputs sum of square diff so need to divide to // get var @@ -2756,11 +2756,11 @@ TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { // Lets specify a few bounds in launch params to make sure it works LaunchParams lparams(1, -1, -1, 32, 4, 4); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}, lparams); - ke.runFusion({t0, t1}, lparams); + ke.compile(&fusion, {t0, t1}, lparams); + ke.run({t0, t1}, lparams); // Don't specify any launch params - auto cg_outputs = ke.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble)); @@ -2821,8 +2821,8 @@ TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_input_t = at::transpose(input, 1, 2); auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false); @@ -2895,8 +2895,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { at::Tensor aten_input = at::randn({129, 127}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); at::Tensor aten_input_t = aten_input.t(); @@ -2964,8 +2964,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { at::Tensor input = at::randn({129, 127}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto input_t = input.t(); auto t1 = input_t.mul({-1.0}); @@ -3030,8 +3030,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); auto t1_t = t1.permute({3, 0, 1, 2}); @@ -3108,8 +3108,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); auto t1_t = t1.permute({3, 0, 1, 2}); @@ -3156,8 +3156,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t2 = t0.t().add(2.0); auto 
aten_output = t1.t().mul(t2); @@ -3198,8 +3198,8 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t2 = t0.t().add(2.0); auto aten_output = t1.t().mul(t2); @@ -3349,8 +3349,8 @@ TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) { at::Tensor aten_input = at::empty({2, 6, 32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {aten_input}); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); at::Tensor aten_output = aten_input.sin(); @@ -3424,8 +3424,8 @@ TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { at::Tensor output = at::empty_like(input1); KernelExecutor ke; - ke.compileFusion(&fusion, {input1, input2}); - ke.runFusion({input1, input2}, {output}); + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -3504,8 +3504,8 @@ TEST_F(NVFuserTest, FusionGridPersistence_CUDA) { at::Tensor input = at::randn({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto out = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -3537,8 +3537,8 @@ TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto out = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -3571,8 +3571,8 @@ TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) { at::Tensor input = at::randn({numel_x}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto out = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto out = ke.run({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) .unsqueeze(-1) @@ -3611,8 +3611,8 @@ TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - auto out = ke.runFusion({input}); + ke.compile(&fusion, {input}); + auto out = ke.run({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) .unsqueeze(0) @@ -3649,8 +3649,8 @@ TEST_F(NVFuserTest, FusionIssue633_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3682,8 +3682,8 @@ TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3731,8 +3731,8 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, 
aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3787,8 +3787,8 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3848,8 +3848,8 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3912,7 +3912,7 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { // TODO: throw assertion - cannot merge non-contiguous vectorization axes // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { @@ -3965,8 +3965,8 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( @@ -4009,7 +4009,7 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { KernelExecutor ke; // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { @@ -4057,8 +4057,8 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4111,11 +4111,11 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); // Failure because the input + output tensors do not have the same stride // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); } TEST_F(NVFuserTest, FusionVectorization1_CUDA) { @@ -4158,8 +4158,8 @@ TEST_F(NVFuserTest, FusionVectorization1_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4200,7 +4200,7 @@ TEST_F(NVFuserTest, FusionVectorization2_CUDA) { KernelExecutor ke; // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } // 
TODO: Re-enable once vectorization validation is fixed @@ -4245,19 +4245,19 @@ TEST_F(NVFuserTest, FusionVectorization3_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); aten_inputs[0] = t0.index({"...", at::indexing::Slice(1)}); aten_inputs[1] = t1.index({"...", at::indexing::Slice(1)}); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); t0 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)}); t1 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)}); aten_inputs = {t0, t1}; - auto cg_outputs = ke.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4310,8 +4310,8 @@ TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( @@ -4373,8 +4373,8 @@ TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) { std::vector aten_inputs = {t0, t1, t2}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4407,8 +4407,8 @@ TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4430,7 +4430,7 @@ TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) { // Invalid as tv1 and tv2 do not have the same ParallelType KernelExecutor ke; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { @@ -4451,7 +4451,7 @@ TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { // tv1 and tv2 do not have the same ParallelType, but tv1 is on shared // memory, so it is valid KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { @@ -4474,7 +4474,7 @@ TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { // tv1 and tv2 have the same shape and ParallelType KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { @@ -4497,7 +4497,7 @@ TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { // tv1 and tv2 do not have the same shape but global memory comm is supported.
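The FusionValidateParallelize hunks above show that parallelization validation now fires inside compile(). A minimal self-contained sketch of that pattern, assuming the makeSymbolicTensor and IrBuilder helpers used elsewhere in these test files; this is an illustration, not one of the tests in this patch:

```cpp
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeSymbolicTensor(1);
fusion.addInput(tv0);
auto tv1 = add(tv0, IrBuilder::create<Val>(1.0));
auto tv2 = add(tv1, IrBuilder::create<Val>(1.0));
fusion.addOutput(tv2);

// Producer and consumer disagree on thread parallelization, and tv1 is in
// local memory, so there is no valid communication path between the threads.
tv1->axis(-1)->parallelize(ParallelType::TIDx);
tv2->axis(-1)->parallelize(ParallelType::TIDy);

KernelExecutor ke;
// With the renamed API, the mismatch is expected to throw at compile time.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
ASSERT_ANY_THROW(ke.compile(&fusion));
```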
KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { @@ -4521,7 +4521,7 @@ TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { // tv1 and tv2 do not have the same shape, but tv1 is on shared // memory, so it is valid KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); } // See issue #995 @@ -4649,8 +4649,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize8_CUDA) { at::Tensor input1 = at::arange(32, options) * 0.01; KernelExecutor ke; - ke.compileFusion(&fusion, {input0, input1}); - auto outputs = ke.runFusion({input0, input1}); + ke.compile(&fusion, {input0, input1}); + auto outputs = ke.run({input0, input1}); testValidate(&fusion, outputs, {input0, input1}, __LINE__, __FILE__); } @@ -4738,8 +4738,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize10_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4784,8 +4784,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize11_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4898,8 +4898,8 @@ TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4927,8 +4927,8 @@ TEST_F(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); at::Tensor aten_avg = t0.mean({1, 2}); at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K; testValidate( @@ -4966,8 +4966,8 @@ TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) { at::Tensor cg_output = at::empty({numel_y}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input}); - ke.runFusion({input}, {cg_output}); + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -5063,8 +5063,8 @@ TEST_F(NVFuserTest, FusionIssue757_CUDA) { std::vector inputs = {t0, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -5101,8 +5101,8 @@ TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { std::vector inputs = {t0, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -6005,8 +6005,8 @@ TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) { auto at_output = input1.sum({1}, true).add(input1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); 
testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); @@ -6054,8 +6054,8 @@ TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) { auto at_output = input1.sum({1}, true).add(input1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6099,8 +6099,8 @@ TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6141,8 +6141,8 @@ TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6186,8 +6186,8 @@ TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2, 3}, true).add(input1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6241,8 +6241,8 @@ TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) { auto at_output = input1.sum({1}, true).add(input1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1, input2}); - auto outputs = ke.runFusion({input1, input2}); + ke.compile(fusion.get(), {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate( fusion.get(), outputs, @@ -6279,8 +6279,8 @@ TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) { at::Tensor input1 = at::randn({16, 31}, options); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6314,8 +6314,8 @@ TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { auto at_output = (input1 + 1).sum({1}); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6365,8 +6365,8 @@ TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { auto at_output = input1.sum({1}, true).add(input1); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6411,8 +6411,8 @@ TEST_F(NVFuserTest, FusionWarpReducePredication_CUDA) { auto t2 = at::randn(shape2, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t2}); - auto cg_outputs = ke.runFusion({t0, t2}); + ke.compile(&fusion, {t0, t2}); + auto cg_outputs = ke.run({t0, t2}); auto t1 = t0.sum({0}); auto t4 = (t2 + 1).sum({0}) + 1; @@ -6492,8 +6492,8 @@ TEST_F(NVFuserTest, 
FusionBufferReuseBroadCastMultiVisit_CUDA) { auto in1 = at::randn({2, 2, 2}, options); KernelExecutor ke; - ke.compileFusion(fusion, {in0, in1}); - auto outputs = ke.runFusion({in0, in1}); + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6536,9 +6536,9 @@ TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) { auto in1 = at::randn({2, 2, 2}, options); KernelExecutor ke; - ke.compileFusion(fusion, {in0, in1}); + ke.compile(fusion, {in0, in1}); - auto outputs = ke.runFusion({in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6568,8 +6568,8 @@ TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { auto in0 = at::randn({256, 512}, options); KernelExecutor ke; - ke.compileFusion(fusion, {in0}); - auto outputs = ke.runFusion({in0}); + ke.compile(fusion, {in0}); + auto outputs = ke.run({in0}); testValidate(fusion, outputs, {in0}, __LINE__, __FILE__); } @@ -6600,8 +6600,8 @@ TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); KernelExecutor ke; - ke.compileFusion(fusion, {in0, in1}); - auto outputs = ke.runFusion({in0, in1}); + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6634,8 +6634,8 @@ TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { auto in0 = at::randn({3, 3, 3}, options); KernelExecutor ke; - ke.compileFusion(fusion, {in0}); - auto outputs = ke.runFusion({in0}); + ke.compile(fusion, {in0}); + auto outputs = ke.run({in0}); testValidate(fusion, outputs, {in0}, __LINE__, __FILE__); } @@ -6663,8 +6663,8 @@ TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { auto in0 = at::randn({16, 16}, options); KernelExecutor ke; - ke.compileFusion(fusion, {in0}); - auto cg_outputs = ke.runFusion({in0}); + ke.compile(fusion, {in0}); + auto cg_outputs = ke.run({in0}); testValidate(fusion, cg_outputs, {in0}, __LINE__, __FILE__); } @@ -6697,8 +6697,8 @@ TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); KernelExecutor ke; - ke.compileFusion(fusion, {in0, in1}); - auto outputs = ke.runFusion({in0, in1}); + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6723,8 +6723,8 @@ TEST_F(NVFuserTest, FusionIssue970_CUDA) { at::Tensor t0 = at::randn({nelm, nelm}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -6754,8 +6754,8 @@ TEST_F(NVFuserTest, FusionIssue1016_CUDA) { std::vector inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -6785,8 +6785,8 @@ TEST_F(NVFuserTest, FusionIssue1021_CUDA) { std::vector inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -6820,8 +6820,8 @@ TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { auto at_tv2 = input1 + 1; 
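Every hunk in this patch applies the same mechanical rename on KernelExecutor. For reviewers skimming the diff, a minimal before/after sketch of the call pattern, with names taken directly from the hunks and the surrounding fusion setup elided:

```cpp
// Before this patch (old interface):
KernelExecutor ke_old;
ke_old.compileFusion(&fusion, {input});    // compile, binding example inputs
auto outputs_old = ke_old.runFusion({input});  // launch the compiled kernel

// After this patch (renamed interface; the argument lists and the
// LaunchParams/CompileParams overloads seen in these hunks are unchanged):
KernelExecutor ke_new;
ke_new.compile(&fusion, {input});
auto outputs_new = ke_new.run({input});
```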
KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__); } @@ -6857,8 +6857,8 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) { at::Tensor input1 = at::randn({32}, options); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6894,8 +6894,8 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) { at::Tensor input2 = at::randn({11, 13}, options); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1, input2}); - auto outputs = ke.runFusion({input1, input2}); + ke.compile(fusion.get(), {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(fusion.get(), outputs, {input1, input2}, __LINE__, __FILE__); } @@ -6942,8 +6942,8 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) { at::Tensor input1 = at::randn({13}, options); KernelExecutor ke; - ke.compileFusion(fusion.get(), {input1}); - auto outputs = ke.runFusion({input1}); + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6988,8 +6988,8 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) { at::Tensor input2 = at::randn({15, 13}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input1, input2}); - auto outputs = ke.runFusion({input1, input2}); + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(&fusion, outputs, {input1, input2}, __LINE__, __FILE__); } @@ -7032,8 +7032,8 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) { at::Tensor input2 = at::randn({13, 15}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input1, input2}); - auto outputs = ke.runFusion({input1, input2}); + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(&fusion, outputs, {input1, input2}, __LINE__, __FILE__); } @@ -7186,8 +7186,8 @@ TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7239,8 +7239,8 @@ TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7290,8 +7290,8 @@ TEST_F(NVFuserTest, FusionIssue1099_CUDA) { std::vector aten_inputs = {t0, t3}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7332,8 +7332,8 @@ TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = 
ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7371,8 +7371,8 @@ TEST_F(NVFuserTest, FusionIssue1189_CUDA) { at::Tensor t1 = at::randn({16, 16, 1}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto outputs = ke.run({t0, t1}); testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__); } @@ -7404,8 +7404,8 @@ TEST_F(NVFuserTest, FusionIssue1052_CUDA) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7510,8 +7510,8 @@ TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) { std::vector aten_inputs = {t0, t4}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7540,8 +7540,8 @@ TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { std::vector aten_inputs = {t0, t2}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7570,8 +7570,8 @@ TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { std::vector aten_inputs = {t0, t2}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0 + 1; auto ref2 = mean(t2, {0}); @@ -7611,14 +7611,14 @@ TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { tv5->axis(2)->parallelize(ParallelType::BIDz); KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 3}, options); at::Tensor t2 = at::randn({5, 6, 7}, options); at::Tensor t4 = at::randn({8, 9, 10}, options); std::vector aten_inputs = {t0, t2, t4}; - auto outputs = ke.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7655,14 +7655,14 @@ TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { tv5->axis(2)->parallelize(ParallelType::BIDz); KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 3}, options); at::Tensor t2 = at::randn({5, 6, 7}, options); at::Tensor t4 = at::randn({8, 9, 10}, options); std::vector aten_inputs = {t0, t2, t4}; - auto outputs = ke.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0.mean(at::IntArrayRef{0, 1}); auto ref2 = t2 + 1; @@ -7724,8 +7724,8 @@ TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { std::vector aten_inputs = {t0, t4}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0 + 3; auto ref2 = sum(t4 + 4); @@ -7786,8 +7786,8 @@ TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { 
std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7835,8 +7835,8 @@ TEST_F(NVFuserTest, FusionFloatPow_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto p4 = at::pow(t0, 4); auto p2 = at::pow(t0, 2); @@ -7904,8 +7904,8 @@ TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7927,8 +7927,8 @@ TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) { at::Tensor at_output = at::empty_strided({10}, {2}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {at_input}); - auto returned_outputs = ke.runFusion({at_input}, {at_output}); + ke.compile(&fusion, {at_input}); + auto returned_outputs = ke.run({at_input}, {at_output}); // Returned outputs should only contain one tensor that is the same // as the output tensor given to runFusion @@ -7975,8 +7975,8 @@ TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) { // Test result KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref_output = at::_softmax(aten_input, 1, false); testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__); } @@ -8049,8 +8049,8 @@ TEST_F(NVFuserTest, FusionIssue1133_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref = (t0 + 1).sum({1}) + 1; @@ -8083,8 +8083,8 @@ TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) { std::vector aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref = t0.sum({1}); @@ -8138,8 +8138,8 @@ TEST_F(NVFuserTest, FusionIssue1223_CUDA) { at::Tensor at_t0 = at::ones({11, 10}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {at_t0}); - auto cg_outputs = ke.runFusion({at_t0}); + ke.compile(&fusion, {at_t0}); + auto cg_outputs = ke.run({at_t0}); auto at_t1 = (at_t0 + 1).sum(); @@ -8182,8 +8182,8 @@ TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) { at::Tensor at_t3 = at::randn({128}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {at_t0, at_t3}); - auto cg_outputs = ke.runFusion({at_t0, at_t3}); + ke.compile(&fusion, {at_t0, at_t3}); + auto cg_outputs = ke.run({at_t0, at_t3}); auto at_t2 = (at_t0 + 1).min(); auto at_t4 = at_t3 + 1; @@ -8234,8 +8234,8 @@ TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) { at::Tensor at_t3 = at::randn({128}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {at_t0, at_t3}); - auto cg_outputs = ke.runFusion({at_t0, at_t3}); + ke.compile(&fusion, {at_t0, at_t3}); + auto cg_outputs = ke.run({at_t0, at_t3}); auto at_t2 = std::get<0>(at_t0.min(0)); auto at_t4 = at_t3 + 1; @@ -8271,8 +8271,8 @@ 
TEST_F(NVFuserTest, FusionRfactorIndirectRoot_CUDA) { auto at_out = at_in.sum({1, 2}); KernelExecutor ke; - ke.compileFusion(&fusion, {at_in}); - auto cg_outputs = ke.runFusion({at_in}); + ke.compile(&fusion, {at_in}); + auto cg_outputs = ke.run({at_in}); testValidate(&fusion, cg_outputs, {at_in}, {at_out}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index c2fdd8e7d33..f7deca99425 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -108,8 +108,8 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { at::Tensor t0 = at::randn({24}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum(); @@ -162,8 +162,8 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { at::Tensor t0 = at::randn({13, 17}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -211,8 +211,8 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { at::Tensor t0 = at::randn({24}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -261,8 +261,8 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { at::Tensor t0 = at::randn({24, 2}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -315,8 +315,8 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { at::Tensor t0 = at::randn({24}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -358,8 +358,8 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { auto t0 = at::randn({32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); @@ -367,7 +367,7 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is // illegal. The run-time validation of vectorization should throw an error. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion({t0_non_divisible})); + ASSERT_ANY_THROW(ke.run({t0_non_divisible})); } // If a split is validated at run time, it's not necessary to predicate. 
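For readers scanning this long mechanical diff, every hunk applies the same pattern: the KernelExecutor entry points compileFusion and runFusion are renamed to compile and run with unchanged argument lists. A minimal sketch of a post-rename call site (illustrative only, not part of the patch; fusion and t0 stand in for each test's own fusion and inputs):

  KernelExecutor ke;
  ke.compile(&fusion, {t0});       // previously: ke.compileFusion(&fusion, {t0});
  auto cg_outputs = ke.run({t0});  // previously: auto cg_outputs = ke.runFusion({t0});
  testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);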
@@ -413,8 +413,8 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { auto t0 = at::randn({1024}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -475,15 +475,15 @@ TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({15}, options); KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); // This should throw an exception as the extent of t0 is not // divisible by the vector width // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); auto t1 = at::randn({16}, options); - auto cg_outputs = ke.runFusion({t1}); + auto cg_outputs = ke.run({t1}); testValidate(&fusion, cg_outputs, {t1}, __LINE__, __FILE__); } @@ -530,8 +530,8 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) { std::vector<c10::IValue> aten_inputs = {t0, t1, t2}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -573,8 +573,8 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) { std::vector<c10::IValue> aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1); @@ -618,8 +618,8 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) { std::vector<c10::IValue> aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -791,8 +791,8 @@ TEST_F(NVFuserTest, FusionIssue1430_CUDA) { at::Tensor t0 = at::randn({V, W, X, Y, Z}, options); KernelExecutor ke; - ke.compileFusion(&fusion); - auto cg_outputs = ke.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1)); + ke.compile(&fusion); + auto cg_outputs = ke.run({t0}, LaunchParams(X, V, -1, Y, -1, -1)); auto t0_double = t0.to(at::kDouble); @@ -945,8 +945,8 @@ TEST_F(NVFuserTest, FusionTestGridComm_CUDA) { auto t1 = at::randn({X, Y, Z}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -989,8 +989,8 @@ TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) { auto t1 = at::randn({W, X}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1022,8 +1022,8 @@ TEST_F(NVFuserTest, FusionLargeSmem_CUDA) { auto t0 = at::randn({(int)(12288 * 4)}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1 + 2; testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); @@ -1060,7 +1060,7 @@ TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) { KernelExecutor ke; //
NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.compileFusion(&fusion, {t0})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0})); } // Try to test alignment when multiple tensors are @@ -1099,8 +1099,8 @@ TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) { auto t0 = at::randn({3, 4, 7, 2, 5}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -1127,7 +1127,7 @@ TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) { // Make sure the kernel is compiled. KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); } // Repro of #1506 @@ -1158,8 +1158,8 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); } @@ -1195,7 +1195,7 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) { KernelExecutor ke; // This should fail at compile time as we're trying to merge in a // non-contiguous dimension, then split and vectorize it. - ASSERT_ANY_THROW(ke.compileFusion(&fusion, {t0})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0})); } // Make sure the same fusion as FusionVectorizeContigIndex fails if @@ -1228,13 +1228,13 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail2_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // This should fail at the launch time as 14 is not divisible by the // vector word size. The two domains are merged, but they are not // contiguous, so contig indexing is not involved in this case. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); } TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) { @@ -1261,17 +1261,17 @@ TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) { at::empty({n + 1}, options).index({at::indexing::Slice(1)}); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); // Pass misaligned input. This must fail. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion({t0_misaligned})); + ASSERT_ANY_THROW(ke.run({t0_misaligned})); // Pass misaligned output. This must fail too. 
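// Note (illustrative, not in the original source): the misaligned tensors above are built by slicing one element off an aligned allocation, which offsets the data pointer so it is no longer a multiple of the vectorized access size in bytes; that offset is what the run-time alignment validation rejects.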
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion({t0}, {t1_misaligned})); + ASSERT_ANY_THROW(ke.run({t0}, {t1_misaligned})); } // Repro of issue #1530 @@ -1301,10 +1301,10 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); } TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { @@ -1332,8 +1332,8 @@ TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { { KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1342,8 +1342,8 @@ TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { tv2->setMemoryType(MemoryType::Global); { KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1385,11 +1385,11 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) { auto t1 = at::randn(shape2, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); + ke.compile(&fusion, {t0, t1}); // Vectorization of tv2 should be detected as invalid. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion({t0, t1})); + ASSERT_ANY_THROW(ke.run({t0, t1})); } TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) { @@ -1434,8 +1434,8 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) { auto t1 = at::randn(shape2, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1513,8 +1513,8 @@ TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) { auto t1 = at::randn({123, 111}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto t2 = t0.unsqueeze(0); auto t3 = t1 + t2; @@ -1564,8 +1564,8 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) { auto t1 = at::randn({10, 64}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1609,8 +1609,8 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) { auto t1 = at::randn({10, 64}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1652,8 +1652,8 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) { auto t1 = at::randn({50, 64}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, 
cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1757,8 +1757,8 @@ TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) { at::Tensor t2 = at::randn({128, 6}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1, t2}); - auto cg_outputs = ke.runFusion({t0, t1, t2}); + ke.compile(&fusion, {t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__); } @@ -1797,8 +1797,8 @@ TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) { at::Tensor t2 = at::randn({128, 6}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1, t2}); - auto cg_outputs = ke.runFusion({t0, t1, t2}); + ke.compile(&fusion, {t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__); } @@ -1836,15 +1836,15 @@ TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) { // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { ke.compileFusion(&fusion, {t0, t1}); }, + [&]() { ke.compile(&fusion, {t0, t1}); }, testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - ke.compileFusion(&fusion, {t0, t1}); + ke.compile(&fusion, {t0, t1}); } - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1880,16 +1880,16 @@ TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) { KernelExecutor ke; if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { ke.compileFusion(&fusion, {t0}); }, + [&]() { ke.compile(&fusion, {t0}); }, testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); } - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum({1}); @@ -2007,8 +2007,8 @@ TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) { at::Tensor t0 = at::randn({9999}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); testValidate(ke.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__); } @@ -2212,12 +2212,12 @@ TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) { checker.handle(gpulw.run()->topLevelExprs()); KernelExecutor ke; - ke.compileFusion(&fusion, {}, LaunchParams()); + ke.compile(&fusion, {}, LaunchParams()); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::randn({X, Y, Y, Z}, options); - auto cg_outputs = ke.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1)); + auto cg_outputs = ke.run({t0}, LaunchParams(-1, -1, -1, -1, -1, -1)); // by default Welford outputs sum of square diff so need to divide to get var cg_outputs[1] = cg_outputs[1].div((float)(X * Y * Y)); @@ -2281,8 +2281,8 @@ TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) { at::Tensor t1 = at::randn({32, 32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2346,8 +2346,8 @@ TEST_F(NVFuserTest,
FusionRedundantPredSync2_CUDA) { at::Tensor t1 = at::randn({32, 32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2428,8 +2428,8 @@ TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) { at::Tensor t1 = at::randn({32, 32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2533,8 +2533,8 @@ TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) { std::vector<c10::IValue> aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -2568,8 +2568,8 @@ TEST_F(NVFuserTest, FusionSqueeze1_CUDA) { std::vector<c10::IValue> aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -2597,8 +2597,8 @@ TEST_F(NVFuserTest, FusionContigPredicate_CUDA) { at::Tensor t0 = at::randn({3, 4}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -2621,8 +2621,8 @@ TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) { at::native::wrapped_scalar_tensor(at::Scalar(2.0), options.device()), t0); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__); } @@ -3243,8 +3243,8 @@ TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) { at::Tensor in2 = at::randn({12, 16}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {in1, in2}); - auto cg_outputs = ke.runFusion({in1, in2}); + ke.compile(&fusion, {in1, in2}); + auto cg_outputs = ke.run({in1, in2}); testValidate(&fusion, cg_outputs, {in1, in2}, __LINE__, __FILE__); } @@ -3517,8 +3517,8 @@ TEST_F(NVFuserTest, FusionVectorComponentReduce_CUDA) { auto t0 = at::randn({1024}, options); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__, ""); } @@ -3800,8 +3800,8 @@ TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) { at::Tensor t0 = at::randn({5, 5}, options); KernelExecutor ke; - ke.compileFusion(fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0}, __LINE__, __FILE__); @@ -3880,8 +3880,8 @@ TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) { at::Tensor t1 = at::randn({10}, options); KernelExecutor ke; - ke.compileFusion(fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; testValidate( @@ -3924,8 +3924,8 @@
TEST_F(NVFuserTest, FusionMappingRelation_CUDA) { at::Tensor t1 = at::randn({2, 1, 1}, options); KernelExecutor ke; - ke.compileFusion(fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0, t1}, __LINE__, __FILE__); @@ -3948,8 +3948,8 @@ TEST_F(NVFuserTest, FusionInlineAt_CUDA) { at::Tensor t0 = at::randn({100, 2}, options); KernelExecutor ke; - ke.compileFusion(fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0}, __LINE__, __FILE__); @@ -3982,8 +3982,8 @@ TEST_F(NVFuserTest, FusionReplayTrivialReductionAndBroadcast2_CUDA) { std::vector<c10::IValue> aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(fusion_ptr.get(), aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4047,15 +4047,15 @@ TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) { // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { ke.compileFusion(&fusion, {input1}); }, + [&]() { ke.compile(&fusion, {input1}); }, testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - ke.compileFusion(&fusion, {input1}); + ke.compile(&fusion, {input1}); } - auto cg_outputs = ke.runFusion({input1}); + auto cg_outputs = ke.run({input1}); testValidate(&fusion, cg_outputs, {input1}, __LINE__, __FILE__); } @@ -4429,8 +4429,8 @@ TEST_F(NVFuserTest, FusionSqueezeTransformPropagation_CUDA) { at::Tensor t0 = at::randn({5, 1, 1, 1, 1}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4483,8 +4483,8 @@ TEST_F(NVFuserTest, FusionSqueezeInlining_CUDA) { at::Tensor t0 = at::randn({1, 1024}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4886,8 +4886,8 @@ TEST_F(NVFuserTest, FusionPropagateVectorizePredicate_CUDA) { at::Tensor t0 = at::randn({32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); } @@ -4993,8 +4993,8 @@ TEST_F(NVFuserTest, FusionIssue2163ReproInvalidAlias_CUDA) { std::vector<c10::IValue> aten_inputs({at_input, at_weight}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(fusion_ptr.get(), aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto cg_output = cg_outputs.at(0); auto ref_x_sub_mean = at_input - at_input.sum({0}).unsqueeze(0); @@ -5081,8 +5081,8 @@ TEST_F(NVFuserTest, FusionFloatingPointType_CUDA) { std::vector<c10::IValue> inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); testValidate(&fusion, cg_outputs, inputs, __LINE__, __FILE__); } @@ -5147,8 +5147,8 @@ TEST_F(NVFuserTest,
FusionIntegerType_CUDA) { std::vector<c10::IValue> inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto i2 = int64_val; auto i3 = int_val; @@ -5210,8 +5210,8 @@ TEST_F(NVFuserTest, FusionVectorizeWelford1_CUDA) { at::Tensor t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref_avg = t0.mean({0}); auto ref_var = t0.var({0}, false) * shape[0]; @@ -5283,8 +5283,8 @@ TEST_F(NVFuserTest, FusionVectorizeWelford2_CUDA) { at::Tensor t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref_avg = t0.to(at::kDouble).mean({0}); auto ref_var = t0.to(at::kDouble).var({0}, false) * shape[0]; @@ -5385,8 +5385,8 @@ TEST_F(NVFuserTest, FusionExprSortMatmulLikeSchedule_CUDA) { at::Tensor t1 = at::randn({N1, N2, K1, K2}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(ke.kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -5447,15 +5447,15 @@ TEST_F(NVFuserTest, FusionCpAsyncCommitWait_CUDA) { KernelExecutor ke; if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { ke.compileFusion(&fusion, {t0}); }, + [&]() { ke.compile(&fusion, {t0}); }, testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -5520,8 +5520,8 @@ TEST_F(NVFuserTest, FusionClearThreadPredicateByRAWSync_CUDA) { std::vector<c10::IValue> inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto t3 = t0.sum({1}).sum({0}); auto t6 = t0.sum({1}); @@ -5642,8 +5642,8 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitShared_CUDA) { std::vector<c10::IValue> inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref_t1 = t0.sum({0}); auto ref_t4 = t1.exp(); @@ -5696,8 +5696,8 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) { std::vector<c10::IValue> inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref_t1 = t0.sum({0}); auto ref_t3 = t1.exp(); @@ -5771,7 +5771,7 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) { KernelExecutor ke; // Lower the kernel with large inputs and int64 index type.
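// Note (illustrative, not in the original source): in this codebase PrimDataType::Int denotes the 64-bit index type and PrimDataType::Int32 the 32-bit one, which is why the comments about "int64" pair with Int below.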
CompileParams compile_opts = {.index_type = PrimDataType::Int}; - ke.compileFusion(&fusion, large_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, large_inputs, LaunchParams(), compile_opts); NVF_CHECK( ke.kernel()->indexType() == PrimDataType::Int, @@ -5780,15 +5780,15 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) { // Since the index type is int64, both small and large inputs // should work fine - ke.runFusion(small_inputs); - ke.runFusion(large_inputs); + ke.run(small_inputs); + ke.run(large_inputs); } { KernelExecutor ke; // Lower the kernel with small inputs and int64 index type. CompileParams compile_opts = {.index_type = PrimDataType::Int}; - ke.compileFusion(&fusion, small_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, small_inputs, LaunchParams(), compile_opts); NVF_CHECK( ke.kernel()->indexType() == PrimDataType::Int, @@ -5797,15 +5797,15 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) { // Since the index type is int64, both small and large inputs // should work fine - ke.runFusion(small_inputs); - ke.runFusion(large_inputs); + ke.run(small_inputs); + ke.run(large_inputs); } { KernelExecutor ke; LaunchParams launch_params; CompileParams compile_opts = {.index_type = PrimDataType::Int32}; - ke.compileFusion(&fusion, small_inputs, launch_params, compile_opts); + ke.compile(&fusion, small_inputs, launch_params, compile_opts); NVF_CHECK( ke.kernel()->indexType() == PrimDataType::Int32, @@ -5814,15 +5814,13 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) { // This should complete successfully as the arguments are small // enough to use the int32 index type - ke.runFusion(small_inputs); + ke.run(small_inputs); // This should fail as the Kernel is already compiled for Int32, but // the arguments are too large CompileParams compile_opts_large = {.index_type = PrimDataType::Int}; EXPECT_THAT( - [&]() { - ke.runFusion(large_inputs, launch_params, compile_opts_large); - }, + [&]() { ke.run(large_inputs, launch_params, compile_opts_large); }, testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr( "Kernel index type and compilation index type don't match"))); } @@ -5834,8 +5832,7 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) { // This should fail due to the conflict EXPECT_THAT( [&]() { - ke.compileFusion( - &fusion, large_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, large_inputs, LaunchParams(), compile_opts); }, testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr( "Compilation with int32 is requested but int64 is required for the arguments"))); @@ -6249,8 +6246,8 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonOutput_CUDA) { std::vector<c10::IValue> inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(fusion_ptr.get(), inputs); + auto cg_outputs = ke.run(inputs); // check thread_pred auto kernel = ke.kernel(); @@ -6313,8 +6310,8 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonNeighbor_CUDA) { std::vector<c10::IValue> inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(fusion_ptr.get(), inputs); + auto cg_outputs = ke.run(inputs); // check thread_pred auto kernel = ke.kernel(); @@ -6769,8 +6766,8 @@ TEST_F(ExpandedBroadcastGlobalIntermediateTest, TheTest_CUDA) { at::Tensor t0 = at::randn({2, 1, 2}, options); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), {t0}); - auto cg_output = ke.runFusion({t0}).at(0); + ke.compile(fusion_ptr.get(), {t0}); + auto cg_output = ke.run({t0}).at(0);
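// Note (illustrative, not in the original source): the second output dimension is an expanded broadcast, so the checks below observe a logical extent of 1L << 60 even though no buffer of that size is ever materialized.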
ASSERT_EQ(cg_output.size(0), 2); ASSERT_EQ(cg_output.size(1), (1L << 60L)); @@ -6818,9 +6815,8 @@ TEST_F(NVFuserTest, FusionTestWarnRegisterSpill_CUDA) { compile_opts.maxrregcount = 32; compile_opts.enable_ptxas_verbose = true; KernelExecutor ke; - ke.compileFusion( - &fusion, {aten_input}, heuristic_params->lparams, compile_opts); - auto cg_outputs = ke.runFusion({aten_input}); + ke.compile(&fusion, {aten_input}, heuristic_params->lparams, compile_opts); + auto cg_outputs = ke.run({aten_input}); // validate results testValidate( @@ -6936,8 +6932,8 @@ TEST_F(NVFuserTest, IsFinite_CUDA) { const auto input = at::from_blob(data.data(), {3}, {1}).to(options); KernelExecutor ke; - ke.compileFusion(fusion, {input}); - const auto output = ke.runFusion({input}); + ke.compile(fusion, {input}); + const auto output = ke.run({input}); testValidate(fusion, output, {input}, __LINE__, __FILE__); } @@ -7036,7 +7032,7 @@ TEST_F(NVFuserTest, FusionOptionsGuard_CUDA) { captureStdout(); KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, {aten_input}, heuristic_params->lparams, @@ -7411,8 +7407,8 @@ TEST_F(NVFuserTest, AllInputDtypes) { CompileParams opt{.index_type = index_type}; KernelExecutor ke; - ke.compileFusion(fusion.get(), args, LaunchParams{}, opt); - auto outputs = ke.runFusion(args, LaunchParams{}, opt); + ke.compile(fusion.get(), args, LaunchParams{}, opt); + auto outputs = ke.run(args, LaunchParams{}, opt); auto kernel_result = outputs.at(0).item(); auto expect = ee.evaluate(output).as<at::Tensor>().item(); @@ -7531,8 +7527,8 @@ TEST_F(NVFuserTest, OpaqueTupleAsComplex) { args.push(Opaque(std::array{1.2, 3.4})); KernelExecutor ke; - ke.compileFusion(&fusion); - auto outputs = ke.runFusion(args); + ke.compile(&fusion); + auto outputs = ke.run(args); EXPECT_EQ( outputs.at(0).item<c10::complex<double>>(), c10::complex<double>(1.2, 3.4)); @@ -7558,8 +7554,8 @@ TEST_F(NVFuserTest, StructConstruct) { fusion.addOutput(tv); KernelExecutor ke; - ke.compileFusion(&fusion); - auto outputs = ke.runFusion({1.2, 3.4}); + ke.compile(&fusion); + auto outputs = ke.run({1.2, 3.4}); EXPECT_EQ( outputs.at(0).item<c10::complex<double>>(), c10::complex<double>(1.2, 3.4)); @@ -7596,11 +7592,11 @@ TEST_F(NVFuserTest, VectorizationStrideValidation) { std::vector<c10::IValue> aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); // This previously triggered a false positive error with the stride // validation - auto cg_outputs = ke.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); } @@ -7625,9 +7621,9 @@ TEST_F(NVFuserTest, ConstLongExpressions) { fusion->addOutput(tv0); KernelExecutor ke; - ke.compileFusion(fusion); + ke.compile(fusion); - auto outputs = ke.runFusion({}); + auto outputs = ke.run({}); testValidate(fusion, outputs, {}, __LINE__, __FILE__); } @@ -7697,9 +7693,9 @@ TEST_F(NVFuserTest, PredicateRNGOps) { at::Tensor t0 = at::zeros({2048, size}, options); KernelExecutor ke; - ke.compileFusion(fusion, {t0}); + ke.compile(fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); } TEST_F(NVFuserTest, LoweringHook) { @@ -7888,7 +7884,7 @@ TEST_F(NVFuserTest, UnsupportedBFloat) { KernelExecutor ke; EXPECT_THAT( - [&]() { ke.compileFusion(&fusion); }, + [&]() { ke.compile(&fusion); }, testing::ThrowsMessage<nvfuser::nvfError>( testing::HasSubstr("Reason: Fusion contains BFloat16"))); } @@ -7953,8 +7949,8 @@ TEST_F(NVFuserTest, BlockReduction3D) { at::Tensor t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs =
ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum(0).sum(-1); testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); }; @@ -7996,8 +7992,8 @@ TEST_F(NVFuserTest, ReverseMerge) { at::Tensor t0 = at::randn({11, 12}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(t0.equal(cg_outputs.at(0))); } @@ -8026,8 +8022,8 @@ TEST_F(NVFuserTest, FusionCpAsyncPredicateAvoidIllegalMemoryAccess) { at::Tensor t0 = at::randn({m, n}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(t0.equal(cg_outputs.at(0))); } @@ -8360,8 +8356,8 @@ TEST_F(NVFuserTest, BroadcastFromNowhereFusion) { at::Tensor t0 = at::randn({4}, options); at::Tensor t1 = at::randn({2, 4}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -8444,8 +8440,8 @@ TEST_F(NVFuserTest, MultipleDifferentSizeGridReduction) { const std::vector<c10::IValue> inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); testValidate(&fusion, cg_outputs, inputs, __LINE__, __FILE__); } @@ -8879,8 +8875,8 @@ TEST_F(NVFuserTest, CpAsyncDataTypeBool) { // ); // If not correctly lowered, would trigger error in compile KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu_compute_with.cpp b/tests/cpp/test_gpu_compute_with.cpp index 1a505e495f0..df3b5a9bff1 100644 --- a/tests/cpp/test_gpu_compute_with.cpp +++ b/tests/cpp/test_gpu_compute_with.cpp @@ -165,8 +165,8 @@ TEST_F(NVFuserTest, FusionComputeWith1_CUDA) { at::Tensor t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -220,8 +220,8 @@ TEST_F(NVFuserTest, FusionComputeWith2_CUDA) { at::Tensor t0 = at::randn({dimx}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); @@ -262,8 +262,8 @@ TEST_F(NVFuserTest, FusionComputeWith3_CUDA) { at::Tensor t0 = at::randn({123}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -308,8 +308,8 @@ TEST_F(NVFuserTest, FusionComputeWith4_CUDA) { at::Tensor t0 = at::randn({345, 10}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -347,8 +347,8 @@ TEST_F(NVFuserTest, FusionComputeWith5_CUDA) { at::Tensor t0 = at::randn({345,
10}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -450,8 +450,8 @@ TEST_F(NVFuserTest, FusionComputeWith6_CUDA) { auto t0 = at::randn(input_shape, options_half); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, LaunchParams()); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}, LaunchParams()); + auto cg_outputs = ke.run({t0}); auto t1 = t0.to(at::kFloat); auto t2 = t1.mean({0, 1, 2}); diff --git a/tests/cpp/test_gpu_fused_reduction.cpp b/tests/cpp/test_gpu_fused_reduction.cpp index 0fa61a66397..6080862aed5 100644 --- a/tests/cpp/test_gpu_fused_reduction.cpp +++ b/tests/cpp/test_gpu_fused_reduction.cpp @@ -116,8 +116,8 @@ TEST_F(NVFuserTest, FusionGridAllreduce1_CUDA) { auto t0 = at::randn({nx}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0).unsqueeze(0) + t0; @@ -165,8 +165,8 @@ TEST_F(NVFuserTest, FusionGridAllreduce2_CUDA) { auto t0 = at::randn({nx}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0).unsqueeze(0) + t0; @@ -213,8 +213,8 @@ TEST_F(NVFuserTest, FusionGridAllreduce3_CUDA) { auto t0 = at::randn({nx, ny}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0, {1}).unsqueeze(-1) + t0; @@ -258,8 +258,8 @@ TEST_F(NVFuserTest, FusionGridAllreduce4_CUDA) { auto t0 = at::randn({nx}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (sum(t0) + 1).unsqueeze(0) + t0; @@ -320,8 +320,8 @@ TEST_F(NVFuserTest, FusionGridAllreduce5_CUDA) { auto t5 = at::randn({bdimy, bdimx}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t5}); - auto cg_outputs = ke.runFusion({t0, t5}); + ke.compile(&fusion, {t0, t5}); + auto cg_outputs = ke.run({t0, t5}); auto ref = (sum(t0, {1}) + 1).unsqueeze(-1) + t0; @@ -372,8 +372,8 @@ TEST_F(NVFuserTest, FusionGridAllreduce6_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); @@ -418,8 +418,8 @@ TEST_F(NVFuserTest, FusionGridAllreduceWelford1_CUDA) { auto t0 = at::randn({nx}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0.mean({0}).unsqueeze(0) + t0) + t0.var({0}, false).unsqueeze(0) * nx; @@ -468,8 +468,8 @@ TEST_F(NVFuserTest, FusionGridAllreduceWelford2_CUDA) { auto t0 = at::randn({nx, ny}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (sum(t0, {1}) / ny).unsqueeze(-1) + t0; @@ -588,8 +588,8 @@ TEST_F(NVFuserTest, FusionFusedReductionBatchnorm_CUDA) { KernelExecutor ke; LaunchParams launch_params(2, 2, -1, -1, -1, -1); - 
ke.compileFusion(&fusion, aten_inputs, launch_params); - auto cg_outputs = ke.runFusion(aten_inputs, launch_params); + ke.compile(&fusion, aten_inputs, launch_params); + auto cg_outputs = ke.run(aten_inputs, launch_params); auto t5 = t0.to(at::kFloat); auto t6 = t1.to(at::kFloat); @@ -654,8 +654,8 @@ TEST_F(NVFuserTest, FusionGroupedReduction1_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({1}) * 2; @@ -699,8 +699,8 @@ TEST_F(NVFuserTest, FusionGroupedReduction2_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) + std::get<0>((t0 + 2).max(1)); @@ -742,8 +742,8 @@ TEST_F(NVFuserTest, FusionGroupedReduction3_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({1}) + t0.to(c10::kDouble).sum({1}).to(c10::kFloat); @@ -830,8 +830,8 @@ TEST_F(NVFuserTest, FusionGroupedReduction6_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); testValidate(ke.kernel(), outputs, {t0}, __LINE__, __FILE__); } @@ -893,8 +893,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor1_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({0}) * 2; @@ -938,8 +938,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor2_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({0}) * 2; @@ -984,8 +984,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionAfterComputeAt_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) * 2; @@ -1024,8 +1024,8 @@ TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t3 = t0.sum({0}).unsqueeze(-1); auto ref = t0 + t3 + t3; @@ -1077,8 +1077,8 @@ TEST_F(NVFuserTest, FusionGroupAllreduce2_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t2 = t0.sum({1}).unsqueeze(-1); auto t6 = t0.to(c10::kDouble).sum({1}).unsqueeze(-1).to(c10::kFloat); @@ -1125,8 +1125,8 @@ TEST_F(NVFuserTest, FusionGroupAllreduce3_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t3 = t0 / t0.sum({0}).unsqueeze(0); auto t6 = t0 / std::get<0>(t0.max(0)).unsqueeze(0); @@ -1178,8 +1178,8 @@ TEST_F(NVFuserTest, FusionGroupAllreduce4_CUDA) 
{ auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); at::Tensor ref = t0; for (int i = 0; i < num_reductions; ++i) { @@ -1266,8 +1266,8 @@ TEST_F(NVFuserTest, FusionGroupAllreduce5_CUDA) { std::vector<at::indexing::TensorIndex> indices({at::indexing::Slice(0, 10)}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t3 = t0 / t0.sum({0}).unsqueeze(0).to(at::kComplexDouble); auto t7 = t4 / t4.sum({0}).unsqueeze(0).to(at::kComplexDouble); @@ -1429,13 +1429,13 @@ TEST_F(NVFuserTest, FusionPersistentBNBackwardAllreduce_CUDA) { validateNoParallelBroadcastExist(gpulw.run()); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); if (bidx * bidy > deviceSMCount()) { GTEST_SKIP() << "Not enough SMs to run this test"; } - auto outputs = ke.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); std::vector<int64_t> at_reduction_axes; std::copy( @@ -1535,8 +1535,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = (t0_double + 1).sum({0}) + (t0_double + 2).sum({0}); @@ -1650,8 +1650,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) { std::vector<c10::IValue> aten_inputs({t0, t1, t2}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t0_double = t0.to(at::kDouble); auto t1_double = t1.to(at::kDouble); @@ -1781,8 +1781,8 @@ TEST_F( std::vector<c10::IValue> aten_inputs({t0, t1, t2}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t0_double = t0.to(at::kDouble); auto t1_double = t1.to(at::kDouble); @@ -1869,8 +1869,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); @@ -1947,8 +1947,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce2_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); @@ -2031,8 +2031,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce3_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto t4 = t0_double + 1 + (t0_double + 1).sum({0}).unsqueeze(0); @@ -2123,8 +2123,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce4_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs =
ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); @@ -2184,8 +2184,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.mean({0}).unsqueeze(0); @@ -2249,8 +2249,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford2_CUDA) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.mean({0}).unsqueeze(0); @@ -2386,7 +2386,7 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) { auto t0 = at::randn(input_shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); // Skip the rest of this test size if the required number of SMs // exceeds the available SM count @@ -2397,7 +2397,7 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) { return; } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto t1 = t0.to(at::kDouble); auto t2 = t1.mean({0, 1, 2}).unsqueeze(0).unsqueeze(0).unsqueeze(0); @@ -2542,8 +2542,8 @@ TEST_F(NVFuserTest, FusionCrossEntropyGatherPattern_CUDA) { std::vector<c10::IValue> inputs = {at_log_probs, at_labels}; KernelExecutor ke; - ke.compileFusion(&fusion, inputs); - auto cg_outputs = ke.runFusion(inputs); + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref = at::gather(at_log_probs, 1, at_labels.unsqueeze(1)).squeeze(); diff --git a/tests/cpp/test_gpu_indexing_ops.cpp b/tests/cpp/test_gpu_indexing_ops.cpp index 95feb635954..ed5c4a5ce86 100644 --- a/tests/cpp/test_gpu_indexing_ops.cpp +++ b/tests/cpp/test_gpu_indexing_ops.cpp @@ -397,8 +397,8 @@ TEST_F(NVFuserTest, FusionIndexSelect_Sum_CUDA) { auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::Reduction, aten_inputs); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs, heuristic_params->lparams); - ke.runFusion(aten_inputs, {cg_output}, heuristic_params->lparams); + ke.compile(&fusion, aten_inputs, heuristic_params->lparams); + ke.run(aten_inputs, {cg_output}, heuristic_params->lparams); auto tv0_ref = at::index_select(input0, 0, input_idx); at::Tensor tv2_ref = tv0_ref * input1; diff --git a/tests/cpp/test_gpu_outer_reduction.cpp b/tests/cpp/test_gpu_outer_reduction.cpp index a0e73a69b5c..7934689844c 100644 --- a/tests/cpp/test_gpu_outer_reduction.cpp +++ b/tests/cpp/test_gpu_outer_reduction.cpp @@ -116,7 +116,7 @@ TEST_F(OuterReductionTest, GroupedGridWelfordOuterOpt) { std::vector<c10::IValue> aten_inputs = {t0}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); NVF_CHECK( ke.kernel()->summary().has_outer_grouped_grid_welford == @@ -132,7 +132,7 @@ TEST_F(OuterReductionTest, GroupedGridWelfordOuterOpt) { ", ", params.bidx); - auto cg_outputs = ke.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t1 = t0; auto t2 = params.dtype == DataType::Half ?
t1.to(at::kFloat) : t1; @@ -639,7 +639,7 @@ void grid_persistent_reduction_outer_norm_like( auto t0 = at::randn(input_shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -648,12 +648,12 @@ void grid_persistent_reduction_outer_norm_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = ke.runFusion({t0}); + cg_outputs = ke.run({t0}); } } @@ -738,7 +738,7 @@ void grid_persistent_welford_outer_norm_like( auto t0 = at::randn(input_shape, options_half); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -747,12 +747,12 @@ void grid_persistent_welford_outer_norm_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = ke.runFusion({t0}); + cg_outputs = ke.run({t0}); } } @@ -899,7 +899,7 @@ void grid_persistent_batchnorm_manual( {at_input_nvfuser, at_weight, at_bias, at_running_mean, at_running_var}); KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), aten_inputs); + ke.compile(fusion_ptr.get(), aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -908,7 +908,7 @@ void grid_persistent_batchnorm_manual( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = ke.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); cg_outputs.at(2) = cg_outputs.at(2).permute({0, 3, 1, 2}); auto at_output = at::batch_norm( @@ -934,7 +934,7 @@ void grid_persistent_batchnorm_manual( if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = ke.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } } @@ -1038,7 +1038,7 @@ void grid_persistent_reduction_outer_norm_bwd_like( std::vector<c10::IValue> aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -1047,12 +1047,12 @@ void grid_persistent_reduction_outer_norm_bwd_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = ke.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = ke.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } @@ -1225,7 +1225,7 @@ void grid_persistent_batchnorm_bwd_manual( std::vector<at::Tensor> cg_outputs; KernelExecutor ke; - ke.compileFusion(fusion_ptr.get(), aten_inputs); + ke.compile(fusion_ptr.get(), aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -1234,7 +1234,7 @@ void grid_persistent_batchnorm_bwd_manual( << params.bidx * bidy << ", available: " << deviceSMCount(); } - cg_outputs = ke.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); // Permute grad_input output cg_outputs.at(0) = cg_outputs.at(0).permute({0, 3, 1, 2}); @@ -1262,7 +1262,7 @@ void grid_persistent_batchnorm_bwd_manual( if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = ke.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } } @@ -2182,8 +2182,8 @@ TEST_F(OuterReductionTest,
IterGroupedBlockReduction) { scheduler->schedule(&fusion, rparams); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs, heuristic_params->lparams); - auto cg_outputs = ke.runFusion(aten_inputs, heuristic_params->lparams); + ke.compile(&fusion, aten_inputs, heuristic_params->lparams); + auto cg_outputs = ke.run(aten_inputs, heuristic_params->lparams); // lowering & check iteration grouped reductions NVF_CHECK( @@ -2291,8 +2291,8 @@ void shmooTestsOfIterGroupedBlockOrGridReduction( std::vector<c10::IValue> aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = ke.runFusion(aten_inputs, lparams); + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, @@ -2548,8 +2548,8 @@ TEST_F(OuterReductionTest, IterGroupedMultipleReductions) { auto t1 = at::randn(shape, options); std::vector<c10::IValue> aten_inputs({t0, t1}); - ke.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = ke.runFusion(aten_inputs, lparams); + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, diff --git a/tests/cpp/test_gpu_transpose.cpp b/tests/cpp/test_gpu_transpose.cpp index 947283d4975..13b01bfb8c9 100644 --- a/tests/cpp/test_gpu_transpose.cpp +++ b/tests/cpp/test_gpu_transpose.cpp @@ -548,8 +548,8 @@ TEST_F(TransposeTest, FusionManualScheduleTransposeComplexDAG1) { at::Tensor input2 = at::randn({512, 256, 1024}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {input0, input1, input2}); - auto outputs = ke.runFusion({input0, input1, input2}); + ke.compile(&fusion, {input0, input1, input2}); + auto outputs = ke.run({input0, input1, input2}); testValidate(&fusion, outputs, {input0, input1, input2}, __LINE__, __FILE__); } @@ -988,8 +988,8 @@ TEST_F(TransposeTest, FusionTransposeBankConflict9) { at::Tensor input = at::randn({32, 32, 2}, options); KernelExecutor ke; - ke.compileFusion(&fusion); - auto outputs = ke.runFusion({input}); + ke.compile(&fusion); + auto outputs = ke.run({input}); testValidate(&fusion, outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu_view.cpp b/tests/cpp/test_gpu_view.cpp index e4a322860f7..c3319a7ef39 100644 --- a/tests/cpp/test_gpu_view.cpp +++ b/tests/cpp/test_gpu_view.cpp @@ -135,8 +135,8 @@ TEST_F(GpuViewTest, FusionViewAsRealOutput) { std::vector<c10::IValue> aten_inputs = {at_x, at_bias, at_y}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -638,8 +638,8 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain) { auto t1 = at::randn({1, 6}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -851,8 +851,8 @@ TEST_F(GpuViewTest, FusionFlattenAfterUnsqueezeOutput) { x_reshape->axis(0)->parallelize(ParallelType::TIDx); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -915,14 +915,14 @@ TEST_F(GpuViewTest, FusionExpandRepro) { std::vector<c10::IValue> aten_inputs = {at_x, at_y}; KernelExecutor ke; - ke.compileFusion(&fusion); +
+  ke.compile(&fusion);
 
   LaunchParams l_params;
-  auto outputs = ke.runFusion(aten_inputs, {}, l_params, {});
+  auto outputs = ke.run(aten_inputs, {}, l_params, {});
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 
   // second run to verify cached output allocation
-  outputs = ke.runFusion(aten_inputs, {}, l_params, {});
+  outputs = ke.run(aten_inputs, {}, l_params, {});
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -1350,8 +1350,8 @@ TEST_F(GpuViewTest, FusionPwiseViewSchedule) {
   at::Tensor t3 = at::randn({x, y, z}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0, t3});
-  auto cg_outputs = ke.runFusion({t0, t3});
+  ke.compile(&fusion, {t0, t3});
+  auto cg_outputs = ke.run({t0, t3});
 
   testValidate(&fusion, cg_outputs, {t0, t3}, __LINE__, __FILE__);
 }
@@ -1416,8 +1416,8 @@ TEST_F(GpuViewTest, FusionSumViewSchedule) {
   auto t6 = t0 + t3;
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0, t3});
-  auto cg_outputs = ke.runFusion({t0, t3});
+  ke.compile(&fusion, {t0, t3});
+  auto cg_outputs = ke.run({t0, t3});
 
   testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t5, t6}, __LINE__, __FILE__);
 }
@@ -1945,8 +1945,8 @@ TEST_F(GpuViewTest, FusionReshapeMapping) {
   at::Tensor t3 = at::randn({w, x * y, z}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0, t3});
-  auto cg_outputs = ke.runFusion({t0, t3});
+  ke.compile(&fusion, {t0, t3});
+  auto cg_outputs = ke.run({t0, t3});
 
   testValidate(&fusion, cg_outputs, {t0, t3}, __LINE__, __FILE__);
 }
@@ -2319,8 +2319,8 @@ TEST_F(GpuViewTest, ExpandedBroadcast) {
       at::randn({4, 5}, at::dtype(at::kFloat).device(at::kCUDA, 0));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {in_tensor});
-  at::Tensor actual_out_tensor = ke.runFusion({in_tensor})[0];
+  ke.compile(&fusion, {in_tensor});
+  at::Tensor actual_out_tensor = ke.run({in_tensor})[0];
 
   testValidate(&fusion, {actual_out_tensor}, {in_tensor}, __LINE__, __FILE__);
 }
@@ -2698,8 +2698,8 @@ TEST_F(GpuViewTest, FusionMismatchingReshape) {
   // this fusion.
   at::Tensor t0 = at::randn({2, 3, 5}).to(options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
-  auto cg_outputs = ke.runFusion({t0});
+  ke.compile(&fusion, {t0});
+  auto cg_outputs = ke.run({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp
index 4289006ec27..bc7af04a0e0 100644
--- a/tests/cpp/test_indexing.cpp
+++ b/tests/cpp/test_indexing.cpp
@@ -1774,8 +1774,8 @@ TEST_F(IndexingTest, SmemAllocationDomainForTranspose) {
   at::Tensor input0 = at::randn({256, 256}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input0});
-  auto outputs = ke.runFusion({input0});
+  ke.compile(&fusion, {input0});
+  auto outputs = ke.run({input0});
 
   testValidate(&fusion, outputs, {input0}, __LINE__, __FILE__);
 }
@@ -3041,8 +3041,8 @@ TEST_F(PredicateIndexingTest, DoubleBuffering1) {
   EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 }
@@ -3140,8 +3140,8 @@ TEST_F(PredicateIndexingTest, CircularBuffering1) {
   EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 }
@@ -3307,8 +3307,8 @@ TEST_F(PredicateIndexingTest, UnrolledCircularBuffering) {
   EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 }
@@ -3388,8 +3388,8 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering1) {
   EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 }
@@ -3477,8 +3477,8 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering2) {
   EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 }
@@ -3583,8 +3583,8 @@ TEST_P(PredicateIndexingTest, UnswitchedCircularBuffering3) {
   EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 }
@@ -3662,8 +3662,8 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering4) {
   auto t0 = at::randn({16}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
-  auto cg_outputs = ke.runFusion({t0});
+  ke.compile(&fusion, {t0});
+  auto cg_outputs = ke.run({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
@@ -3755,8 +3755,8 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplit1) {
   std::vector aten_inputs = {t0};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -3846,8 +3846,8 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithUnswitch) {
   std::vector aten_inputs = {t0};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -3941,8 +3941,8 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithCircularBuffering) {
   std::vector aten_inputs = {t0};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -4052,8 +4052,8 @@ TEST_F(
   std::vector aten_inputs = {t0};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -4137,8 +4137,8 @@ TEST_P(PredicateIndexingTest, UnswitchPredicateIssueRepro681) {
   }
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   auto ref = t0.to(at::kDouble).sum();
 
@@ -4297,8 +4297,8 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithUnswitchAndBroadcast) {
   EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -4420,8 +4420,8 @@ TEST_F(PredicateIndexingTest, UnswitchConsolidationDifferentThreading) {
   EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -4835,8 +4835,8 @@ TEST_F(ContigIndexingTest, ConcretizedBroadcastMerge) {
   std::vector aten_inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -5064,8 +5064,8 @@ TEST_F(ContigPredicateIndexingTest, NonDivisibleSplit1) {
   std::vector aten_inputs = {t0};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
 }
diff --git a/tests/cpp/test_indexing_advanced.cpp b/tests/cpp/test_indexing_advanced.cpp
index eb94b66f9ca..4b0674ca015 100644
--- a/tests/cpp/test_indexing_advanced.cpp
+++ b/tests/cpp/test_indexing_advanced.cpp
@@ -73,9 +73,9 @@ TEST_P(AdvancedIndexingTest, InlineBroadcast) {
   at::Tensor t1 = at::randn({3, 123}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0, t1});
+  ke.compile(&fusion, {t0, t1});
 
-  auto outputs = ke.runFusion({t0, t1});
+  auto outputs = ke.run({t0, t1});
 
   testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -124,8 +124,8 @@ TEST_P(AdvancedIndexingTest, 1) {
 
   std::vector aten_inputs = {t0, t1};
 
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -175,8 +175,8 @@ TEST_P(AdvancedIndexingTest, 2) {
 
   std::vector aten_inputs = {t0, t1};
 
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -230,8 +230,8 @@ TEST_P(AdvancedIndexingTest, 4) {
   std::vector aten_inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -264,8 +264,8 @@ TEST_P(AdvancedIndexingTest, 5) {
   std::vector aten_inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -346,8 +346,8 @@ TEST_P(AdvancedIndexingTest, 7) {
   auto at_t1 = at::randn({numel_x, numel_y}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {at_t0, at_t1});
-  auto cg_outputs = ke.runFusion({at_t0, at_t1});
+  ke.compile(&fusion, {at_t0, at_t1});
+  auto cg_outputs = ke.run({at_t0, at_t1});
 
   auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1)
                          .to(at::kDouble)
@@ -392,8 +392,8 @@ TEST_P(AdvancedIndexingTest, 8) {
   auto at_t1 = at::randn({numel_x, numel_y}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {at_t0, at_t1});
-  auto cg_outputs = ke.runFusion({at_t0, at_t1});
+  ke.compile(&fusion, {at_t0, at_t1});
+  auto cg_outputs = ke.run({at_t0, at_t1});
 
   auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1)
                          .to(at::kDouble)
@@ -485,8 +485,8 @@ TEST_P(AdvancedIndexingTest, 10) {
   at::Tensor output = at::empty_like(input1);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input1, input2});
-  ke.runFusion({input1, input2}, {output});
+  ke.compile(&fusion, {input1, input2});
+  ke.run({input1, input2}, {output});
 
   at::Tensor tv2_ref = input2 + 2.0;
   at::Tensor output_ref = input1 + tv2_ref;
@@ -538,8 +538,8 @@ TEST_P(AdvancedIndexingTest, 11) {
 
   std::vector aten_inputs = {t0, t1};
 
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -575,8 +575,8 @@ TEST_P(AdvancedIndexingTest, 12) {
   std::vector aten_outputs = {t2, t4};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = ke.runFusion({aten_input});
+  ke.compile(&fusion, {aten_input});
+  auto cg_outputs = ke.run({aten_input});
 
   testValidate(
       &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
@@ -624,8 +624,8 @@ TEST_P(AdvancedIndexingTest, 13) {
   std::vector aten_inputs = {t0, t1, t2};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -666,8 +666,8 @@ TEST_P(AdvancedIndexingTest, 14) {
   std::vector aten_inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -701,8 +701,8 @@ TEST_P(AdvancedIndexingTest, 15) {
   std::vector aten_inputs = {t0, t3};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -733,8 +733,8 @@ TEST_P(AdvancedIndexingTest, 16) {
   std::vector aten_inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -768,8 +768,8 @@ TEST_P(AdvancedIndexingTest, 17) {
   std::vector aten_inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -805,8 +805,8 @@ TEST_P(AdvancedIndexingTest, 18) {
   std::vector inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto cg_outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto cg_outputs = ke.run(inputs);
 
   auto ref = (t0.unsqueeze(-1) + t1).sum();
 
@@ -849,8 +849,8 @@ TEST_P(AdvancedIndexingTest, 19) {
   std::vector inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 }
@@ -914,8 +914,8 @@ TEST_F(AdvancedIndexingIdModelTest, 20) {
   std::vector inputs = {t0, t1, t2};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 #endif
@@ -979,8 +979,8 @@ TEST_F(AdvancedIndexingIdModelTest, 21) {
   std::vector inputs = {t0, t3, t6};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 #endif
@@ -1023,8 +1023,8 @@ TEST_F(AdvancedIndexingIdModelTest, MultiPromotion1) {
   std::vector aten_inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 }
@@ -1120,8 +1120,8 @@ TEST_F(AdvancedIndexingIdModelTest, IndexSplitMerge) {
   std::vector aten_inputs = {t0, t1};
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   testValidate(
       &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
diff --git a/tests/cpp/test_inlining.cpp b/tests/cpp/test_inlining.cpp
index 767d6cfc836..569b541a838 100644
--- a/tests/cpp/test_inlining.cpp
+++ b/tests/cpp/test_inlining.cpp
@@ -49,8 +49,8 @@ TEST_F(InliningTest, InliningMismatchedDims1) {
   at::Tensor input = at::randn({2, 3, 4}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input});
-  auto cg_outputs = ke.runFusion({input});
+  ke.compile(&fusion, {input});
+  auto cg_outputs = ke.run({input});
 
   testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__);
 }
@@ -81,8 +81,8 @@ TEST_F(InliningTest, InliningMismatchedDims2) {
   at::Tensor input = at::randn({2, 3, 4}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input});
-  auto cg_outputs = ke.runFusion({input});
+  ke.compile(&fusion, {input});
+  auto cg_outputs = ke.run({input});
 
   testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__);
 }
@@ -114,8 +114,8 @@ TEST_F(InliningTest, InliningMismatchedDims4) {
   at::Tensor input = at::randn({2, 3, 4}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input});
-  auto cg_outputs = ke.runFusion({input});
+  ke.compile(&fusion, {input});
+  auto cg_outputs = ke.run({input});
 
   testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__);
 }
@@ -151,8 +151,8 @@ TEST_F(InliningTest, InliningBroadcast) {
   at::Tensor input = at::randn({2, 3, 4}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input});
-  auto cg_outputs = ke.runFusion({input});
+  ke.compile(&fusion, {input});
+  auto cg_outputs = ke.run({input});
 
   testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__);
 }
diff --git a/tests/cpp/test_loop_domain_scheduling.cpp b/tests/cpp/test_loop_domain_scheduling.cpp
index 04445f0a3d9..710be9ce08a 100644
--- a/tests/cpp/test_loop_domain_scheduling.cpp
+++ b/tests/cpp/test_loop_domain_scheduling.cpp
@@ -87,8 +87,8 @@ TEST_F(LoopDomainSchedulingTest, ReshapeSplitThenMerge) {
   std::vector inputs({t0});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
-  auto outputs = ke.runFusion(inputs);
+  ke.compile(&fusion, inputs);
+  auto outputs = ke.run(inputs);
 
   testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
 }
@@ -148,8 +148,8 @@ TEST_F(LoopDomainSchedulingTest, Slice) {
   std::vector aten_inputs({t0});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)});
 
@@ -307,8 +307,8 @@ TEST_F(LoopDomainSchedulingTest, ManyReshape) {
   std::vector aten_inputs({t0});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = ke.runFusion(aten_inputs);
+  ke.compile(&fusion, aten_inputs);
+  auto cg_outputs = ke.run(aten_inputs);
 
   auto ref = t0 * 2;
   EXPECT_TRUE(ref.equal(cg_outputs[0]));
diff --git a/tests/cpp/test_loop_rotation.cpp b/tests/cpp/test_loop_rotation.cpp
index a41d69f6ab5..db5f3e20848 100644
--- a/tests/cpp/test_loop_rotation.cpp
+++ b/tests/cpp/test_loop_rotation.cpp
@@ -77,8 +77,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({n, 3}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
-  auto cg_outputs = ke.runFusion({t0});
+  ke.compile(&fusion, {t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
  }
 }
@@ -170,8 +170,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({n, 3}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
-  auto cg_outputs = ke.runFusion({t0});
+  ke.compile(&fusion, {t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
  }
 }
@@ -279,8 +279,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({n, 3}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
-  auto cg_outputs = ke.runFusion({t0});
+  ke.compile(&fusion, {t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
  }
 }
@@ -390,8 +390,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({n, 3}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
-  auto cg_outputs = ke.runFusion({t0});
+  ke.compile(&fusion, {t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
  }
 }
@@ -527,8 +527,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({n, 3}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
-  auto cg_outputs = ke.runFusion({t0});
+  ke.compile(&fusion, {t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
  }
 }
@@ -663,8 +663,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({n, 3}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
-  auto cg_outputs = ke.runFusion({t0});
+  ke.compile(&fusion, {t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
  }
 }
diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp
index dfbc1381f4d..12ed11bd554 100644
--- a/tests/cpp/test_matmul.cpp
+++ b/tests/cpp/test_matmul.cpp
@@ -128,7 +128,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmul) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
       8,
       0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -136,7 +136,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmul) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -189,7 +189,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBroadcastBatch) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -197,7 +197,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBroadcastBatch) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref =
       atMatmul(
          inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout)
@@ -247,7 +247,7 @@ TEST_P(MatmulTestWithLayout, AmperePrologueFusionBroadcast) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -255,7 +255,7 @@ TEST_P(MatmulTestWithLayout, AmperePrologueFusionBroadcast) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -308,7 +308,7 @@ TEST_P(MatmulTestWithLayout, AmpereProloguePointwise) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -316,7 +316,7 @@ TEST_P(MatmulTestWithLayout, AmpereProloguePointwise) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.sin().to(at::kFloat),
       inputs.second.sin().to(at::kFloat),
@@ -369,7 +369,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBFloat16) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -377,7 +377,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBFloat16) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -432,7 +432,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulPipelineGmem) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -440,7 +440,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulPipelineGmem) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -516,7 +516,7 @@ TEST_P(MatmulTestWithLayout, AmpereSwizzle) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -524,7 +524,7 @@ TEST_P(MatmulTestWithLayout, AmpereSwizzle) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.01, 0.01));
@@ -644,7 +644,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulRegCircularBuffer) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -652,7 +652,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulRegCircularBuffer) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -935,11 +935,9 @@ TEST_F(MatmulTest, MatmulMatmulAmpere) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8,
-      0,
-      ke.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams));
 
-  auto cg_outputs = ke.runFusion({t0, t1, t2});
+  auto cg_outputs = ke.run({t0, t1, t2});
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   // relaxed check for now, err accumulation is significant.
@@ -1315,11 +1313,9 @@ TEST_F(MatmulTest, MatmulSoftmaxMatmulAmpere) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8,
-      0,
-      ke.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams));
 
-  auto cg_outputs = ke.runFusion({t0, t1, t2});
+  auto cg_outputs = ke.run({t0, t1, t2});
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   auto g1 = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat));
@@ -1369,11 +1365,11 @@ TEST_P(MatmulTestWithLayout, TuringMatmul) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      7, 5, ke.compileFusion(&fusion, {inputs.first, inputs.second}));
+      7, 5, ke.compile(&fusion, {inputs.first, inputs.second}));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -1513,11 +1509,9 @@ TEST_F(MatmulTest, AmpereMatmulTNCpAsync) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8,
-      0,
-      ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat));
@@ -1682,11 +1676,9 @@ TEST_F(MatmulTest, AmpereStridedBatchedMatmulTN) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8,
-      0,
-      ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   // ref implementation:
@@ -1855,11 +1847,9 @@ TEST_F(MatmulTest, AmpereViewMatmulTN) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8,
-      0,
-      ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   auto tref =
@@ -2041,8 +2031,8 @@ TEST_F(MatmulTest, AmpereMatmulTNSwizzled) {
   auto t1 = at::randn({N, K}, options);
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = ke.runFusion({t0, t1});
+  ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams);
+  auto cg_outputs = ke.run({t0, t1});
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat));
@@ -2095,7 +2085,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoad) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -2103,7 +2093,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoad) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -2151,7 +2141,7 @@ TEST_P(MatmulTestWithLayout, TuringMatmulLargeLoad) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      7,
      5,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -2159,7 +2149,7 @@ TEST_P(MatmulTestWithLayout, TuringMatmulLargeLoad) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -2223,13 +2213,13 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck4warp) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
          matmul_cparams));
   EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty());
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   auto tref = atMatmul(
@@ -2304,7 +2294,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck8warp) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -2312,7 +2302,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck8warp) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -2375,7 +2365,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck6warp) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -2383,7 +2373,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck6warp) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -2435,7 +2425,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoadLargeK) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -2443,7 +2433,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoadLargeK) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.001, 0.001));
@@ -2491,13 +2481,11 @@ TEST_P(MatmulTestWithLayout, AmpereSplitKLikeStridedBatchedMatmul) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8,
-      0,
-      ke.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   auto tref = splitkLikeAtMatmul(t0.to(at::kFloat), t1.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
@@ -2582,12 +2570,12 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
          matmul_cparams));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
 
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -2712,7 +2700,7 @@ TEST_F(MatmulTest, AmpereMatmulSmemEpiloguePromotionRequiredA100) {
   SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul)
       ->schedule(&fusion, &mparams);
 
-  // KernelExecutor::compileFusion would fail otherwise.
+  // KernelExecutor::compile would fail otherwise.
   SKIP_IF_INSUFFICIENT_SMEM(&mparams, data_types);
 
   at::manual_seed(0);
@@ -2722,12 +2710,12 @@ TEST_F(MatmulTest, AmpereMatmulSmemEpiloguePromotionRequiredA100) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
          matmul_cparams));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
 
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -2822,12 +2810,12 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueCast) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
          matmul_cparams));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
 
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   tref = tref.to(at::kHalf);
@@ -2918,12 +2906,12 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueRelu) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
          matmul_cparams));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto t2 = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   auto tref = at::relu(t2).to(at::kFloat);
@@ -3005,9 +2993,9 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitK_CUDA) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      7, 5, ke.compileFusion(&fusion, {inputs.first, inputs.second}));
+      7, 5, ke.compile(&fusion, {inputs.first, inputs.second}));
   EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty());
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   auto tref = atMatmul(
@@ -3069,10 +3057,9 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitKBias_CUDA) {
   std::vector inputs = {aten_a, aten_b, aten_bias};
 
   KernelExecutor ke;
-  NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      7, 5, ke.compileFusion(&fusion, inputs));
+  NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs));
   EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty());
-  auto cg_outputs = ke.runFusion(inputs);
+  auto cg_outputs = ke.run(inputs);
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
   auto tref = atBiasEpilogue(
@@ -3132,12 +3119,11 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitK) {
   std::vector inputs = {aten_a, aten_b};
 
   KernelExecutor ke;
-  NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      7, 5, ke.compileFusion(&fusion, inputs));
+  NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion(inputs);
+  auto cg_outputs = ke.run(inputs);
 
   auto tref = atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout);
 
@@ -3199,12 +3185,11 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitKBias) {
   std::vector inputs = {aten_a, aten_b, aten_bias};
 
   KernelExecutor ke;
-  NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      7, 5, ke.compileFusion(&fusion, inputs));
+  NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion(inputs);
+  auto cg_outputs = ke.run(inputs);
   auto tref = atBiasEpilogue(
       atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout),
       aten_bias);
@@ -3261,7 +3246,7 @@ TEST_F(MatmulTest, ReproIssue1808) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
+      ke.compile(
          &fusion,
          {inputs.first, inputs.second},
          LaunchParams(),
@@ -3269,7 +3254,7 @@ TEST_F(MatmulTest, ReproIssue1808) {
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -3417,12 +3402,11 @@ TEST_P(MatmulTestWithLayout, MisalignedVectorization) {
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
      8,
      0,
-      ke.compileFusion(
-          fusion.get(), inputs, LaunchParams(), matmul_cparams));
+      ke.compile(fusion.get(), inputs, LaunchParams(), matmul_cparams));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto outputs = ke.runFusion(inputs);
+  auto outputs = ke.run(inputs);
 
   EXPECT_TRUE(outputs[0].allclose(tref, 0.001, 0.001));
 }
@@ -3475,11 +3459,11 @@ TEST_F(MatmulTest, MultipleConsecutiveDims) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8, 0, ke.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion(inputs);
+  auto cg_outputs = ke.run(inputs);
   auto tref = at::reshape(
       at::linear(
          at::reshape(A.to(at::kFloat), {M1 * M2, K}),
@@ -3541,11 +3525,11 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveMDims) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8, 0, ke.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion(inputs);
+  auto cg_outputs = ke.run(inputs);
   auto Apermuted = A.permute({{1, 2}}).reshape({M1 * M2, K});
   auto tref = at::linear(Apermuted.to(at::kFloat), B.to(at::kFloat))
                   .reshape({M1, M2, N})
@@ -3607,11 +3591,11 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveNDims) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8, 0, ke.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion(inputs);
+  auto cg_outputs = ke.run(inputs);
   auto Bpermuted = B.permute({{1, 2}}).reshape({N1 * N2, K});
   auto tref = at::linear(A.to(at::kFloat), Bpermuted.to(at::kFloat))
                   .reshape({M, N1, N2});
@@ -3665,11 +3649,11 @@ TEST_F(MatmulTest, MultipleMDimsBatch) {
   KernelExecutor ke;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
-      8, 0, ke.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams));
+      8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams));
   ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
   ASSERT_FALSE(
       PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
-  auto cg_outputs = ke.runFusion(inputs);
+  auto cg_outputs = ke.run(inputs);
   auto tref =
       at::matmul(A.to(at::kFloat), at::permute(B.to(at::kFloat), {0, 2, 1}));
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -3799,9 +3783,9 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) {
       matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype));
 
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout);
   EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5));
 }
diff --git a/tests/cpp/test_matmul_sass.cpp b/tests/cpp/test_matmul_sass.cpp
index 50defa61eee..0d4385eb46b 100644
--- a/tests/cpp/test_matmul_sass.cpp
+++ b/tests/cpp/test_matmul_sass.cpp
@@ -99,9 +99,9 @@ sass::Container getSASSFor(
   SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul)
       ->schedule(&fusion, &mparams);
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
 
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -162,12 +162,12 @@ sass::Container getBinaryOpMulEpilogueSASSFor(
   const double alpha = 2.5;
 
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       &fusion,
       {inputs.first, inputs.second, alpha},
       LaunchParams(),
       matmul_cparams);
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second, alpha});
+  auto cg_outputs = ke.run({inputs.first, inputs.second, alpha});
   auto tref = at::mul(
       atMatmul(
          inputs.first.to(at::kFloat),
diff --git a/tests/cpp/test_matmul_scheduler.cpp b/tests/cpp/test_matmul_scheduler.cpp
index d4d18a27796..f2109c7f1e1 100644
--- a/tests/cpp/test_matmul_scheduler.cpp
+++ b/tests/cpp/test_matmul_scheduler.cpp
@@ -2812,9 +2812,9 @@ TEST_P(AllocationDomainTest, BasicMatmul) {
   auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner);
 
   KernelExecutor ke;
-  ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
+  ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat));
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
@@ -2845,9 +2845,9 @@ TEST_P(AllocationDomainTest, BasicMatmulNoTranspose) {
   auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner);
 
   KernelExecutor ke;
-  ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
+  ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat));
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
@@ -2881,9 +2881,9 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSet) {
   auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner);
 
   KernelExecutor ke;
-  ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
+  ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat));
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
@@ -2919,9 +2919,9 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSin) {
   auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner);
 
   KernelExecutor ke;
-  ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
+  ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat));
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
@@ -2956,9 +2956,9 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinNoTranspose) {
   auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner);
 
   KernelExecutor ke;
-  ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
+  ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat));
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
@@ -2993,9 +2993,9 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinSetNoTranspose) {
   auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner);
 
   KernelExecutor ke;
-  ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
+  ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat));
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
@@ -3030,9 +3030,9 @@ TEST_P(AllocationDomainTest, MatmulWithPrologueSetCastSinTranspose) {
   auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner);
 
   KernelExecutor ke;
-  ke.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
+  ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({t0, t1});
+  auto cg_outputs = ke.run({t0, t1});
   auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat));
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 }
@@ -3141,12 +3141,12 @@ TEST_F(MatmulSchedulerTest, HSH_TT) {
   //! of ampere scheduler.
   /*
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       fusion.get(),
       {inputs.first, inputs.second},
       LaunchParams(),
       matmul_cparams);
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout);
   EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5));
   */
@@ -3212,13 +3212,13 @@ TEST_F(MatmulSchedulerTest, HSH_TN) {
       matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype));
 
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       fusion.get(),
       {inputs.first, inputs.second},
       LaunchParams(),
       matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout);
   EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5));
 }
@@ -3287,13 +3287,13 @@ TEST_F(MatmulSchedulerTest, HSH_NT) {
       matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype));
 
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       fusion.get(),
       {inputs.first, inputs.second},
       LaunchParams(),
       matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout);
   EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5));
 }
@@ -3364,12 +3364,12 @@ TEST_F(MatmulSchedulerTest, HSH_NN) {
   // of ampere scheduler.
   /*
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       fusion.get(),
       {inputs.first, inputs.second},
       LaunchParams(),
       matmul_cparams);
-  auto cg_outputs = ke.runFusion({inputs.first, inputs.second});
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout);
   EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5));
   */
diff --git a/tests/cpp/test_mbarrier.cpp b/tests/cpp/test_mbarrier.cpp
index bd1b425c70f..f7f9611d895 100644
--- a/tests/cpp/test_mbarrier.cpp
+++ b/tests/cpp/test_mbarrier.cpp
@@ -122,7 +122,7 @@ TEST_F(MBarrierTest, Simple) {
         top_level_exprs.push_back(invalidate);
       });
 
-  ke.compileFusion(&fusion);
+  ke.compile(&fusion);
 
   // Make sure that the post-lowering hook successfully inserted all mbarrier
   // operations
@@ -138,7 +138,7 @@ TEST_F(MBarrierTest, Simple) {
   auto input = at::randn(
       {32, 32}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
-  auto outputs = ke.runFusion({input});
+  auto outputs = ke.run({input});
 
   testValidate(&fusion, outputs, {input}, __LINE__, __FILE__);
 }
diff --git a/tests/cpp/test_memory.cpp b/tests/cpp/test_memory.cpp
index a472b7f18b1..085a95640bc 100644
--- a/tests/cpp/test_memory.cpp
+++ b/tests/cpp/test_memory.cpp
@@ -82,7 +82,7 @@ TEST_P(MemoryTest, LoadCache) {
   {
     DebugDumpOptionsGuard debug_dump_options_guard;
     DebugDumpOptionsGuard::getCurOptions().set(DebugDumpOption::Ptx);
-    ke.compileFusion(&fusion, {input});
+    ke.compile(&fusion, {input});
   }
 
   // Verify PTX.
@@ -98,7 +98,7 @@ TEST_P(MemoryTest, LoadCache) {
   std::filesystem::remove(compiled_kernel.ptx_filename);
 
   // Verify output tensors.
-  std::vector actual_ts = ke.runFusion({input});
+  std::vector actual_ts = ke.run({input});
   testValidate(
       &fusion, actual_ts, {input}, {expected_output}, __LINE__, __FILE__);
 }
@@ -157,7 +157,7 @@ TEST_F(MemoryTest, RefineCachePolicy) {
   {
     DebugDumpOptionsGuard debug_dump_options_guard;
     DebugDumpOptionsGuard::getCurOptions().set(DebugDumpOption::Ptx);
-    ke.compileFusion(&fusion, {a, b});
+    ke.compile(&fusion, {a, b});
   }
 
   // Verify PTX.
@@ -170,7 +170,7 @@ TEST_F(MemoryTest, RefineCachePolicy) {
   debug() << "Removing " << compiled_kernel.ptx_filename << std::endl;
   std::filesystem::remove(compiled_kernel.ptx_filename);
 
-  std::vector actual_outputs = ke.runFusion({a, b});
+  std::vector actual_outputs = ke.run({a, b});
   testValidate(&fusion, actual_outputs, {a, b}, {c}, __LINE__, __FILE__);
 }
@@ -458,7 +458,7 @@ TEST_P(TMASimpleLdstTest, Load) {
       at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn(shape, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), dim);
   TMAPredicateChecker::checkPredicate(ke.kernel(), 1);
@@ -466,7 +466,7 @@ TEST_P(TMASimpleLdstTest, Load) {
       XorFinder::findXor(ke.kernel()), (swizzle != MmaInputSmemSwizzle::None));
   TMADimChecker::getDim(ke.kernel());
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -534,9 +534,9 @@ TEST_P(TMALoadTestWithABroadcastDim, LoadWithBroadcast) {
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn(shape, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -578,14 +578,14 @@ TEST_P(TMASimpleLdstTest, Store) {
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn(shape, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), dim);
   TMAPredicateChecker::checkPredicate(ke.kernel(), 1);
   ASSERT_EQ(
       XorFinder::findXor(ke.kernel()), (swizzle != MmaInputSmemSwizzle::None));
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -640,12 +640,12 @@ TEST_F(TMAIndexingTest, Load2DTensorWith1DTMA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({1024, 1024}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1);
   TMAPredicateChecker::checkPredicate(ke.kernel(), 1);
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -677,12 +677,12 @@ TEST_F(TMAIndexingTest, Load1DTensorWith2DTMA) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({1024 * 1024}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
   TMAPredicateChecker::checkPredicate(ke.kernel(), 1);
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -714,12 +714,12 @@ TEST_F(TMAIndexingTest, NonOneElementStride) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({1024, 1024}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2);
   TMAPredicateChecker::checkPredicate(ke.kernel(), 0);
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -785,12 +785,12 @@ TEST_F(TMAIndexingTest, Advanced) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({4, 32, 2, 8, 8, 8, 32, 8}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 4);
   TMAPredicateChecker::checkPredicate(ke.kernel(), 1);
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -834,12 +834,12 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing1) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({4, 32, 2, 8, 8, 8, 32, 8}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 4);
   EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, ke.kernel()));
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -887,12 +887,12 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing2) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn({32, 4, 2, 8, 8, 8, 2, 8, 4}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 5);
   EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, ke.kernel()));
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -948,12 +948,12 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation1) {
   auto t0 = at::randn(
      {prime_number, prime_number, multiple_of_16B_but_not_more}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3);
   TMAPredicateChecker::checkPredicate(ke.kernel(), 1);
 
-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
@@ -995,7 +995,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) {
   int64_t multiple_of_8_but_not_more = 8 * 997;
   auto t0 = at::randn({multiple_of_8_but_not_more}, options);
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0}, {}, matmul_cparams);
+  ke.compile(&fusion, {t0}, {}, matmul_cparams);
 
   // We will be using 2D TMA instead of 1D, because strided box can not be
   // merged with other bulk axes by rotation. So, this schedule will be
So, this schedule will be @@ -1005,7 +1005,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) { EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); // The tensor shape is not a multiple of 8, so the view should fail. @@ -1016,7 +1016,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) { .device(at::kCUDA, 0); int64_t prime_number = 997; auto t0 = at::randn({prime_number}, options); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("must be divisible by 8"))); @@ -1057,7 +1057,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { int64_t multiple_of_23 = 23 * 997; auto t0 = at::randn({multiple_of_23, 8}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); // We will be using 3D TMA instead of 2D, because split(23, 8) is indivisible, // we can not consider this schedule as a 2D TMA whose first dimension has box @@ -1068,7 +1068,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); // The tensor shape is not a multiple of 23, so the view should fail. @@ -1079,7 +1079,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { .device(at::kCUDA, 0); int64_t prime_number = 997; auto t0 = at::randn({prime_number, 8}, options); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("must be divisible by 23"))); @@ -1119,13 +1119,13 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain1) { .transpose(0, 1) .view({128, 1024, 128}); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); ASSERT_TRUE(XorFinder::findXor(ke.kernel())); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1174,12 +1174,12 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain2) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({2, 3, 5, 7, 11, 32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1222,12 +1222,12 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationLoad) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({100000}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); 
} @@ -1265,12 +1265,12 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationStore) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({100000}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1301,12 +1301,12 @@ TEST_F(TMAMiscTest, DisableIndexHoisting) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1333,12 +1333,12 @@ TEST_F(TMAMiscTest, Repro1977) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({1024}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1424,8 +1424,8 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { 1); KernelExecutor ke; - ke.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } @@ -1476,8 +1476,8 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { // remove the RAW sync by adding a cleanup pass. 
KernelExecutor ke; - ke.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } @@ -1543,8 +1543,8 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { 2); KernelExecutor ke; - ke.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = ke.runFusion({input}); + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } } @@ -1587,11 +1587,11 @@ TEST_F(TMAMiscTest, LoadStrongCorrectness) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::arange(1, 33, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto expect = at::zeros({2, 1, 2, 16}, options); expect.flatten(0, 2).select(0, 0) = at::arange(1, 17, options); @@ -1633,7 +1633,7 @@ TEST_F(TMACompileTimeInvalidTest, BulkNotInTMA) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "ParallelType::Bulk is only supported for cp.async.bulk."))); @@ -1662,7 +1662,7 @@ TEST_F(TMACompileTimeInvalidTest, BulkBroadcast) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "ParallelType::Bulk is only supported for IterType::Iteration."))); @@ -1690,7 +1690,7 @@ TEST_F(TMACompileTimeInvalidTest, InvalidParallelType) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Invalid parallel type for cp.async.bulk: V"))); @@ -1728,12 +1728,12 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) { auto t0_aligned = at::randn({128 + items_of_16_bytes}, options) .narrow(0, items_of_16_bytes, 128); KernelExecutor ke; - ke.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); + ke.compile(&fusion, {t0_aligned}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = ke.runFusion({t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); testValidate( &fusion, cg_outputs, {t0_aligned}, {t0_aligned}, __LINE__, __FILE__); @@ -1741,7 +1741,7 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) { [&]() { auto t0_misaligned = at::randn({128 + items_of_16_bytes / 2}, options) .narrow(0, items_of_16_bytes / 2, 128); - ke.runFusion({t0_misaligned}); + ke.run({t0_misaligned}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "globalAddress, which specifies the starting address of the memory region described, " @@ -1783,12 +1783,12 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalStride) { auto t0_aligned = at::randn({128, 128 + items_of_16_bytes}, options).narrow(1, 0, 128); 
KernelExecutor ke; - ke.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); + ke.compile(&fusion, {t0_aligned}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = ke.runFusion({t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); testValidate( &fusion, cg_outputs, {t0_aligned}, {t0_aligned}, __LINE__, __FILE__); @@ -1797,7 +1797,7 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalStride) { auto t0_misaligned = at::randn({128, 128 + items_of_16_bytes / 2}, options) .narrow(1, 0, 128); - ke.runFusion({t0_misaligned}); + ke.run({t0_misaligned}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "globalStrides array, which specifies tensor stride of each of the lower tensorRank - 1 dimensions in bytes, " @@ -1837,7 +1837,7 @@ TEST_F(TMACompileTimeInvalidTest, SizeOfTransfer) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The expected bytes must be a multiple of 16 bytes, but 8 is not."))); @@ -1877,17 +1877,17 @@ TEST_F(TMARuntimeInvalidTest, SizeOfTransfer) { auto t0 = at::randn({128}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams); + ke.compile(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = ke.runFusion({t0, items_of_16_bytes}); + auto cg_outputs = ke.run({t0, items_of_16_bytes}); testValidate( &fusion, cg_outputs, {t0, items_of_16_bytes}, {t0}, __LINE__, __FILE__); EXPECT_THAT( - [&]() { ke.runFusion({t0, items_of_16_bytes / 2}); }, + [&]() { ke.run({t0, items_of_16_bytes / 2}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The expected bytes must be a multiple of 16 bytes, but "))); } @@ -1930,18 +1930,18 @@ TEST_F(TMARuntimeInvalidTest, InvalidView) { // (10240,) can be viewed as (10, 1024) auto t0_valid = at::randn({10240}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0_valid}, {}, matmul_cparams); + ke.compile(&fusion, {t0_valid}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); - auto cg_outputs = ke.runFusion({t0_valid}); + auto cg_outputs = ke.run({t0_valid}); testValidate(&fusion, cg_outputs, {t0_valid}, {t0_valid}, __LINE__, __FILE__); EXPECT_THAT( [&]() { // it is impossible to view (10249,) as (?, 1024) auto t0_inval = at::randn({10249}, options); - ke.runFusion({t0_inval}); + ke.run({t0_inval}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Invalid view in TMA: the extent of"))); @@ -1976,7 +1976,7 @@ TEST_F(TMACompileTimeInvalidTest, InnermostDiscontiguous) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The innermost dimension of the TMA domain must be contiguous"))); @@ -2017,7 +2017,7 @@ TEST_F(TMACompileTimeInvalidTest, MergeDiscontiguous) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Can not merge discontiguous dimensions, but"))); @@ -2053,7 +2053,7 @@ TEST_F(TMACompileTimeInvalidTest, InnermostElementStrideNotOne) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, 
matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "When interleave is CU_TENSOR_MAP_INTERLEAVE_NONE " @@ -2092,7 +2092,7 @@ TEST_F(TMACompileTimeInvalidTest, SwizzleBulkWithNonBulk) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "TMA domain must be a view of the allocation domain of the gmem tensor"))); @@ -2136,7 +2136,7 @@ TEST_F(TMADocTest, Figure13a) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2174,12 +2174,12 @@ TEST_F(TMADocTest, Figure14a) { auto t0 = at::randn({16, 200}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2215,7 +2215,7 @@ TEST_F(TMADocTest, Figure13b) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2250,12 +2250,12 @@ TEST_F(TMADocTest, Figure14b) { auto t0 = at::randn({16, 10}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2292,7 +2292,7 @@ TEST_F(TMADocTest, Figure13c) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2328,12 +2328,12 @@ TEST_F(TMADocTest, Figure14c) { auto t0 = at::randn({16, 200}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2367,7 +2367,7 @@ TEST_F(TMADocTest, Figure13d) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2399,12 +2399,12 @@ TEST_F(TMADocTest, Figure14d) { auto t0 = at::randn({16, 12}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2442,7 +2442,7 @@ TEST_F(TMADocTest, Figure13e) { EXPECT_THAT( [&]() { 
KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2479,12 +2479,12 @@ TEST_F(TMADocTest, Figure14e) { auto t0 = at::randn({16, 10}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2524,12 +2524,12 @@ TEST_F(TMADocTest, Figure15a) { auto t0 = at::randn({16, 10}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2566,12 +2566,12 @@ TEST_F(TMADocTest, Figure15b) { auto t0 = at::randn({16, 12}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2614,7 +2614,7 @@ TEST_F(TMADocTest, Figure15c) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2661,7 +2661,7 @@ TEST_F(TMADocTest, Figure15d) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2702,7 +2702,7 @@ TEST_F(TMADocTest, Figure15e) { EXPECT_THAT( [&]() { KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2756,8 +2756,8 @@ TEST_P(LdMatrixTest, Regular) { auto t0 = at::randn({size1, getK(macro)}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -2882,8 +2882,8 @@ TEST_P(StMatrixSingleTileTest, Regular) { auto t0 = at::randn({sizeM, sizeN}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -2943,8 +2943,8 @@ TEST_P(StMatrixTest, Regular) { auto t0 = at::randn({sizeM, sizeN}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, 
{t0}, __LINE__, __FILE__); } @@ -3018,8 +3018,8 @@ TEST_P(LdMatrixTest, Transpose) { auto t0 = at::randn({getK(macro), size2}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_mma.cpp b/tests/cpp/test_mma.cpp index 95a3bd2e772..5493b4a19d5 100644 --- a/tests/cpp/test_mma.cpp +++ b/tests/cpp/test_mma.cpp @@ -173,9 +173,9 @@ std::vector scheduleCompileAndRun( } KernelExecutor ke; - ke.compileFusion( + ke.compile( fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - return ke.runFusion({inputs.first, inputs.second}); + return ke.run({inputs.first, inputs.second}); } TEST_P(MmaTest, SingleTile) { @@ -389,10 +389,10 @@ TEST_P(HopperRS, SingleTile) { getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -485,10 +485,10 @@ TEST_P(HopperRS, SingleTileWithTMALoadStore) { getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -651,9 +651,9 @@ TEST_P(HopperSS, SingleTile) { getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -780,9 +780,9 @@ TEST_P(HopperSS, SingleTileTransposed) { getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -959,9 +959,9 @@ TEST_P(HopperSS, MultipleTile) { data_type_to_aten(dtype)); KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), diff --git a/tests/cpp/test_persistent_buffer.cpp b/tests/cpp/test_persistent_buffer.cpp index edfdd0fd082..3463395b11b 100644 --- a/tests/cpp/test_persistent_buffer.cpp +++ b/tests/cpp/test_persistent_buffer.cpp @@ -1299,7 +1299,7 @@ TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { // 
Run the fusion and validate the results KernelExecutor ke; - ke.compileFusion(fusion.get(), aten_inputs); + ke.compile(fusion.get(), aten_inputs); // Shared memory access should be vectorized. // getBankConflictInfo(ke.kernel()) triggers error "std::get: wrong index for // variant" when trying to evaluate index with: @@ -1314,8 +1314,8 @@ TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { } } } - auto cg_outputs = ke.runFusion( - aten_inputs, heuristic_params->as()->lparams); + auto cg_outputs = + ke.run(aten_inputs, heuristic_params->as()->lparams); auto t1 = t0 / t0.sum({1, 2, 3}, true); testValidate(fusion.get(), cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_pointwise.cpp b/tests/cpp/test_pointwise.cpp index a854ac75900..c4684adbb6e 100644 --- a/tests/cpp/test_pointwise.cpp +++ b/tests/cpp/test_pointwise.cpp @@ -550,8 +550,8 @@ TEST_F(PointwiseTest, ShardedPointwise) { pwise_scheduler->schedule(&sharded_fusion, sharded_params.get()); KernelExecutor ke; - ke.compileFusion(&sharded_fusion, sharded_inputs, sharded_params->lparams); - auto cg_outputs = ke.runFusion(sharded_inputs, sharded_params->lparams); + ke.compile(&sharded_fusion, sharded_inputs, sharded_params->lparams); + auto cg_outputs = ke.run(sharded_inputs, sharded_params->lparams); testValidate( &sharded_fusion, cg_outputs, sharded_inputs, __LINE__, __FILE__); } @@ -707,8 +707,8 @@ TEST_P(PointwiseParamsTest, UnrollOnTopOfVectorize) { // Schedule, compile, run, validate scheduler_instance->schedule(fusion.get(), pparams); KernelExecutor ke; - ke.compileFusion(fusion.get(), runtime_inputs, pparams->lparams); - auto cg_outputs = ke.runFusion(runtime_inputs, pparams->lparams); + ke.compile(fusion.get(), runtime_inputs, pparams->lparams); + auto cg_outputs = ke.run(runtime_inputs, pparams->lparams); const auto& lparams = ke.lastLaunchParams(); ASSERT_EQ(lparams.gdimy(), dim0 / unroll_outer); ASSERT_EQ( diff --git a/tests/cpp/test_predicate_elimination.cpp b/tests/cpp/test_predicate_elimination.cpp index 2e6f3a409a1..bfb12a12b8a 100644 --- a/tests/cpp/test_predicate_elimination.cpp +++ b/tests/cpp/test_predicate_elimination.cpp @@ -78,8 +78,8 @@ TEST_F(PredicateEliminationTest, 2) { auto t0 = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) + 1; @@ -128,8 +128,8 @@ TEST_F(PredicateEliminationTest, 3) { auto t0 = at::randn({size}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0) + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); @@ -181,8 +181,8 @@ TEST_F(PredicateEliminationTest, 4) { auto t0 = at::randn({s0, s1}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto t1 = t0.sum({1}); auto t3 = t1.sum({0}) + 1; @@ -229,8 +229,8 @@ TEST_F(PredicateEliminationTest, 5) { auto t0 = at::randn({s0}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.mean({0}); @@ -278,8 +278,8 @@ TEST_F(PredicateEliminationTest, 6) { auto t0 = at::randn({2, 3}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = 
ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -314,8 +314,8 @@ TEST_F(PredicateEliminationTest, 7) { auto t0 = at::randn({123}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -432,8 +432,8 @@ TEST_F(PredicateEliminationTest, 9) { EXPECT_TRUE(PredicatedChecker::isPredicated(tv1, gpulw)); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -477,9 +477,9 @@ TEST_F(PredicateEliminationTest, ExtentEqualToMaxParallelTypeExtent) { {"validate_smem_predicate_elimination", validate_smem_predicate_elimination}); }); - ke.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index 3a627f8a292..ed02e67ee5f 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -64,8 +64,8 @@ TEST_P(ResizeTest, Pad1) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -100,8 +100,8 @@ TEST_P(ResizeTest, Pad2) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -153,8 +153,8 @@ TEST_P(ResizeTest, Pad3) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -187,8 +187,8 @@ TEST_P(ResizeTest, Pad4) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -242,8 +242,8 @@ TEST_P(ResizeTest, Pad5) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -293,8 +293,8 @@ TEST_P(ResizeTest, Pad6) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -344,8 +344,8 @@ TEST_P(ResizeTest, Pad7) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -389,8 +389,8 @@ TEST_F(ResizeTest, Pad8) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = 
ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {0, 1}) + at::pad(t0, {1, 0}); @@ -614,8 +614,8 @@ TEST_F(ResizeTest, Cat1) { std::vector aten_inputs({t0, t1}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -646,8 +646,8 @@ TEST_F(ResizeTest, Cat2) { std::vector aten_inputs({t0, t1}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -687,8 +687,8 @@ TEST_F(ResizeTest, Cat3) { std::vector aten_inputs({t0, t1}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 1); @@ -731,8 +731,8 @@ TEST_F(ResizeTest, Cat4) { std::vector aten_inputs({t0, t1}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 1); @@ -780,8 +780,8 @@ TEST_F(ResizeTest, Cat5) { std::vector aten_inputs({t0, t1, t2}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -824,8 +824,8 @@ TEST_F(ResizeTest, Cat6) { std::vector aten_inputs({t0, t1, t2}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1, t2}, 0); @@ -880,8 +880,8 @@ TEST_F(ResizeTest, Cat7) { {aten_inputs.begin(), aten_inputs.end()}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs_ivalue); - auto cg_outputs = ke.runFusion(aten_inputs_ivalue); + ke.compile(&fusion, aten_inputs_ivalue); + auto cg_outputs = ke.run(aten_inputs_ivalue); auto ref = at::cat(aten_inputs, concat_dim); @@ -1014,8 +1014,8 @@ TEST_F(ResizeTest, Slice1) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -1045,8 +1045,8 @@ TEST_F(ResizeTest, Slice2) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1143,8 +1143,8 @@ TEST_F(ResizeTest, Slice4) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = (t0 + 1).to(at::kDouble).sum({1}); @@ -1198,8 +1198,8 @@ TEST_F(ResizeTest, Slice5) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, 
aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t1 = t0.index( {at::indexing::Slice(0, at::indexing::None), @@ -1250,8 +1250,8 @@ TEST_F(ResizeTest, SliceConstantShmoo) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1295,12 +1295,12 @@ TEST_F(ResizeTest, SliceInputShmoo) { } KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); auto t0 = at::randn(shape, options); for (auto [start, stop] : slice_cases) { std::vector aten_inputs({t0, start, stop}); - auto cg_outputs = ke.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1757,8 +1757,8 @@ TEST_P(ResizeTest, PadWithValue) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}, "constant", 2); @@ -1832,8 +1832,8 @@ TEST_P(ResizeTest, PadHalfWithDoubleValue) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}, "constant", 2.5); @@ -2230,14 +2230,14 @@ TEST_F(ResizeTest, FusionSizeZeroSliceSplit) { tv1->split(0, 4); // sizes (0, 4) KernelExecutor ke; - ke.compileFusion(fusion.get()); + ke.compile(fusion.get()); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - auto cg_outputs = ke.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref0 = t0.index({at::indexing::Slice(2, 2), at::indexing::Slice(0, 5)}); @@ -2682,8 +2682,8 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2735,8 +2735,8 @@ TEST_F(ResizeTest, Slice1DVectorizeManual2) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref_t1 = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2786,8 +2786,8 @@ TEST_F(ResizeTest, Slice1DVectorizeManual3) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2825,8 +2825,8 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) { auto t0_aligned = t0_unaligned.index({at::indexing::Slice(3, -1)}); KernelExecutor ke; - ke.compileFusion(&fusion, {t0_aligned}); - auto cg_outputs = ke.runFusion({t0_aligned}); + ke.compile(&fusion, {t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); auto ref_aligned = t0_aligned.index({at::indexing::Slice(1, -3)}); @@ -2869,8 +2869,8 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) { std::vector 
aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index( {at::indexing::Slice(slice_offset, shape[0] - slice_offset), @@ -2919,10 +2919,10 @@ TEST_F(ResizeTest, Slice3DVectorizeManual1) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); EXPECT_THAT( - [&]() { ke.runFusion(aten_inputs); }, + [&]() { ke.run(aten_inputs); }, ThrowsMessage( HasSubstr("with word size 2 not possible due to invalid stride"))); } @@ -2962,10 +2962,10 @@ TEST_F(ResizeTest, Slice3DVectorizeManual2) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); EXPECT_THAT( - [&]() { ke.runFusion(aten_inputs); }, + [&]() { ke.run(aten_inputs); }, ThrowsMessage( HasSubstr("with word size 4 not possible due to invalid stride"))); } @@ -3043,8 +3043,8 @@ TEST_F(ResizeTest, SliceAndReshapeRepro540Manual) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); for (const auto i : c10::irange(3)) { auto slice_out_ref = t0.index( @@ -3181,8 +3181,8 @@ TEST_F(ResizeTest, CatOfBroadcast) { std::vector aten_inputs({t0, t1}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -3218,8 +3218,8 @@ TEST_F(ResizeTest, CatOfExpandedBroadcast) { std::vector aten_inputs({t0, t1}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({at::expand_copy(t0, shape0e), t1}, 0); @@ -3304,8 +3304,8 @@ TEST_P(ResizeTest, PadOfBroadcast) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3340,8 +3340,8 @@ TEST_P(ResizeTest, PadOfExpandedBroadcast) { } KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3728,8 +3728,8 @@ TEST_F(ResizeTest, SliceScheduledLikeProducer) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -3776,8 +3776,8 @@ TEST_F(ResizeTest, PadScheduledLikeConsumer) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0 + 1, {1, 1}) + 1; @@ -3828,8 +3828,8 @@ TEST_F(ResizeTest, SliceThenPadLeftHalf) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = 
ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad( t0.index({at::indexing::Slice(0, shape[0] / 2)}), {0, shape[0] / 2}); @@ -3883,8 +3883,8 @@ TEST_F(ResizeTest, SliceThenPadRightHalf) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad( t0.index({at::indexing::Slice(shape[0] / 2, shape[0])}), @@ -3947,8 +3947,8 @@ TEST_F(ResizeTest, SliceThenConcat) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); EXPECT_TRUE(t0.equal(cg_outputs[0])); } @@ -4041,8 +4041,8 @@ TEST_F(ResizeTest, SliceSliceConcatConcat) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::concat( {at::slice(t0, 0, 0, rope_size / 2) + 1, @@ -4079,8 +4079,8 @@ TEST_F(ResizeTest, VectorizePadLowering) { std::vector aten_inputs({t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {4, 4}); ASSERT_TRUE(ref.equal(cg_outputs[0])); @@ -4115,8 +4115,8 @@ TEST_F(ResizeTest, VectorizeWhereLowering) { std::vector aten_inputs({at::Scalar(false), t0}); KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); - auto cg_outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); // Note: we cannot use at::where, because aten only supports tensor as // predicate.
diff --git a/tests/cpp/test_rng.cpp b/tests/cpp/test_rng.cpp index 725da0c43de..c8f7c545ae6 100644 --- a/tests/cpp/test_rng.cpp +++ b/tests/cpp/test_rng.cpp @@ -122,10 +122,10 @@ TEST_F(RNGTest, ManualScheduleValidateWithCURand) { at::Tensor t0 = at::zeros({size}, options); KernelExecutor ke; - ke.compileFusion(fusion, {t0}); + ke.compile(fusion, {t0}); at::manual_seed(0); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -160,10 +160,10 @@ TEST_F(RNGTest, ManualScheduleValidateWithCURand2) { fusion->addOutput(tv0); KernelExecutor ke; - ke.compileFusion(fusion, {10, 10, 10, 10}); + ke.compile(fusion, {10, 10, 10, 10}); at::manual_seed(0); - auto cg_outputs = ke.runFusion({10, 10, 10, 10}); + auto cg_outputs = ke.run({10, 10, 10, 10}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -294,8 +294,8 @@ TEST_F(RNGTest, BroadcastingRNGSmemNonSquareTile) { ->schedule(fusion, &tparams); KernelExecutor ke; - ke.compileFusion(fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; NVF_CHECK((out.select(1, 0) == out.select(1, 1)).all().item()); diff --git a/tests/cpp/test_scalar_hoisting.cpp b/tests/cpp/test_scalar_hoisting.cpp index 54aa0ae406a..d0295aa20f3 100644 --- a/tests/cpp/test_scalar_hoisting.cpp +++ b/tests/cpp/test_scalar_hoisting.cpp @@ -214,8 +214,8 @@ TEST_F(ScalarHoistTest, IndexHoist1) { auto t0 = at::randn({15, 17}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -258,8 +258,8 @@ TEST_F(ScalarHoistTest, IndexHoist2) { auto t1 = at::randn({16}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = ke.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -291,8 +291,8 @@ TEST_F(ScalarHoistTest, IndexHoist3) { at::Tensor t0 = at::arange(10000, options).view({100, 100}); KernelExecutor ke; - ke.compileFusion(fusion.get(), {t0}); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T2) { @@ -370,8 +370,8 @@ TEST_F(ScalarHoistTest, ARange) { int64_t start = 0, end = 100, step = 1; KernelExecutor ke; - ke.compileFusion(fusion.get(), {start, end, step}); - auto cg_outputs = ke.runFusion({start, end, step}); + ke.compile(fusion.get(), {start, end, step}); + auto cg_outputs = ke.run({start, end, step}); const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(int64_t i0, int64_t i1, int64_t i2, Tensor T0, Tensor T1) { diff --git a/tests/cpp/test_scatter_gather.cpp b/tests/cpp/test_scatter_gather.cpp index 27fd3857ce9..442ea74d99f 100644 --- a/tests/cpp/test_scatter_gather.cpp +++ b/tests/cpp/test_scatter_gather.cpp @@ -587,9 +587,9 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorPointwise1) { std::vector aten_inputs = {t0, t1}; KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); - auto outputs = ke.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1295,8 +1295,8 @@ TEST_F(ScatterGatherTest, 
GatherIterGoupedReduction) { KernelExecutor ke; auto lparams = rparams->lparams; - ke.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = ke.runFusion(aten_inputs, lparams); + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); auto t_gather = at::gather(input, dim, input_idx); testValidate( diff --git a/tests/cpp/test_serial_gridreduce.cpp b/tests/cpp/test_serial_gridreduce.cpp index b62f8a02e3b..13ee5d77df9 100644 --- a/tests/cpp/test_serial_gridreduce.cpp +++ b/tests/cpp/test_serial_gridreduce.cpp @@ -120,11 +120,11 @@ TEST_F(SerialGridReductionTest, Scheduling) { if (serial) { tv3->definition()->as()->requestSerialGridReduction(); } - ke.compileFusion(fusion); + ke.compile(fusion); auto input = at::randn( {H, W}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); - auto outputs = ke.runFusion({input}); + auto outputs = ke.run({input}); if (serial) { // Verify that zeroed semaphore memory was reused instead of diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp index 5ada8115a8b..6738d99857c 100644 --- a/tests/cpp/test_sharding.cpp +++ b/tests/cpp/test_sharding.cpp @@ -156,8 +156,8 @@ TEST_P(ShardingTest, ComputeIndex) { auto a_tensor = at::randn({4, 2, 1, 5}, options); KernelExecutor ke; - ke.compileFusion(fusion.get(), {a_tensor}); - auto outputs = ke.runFusion({a_tensor}); + ke.compile(fusion.get(), {a_tensor}); + auto outputs = ke.run({a_tensor}); testValidate(fusion.get(), outputs, {a_tensor}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_smem_reuse.cpp b/tests/cpp/test_smem_reuse.cpp index 03f31a79afa..5a258ab6917 100644 --- a/tests/cpp/test_smem_reuse.cpp +++ b/tests/cpp/test_smem_reuse.cpp @@ -557,8 +557,8 @@ TEST_F(SmemReuseTest, SmemReuseWithDifferentVectorizationFactor) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n_element}, options); KernelExecutor ke; - ke.compileFusion(fusion.get()); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -617,8 +617,8 @@ TEST_F(SmemReuseTest, RegisterReuseWithDifferentVectorizationFactor) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n_element}, options); KernelExecutor ke; - ke.compileFusion(fusion.get()); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); }; @@ -678,8 +678,8 @@ TEST_F(SmemReuseTest, ExpandInterferes) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({y}, options); KernelExecutor ke; - ke.compileFusion(fusion.get()); - auto cg_outputs = ke.runFusion({t0}); + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); }; diff --git a/tests/cpp/test_swizzle.cpp b/tests/cpp/test_swizzle.cpp index 9499276732a..d4e910a2522 100644 --- a/tests/cpp/test_swizzle.cpp +++ b/tests/cpp/test_swizzle.cpp @@ -55,11 +55,11 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle0) { NVF_CHECK(str.find("where") != std::string::npos); KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, 
cg_outputs, {t0}, __LINE__, __FILE__); } @@ -94,11 +94,11 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle1) { tv1->computeAt(tv2, -1); KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -151,11 +151,11 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle2) { } KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32, 32}, options); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -280,11 +280,11 @@ TEST_F(LegacySwizzleTest, LoopSwizzle0) { tv0->computeAt(tv2, -1); KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -315,11 +315,11 @@ TEST_F(LegacySwizzleTest, LoopSwizzle1) { tv2->axis(1)->parallelize(ParallelType::BIDy); KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({45, 77}, options); - auto cg_outputs = ke.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -350,7 +350,7 @@ TEST_F(LegacySwizzleTest, LoopSwizzleCheck0) { tv0->computeAt(tv2, -1); KernelExecutor ke; - ASSERT_ANY_THROW(ke.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } // Test assertion in unsupported pattern: half-inlined loop swizzle. 
@@ -382,7 +382,7 @@ TEST_F(LegacySwizzleTest, LoopSwizzleCheck1) { tv0->computeAt(tv3, -2); KernelExecutor ke; - ASSERT_ANY_THROW(ke.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(LegacySwizzleTest, SwizzleVectorize) { @@ -616,8 +616,8 @@ TEST_F(LegacySwizzleTest, SwizzleIndexing170) { at::Tensor t = at::randn({64, 64}, options); KernelExecutor ke; - ke.compileFusion(&fusion); - auto outputs = ke.runFusion({t}); + ke.compile(&fusion); + auto outputs = ke.run({t}); testValidate(&fusion, outputs, {t}, __LINE__, __FILE__); } @@ -679,8 +679,8 @@ TEST_F(LegacySwizzleTest, SwizzleInProducerProjection) { auto t = at::randn({32, 64}, options); KernelExecutor ke; - ke.compileFusion(fusion.get()); - auto outputs = ke.runFusion({t}); + ke.compile(fusion.get()); + auto outputs = ke.run({t}); auto expect = at::empty_like(t); for (auto i : c10::irange(t.size(0) / 8)) { @@ -736,9 +736,9 @@ TEST_F(SwizzleTest, Transpose1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({10240, 10240}, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t}); + ke.compile(&fusion, {t}); EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); - std::vector outputs = ke.runFusion({t}); + std::vector outputs = ke.run({t}); EXPECT_TRUE(at::equal(t.t(), outputs[0])); } diff --git a/tests/cpp/test_tensor_factories.cpp b/tests/cpp/test_tensor_factories.cpp index 68e8b4bc4f5..2eabde38b3b 100644 --- a/tests/cpp/test_tensor_factories.cpp +++ b/tests/cpp/test_tensor_factories.cpp @@ -353,8 +353,8 @@ TEST_F(TensorFactoryTest, TensorConstruct) { fusion->addOutput(output); KernelExecutor ke; - ke.compileFusion(fusion.get()); - auto cg_outputs = ke.runFusion({00, 01, 10, 11}); + ke.compile(fusion.get()); + auto cg_outputs = ke.run({00, 01, 10, 11}); testValidate(fusion.get(), cg_outputs, {00, 01, 10, 11}, __LINE__, __FILE__); } @@ -404,8 +404,8 @@ TEST_F(TensorFactoryTest, MetadataAsTensor) { auto input1 = at::randn({6, 7, 8, 9}, options); KernelExecutor ke; - ke.compileFusion(fusion.get()); - auto cg_outputs = ke.runFusion({input0, input1}); + ke.compile(fusion.get()); + auto cg_outputs = ke.run({input0, input1}); testValidate(fusion.get(), cg_outputs, {input0, input1}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_translate_mma.cpp b/tests/cpp/test_translate_mma.cpp index dca5ccfac1e..14e9a9d222f 100644 --- a/tests/cpp/test_translate_mma.cpp +++ b/tests/cpp/test_translate_mma.cpp @@ -230,10 +230,10 @@ TEST_P(CombineMulSumAsMmaTestWithLayout, AmpereMulSumToMatmul_Schedule) { auto inputs = matmulAtInput2D(M, N, K, layout); KernelExecutor ke; - ke.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); - auto cg_outputs = ke.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); diff --git a/tests/cpp/test_tutorial.cpp b/tests/cpp/test_tutorial.cpp index 3089c6e4df3..943b2fc6504 100644 --- a/tests/cpp/test_tutorial.cpp +++ b/tests/cpp/test_tutorial.cpp @@ -84,10 +84,10 @@ TEST_F(Tutorial, Memcpy) { // Next, lower the fusion to Kernel, generate CUDA kernel source and then // compile it with nvrtc. 
All of them are done by KernelExecutor KernelExecutor ke; - ke.compileFusion(&fusion, aten_inputs); + ke.compile(&fusion, aten_inputs); // KernelExecutor now has a compiled kernel, which can be executed as: - std::vector outputs = ke.runFusion(aten_inputs); + std::vector outputs = ke.run(aten_inputs); // Note that this run is done using just one thread, which will be // corrected below. @@ -159,14 +159,14 @@ TEST_F(Tutorial, Memcpy) { // Since the fusion is modified, we need to recompile it. KernelExecutor ke2; - ke2.compileFusion(&fusion, aten_inputs); + ke2.compile(&fusion, aten_inputs); // This time, the kernel is launched with multiple threads and // thread blocks. Note that the launch configurations, i.e., the // thread block and grid shapes, are automatically inferred from the // given inputs. To see how many threads are used, run this test // with NVFUSER_DUMP=launch_param - outputs = ke2.runFusion(aten_inputs); + outputs = ke2.run(aten_inputs); ASSERT_TRUE(outputs[0].equal(t0)); } @@ -206,8 +206,8 @@ TEST_F(Tutorial, Reduction) { { KernelExecutor ke; - ke.compileFusion(&fusion); - std::vector outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -222,8 +222,8 @@ TEST_F(Tutorial, Reduction) { { KernelExecutor ke; - ke.compileFusion(&fusion); - std::vector outputs = ke.runFusion(aten_inputs); + ke.compile(&fusion); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -240,18 +240,18 @@ TEST_F(Tutorial, Reduction) { { KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); // Running this fusion, however, should fail as it would require // thread blocks of shape 1024x10, i.e., the same shape as the // input tensor, which is too large in CUDA. // // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(ke.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); // Try again with a smaller input. This should launch a kernel // with thread blocks of shape 32x10 at::Tensor t1 = at::randn({10, 32}, options); - std::vector outputs = ke.runFusion({t1}); + std::vector outputs = ke.run({t1}); testValidate( &fusion, outputs, aten_inputs, {t1.sum({1})}, __LINE__, __FILE__); } @@ -267,12 +267,12 @@ TEST_F(Tutorial, Reduction) { { KernelExecutor ke; - ke.compileFusion(&fusion); + ke.compile(&fusion); // The original input should not fail in this case. The kernel // will be launched with 10 thread blocks, each of which has 1024 // threads. Try running this test with NVFUSER_DUMP=launch_param // to see the launch configuration of each kernel launch - std::vector outputs = ke.runFusion(aten_inputs); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } } @@ -381,12 +381,12 @@ TEST_F(Tutorial, ReductionRFactor) { at::Tensor ref = t0.sum({0}); KernelExecutor ke; - ke.compileFusion(&fusion_copy); + ke.compile(&fusion_copy); // Since the size of the input is 10000, which is split by a // factor of 1024, the first per-thread reduction is done for // ceilDiv(10000, 1024) = 10 elements.
- std::vector outputs = ke.runFusion(aten_inputs); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion_copy, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -440,9 +440,9 @@ TEST_F(Tutorial, ReductionRFactor) { at::Tensor ref = t0.sum({0}); KernelExecutor ke; - ke.compileFusion(&fusion_copy); + ke.compile(&fusion_copy); - std::vector outputs = ke.runFusion(aten_inputs); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion_copy, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } } @@ -787,8 +787,8 @@ TEST_F(Tutorial, BasicTMA) { std::vector shape(3, 300); auto t = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = ke.runFusion({t}); + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -871,8 +871,8 @@ TEST_F(Tutorial, BasicTMA) { std::vector shape(3, 300); auto t = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = ke.runFusion({t}); + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -954,8 +954,8 @@ TEST_F(Tutorial, BasicTMA) { std::vector shape(3, 300); auto t = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = ke.runFusion({t}); + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1034,8 +1034,8 @@ TEST_F(Tutorial, BasicTMA) { std::vector shape(3, 300); auto t = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = ke.runFusion({t}); + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1139,8 +1139,8 @@ TEST_F(Tutorial, BasicTMA) { std::vector shape(3, 300); auto t = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = ke.runFusion({t}); + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1245,8 +1245,8 @@ TEST_F(Tutorial, BasicTMA) { std::vector shape(3, 300); auto t = at::randn(shape, options); KernelExecutor ke; - ke.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = ke.runFusion({t}); + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } } @@ -1345,8 +1345,8 @@ TEST_F(Tutorial, VectorizeStorePointwiseTMA) { // Compile with KernelExecutor directly to avoid scheduling KernelExecutor ke; - ke.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); - auto outputs = ke.runFusion({at_tv0, at_tv1}); + ke.compile(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); + auto outputs = ke.run({at_tv0, at_tv1}); auto at_output = at_tv0 + at_tv1; testValidate( @@ -1449,8 +1449,8 @@ TEST_F(Tutorial, PointwiseBroadcastTMA) { // Compile with KernelExecutor directly to avoid scheduling KernelExecutor ke; - ke.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); - auto outputs = ke.runFusion({at_tv0, at_tv1}); + ke.compile(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); + auto outputs = ke.run({at_tv0, at_tv1}); auto at_output = at_tv0 + at_tv1; testValidate( @@ -1553,8 +1553,8 @@ TEST_F(Tutorial, TMABankConflictFreeTranspose) { auto t = at::randn({10000, 10000}, options); 
KernelExecutor ke;
CompileParams index32bit{DataType::Int32, 255, false};
-  ke.compileFusion(&fusion, {t}, {}, index32bit);
-  std::vector outputs = ke.runFusion({t});
+  ke.compile(&fusion, {t}, {}, index32bit);
+  std::vector outputs = ke.run({t});
ASSERT_TRUE(at::equal(t.t(), outputs[0]));
}

diff --git a/tests/cpp/test_utils.cpp b/tests/cpp/test_utils.cpp
index 174c9ef367d..e04fd39ca43 100644
--- a/tests/cpp/test_utils.cpp
+++ b/tests/cpp/test_utils.cpp
@@ -1116,14 +1116,14 @@ TEST_F(NVFuserTest, FusionSASSDumpError) {
at::Tensor t0 = at::randn({8}, options);

KernelExecutor ke;
-  ke.compileFusion(&fusion, {t0});
+  ke.compile(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.disassembledKernelSASS(); },
::testing::ThrowsMessage(
::testing::HasSubstr("I am fake")));

-  auto cg_outputs = ke.runFusion({t0});
+  auto cg_outputs = ke.run({t0});
testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
}

diff --git a/tests/cpp/utils.cpp b/tests/cpp/utils.cpp
index dca4d80ae20..64b1cbe55f7 100644
--- a/tests/cpp/utils.cpp
+++ b/tests/cpp/utils.cpp
@@ -25,8 +25,8 @@ CGResultsPackage scheduleAndRun(
auto heuristic_params = SchedulerEntry::scheduleWith(
fusion, scheduler_type, runtime_inputs, validate_scheduler);
auto ke = std::make_unique();
-  ke->compileFusion(fusion, runtime_inputs, heuristic_params->lparams);
-  auto cg_outputs = ke->runFusion(runtime_inputs, heuristic_params->lparams);
+  ke->compile(fusion, runtime_inputs, heuristic_params->lparams);
+  auto cg_outputs = ke->run(runtime_inputs, heuristic_params->lparams);
CGResultsPackage results = {
.outputs = cg_outputs,
.heuristic_params = std::move(heuristic_params),

From 30e7bff5877e106979dd7fa223ec2bf26a73e900 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 7 Nov 2024 00:13:03 -0800
Subject: [PATCH 19/27] Fix launch configuration error with
 ScatterGatherTest.TorchGatherAllRankAllSelectedDim (#3364)

This test uses pseudo-random numbers to generate index tensors, which can result in requiring too large grid dimensions. For instance, there was this error reported today:

```
C++ exception with description " INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/runtime/executor_params.cpp":41, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Invalid number of blocks in y direction: 69923
Exception raised from assertValid at /opt/pytorch/nvfuser/csrc/runtime/executor_params.cpp:41 (most recent call first):
```

The true fix would be to make sure the scheduler uses a proper launch configuration, but since these index operations are only there as experimental ops, I think this fix should be good enough for now.

---
tests/cpp/test_scatter_gather.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cpp/test_scatter_gather.cpp b/tests/cpp/test_scatter_gather.cpp
index 442ea74d99f..5e1bfdd2eb1 100644
--- a/tests/cpp/test_scatter_gather.cpp
+++ b/tests/cpp/test_scatter_gather.cpp
@@ -132,7 +132,7 @@ TEST_F(ScatterGatherTest, TorchGatherAllRankAllSelectedDim) {
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
for (const auto is_take_along : {false, true}) {
-    for (int rank = 1; rank <= 5; ++rank) {
+    for (int rank = 1; rank <= 3; ++rank) {
for (int dim = 0; dim < rank; ++dim) {
// this test uses a random input shape, clear the allocator to avoid
// OOM.
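For context on the assert above: CUDA caps `gridDim.y` and `gridDim.z` at 65535, while `gridDim.x` can go up to 2^31 - 1, so a rank-5 iteration space with pseudo-random extents easily overflows the y dimension (69923 > 65535). Below is a self-contained sketch of the kind of validation `assertValid` performs; the constant and function names are illustrative, not nvFuser's actual code:

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>

// Illustrative CUDA grid-dimension limits: x gets 31 bits, y and z only 16.
constexpr int64_t kMaxGridDimX = (int64_t(1) << 31) - 1;
constexpr int64_t kMaxGridDimYZ = (int64_t(1) << 16) - 1; // 65535

// Hypothetical stand-in for the launch-parameter validation in
// executor_params.cpp.
void assertGridValid(int64_t gdimx, int64_t gdimy, int64_t gdimz) {
  if (gdimx > kMaxGridDimX) {
    throw std::runtime_error(
        "Invalid number of blocks in x direction: " + std::to_string(gdimx));
  }
  if (gdimy > kMaxGridDimYZ) {
    throw std::runtime_error(
        "Invalid number of blocks in y direction: " + std::to_string(gdimy));
  }
  if (gdimz > kMaxGridDimYZ) {
    throw std::runtime_error(
        "Invalid number of blocks in z direction: " + std::to_string(gdimz));
  }
}
```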
From f5b903e94c6ac3bd12914214a1e3be7509423e1b Mon Sep 17 00:00:00 2001
From: Christian Sarofeen
Date: Thu, 7 Nov 2024 08:03:22 -0500
Subject: [PATCH 20/27] Fix tests with DEBUG_SERDE=disable (#3356)

Fixing failures in the python tests with DEBUG_SERDE=disable.

Repro: DEBUG_SERDE=disable pytest -v tests/python/test_python_frontend.py -k test_pad_dynamic

---
tests/python/utils.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/python/utils.py b/tests/python/utils.py
index 2a7fadc4a14..9ff3f1b8d78 100644
--- a/tests/python/utils.py
+++ b/tests/python/utils.py
@@ -437,8 +437,10 @@ def exec_nvfuser(
self.assertTrue(
check_captured_python_definition(out, fd, inputs_captured, device)
)
-
-        self.assertEqual(fc.num_fusions() - before_fusions, int(new_fusion_expected))
+        if not disable_serde:
+            self.assertEqual(
+                fc.num_fusions() - before_fusions, int(new_fusion_expected)
+            )

if is_clonable:
self.assertTrue(check_cpp_translation(out, fd, inputs_cloned))

From dc4f4a140cf697e4094a1a6048d76679f7d3ff0c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Thu, 7 Nov 2024 11:17:35 -0500
Subject: [PATCH 21/27] Add NVFUSER_DUMP=python_definition_segments (#3368)

This adds a new debug dump option that simply prints out all of the segmented fusion segments as python FusionDefinitions at compile time. This is useful for debugging non-segmentation-related errors whose repros contain many segments. When you notice a compile error, the error printout will tell you which segmented group it was found in. Then run your code again with `NVFUSER_DUMP=python_definition_segments`, which will print a definition function for each segmented group using the recently added C++-to-python translation. This lets you create a more targeted repro by copying that smaller definition into the repro printed in the error message and then updating the inputs so that it executes.

---
csrc/options.cpp | 1 +
csrc/options.h | 1 +
csrc/runtime/fusion_kernel_runtime.cpp | 10 ++++++++++
3 files changed, 12 insertions(+)

diff --git a/csrc/options.cpp b/csrc/options.cpp
index beb98d2f8a8..54571bad3ba 100644
--- a/csrc/options.cpp
+++ b/csrc/options.cpp
@@ -135,6 +135,7 @@ std::unordered_map> Options<
{"ptx", DebugDumpOption::Ptx},
{"ptxas_verbose", DebugDumpOption::PrintPtxasLog},
{"python_definition", DebugDumpOption::PythonDefinition},
+      {"python_definition_segments", DebugDumpOption::PythonDefinitionSegments},
{"python_frontend_debug", DebugDumpOption::PythonFrontendDebug},
{"sass", DebugDumpOption::Sass},
{"segmented_fusion", DebugDumpOption::FusionSegments},
diff --git a/csrc/options.h b/csrc/options.h
index f256f9860cc..0cc6313a214 100644
--- a/csrc/options.h
+++ b/csrc/options.h
@@ -64,6 +64,7 @@ enum class DebugDumpOption {
//! associated with what's running
PreSegmenterLogging,
PythonDefinition, //! Python Frontend Fusion Definition.
+  PythonDefinitionSegments, //! Python Frontend Fusion Definition of segments.
PythonFrontendDebug, //! Python Frontend debug information.
TransformPropagator, //! When running TransformPropagator, print propagation
//! path and replay result
diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp
index 5e3a4a352ad..8069ed3ee3a 100644
--- a/csrc/runtime/fusion_kernel_runtime.cpp
+++ b/csrc/runtime/fusion_kernel_runtime.cpp
@@ -13,6 +13,8 @@
#include
#include
#include
+#include
+#include
#include
#include
#include
@@ -298,6 +300,14 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
for (int64_t run_order_id = 0; run_order_id < num_groups; ++run_order_id) {
auto group_to_run = runtime_workspace_.group_run_order.at(run_order_id);

+    if (isDebugDumpEnabled(DebugDumpOption::PythonDefinitionSegments)) {
+      debug() << "Python definition for segmented group "
+              << group_to_run->groupId() << ":" << std::endl;
+      python_frontend::FusionDefinition fd(/*id=*/std::nullopt);
+      python_frontend::translate(group_to_run->getFusion(), &fd);
+      fd.print(debug());
+    }
+
// TODO: index mode should be updated per segmented kernel

// Prepare input vector
KernelArgumentHolder group_runtime_inputs;

From 0261d6b9cdf83eaeb92fdab153028525c19fea1a Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Thu, 7 Nov 2024 11:22:34 -0500
Subject: [PATCH 22/27] Ensure expression simplification is enabled in
 proveLinearAndGetStride (#3345)

When debugging the matmul scheduler, I noticed that we hit an error when generating Hopper MMA when we use `NVFUSER_DISABLE=expr_simplify`. This PR makes that option usable again. Note that the disable option is overridden only within `proveLinearAndGetStride`, so it has no direct effect on index expressions, which is what we want to look at when using that option.

---
csrc/device_lower/utils.cpp | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp
index 00bb3a1d458..d10ffb90e50 100644
--- a/csrc/device_lower/utils.cpp
+++ b/csrc/device_lower/utils.cpp
@@ -1907,6 +1907,11 @@ Val* proveLinearAndGetStride(
const ValGroup& linear_g,
const ValGroups& domain) {
FusionGuard fg(linear_g->front()->fusion());
+  // This function uses simplifyExpr extensively. If we have disabled expression
+  // simplification in order to help inspect generated kernels, then we will get
+  // incorrect results here. Instead, we ensure it is enabled using this guard.
+  DisableOptionsGuard dog;
+  DisableOptionsGuard::getCurOptions().unset(DisableOption::ExprSimplify);
if (simplifyExpr(extent(linear_g))->isOne()) {
// If the extent of the linear group is 1, we always consider it as linear,
// regardless of its relationship with domain. For this case, we use stride

From 267b7e045a646400dab903d870f78825270277c9 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 7 Nov 2024 09:03:14 -0800
Subject: [PATCH 23/27] Use SimplifyingIrBuilder for circular buffering (#3361)

Noticed some easy simplification while working on #3309. Not necessary for correctness, and the actual perf impact, if any, would probably be negligible thanks to expr simplification and index hoisting, but examining generated code is easier when simplification is possible.
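To illustrate what the simplifying builder buys (a toy sketch of the folding behavior only; nvFuser's real `IrBuilder` and `SimplifyingIrBuilder` operate on its actual IR node classes):

```cpp
#include <memory>
#include <optional>

// Toy stand-in for an IR value: either a known constant or a symbolic value.
struct Val {
  std::optional<long> constant;
};
using ValPtr = std::shared_ptr<Val>;

// Plain builder: always materializes a new symbolic "x + y" node.
ValPtr addExpr(const ValPtr& x, const ValPtr& y) {
  return std::make_shared<Val>();
}

// Simplifying builder: folds identities and constants before building a
// node, so expressions like "i + 0" never reach the generated kernel.
ValPtr simplifyingAddExpr(const ValPtr& x, const ValPtr& y) {
  if (y->constant == 0) {
    return x; // x + 0 -> x
  }
  if (x->constant == 0) {
    return y; // 0 + y -> y
  }
  if (x->constant && y->constant) {
    return std::make_shared<Val>(Val{*x->constant + *y->constant}); // fold
  }
  return addExpr(x, y);
}
```

The same idea applies to the `modExpr` and `subExpr` call sites in this patch: when the loop index or stage depth is a known constant, the simplifying builder can collapse the stage-index computation instead of emitting it into the kernel.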
--- csrc/device_lower/pass/circular_buffer.cpp | 10 +++++----- csrc/ir/nodes.cpp | 7 +++++++ tests/cpp/test_indexing.cpp | 5 +++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/csrc/device_lower/pass/circular_buffer.cpp b/csrc/device_lower/pass/circular_buffer.cpp index f2eb30297a5..273cee42ba9 100644 --- a/csrc/device_lower/pass/circular_buffer.cpp +++ b/csrc/device_lower/pass/circular_buffer.cpp @@ -97,7 +97,7 @@ class CircularBufferLoopCloner : public kir::IrVisitor { } case CircularBufferLoopStage::Main: { if (requireEpilogue(circular_buffer_load_exprs_)) { - stop = IrBuilder::subExpr( + stop = SimplifyingIrBuilder::subExpr( circular_buffer_loop_->stop(), SimplifyingIrBuilder::create( prefetch_distance, DataType::Index)); @@ -106,7 +106,7 @@ class CircularBufferLoopCloner : public kir::IrVisitor { } case CircularBufferLoopStage::Epilog: { NVF_ERROR(requireEpilogue(circular_buffer_load_exprs_)); - start = IrBuilder::subExpr( + start = SimplifyingIrBuilder::subExpr( circular_buffer_loop_->stop(), SimplifyingIrBuilder::create( prefetch_distance, DataType::Index)); @@ -424,7 +424,7 @@ class CloneTmaCircularBufferLoopAndInsertSync int64_t stage_depth = GpuLower::current()->circularBufferInfo().getStageDepthFor( circular_buffer_loop_->iter_domain()); - Val* result = IrBuilder::modExpr( + Val* result = SimplifyingIrBuilder::modExpr( cloned_top_level_loop_->indexOrStartIfTrivial(), IrBuilder::create(stage_depth, PrimDataType::Index)); return GpuLower::current()->commonScalarMap().hoistScalar( @@ -441,8 +441,8 @@ class CloneTmaCircularBufferLoopAndInsertSync GpuLower::current()->circularBufferInfo().getPrefetchDistanceFor( circular_buffer_loop_->iter_domain()); - auto current_load_stage = IrBuilder::modExpr( - IrBuilder::addExpr( + auto current_load_stage = SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr( cloned_top_level_loop_->indexOrStartIfTrivial(), IrBuilder::create(prefetch_distance, PrimDataType::Index)), IrBuilder::create(stage_depth, PrimDataType::Index)); diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index d647f66b566..0ff0e5c6bf3 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -4881,6 +4881,13 @@ bool ForLoop::isTrivial() const { return true; } + if (start()->isConstScalar() && simplifiedStop()->isConstScalar() && + start()->evaluate().as() + 1 == + simplifiedStop()->evaluate().as() && + step()->isOneInt()) { + return true; + } + return false; } diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index bc7af04a0e0..1862747ec47 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -2151,6 +2151,11 @@ TEST_F(IndexingTest, DoubleBuffering6) { return nullptr; } + // This loop is double buffered. Since the loop originally has + // just a trip count of 2, the double-buffered main loop has a + // trip count of 1. Thus, this loop is always trivial + loop_indices.at(1) = tv->fusion()->zeroVal(); + switch (tv->name()) { case 1: { if (!as_consumer) { From e072c9254b05f452801d10228bea504bfb99c0b1 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 7 Nov 2024 12:56:02 -0800 Subject: [PATCH 24/27] Redo #3326. (#3370) The change was accidentally reverted in https://github.com/NVIDIA/Fuser/pull/3349. 
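For reference, these are the two ownership patterns the restored change switches tests between; this is a schematic assembled from the calls in the diff below (it assumes the nvFuser test headers and an input tensor `t0`), not a standalone program:

```cpp
// Executor-only test: the Fusion outlives every use, so the stack suffices.
Fusion fusion;
FusionGuard fg(&fusion);
// ... build the fusion ...
KernelExecutor ke;
ke.compile(&fusion, {t0});
auto outputs = ke.run({t0});

// Cache-backed test: FusionExecutorCache takes ownership via std::move, so
// the Fusion must live on the heap, and later accesses must go through
// executor_cache.fusion() rather than a local alias that was moved away.
auto fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg2(fusion_ptr.get());
// ... build the fusion ...
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cache_outputs = executor_cache.runFusionWithInputs({t0});
```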
--- tests/cpp/test_allocation_domain.cpp | 140 +++++++++++---------------- 1 file changed, 59 insertions(+), 81 deletions(-) diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index aca1adea75a..55ac0ee99d4 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -29,8 +29,7 @@ using ::testing::ElementsAre; // A global->shared->global copy kernel, shared memory allocated transposed to // avoid bank conflict. TEST_F(AllocationDomainTest, TransposedIntermediate) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigConcreteTensor({32, 32}); @@ -59,7 +58,7 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { at::Tensor t0 = at::randn({32, 32}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -67,8 +66,7 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 4d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -97,7 +95,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -109,8 +107,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 1d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -136,7 +133,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -148,8 +145,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 2d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -176,7 +172,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -188,8 +184,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { // Reshape and transpose a 3d tensor into an NHWC tensor with a 3d allocation // domain in fusion output. 
TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -223,7 +218,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { at::Tensor t0 = at::randn({n1, n2, h * w * c}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -242,8 +237,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { // output. The allocation domain is on both the producer and the consumer side // of the rFactor domain. TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -283,7 +277,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { at::Tensor t0 = at::randn({n1, n2, c * h * w}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -301,8 +295,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { // A global->global copy kernel where both inputs and outputs are NHWC memory // format TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -339,7 +332,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -356,8 +349,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view the input as a 1d tensor. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -398,7 +390,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -415,8 +407,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain of the output view the output as a 1d tensor. TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -454,7 +445,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -471,8 +462,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view both the input and the output as a 1d tensors. 
TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -515,7 +505,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -533,8 +523,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { // allocation domain view the input as a 2d tensor of shape [N*H/8, 8*W*C], and // view the output as a 2d tensor of shape [N*H*W*C/4, 4] TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -583,7 +572,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -599,8 +588,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { // Similar to NHWC4d_To_NHWC4d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -648,7 +636,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -664,8 +652,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { // Similar to NHWC2d_To_NHWC2d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -725,7 +712,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -741,8 +728,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { // Similar to NHWC4d_To_NHWC4d, but does a cacheAfter TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -790,7 +776,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -808,8 +794,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { // allocation tensor to be between rFactor domain and loop domain, which is not // the case for NHWC2d_To_NHWC2d TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -861,7 +846,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { 
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -877,8 +862,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { // Similar to NHWC4d_To_NHWC4d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -933,7 +917,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -949,8 +933,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { // Similar to NHWC2d_To_NHWC2d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -1023,7 +1006,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compile(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -1038,30 +1021,29 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { } TEST_F(AllocationDomainTest, VectorizationIssue902) { - auto fusion_ptr = std::make_unique(); - auto& fusion = *fusion_ptr; - FusionGuard fg(fusion_ptr.get()); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); const std::vector shape({16, 16, 512, 64}); auto tv0 = makeContigTensor(4); - fusion.addInput(tv0); + fusion->addInput(tv0); auto tv1 = set(tv0); - fusion.addOutput(tv1); + fusion->addOutput(tv1); - std::vector aloc_domain; - aloc_domain.push_back(tv1->axis(0)); - aloc_domain.push_back(tv1->axis(2)); - aloc_domain.push_back(tv1->axis(3)); - aloc_domain.push_back(tv1->axis(1)); - tv1->setAllocationDomain(aloc_domain, true); + std::vector alloc_domain; + alloc_domain.push_back(tv1->axis(0)); + alloc_domain.push_back(tv1->axis(2)); + alloc_domain.push_back(tv1->axis(3)); + alloc_domain.push_back(tv1->axis(1)); + tv1->setAllocationDomain(alloc_domain, true); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); @@ -1101,9 +1083,8 @@ TEST_F(AllocationDomainTest, TransposeMatrix) { } TEST_F(AllocationDomainTest, ContiguityIssue1021) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1119,17 +1100,16 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); 
auto t1 = t0.add(5.0);
-  testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
+  testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
}

TEST_F(AllocationDomainTest, ContiguityForBroadcast) {
-  std::unique_ptr fusion_ptr = std::make_unique();
-  Fusion* fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
+  auto fusion = std::make_unique();
+  FusionGuard fg(fusion.get());

auto tv0 = TensorViewBuilder()
.ndims(2)
@@ -1145,17 +1125,16 @@ TEST_F(AllocationDomainTest, ContiguityForBroadcast) {
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({1, 1}, options).as_strided({1, 1}, {0, 3});

-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion));
auto outputs = executor_cache.runFusionWithInputs({t0});

auto t1 = t0.add(5.0);
-  testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
+  testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
}

TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {
-  std::unique_ptr fusion_ptr = std::make_unique();
-  Fusion* fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
+  auto fusion = std::make_unique();
+  FusionGuard fg(fusion.get());

auto tv0 = TensorViewBuilder()
.ndims(3)
@@ -1172,11 +1151,11 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({4, 8}, options).as_strided({3, 8, 4}, {0, 1, 8});

-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion));
auto outputs = executor_cache.runFusionWithInputs({t0});

auto t1 = t0.add(5.0);
-  testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
+  testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
}

// Test that allocation domain can be used to vectorize overlapping tensors,
@@ -1189,8 +1168,7 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {
// automatically supports all kinds of use cases, even those that we don't have
// an active plan to support on).
TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) {
-  auto fusion_ptr = std::make_unique();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(3);
@@ -1226,7 +1204,7 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) {
at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1});

KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compile(&fusion, {t0});
auto cg_outputs = ke.run({t0});

testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);

From 5a603dbf2ec01d503f41f93d1b9690eec70ee190 Mon Sep 17 00:00:00 2001
From: jjsjann123
Date: Thu, 7 Nov 2024 15:12:50 -0800
Subject: [PATCH 25/27] Explicit check build flag (#3371)

Adding a build option `--explicit-error-check` in setup.py, along with related changes in CMakeLists.txt, so that the python option sets the compile definition `NVFUSER_EXPLICIT_ERROR_CHECK` for the project. This is to be used in our CI runs, where a debug build is too slow but we still want things like dynamic_cast to catch errors.

Currently the change enables some runtime checks, e.g. dynamic_cast, which were previously skipped in release builds and whose failures have therefore been hidden in CI.
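The heart of the change is the downcast helper in `csrc/utils.h`: plain release builds keep the zero-cost `static_cast`, while debug builds, or release builds configured with `-DNVFUSER_EXPLICIT_ERROR_CHECK=ON`, pay for a checked `dynamic_cast`. A self-contained distillation of that pattern (using a free function and a plain exception where the real code uses a `PolymorphicBase` member and `NVF_ERROR`):

```cpp
#include <stdexcept>

struct Base {
  virtual ~Base() = default; // polymorphic, so dynamic_cast is available
};
struct Derived : Base {};

template <typename T>
T* as(Base* p) {
#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
  // Plain release build: trust the caller and skip the RTTI check entirely.
  return static_cast<T*>(p);
#else
  // Debug build, or release build with explicit error checks: verify the
  // downcast at runtime so a bad cast fails loudly instead of silently
  // producing a misinterpreted pointer.
  T* downcast = dynamic_cast<T*>(p);
  if (downcast == nullptr) {
    throw std::runtime_error("invalid downcast");
  }
  return downcast;
#endif
}
```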
---
CMakeLists.txt | 4 ++++
csrc/disjoint_set.h | 12 ++++++------
csrc/utils.h | 8 ++++----
setup.py | 6 ++++++
4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6eb9fed679..93333339f50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")

option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
+option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF)
+if (NVFUSER_EXPLICIT_ERROR_CHECK)
+  add_compile_definitions(NVFUSER_EXPLICIT_ERROR_CHECK)
+endif()
option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)

include(CMakeDependentOption)

diff --git a/csrc/disjoint_set.h b/csrc/disjoint_set.h
index c1638ae2037..568f9ff2604 100644
--- a/csrc/disjoint_set.h
+++ b/csrc/disjoint_set.h
@@ -153,25 +153,25 @@ class VectorOfUniqueEntries {

// Returns first element in vector
T front() const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK)
NVF_ERROR(!empty());
-#endif // NDEBUG
+#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK)
return vector_.front();
}

// Returns last element in vector
T back() const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK)
NVF_ERROR(!empty());
-#endif // NDEBUG
+#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK)
return vector_.back();
}

// Remove and returns the last element in vector
T popBack() {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK)
NVF_ERROR(!empty());
-#endif // NDEBUG
+#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK)
T v = vector_.back();
set_.erase(v);
vector_.pop_back();

diff --git a/csrc/utils.h b/csrc/utils.h
index f98d2e357a2..d831a6695a5 100644
--- a/csrc/utils.h
+++ b/csrc/utils.h
@@ -112,23 +112,23 @@ class PolymorphicBase {
// (checked in DEBUG builds)
template
T* as() {
-#ifdef NDEBUG
+#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
auto downcast_ptr = static_cast(this);
#else
auto downcast_ptr = dynamic_cast(this);
NVF_ERROR(downcast_ptr != nullptr);
-#endif
+#endif // defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
return downcast_ptr;
}

template
const T* as() const {
-#ifdef NDEBUG
+#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
auto downcast_ptr = static_cast(this);
#else
auto downcast_ptr = dynamic_cast(this);
NVF_ERROR(downcast_ptr != nullptr);
-#endif
+#endif // defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
return downcast_ptr;
}

diff --git a/setup.py b/setup.py
index 55dcc041f92..4f3fd6c28fa 100644
--- a/setup.py
+++ b/setup.py
@@ -79,6 +79,7 @@
BUILD_WITH_ASAN = False
BUILD_WITHOUT_DISTRIBUTED = False
OVERWRITE_VERSION = False
+EXPLICIT_ERROR_CHECK = False
VERSION_TAG = None
BUILD_TYPE = "Release"
WHEEL_NAME = "nvfuser"
@@ -107,6 +108,9 @@
if arg == "--build-with-ucc":
BUILD_WITH_UCC = True
continue
+    if arg == "--explicit-error-check":
+        EXPLICIT_ERROR_CHECK = True
+        continue
if arg == "--build-with-asan":
BUILD_WITH_ASAN = True
continue
@@ -330,6 +334,8 @@ def cmake():
]
if BUILD_WITH_UCC:
cmd_str.append("-DNVFUSER_STANDALONE_BUILD_WITH_UCC=ON")
+    if EXPLICIT_ERROR_CHECK:
+        cmd_str.append("-DNVFUSER_EXPLICIT_ERROR_CHECK=ON")
if not NO_NINJA:
cmd_str.append("-G")
cmd_str.append("Ninja")

From 6abf3101ba1769123d76ab90f1b9661ff3772287 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 7 Nov 2024 16:16:16 -0800
Subject: [PATCH 26/27] Accidentally lost in #3028 (#3363)

I accidentally replaced this line in #3028

---
CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 93333339f50..de3e52f5055 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -549,6 +549,7 @@ list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp ${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp + ${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp ${NVFUSER_ROOT}/tests/cpp/test_loop_domain_scheduling.cpp From 96d64b61612f3248062c7d17c68582f94b92e37c Mon Sep 17 00:00:00 2001 From: Priya Mishra <52657555+Priya2698@users.noreply.github.com> Date: Thu, 7 Nov 2024 20:19:16 -0800 Subject: [PATCH 27/27] Modify the `compile` parameter in baseline benchmarks to `executor` (#3350) This PR is the first step in adding `thunder.jit` benchmarks. The major change is modifying the `compile` parameter to `executor` with values `eager`, `torchcompile`, `thunder`. This PR does not introduce any new thunder benchmarks (to be done in next PR). CC: @xwang233 for dashboard changes. Issue: #2718 --- benchmarks/python/conftest.py | 64 +++++++++---------- benchmarks/python/normalization.py | 18 +++--- benchmarks/python/test_batchnorm_bwd.py | 6 +- benchmarks/python/test_batchnorm_fwd.py | 6 +- benchmarks/python/test_broadcast_add_fwd.py | 13 ++-- .../python/test_dropout_layernorm_bwd.py | 15 +++-- .../python/test_dropout_layernorm_fwd.py | 13 ++-- benchmarks/python/test_dropout_rmsnorm_bwd.py | 15 +++-- benchmarks/python/test_dropout_rmsnorm_fwd.py | 13 ++-- benchmarks/python/test_gelu_bwd.py | 15 +++-- benchmarks/python/test_gelu_bwd_reduction.py | 16 +++-- benchmarks/python/test_gelu_fwd.py | 16 +++-- benchmarks/python/test_groupnorm_fwd.py | 34 ++++------ .../python/test_huggingface_attn_bwd.py | 15 +++-- .../python/test_huggingface_attn_fwd.py | 13 ++-- benchmarks/python/test_instancenorm_bwd.py | 6 +- benchmarks/python/test_instancenorm_fwd.py | 6 +- benchmarks/python/test_layernorm_bwd.py | 15 +++-- benchmarks/python/test_layernorm_fwd.py | 13 ++-- benchmarks/python/test_matmul.py | 4 +- benchmarks/python/test_nanogpt_attn_bwd.py | 16 +++-- benchmarks/python/test_nanogpt_attn_fwd.py | 14 ++-- benchmarks/python/test_pointwise_mul.py | 13 ++-- benchmarks/python/test_reduction.py | 13 ++-- benchmarks/python/test_reduction_epilogue.py | 16 +++-- benchmarks/python/test_rmsnorm_bwd.py | 12 ++-- benchmarks/python/test_rmsnorm_fwd.py | 12 ++-- benchmarks/python/test_scale_bias_relu_bwd.py | 12 ++-- benchmarks/python/test_scale_bias_relu_fwd.py | 10 +-- benchmarks/python/test_silu_mul_bwd.py | 12 ++-- benchmarks/python/test_silu_mul_fwd.py | 13 ++-- benchmarks/python/test_softmax_bwd.py | 12 ++-- benchmarks/python/test_softmax_fwd.py | 12 ++-- benchmarks/python/test_transpose.py | 14 ++-- 34 files changed, 287 insertions(+), 210 deletions(-) diff --git a/benchmarks/python/conftest.py b/benchmarks/python/conftest.py index 8932afbff30..03adbe1e7dd 100644 --- a/benchmarks/python/conftest.py +++ b/benchmarks/python/conftest.py @@ -96,45 +96,39 @@ def pytest_configure(config): def pytest_collection_modifyitems(session, config, items): """ - The baseline benchmarks use `compile` parameter: - compile = false: Eager mode benchmark - compile = true: torch.compile benchmark + The baseline benchmarks use `executor` parameter with + values ["eager", "torchcompile", "thunder"] that are optionally + run using `--benchmark-{executor}` flag. They are skipped by + default. 
""" - run_eager = config.getoption("--benchmark-eager") - run_thunder = config.getoption("--benchmark-thunder") - run_torchcompile = config.getoption("--benchmark-torchcompile") from nvfuser.pytorch_utils import retry_on_oom_or_skip_test + executors = ["eager", "torchcompile", "thunder"] + + def get_test_executor(item) -> str | None: + if hasattr(item, "callspec") and "executor" in item.callspec.params: + test_executor = item.callspec.params["executor"] + assert ( + test_executor in executors + ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}." + return test_executor + return None + + executors_to_skip = [] + + for executor in executors: + if not config.getoption(f"--benchmark-{executor}"): + executors_to_skip.append(executor) + for item in items: item.obj = retry_on_oom_or_skip_test(item.obj) - if not run_eager: - skip_eager = pytest.mark.skip(reason="need --benchmark-eager option to run") - for item in items: - # If the benchmark has compile=False parameter (eager mode), skip it. - if ( - hasattr(item, "callspec") - and "compile" in item.callspec.params - and not item.callspec.params["compile"] - ): - item.add_marker(skip_eager) - - if not run_torchcompile: - skip_torchcompile = pytest.mark.skip( - reason="need --benchmark-torchcompile option to run" - ) - for item in items: - # If the benchmark has compile=True parameter (torch.compile mode), skip it. - if ( - hasattr(item, "callspec") - and "compile" in item.callspec.params - and item.callspec.params["compile"] - ): - item.add_marker(skip_torchcompile) - - if not run_thunder: - skip_thunder = pytest.mark.skip(reason="need --benchmark-thunder option to run") - for item in items: - if "thunder" in item.nodeid: - item.add_marker(skip_thunder) + test_executor = get_test_executor(item) + + if test_executor is not None and test_executor in executors_to_skip: + item.add_marker( + pytest.mark.skip( + reason=f"need --benchmark-{test_executor} option to run." 
+ ) + ) diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py index 8cbafe81353..6d493338846 100644 --- a/benchmarks/python/normalization.py +++ b/benchmarks/python/normalization.py @@ -433,10 +433,10 @@ def norm_fwd_baseline_benchmark( size: tuple, dtype: torch.dtype, channels_last: bool, - compile: bool, + executor: str, norm: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() assert norm in ["batch_norm", "instance_norm"], NotImplementedError @@ -453,10 +453,12 @@ def norm_fwd_baseline_benchmark( norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn + benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)} + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(norm_fwd_fn) if compile else norm_fwd_fn, + benchmark_fn[executor], [inputs, weight, bias, running_mean, running_var], iobytes=norm_fwd_iobytes(size, dtype, norm), ) @@ -467,10 +469,10 @@ def norm_bwd_baseline_benchmark( size: tuple, dtype: torch.dtype, channels_last: bool, - compile: bool, + executor: str, norm: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() assert norm in ["batch_norm", "instance_norm"], NotImplementedError @@ -491,13 +493,13 @@ def norm_bwd_baseline_benchmark( norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn # Compile the fwd fn for torchcompile - norm_fwd_fn = torch.compile(norm_fwd_fn) if compile else norm_fwd_fn - output = norm_fwd_fn([inputs, weight, bias, running_mean, running_var]) + fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)} + outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var]) # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=norm_bwd_iobytes(size, dtype, norm), ) diff --git a/benchmarks/python/test_batchnorm_bwd.py b/benchmarks/python/test_batchnorm_bwd.py index 74242ba99e2..0a1cd64cc57 100644 --- a/benchmarks/python/test_batchnorm_bwd.py +++ b/benchmarks/python/test_batchnorm_bwd.py @@ -31,13 +31,13 @@ def test_batchnorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_batchnorm_bwd_baseline_benchmark( - benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_bwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "batch_norm" + benchmark, size, dtype, channels_last, executor, "batch_norm" ) diff --git a/benchmarks/python/test_batchnorm_fwd.py b/benchmarks/python/test_batchnorm_fwd.py index 47b3997770a..af197ce6f1b 100644 --- a/benchmarks/python/test_batchnorm_fwd.py +++ b/benchmarks/python/test_batchnorm_fwd.py @@ -31,13 +31,13 @@ def test_batchnorm_fwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_batchnorm_fwd_baseline_benchmark( - benchmark, size: tuple, dtype: 
torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_fwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "batch_norm" + benchmark, size, dtype, channels_last, executor, "batch_norm" ) diff --git a/benchmarks/python/test_broadcast_add_fwd.py b/benchmarks/python/test_broadcast_add_fwd.py index abb320ef2a3..65db1555b28 100644 --- a/benchmarks/python/test_broadcast_add_fwd.py +++ b/benchmarks/python/test_broadcast_add_fwd.py @@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark( run_benchmark(benchmark, fd.execute, [bias, x]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"]) @@ -101,9 +101,9 @@ def test_bcast_add_baseline_benchmark( dtype: torch.dtype, bcast_axis: int, contiguous: bool, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda") input_shape = size if contiguous else (size[1], size[0]) @@ -112,9 +112,14 @@ def test_bcast_add_baseline_benchmark( x = x.t() assert x.is_contiguous() == contiguous + benchmark_fn = { + "eager": bcast_add_fwd_fn, + "torchcompile": torch.compile(bcast_add_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(bcast_add_fwd_fn) if compile else bcast_add_fwd_fn, + benchmark_fn[executor], [bias, x, bcast_axis], ) diff --git a/benchmarks/python/test_dropout_layernorm_bwd.py b/benchmarks/python/test_dropout_layernorm_bwd.py index 6acaa012c5c..380a2085b09 100644 --- a/benchmarks/python/test_dropout_layernorm_bwd.py +++ b/benchmarks/python/test_dropout_layernorm_bwd.py @@ -189,16 +189,16 @@ def test_dropout_layernorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_layernorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -217,13 +217,16 @@ def dropout_layernorm_fwd(): ) # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd - output = fwd_fn() + fwd_fn = { + "eager": dropout_layernorm_fwd, + "torchcompile": torch.compile(dropout_layernorm_fwd), + } + outputs = fwd_fn[executor]() # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=dropout_layernorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_layernorm_fwd.py b/benchmarks/python/test_dropout_layernorm_fwd.py index 47854fcd2d7..4408a2bd611 100644 --- a/benchmarks/python/test_dropout_layernorm_fwd.py +++ b/benchmarks/python/test_dropout_layernorm_fwd.py @@ -160,16 +160,16 @@ def test_dropout_layernorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", 
"torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_layernorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -181,10 +181,15 @@ def test_dropout_layernorm_fwd_baseline_benchmark( dropout_p, ] + benchmark_fn = { + "eager": dropout_layernorm_fwd, + "torchcompile": torch.compile(dropout_layernorm_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd, + benchmark_fn[executor], inputs, iobytes=dropout_layernorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_rmsnorm_bwd.py b/benchmarks/python/test_dropout_rmsnorm_bwd.py index 8c61c51e2d9..d196e76f57b 100644 --- a/benchmarks/python/test_dropout_rmsnorm_bwd.py +++ b/benchmarks/python/test_dropout_rmsnorm_bwd.py @@ -169,16 +169,16 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_rmsnorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) @@ -191,12 +191,15 @@ def dropout_rmsnorm_fwd(): output = weights * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5) return output - fwd_fn = torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd - output = fwd_fn() + fwd_fn = { + "eager": dropout_rmsnorm_fwd, + "torchcompile": torch.compile(dropout_rmsnorm_fwd), + } + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=dropout_rmsnorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_rmsnorm_fwd.py b/benchmarks/python/test_dropout_rmsnorm_fwd.py index a93a8caf547..aea2674df9d 100644 --- a/benchmarks/python/test_dropout_rmsnorm_fwd.py +++ b/benchmarks/python/test_dropout_rmsnorm_fwd.py @@ -145,16 +145,16 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [input1, input2, weights]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_rmsnorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -165,10 +165,15 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark( dropout_p, ] + benchmark_fn = { + "eager": dropout_rmsnorm_fwd, + "torchcompile": torch.compile(dropout_rmsnorm_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd, + benchmark_fn[executor], inputs, iobytes=dropout_rmsnorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_gelu_bwd.py b/benchmarks/python/test_gelu_bwd.py index 648f0317cf9..ffd0b25c6a2 100644 --- 
a/benchmarks/python/test_gelu_bwd.py
+++ b/benchmarks/python/test_gelu_bwd.py
@@ -88,16 +88,16 @@ def test_gelu_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
@@ -106,12 +106,15 @@ def test_gelu_bwd_baseline_benchmark(
     def gelu_fwd():
         return torch.nn.functional.gelu(inputs + bias, approximate="tanh")

-    fwd_fn = torch.compile(gelu_fwd) if compile else gelu_fwd
-    eager_output = fwd_fn()
+    fwd_fn = {
+        "eager": gelu_fwd,
+        "torchcompile": torch.compile(gelu_fwd),
+    }
+    outputs = fwd_fn[executor]()

     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=gelu_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_gelu_bwd_reduction.py b/benchmarks/python/test_gelu_bwd_reduction.py
index 09dfd53d88a..e860826eb49 100644
--- a/benchmarks/python/test_gelu_bwd_reduction.py
+++ b/benchmarks/python/test_gelu_bwd_reduction.py
@@ -103,7 +103,7 @@ def test_gelu_bwd_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -112,19 +112,23 @@ def test_gelu_bwd_reduction_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
     grads = torch.randn(size, device="cuda", dtype=dtype)
     eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh")
+
+    benchmark_fn = {
+        "eager": gelu_bwd_reduction_torch,
+        "torchcompile": torch.compile(gelu_bwd_reduction_torch),
+    }
+
     run_benchmark(
         benchmark,
-        torch.compile(gelu_bwd_reduction_torch)
-        if compile
-        else gelu_bwd_reduction_torch,
+        benchmark_fn[executor],
         [eager_output, grads, inputs, reduction_axis],
         iobytes=gelu_bwd_reduction_iobytes(size, dtype, reduction_axis),
     )
diff --git a/benchmarks/python/test_gelu_fwd.py b/benchmarks/python/test_gelu_fwd.py
index fa5f891ef8a..2f208b2c090 100644
--- a/benchmarks/python/test_gelu_fwd.py
+++ b/benchmarks/python/test_gelu_fwd.py
@@ -67,22 +67,26 @@ def test_gelu_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = [
         torch.randn(size, device="cuda", dtype=dtype, requires_grad=True),  # in_tensor
         torch.ones(size[-1], device="cuda", dtype=dtype),  # bias
     ]
+
+    benchmark_fn = {
+        "eager": gelu_fwd_fn,
+        "torchcompile": torch.compile(gelu_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
-    run_benchmark(
-        benchmark, torch.compile(gelu_fwd_fn) if compile else gelu_fwd_fn, inputs
-    )
+    run_benchmark(benchmark, benchmark_fn[executor], inputs)
diff --git a/benchmarks/python/test_groupnorm_fwd.py b/benchmarks/python/test_groupnorm_fwd.py
index af4c023d7d7..8c729e115d7 100644
--- a/benchmarks/python/test_groupnorm_fwd.py
+++ b/benchmarks/python/test_groupnorm_fwd.py
@@ -128,35 +128,16 @@ def test_groupnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, weight, bias])


-@pytest.mark.parametrize("size", generate_input_sizes(dims=4))
-@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
-def test_groupnorm_fwd_thunder_benchmark(
-    benchmark,
-    size: tuple,
-    dtype: torch.dtype,
-):
-    N, C, H, W = size
-    x = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
-    weight = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True)
-    bias = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True)
-    num_groups = get_n_groups(C)
-    # thunder compiled model
-    groupnorm_fwd_jit = thunder.jit(
-        groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
-    )
-    run_benchmark(benchmark, groupnorm_fwd_jit, [x, weight, bias, num_groups])
-
-
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_groupnorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     N, C, H, W = size
     x = torch.randn(size, device="cuda", dtype=dtype)
@@ -164,8 +145,15 @@ def test_groupnorm_fwd_baseline_benchmark(
     bias = torch.randn(C, device="cuda", dtype=dtype)
     num_groups = get_n_groups(C)

+    benchmark_fn = {
+        "eager": groupnorm_fwd,
+        "torchcompile": torch.compile(groupnorm_fwd),
+        "thunder": thunder.jit(
+            groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex]
+        ),
+    }
     run_benchmark(
         benchmark,
-        torch.compile(groupnorm_fwd) if compile else groupnorm_fwd,
+        benchmark_fn[executor],
         [x, weight, bias, num_groups],
     )
diff --git a/benchmarks/python/test_huggingface_attn_bwd.py b/benchmarks/python/test_huggingface_attn_bwd.py
index dd8c9f80114..bcb2b4d9268 100644
--- a/benchmarks/python/test_huggingface_attn_bwd.py
+++ b/benchmarks/python/test_huggingface_attn_bwd.py
@@ -107,16 +107,16 @@ def test_huggingface_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -134,14 +134,17 @@ def huggingface_attn_fwd():
         return output

     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": huggingface_attn_fwd,
+        "torchcompile": torch.compile(huggingface_attn_fwd),
+    }
+    outputs = fwd_fn[executor]()
     grads = torch.randn(batch_size * nh, seq_len, seq_len, device="cuda", dtype=dtype)

     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=huggingface_attn_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_huggingface_attn_fwd.py b/benchmarks/python/test_huggingface_attn_fwd.py
index 27a013a8481..714a12e41d1 100644
--- a/benchmarks/python/test_huggingface_attn_fwd.py
+++ b/benchmarks/python/test_huggingface_attn_fwd.py
@@ -135,16 +135,16 @@ def test_huggingface_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [attention_mask, inputs])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -153,10 +153,15 @@ def test_huggingface_attn_fwd_baseline_benchmark(
     inputs = torch.randn(
         batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype
     )
+
+    benchmark_fn = {
+        "eager": huggingface_attn_fwd,
+        "torchcompile": torch.compile(huggingface_attn_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd,
+        benchmark_fn[executor],
         [attention_mask, inputs, size, dropout_p],
         iobytes=huggingface_attn_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_instancenorm_bwd.py b/benchmarks/python/test_instancenorm_bwd.py
index 99d3e3baf2b..4022c5f395f 100644
--- a/benchmarks/python/test_instancenorm_bwd.py
+++ b/benchmarks/python/test_instancenorm_bwd.py
@@ -30,13 +30,13 @@ def test_instancenorm_bwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_instancenorm_bwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_bwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "instance_norm"
+        benchmark, size, dtype, channels_last, executor, "instance_norm"
     )
diff --git a/benchmarks/python/test_instancenorm_fwd.py b/benchmarks/python/test_instancenorm_fwd.py
index 3b8f6564f51..3335fcc7bbf 100644
--- a/benchmarks/python/test_instancenorm_fwd.py
+++ b/benchmarks/python/test_instancenorm_fwd.py
@@ -29,13 +29,13 @@ def test_instancenorm_fwd_nvf_benchmark(
     )


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
 def test_instancenorm_fwd_baseline_benchmark(
-    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool
+    benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str
 ):
     norm_fwd_baseline_benchmark(
-        benchmark, size, dtype, channels_last, compile, "instance_norm"
+        benchmark, size, dtype, channels_last, executor, "instance_norm"
     )
diff --git a/benchmarks/python/test_layernorm_bwd.py b/benchmarks/python/test_layernorm_bwd.py
index d76046575dc..926ab2ef0fb 100644
--- a/benchmarks/python/test_layernorm_bwd.py
+++ b/benchmarks/python/test_layernorm_bwd.py
@@ -146,16 +146,16 @@ def test_layernorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()

     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
@@ -171,13 +171,16 @@ def layernorm_fwd():
             bias=bias,
         )

-    fwd_fn = torch.compile(layernorm_fwd) if compile else layernorm_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": layernorm_fwd,
+        "torchcompile": torch.compile(layernorm_fwd),
+    }
+    outputs = fwd_fn[executor]()

     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=layernorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_layernorm_fwd.py b/benchmarks/python/test_layernorm_fwd.py
index c6a5f24c8dc..52aa5838f62 100644
--- a/benchmarks/python/test_layernorm_fwd.py
+++ b/benchmarks/python/test_layernorm_fwd.py
@@ -106,16 +106,16 @@ def test_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_layernorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, hidden_size = size
     inputs = [
@@ -124,10 +124,15 @@ def test_layernorm_fwd_baseline_benchmark(
         torch.randn(hidden_size, device="cuda", dtype=dtype),
     ]

+    benchmark_fn = {
+        "eager": layernorm_fwd,
+        "torchcompile": torch.compile(layernorm_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(layernorm_fwd) if compile else layernorm_fwd,
+        benchmark_fn[executor],
         inputs,
         iobytes=layernorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 865caba2e31..2448ac07fd9 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -25,14 +25,14 @@ def load_matmul_problems():


 @pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
-@pytest.mark.parametrize("compile", [False], ids=["eager"])
+@pytest.mark.parametrize("executor", ["eager"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
 @pytest.mark.parametrize(
     "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
 )
 def test_matmul_baseline_benchmark(
     benchmark,
-    compile: bool,
+    executor: str,
     config: tuple,
     dtype: torch.dtype,
     half_reduction: bool,
diff --git a/benchmarks/python/test_nanogpt_attn_bwd.py b/benchmarks/python/test_nanogpt_attn_bwd.py
index 2efb8e7d58d..88d8d56e26d 100644
--- a/benchmarks/python/test_nanogpt_attn_bwd.py
+++ b/benchmarks/python/test_nanogpt_attn_bwd.py
@@ -124,16 +124,16 @@ def test_nanogpt_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask, bias_mask])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -154,14 +154,18 @@ def nanogpt_attn_fwd():
         return output

     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd
-    output = fwd_fn()
+    fwd_fn = {
+        "eager": nanogpt_attn_fwd,
+        "torchcompile": torch.compile(nanogpt_attn_fwd),
+    }
+    outputs = fwd_fn[executor]()
+
     grads = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype)

     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=nanogpt_attn_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_nanogpt_attn_fwd.py b/benchmarks/python/test_nanogpt_attn_fwd.py
index 4dbd5821c59..5336d96cba5 100644
--- a/benchmarks/python/test_nanogpt_attn_fwd.py
+++ b/benchmarks/python/test_nanogpt_attn_fwd.py
@@ -137,16 +137,16 @@ def test_nanogpt_attn_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, bias])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_nanogpt_attn_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     batch_size, seq_len, nh, n_embd = size
     dropout_p = 0.2
@@ -154,10 +154,16 @@ def test_nanogpt_attn_fwd_baseline_benchmark(
     bias = torch.tril(torch.ones(seq_len, seq_len, device="cuda")).view(
         1, 1, seq_len, seq_len
     )
+
+    benchmark_fn = {
+        "eager": nanogpt_attn_fwd,
+        "torchcompile": torch.compile(nanogpt_attn_fwd),
+    }
+
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd,
+        benchmark_fn[executor],
         [inputs, bias, size, dropout_p],
         iobytes=nanogpt_attn_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_pointwise_mul.py b/benchmarks/python/test_pointwise_mul.py
index 0162950cc47..31ec20d6b10 100644
--- a/benchmarks/python/test_pointwise_mul.py
+++ b/benchmarks/python/test_pointwise_mul.py
@@ -50,21 +50,26 @@ def test_pointwise_mul_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_pointwise_mul_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": pointwise_mul_fwd_fn,
+        "torchcompile": torch.compile(pointwise_mul_fwd_fn),
+    }
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(pointwise_mul_fwd_fn) if compile else pointwise_mul_fwd_fn,
+        benchmark_fn[executor],
         [input],
     )
diff --git a/benchmarks/python/test_reduction.py b/benchmarks/python/test_reduction.py
index f734769a1e5..303f65609b7 100644
--- a/benchmarks/python/test_reduction.py
+++ b/benchmarks/python/test_reduction.py
@@ -53,7 +53,7 @@ def test_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -62,14 +62,19 @@ def test_reduction_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": reduction_fwd_fn,
+        "torchcompile": torch.compile(reduction_fwd_fn),
+    }
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(reduction_fwd_fn) if compile else reduction_fwd_fn,
+        benchmark_fn[executor],
         [input, reduction_axis],
     )
diff --git a/benchmarks/python/test_reduction_epilogue.py b/benchmarks/python/test_reduction_epilogue.py
index 231090e4135..aacf7326d29 100644
--- a/benchmarks/python/test_reduction_epilogue.py
+++ b/benchmarks/python/test_reduction_epilogue.py
@@ -67,7 +67,7 @@ def test_reduction_epilogue_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, epilogue])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0])
@@ -76,17 +76,21 @@ def test_reduction_epilogue_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     x = torch.randn(size, device="cuda", dtype=dtype)
     epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype)
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
+
+    benchmark_fn = {
+        "eager": reduction_epilogue_fwd_fn,
+        "torchcompile": torch.compile(reduction_epilogue_fwd_fn),
+    }
+
     run_benchmark(
         benchmark,
-        torch.compile(reduction_epilogue_fwd_fn)
-        if compile
-        else reduction_epilogue_fwd_fn,
+        benchmark_fn[executor],
         [x, epilogue, reduction_axis],
     )
diff --git a/benchmarks/python/test_rmsnorm_bwd.py b/benchmarks/python/test_rmsnorm_bwd.py
index 697aa8848ab..2fb4698fdbf 100644
--- a/benchmarks/python/test_rmsnorm_bwd.py
+++ b/benchmarks/python/test_rmsnorm_bwd.py
@@ -112,16 +112,16 @@ def test_rmsnorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, rms_eps, grads, weights])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -134,13 +134,13 @@ def rmsnorm_fwd():
         return output

     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(rmsnorm_fwd) if compile else rmsnorm_fwd
-    output = fwd_fn()
+    fwd_fn = {"eager": rmsnorm_fwd, "torchcompile": torch.compile(rmsnorm_fwd)}
+    outputs = fwd_fn[executor]()

     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=rmsnorm_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_rmsnorm_fwd.py b/benchmarks/python/test_rmsnorm_fwd.py
index b7839b631de..0114ae6507c 100644
--- a/benchmarks/python/test_rmsnorm_fwd.py
+++ b/benchmarks/python/test_rmsnorm_fwd.py
@@ -86,24 +86,28 @@ def test_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, weights])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(size, device="cuda", dtype=dtype)
     weights = torch.randn(size[1], device="cuda", dtype=dtype)

+    benchmark_fn = {
+        "eager": rmsnorm_fwd_fn,
+        "torchcompile": torch.compile(rmsnorm_fwd_fn),
+    }
     # Manually compute IOBytes: See PR #1725
     run_benchmark(
         benchmark,
-        torch.compile(rmsnorm_fwd_fn) if compile else rmsnorm_fwd_fn,
+        benchmark_fn[executor],
         [inputs, weights],
         iobytes=rmsnorm_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_scale_bias_relu_bwd.py b/benchmarks/python/test_scale_bias_relu_bwd.py
index a85c62a1592..c98d32382b5 100644
--- a/benchmarks/python/test_scale_bias_relu_bwd.py
+++ b/benchmarks/python/test_scale_bias_relu_bwd.py
@@ -79,16 +79,16 @@ def test_sbr_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [scale, bool_mask, grads])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(*size, device="cuda", dtype=dtype)
@@ -99,12 +99,12 @@ def sbr_fwd():
         return torch.nn.functional.relu(inputs * scale + bias)

     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(sbr_fwd) if compile else sbr_fwd
-    eager_output = sbr_fwd()
+    fwd_fn = {"eager": sbr_fwd, "torchcompile": torch.compile(sbr_fwd)}
+    outputs = fwd_fn[executor]()

     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=sbr_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_scale_bias_relu_fwd.py b/benchmarks/python/test_scale_bias_relu_fwd.py
index ede13dbb767..c09b11296c3 100644
--- a/benchmarks/python/test_scale_bias_relu_fwd.py
+++ b/benchmarks/python/test_scale_bias_relu_fwd.py
@@ -82,24 +82,26 @@ def test_sbr_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, scale, inputs])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     bias = torch.ones(size[-1], device="cuda", dtype=dtype)
     scale = torch.ones(size[-1], device="cuda", dtype=dtype)

+    benchmark_fn = {"eager": sbr_fwd_fn, "torchcompile": torch.compile(sbr_fwd_fn)}
+
     run_benchmark(
         benchmark,
-        torch.compile(sbr_fwd_fn) if compile else sbr_fwd_fn,
+        benchmark_fn[executor],
         [bias, scale, inputs],
         iobytes=sbr_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_silu_mul_bwd.py b/benchmarks/python/test_silu_mul_bwd.py
index 98995e860b1..25276dec474 100644
--- a/benchmarks/python/test_silu_mul_bwd.py
+++ b/benchmarks/python/test_silu_mul_bwd.py
@@ -79,16 +79,16 @@ def test_silu_mul_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, x, y])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     x = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     y = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
@@ -98,12 +98,12 @@ def silu_mul_fwd():
         return torch.nn.functional.silu(x) * y

     # Compile the fwd fn for torchcompile
-    fwd_fn = torch.compile(silu_mul_fwd) if compile else silu_mul_fwd
-    eager_output = fwd_fn()
+    fwd_fn = {"eager": silu_mul_fwd, "torchcompile": torch.compile(silu_mul_fwd)}
+    outputs = fwd_fn[executor]()

     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [eager_output, grads],
+        [outputs, grads],
         iobytes=silu_mul_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_silu_mul_fwd.py b/benchmarks/python/test_silu_mul_fwd.py
index 0f1e86d0d56..3de05067cb2 100644
--- a/benchmarks/python/test_silu_mul_fwd.py
+++ b/benchmarks/python/test_silu_mul_fwd.py
@@ -56,22 +56,27 @@ def test_silu_mul_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     inputs = [torch.randn(*size, device="cuda", dtype=dtype) for _ in range(2)]

+    benchmark_fn = {
+        "eager": silu_mul_fwd_fn,
+        "torchcompile": torch.compile(silu_mul_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(silu_mul_fwd_fn) if compile else silu_mul_fwd_fn,
+        benchmark_fn[executor],
         inputs,
     )
diff --git a/benchmarks/python/test_softmax_bwd.py b/benchmarks/python/test_softmax_bwd.py
index 86f22654380..049da18fe27 100644
--- a/benchmarks/python/test_softmax_bwd.py
+++ b/benchmarks/python/test_softmax_bwd.py
@@ -91,7 +91,7 @@ def test_softmax_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -100,9 +100,9 @@ def test_softmax_bwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -110,12 +110,12 @@ def test_softmax_bwd_baseline_benchmark(
     def softmax_fwd():
         return torch.nn.functional.softmax(input, dim=reduction_axis)

-    fwd_fn = torch.compile(softmax_fwd) if compile else softmax_fwd
-    output = fwd_fn()
+    fwd_fn = {"eager": softmax_fwd, "torchcompile": torch.compile(softmax_fwd)}
+    outputs = fwd_fn[executor]()

     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [output, grads],
+        [outputs, grads],
         iobytes=softmax_bwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_softmax_fwd.py b/benchmarks/python/test_softmax_fwd.py
index 2e672eb2e30..d138aa1ced1 100644
--- a/benchmarks/python/test_softmax_fwd.py
+++ b/benchmarks/python/test_softmax_fwd.py
@@ -81,7 +81,7 @@ def test_softmax_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
@@ -90,15 +90,19 @@ def test_softmax_fwd_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input = torch.randn(size, device="cuda", dtype=dtype)

+    benchmark_fn = {
+        "eager": softmax_fwd_fn,
+        "torchcompile": torch.compile(softmax_fwd_fn),
+    }
     run_benchmark(
         benchmark,
-        torch.compile(softmax_fwd_fn) if compile else softmax_fwd_fn,
+        benchmark_fn[executor],
         [input, reduction_axis],
         iobytes=softmax_fwd_iobytes(size, dtype),
     )
diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index cf290f278a5..a4e3198cc9a 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -74,7 +74,7 @@ def test_transpose_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2])


-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=3))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)])
@@ -83,15 +83,21 @@ def test_transpose_baseline_benchmark(
     size: tuple,
     dtype: torch.dtype,
     axes: list,
-    compile: bool,
+    executor: str,
 ):
-    if compile:
+    if executor == "torchcompile":
         clear_dynamo_cache()
     input1 = torch.randn(size, device="cuda", dtype=dtype)
     input2 = torch.randn(size, device="cuda", dtype=dtype)
+
+    benchmark_fn = {
+        "eager": transpose_fwd_fn,
+        "torchcompile": torch.compile(transpose_fwd_fn),
+    }
+
     # Inputs and outputs are same as nvFuser, no need for manual IOByte computation
     run_benchmark(
         benchmark,
-        torch.compile(transpose_fwd_fn) if compile else transpose_fwd_fn,
+        benchmark_fn[executor],
         [input1, input2, axes[0], axes[1]],
     )
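
Note on the pattern above: every baseline benchmark now looks its callable up in a small dict keyed by the `executor` string instead of branching on a boolean `compile` flag, so one parametrization covers eager, `torch.compile`, and (for groupnorm) thunder without stacking conditionals. A minimal, self-contained sketch of the idea; the function, shapes, and device below are illustrative and not taken from any one file in this diff:

```python
# Sketch of the executor-dispatch pattern used throughout this diff.
# Assumes only PyTorch >= 2.0 (for torch.compile); names are illustrative.
import torch


def gelu_fwd(inputs: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    return torch.nn.functional.gelu(inputs + bias, approximate="tanh")


# One entry per executor: supporting a new backend (e.g. thunder.jit, as
# test_groupnorm_fwd.py does) means adding a key, not another if/else branch.
# torch.compile is lazy -- it only compiles on the first call -- so building
# entries that a given test run never invokes is cheap.
benchmark_fn = {
    "eager": gelu_fwd,
    "torchcompile": torch.compile(gelu_fwd),
}

executor = "torchcompile"  # supplied by @pytest.mark.parametrize in the tests
inputs = torch.randn(8, 8)
bias = torch.ones(8)
out = benchmark_fn[executor](inputs, bias)
```

A side effect of parametrizing on strings is that the value itself becomes part of the generated test id, so the old `ids=["eager", "compile"]` lists are no longer needed, and a selection such as `pytest -k torchcompile` should still pick out the compiled variants.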