diff --git a/CMakeLists.txt b/CMakeLists.txt index a6eb9fed679..de3e52f5055 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,10 @@ set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc") set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party") option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF) +option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF) +if (NVFUSER_EXPLICIT_ERROR_CHECK) + add_compile_definitions(NVFUSER_EXPLICIT_ERROR_CHECK) +endif() option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF) include(CMakeDependentOption) @@ -545,6 +549,7 @@ list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp ${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp + ${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp ${NVFUSER_ROOT}/tests/cpp/test_loop_domain_scheduling.cpp diff --git a/benchmarks/cpp/batch_norm_channels_first.cpp b/benchmarks/cpp/batch_norm_channels_first.cpp index 1bc1845d912..22098787766 100644 --- a/benchmarks/cpp/batch_norm_channels_first.cpp +++ b/benchmarks/cpp/batch_norm_channels_first.cpp @@ -78,7 +78,7 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BatchNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -102,7 +102,7 @@ static void NvFuserScheduler_BatchNorm( std::vector aten_inputs( {at_x, at_weight, at_bias, at_run_mean, at_run_var}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/batch_norm_channels_first_backward.cpp b/benchmarks/cpp/batch_norm_channels_first_backward.cpp index 
271d04eece9..0edd2d3e52d 100644 --- a/benchmarks/cpp/batch_norm_channels_first_backward.cpp +++ b/benchmarks/cpp/batch_norm_channels_first_backward.cpp @@ -89,7 +89,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BatchNorm_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -115,7 +115,7 @@ static void NvFuserScheduler_BatchNorm_BWD( std::vector aten_inputs( {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/batch_norm_channels_last.cpp b/benchmarks/cpp/batch_norm_channels_last.cpp index dc21fede5f6..bbdd5c82e63 100644 --- a/benchmarks/cpp/batch_norm_channels_last.cpp +++ b/benchmarks/cpp/batch_norm_channels_last.cpp @@ -79,7 +79,7 @@ static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BatchNorm_nhwc( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -103,7 +103,7 @@ static void NvFuserScheduler_BatchNorm_nhwc( std::vector aten_inputs( {at_x, at_weight, at_bias, at_run_mean, at_run_var}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/batch_norm_channels_last_backward.cpp b/benchmarks/cpp/batch_norm_channels_last_backward.cpp index a11627139d4..e40508fffe3 100644 --- 
a/benchmarks/cpp/batch_norm_channels_last_backward.cpp +++ b/benchmarks/cpp/batch_norm_channels_last_backward.cpp @@ -90,7 +90,7 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BatchNorm_nhwc_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -116,7 +116,7 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD( std::vector aten_inputs( {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/bert.cpp b/benchmarks/cpp/bert.cpp index d22e234cde1..94edab48479 100644 --- a/benchmarks/cpp/bert.cpp +++ b/benchmarks/cpp/bert.cpp @@ -118,7 +118,7 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_DivMaxSoftDropFwd( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto w = benchmark_state.range(0); auto x = benchmark_state.range(1); @@ -135,7 +135,7 @@ static void NvFuserScheduler_DivMaxSoftDropFwd( std::vector at_inputs = {t0, t1}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); @@ -143,7 +143,7 @@ static void NvFuserScheduler_DivMaxSoftDropFwd( static void NvFuserScheduler_DivMaxSoftDropBwd( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto w = benchmark_state.range(0); auto x 
= benchmark_state.range(1); @@ -162,7 +162,7 @@ static void NvFuserScheduler_DivMaxSoftDropBwd( std::vector at_inputs = {t0, t1, t2, t3}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); // Some reason t1 isn't used, ignore it. bytes -= @@ -228,7 +228,7 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BiasDropoutAddLayernormFwd( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto x = benchmark_state.range(0); auto y = benchmark_state.range(1); @@ -247,7 +247,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd( std::vector at_inputs = {t0, t1, t2, t3, t4}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); @@ -304,7 +304,7 @@ static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BiasDropoutAddLayernormBwd1( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto x = benchmark_state.range(0); auto y = benchmark_state.range(1); @@ -322,7 +322,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1( std::vector at_inputs = {t0, t1, t2, t3}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); @@ -380,7 +380,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BiasDropoutAddLayernormBwd2( benchmark::State& benchmark_state, - 
FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto x = benchmark_state.range(0); auto y = benchmark_state.range(1); @@ -398,7 +398,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2( std::vector at_inputs = {t4, t5, t1, t8}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); @@ -438,7 +438,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_BiasDropoutAddLayernormBwd3( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { auto x = benchmark_state.range(0); auto y = benchmark_state.range(1); @@ -454,7 +454,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3( std::vector at_inputs = {t0, t21}; auto bytes = - runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations())); diff --git a/benchmarks/cpp/broadcast.cpp b/benchmarks/cpp/broadcast.cpp index c3accd47d2e..6ef7564a6e0 100644 --- a/benchmarks/cpp/broadcast.cpp +++ b/benchmarks/cpp/broadcast.cpp @@ -56,7 +56,7 @@ static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) { static void NvFuserScheduler_Broadcast( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int bcast_dim) { auto bcast_size = benchmark_state.range(0); @@ -74,7 +74,7 @@ static void NvFuserScheduler_Broadcast( std::vector aten_inputs({t0, t1}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, 
aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/gelu_backward.cpp b/benchmarks/cpp/gelu_backward.cpp index 512ea915ae2..ae1e0ce2473 100644 --- a/benchmarks/cpp/gelu_backward.cpp +++ b/benchmarks/cpp/gelu_backward.cpp @@ -162,8 +162,8 @@ static void NvFuserScheduler_GeluBackward_Compile( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, inputs, heuristic_params->lparams); } } @@ -187,14 +187,14 @@ static void NvFuserScheduler_GeluBackward_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, inputs, heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); for (auto _ : benchmark_state) { - outputs = executor.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); clearL2Cache(); } @@ -218,11 +218,11 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, inputs, heuristic_params->lparams); runBenchmarkIterations( - benchmark_state, &executor, inputs, heuristic_params->lparams); + benchmark_state, &ke, inputs, heuristic_params->lparams); } BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_GpuOnly) @@ -247,13 +247,13 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly( auto heuristic_params = 
SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.setExecuteKernelFlag(false); - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.setExecuteKernelFlag(false); + ke.compile(&fusion, inputs, heuristic_params->lparams); for (auto _ : benchmark_state) { - outputs = executor.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); } } diff --git a/benchmarks/cpp/gelu_backward_reduction.cpp b/benchmarks/cpp/gelu_backward_reduction.cpp index 60ea8ed2b29..c4e97fc6d3b 100644 --- a/benchmarks/cpp/gelu_backward_reduction.cpp +++ b/benchmarks/cpp/gelu_backward_reduction.cpp @@ -93,7 +93,7 @@ static void setupGeluBackwardReduction( static void NvFuserScheduler_GeluBackwardReduction( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int reduction_dim) { auto reduction_size = benchmark_state.range(0); @@ -112,7 +112,7 @@ static void NvFuserScheduler_GeluBackwardReduction( std::vector aten_inputs = {aten_input_grad, aten_input_x}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // inputs: gradient tensor + input tensor // outputs: output, output_of_reduction diff --git a/benchmarks/cpp/heuristic_cache.cpp b/benchmarks/cpp/heuristic_cache.cpp index 29d5b13bef2..5ab923bd6d0 100644 --- a/benchmarks/cpp/heuristic_cache.cpp +++ b/benchmarks/cpp/heuristic_cache.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -84,12 +84,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = 
std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormBackward_HeuristicCache( @@ -98,14 +98,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); @@ -120,7 +120,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -141,11 +141,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void 
NvFuserScheduler_LayerNormForward_HeuristicCache( @@ -154,14 +154,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicCache( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); diff --git a/benchmarks/cpp/heuristic_lookup.cpp b/benchmarks/cpp/heuristic_lookup.cpp index aecc7dc824f..16be106b728 100644 --- a/benchmarks/cpp/heuristic_lookup.cpp +++ b/benchmarks/cpp/heuristic_lookup.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -86,12 +86,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( @@ -100,14 +100,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = 
getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); @@ -122,7 +122,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -143,11 +143,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormForward_HeuristicLookup( @@ -156,14 +156,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicLookup( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); diff --git a/benchmarks/cpp/indexselect.cpp b/benchmarks/cpp/indexselect.cpp index ba5c7054cab..01eefc2d0a1 100644 --- a/benchmarks/cpp/indexselect.cpp +++ b/benchmarks/cpp/indexselect.cpp @@ -132,8 +132,8 @@ static void NvFuserScheduler_IndexSelect_Compile( &fusion, 
SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, c10::ArrayRef(inputs), heuristic_params->lparams); } } @@ -155,8 +155,8 @@ static void NvFuserScheduler_IndexSelect_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); @@ -164,7 +164,7 @@ static void NvFuserScheduler_IndexSelect_RunFusion( at::Tensor output = at::empty_like(inputs[0].toTensor()); for (auto _ : benchmark_state) { - executor.runFusion( + ke.run( c10::ArrayRef(inputs), {output}, heuristic_params->lparams); @@ -235,7 +235,7 @@ static void setupIndexSelect(Fusion* fusion, DataType dtype, int select_dim) { static void NvFuserScheduler_IndexSelectSimple( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int select_dim) { auto elem_size = benchmark_state.range(0); @@ -257,7 +257,7 @@ static void NvFuserScheduler_IndexSelectSimple( std::vector aten_inputs = {t0, t1}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -267,7 +267,7 @@ static void NvFuserScheduler_IndexSelectSimple( static void NvFuserScheduler_IndexSelect( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int select_dim) { auto elem_size = benchmark_state.range(0); @@ -289,7 +289,7 @@ static void NvFuserScheduler_IndexSelect( std::vector aten_inputs = {t2, t0, t1}; - runBenchmarkIterations(benchmark_state, 
fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/instance_norm.cpp b/benchmarks/cpp/instance_norm.cpp index d4c6707e912..2f3a832d8db 100644 --- a/benchmarks/cpp/instance_norm.cpp +++ b/benchmarks/cpp/instance_norm.cpp @@ -81,7 +81,7 @@ static void setupInstanceNorm( static void NvFuserScheduler_InstanceNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, bool channels_last_3d = false) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -116,7 +116,7 @@ static void NvFuserScheduler_InstanceNorm( at_x, at_weight, at_bias, at_mean, at_var}; std::vector outputs; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t kChannels = benchmark_state.range(2); @@ -165,7 +165,7 @@ static void setupInstanceNormNHWC(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_InstanceNormNHWC( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -186,7 +186,7 @@ static void NvFuserScheduler_InstanceNormNHWC( std::vector aten_inputs = {at_x, at_weight, at_bias}; std::vector outputs; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t kChannels = benchmark_state.range(2); diff --git a/benchmarks/cpp/layer_norm.cpp b/benchmarks/cpp/layer_norm.cpp index 445b1637274..706f1e8fa84 100644 --- a/benchmarks/cpp/layer_norm.cpp +++ b/benchmarks/cpp/layer_norm.cpp @@ -67,7 +67,7 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) { static void 
NvFuserScheduler_LayerNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -84,7 +84,7 @@ static void NvFuserScheduler_LayerNorm( std::vector aten_inputs({input, weight, bias}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -142,7 +142,7 @@ static void Baseline_LayerNorm_fp16(benchmark::State& benchmark_state) { static void NvFuserScheduler_TIMM_LayerNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -162,7 +162,7 @@ static void NvFuserScheduler_TIMM_LayerNorm( std::vector aten_inputs({input, weight, bias}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/layer_norm_backward.cpp b/benchmarks/cpp/layer_norm_backward.cpp index a14e89fce4f..3da9de991dc 100644 --- a/benchmarks/cpp/layer_norm_backward.cpp +++ b/benchmarks/cpp/layer_norm_backward.cpp @@ -80,7 +80,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_LayerNorm_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -103,7 +103,7 @@ static void NvFuserScheduler_LayerNorm_BWD( std::vector aten_inputs( {grad_out, input, weight, bias, mean, rstd}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, 
aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/layer_norm_fused.cpp b/benchmarks/cpp/layer_norm_fused.cpp index 823b571aa1c..12cba780d4b 100644 --- a/benchmarks/cpp/layer_norm_fused.cpp +++ b/benchmarks/cpp/layer_norm_fused.cpp @@ -84,7 +84,7 @@ static void setupLayerNormFused(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_LayerNormFused( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Half); @@ -104,7 +104,7 @@ static void NvFuserScheduler_LayerNormFused( std::vector aten_inputs({tv0, tv1, tv2, tv3, tv4}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/lstm_cell.cpp b/benchmarks/cpp/lstm_cell.cpp index 3c7b98a4c84..7fe205f6312 100644 --- a/benchmarks/cpp/lstm_cell.cpp +++ b/benchmarks/cpp/lstm_cell.cpp @@ -155,8 +155,8 @@ static void NvFuserScheduler_LstmCell_Compile( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); } } @@ -182,14 +182,14 @@ static void NvFuserScheduler_LstmCell_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); C10_CUDA_CHECK(cudaDeviceSynchronize()); for (auto _ : benchmark_state) { - outputs = executor.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), 
heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); } } @@ -220,11 +220,11 @@ static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); runBenchmarkIterations( - benchmark_state, &executor, inputs, heuristic_params->lparams); + benchmark_state, &ke, inputs, heuristic_params->lparams); } BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_GpuOnly, Small, 512, 64) @@ -259,13 +259,13 @@ static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.setExecuteKernelFlag(false); - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.setExecuteKernelFlag(false); + ke.compile(&fusion, inputs); for (auto _ : benchmark_state) { - outputs = executor.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); } } diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp index 4f93dfbaf62..a9b10655aa0 100644 --- a/benchmarks/cpp/matmul.cpp +++ b/benchmarks/cpp/matmul.cpp @@ -175,19 +175,19 @@ static void SingleMatmulBase( // Compile kernel auto launch_constraints = LaunchParams(); - FusionExecutor fe; - fe.compileFusion(fusion, args, launch_constraints, cparams); + KernelExecutor ke; + ke.compile(fusion, args, launch_constraints, cparams); NVF_CHECK( - getBankConflictInfo(fe.kernel(), launch_constraints).empty(), + getBankConflictInfo(ke.kernel(), launch_constraints).empty(), "Shared memory bank conflict not removed."); std::vector aten_inputs({inputs.first, inputs.second}); // Warm up run - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); 
checkMatch(expected_output, outputs.at(0).to(at::kDouble), k); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); // TODO: FLOPS calculation } @@ -355,19 +355,19 @@ static void SingleMatmulPartitionedK( cparams.index_type = computeIndexType(M, N, K); // Compile kernel - FusionExecutor fe; + KernelExecutor ke; auto lparams = LaunchParams(); - fe.compileFusion(fusion, args, lparams, cparams); + ke.compile(fusion, args, lparams, cparams); NVF_CHECK( - getBankConflictInfo(fe.kernel(), lparams).empty(), + getBankConflictInfo(ke.kernel(), lparams).empty(), "Shared memory bank conflict not removed."); // Warm up run - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); checkMatch(expected_output, outputs.at(0).to(at::kDouble), Ki); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); // TODO: FLOPS calculation } @@ -461,21 +461,21 @@ static void NvFuserScheduler_MatmulSplitKReduction( KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); // Compile kernel - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( fusion, args, heuristic_params->lparams, heuristic_params->cparams); NVF_CHECK( - getBankConflictInfo(fe.kernel(), heuristic_params->lparams).empty(), + getBankConflictInfo(ke.kernel(), heuristic_params->lparams).empty(), "Shared memory bank conflict not removed."); // Warm up run - auto outputs = fe.runFusion(aten_inputs, heuristic_params->lparams); + auto outputs = ke.run(aten_inputs, heuristic_params->lparams); checkMatch(expected_output, outputs.at(0).to(at::kDouble), splitk_factor); runBenchmarkIterations( - benchmark_state, &fe, aten_inputs, heuristic_params->lparams); + benchmark_state, &ke, aten_inputs, heuristic_params->lparams); // TODO: FLOPS calculation } diff --git a/benchmarks/cpp/reduction.cpp b/benchmarks/cpp/reduction.cpp index f70fb931e84..84f378967ca 
100644 --- a/benchmarks/cpp/reduction.cpp +++ b/benchmarks/cpp/reduction.cpp @@ -50,7 +50,7 @@ static void setupReduction(Fusion* fusion, DataType dtype, int red_axis) { static void NvFuserScheduler_Reduction( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int reduction_dim) { auto reduction_size = benchmark_state.range(0); @@ -65,7 +65,7 @@ static void NvFuserScheduler_Reduction( std::vector aten_inputs({aten_input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/rms_norm.cpp b/benchmarks/cpp/rms_norm.cpp index 1c29c66a631..6085929c179 100644 --- a/benchmarks/cpp/rms_norm.cpp +++ b/benchmarks/cpp/rms_norm.cpp @@ -62,7 +62,7 @@ static void setupRMSNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_RMSNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR( dtype == DataType::Float || dtype == DataType::Half || @@ -80,7 +80,7 @@ static void NvFuserScheduler_RMSNorm( std::vector aten_inputs({input, weight}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/rms_norm_backward.cpp b/benchmarks/cpp/rms_norm_backward.cpp index 357499f2be4..c9422846b19 100644 --- a/benchmarks/cpp/rms_norm_backward.cpp +++ b/benchmarks/cpp/rms_norm_backward.cpp @@ -69,7 +69,7 @@ static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_RMSNorm_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* 
executor_cache, DataType dtype) { NVF_ERROR( dtype == DataType::Float || dtype == DataType::Half || @@ -89,7 +89,7 @@ static void NvFuserScheduler_RMSNorm_BWD( std::vector aten_inputs({grad_out, input, weight, rstd}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/scale_bias_relu.cpp b/benchmarks/cpp/scale_bias_relu.cpp index e68c0b9140d..ed1505a2884 100644 --- a/benchmarks/cpp/scale_bias_relu.cpp +++ b/benchmarks/cpp/scale_bias_relu.cpp @@ -114,7 +114,7 @@ static void setupSBRNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_SBR( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { // N, H, W, C format std::vector input_shape{ @@ -136,7 +136,7 @@ static void NvFuserScheduler_SBR( // inputs std::vector aten_inputs = {at_x, at_scale, at_bias}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t size = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; @@ -191,7 +191,7 @@ static void Baseline_SBR(benchmark::State& benchmark_state, DataType dtype) { static void NvFuserScheduler_SBR_Norm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { // N, H, W, C format std::vector input_shape{ @@ -215,7 +215,7 @@ static void NvFuserScheduler_SBR_Norm( std::vector aten_inputs = { at_x, at_weight, at_bias, at_mean, at_var}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t size = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; diff 
--git a/benchmarks/cpp/shape_inference.cpp b/benchmarks/cpp/shape_inference.cpp index 801759e2c03..3e580b4e6b4 100644 --- a/benchmarks/cpp/shape_inference.cpp +++ b/benchmarks/cpp/shape_inference.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -86,12 +86,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } void LayerNormBackward_ShapeInference_Base( @@ -101,30 +101,30 @@ void LayerNormBackward_ShapeInference_Base( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); NVF_ERROR(runtime->getMaybeHeuristicsFor(args).has_value()); - fec->profile(true); - fec->disableKernelLaunch(); - fec->runFusionWithInputs(aten_inputs); + executor_cache->profile(true); + executor_cache->disableKernelLaunch(); + executor_cache->runFusionWithInputs(aten_inputs); if (disable_launch_parameter_cache) { - fec->disableLaunchParamCache(); + executor_cache->disableLaunchParamCache(); } 
for (auto _ : benchmark_state) { // Setup (not included in the measurement) - fec->runFusionWithInputs(aten_inputs); + executor_cache->runFusionWithInputs(aten_inputs); } } @@ -140,7 +140,7 @@ static void NvFuserScheduler_LayerNormBackward_NoShapeInferenceCachedBaseline( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -161,11 +161,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } void LayerNormForward_ShapeInferenceBase( @@ -175,31 +175,31 @@ void LayerNormForward_ShapeInferenceBase( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); NVF_ERROR(runtime->getMaybeHeuristicsFor(args).has_value()); - fec->profile(true); - fec->disableKernelLaunch(); - fec->runFusionWithInputs(aten_inputs); + executor_cache->profile(true); + executor_cache->disableKernelLaunch(); + executor_cache->runFusionWithInputs(aten_inputs); if (disable_launch_param_cache) { - fec->disableLaunchParamCache(); + executor_cache->disableLaunchParamCache(); } for 
(auto _ : benchmark_state) { // Setup (not included in the measurement) - fec->runFusionWithInputs(aten_inputs); + executor_cache->runFusionWithInputs(aten_inputs); } } diff --git a/benchmarks/cpp/softmax.cpp b/benchmarks/cpp/softmax.cpp index f1dca672349..90b30c9ff54 100644 --- a/benchmarks/cpp/softmax.cpp +++ b/benchmarks/cpp/softmax.cpp @@ -52,7 +52,7 @@ static void setupSoftmax( static void NvFuserScheduler_Softmax( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int reduction_axis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -70,7 +70,7 @@ static void NvFuserScheduler_Softmax( std::vector aten_inputs({aten_input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -105,10 +105,10 @@ static void NvFuserScheduler_Softmax_WarpReduceReference( auto heuristic_params = scheduler->computeHeuristics(fusion, runtime_info); scheduler->schedule(fusion, heuristic_params.get()); - FusionExecutor fe; - fe.compileFusion(fusion, aten_inputs); + KernelExecutor ke; + ke.compile(fusion, aten_inputs); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -152,10 +152,10 @@ static void NvFuserScheduler_Softmax_WarpReduce( } } - FusionExecutor fe; - fe.compileFusion(fusion, aten_inputs); + KernelExecutor ke; + ke.compile(fusion, aten_inputs); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/softmax_backward.cpp b/benchmarks/cpp/softmax_backward.cpp index 
8c4a84562cc..364a5246016 100644 --- a/benchmarks/cpp/softmax_backward.cpp +++ b/benchmarks/cpp/softmax_backward.cpp @@ -57,7 +57,7 @@ static void setupSoftmaxBWD( static void NvFuserScheduler_Softmax_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int reduction_axis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -83,7 +83,7 @@ static void NvFuserScheduler_Softmax_BWD( std::vector aten_inputs({grad_output, output, input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/softmax_dropout.cpp b/benchmarks/cpp/softmax_dropout.cpp index f43fa24da81..2999d6442f1 100644 --- a/benchmarks/cpp/softmax_dropout.cpp +++ b/benchmarks/cpp/softmax_dropout.cpp @@ -75,7 +75,7 @@ static void setupSoftmaxDropout( static void NvFuserScheduler_SoftmaxDropout( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int kReductionAxis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -96,7 +96,7 @@ static void NvFuserScheduler_SoftmaxDropout( std::vector aten_inputs( {at_scores, at_mask, sqrt(kAttentionHeadSize)}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // 5 dtype: attention_scores + attention_mask + attention_scores_out + // attention_probs_out + output diff --git a/benchmarks/cpp/timm.cpp b/benchmarks/cpp/timm.cpp index 8bffc0bd1ef..cac01eadcca 100644 --- a/benchmarks/cpp/timm.cpp +++ b/benchmarks/cpp/timm.cpp @@ -56,7 +56,7 @@ static void setup_vit_base_patch16_224_bcast7(Fusion* fusion, void* null) { static void 
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -74,7 +74,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7( auto t7 = at::randn(input_shape, fp16_options); std::vector aten_inputs({t2, t3, t4, t7}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensor - float + halfx2 - t2, t7, t39 // Inner most dimension only - floatx2 - t36, t37 @@ -170,7 +170,7 @@ static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -189,7 +189,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5( auto t1 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t2, t5, t3, t0, t1}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensor - floatx2, halfx2, bool - t2, t16, t3, t34, t16 // Inner most dim only - floatx5 - t5, t0, t1, t7, t17 @@ -236,7 +236,7 @@ static void setup_vit_base_patch16_224_bcast_outer2( static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -252,7 +252,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2( auto t2 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, 
fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensor - halfx2 - t0, t6 // inner dimension only - halfx2 - t2, t7 @@ -314,7 +314,7 @@ static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -328,7 +328,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3( auto t0 = at::randn(input_shape, fp16_options); std::vector aten_inputs({t0, 0.125}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensors - floatx2, half x2, bool - t12, t4, t0, t19, t14 benchmark_state.SetBytesProcessed( @@ -391,7 +391,7 @@ static void setup_vit_base_patch16_224_bcast_outer6( static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -407,7 +407,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6( auto t2 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensors - float, halfx2, bool - t6, t0, t18, t13 // inner dimension only - float, half - t2, t19 benchmark_state.SetBytesProcessed( @@ -480,7 +480,7 @@ static void setup_vit_base_patch16_224_bcast_inner6( static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + 
FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -496,7 +496,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6( auto t2 = at::randn({input_shape[0], input_shape[1]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensors - float, halfx2, bool - t6, t0, t18, t13 // outer two dimensions only - float, half - t2, t19 @@ -620,7 +620,7 @@ static void setup_vit_base_patch16_224_LN_BWD(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -641,7 +641,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD( auto t9 = at::randn({input_shape[2]}, fp16_options); std::vector aten_inputs({t0, t1, t3, t5, t6, t7, t9, 1.0}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensors - bool, halfx4 - t0, t1, t3, t34, t35 // Outer two dimensions - floatx2 - t5, t6 @@ -701,7 +701,7 @@ static void nhwc_seresnet152d_transpose65(Fusion* fusion, void* null) { static void NvFuserScheduler_nhwc_seresnet152d_transpose65( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -721,7 +721,7 @@ static void NvFuserScheduler_nhwc_seresnet152d_transpose65( auto t4 = at::randn({2}, fp16_options).sum(); std::vector aten_inputs({t2, t5, t7, t9, t4}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, 
aten_inputs); // Full tensors - halfx6 - t2, t5, t7, t9, t29, t30 benchmark_state.SetBytesProcessed( diff --git a/benchmarks/cpp/transpose.cpp b/benchmarks/cpp/transpose.cpp index b24a2fdbfbe..21389f49f36 100644 --- a/benchmarks/cpp/transpose.cpp +++ b/benchmarks/cpp/transpose.cpp @@ -108,7 +108,7 @@ static void setupTranspose( static void NvFuserScheduler_Transpose( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int num_dims, std::pair axes, @@ -125,7 +125,7 @@ static void NvFuserScheduler_Transpose( auto at_input2 = aten_inputs[1]; std::vector fuser_inputs = {at_input1, at_input2}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, fuser_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/utils.cpp b/benchmarks/cpp/utils.cpp index e171badd9ae..613a3cbb2ef 100644 --- a/benchmarks/cpp/utils.cpp +++ b/benchmarks/cpp/utils.cpp @@ -170,27 +170,27 @@ int64_t getSizeOfOutputs(const std::vector& outputs) { int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, std::vector& aten_inputs) { c10::cuda::CUDACachingAllocator::emptyCache(); - fusion_executor_cache->profile(true); + executor_cache->profile(true); int64_t io_bytes = getSizeOfInputs(aten_inputs); // Segment and compile the fusion { - auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); io_bytes += getSizeOfOutputs(cg_outputs); } bool segmented = - fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented() && - fusion_executor_cache->getMostRecentKernelRuntime() + executor_cache->getMostRecentKernelRuntime()->isSegmented() && + executor_cache->getMostRecentKernelRuntime() ->fusionSegments() 
->groups() .size() > 1; - const auto& compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); + const auto& compile_log = executor_cache->getMostRecentExecutorInfo(); auto params = toString(compile_log.params); auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); // Only set if not segmented. In the case of segmented fusions, @@ -200,7 +200,7 @@ int64_t runBenchmarkIterations( benchmark_state.SetLabel(params + lparams); } - fusion_executor_cache->profile(false); + executor_cache->profile(false); // Sync everything up before we start NVFUSER_CUDA_RT_SAFE_CALL(cudaDeviceSynchronize()); @@ -208,7 +208,7 @@ int64_t runBenchmarkIterations( for (auto _ : benchmark_state) { clearL2Cache(); - auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); benchmark_state.SetIterationTime( FusionProfiler::profile().kernel_time_ms / 1000.0); } @@ -223,15 +223,15 @@ int64_t runBenchmarkIterations( int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutor* fusion_executor, + KernelExecutor* fusion_executor, std::vector& aten_inputs, const LaunchParams& launch_constraints, CompileParams compile_params) { int64_t io_bytes = getSizeOfInputs(aten_inputs); { // Warm-up run - auto cg_outputs = fusion_executor->runFusion( - aten_inputs, launch_constraints, compile_params); + auto cg_outputs = + fusion_executor->run(aten_inputs, launch_constraints, compile_params); io_bytes += getSizeOfOutputs(cg_outputs); } @@ -246,8 +246,8 @@ int64_t runBenchmarkIterations( clearL2Cache(); FusionProfiler::start(); FusionProfiler::createSegments(1); - auto cg_outputs = fusion_executor->runFusion( - aten_inputs, launch_constraints, compile_params); + auto cg_outputs = + fusion_executor->run(aten_inputs, launch_constraints, compile_params); FusionProfiler::stop(); benchmark_state.SetIterationTime( FusionProfiler::profile().kernel_time_ms / 1000.0); diff --git 
a/benchmarks/cpp/utils.h b/benchmarks/cpp/utils.h index 61c5e556af3..67beb1ca7d5 100644 --- a/benchmarks/cpp/utils.h +++ b/benchmarks/cpp/utils.h @@ -40,7 +40,7 @@ std::string toString(LaunchParams lparams); //! if not segmented. int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, std::vector& aten_inputs); //! Run benchmark iterations with a fusion executor and @@ -48,7 +48,7 @@ int64_t runBenchmarkIterations( //! kernel time is added to benchmark_state. int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutor* fusion_executor, + KernelExecutor* fusion_executor, std::vector& aten_inputs, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams()); diff --git a/benchmarks/python/conftest.py b/benchmarks/python/conftest.py index 8932afbff30..03adbe1e7dd 100644 --- a/benchmarks/python/conftest.py +++ b/benchmarks/python/conftest.py @@ -96,45 +96,39 @@ def pytest_configure(config): def pytest_collection_modifyitems(session, config, items): """ - The baseline benchmarks use `compile` parameter: - compile = false: Eager mode benchmark - compile = true: torch.compile benchmark + The baseline benchmarks use `executor` parameter with + values ["eager", "torchcompile", "thunder"] that are optionally + run using `--benchmark-{executor}` flag. They are skipped by + default. 
""" - run_eager = config.getoption("--benchmark-eager") - run_thunder = config.getoption("--benchmark-thunder") - run_torchcompile = config.getoption("--benchmark-torchcompile") from nvfuser.pytorch_utils import retry_on_oom_or_skip_test + executors = ["eager", "torchcompile", "thunder"] + + def get_test_executor(item) -> str | None: + if hasattr(item, "callspec") and "executor" in item.callspec.params: + test_executor = item.callspec.params["executor"] + assert ( + test_executor in executors + ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}." + return test_executor + return None + + executors_to_skip = [] + + for executor in executors: + if not config.getoption(f"--benchmark-{executor}"): + executors_to_skip.append(executor) + for item in items: item.obj = retry_on_oom_or_skip_test(item.obj) - if not run_eager: - skip_eager = pytest.mark.skip(reason="need --benchmark-eager option to run") - for item in items: - # If the benchmark has compile=False parameter (eager mode), skip it. - if ( - hasattr(item, "callspec") - and "compile" in item.callspec.params - and not item.callspec.params["compile"] - ): - item.add_marker(skip_eager) - - if not run_torchcompile: - skip_torchcompile = pytest.mark.skip( - reason="need --benchmark-torchcompile option to run" - ) - for item in items: - # If the benchmark has compile=True parameter (torch.compile mode), skip it. - if ( - hasattr(item, "callspec") - and "compile" in item.callspec.params - and item.callspec.params["compile"] - ): - item.add_marker(skip_torchcompile) - - if not run_thunder: - skip_thunder = pytest.mark.skip(reason="need --benchmark-thunder option to run") - for item in items: - if "thunder" in item.nodeid: - item.add_marker(skip_thunder) + test_executor = get_test_executor(item) + + if test_executor is not None and test_executor in executors_to_skip: + item.add_marker( + pytest.mark.skip( + reason=f"need --benchmark-{test_executor} option to run." 
+ ) + ) diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py index 8cbafe81353..6d493338846 100644 --- a/benchmarks/python/normalization.py +++ b/benchmarks/python/normalization.py @@ -433,10 +433,10 @@ def norm_fwd_baseline_benchmark( size: tuple, dtype: torch.dtype, channels_last: bool, - compile: bool, + executor: str, norm: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() assert norm in ["batch_norm", "instance_norm"], NotImplementedError @@ -453,10 +453,12 @@ def norm_fwd_baseline_benchmark( norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn + benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)} + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(norm_fwd_fn) if compile else norm_fwd_fn, + benchmark_fn[executor], [inputs, weight, bias, running_mean, running_var], iobytes=norm_fwd_iobytes(size, dtype, norm), ) @@ -467,10 +469,10 @@ def norm_bwd_baseline_benchmark( size: tuple, dtype: torch.dtype, channels_last: bool, - compile: bool, + executor: str, norm: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() assert norm in ["batch_norm", "instance_norm"], NotImplementedError @@ -491,13 +493,13 @@ def norm_bwd_baseline_benchmark( norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn # Compile the fwd fn for torchcompile - norm_fwd_fn = torch.compile(norm_fwd_fn) if compile else norm_fwd_fn - output = norm_fwd_fn([inputs, weight, bias, running_mean, running_var]) + fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)} + outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var]) # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=norm_bwd_iobytes(size, dtype, norm), ) diff --git a/benchmarks/python/test_batchnorm_bwd.py 
b/benchmarks/python/test_batchnorm_bwd.py index 74242ba99e2..0a1cd64cc57 100644 --- a/benchmarks/python/test_batchnorm_bwd.py +++ b/benchmarks/python/test_batchnorm_bwd.py @@ -31,13 +31,13 @@ def test_batchnorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_batchnorm_bwd_baseline_benchmark( - benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_bwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "batch_norm" + benchmark, size, dtype, channels_last, executor, "batch_norm" ) diff --git a/benchmarks/python/test_batchnorm_fwd.py b/benchmarks/python/test_batchnorm_fwd.py index 47b3997770a..af197ce6f1b 100644 --- a/benchmarks/python/test_batchnorm_fwd.py +++ b/benchmarks/python/test_batchnorm_fwd.py @@ -31,13 +31,13 @@ def test_batchnorm_fwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_batchnorm_fwd_baseline_benchmark( - benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_fwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "batch_norm" + benchmark, size, dtype, channels_last, executor, "batch_norm" ) diff --git a/benchmarks/python/test_broadcast_add_fwd.py b/benchmarks/python/test_broadcast_add_fwd.py index 
abb320ef2a3..65db1555b28 100644 --- a/benchmarks/python/test_broadcast_add_fwd.py +++ b/benchmarks/python/test_broadcast_add_fwd.py @@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark( run_benchmark(benchmark, fd.execute, [bias, x]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"]) @@ -101,9 +101,9 @@ def test_bcast_add_baseline_benchmark( dtype: torch.dtype, bcast_axis: int, contiguous: bool, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda") input_shape = size if contiguous else (size[1], size[0]) @@ -112,9 +112,14 @@ def test_bcast_add_baseline_benchmark( x = x.t() assert x.is_contiguous() == contiguous + benchmark_fn = { + "eager": bcast_add_fwd_fn, + "torchcompile": torch.compile(bcast_add_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(bcast_add_fwd_fn) if compile else bcast_add_fwd_fn, + benchmark_fn[executor], [bias, x, bcast_axis], ) diff --git a/benchmarks/python/test_dropout_layernorm_bwd.py b/benchmarks/python/test_dropout_layernorm_bwd.py index 6acaa012c5c..380a2085b09 100644 --- a/benchmarks/python/test_dropout_layernorm_bwd.py +++ b/benchmarks/python/test_dropout_layernorm_bwd.py @@ -189,16 +189,16 @@ def test_dropout_layernorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_layernorm_bwd_baseline_benchmark( benchmark, size: tuple, 
dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -217,13 +217,16 @@ def dropout_layernorm_fwd(): ) # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd - output = fwd_fn() + fwd_fn = { + "eager": dropout_layernorm_fwd, + "torchcompile": torch.compile(dropout_layernorm_fwd), + } + outputs = fwd_fn[executor]() # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=dropout_layernorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_layernorm_fwd.py b/benchmarks/python/test_dropout_layernorm_fwd.py index 47854fcd2d7..4408a2bd611 100644 --- a/benchmarks/python/test_dropout_layernorm_fwd.py +++ b/benchmarks/python/test_dropout_layernorm_fwd.py @@ -160,16 +160,16 @@ def test_dropout_layernorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_layernorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -181,10 +181,15 @@ def test_dropout_layernorm_fwd_baseline_benchmark( dropout_p, ] + benchmark_fn = { + "eager": dropout_layernorm_fwd, + "torchcompile": torch.compile(dropout_layernorm_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd, + benchmark_fn[executor], inputs, iobytes=dropout_layernorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_rmsnorm_bwd.py 
b/benchmarks/python/test_dropout_rmsnorm_bwd.py index 8c61c51e2d9..d196e76f57b 100644 --- a/benchmarks/python/test_dropout_rmsnorm_bwd.py +++ b/benchmarks/python/test_dropout_rmsnorm_bwd.py @@ -169,16 +169,16 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_rmsnorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) @@ -191,12 +191,15 @@ def dropout_rmsnorm_fwd(): output = weights * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5) return output - fwd_fn = torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd - output = fwd_fn() + fwd_fn = { + "eager": dropout_rmsnorm_fwd, + "torchcompile": torch.compile(dropout_rmsnorm_fwd), + } + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=dropout_rmsnorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_rmsnorm_fwd.py b/benchmarks/python/test_dropout_rmsnorm_fwd.py index a93a8caf547..aea2674df9d 100644 --- a/benchmarks/python/test_dropout_rmsnorm_fwd.py +++ b/benchmarks/python/test_dropout_rmsnorm_fwd.py @@ -145,16 +145,16 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [input1, input2, weights]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_rmsnorm_fwd_baseline_benchmark( benchmark, 
size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -165,10 +165,15 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark( dropout_p, ] + benchmark_fn = { + "eager": dropout_rmsnorm_fwd, + "torchcompile": torch.compile(dropout_rmsnorm_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd, + benchmark_fn[executor], inputs, iobytes=dropout_rmsnorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_gelu_bwd.py b/benchmarks/python/test_gelu_bwd.py index 648f0317cf9..ffd0b25c6a2 100644 --- a/benchmarks/python/test_gelu_bwd.py +++ b/benchmarks/python/test_gelu_bwd.py @@ -88,16 +88,16 @@ def test_gelu_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, grads, bias]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_gelu_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) bias = torch.ones(size[-1], device="cuda", dtype=dtype) @@ -106,12 +106,15 @@ def test_gelu_bwd_baseline_benchmark( def gelu_fwd(): return torch.nn.functional.gelu(inputs + bias, approximate="tanh") - fwd_fn = torch.compile(gelu_fwd) if compile else gelu_fwd - eager_output = fwd_fn() + fwd_fn = { + "eager": gelu_fwd, + "torchcompile": torch.compile(gelu_fwd), + } + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [eager_output, grads], + [outputs, grads], iobytes=gelu_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_gelu_bwd_reduction.py 
b/benchmarks/python/test_gelu_bwd_reduction.py index 09dfd53d88a..e860826eb49 100644 --- a/benchmarks/python/test_gelu_bwd_reduction.py +++ b/benchmarks/python/test_gelu_bwd_reduction.py @@ -103,7 +103,7 @@ def test_gelu_bwd_reduction_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, grads, bias]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0, 1]) @@ -112,19 +112,23 @@ def test_gelu_bwd_reduction_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) bias = torch.ones(size[-1], device="cuda", dtype=dtype) grads = torch.randn(size, device="cuda", dtype=dtype) eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh") + + benchmark_fn = { + "eager": gelu_bwd_reduction_torch, + "torchcompile": torch.compile(gelu_bwd_reduction_torch), + } + run_benchmark( benchmark, - torch.compile(gelu_bwd_reduction_torch) - if compile - else gelu_bwd_reduction_torch, + benchmark_fn[executor], [eager_output, grads, inputs, reduction_axis], iobytes=gelu_bwd_reduction_iobytes(size, dtype, reduction_axis), ) diff --git a/benchmarks/python/test_gelu_fwd.py b/benchmarks/python/test_gelu_fwd.py index fa5f891ef8a..2f208b2c090 100644 --- a/benchmarks/python/test_gelu_fwd.py +++ b/benchmarks/python/test_gelu_fwd.py @@ -67,22 +67,26 @@ def test_gelu_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) 
@pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_gelu_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = [ torch.randn(size, device="cuda", dtype=dtype, requires_grad=True), # in_tensor torch.ones(size[-1], device="cuda", dtype=dtype), # bias ] + + benchmark_fn = { + "eager": gelu_fwd_fn, + "torchcompile": torch.compile(gelu_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need for manual IOByte computation - run_benchmark( - benchmark, torch.compile(gelu_fwd_fn) if compile else gelu_fwd_fn, inputs - ) + run_benchmark(benchmark, benchmark_fn[executor], inputs) diff --git a/benchmarks/python/test_groupnorm_fwd.py b/benchmarks/python/test_groupnorm_fwd.py index af4c023d7d7..8c729e115d7 100644 --- a/benchmarks/python/test_groupnorm_fwd.py +++ b/benchmarks/python/test_groupnorm_fwd.py @@ -128,35 +128,16 @@ def test_groupnorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [x, weight, bias]) -@pytest.mark.parametrize("size", generate_input_sizes(dims=4)) -@pytest.mark.parametrize("dtype", FLOAT_DTYPES) -def test_groupnorm_fwd_thunder_benchmark( - benchmark, - size: tuple, - dtype: torch.dtype, -): - N, C, H, W = size - x = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) - weight = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True) - bias = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True) - num_groups = get_n_groups(C) - # thunder compiled model - groupnorm_fwd_jit = thunder.jit( - groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex] - ) - run_benchmark(benchmark, groupnorm_fwd_jit, [x, weight, bias, num_groups]) - - -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", 
FLOAT_DTYPES) def test_groupnorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() N, C, H, W = size x = torch.randn(size, device="cuda", dtype=dtype) @@ -164,8 +145,15 @@ def test_groupnorm_fwd_baseline_benchmark( bias = torch.randn(C, device="cuda", dtype=dtype) num_groups = get_n_groups(C) + benchmark_fn = { + "eager": groupnorm_fwd, + "torchcompile": torch.compile(groupnorm_fwd), + "thunder": thunder.jit( + groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex] + ), + } run_benchmark( benchmark, - torch.compile(groupnorm_fwd) if compile else groupnorm_fwd, + benchmark_fn[executor], [x, weight, bias, num_groups], ) diff --git a/benchmarks/python/test_huggingface_attn_bwd.py b/benchmarks/python/test_huggingface_attn_bwd.py index dd8c9f80114..bcb2b4d9268 100644 --- a/benchmarks/python/test_huggingface_attn_bwd.py +++ b/benchmarks/python/test_huggingface_attn_bwd.py @@ -107,16 +107,16 @@ def test_huggingface_attn_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_huggingface_attn_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, seq_len, nh, n_embd = size dropout_p = 0.2 @@ -134,14 +134,17 @@ def huggingface_attn_fwd(): return output # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd - output = fwd_fn() + fwd_fn = { + "eager": huggingface_attn_fwd, + "torchcompile": torch.compile(huggingface_attn_fwd), + } + outputs = fwd_fn[executor]() grads = 
torch.randn(batch_size * nh, seq_len, seq_len, device="cuda", dtype=dtype) # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=huggingface_attn_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_huggingface_attn_fwd.py b/benchmarks/python/test_huggingface_attn_fwd.py index 27a013a8481..714a12e41d1 100644 --- a/benchmarks/python/test_huggingface_attn_fwd.py +++ b/benchmarks/python/test_huggingface_attn_fwd.py @@ -135,16 +135,16 @@ def test_huggingface_attn_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [attention_mask, inputs]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_huggingface_attn_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, seq_len, nh, n_embd = size dropout_p = 0.2 @@ -153,10 +153,15 @@ def test_huggingface_attn_fwd_baseline_benchmark( batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype ) + benchmark_fn = { + "eager": huggingface_attn_fwd, + "torchcompile": torch.compile(huggingface_attn_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd, + benchmark_fn[executor], [attention_mask, inputs, size, dropout_p], iobytes=huggingface_attn_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_instancenorm_bwd.py b/benchmarks/python/test_instancenorm_bwd.py index 99d3e3baf2b..4022c5f395f 100644 --- a/benchmarks/python/test_instancenorm_bwd.py +++ b/benchmarks/python/test_instancenorm_bwd.py @@ -30,13 +30,13 @@ def test_instancenorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, 
True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_instancenorm_bwd_baseline_benchmark( - benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_bwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "instance_norm" + benchmark, size, dtype, channels_last, executor, "instance_norm" ) diff --git a/benchmarks/python/test_instancenorm_fwd.py b/benchmarks/python/test_instancenorm_fwd.py index 3b8f6564f51..3335fcc7bbf 100644 --- a/benchmarks/python/test_instancenorm_fwd.py +++ b/benchmarks/python/test_instancenorm_fwd.py @@ -29,13 +29,13 @@ def test_instancenorm_fwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_instancenorm_fwd_baseline_benchmark( - benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_fwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "instance_norm" + benchmark, size, dtype, channels_last, executor, "instance_norm" ) diff --git a/benchmarks/python/test_layernorm_bwd.py b/benchmarks/python/test_layernorm_bwd.py index d76046575dc..926ab2ef0fb 100644 --- a/benchmarks/python/test_layernorm_bwd.py +++ b/benchmarks/python/test_layernorm_bwd.py @@ -146,16 +146,16 @@ def test_layernorm_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights]) 
-@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_layernorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) @@ -171,13 +171,16 @@ def layernorm_fwd(): bias=bias, ) - fwd_fn = torch.compile(layernorm_fwd) if compile else layernorm_fwd - output = fwd_fn() + fwd_fn = { + "eager": layernorm_fwd, + "torchcompile": torch.compile(layernorm_fwd), + } + outputs = fwd_fn[executor]() # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=layernorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_layernorm_fwd.py b/benchmarks/python/test_layernorm_fwd.py index c6a5f24c8dc..52aa5838f62 100644 --- a/benchmarks/python/test_layernorm_fwd.py +++ b/benchmarks/python/test_layernorm_fwd.py @@ -106,16 +106,16 @@ def test_layernorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_layernorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, hidden_size = size inputs = [ @@ -124,10 +124,15 @@ def test_layernorm_fwd_baseline_benchmark( torch.randn(hidden_size, device="cuda", dtype=dtype), ] + benchmark_fn = { + "eager": layernorm_fwd, + "torchcompile": torch.compile(layernorm_fwd), + } + # 
Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(layernorm_fwd) if compile else layernorm_fwd, + benchmark_fn[executor], inputs, iobytes=layernorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py index 865caba2e31..2448ac07fd9 100644 --- a/benchmarks/python/test_matmul.py +++ b/benchmarks/python/test_matmul.py @@ -25,14 +25,14 @@ def load_matmul_problems(): @pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"]) -@pytest.mark.parametrize("compile", [False], ids=["eager"]) +@pytest.mark.parametrize("executor", ["eager"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"]) @pytest.mark.parametrize( "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val) ) def test_matmul_baseline_benchmark( benchmark, - compile: bool, + executor: str, config: tuple, dtype: torch.dtype, half_reduction: bool, diff --git a/benchmarks/python/test_nanogpt_attn_bwd.py b/benchmarks/python/test_nanogpt_attn_bwd.py index 2efb8e7d58d..88d8d56e26d 100644 --- a/benchmarks/python/test_nanogpt_attn_bwd.py +++ b/benchmarks/python/test_nanogpt_attn_bwd.py @@ -124,16 +124,16 @@ def test_nanogpt_attn_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask, bias_mask]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_nanogpt_attn_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, seq_len, nh, n_embd = size dropout_p = 0.2 @@ -154,14 +154,18 @@ def nanogpt_attn_fwd(): return output # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(nanogpt_attn_fwd) 
if compile else nanogpt_attn_fwd - output = fwd_fn() + fwd_fn = { + "eager": nanogpt_attn_fwd, + "torchcompile": torch.compile(nanogpt_attn_fwd), + } + outputs = fwd_fn[executor]() + grads = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype) # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=nanogpt_attn_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_nanogpt_attn_fwd.py b/benchmarks/python/test_nanogpt_attn_fwd.py index 4dbd5821c59..5336d96cba5 100644 --- a/benchmarks/python/test_nanogpt_attn_fwd.py +++ b/benchmarks/python/test_nanogpt_attn_fwd.py @@ -137,16 +137,16 @@ def test_nanogpt_attn_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, bias]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_nanogpt_attn_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, seq_len, nh, n_embd = size dropout_p = 0.2 @@ -154,10 +154,16 @@ def test_nanogpt_attn_fwd_baseline_benchmark( bias = torch.tril(torch.ones(seq_len, seq_len, device="cuda")).view( 1, 1, seq_len, seq_len ) + + benchmark_fn = { + "eager": nanogpt_attn_fwd, + "torchcompile": torch.compile(nanogpt_attn_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd, + benchmark_fn[executor], [inputs, bias, size, dropout_p], iobytes=nanogpt_attn_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_pointwise_mul.py b/benchmarks/python/test_pointwise_mul.py index 0162950cc47..31ec20d6b10 100644 --- a/benchmarks/python/test_pointwise_mul.py +++ 
b/benchmarks/python/test_pointwise_mul.py @@ -50,21 +50,26 @@ def test_pointwise_mul_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_pointwise_mul_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input = torch.randn(size, device="cuda", dtype=dtype) + + benchmark_fn = { + "eager": pointwise_mul_fwd_fn, + "torchcompile": torch.compile(pointwise_mul_fwd_fn), + } # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(pointwise_mul_fwd_fn) if compile else pointwise_mul_fwd_fn, + benchmark_fn[executor], [input], ) diff --git a/benchmarks/python/test_reduction.py b/benchmarks/python/test_reduction.py index f734769a1e5..303f65609b7 100644 --- a/benchmarks/python/test_reduction.py +++ b/benchmarks/python/test_reduction.py @@ -53,7 +53,7 @@ def test_reduction_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0, 1]) @@ -62,14 +62,19 @@ def test_reduction_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input = torch.randn(size, device="cuda", dtype=dtype) + + benchmark_fn = { + "eager": reduction_fwd_fn, + "torchcompile": torch.compile(reduction_fwd_fn), + } # Inputs and outputs are same as 
nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(reduction_fwd_fn) if compile else reduction_fwd_fn, + benchmark_fn[executor], [input, reduction_axis], ) diff --git a/benchmarks/python/test_reduction_epilogue.py b/benchmarks/python/test_reduction_epilogue.py index 231090e4135..aacf7326d29 100644 --- a/benchmarks/python/test_reduction_epilogue.py +++ b/benchmarks/python/test_reduction_epilogue.py @@ -67,7 +67,7 @@ def test_reduction_epilogue_nvf_benchmark( run_benchmark(benchmark, fd.execute, [x, epilogue]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0]) @@ -76,17 +76,21 @@ def test_reduction_epilogue_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() x = torch.randn(size, device="cuda", dtype=dtype) epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype) # Inputs and outputs are same as nvFuser, no need for manual IOByte computation + + benchmark_fn = { + "eager": reduction_epilogue_fwd_fn, + "torchcompile": torch.compile(reduction_epilogue_fwd_fn), + } + run_benchmark( benchmark, - torch.compile(reduction_epilogue_fwd_fn) - if compile - else reduction_epilogue_fwd_fn, + benchmark_fn[executor], [x, epilogue, reduction_axis], ) diff --git a/benchmarks/python/test_rmsnorm_bwd.py b/benchmarks/python/test_rmsnorm_bwd.py index 697aa8848ab..2fb4698fdbf 100644 --- a/benchmarks/python/test_rmsnorm_bwd.py +++ b/benchmarks/python/test_rmsnorm_bwd.py @@ -112,16 +112,16 @@ def test_rmsnorm_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, rms_eps, grads, weights]) -@pytest.mark.parametrize("compile", [False, True], 
ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_rmsnorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) grads = torch.randn(size, device="cuda", dtype=dtype) @@ -134,13 +134,13 @@ def rmsnorm_fwd(): return output # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(rmsnorm_fwd) if compile else rmsnorm_fwd - output = fwd_fn() + fwd_fn = {"eager": rmsnorm_fwd, "torchcompile": torch.compile(rmsnorm_fwd)} + outputs = fwd_fn[executor]() # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=rmsnorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_rmsnorm_fwd.py b/benchmarks/python/test_rmsnorm_fwd.py index b7839b631de..0114ae6507c 100644 --- a/benchmarks/python/test_rmsnorm_fwd.py +++ b/benchmarks/python/test_rmsnorm_fwd.py @@ -86,24 +86,28 @@ def test_rmsnorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, weights]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_rmsnorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(size, device="cuda", dtype=dtype) weights = torch.randn(size[1], device="cuda", dtype=dtype) + benchmark_fn = { + "eager": rmsnorm_fwd_fn, + "torchcompile": torch.compile(rmsnorm_fwd_fn), + } # Manually compute IOBytes: See PR 
#1725 run_benchmark( benchmark, - torch.compile(rmsnorm_fwd_fn) if compile else rmsnorm_fwd_fn, + benchmark_fn[executor], [inputs, weights], iobytes=rmsnorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_scale_bias_relu_bwd.py b/benchmarks/python/test_scale_bias_relu_bwd.py index a85c62a1592..c98d32382b5 100644 --- a/benchmarks/python/test_scale_bias_relu_bwd.py +++ b/benchmarks/python/test_scale_bias_relu_bwd.py @@ -79,16 +79,16 @@ def test_sbr_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [scale, bool_mask, grads]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_sbr_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) grads = torch.randn(*size, device="cuda", dtype=dtype) @@ -99,12 +99,12 @@ def sbr_fwd(): return torch.nn.functional.relu(inputs * scale + bias) # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(sbr_fwd) if compile else sbr_fwd - eager_output = sbr_fwd() + fwd_fn = {"eager": sbr_fwd, "torchcompile": torch.compile(sbr_fwd)} + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [eager_output, grads], + [outputs, grads], iobytes=sbr_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_scale_bias_relu_fwd.py b/benchmarks/python/test_scale_bias_relu_fwd.py index ede13dbb767..c09b11296c3 100644 --- a/benchmarks/python/test_scale_bias_relu_fwd.py +++ b/benchmarks/python/test_scale_bias_relu_fwd.py @@ -82,24 +82,26 @@ def test_sbr_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [bias, scale, inputs]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", 
"compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_sbr_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) bias = torch.ones(size[-1], device="cuda", dtype=dtype) scale = torch.ones(size[-1], device="cuda", dtype=dtype) + benchmark_fn = {"eager": sbr_fwd_fn, "torchcompile": torch.compile(sbr_fwd_fn)} + run_benchmark( benchmark, - torch.compile(sbr_fwd_fn) if compile else sbr_fwd_fn, + benchmark_fn[executor], [bias, scale, inputs], iobytes=sbr_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_silu_mul_bwd.py b/benchmarks/python/test_silu_mul_bwd.py index 98995e860b1..25276dec474 100644 --- a/benchmarks/python/test_silu_mul_bwd.py +++ b/benchmarks/python/test_silu_mul_bwd.py @@ -79,16 +79,16 @@ def test_silu_mul_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [grads, x, y]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_silu_mul_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() x = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) y = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) @@ -98,12 +98,12 @@ def silu_mul_fwd(): return torch.nn.functional.silu(x) * y # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(silu_mul_fwd) if compile else silu_mul_fwd - eager_output = fwd_fn() + fwd_fn = {"eager": silu_mul_fwd, "torchcompile": 
torch.compile(silu_mul_fwd)} + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [eager_output, grads], + [outputs, grads], iobytes=silu_mul_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_silu_mul_fwd.py b/benchmarks/python/test_silu_mul_fwd.py index 0f1e86d0d56..3de05067cb2 100644 --- a/benchmarks/python/test_silu_mul_fwd.py +++ b/benchmarks/python/test_silu_mul_fwd.py @@ -56,22 +56,27 @@ def test_silu_mul_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_silu_mul_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = [torch.randn(*size, device="cuda", dtype=dtype) for _ in range(2)] + benchmark_fn = { + "eager": silu_mul_fwd_fn, + "torchcompile": torch.compile(silu_mul_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(silu_mul_fwd_fn) if compile else silu_mul_fwd_fn, + benchmark_fn[executor], inputs, ) diff --git a/benchmarks/python/test_softmax_bwd.py b/benchmarks/python/test_softmax_bwd.py index 86f22654380..049da18fe27 100644 --- a/benchmarks/python/test_softmax_bwd.py +++ b/benchmarks/python/test_softmax_bwd.py @@ -91,7 +91,7 @@ def test_softmax_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0, 1]) @@ -100,9 +100,9 @@ def 
test_softmax_bwd_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) grads = torch.randn(size, device="cuda", dtype=dtype) @@ -110,12 +110,12 @@ def test_softmax_bwd_baseline_benchmark( def softmax_fwd(): return torch.nn.functional.softmax(input, dim=reduction_axis) - fwd_fn = torch.compile(softmax_fwd) if compile else softmax_fwd - output = fwd_fn() + fwd_fn = {"eager": softmax_fwd, "torchcompile": torch.compile(softmax_fwd)} + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=softmax_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_softmax_fwd.py b/benchmarks/python/test_softmax_fwd.py index 2e672eb2e30..d138aa1ced1 100644 --- a/benchmarks/python/test_softmax_fwd.py +++ b/benchmarks/python/test_softmax_fwd.py @@ -81,7 +81,7 @@ def test_softmax_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0, 1]) @@ -90,15 +90,19 @@ def test_softmax_fwd_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input = torch.randn(size, device="cuda", dtype=dtype) + benchmark_fn = { + "eager": softmax_fwd_fn, + "torchcompile": torch.compile(softmax_fwd_fn), + } run_benchmark( benchmark, - torch.compile(softmax_fwd_fn) if compile else softmax_fwd_fn, + benchmark_fn[executor], [input, reduction_axis], iobytes=softmax_fwd_iobytes(size, dtype), ) diff --git 
a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py index cf290f278a5..a4e3198cc9a 100644 --- a/benchmarks/python/test_transpose.py +++ b/benchmarks/python/test_transpose.py @@ -74,7 +74,7 @@ def test_transpose_nvf_benchmark( run_benchmark(benchmark, fd.execute, [input1, input2]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=3)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)]) @@ -83,15 +83,21 @@ def test_transpose_baseline_benchmark( size: tuple, dtype: torch.dtype, axes: list, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input1 = torch.randn(size, device="cuda", dtype=dtype) input2 = torch.randn(size, device="cuda", dtype=dtype) + + benchmark_fn = { + "eager": transpose_fwd_fn, + "torchcompile": torch.compile(transpose_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(transpose_fwd_fn) if compile else transpose_fwd_fn, + benchmark_fn[executor], [input1, input2, axes[0], axes[1]], ) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 4cec43b2c92..727894caebb 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -402,6 +402,55 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } } + void generateVectorizedLdSt( + Val* in, + Val* out, + CacheOp cache_op, + int64_t vector_word_size) { + auto out_tv = out->as()->view(); + auto in_tv = in->as()->view(); + + bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local; + + bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local && + in_tv->getMemoryType() == MemoryType::Global; + + bool globalToGlobal = out_tv->getMemoryType() == MemoryType::Global && + 
in_tv->getMemoryType() == MemoryType::Global; + + bool is_volatile_to = out_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map->needsRawSync(out_tv).hasBID(); + + bool is_volatile_from = in_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map->needsRawSync(in_tv).hasBID(); + + if (localToGlobal) { + code_ << "loadLocalToGlobal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile=*/" + << (is_volatile_to ? "true" : "false") << ">("; + code_ << " &" << gen(out) << ", &" << gen(in) << ")"; + } else if (globalToLocal) { + code_ << "loadGlobalToLocal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile=*/" + << (is_volatile_from ? "true" : "false") << ", " + << "CacheOp::" << cache_op << ">(&" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } else if (globalToGlobal) { + code_ << "loadGlobalToGlobal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile_to=*/" + << (is_volatile_to ? "true" : "false") << ", /*is_volatile_from=*/" + << (is_volatile_from ? "true" : "false") << ">("; + code_ << " &" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } else { + code_ << "loadGeneric<" << out->dtype() << ", " << vector_word_size + << ">("; + code_ << " &" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } + } + // Cannot just use ConstIrVisitor::handle as it expects a vector of // const Expr*, whereas most of the IR API returns a vector of // non-const Expr*. @@ -1001,6 +1050,68 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } void handle(const TernaryOp* top) final { + // Note: vectorized TernaryOp looks something like: + // ``` + // predicate + // ? LoadGlobalToLocal(&dst[0], &in2[index]) + // : arraySet(&dst[0], in3); + // ``` + // + // Current limitation: + // 1. only TernaryOpType::Where is supported; + // 2. predicate needs to be a scalar; + // 3. output needs to be a TensorView; + // 4. 
one and only one of the inputs needs to be a TensorView. (This is + // coming from validation analysis.) + if (top->out()->isA()) { + // Get vectorization information + auto out_tv = top->out()->as()->view(); + int64_t vector_word_size = ir_utils::getVectorizeSize(out_tv); + bool is_vector_op = vectorize_scope_ && vector_word_size != 1; + + if (is_vector_op) { + NVF_CHECK( + top->in1()->isScalar(), + "predicate should be a scalar for vectorized TernaryOp::where"); + NVF_CHECK( + !top->out()->isScalar(), + "scalar output in vectorization isn't supported"); + NVF_CHECK( + top->getTernaryOpType() == TernaryOpType::Where, + "vectorization only works on TernaryOp::where"); + indent() << gen(top->in1()) << "\n"; + indent() << kTab << "? "; + auto vec_load = [&out_tv, &top, &vector_word_size, this](Val* in) { + if (in->isScalar()) { + if (out_tv->getMemoryType() == MemoryType::Local && + !out_tv->isCircularBuffered()) { + // Vectorized initialization, explicit type conversion is needed + // for complex numbers + code_ << genVariableName(out_tv) << ".set(" + << genCall(out_tv->dtype(), gen(in)) << ")"; + } else { + // Note: currently arraySet option is not vectorized, so it will + // rely on auto vectorization pass of cuda compiler. + code_ << "arraySet<" << out_tv->getDataType().value() << ", " + << vector_word_size << ">(&" << gen(top->out()) << ", (" + << out_tv->getDataType().value() << ")" << gen(in) << ")"; + } + } else { + generateVectorizedLdSt( + in, top->out(), CacheOp::AllLevels, vector_word_size); + } + }; + + // TODO: should we have the option to specify cache level? 
+ vec_load(top->in2()); + code_ << "\n"; + indent() << kTab << ": "; + vec_load(top->in3()); + code_ << ";\n"; + return; + } + } + if (!print_inline_) { indent() << gen(top->out()); if (!top->out()->isScalar()) { @@ -1338,53 +1449,10 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { "Invalid input to unary op with tensor output, found: ", ldst->in()->toString()); - auto in_tv = ldst->in()->as()->view(); - bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Local; - - bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local && - in_tv->getMemoryType() == MemoryType::Global; - - bool globalToGlobal = out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Global; - - bool is_volatile_to = out_tv->getMemoryType() == MemoryType::Global && - kernel_->summary().sync_map->needsRawSync(out_tv).hasBID(); - - bool is_volatile_from = - in_tv->getMemoryType() == MemoryType::Global && - kernel_->summary().sync_map->needsRawSync(in_tv).hasBID(); - - if (localToGlobal) { - indent() << "loadLocalToGlobal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile=*/" - << (is_volatile_to ? "true" : "false") << ">("; - code_ << " &" << gen(ldst->out()) << ", &" << gen(ldst->in()) - << ");\n"; - } else if (globalToLocal) { - indent() << "loadGlobalToLocal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile=*/" - << (is_volatile_from ? "true" : "false") << ", " - << "CacheOp::" << ldst->cacheOp() << ">(&" - << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } else if (globalToGlobal) { - indent() << "loadGlobalToGlobal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile_to=*/" - << (is_volatile_to ? "true" : "false") - << ", /*is_volatile_from=*/" - << (is_volatile_from ? 
"true" : "false") << ">("; - code_ << " &" << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } else { - indent() << "loadGeneric<" << ldst->out()->dtype() << ", " - << vector_word_size << ">("; - code_ << " &" << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } + indent(); + generateVectorizedLdSt( + ldst->in(), ldst->out(), ldst->cacheOp(), vector_word_size); + code_ << ";\n"; } return; } diff --git a/csrc/device_lower/lower2device.h b/csrc/device_lower/lower2device.h index 38f914b92ab..16d2177ca33 100644 --- a/csrc/device_lower/lower2device.h +++ b/csrc/device_lower/lower2device.h @@ -45,10 +45,6 @@ namespace nvfuser { -// TODO: we frequently use pairwise root mapping from consumers to producers. -// This information is implicitly in the computeAtMaps, but there's no isolated -// container for this information that we can reuse. Would be nice to generate -// such a structure and propagate it through lowering. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) class GpuLower : public NonCopyable { class KernelIrMapper; diff --git a/csrc/device_lower/pass/circular_buffer.cpp b/csrc/device_lower/pass/circular_buffer.cpp index f2eb30297a5..273cee42ba9 100644 --- a/csrc/device_lower/pass/circular_buffer.cpp +++ b/csrc/device_lower/pass/circular_buffer.cpp @@ -97,7 +97,7 @@ class CircularBufferLoopCloner : public kir::IrVisitor { } case CircularBufferLoopStage::Main: { if (requireEpilogue(circular_buffer_load_exprs_)) { - stop = IrBuilder::subExpr( + stop = SimplifyingIrBuilder::subExpr( circular_buffer_loop_->stop(), SimplifyingIrBuilder::create( prefetch_distance, DataType::Index)); @@ -106,7 +106,7 @@ class CircularBufferLoopCloner : public kir::IrVisitor { } case CircularBufferLoopStage::Epilog: { NVF_ERROR(requireEpilogue(circular_buffer_load_exprs_)); - start = IrBuilder::subExpr( + start = SimplifyingIrBuilder::subExpr( circular_buffer_loop_->stop(), SimplifyingIrBuilder::create( prefetch_distance, 
DataType::Index)); @@ -424,7 +424,7 @@ class CloneTmaCircularBufferLoopAndInsertSync int64_t stage_depth = GpuLower::current()->circularBufferInfo().getStageDepthFor( circular_buffer_loop_->iter_domain()); - Val* result = IrBuilder::modExpr( + Val* result = SimplifyingIrBuilder::modExpr( cloned_top_level_loop_->indexOrStartIfTrivial(), IrBuilder::create(stage_depth, PrimDataType::Index)); return GpuLower::current()->commonScalarMap().hoistScalar( @@ -441,8 +441,8 @@ class CloneTmaCircularBufferLoopAndInsertSync GpuLower::current()->circularBufferInfo().getPrefetchDistanceFor( circular_buffer_loop_->iter_domain()); - auto current_load_stage = IrBuilder::modExpr( - IrBuilder::addExpr( + auto current_load_stage = SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr( cloned_top_level_loop_->indexOrStartIfTrivial(), IrBuilder::create(prefetch_distance, PrimDataType::Index)), IrBuilder::create(stage_depth, PrimDataType::Index)); diff --git a/csrc/device_lower/pass/predicate.cpp b/csrc/device_lower/pass/predicate.cpp index 034534be7be..18e632d9e9e 100644 --- a/csrc/device_lower/pass/predicate.cpp +++ b/csrc/device_lower/pass/predicate.cpp @@ -103,7 +103,8 @@ class ConditionalFromPredicateModifier : public kir::ExprMutator { "Expecting predicated body to only have one vectorized expression."); auto vec_expr = ite->thenBody()[0]; NVF_ERROR( - vec_expr->isA() || vec_expr->isA(), + vec_expr->isA() || vec_expr->isA() || + vec_expr->isA(), "Vectorize predicate exprs only supported on set operations."); NVF_ERROR( ir_utils::isTvOp(vec_expr), diff --git a/csrc/device_lower/pass/replace_size.cpp b/csrc/device_lower/pass/replace_size.cpp index 1e1bc8b9738..1ace266794c 100644 --- a/csrc/device_lower/pass/replace_size.cpp +++ b/csrc/device_lower/pass/replace_size.cpp @@ -59,28 +59,38 @@ std::unordered_map getSimplificationMap(Fusion* fusion) { // 1. Constant ints. These might be non-immediate constants // 2. Extents of input TVs. // 3. Extents of non-input TVs. 
- // Within these three classes, we find the IterDomain with the smallest - // name(). + // Within these three classes, we find the IterDomain with the + // smallest name(). For case 3, we also prefer the IterDomain with + // the simplest extent, which has the smallest number of defining + // expressions. bool group_is_const = false; IterDomain* rep = nullptr; bool rep_is_input_id = false; + int64_t rep_num_defs = 0; std::unordered_set dynamic_scalars; for (Val* v : *group) { auto* id = dynamic_cast(v); NVF_ERROR( id != nullptr, "Expected only IterDomains in exact graph ValGroups"); bool is_input_id = fusion_input_ids.count(id) > 0; - if (rep == nullptr) { - rep = id; - rep_is_input_id = is_input_id; - continue; - } Val* ext = id->extent(); bool ext_is_const = ext->isConstInt(); if (!ext_is_const) { dynamic_scalars.insert(ext); } + // Initializing rep with the first ID + if (rep == nullptr) { + rep = id; + rep_is_input_id = is_input_id; + group_is_const = ext_is_const; + // If neither const nor input, record the number of exprs + if (!ext_is_const && !is_input_id) { + rep_num_defs = ir_utils::getOperationCount(id->extent()); + } + continue; + } + if (ext_is_const) { if (!group_is_const || id->name() < rep->name()) { rep = id; @@ -103,9 +113,11 @@ std::unordered_map getSimplificationMap(Fusion* fusion) { if (group_is_const || rep_is_input_id) { continue; } - if (id->name() < rep->name()) { + auto num_defs = ir_utils::getOperationCount(id->extent()); + if (num_defs < rep_num_defs || id->name() < rep->name()) { rep = id; rep_is_input_id = is_input_id; + rep_num_defs = num_defs; continue; } } diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index 00bb3a1d458..d10ffb90e50 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -1907,6 +1907,11 @@ Val* proveLinearAndGetStride( const ValGroup& linear_g, const ValGroups& domain) { FusionGuard fg(linear_g->front()->fusion()); + // This function uses simplifyExpr extensively.
If we have disabled expression + // simplification in order to help inspect generated kernels then we will get + // incorrect results here. Instead, we ensure it is enabled using this guard. + DisableOptionsGuard dog; + DisableOptionsGuard::getCurOptions().unset(DisableOption::ExprSimplify); if (simplifyExpr(extent(linear_g))->isOne()) { // If the extent of the linear group is 1, we always consider it as linear, // regardless of its relationship with domain. For this case, we use stride diff --git a/csrc/device_lower/validation.cpp b/csrc/device_lower/validation.cpp index ef10cdb6bc1..cc1b2dec53a 100644 --- a/csrc/device_lower/validation.cpp +++ b/csrc/device_lower/validation.cpp @@ -668,17 +668,31 @@ class VectorizeValidator : public OptInDispatch { tv_def != nullptr, "Tv has no definition, cannot validate vectorization:", tv); - auto producer_tv = tv_def->inputs().at(0)->as(); - auto producer_word_size_it = - GpuLower::current()->vectorizedAccesses().find(producer_tv); - if (producer_word_size_it != - GpuLower::current()->vectorizedAccesses().end()) { - producer_word_size_it->second = - std::max(vector_word_size, producer_word_size_it->second); - } else { - GpuLower::current()->vectorizedAccesses().emplace( - producer_tv, vector_word_size); + // TernaryOp(where) could have multiple inputs. But we only support + // a single TensorView input for vectorization.
+ TensorView* producer_tv = nullptr; + for (auto input : tv_def->inputs()) { + if (!input->isA()) { + continue; + } + NVF_ERROR( + producer_tv == nullptr, + "Vectorization validation only support op with a single TensorView input"); + producer_tv = input->as(); + auto producer_word_size_it = + GpuLower::current()->vectorizedAccesses().find(producer_tv); + if (producer_word_size_it != + GpuLower::current()->vectorizedAccesses().end()) { + producer_word_size_it->second = + std::max(vector_word_size, producer_word_size_it->second); + } else { + GpuLower::current()->vectorizedAccesses().emplace( + producer_tv, vector_word_size); + } } + NVF_ERROR( + producer_tv != nullptr, + "Vectorization validation requires a TensorView input"); VectorizedSetInfo vectorized_set_info; vectorized_set_info.consumer_tv = tv; @@ -798,6 +812,10 @@ void validateAndCollectVectorizeInfo(Fusion* fusion) { Expr* def = tv->definition(); NVF_ERROR( def == nullptr || def->isA() || def->isA() || + def->isA() || + (def->isA() && + def->as()->getTernaryOpType() == + TernaryOpType::Where) || (def->isA() && def->as()->serialGridReductionRequested()), "Vectorized accesses cannot be inline with computation: ", diff --git a/csrc/disjoint_set.h b/csrc/disjoint_set.h index c1638ae2037..568f9ff2604 100644 --- a/csrc/disjoint_set.h +++ b/csrc/disjoint_set.h @@ -153,25 +153,25 @@ class VectorOfUniqueEntries { // Returns first element in vector T front() const { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) NVF_ERROR(!empty()); -#endif // NDEBUG +#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) return vector_.front(); } // Returns last element in vector T back() const { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) NVF_ERROR(!empty()); -#endif // NDEBUG +#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) return vector_.back(); } // Remove and returns the last element in vector T popBack() { -#ifndef NDEBUG +#if !defined(NDEBUG) ||
defined(NVFUSER_EXPLICIT_ERROR_CHECK) NVF_ERROR(!empty()); -#endif // NDEBUG +#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) T v = vector_.back(); set_.erase(v); vector_.pop_back(); diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index d9d9cc08003..ab1f44b4571 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -752,27 +752,6 @@ std::vector Fusion::getTerminatingOutputs() const { return terminating_outputs; } -bool Fusion::isAliasCompatible(Val* left, Val* right) { - // Nullptr check - if (left == nullptr || right == nullptr) { - return false; - } - - // DataType check - if (!left->getDataType().has_value() || !right->getDataType().has_value() || - left->getDataType().value() != right->getDataType().value()) { - return false; - } - - // ValType check - if (!left->getValType().has_value() || !right->getValType().has_value() || - left->getValType().value() != right->getValType().value()) { - return false; - } - - return true; -} - void Fusion::aliasOutputToInput( Val* output, Val* input, @@ -791,33 +770,16 @@ void Fusion::aliasOutputToInput( } NVF_ERROR(type == AllocationType::ReuseBuffer); - // `input` can be a cast of a fusion input. - if (!input->isFusionInput()) { - auto input_expr = input->definition(); - NVF_ERROR( - input_expr->isA(), "expected unary op for aliased input"); - auto input_uop = input_expr->as(); - NVF_ERROR( - input_uop->getUnaryOpType() == UnaryOpType::Cast, - "expected aliased input to be output of cast op"); - input = input_uop->in(); - } + NVF_ERROR(input->isFusionInput(), "alias source can only be a fusion input"); NVF_ERROR( input->getDataType().has_value() && output->getDataType().has_value(), "requires DataType to be available for aliased output to input"); - if (input->getDataType().value() != output->getDataType().value()) { - output = castOp(input->getDataType().value(), output); - } - if (output->isFusionInput()) { // ensure that codegen produce a write operation on the buffer.
output = set(output); } - NVF_ERROR( - isAliasCompatible(input, output), - "The input and output values are not alias-compatible."); // Let integration hide any output that wasn't a fusion output when // `aliasOutputToInput` was called. For example, running mean and var for // batch norm. diff --git a/csrc/fusion.h b/csrc/fusion.h index 7b72aef0414..871dda53811 100644 --- a/csrc/fusion.h +++ b/csrc/fusion.h @@ -403,7 +403,7 @@ class NVF_API Fusion : public IrContainer { static IrCloner copy(const Fusion* from, Fusion* to); //! During scheduling, this can be set to a non-negative value. If done, then - //! during execution by FusionExecutor, we will check that this value matches + //! during execution by KernelExecutor, we will check that this value matches //! the corresponding value in LaunchParams. int64_t expectedDynamicSmemBytes() const { return expected_dynamic_smem_bytes_; @@ -464,11 +464,6 @@ class NVF_API Fusion : public IrContainer { all_tvs_ptr_.reset(); } - private: - // Determine if the two values are compatible for aliasing - // Same DataType, ValType, and number of dimensions - bool isAliasCompatible(Val* left, Val* right); - private: // Fusion inputs and outputs std::vector inputs_; diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 402784153a0..c3132067067 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -141,7 +141,7 @@ void HostIrExecutor::handle(PostOnStream* post_ir) { "op must be a HostUnit: ", post_ir->hostOpToPost()); auto hu = post_ir->hostOpToPost()->as(); - // Compile the fusion and execute it with FusionExecutor(Cache) + // Compile the fusion and execute it with KernelExecutor(Cache) // Check if the executor has been cached. 
If not, create and cache it if (params_.use_fusion_executor_cache) { if (!fec_.count(hu)) { @@ -153,13 +153,13 @@ void HostIrExecutor::handle(PostOnStream* post_ir) { } outputs = fec_.at(hu).runFusionWithInputs(input_IValues); } else { - FusionExecutor& fe = fe_[hu]; - if (!fe.isCompiled()) { + KernelExecutor& ke = fe_[hu]; + if (!ke.isCompiled()) { Fusion* fusion = hu->fusion_to_execute(); DynamicTransform::concretizeFusion(fusion, input_IValues); - fe.compileFusion(fusion, input_IValues); + ke.compile(fusion, input_IValues); } - outputs = fe.runFusion(input_IValues); + outputs = ke.run(input_IValues); if (!params_.cache_fusion_executor) { fe_.erase(hu); } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 2dcec129cc8..96bcc725d7b 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -36,7 +36,7 @@ duplication will be resolved in the future. // Set of parameters that control the behavior of HostIrExecutor struct HostIrExecutorParams { // Experimental: whether to use FusionExecutorCache rather than - // FusionExecutor. + // KernelExecutor. bool use_fusion_executor_cache = false; // Experimental: whether to apply auto-scheduling in FusionExecutorCache if // use_fusion_executor_cache=true. WAR: temporary hack mainly use for @@ -95,7 +95,7 @@ class HostIrExecutor final : public OptOutDispatch { // Stores concrete computed values ExpressionEvaluator expr_evaluator_; // Cache Fusions, FusionExecutors - std::unordered_map fe_; + std::unordered_map fe_; std::unordered_map fec_; using StreamKey = std::variant; std::unordered_map streams_; diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h index 6ac52ba0564..f9f422cd994 100644 --- a/csrc/ir/internal_base_nodes.h +++ b/csrc/ir/internal_base_nodes.h @@ -120,18 +120,23 @@ class NVF_API IterDomain : public Val { static std::vector clone( const std::vector& domains); - //! When `rfactor_domain` is true, also set the `is_rfactor_domain_` flag of - //! 
the result IterDomain. + //! The optional parameters of rfactor_domain and iter_type can be + //! used to override the default behavior. static IterDomain* merge( IterDomain* outer, IterDomain* inner, - bool rfactor_domain = false); + std::optional rfactor_domain = std::nullopt, + std::optional iter_type = std::nullopt); + //! The optional parameters of rfactor_domain, outer_iter_type and + //! inner_iter_type can be used to override the default behavior. static std::pair split( IterDomain* in, Val* factor, bool inner_split, - bool rfactor_domain = false); + std::optional rfactor_domain = std::nullopt, + std::optional outer_iter_type = std::nullopt, + std::optional inner_iter_type = std::nullopt); //! Resize an IterDomain by expanding both the left and right sides //! by given widths. The resulting IterDomain has an extent of diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index ca39d51684a..0ff0e5c6bf3 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -2550,7 +2550,8 @@ IterDomain* IterDomain::cloneWithoutRFactor(bool map_with_original) { IterDomain* IterDomain::merge( IterDomain* outer, IterDomain* inner, - bool rfactor_domain) { + std::optional rfactor_domain, + std::optional iter_type) { NVF_CHECK( outer->isReduction() == inner->isReduction(), "Merging IterDomains requires that their iteration types match. ", @@ -2563,24 +2564,33 @@ IterDomain* IterDomain::merge( !outer->isStride() && !inner->isStride(), "No support for merging stride domains"); + // By default, if not specified, don't create rfactor + // outputs. 
Reshape transformations should propagate the flag, which + // should explicitly specify the flag + if (!rfactor_domain.has_value()) { + rfactor_domain = false; + } + Val* merged_id_size = mul(outer->extent(), inner->extent()); - IterType itype = outer->getIterType(); + if (!iter_type.has_value()) { + iter_type = outer->getIterType(); - if (outer->isBroadcast() && inner->isBroadcast()) { - itype = IterType::Broadcast; - } + if (outer->isBroadcast() && inner->isBroadcast()) { + iter_type = IterType::Broadcast; + } - if ((outer->isBroadcast() || inner->isBroadcast()) && - (outer->getIterType() == IterType::Iteration || - inner->getIterType() == IterType::Iteration)) { - itype = IterType::Iteration; - } + if ((outer->isBroadcast() || inner->isBroadcast()) && + (outer->getIterType() == IterType::Iteration || + inner->getIterType() == IterType::Iteration)) { + iter_type = IterType::Iteration; + } - if ((outer->isBroadcast() || inner->isBroadcast()) && - (outer->getIterType() == IterType::GatherScatter || - inner->getIterType() == IterType::GatherScatter)) { - itype = IterType::GatherScatter; + if ((outer->isBroadcast() || inner->isBroadcast()) && + (outer->getIterType() == IterType::GatherScatter || + inner->getIterType() == IterType::GatherScatter)) { + iter_type = IterType::GatherScatter; + } } Val* expanded_extent = nullptr; @@ -2606,8 +2616,8 @@ IterDomain* IterDomain::merge( IterDomainBuilder(outer->container()->zeroVal(), merged_id_size) .parallel_type(outer->getParallelType()) .expanded_extent(expanded_extent) - .iter_type(itype) - .is_rfactor_domain(rfactor_domain) + .iter_type(*iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); IrBuilder::createInContainer( @@ -2620,7 +2630,9 @@ std::pair IterDomain::split( IterDomain* in, Val* factor, bool inner_split, - bool rfactor_domain) { + std::optional rfactor_domain, + std::optional outer_iter_type, + std::optional inner_iter_type) { NVF_CHECK( factor->isIntegralScalar(), "Cannot split by non-integer value ", 
factor); @@ -2631,6 +2643,22 @@ std::pair IterDomain::split( expanded_remainder = ceilDiv(in->expandedExtent(), factor); } + // By default, if not specified, don't create rfactor + // outputs. Reshape transformations should propagate the flag, which + // should explicitly specify the flag + if (!rfactor_domain.has_value()) { + rfactor_domain = false; + } + + // If not specified, inherit these properties from the input iter domain + if (!outer_iter_type.has_value()) { + outer_iter_type = in->getIterType(); + } + + if (!inner_iter_type.has_value()) { + inner_iter_type = in->getIterType(); + } + // outer loop IterDomain IterDomain* ido = IterDomainBuilder( @@ -2639,8 +2667,8 @@ std::pair IterDomain::split( in->hasExpandedExtent() && inner_split ? expanded_remainder : nullptr) .parallel_type(in->getParallelType()) - .iter_type(in->getIterType()) - .is_rfactor_domain(rfactor_domain) + .iter_type(*outer_iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); // inner loop IterDomain @@ -2651,8 +2679,8 @@ std::pair IterDomain::split( in->hasExpandedExtent() && !inner_split ? 
expanded_remainder : nullptr) .parallel_type(in->getParallelType()) - .iter_type(in->getIterType()) - .is_rfactor_domain(rfactor_domain) + .iter_type(*inner_iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); IrBuilder::createInContainer( @@ -4853,6 +4881,13 @@ bool ForLoop::isTrivial() const { return true; } + if (start()->isConstScalar() && simplifiedStop()->isConstScalar() && + start()->evaluate().as() + 1 == + simplifiedStop()->evaluate().as() && + step()->isOneInt()) { + return true; + } + return false; } diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp index 91a1170bf38..868ba36144d 100644 --- a/csrc/ir/utils.cpp +++ b/csrc/ir/utils.cpp @@ -1204,6 +1204,66 @@ bool isFunctional(const Val* v) { return std::all_of(def->inputs().begin(), def->inputs().end(), isFunctional); } +bool isRecursivelyDefined(Val* val) { + NVF_ERROR(val != nullptr); + + std::deque vals_to_visit; + vals_to_visit.push_back(val); + + std::unordered_set visited_vals; + + while (!vals_to_visit.empty()) { + auto v = vals_to_visit.front(); + vals_to_visit.pop_front(); + + visited_vals.insert(v); + + auto v_def = v->definition(); + if (v_def == nullptr) { + continue; + } + + for (const auto inp : v_def->inputs()) { + if (inp == val) { + // Recursive dependency detected + return true; + } + // Don't visit the same multiple times + if (!visited_vals.count(inp)) { + vals_to_visit.push_back(inp); + } + } + } + + return false; +} + +int64_t getOperationCount(Val* val) { + int64_t num_ops = 0; + + // Start with the given val and recursively count the number of ops + // by traversing inputs + std::deque vals; + vals.push_back(val); + + while (!vals.empty()) { + auto v = vals.front(); + vals.pop_front(); + + auto def = v->definition(); + if (def == nullptr) { + continue; + } + ++num_ops; + + for (auto inp : def->inputs()) { + vals.push_back(inp); + } + } + + return num_ops; +} + } // namespace nvfuser::ir_utils namespace nvfuser::MmaOpUtils { diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index 
b02fb2fbe3e..60062b0e440 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -728,4 +728,13 @@ std::string nullOrToInlineString(const Statement* stmt); //! always returns the same result when called with the same inputs. bool isFunctional(const Val* v); +// Check if the given val is recursively defined, which is invalid in +// the Fusion IR but may not be necessarily the case in other IRs +// such as the Kernel IR +bool isRecursivelyDefined(Val* val); + +// Return the number of operations that are used to define val. One +// instance of Expr is counted as a single operation. +int64_t getOperationCount(Val* val); + } // namespace nvfuser::ir_utils diff --git a/csrc/kernel_ir.h b/csrc/kernel_ir.h index 8be502233a5..f5f062cdbb7 100644 --- a/csrc/kernel_ir.h +++ b/csrc/kernel_ir.h @@ -332,12 +332,12 @@ class NVF_API Allocate final : public Expr { //! hold counters starting at zero. Typically, each participating thread would //! increment the counter and the last thread would leave the counter in a //! non-zeroed state. The next time that kernel is run, it can no longer - //! re-use the non-zero semaphore buffer, so FusionExecutor will launch + //! re-use the non-zero semaphore buffer, so KernelExecutor will launch //! at::zeroes to allocate a new buffer, resulting in a memset kernel launch. //! //! Instead, if the last thread resets the counter to zero, then the buffer //! can be re-used, and at::zeroes need only be run at the first kernel - //! launch. If resetsToZero() is true, then FusionExecutor will use + //! launch. If resetsToZero() is true, then KernelExecutor will use //! contigZeroedTensor() and releaseZeroedMemory() from global_allocator.h to //! reuse zeroed memory avoiding the additional kernel launch. //! @@ -840,7 +840,7 @@ class NVF_API IfThenElse final : public Expr { //! This node is used only after lowering a fusion to explicitly mark a grid //! reduction and the buffer allocation needed to do it. //! -//! 
This node provides FusionExecutor the information it needs to allocate the +//! This node provides KernelExecutor the information it needs to allocate the //! reduction and sync buffers. class GridReduction final : public ReductionOp { static constexpr int num_reduction_op_attr = 4; @@ -1004,7 +1004,7 @@ class NVF_API GroupedGridReduction final : public GroupedReductionOp { //! This node is used only after lowering a fusion to explicitly mark a grid //! broadcast and the buffer allocation needed to do it. //! -//! This node provides FusionExecutor the information it needs to allocate the +//! This node provides KernelExecutor the information it needs to allocate the //! broadcast and sync buffers. class NVF_API GridBroadcast final : public Expr { public: @@ -1043,7 +1043,7 @@ class NVF_API GridBroadcast final : public Expr { //! This node is used only after lowering a fusion to explicitly mark a grid //! reduction and the buffer allocation needed to do it. //! -//! This node provides FusionExecutor the information it needs to allocate the +//! This node provides KernelExecutor the information it needs to allocate the //! reduction and sync buffers. //! //! TODO: Make this a subclass of WelfordOp diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp index 6de05e48d76..6546e4555cd 100644 --- a/csrc/multidevice/executor.cpp +++ b/csrc/multidevice/executor.cpp @@ -6,10 +6,12 @@ */ // clang-format on #include + #include #include #include #include +#include #include #include #include @@ -52,6 +54,22 @@ std::unique_ptr copyFusionAndChangeOutputs( return fusion_copy; } +// Used in distributed setting where we only want to allocate output space and +// receive output data from a different rank instead of computing them. 
+std::vector allocateOutputSpace( + const at::ArrayRef& inputs, + Fusion* fusion, + const c10::Device& device) { + FUSER_PERF_SCOPE("multidevice::executor::allocateOutputSpace"); + auto fusion_inputs = KernelArgumentHolder::createKernelArgumentHolder(inputs); + auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion); + + auto output_info = + getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs()); + + return allocateOutputs(fusion, output_info, device, expr_eval); +} + } // namespace MultiDeviceExecutor::MultiDeviceExecutor( @@ -186,7 +204,7 @@ std::vector MultiDeviceExecutor::runWithInput( } auto allocations = - allocOutputSpace(inputs, allocator_fusion_.get(), comm()->device()); + allocateOutputSpace(inputs, allocator_fusion_.get(), comm()->device()); NVF_ERROR(vals_to_allocate_.size() == allocations.size()); for (auto i : c10::irange(allocations.size())) { val_to_IValue[vals_to_allocate_.at(i)] = allocations.at(i); diff --git a/csrc/mutator.cpp b/csrc/mutator.cpp index 87f44c797fe..5f183bf4839 100644 --- a/csrc/mutator.cpp +++ b/csrc/mutator.cpp @@ -77,6 +77,18 @@ void OptOutMutator::registerMutation(Val* val, Val* mutation) { ", ", mutation->dtype(), ")"); + + NVF_ERROR( + !DependencyCheck::isDependencyOf(val, mutation), + "Attempted to replace a val, ", + val->toString(), + ", with a dependent val, ", + mutation->toString(), + " (", + mutation->toInlineString(), + "), which is not allowed as it would result in a recursive definition of ", + mutation->toString()); + mutations_[val] = mutation; } diff --git a/csrc/options.cpp b/csrc/options.cpp index beb98d2f8a8..54571bad3ba 100644 --- a/csrc/options.cpp +++ b/csrc/options.cpp @@ -135,6 +135,7 @@ std::unordered_map> Options< {"ptx", DebugDumpOption::Ptx}, {"ptxas_verbose", DebugDumpOption::PrintPtxasLog}, {"python_definition", DebugDumpOption::PythonDefinition}, + {"python_definition_segments", DebugDumpOption::PythonDefinitionSegments}, {"python_frontend_debug", 
DebugDumpOption::PythonFrontendDebug}, {"sass", DebugDumpOption::Sass}, {"segmented_fusion", DebugDumpOption::FusionSegments}, diff --git a/csrc/options.h b/csrc/options.h index 76522b93a9d..0cc6313a214 100644 --- a/csrc/options.h +++ b/csrc/options.h @@ -39,7 +39,7 @@ enum class DebugDumpOption { FusionIrPresched, //!< Dump the segmented Fusion IR before it is scheduled // TODO(wujingyue): name the following FusionIrSched FusionIr, //!< Dump the Fusion IR before lowering. This is the Fusion IR fed - //!< to `FusionExecutor::compileFusion`. + //!< to `KernelExecutor::compileFusion`. FusionIrMath, //!< Dump just the compute (math) part of the above `FusionIr` //!< for conciseness KernelIr, //!< Dump the compiler Kernel IR @@ -64,6 +64,7 @@ enum class DebugDumpOption { //! associated with what's running PreSegmenterLogging, PythonDefinition, //! Python Frontend Fusion Definition. + PythonDefinitionSegments, //! Python Frontend Fusion Definition of segments. PythonFrontendDebug, //! Python Frontend debug information. TransformPropagator, //! When running TransformPropagator, print propagation //! 
path and replay result diff --git a/csrc/python_frontend/fusion_cache.cpp b/csrc/python_frontend/fusion_cache.cpp index e95ee6820da..c896592af5a 100644 --- a/csrc/python_frontend/fusion_cache.cpp +++ b/csrc/python_frontend/fusion_cache.cpp @@ -188,7 +188,7 @@ FusionCache* FusionCache::singleton_ = nullptr; UserSchedule::UserSchedule() : scheduled_fusion(nullptr), executor(nullptr) { scheduled_fusion = std::make_unique(); - executor = std::make_unique(); + executor = std::make_unique(); } bool UserSchedule::canSchedule(const SchedulerType& scheduler_type) { @@ -688,7 +688,7 @@ void FusionCache::serialize(std::string filename) const { &fb_nodes, &terminal_node_idx, &fb_auto_gen_schedules, - FusionExecutor::getGlobalFusionCount(), + KernelExecutor::getGlobalFusionCount(), device_prop->major, device_prop->minor, cuda_major, @@ -722,7 +722,7 @@ void FusionCache::deserialize(std::string filename) { NVF_CHECK(fusion_cache_buffer != nullptr, "Fusion Cache buffer is invalid."); // 0. Set static fusion count in Fusion Executor - FusionExecutor::setGlobalFusionCount( + KernelExecutor::setGlobalFusionCount( fusion_cache_buffer->global_fusion_count()); // 1. Deserialize max_fusions field diff --git a/csrc/python_frontend/fusion_cache.h b/csrc/python_frontend/fusion_cache.h index 2d4f2533ba5..ffe8088b82c 100644 --- a/csrc/python_frontend/fusion_cache.h +++ b/csrc/python_frontend/fusion_cache.h @@ -41,7 +41,7 @@ struct UserSchedule { std::unique_ptr scheduled_fusion; //! Generated kernel container - std::unique_ptr executor; + std::unique_ptr executor; //! ID of fusion in python frontend fusion cache int64_t fusion_id_ = -1; @@ -102,7 +102,7 @@ struct FusionSchedules { //! Keeps a pointer to the last scheduled Fusion IR for printing Fusion* last_user_def_scheduled_ir; //! Keeps a pointer to the last executed executor for printing its cuda kernel - FusionExecutor* last_user_def_executor; + KernelExecutor* last_user_def_executor; //! 
For thread-Safe locking of Fusion Schedules std::mutex scheds_lock; //! ID of fusion in python frontend fusion cache diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index 05f12a7c2af..dc1af7a9f5c 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -370,18 +370,18 @@ std::vector FusionDefinition::execute( if (user_sched.heuristic_params == nullptr) { // Manual schedule if (!user_sched.executor->isCompiled()) { - user_sched.executor->compileFusion( + user_sched.executor->compile( user_sched.scheduled_fusion.get(), inputs, user_sched.fusion_id_, user_sched.device_id_); } - outputs = user_sched.executor->runFusion(inputs); + outputs = user_sched.executor->run(inputs); } else { // Automatic scheduler was used for UserSchedule. // Pass launch and compile params to compileFusion and runFusion. if (!user_sched.executor->isCompiled()) { - user_sched.executor->compileFusion( + user_sched.executor->compile( user_sched.scheduled_fusion.get(), KernelArgumentHolder::createKernelArgumentHolder( inputs, getCommonDeviceCUDA(inputs)), @@ -391,7 +391,7 @@ std::vector FusionDefinition::execute( user_sched.fusion_id_, user_sched.device_id_); } - outputs = user_sched.executor->runFusion( + outputs = user_sched.executor->run( inputs, user_sched.heuristic_params->lparams, user_sched.heuristic_params->cparams); diff --git a/csrc/python_frontend/fusion_record.h b/csrc/python_frontend/fusion_record.h index 154f8d28805..8d9f77c80bf 100644 --- a/csrc/python_frontend/fusion_record.h +++ b/csrc/python_frontend/fusion_record.h @@ -1823,7 +1823,7 @@ struct SelectOpRecord : RecordFunctor { void operator()(FusionState& fd) final { auto arg1 = fd.getFusionState(args_.at(0).index)->template as(); - auto arg3 = fd.getFusionState(args_.at(1).index)->template as(); + auto arg3 = fd.getFusionState(args_.at(1).index); Val* output = select(arg1, dim_, arg3); fd.setFusionState(outputs_.at(0).index, 
output); diff --git a/csrc/python_frontend/python_bindings.cpp b/csrc/python_frontend/python_bindings.cpp index bf092240b06..f17ea228ad0 100644 --- a/csrc/python_frontend/python_bindings.cpp +++ b/csrc/python_frontend/python_bindings.cpp @@ -651,7 +651,6 @@ void defineHeuristicParamBindings(py::module& nvfuser) { .PARAM(MatmulParams, circular_buffer_options) .PARAM(MatmulParams, supported_vec_size) .PARAM(MatmulParams, async_gmem_load_operands) - .PARAM(MatmulParams, rotate_ldmatrix_out_of_main_loop) .PARAM(MatmulParams, grid_swizzle_factor) .PARAM(MatmulParams, use_smem_epilogue) .PARAM(MatmulParams, promote_prologue_smem_reuse) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index f482bf6dfb4..29fa52461e6 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -366,20 +366,6 @@ std::vector allocateOutputs( return out_tensors; } -std::vector allocOutputSpace( - const at::ArrayRef& inputs, - Fusion* fusion, - const c10::Device& device) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocOutputSpace"); - auto fusion_inputs = KernelArgumentHolder::createKernelArgumentHolder(inputs); - auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion); - - auto output_info = - getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs()); - - return allocateOutputs(fusion, output_info, device, expr_eval); -} - namespace { GlobalBufferInfo getBufferInfo( ExpressionEvaluator& expr_eval, @@ -685,12 +671,11 @@ class BackwardTraverseFromAllocToLogical { // Another example, if the logical domain is [I1*I2] and the allocation domain // is [I1, I2], then we will allocate as [I1, I2] and do a tensor.view(I1*I2) to // get a tensor whose semantics is [I1*I2] but memory is [I1,I2] -at::Tensor transformOutputFromAllocationToLogical( +at::Tensor transformFromAllocationToLogical( at::Tensor tensor, TensorView* tv, ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE( - 
"fusion_executor::allocations::transformOutputFromAllocationToLogical"); + FUSER_PERF_SCOPE("allocations::transformFromAllocationToLogical"); // Ignore reductions because reductions does not exist in tensor's definition auto logical = TensorDomain::noReductions(tv->getLogicalDomain()); auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain()); @@ -765,9 +750,8 @@ std::pair, std::vector> inferShapeOfOutput( at::empty_strided(size_stride.first, size_stride.second, options); // TODO(jiej): we should refactor it here, there's no need to use // meta_tensor at all, size + stride should be used directly in the - // `transformOutputFromAllocationToLogical` - meta_tensor = - transformOutputFromAllocationToLogical(meta_tensor, tv, expr_eval); + // `transformFromAllocationToLogical` + meta_tensor = transformFromAllocationToLogical(meta_tensor, tv, expr_eval); return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 294013f4e1a..1ec77eb3ce2 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -56,14 +56,6 @@ NVF_API void setFillAllocationWithNan(bool value); void fillTensorWithNan(at::Tensor& t); -//! Used in distributed setting where we only want to -//! allocate output space and receive output data from -//! a different rank instead of computing them. 
-std::vector allocOutputSpace( - const at::ArrayRef& inputs, - Fusion* fusion, - const c10::Device& device); - // Infer the sizes and strides of an output tensor std::pair, std::vector> inferShapeOfOutput( TensorView* tv, diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 2acc3001822..a7ded48889a 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -138,10 +138,10 @@ std::string getStructuredCodeFromExternalFiles(const int64_t fusion_id) { } } // namespace -FusionExecutor::FusionExecutor() +KernelExecutor::KernelExecutor() : communicator_(&Communicator::getInstance()) {} -std::unique_ptr& FusionExecutor:: +std::unique_ptr& KernelExecutor:: evaluatorPrecomputedValues() { if (!evaluator_precomputed_values_) { evaluator_precomputed_values_ = @@ -150,7 +150,7 @@ std::unique_ptr& FusionExecutor:: return evaluator_precomputed_values_; } -std::string FusionExecutor::getStructuredCode( +std::string KernelExecutor::getStructuredCode( const std::string& kernel_str, PrimDataType index_type) const { // generating cuda code; @@ -181,11 +181,11 @@ std::string FusionExecutor::getStructuredCode( return code; } -std::string FusionExecutor::getStructuredCode() const { +std::string KernelExecutor::getStructuredCode() const { return getStructuredCode(kernelString(), kernel()->indexType()); } -void FusionExecutor::compileFusion( +void KernelExecutor::compile( Fusion* fusion, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, @@ -195,7 +195,7 @@ void FusionExecutor::compileFusion( int64_t concrete_id, int64_t runtime_id, int64_t group_id) { - FUSER_PERF_SCOPE("FusionExecutor::compileFusion"); + FUSER_PERF_SCOPE("KernelExecutor::compileFusion"); NVF_ERROR( !fusion->outputs().empty(), "No output found for this kernel, aborting."); @@ -456,7 +456,7 @@ void FusionExecutor::compileFusion( kernel_id_, compile_params, block_size); - NVF_ERROR(validKernelId(), "Invalid kernel id for FusionExecutor."); + NVF_ERROR(validKernelId(), 
"Invalid kernel id for KernelExecutor."); // These should be nullopt at this point, but reset just in case resetCompiledKernelProperties(); @@ -475,12 +475,12 @@ void FusionExecutor::compileFusion( } } -LaunchParams FusionExecutor::computeLaunchParams( +LaunchParams KernelExecutor::computeLaunchParams( const LaunchParams& launch_constraints, ExpressionEvaluator& expr_eval, const int64_t warp_size, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::computeLaunchParams"); + FUSER_PERF_SCOPE("KernelExecutor::computeLaunchParams"); NVF_ERROR(warp_size > 0, "WARP_SIZE should be larger than 0"); LaunchParams launch_params; @@ -555,7 +555,7 @@ LaunchParams FusionExecutor::computeLaunchParams( // Run through the rest of the parallel IterDomains and infer their size for (auto [p_type, extent] : simplified_parallel_iter_extents) { - FUSER_PERF_SCOPE("FusionExecutor::ParallelBindingResolution"); + FUSER_PERF_SCOPE("KernelExecutor::ParallelBindingResolution"); auto val = expr_eval.evaluate(extent); NVF_ERROR( val.hasValue(), @@ -635,10 +635,10 @@ LaunchParams FusionExecutor::computeLaunchParams( return launch_params; } -std::vector FusionExecutor::getIntermediateBufferInfo( +std::vector KernelExecutor::getIntermediateBufferInfo( ExpressionEvaluator& expr_eval, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::getIntermediateBufferInfo"); + FUSER_PERF_SCOPE("KernelExecutor::getIntermediateBufferInfo"); std::vector global_buffers; const auto kernel = lowered_->kernel(); @@ -685,7 +685,7 @@ std::vector FusionExecutor::getIntermediateBufferInfo( return global_buffers; } -void FusionExecutor::setUsedTVs() { +void KernelExecutor::setUsedTVs() { auto used_vals = fusion()->usedMathVals(); auto used_tvs = ir_utils::filterByType(used_vals); used_tvs_.clear(); @@ -744,7 +744,7 @@ void validateCooperativeLaunch( // Dump fusion inputs and outputs as well as some useful fusion // information. 
Note that inputs and outputs are those that are passed -// to FusionExecutor::runFusion, so outputs may not be given. +// to KernelExecutor::runFusion, so outputs may not be given. void dumpFusionArgs( int64_t fusion_id, const KernelArgumentHolder& args, @@ -768,7 +768,7 @@ void dumpFusionArgs( // Dump arguments that are passed to a CUDA kernel call, which include // the inputs and outputs of the fusion as well as temporary // global-memory buffers. Unlike dumpFusionArgs, which dumps inputs -// and outputs passed to FusionExecutor::runFusion, this function +// and outputs passed to KernelExecutor::runFusion, this function // dumps those that are passed to a CUDA kernel. void dumpKernelArgs( int64_t fusion_id, @@ -803,14 +803,14 @@ void dumpKernelArgs( } // namespace -void FusionExecutor::initializeExecutorEntry( +void KernelExecutor::initializeExecutorEntry( ExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& compile_params, const std::vector& outputs, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::initializeExecutorEntry"); + FUSER_PERF_SCOPE("KernelExecutor::initializeExecutorEntry"); ExpressionEvaluator expr_eval; evaluatorPrecomputedValues()->bindInputs(args); @@ -882,7 +882,7 @@ void FusionExecutor::initializeExecutorEntry( /// @param idx_type_size generally sizeof(int32_t) or sizeof(int64_t); used for /// computing how large the arrays to copy are. static void fillTensorArgMetadata( - FusionExecutor::ExecutorEntry& entry, + KernelExecutor::ExecutorEntry& entry, const PolymorphicValue& tensor_metadata, size_t idx, size_t idx_type_size) { @@ -943,11 +943,11 @@ static void fillTensorArgMetadata( // when we change the rank of a tensor or the number of arguments to a kernel. // It does not need to happen when only shapes change---use recomputeArgs for // that. 
-void FusionExecutor::computeArgs( +void KernelExecutor::computeArgs( ExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("FusionExecutor::computeArgs"); + FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); const std::vector& params = kernel->parameters(); entry.args.resize(params.size()); @@ -961,11 +961,11 @@ void FusionExecutor::computeArgs( // Reset the arguments that we'll pass to cuLaunchKernel. This needs to be // invoked on every shape change. -void FusionExecutor::recomputeArgs( +void KernelExecutor::recomputeArgs( ExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("FusionExecutor::recomputeArgs"); + FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); // assert(entry.init && "entry was never initialized"); const std::vector& params = kernel->parameters(); @@ -996,10 +996,10 @@ void FusionExecutor::recomputeArgs( } } -void FusionExecutor::recompileKernel( +void KernelExecutor::recompileKernel( const LaunchParams& new_launch_params, const CompileParams& new_compile_params) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::recompileKernel"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::recompileKernel"); const auto structured_code = getStructuredCode(); block_size_high_water_mark_ = new_launch_params.nThreads(); @@ -1026,7 +1026,7 @@ void FusionExecutor::recompileKernel( } } -int64_t FusionExecutor::getAvailableDynamicSmemSize() { +int64_t KernelExecutor::getAvailableDynamicSmemSize() { NVF_ERROR( hasCompiledKernel(), "Cannot get dynamic smem size unless kernel is compiled"); @@ -1041,7 +1041,7 @@ int64_t FusionExecutor::getAvailableDynamicSmemSize() { return available_dynamic_smem_size_.value(); } -int64_t FusionExecutor::getStaticSmemSize() { +int64_t KernelExecutor::getStaticSmemSize() { NVF_ERROR( hasCompiledKernel(), "Cannot get static smem size unless kernel is compiled"); @@ -1057,7 +1057,7 @@ int64_t FusionExecutor::getStaticSmemSize() { 
return static_smem_size_.value(); } -void FusionExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { +void KernelExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { // If specified, check that dynamic smem size matches what the scheduler // expects int64_t expected_dynamic_smem_size = fusion()->expectedDynamicSmemBytes(); @@ -1082,7 +1082,7 @@ void FusionExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { device_smem_limit_); } -int64_t FusionExecutor::ensureAvailableDynamicSmemSize( +int64_t KernelExecutor::ensureAvailableDynamicSmemSize( int64_t dynamic_smem_size) { NVF_ERROR( hasCompiledKernel(), @@ -1098,15 +1098,15 @@ int64_t FusionExecutor::ensureAvailableDynamicSmemSize( return getAvailableDynamicSmemSize(); } -void FusionExecutor::resetCompiledKernelProperties() { +void KernelExecutor::resetCompiledKernelProperties() { available_dynamic_smem_size_.reset(); static_smem_size_.reset(); } -std::vector FusionExecutor::evaluateFusionOutputs( +std::vector KernelExecutor::evaluateFusionOutputs( std::vector outputs, ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluateFusionOutputs"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::evaluateFusionOutputs"); NVF_ERROR( outputs.empty(), "Fusion executor is using expression evaluator,", @@ -1137,12 +1137,12 @@ at::Tensor findBufferForFusionOutput( } } // namespace -std::vector FusionExecutor::runFusion( +std::vector KernelExecutor::run( KernelArgumentHolder& args, const LaunchParams& launch_constraints, CompileParams compile_params, std::vector outputs) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion"); if (isProfilerEnabled()) { NVF_CHECK( @@ -1165,7 +1165,7 @@ std::vector FusionExecutor::runFusion( auto expr_eval = executor_utils::bindInputs(args, fusion()); if (isExpressionEvaluated(fusion())) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluate_with_ExprEval"); + 
FUSER_PERF_SCOPE("KernelExecutor::runFusion::evaluate_with_ExprEval"); outputs = evaluateFusionOutputs(outputs, expr_eval); if (isProfilerEnabled()) { auto& sprof = FusionProfiler::segment(group_id_); @@ -1176,7 +1176,7 @@ std::vector FusionExecutor::runFusion( } if (host_ir_container_ != nullptr) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::host_ir_evaluate"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::host_ir_evaluate"); if (outputs.empty()) { std::vector output_info = getBufferInfos( expr_eval, PrimDataType::Int, host_ir_container_->outputs()); @@ -1204,7 +1204,7 @@ std::vector FusionExecutor::runFusion( return outputs; } - NVF_ERROR(validKernelId(), "Invalid kernel id for FusionExecutor."); + NVF_ERROR(validKernelId(), "Invalid kernel id for KernelExecutor."); NVF_ERROR( !args.getCacheId().has_value() || outputs.empty(), "short cut input cache is not compatible with pre-allocated output"); @@ -1276,7 +1276,7 @@ std::vector FusionExecutor::runFusion( std::vector intermediates; at::Tensor profile_buffer; { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::intermediates"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::intermediates"); for (const auto i : c10::irange(executor_entry->intermediates.size())) { const auto& buf_info = executor_entry->intermediates.at(i); bool has_expansion = false; @@ -1356,7 +1356,7 @@ std::vector FusionExecutor::runFusion( executor_utils::CudaKernelTimer timer(stream); if (execute_kernel_ && !kernel()->topLevelExprs().empty()) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::execute_kernel"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::execute_kernel"); ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); recomputeArgs(*executor_entry, expr_eval, kernel()); @@ -1435,7 +1435,7 @@ std::vector FusionExecutor::runFusion( return outputs; } -int64_t FusionExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { +int64_t KernelExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { int64_t 
num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers for (auto i : c10::irange(args.size())) { @@ -1447,7 +1447,7 @@ int64_t FusionExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { return num_bytes; } -int64_t FusionExecutor::outputBytesProcessed( +int64_t KernelExecutor::outputBytesProcessed( const std::vector& outputs) { int64_t num_bytes = 0; for (auto i : c10::irange(outputs.size())) { @@ -1459,12 +1459,12 @@ int64_t FusionExecutor::outputBytesProcessed( return num_bytes; } -void FusionExecutor::compileRtc( +void KernelExecutor::compileRtc( const std::string& code, const std::string& name, bool structured, PrimDataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::compileRtc"); + FUSER_PERF_SCOPE("KernelExecutor::compileRtc"); NVF_ERROR( index_type == PrimDataType::Int || index_type == PrimDataType::Int32 || "Invalid index type: ", @@ -1482,11 +1482,11 @@ void FusionExecutor::compileRtc( executor_utils::getCompiledKernel(std::nullopt, scode, name, kernel_id_); } -float FusionExecutor::runRtc( +float KernelExecutor::runRtc( const LaunchParams& launch_params, const std::vector& args, PrimDataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::runRtc"); + FUSER_PERF_SCOPE("KernelExecutor::runRtc"); c10::DeviceGuard dg(options_.device); auto stream = at::cuda::getCurrentCUDAStream(); @@ -1547,9 +1547,9 @@ float FusionExecutor::runRtc( return kernel_time_ms; } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder) const { - // See table definition for FusionExecutor in serde/fusion_cache.fbs + // See table definition for KernelExecutor in serde/fusion_cache.fbs using fb_executor_entry = flatbuffers::Offset; // Separate unordered_map for executor_entry_lookup into key and value @@ -1564,10 +1564,10 @@ flatbuffers::Offset FusionExecutor::serialize( // When compilation is skipped, avoid serializing cubin because it doesn't // exist. 
The remaining fields are also not necessary in this case. if (!hasCompiledKernel()) { - return serde::CreateFusionExecutorDirect(builder); + return serde::CreateKernelExecutorDirect(builder); } - return serde::CreateFusionExecutorDirect( + return serde::CreateKernelExecutorDirect( builder, device_smem_limit_, block_size_high_water_mark_, @@ -1585,13 +1585,13 @@ flatbuffers::Offset FusionExecutor::serialize( serialize(builder, compiled_kernel_.get())); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const executor_utils::CompiledKernel* compiled_kernel) const { NVF_ERROR( compiled_kernel_ != nullptr && (!compiled_kernel->cubin.empty() || !compiled_kernel->ptx.empty()), - "Expected compiled cuda kernel before serializing FusionExecutor."); + "Expected compiled cuda kernel before serializing KernelExecutor."); auto fb_kernel_name = builder.CreateString(compiled_kernel->kernel_name); auto fb_compile_args = builder.CreateString(compiled_kernel->compile_args); @@ -1631,7 +1631,7 @@ flatbuffers::Offset FusionExecutor::serialize( return ckb.Finish(); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const ExecutorEntry& data) const { // See table definition for ExecutorEntry in serde/fusion_cache.fbs @@ -1683,7 +1683,7 @@ flatbuffers::Offset FusionExecutor::serialize( &intermediates_fb); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, @@ -1701,8 +1701,8 @@ flatbuffers::Offset FusionExecutor::serialize( is_fusion_output); } -void FusionExecutor::deserialize( - const serde::FusionExecutor* buffer, +void KernelExecutor::deserialize( + const serde::KernelExecutor* buffer, Fusion* fusion, int8_t device_index, CompileParams compile_params, @@ -1711,15 +1711,15 @@ 
void FusionExecutor::deserialize( int64_t concrete_id, int64_t runtime_id, int64_t group_id) { - // See table definition for FusionExecutor in serde/fusion_cache.fbs + // See table definition for KernelExecutor in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::FusionExecutor is nullptr."); + NVF_ERROR(buffer != nullptr, "serde::KernelExecutor is nullptr."); // TODO Should we set fusion_id, concrete_id, runtime_id, and group_id when we // skip compilation? if (isExpressionEvaluated(fusion)) { fusion_ = std::make_unique(*fusion); - NVF_ERROR(!hasCompiledKernel(), "Failed to deserialize FusionExecutor"); + NVF_ERROR(!hasCompiledKernel(), "Failed to deserialize KernelExecutor"); return; } @@ -1781,10 +1781,10 @@ void FusionExecutor::deserialize( compiled_kernel_ = executor_utils::getCompiledKernel( buffer->compiled_kernel(), compile_params); - NVF_ERROR(hasCompiledKernel(), "Failed to deserialize FusionExecutor"); + NVF_ERROR(hasCompiledKernel(), "Failed to deserialize KernelExecutor"); } -FusionExecutor::ExecutorEntry FusionExecutor::deserialize( +KernelExecutor::ExecutorEntry KernelExecutor::deserialize( const serde::ExecutorEntry* buffer) { // See table definition for ExecutorEntry in serde/fusion_cache.fbs @@ -1807,7 +1807,7 @@ FusionExecutor::ExecutorEntry FusionExecutor::deserialize( return entry; } -GlobalBufferInfo FusionExecutor::deserialize( +GlobalBufferInfo KernelExecutor::deserialize( const serde::GlobalBufferInfo* buffer) { // See table definition for GlobalBufferInfo in serde/fusion_cache.fbs diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index fd68acc3baf..0b6d27fb752 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -34,15 +34,15 @@ struct CompileOptions { c10::Device device = c10::Device(c10::DeviceType::CUDA, 0); }; -class FusionExecutor : public NonCopyable { +class KernelExecutor : public NonCopyable { public: // NVF_API was added for nvfuser_extension. See examples/sinh_extension. 
- NVF_API FusionExecutor(); + NVF_API KernelExecutor(); //! To compile a fusion with the 32-bit index type, CompileParams //! must be passed in. There used to be an index type associated //! with KernelArgumentHolder, but it is no longer the case. - NVF_API void compileFusion( + NVF_API void compile( Fusion* fusion, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, @@ -56,25 +56,25 @@ class FusionExecutor : public NonCopyable { // TODO: merge it with the overload above. //! This API is merely here so we don't have to go back and update all cpp //! tests. - void compileFusion( + void compile( Fusion* fusion, const at::ArrayRef& inputs = {}, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams()) { KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(inputs); - compileFusion(fusion, args, launch_constraints, compile_params); + compile(fusion, args, launch_constraints, compile_params); } //! Used by user defined schedules in python frontend - void compileFusion( + void compile( Fusion* fusion, const at::ArrayRef& inputs, int64_t fusion_id, int64_t concrete_id) { KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(inputs); - compileFusion( + compile( fusion, args, LaunchParams(), @@ -92,15 +92,15 @@ class FusionExecutor : public NonCopyable { // TODO: args shouldn't come in a reference here because we will append the // outputs to be able to send it to the kernel. For now none of the users are // reconsuming the args, so it is okay. It isn't done now because changing it - // from a reference makes a call as runFusion({}) ambiguous, and that is used + // from a reference makes a call as run({}) ambiguous, and that is used // in some places in the codebase. 
- NVF_API std::vector runFusion( + NVF_API std::vector run( KernelArgumentHolder& args, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams(), std::vector outputs = {}); - std::vector runFusion( + std::vector run( const at::ArrayRef& inputs, const std::vector& outputs, const LaunchParams& launch_constraints = LaunchParams(), @@ -111,15 +111,15 @@ class FusionExecutor : public NonCopyable { if (opt_code.has_value()) { args.setCacheId(*opt_code); } - return runFusion(args, launch_constraints, compile_params, outputs); + return run(args, launch_constraints, compile_params, outputs); } - std::vector runFusion( + std::vector run( const at::ArrayRef& inputs, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams(), const std::optional& opt_code = std::nullopt) { - return runFusion(inputs, {}, launch_constraints, compile_params, opt_code); + return run(inputs, {}, launch_constraints, compile_params, opt_code); } // Register a lowering hooks that are called to modify the GpuLower object @@ -135,7 +135,7 @@ class FusionExecutor : public NonCopyable { post_lowering_hooks_.push_back(std::move(hook)); } - // Function to query whether compilation was attempted for a `FusionExecutor` + // Function to query whether compilation was attempted for a `KernelExecutor` bool isCompiled() const { int num_compiled_artifacts = (fusion_ != nullptr) + (lowered_ != nullptr) + (host_ir_container_ != nullptr); @@ -143,7 +143,7 @@ class FusionExecutor : public NonCopyable { return num_compiled_artifacts == 1; }; - // function to query whether a `FusionExecutor` has a compiled kernel to + // function to query whether a `KernelExecutor` has a compiled kernel to // execute bool hasCompiledKernel() const { if (compiled_kernel_ != nullptr) { @@ -355,12 +355,12 @@ class FusionExecutor : public NonCopyable { } //! 
Serialize Fusion Executor using flatbuffers - flatbuffers::Offset serialize( + flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder) const; //! Deserialize Fusion Executor using flatbuffers void deserialize( - const serde::FusionExecutor* buffer, + const serde::KernelExecutor* buffer, Fusion* fusion, int8_t device_index, CompileParams compile_params, @@ -428,9 +428,9 @@ class FusionExecutor : public NonCopyable { flatbuffers::FlatBufferBuilder& builder, const executor_utils::CompiledKernel* kernel) const; - // ExecutorEntry is an internal POD struct for the FusionExecutor class. + // ExecutorEntry is an internal POD struct for the KernelExecutor class. // We define ExecutorEntry's serialize and deserialize as private methods in - // FusionExecutor. + // KernelExecutor. flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const ExecutorEntry& data) const; @@ -438,9 +438,9 @@ class FusionExecutor : public NonCopyable { //! Deserialize ExecutorEntry using flatbuffers ExecutorEntry deserialize(const serde::ExecutorEntry* buffer); - // GlobalBufferInfo is an internal POD struct for the FusionExecutor class. + // GlobalBufferInfo is an internal POD struct for the KernelExecutor class. // We define GlobalBufferInfo's serialize and deserialize as private methods - // in FusionExecutor. + // in KernelExecutor. 
flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index bb1c9b59f63..565f823d263 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -689,7 +689,7 @@ void validateVectorizedTensors( const std::vector& outputs, caching::ExecutorCompileTimeInfoCache* data_cache, ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("FusionExecutor::validateVectorizedTensors"); + FUSER_PERF_SCOPE("KernelExecutor::validateVectorizedTensors"); validateAlignedVectorizedTensors( kernel, args, outputs, data_cache, expr_eval); diff --git a/csrc/runtime/executor_utils.h b/csrc/runtime/executor_utils.h index 8e99129356c..6c8418dc200 100644 --- a/csrc/runtime/executor_utils.h +++ b/csrc/runtime/executor_utils.h @@ -77,7 +77,7 @@ namespace caching { // the logic in the common space and re-use //! List of all the possible entry types in -//! `FusionExecutor` compile-time data cache. +//! `KernelExecutor` compile-time data cache. enum class CompileTimeEntryType { PARALLEL_BINDING_ITERDOMAINS, PARALLEL_ITER_EXTENT_MAP, @@ -91,7 +91,7 @@ enum class CompileTimeEntryType { //! Entry class definitions for each entry type: //! each class defines the data type for each entry type -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! ParallelBindingIterDomains: //! Stores all the iterdomains that are parallelized //! on the scheduled Fusion graph. They will be used @@ -104,7 +104,7 @@ class ParallelBindingIterDomains { CompileTimeEntryType::PARALLEL_BINDING_ITERDOMAINS; }; -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! ParallelIterExtentMap //! Stores the symbolic extents of all the parallelized //! iterdomains corresponding to each used parallel type. 
@@ -132,7 +132,7 @@ struct VectorizedTensorInfo { std::vector out_misaligned_tensors_pos; }; -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! VectorizedTensorValidation //! Stores position info and vector word sizes of //! vectorized input/output tensors, to be used diff --git a/csrc/runtime/fusion_cache_utils.h b/csrc/runtime/fusion_cache_utils.h index a61f1844343..452192b11b8 100644 --- a/csrc/runtime/fusion_cache_utils.h +++ b/csrc/runtime/fusion_cache_utils.h @@ -28,7 +28,7 @@ class SegmentedFusion; // Utilities for benchmarking and profiling struct ExecutorLog { std::unique_ptr params = nullptr; - FusionExecutor* fusion_executor = nullptr; + KernelExecutor* fusion_executor = nullptr; }; struct RuntimeWorkSpace { @@ -153,7 +153,7 @@ class InputsIdLookup : public NonCopyable { //! Encode each input sets to with an unique id; //! The returned data structure also indicates whether eviction has happened //! within the lookup cache. This is needed because lookup shortcut is also - //! cached in nested `FusionExecutorCache` and `FusionExecutor`. + //! cached in nested `FusionExecutorCache` and `KernelExecutor`. //! see [ Note -- Post-definition cache implementation ] and [ Note -- 2 level //! cache implementation ]. //! 
diff --git a/csrc/runtime/fusion_executor_cache.cpp b/csrc/runtime/fusion_executor_cache.cpp index a3b8de148e2..af9c83da2b6 100644 --- a/csrc/runtime/fusion_executor_cache.cpp +++ b/csrc/runtime/fusion_executor_cache.cpp @@ -33,9 +33,6 @@ #include #include -#include -#include - namespace nvfuser { FusionExecutorCache::FusionExecutorCache( @@ -81,15 +78,7 @@ std::vector FusionExecutorCache::runFusionWithInputs( " failed"); } - int seq_id = 0; - // Record kernel input and output tensors so profiler can construct - // the data flow graph - RECORD_FUNCTION( - "run_fused_kernel", - std::vector(inputs.begin(), inputs.end()), - seq_id); auto outputs = kernel_runtime->runWithInputs(args); - RECORD_OUTPUTS(outputs); // Kernel time measurement is off by default kernel_runtime->disableKernelTimeMeasurement(); @@ -194,8 +183,8 @@ std::string FusionExecutorCache::getCode( if (intrinsic_code) { const auto& execs = kernel_runtime->executors(); - const FusionExecutor& fe = execs[0]; - auto index_type = fe.kernel()->indexType(); + const KernelExecutor& ke = execs[0]; + auto index_type = ke.kernel()->indexType(); // Make sure all the segment index types match. All segments currently // use the same index type but this code change in the future. for (const auto& exec : execs) { @@ -206,7 +195,7 @@ std::string FusionExecutorCache::getCode( " ", exec.kernel()->indexType()); } - std::string full_code = fe.getStructuredCode(kernel_code, index_type); + std::string full_code = ke.getStructuredCode(kernel_code, index_type); return full_code; } else { return kernel_code; @@ -492,7 +481,7 @@ void FusionExecutorCache::deserialize( device_runtimes.size())); // 3. For FusionKernelRuntime, we have a separate deserialize function - // to create the FusionExecutor objects. + // to create the KernelExecutor objects. 
device_runtimes.back()->deserialize( fb_fusion_kernel_runtime, args.getDeviceIndex()); diff --git a/csrc/runtime/fusion_executor_cache.h b/csrc/runtime/fusion_executor_cache.h index 6ec0435bbd0..0a0e0520cf8 100644 --- a/csrc/runtime/fusion_executor_cache.h +++ b/csrc/runtime/fusion_executor_cache.h @@ -63,7 +63,7 @@ enum class PrimDataType; //! properties might: rank, DataType, contiguity, stride order, size (whether a //! dimension has size=1). When all of these properties are repeated, there is //! an opportunity to reduce the latency of producing a compiled Fusion and -//! launch params (a FusionExecutor). Given inputs, we first compute an ID using +//! launch params (a KernelExecutor). Given inputs, we first compute an ID using //! InputsIdLookup::lookupId that encodes tensor properties along with values of //! any integer-valued input scalars that might affect concretization. This ID //! is guaranteed not to conflict unless the inputs can be executed by the same @@ -124,7 +124,7 @@ class FusionExecutorCache { int64_t fusion_id = 0, bool auto_schedule = true); - //! Execute fusion graph with given inputs, create `FusionExecutor` as needed + //! Execute fusion graph with given inputs, create `KernelExecutor` as needed //! Note this function also handles permutation & input update outside of //! codegen. //! @@ -140,12 +140,6 @@ class FusionExecutorCache { std::optional forced_index_type = std::nullopt, std::optional selected_device = std::nullopt); - //! Converts inputs from IValue to KernelArgumentHolder, also handles cache - //! lookup - KernelArgumentHolder prepareInputs( - const at::ArrayRef& inputs, - std::optional selected_device = std::nullopt); - //! query if there's a kernel ready to go for given inputs NVF_API bool isCompiled( const at::ArrayRef& inputs, @@ -241,8 +235,14 @@ class FusionExecutorCache { void deserialize(const serde::FusionExecutorCache* buffer, int64_t fusion_id); private: + //! 
Converts inputs from IValue to KernelArgumentHolder, also handles cache + //! lookup + KernelArgumentHolder prepareInputs( + const at::ArrayRef& inputs, + std::optional selected_device = std::nullopt); + //! evict cached short cut entry in `code_to_fe_lookup_` as well as cached - //! entry in `FusionExecutor` + //! entry in `KernelExecutor` void evictCache(size_t cache_id); //! The index type of forced_index_type is used to get a kernel diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp index 7a2768e0cbf..8069ed3ee3a 100644 --- a/csrc/runtime/fusion_kernel_runtime.cpp +++ b/csrc/runtime/fusion_kernel_runtime.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -125,7 +127,7 @@ FusionKernelRuntime::FusionKernelRuntime( // would go directly to kernel launch. prepareRuntimeOrder(segmented_fusion_.get(), runtime_workspace_); - executors_ = std::vector(segmented_fusion_->groups().size()); + executors_ = std::vector(segmented_fusion_->groups().size()); if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) { segmented_fusion_->print(); } @@ -142,8 +144,8 @@ FusionKernelRuntime::FusionKernelRuntime( } void FusionKernelRuntime::evictCache(size_t input_id) { - for (auto& fe : executors_) { - fe.evictCache(input_id); + for (auto& ke : executors_) { + ke.evictCache(input_id); } } @@ -159,8 +161,8 @@ flatbuffers::Offset FusionKernelRuntime::serialize( flatbuffers::FlatBufferBuilder& builder) const { // See table definition for FusionKernelRuntime in serde/fusion_cache.fbs - // 1. Serialize FusionExecutor objects - std::vector> executors_fb; + // 1. 
Serialize KernelExecutor objects + std::vector> executors_fb; executors_fb.reserve(executors_.size()); for (auto& executor : executors_) { executors_fb.push_back(executor.serialize(builder)); @@ -198,7 +200,7 @@ void FusionKernelRuntime::deserialize( runtime_id_ == buffer->runtime_id(), "Expected FusionKernelRuntime runtime_id to match serde runtime_id."); - // 1. Deserialize FusionExecutor objects + // 1. Deserialize KernelExecutor objects for (auto idx : c10::irange(buffer->executors()->size())) { auto sg = runtime_workspace_.group_run_order.at(idx); @@ -298,6 +300,14 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) { for (int64_t run_order_id = 0; run_order_id < num_groups; ++run_order_id) { auto group_to_run = runtime_workspace_.group_run_order.at(run_order_id); + if (isDebugDumpEnabled(DebugDumpOption::PythonDefinitionSegments)) { + debug() << "Python definition for segmented group " + << group_to_run->groupId() << ":" << std::endl; + python_frontend::FusionDefinition fd(/*id=*/std::nullopt); + python_frontend::translate(group_to_run->getFusion(), &fd); + fd.print(debug()); + } + // TODO: index mode should be updated per segmented kernel // Prepare input vector KernelArgumentHolder group_runtime_inputs; @@ -497,7 +507,7 @@ void FusionKernelRuntime::updateHeuristicsLaunchParams( } } -const std::vector& FusionKernelRuntime::executors() const { +const std::vector& FusionKernelRuntime::executors() const { return executors_; } @@ -595,7 +605,7 @@ std::vector FusionKernelRuntime::runKernelWithInput( if (executor.groupId() < 0) { executor.setGroupId(group_id); } - auto outputs = executor.runFusion(args, launch_params, compile_params); + auto outputs = executor.run(args, launch_params, compile_params); return outputs; } @@ -625,7 +635,7 @@ void FusionKernelRuntime::compileKernel( NVF_ERROR( heuristic_params->cparams.index_type.has_value(), "Kernel index type is not defined."); - executors_.at(group_id).compileFusion( + 
executors_.at(group_id).compile( fusion_to_run.get(), args, heuristic_params->lparams, diff --git a/csrc/runtime/fusion_kernel_runtime.h b/csrc/runtime/fusion_kernel_runtime.h index 7e34da833ce..5bf20883203 100644 --- a/csrc/runtime/fusion_kernel_runtime.h +++ b/csrc/runtime/fusion_kernel_runtime.h @@ -35,7 +35,7 @@ struct FusionKernelRuntime; //! //! Two types of instance can be created, one for complete/single-kernel fusion //! and one for segmented/multi-kernel fusion. -//! Conceptually this is a generalization of FusionExecutor that supports both +//! Conceptually this is a generalization of KernelExecutor that supports both //! single-kernel and multi-kernel caching/compiling/launching //! //! When serde_buffer argument is a nullptr, we run the @@ -143,7 +143,7 @@ class FusionKernelRuntime { //! for kernel launch for a new input dimension but same heuristics void updateHeuristicsLaunchParams(HeuristicParamsList* update_heuristics); - const std::vector& executors() const; + const std::vector& executors() const; private: //! Runs each fusion segment given arguments. The outputs for a fusion are @@ -176,7 +176,7 @@ class FusionKernelRuntime { private: //! Entries indexed by groupID: //! Executors holding compiled kernels - std::vector executors_; + std::vector executors_; // A metadata copy of initial arguments used to contruct this // FusionKernelRuntime. Used during deserialization to schedule the fusion diff --git a/csrc/scheduler/cache_policy_refiner.cpp b/csrc/scheduler/cache_policy_refiner.cpp index 7e6eab7eb18..2159c93c969 100644 --- a/csrc/scheduler/cache_policy_refiner.cpp +++ b/csrc/scheduler/cache_policy_refiner.cpp @@ -58,6 +58,12 @@ bool isLoadGlobalToLocal(const Expr* expr) { if (ldst->opType() != LoadStoreOpType::Set) { return false; } + // It should not be necessary to check the output since it should be + // always a TensorView as long as the input is a TensorView, but + // just in case. 
+ if (!ldst->in()->isA() || !ldst->out()->isA()) { + return false; + } if (ldst->in()->as()->getMemoryType() != MemoryType::Global) { return false; } diff --git a/csrc/scheduler/compile_time_info.h b/csrc/scheduler/compile_time_info.h index d413c99ae81..f7ec9d4a97f 100644 --- a/csrc/scheduler/compile_time_info.h +++ b/csrc/scheduler/compile_time_info.h @@ -234,7 +234,7 @@ class CompileTimeInfoBase : public PolymorphicBase { //! Compile-time information cache for `canSchedule` and `getHeuristics` //! interfaces. Each cache instance stores information that could be inferred at //! compile time in a fusion and therefore corresponds to an instance of -//! FusionExecutor. +//! KernelExecutor. class HeuristicDataCache { using EntryOwningPtr = std::unique_ptr; diff --git a/csrc/scheduler/matmul_heuristic.h b/csrc/scheduler/matmul_heuristic.h index b97d8515011..ae7e7ff476e 100644 --- a/csrc/scheduler/matmul_heuristic.h +++ b/csrc/scheduler/matmul_heuristic.h @@ -138,9 +138,6 @@ class MatmulParams : public HeuristicParams { } } supported_vec_size; - //! Whether to rotate the ldmatrix out of the main loop - bool rotate_ldmatrix_out_of_main_loop = true; - //! (Ampere+) Use cp.async to load operands. bool async_gmem_load_operands = false; @@ -191,8 +188,6 @@ class MatmulParams : public HeuristicParams { << circular_buffer_options.toString() << "\n" << supported_vec_size.toString() << "\n" << nvfuser::toString(tile_sizes) << "\n" - << "Rotate ldmatrix out of main loop: " - << (rotate_ldmatrix_out_of_main_loop ? "true" : "false") << "\n" << "Async global mem load: " << (async_gmem_load_operands ? 
"true" : "false") << "\n" << "Indexing mode: " @@ -216,9 +211,8 @@ class MatmulParams : public HeuristicParams { size_t hash() const override { // combine boolean flags for hashing - size_t attr_hash = (static_cast(promote_prologue_smem_reuse) << 3) | - (static_cast(use_smem_epilogue) << 2) | - (static_cast(rotate_ldmatrix_out_of_main_loop) << 1) | + size_t attr_hash = (static_cast(promote_prologue_smem_reuse) << 2) | + (static_cast(use_smem_epilogue) << 1) | (static_cast(async_gmem_load_operands)); // combined hash @@ -240,8 +234,6 @@ class MatmulParams : public HeuristicParams { return other->cparams == cparams && other->mma_macro == mma_macro && other->async_gmem_load_operands == async_gmem_load_operands && - other->rotate_ldmatrix_out_of_main_loop == - rotate_ldmatrix_out_of_main_loop && other->tile_sizes == tile_sizes && other->circular_buffer_options == circular_buffer_options && other->supported_vec_size == supported_vec_size && diff --git a/csrc/scheduler/matmul_heuristic_plugin.cpp b/csrc/scheduler/matmul_heuristic_plugin.cpp index c1b7acf00c4..658ad2b07f7 100644 --- a/csrc/scheduler/matmul_heuristic_plugin.cpp +++ b/csrc/scheduler/matmul_heuristic_plugin.cpp @@ -146,8 +146,6 @@ void copyParamsToConfig(KernelConfig* config, const MatmulParams* mparams) { : 1; config->circular_buffer_smem_read = mparams->circular_buffer_options.circular_buffer_smem_read; - config->rotate_ldmatrix_out_of_main_loop = - mparams->rotate_ldmatrix_out_of_main_loop; config->problem.supported_vec_size.a = (uint8_t)mparams->supported_vec_size.a; config->problem.supported_vec_size.b = (uint8_t)mparams->supported_vec_size.b; config->problem.supported_vec_size.epilogue = @@ -190,8 +188,6 @@ void copyConfigToParams(MatmulParams* mparams, const KernelConfig* config) { } mparams->circular_buffer_options.circular_buffer_smem_read = config->circular_buffer_smem_read; - mparams->rotate_ldmatrix_out_of_main_loop = - config->rotate_ldmatrix_out_of_main_loop; // enable circular buffering if 
configured mparams->circular_buffer_options.circular_buffer_smem_write = diff --git a/csrc/scheduler/matmul_heuristic_plugin_api.h b/csrc/scheduler/matmul_heuristic_plugin_api.h index 65094cb9d8c..224705530e5 100644 --- a/csrc/scheduler/matmul_heuristic_plugin_api.h +++ b/csrc/scheduler/matmul_heuristic_plugin_api.h @@ -77,7 +77,6 @@ struct KernelConfig { uint8_t grid_swizzle_factor = 0; uint8_t cta_order = 0; bool circular_buffer_smem_read = true; - bool rotate_ldmatrix_out_of_main_loop = true; bool async_gmem_load_operands = true; public: diff --git a/csrc/scheduler/matmul_utils.cpp b/csrc/scheduler/matmul_utils.cpp index 799d519a8d2..86ad7e5144d 100644 --- a/csrc/scheduler/matmul_utils.cpp +++ b/csrc/scheduler/matmul_utils.cpp @@ -411,7 +411,7 @@ class VectorizationCalculator { //! To analyze vectorization, we need to know pointer alignment, sizes, and //! strides. SchedulerRuntimeInfo contains all this info about fusion - //! inputs, but fusion outputs are allocated by FusionExecutor so they are + //! inputs, but fusion outputs are allocated by KernelExecutor so they are //! absent from SchedulerRuntimeInfo. //! //! This function just extracts sizes and strides from runtime_info_ when diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp index 750cdb43597..260f5813be7 100644 --- a/csrc/scheduler/utils.cpp +++ b/csrc/scheduler/utils.cpp @@ -2565,7 +2565,7 @@ int64_t getSharedMemoryOverheadPerBlock( dtype_size = std::max(dtype_size, dataTypeSize(tv->getDataType().value())); } // for welford, three arrays of type nvfuser_index_t are used to store var, - // avg, and n. see FusionExecutor::computeLaunchParams. Here index type is + // avg, and n. see KernelExecutor::computeLaunchParams. Here index type is // assumed as int64_t int64_t welford_factor = ir_utils::hasOpsOfType(fusion) ? 
3l : 1l; if (welford_factor == 3l) { diff --git a/csrc/serde/Serde.md b/csrc/serde/Serde.md index c0528950d6b..5f5bcbc18bb 100644 --- a/csrc/serde/Serde.md +++ b/csrc/serde/Serde.md @@ -27,19 +27,19 @@ The string's position in the cache becomes the input's cache id. This table represents a key-value pair in the unordered_map. ### FusionKernelRuntime -* `FusionKernelRuntime` contains the segments for a Fusion. Each segment is represented by a `FusionExecutor` object. +* `FusionKernelRuntime` contains the segments for a Fusion. Each segment is represented by a `KernelExecutor` object. #### Serialization: * We save a metadata copy of the arguments used to construct the `FusionKernelRuntime`. During deserialization, -we call the constructor using the saved metadata arguments. Afterwards, we regenerate the `FusionExecutor` objects, +we call the constructor using the saved metadata arguments. Afterwards, we regenerate the `KernelExecutor` objects, which are normally built by calling `compileFusionParallel` outside the constructor. ### KernelArgumentHolder * A collection of `PolymorphicValue` objects representing Scalars [`int, double, bool, complex`], Cpu Scalars, and Gpu Tensors. * **Note:** Pointer address of meta aten tensors is zero. The pointer address is used to specify vectorization during schedule. -### FusionExecutor -* `FusionExecutor` defines two data structs: `ExecutorEntry` and `GlobalBufferInfo` +### KernelExecutor +* `KernelExecutor` defines two data structs: `ExecutorEntry` and `GlobalBufferInfo` * `ExecutorEntry` contains information to launch a kernel for a set of input arguments. It contains the launch parameters, output-to-input alias map, and global buffer configurations. * `GlobalBufferInfo` specifies the buffer's tensor properties [`shape, stride, dtype`] and its corresponding TensorView. 
diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 0cc499416b6..b21e4ea4f82 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -156,7 +156,7 @@ table Scalar { } // ===================================================================================== -// Tables for PolymorphicValue, ScalarCpu, TensorArg, KernelArgumentHolder used in FusionExecutor. +// Tables for PolymorphicValue, ScalarCpu, TensorArg, KernelArgumentHolder used in KernelExecutor. // The ScalarCpu is represented by a fixed size array of raw bytes. table ScalarCpu { @@ -188,7 +188,7 @@ table KernelArgumentHolder { // // ===================================================================================== -// Tables for LaunchParams, GlobalBufferInfo, ExecutorEntry, and TensorShape used in FusionExecutor +// Tables for LaunchParams, GlobalBufferInfo, ExecutorEntry, and TensorShape used in KernelExecutor // Data representing a tensor shape used in LaunchParam table TensorShape { @@ -355,7 +355,7 @@ table CudaKernel { } // Each Fusion Executor maps to a lowered and compiled kernel. -table FusionExecutor { +table KernelExecutor { device_smem_limit: long; block_size_high_water_mark: long; maxrregcount_high_water_mark: long; @@ -415,14 +415,14 @@ table SegmentedFusion { // Each FusionKernelRuntime represents a concretized, segmented Fusion. // We store the metadata for the original arguments to segment, schedule, and compile the Fusion at deserialization. -// Each fusion segment is given a FusionExecutor. +// Each fusion segment is given a KernelExecutor. // The unscheduled fusion is defined by traversing Trie in FusionCache. 
table FusionKernelRuntime { fusion_id: long; concrete_id: long; runtime_id: long; args: KernelArgumentHolder; - executors: [FusionExecutor]; + executors: [KernelExecutor]; segmented_fusion: SegmentedFusion; } diff --git a/csrc/serde/polymorphic_value.h b/csrc/serde/polymorphic_value.h index 6ca56a1c69a..5c0245303f9 100644 --- a/csrc/serde/polymorphic_value.h +++ b/csrc/serde/polymorphic_value.h @@ -21,7 +21,7 @@ namespace nvfuser::serde { //! PolymorphicValue table. This factory creates Bool, ComplexDouble, Double, //! Long, CPU Scalar, and CUDA Tensor objects. These arguments are stored in //! KernelArgumentHolder, which is used to schedule the fusion in -//! FusionKernelRuntime and to run a kernel in FusionExecutor. +//! FusionKernelRuntime and to run a kernel in KernelExecutor. class PolymorphicValueFactory : public Factory { public: diff --git a/csrc/transform_replay.cpp b/csrc/transform_replay.cpp index 093715f92a8..06e15929aa9 100644 --- a/csrc/transform_replay.cpp +++ b/csrc/transform_replay.cpp @@ -52,25 +52,20 @@ class ReplaySelf : public ReplayTransformations { loop_ids_.find(mapped) != loop_ids_.end(), "Transform traversal failed, modified a node but it was not a loop node."); - // outer loop size - Val* remainder = ceilDiv(mapped->extent(), s->factor()); - - // Manually replay the split, following the output of the operations. - // This is so rfactor ops are replayed correctly. - IterDomain* ido = IterDomainBuilder(s->outer()) - .start(s->container()->zeroVal()) - .extent(s->innerSplit() ? remainder : s->factor()) - .build(); - - // inner IterDomain - IterDomain* idi = IterDomainBuilder(s->inner()) - .start(s->container()->zeroVal()) - .extent(s->innerSplit() ? 
s->factor() : remainder) - .build(); - - // Generate the split node - IrBuilder::createInContainer( - s->container(), ido, idi, mapped, s->factor(), s->innerSplit()); + NVF_ERROR(s->outer()->isRFactorProduct() == s->inner()->isRFactorProduct()); + + // Due to rfactor transformations, the iter types of the outputs + // may not follow the default rule. For example, even if the input + // is a reduction iter domain, the outputs may not. To replay the + // original split expression, the output iter types need to be + // specified explicitly. + auto [ido, idi] = IterDomain::split( + mapped, + s->factor(), + s->innerSplit(), + s->outer()->isRFactorProduct(), + s->outer()->getIterType(), + s->inner()->getIterType()); // Remove mapped id from loop IDs loop_ids_.erase(mapped); @@ -107,16 +102,7 @@ class ReplaySelf : public ReplayTransformations { id_inner_mapped, " however one or both are not loop nodes."); - Val* merged_id_size = - mul(id_outer_mapped->extent(), id_inner_mapped->extent()); - - IterDomain* merged_id = IterDomainBuilder(m->out()) - .start(m->container()->zeroVal()) - .extent(merged_id_size) - .build(); - - IrBuilder::createInContainer( - m->container(), merged_id, id_outer_mapped, id_inner_mapped); + IterDomain* merged_id = IterDomain::merge(id_outer_mapped, id_inner_mapped); // Remove inputs from the loop IDs loop_ids_.erase(id_outer_mapped); diff --git a/csrc/transform_rfactor.cpp b/csrc/transform_rfactor.cpp index 311bec23796..07799487eb0 100644 --- a/csrc/transform_rfactor.cpp +++ b/csrc/transform_rfactor.cpp @@ -108,9 +108,6 @@ class ReplayRFactor : public ReplayTransformations { loop_ids_.find(mapped) != loop_ids_.end(), "Transform traversal failed, modified a node but it was not a loop node."); - // outer loop size - Val* remainder = ceilDiv(mapped->extent(), s->factor()); - // Check if we need to mark the outputs as an logical domain meaning this // transformation must be present in replays otherwise it breaks the compute // definition of the fusion. 
Iter domains are actually not static, its the @@ -119,32 +116,27 @@ class ReplayRFactor : public ReplayTransformations { bool static_logical_outputs = static_logical_ids_.count(s->outer()) || static_logical_ids_.count(s->inner()); - // Manually replay the split, making reduction = false and rfactor = true - // outer IterDomain - IterDomain* ido = - IterDomainBuilder( - s->container()->zeroVal(), - s->innerSplit() ? remainder : s->factor()) - .iter_type( - rfactor_axes_.count(s->outer()) ? IterType::Reduction - : IterType::Iteration) - .is_rfactor_domain(static_logical_outputs) - .build(); + // Let IterDomain::split determine the correct IterType, except + // when the output is a reduction domain but not part of the + // rfactored domains. If it isn't involved in the rfactor, it's no + // longer a redunction domain + std::optional outer_iter_type; + if (s->outer()->isReduction() && !rfactor_dep_ids_.count(s->outer())) { + outer_iter_type = IterType::Iteration; + } - // inner IterDomain - IterDomain* idi = - IterDomainBuilder( - s->container()->zeroVal(), - s->innerSplit() ? s->factor() : remainder) - .iter_type( - rfactor_axes_.count(s->inner()) ? 
IterType::Reduction - : IterType::Iteration) - .is_rfactor_domain(static_logical_outputs) - .build(); + std::optional inner_iter_type; + if (s->inner()->isReduction() && !rfactor_dep_ids_.count(s->inner())) { + inner_iter_type = IterType::Iteration; + } - // Generate the split node - IrBuilder::createInContainer( - s->container(), ido, idi, mapped, s->factor(), s->innerSplit()); + auto [ido, idi] = IterDomain::split( + mapped, + s->factor(), + s->innerSplit(), + static_logical_outputs, + outer_iter_type, + inner_iter_type); // Remove mapped id from loop IDs loop_ids_.erase(mapped); @@ -182,23 +174,20 @@ class ReplayRFactor : public ReplayTransformations { id_inner_mapped, " however one or both are not loop nodes."); - Val* merged_id_size = - mul(id_outer_mapped->extent(), id_inner_mapped->extent()); - - bool is_bcast = - id_outer_mapped->isBroadcast() && id_inner_mapped->isBroadcast(); - auto iter_type = rfactor_axes_.count(m->out()) - ? IterType::Reduction - : (is_bcast ? IterType::Broadcast : IterType::Iteration); - - IterDomain* merged_id = - IterDomainBuilder(m->container()->zeroVal(), merged_id_size) - .iter_type(iter_type) - .is_rfactor_domain(static_logical_ids_.count(m->out())) - .build(); + // Let IterDomain::merge determine the correct IterType, except + // when the output is a reduction domain but not part of the + // rfactored domains. 
If it isn't involved in the rfactor, it's no + // longer a redunction domain + std::optional iter_type; + if (m->out()->isReduction() && !rfactor_dep_ids_.count(m->out())) { + iter_type = IterType::Iteration; + } - IrBuilder::createInContainer( - m->container(), merged_id, id_outer_mapped, id_inner_mapped); + IterDomain* merged_id = IterDomain::merge( + id_outer_mapped, + id_inner_mapped, + static_logical_ids_.count(m->out()), + iter_type); // Remove inputs from the loop IDs loop_ids_.erase(id_outer_mapped); @@ -236,6 +225,9 @@ class ReplayRFactor : public ReplayTransformations { // The IterDomains in the original_domain that are being factored into the // first stage of the two stage reduction (the producer). std::unordered_set rfactor_axes_; + // All iter domains between the logical and the loop that the + // rfactor_axes_ depend on + std::unordered_set rfactor_dep_ids_; // Iter domains whose history cannot be changed as it would break rfactor // dependencies. std::unordered_set static_logical_ids_; @@ -262,6 +254,14 @@ class ReplayRFactor : public ReplayTransformations { rfactor_axes_(std::move(rfactor_axes)), static_logical_ids_(std::move(static_logical_ids)), logical_domain_(original_domain->logical()) { + const auto all_dep_vals = DependencyCheck::getAllValsBetween( + {original_domain->maybeRoot().begin(), + original_domain->maybeRoot().end()}, + {rfactor_axes_.begin(), rfactor_axes_.end()}); + + auto all_dep_ids = ir_utils::filterByType(all_dep_vals); + rfactor_dep_ids_.insert(all_dep_ids.begin(), all_dep_ids.end()); + setErrorOnFailure(false); } }; diff --git a/csrc/utils.h b/csrc/utils.h index f98d2e357a2..d831a6695a5 100644 --- a/csrc/utils.h +++ b/csrc/utils.h @@ -112,23 +112,23 @@ class PolymorphicBase { // (checked in DEBUG builds) template T* as() { -#ifdef NDEBUG +#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK) auto downcast_ptr = static_cast(this); #else auto downcast_ptr = dynamic_cast(this); NVF_ERROR(downcast_ptr != nullptr); 
-#endif +#endif // defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK) return downcast_ptr; } template const T* as() const { -#ifdef NDEBUG +#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK) auto downcast_ptr = static_cast(this); #else auto downcast_ptr = dynamic_cast(this); NVF_ERROR(downcast_ptr != nullptr); -#endif +#endif // defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK) return downcast_ptr; } diff --git a/doc/dev/python_scheduling/autotune_pointwise.py b/doc/dev/python_scheduling/autotune_pointwise.py index 5034ebfd5c6..014ae8197a2 100644 --- a/doc/dev/python_scheduling/autotune_pointwise.py +++ b/doc/dev/python_scheduling/autotune_pointwise.py @@ -89,7 +89,7 @@ def inner_fn(): if config is not None: vectorization_factor, unroll_factor = config schedule_params.vectorization_factor = vectorization_factor - schedule_params.unroll_factor = unroll_factor + schedule_params.unroll_factor_inner = unroll_factor # Schedule fusion fd.sched.schedule() diff --git a/examples/sinh_extension/main.cpp b/examples/sinh_extension/main.cpp index e44086dbbe3..c6dd6b7fe01 100644 --- a/examples/sinh_extension/main.cpp +++ b/examples/sinh_extension/main.cpp @@ -34,9 +34,9 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) { auto heuristic_params = SchedulerEntry::scheduleWith(&fusion, SchedulerType::PointWise, {input}); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, heuristic_params->lparams); - auto outputs = fe.runFusion({input}, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, {input}, heuristic_params->lparams); + auto outputs = ke.run({input}, heuristic_params->lparams); return outputs[0]; } diff --git a/examples/sinh_libtorch/main.cpp b/examples/sinh_libtorch/main.cpp index 8c83f6d0e23..12f58d08c33 100644 --- a/examples/sinh_libtorch/main.cpp +++ b/examples/sinh_libtorch/main.cpp @@ -31,9 +31,9 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) { auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, 
SchedulerType::PointWise, {input}); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, heuristic_params->lparams); - auto outputs = fe.runFusion({input}, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, {input}, heuristic_params->lparams); + auto outputs = ke.run({input}, heuristic_params->lparams); return outputs[0]; } diff --git a/nvfuser/contrib/nn/normalization.py b/nvfuser/contrib/nn/normalization.py index 4d05eb538ec..c01faf86cdb 100644 --- a/nvfuser/contrib/nn/normalization.py +++ b/nvfuser/contrib/nn/normalization.py @@ -401,12 +401,6 @@ def forward( tv_running_mean = partially_contig_tensor(fd, running_mean) tv_running_var = partially_contig_tensor(fd, running_var) inputs.extend([running_mean, running_var]) - if running_mean.dtype in [torch.half, torch.bfloat16]: - tv_running_mean = fd.ops.cast( - tv_running_mean, nvfuser.DataType.Float - ) - if running_var.dtype in [torch.half, torch.bfloat16]: - tv_running_var = fd.ops.cast(tv_running_var, nvfuser.DataType.Float) s_momentum = fd.define_scalar(nvfuser.DataType.Double) s_eps = fd.define_scalar(nvfuser.DataType.Double) diff --git a/setup.py b/setup.py index 55dcc041f92..4f3fd6c28fa 100644 --- a/setup.py +++ b/setup.py @@ -79,6 +79,7 @@ BUILD_WITH_ASAN = False BUILD_WITHOUT_DISTRIBUTED = False OVERWRITE_VERSION = False +EXPLICIT_ERROR_CHECK = False VERSION_TAG = None BUILD_TYPE = "Release" WHEEL_NAME = "nvfuser" @@ -107,6 +108,9 @@ if arg == "--build-with-ucc": BUILD_WITH_UCC = True continue + if arg == "--explicit-error-check": + EXPLICIT_ERROR_CHECK = True + continue if arg == "--build-with-asan": BUILD_WITH_ASAN = True continue @@ -330,6 +334,8 @@ def cmake(): ] if BUILD_WITH_UCC: cmd_str.append("-DNVFUSER_STANDALONE_BUILD_WITH_UCC=ON") + if EXPLICIT_ERROR_CHECK: + cmd_str.append("-DNVFUSER_EXPLICIT_ERROR_CHECK=ON") if not NO_NINJA: cmd_str.append("-G") cmd_str.append("Ninja") diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp index 68337688656..7c1b4df46af 
100644 --- a/tests/cpp/test_alias.cpp +++ b/tests/cpp/test_alias.cpp @@ -50,10 +50,11 @@ TEST_F(AliasTest, View) { TensorView* out = reshape(in, in_shape, out_shape); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -61,7 +62,8 @@ TEST_F(AliasTest, View) { EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); // Verify output values. - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, View_AliasForSameLayout) { @@ -80,13 +82,15 @@ TEST_F(AliasTest, View_AliasForSameLayout) { {in->axis(1), in->axis(2), in->axis(0)}, {true, false, false}); out->setAllocationDomain({out->axis(1), out->axis(0)}, false); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({60}).cuda().as_strided({2, 3, 4}, {2, 20, 5}); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); } @@ -105,12 +109,14 @@ TEST_F(AliasTest, View_AliasForCompliantLayout) { out->setAllocationDomain({out->axis(0), out->axis(1)}, {false, false}); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache 
executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); } @@ -131,10 +137,11 @@ TEST_F(AliasTest, View_NoAliasForIncompliantLayout) { // alias. out->setAllocationDomain({out->axis(1), out->axis(0)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -142,7 +149,8 @@ TEST_F(AliasTest, View_NoAliasForIncompliantLayout) { EXPECT_FALSE(out_tensor.is_alias_of(in_tensor)); // Verify output values. 
- testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, ViewPermute) { @@ -158,10 +166,11 @@ TEST_F(AliasTest, ViewPermute) { out = permute(out, {1, 0}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -169,7 +178,8 @@ TEST_F(AliasTest, ViewPermute) { EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); // Verify output values. - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, DuplicateOutputs) { @@ -185,10 +195,11 @@ TEST_F(AliasTest, DuplicateOutputs) { fusion->addOutput(out); fusion->addOutput(out); // duplicated outputs - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn(in_shape, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 2); at::Tensor out_tensor_0 = out_tensors[0]; at::Tensor out_tensor_1 = out_tensors[1]; @@ -196,12 +207,13 @@ TEST_F(AliasTest, DuplicateOutputs) { // Verify aliasing among duplicated outputs EXPECT_TRUE(out_tensor_0.is_alias_of(out_tensor_1)); // Verify no segmentation - EXPECT_FALSE(fec.getMostRecentKernelRuntime()->isSegmented()) + EXPECT_FALSE(executor_cache.getMostRecentKernelRuntime()->isSegmented()) << "segmentation is not 
supposed to happen"; at::Tensor expected_out_tensor = in_tensor.add(3.141); // Verify output values. - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, SliceToSizeOne_Issue1353) { @@ -213,14 +225,14 @@ TEST_F(AliasTest, SliceToSizeOne_Issue1353) { TensorView* out = slice(in, {0, 0, 0}, {4, 6, 1}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({4, 6, 7}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); EXPECT_THAT(out_tensor.strides(), ElementsAre(42, 7, _)); testValidate( - fec.fusion(), + executor_cache.fusion(), {in_tensor.slice(/*dim=*/2, /*start=*/c10::nullopt, /*end=*/1)}, {in_tensor}, __LINE__, @@ -236,14 +248,14 @@ TEST_F(AliasTest, SliceRightOfBroadcast) { TensorView* out = slice(in, {0, 0, 0}, {4, 1, 5}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({4, 1, 7}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); EXPECT_THAT(out_tensor.strides(), ElementsAre(7, _, 1)); testValidate( - fec.fusion(), + executor_cache.fusion(), {in_tensor.slice(/*dim=*/2, /*start=*/c10::nullopt, /*end=*/5)}, {in_tensor}, __LINE__, @@ -274,9 +286,10 @@ TEST_F(AliasTest, SliceViewPermute) { fusion->addOutput(split); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({batches, seq_length, features * 3}).cuda(); - std::vector 
out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); EXPECT_EQ(out_tensors.size(), 3); for (const auto& out_tensor : out_tensors) { @@ -292,7 +305,7 @@ TEST_F(AliasTest, SliceViewPermute) { } testValidate( - fec.fusion(), + executor_cache.fusion(), out_tensors, {in_tensor}, expected_out_tensors, @@ -317,11 +330,13 @@ TEST_F(AliasTest, DuplicateOutputsSegmentedFusion) { fusion->addOutput(out); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn(in_shape, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); // Verify aliasing among duplicated outputs EXPECT_TRUE(out_tensors[0].is_alias_of(out_tensors[1])); @@ -329,22 +344,26 @@ TEST_F(AliasTest, DuplicateOutputsSegmentedFusion) { // Verify segmentation EXPECT_EQ( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups().size(), 2) + executor_cache.getMostRecentKernelRuntime() + ->fusionSegments() + ->groups() + .size(), + 2) << "segmentation didn't happen as expected"; } namespace { // Returns the only executor in the most recent runtime. 
-const FusionExecutor& onlyExecutorInMostRecentRuntime( - const FusionExecutorCache& fec) { - const std::vector& executors = - fec.getMostRecentKernelRuntime()->executors(); +const KernelExecutor& onlyExecutorInMostRecentRuntime( + const FusionExecutorCache& executor_cache) { + const std::vector& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); return executors.front(); } -bool storesToOutput(const FusionExecutor& executor, const int64_t out_index) { +bool storesToOutput(const KernelExecutor& executor, const int64_t out_index) { // Get the variable name from the `kir::Kernel` not the input fusion, because // they are not always the same. std::string var_name = @@ -371,10 +390,12 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { fusion->addOutput(broadcast_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); @@ -385,7 +406,7 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { // that stores only to the output of the add. // // - broadcast & expand. This segment is meta-op only. 
- FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -394,17 +415,17 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { for (SegmentedGroup* group : runtime->fusionSegments()->groups()) { if (group->schedulerType() == SchedulerType::PointWise) { - const FusionExecutor& fe = runtime->executors().at(group->groupId()); + const KernelExecutor& ke = runtime->executors().at(group->groupId()); int num_stores = 0; for (auto i : c10::irange(group->outputs().size())) { - if (storesToOutput(fe, i)) { + if (storesToOutput(ke, i)) { num_stores++; } } EXPECT_EQ(num_stores, 1) << "The generated CUDA kernel is expected to store data to one output:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } } } @@ -427,13 +448,15 @@ TEST_F(AliasTest, NotAllOutputsAlias_Reduction) { fusion->addOutput(view_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({16 * 12 * 128 * 192}) .cuda() .as_strided({16, 12, 128, 192}, {128 * 12 * 192, 192, 12 * 192, 1}); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); EXPECT_TRUE(out_tensors[2].is_alias_of(in_tensor)); @@ -452,15 +475,17 @@ TEST_F(AliasTest, Issue1452) { fusion->addOutput(set_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({1024, 1024}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - 
testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor set_out_tensor = out_tensors[0]; EXPECT_TRUE(set_out_tensor.is_alias_of(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -469,17 +494,17 @@ TEST_F(AliasTest, Issue1452) { for (SegmentedGroup* group : runtime->fusionSegments()->groups()) { if (group->schedulerType() == SchedulerType::PointWise) { - const FusionExecutor& fe = runtime->executors().at(group->groupId()); + const KernelExecutor& ke = runtime->executors().at(group->groupId()); int num_stores = 0; for (auto i : c10::irange(group->outputs().size())) { - if (storesToOutput(fe, i)) { + if (storesToOutput(ke, i)) { num_stores++; } } EXPECT_EQ(num_stores, 1) << "The generated CUDA kernel is expected to store data to one output:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } } } @@ -495,20 +520,22 @@ TEST_F(AliasTest, AliasOutputBeforeNonAliasOutput) { fusion->addOutput(slice_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor slice_out_tensor = out_tensors[0]; EXPECT_TRUE(slice_out_tensor.is_alias_of(in_tensor)); - const FusionExecutor& fe = onlyExecutorInMostRecentRuntime(fec); - EXPECT_FALSE(storesToOutput(fe, 
/*out_index=*/0)) + const KernelExecutor& ke = onlyExecutorInMostRecentRuntime(executor_cache); + EXPECT_FALSE(storesToOutput(ke, /*out_index=*/0)) << "The generated CUDA kernel shouldn't store data to output 0:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } TEST_F(AliasTest, Set_NoAliasForIncompatibleLayout) { @@ -523,9 +550,10 @@ TEST_F(AliasTest, Set_NoAliasForIncompatibleLayout) { // I intentionally set the allocation order to be different to block aliasing. out->setAllocationDomain({out->axis(1), out->axis(2), out->axis(0)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -549,10 +577,11 @@ TEST_F(AliasTest, DuplicateOutputsComplex) { // duplicated output fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 4); // Verify aliases among outputs. @@ -561,7 +590,8 @@ TEST_F(AliasTest, DuplicateOutputsComplex) { EXPECT_TRUE(out_tensors[0].is_alias_of(out_tensors[3])); // Verify output values. 
- testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } // test verifying that duplicated input is not allowed in nvfuser @@ -593,10 +623,12 @@ TEST_F(AliasTest, AliasInSegment) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); } @@ -617,17 +649,20 @@ TEST_F(AliasTest, TrivialInputForwarding) { at::Tensor t0 = at::randn({10, 4}).cuda(); at::Tensor t1 = at::randn({10, 4}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - std::vector cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector cg_outputs = + executor_cache.runFusionWithInputs({t0, t1}); EXPECT_EQ(cg_outputs[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs, {t0, t1}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, {t0, t1}, __LINE__, __FILE__); // Second run to ensure cache hit handles trivial forwarding properly - EXPECT_TRUE(fec.isCompiled({t0, t1})); - auto cg_outputs2 = fec.runFusionWithInputs({t0, t1}); + EXPECT_TRUE(executor_cache.isCompiled({t0, t1})); + auto cg_outputs2 = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_EQ(cg_outputs2[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs2, {t0, t1}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs2, {t0, t1}, __LINE__, __FILE__); } TEST_F(AliasTest, 
TrivialInputForwarding_ScalarTensor) { @@ -640,16 +675,16 @@ TEST_F(AliasTest, TrivialInputForwarding_ScalarTensor) { at::Tensor t0 = at::randn({}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - auto cg_outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); EXPECT_EQ(cg_outputs[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__); // Second run to ensure cache hit handles trivial forwarding properly - EXPECT_TRUE(fec.isCompiled({t0})); - auto cg_outputs2 = fec.runFusionWithInputs({t0}); + EXPECT_TRUE(executor_cache.isCompiled({t0})); + auto cg_outputs2 = executor_cache.runFusionWithInputs({t0}); EXPECT_EQ(cg_outputs2[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs2, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs2, {t0}, __LINE__, __FILE__); } TEST_F(AliasTest, OutputAliasesAnotherOutput) { @@ -665,10 +700,12 @@ TEST_F(AliasTest, OutputAliasesAnotherOutput) { fusion->addOutput(reshape_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); ASSERT_EQ(out_tensors.size(), 2); EXPECT_TRUE(out_tensors[1].is_alias_of(out_tensors[0])); @@ -689,12 +726,14 @@ TEST_F(AliasTest, OutputNotAliasedByAnotherOutputShouldNotBeSegmented) { fusion->addOutput(reshape_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + 
FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_FALSE(runtime->isSegmented()); } @@ -716,10 +755,12 @@ TEST_F(AliasTest, ManyAliasesBetweenOutputs) { fusion->addOutput(permute_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); ASSERT_EQ(out_tensors.size(), 4); at::Tensor slice_out_tensor = out_tensors[0]; at::Tensor reshape_out_tensor = out_tensors[1]; @@ -732,7 +773,7 @@ TEST_F(AliasTest, ManyAliasesBetweenOutputs) { // Segment 1: in -> add_out // Segment 2: add_out -> its output aliases - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); } @@ -750,12 +791,14 @@ TEST_F(AliasTest, DoNotOverSegment_Straightline) { fusion->addOutput(permute_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = 
fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_FALSE(runtime->isSegmented()); // permute_out should be recognized as an alias of add_out. However, the @@ -781,12 +824,14 @@ TEST_F(AliasTest, DoNotOverSegment_WithForks) { fusion->addOutput(out1); fusion->addOutput(out2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::PointWise)).Times(1)); @@ -804,10 +849,11 @@ TEST_F(AliasTest, Broadcast) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } 
@@ -826,10 +872,11 @@ TEST_F(AliasTest, Expand) { broadcast_tv->axis(2)->extent()}); fusion->addOutput(expanded_tv); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -848,10 +895,11 @@ TEST_F(AliasTest, MergeTwoExpandedBroadcasts) { TensorView* out = reshape(in, {4, 5, 6}, {20, -1}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({1}).cuda().as_strided({4, 5, 6}, {0, 0, 0}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); // TODO(#1126): This should become an alias when #1126 is fixed. 
// EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); @@ -872,11 +920,12 @@ TEST_F(AliasTest, MergeBroadcastsBetweenConcretes) { out = reshape(out, {2, 15, 7}, {30, 7}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2 * 7}).cuda().as_strided({2, 3, 5, 7}, {7, 0, 0, 1}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, Squeeze) { @@ -888,10 +937,11 @@ TEST_F(AliasTest, Squeeze) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 1, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -906,10 +956,12 @@ TEST_F(AliasTest, SourceIsBothInputAndOutput) { fusion->addOutput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(in_tensor.data_ptr(), out_tensors[0].data_ptr()); 
EXPECT_EQ(in_tensor.data_ptr(), out_tensors[1].data_ptr()); @@ -929,12 +981,13 @@ TEST_F(AliasTest, SegmentBoundary) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -955,12 +1008,12 @@ TEST_F(AliasTest, ReuseBuffer) { auto tensor = at::randn({10}, options); auto expected_tensor = tensor + 1.0; - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({tensor}); EXPECT_TRUE(tensor.allclose(expected_tensor)); } -TEST_F(AliasTest, ReuseBuffer_FusionExecutor) { +TEST_F(AliasTest, ReuseBuffer_KernelExecutor) { Fusion fusion; FusionGuard fg(&fusion); TensorView* in = makeContigTensor(1); @@ -972,9 +1025,9 @@ TEST_F(AliasTest, ReuseBuffer_FusionExecutor) { auto tensor = at::randn({10}, options); auto expected_tensor = tensor + 1.0; - FusionExecutor fe; - fe.compileFusion(&fusion, {tensor}); - fe.runFusion({tensor}, {tensor}); + KernelExecutor ke; + ke.compile(&fusion, {tensor}); + ke.run({tensor}, {tensor}); EXPECT_TRUE(tensor.allclose(expected_tensor)); } @@ -1010,18 +1063,27 @@ TEST_F(AliasTest, ReuseBuffer_AliasAcrossSegments) { at::Tensor t1 = at::randn({65}, options); at::Tensor t2 = at::randn({128, 65}, options); - FusionExecutorCache fec(std::move(fusion)); + 
FusionExecutorCache executor_cache(std::move(fusion)); // Make a copy of `t0` because `t0` will be in-place updated. at::Tensor original_t0 = t0.clone(); - std::vector outputs = fec.runFusionWithInputs({t0, t1, t2}); + std::vector outputs = + executor_cache.runFusionWithInputs({t0, t1, t2}); testValidate( - fec.fusion(), outputs, {original_t0, t1, t2}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + {original_t0, t1, t2}, + __LINE__, + __FILE__); // https://github.com/NVIDIA/Fuser/pull/2999 will cause 3 segments instead of // the optimal 2 segments. Change back to 2 segments once // https://github.com/NVIDIA/Fuser/issues/3251 is resolved. EXPECT_EQ( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups().size(), 3) + executor_cache.getMostRecentKernelRuntime() + ->fusionSegments() + ->groups() + .size(), + 3) << "segmentation didn't happen as expected"; auto t3 = original_t0.add(1.0); @@ -1055,16 +1117,17 @@ TEST_F(AliasTest, AliasOnlyKernelsAreNotLaunched) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); if (ProfilerState::Running == FusionProfiler::state()) { FusionProfiler::stop(); } ProfilerOptionsGuard::getCurOptions().unset(ProfilerOption::Enable); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); const FusionProfile& profile = FusionProfiler::profile(); // Expect a kernel launched for one of the two segments but not the @@ -1094,13 +1157,14 @@ TEST_F(AliasTest, PerfDebugVerboseWhenSomeKernelsNotLaunched) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - 
FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1127,10 +1191,10 @@ TEST_F(AliasTest, NoKernelsAreLaunched) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - fec.runFusionWithInputs({in_tensor}); + executor_cache.runFusionWithInputs({in_tensor}); if (ProfilerState::Running == FusionProfiler::state()) { FusionProfiler::stop(); @@ -1146,8 +1210,8 @@ TEST_F(AliasTest, NoKernelsAreLaunched) { } // While most use cases go through FusionExecutorCache, nvFuser also supports -// evaluating an alias via FusionExecutor. -TEST_F(AliasTest, FusionExecutor) { +// evaluating an alias via KernelExecutor. +TEST_F(AliasTest, KernelExecutor) { Fusion fusion; FusionGuard fg(&fusion); @@ -1160,15 +1224,15 @@ TEST_F(AliasTest, FusionExecutor) { AliasAnalysisResult analysis = findAliases(&fusion); EXPECT_EQ(analysis.getRoot(out), in); - // Mark them alias so FusionExecutor::runFusion expression-evaluates the + // Mark them alias so KernelExecutor::run expression-evaluates the // output on the host instead of launching a CUDA kernel.
fusion.aliasOutputToInput(out, in, AllocationType::Evaluate); - FusionExecutor fe; + KernelExecutor ke; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({10, 10}, options); - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -1182,13 +1246,13 @@ TEST_F(AliasTest, InplaceUpdate) { fusion->addInput(out); fusion->aliasOutputToInput(out, in, AllocationType::ReuseBuffer); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); at::Tensor out_tensor = in_tensor + 1; - fec.runFusionWithInputs({in_tensor, out_tensor}); + executor_cache.runFusionWithInputs({in_tensor, out_tensor}); EXPECT_TRUE(out_tensor.equal(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre(HeuristicIs(SchedulerType::PointWise))); @@ -1209,10 +1273,12 @@ TEST_F(AliasTest, Bookend_SegmentSetPreservesAllocation) { permute_out->setAllocationDomain( {permute_out->axis(0), permute_out->axis(1)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({3, 2}).cuda().transpose(0, 1); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor permute_out_tensor = out_tensors[0]; EXPECT_TRUE(permute_out_tensor.is_alias_of(in_tensor)); @@ -1230,15 
+1296,17 @@ TEST_F(AliasTest, Bookend_InputsAndOutputs) { fusion->addOutput(permute_out); fusion->addOutput(compute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor permute_out_tensor = out_tensors[0]; EXPECT_TRUE(permute_out_tensor.is_alias_of(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); // MarkAliasesPrepare adds a `segment_set` between `in` and `permute`, which // leads to three segments: // 1. segment_set`, a no-op segment, @@ -1269,12 +1337,14 @@ TEST_F(AliasTest, Bookend_IntermediateTensors) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1303,15 +1373,17 @@ TEST_F(AliasTest, Bookend_AliasesOfSameTensor) { fusion->addOutput(out1); fusion->addOutput(out2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache 
executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[1].data_ptr()); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[2].data_ptr()); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::PointWise)).Times(1)); @@ -1338,14 +1410,16 @@ TEST_F(AliasTest, Bookend_ReuseSegmentSet) { fusion->addOutput(out0); fusion->addOutput(out1); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[1].data_ptr()); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1384,13 +1458,15 @@ TEST_F(AliasTest, QKVSplitBackprop) { fusion->addOutput(view_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector in_tensors; for (int i = 0; i < 3; i++) { in_tensors.push_back(at::randn({b, s, h * f}).cuda()); } - std::vector 
out_tensors = fec.runFusionWithInputs(in_tensors); - testValidate(fec.fusion(), out_tensors, in_tensors, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs(in_tensors); + testValidate( + executor_cache.fusion(), out_tensors, in_tensors, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[2].is_alias_of(out_tensors[1])); } @@ -1419,12 +1495,12 @@ TEST_F(AliasTest, Bookend_Issue2375) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(input_shape, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({t0}); - testValidate(fec.fusion(), out_tensors, {t0}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({t0}); + testValidate(executor_cache.fusion(), out_tensors, {t0}, __LINE__, __FILE__); EXPECT_THAT( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(), + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(), UnorderedElementsAre( HeuristicIs(SchedulerType::NoOp), HeuristicIs(SchedulerType::InnerPersistent))); @@ -1458,10 +1534,15 @@ TEST_F(AliasTest, Issue2664) { auto t2 = at::randn({}, options); auto aten_out = (t2 + 1.0) * t1; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({t1, t2}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({t1, t2}); testValidate( - fec.fusion(), out_tensors, {t1, t2}, {aten_out}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {t1, t2}, + {aten_out}, + __LINE__, + __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_alias_analysis.cpp b/tests/cpp/test_alias_analysis.cpp index ef72282f1b2..3b50a399b0b 100644 --- a/tests/cpp/test_alias_analysis.cpp +++ b/tests/cpp/test_alias_analysis.cpp @@ -182,11 +182,11 @@ TEST_F(AliasAnalysisTest, View_ForwardExpandedBroadcast) { 
EXPECT_EQ(analysis.getRoot(out), in); // Verify the last dimension isn't expanded physically. - FusionExecutor fe; + KernelExecutor ke; at::Tensor in_tensor = at::randn({4, 5}).cuda().as_strided({4, 5, 6}, {5, 1, 0}); - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_THAT(out_tensor.strides(), ElementsAre(1, 0)); } diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index 167374c3799..55ac0ee99d4 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -29,8 +29,7 @@ using ::testing::ElementsAre; // A global->shared->global copy kernel, shared memory allocated transposed to // avoid bank conflict. TEST_F(AllocationDomainTest, TransposedIntermediate) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigConcreteTensor({32, 32}); @@ -58,17 +57,16 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { at::Tensor t0 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } // A global->global copy kernel converting NCHW memory format into NHWC, with a // 4d allocation domain in output. 
TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -96,10 +94,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -109,8 +107,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 1d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -135,10 +132,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -148,8 +145,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 2d allocation domain in output. 
TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -175,10 +171,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -188,8 +184,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { // Reshape and transpose a 3d tensor into an NHWC tensor with a 3d allocation // domain in fusion output. TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -222,10 +217,10 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { at::Tensor t0 = at::randn({n1, n2, h * w * c}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -242,8 +237,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { // output. The allocation domain is on both the producer and the consumer side // of the rFactor domain. 
TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -282,10 +276,10 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { at::Tensor t0 = at::randn({n1, n2, c * h * w}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -301,8 +295,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { // A global->global copy kernel where both inputs and outputs are NHWC memory // format TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -338,15 +331,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -356,8 +349,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view the input as a 1d tensor. 
TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -397,15 +389,15 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -415,8 +407,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain of the output view the output as a 1d tensor. 
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -453,15 +444,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -471,8 +462,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view both the input and the output as a 1d tensors. 
TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -514,15 +504,15 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -533,8 +523,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { // allocation domain view the input as a 2d tensor of shape [N*H/8, 8*W*C], and // view the output as a 2d tensor of shape [N*H*W*C/4, 4] TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -582,15 +571,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -599,8 +588,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { // Similar to 
NHWC4d_To_NHWC4d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -647,15 +635,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -664,8 +652,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { // Similar to NHWC2d_To_NHWC2d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -724,15 +711,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -741,8 +728,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { // Similar to 
NHWC4d_To_NHWC4d, but does a cacheAfter TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -789,15 +775,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -808,8 +794,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { // allocation tensor to be between rFactor domain and loop domain, which is not // the case for NHWC2d_To_NHWC2d TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -860,15 +845,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "merging of discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -877,8 +862,7 @@ TEST_F(AllocationDomainTest, 
NHWC2d_To_NHWC2d_cacheAfter) { // Similar to NHWC4d_To_NHWC4d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -932,15 +916,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -949,8 +933,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { // Similar to NHWC2d_To_NHWC2d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -1022,15 +1005,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -1038,30 +1021,29 @@ TEST_F(AllocationDomainTest, 
NHWC2d_To_NHWC2d_cacheFork) { } TEST_F(AllocationDomainTest, VectorizationIssue902) { - auto fusion_ptr = std::make_unique(); - auto& fusion = *fusion_ptr; - FusionGuard fg(fusion_ptr.get()); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); const std::vector shape({16, 16, 512, 64}); auto tv0 = makeContigTensor(4); - fusion.addInput(tv0); + fusion->addInput(tv0); auto tv1 = set(tv0); - fusion.addOutput(tv1); + fusion->addOutput(tv1); - std::vector aloc_domain; - aloc_domain.push_back(tv1->axis(0)); - aloc_domain.push_back(tv1->axis(2)); - aloc_domain.push_back(tv1->axis(3)); - aloc_domain.push_back(tv1->axis(1)); - tv1->setAllocationDomain(aloc_domain, true); + std::vector alloc_domain; + alloc_domain.push_back(tv1->axis(0)); + alloc_domain.push_back(tv1->axis(2)); + alloc_domain.push_back(tv1->axis(3)); + alloc_domain.push_back(tv1->axis(1)); + tv1->setAllocationDomain(alloc_domain, true); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); @@ -1101,9 +1083,8 @@ TEST_F(AllocationDomainTest, TransposeMatrix) { } TEST_F(AllocationDomainTest, ContiguityIssue1021) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1119,17 +1100,16 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache 
executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForBroadcast) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1145,17 +1125,16 @@ TEST_F(AllocationDomainTest, ContiguityForBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({1, 1}, options).as_strided({1, 1}, {0, 3}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(3) @@ -1172,11 +1151,11 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4, 8}, options).as_strided({3, 8, 4}, {0, 1, 8}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), 
outputs, {t0}, __LINE__, __FILE__); } // Test that allocation domain can be used to vectorize overlapping tensors, @@ -1189,8 +1168,7 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { // automatically supports all kinds of use cases, even those that we don't have // an active plan to support on). TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(3); @@ -1225,9 +1203,9 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { at::Tensor t0 = at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -1250,14 +1228,14 @@ TEST_F(AllocationDomainTest, Issue1290_ContiguityWasMissing) { at::Tensor in_tensor = at::randn({2 * 4}).cuda().as_strided({2, 3}, {4, 1}); - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({in_tensor}); // The initial issue was detected in the pointwise scheduler, so I added these // checks to make sure it's a valid regression test. The transpose scheduler // could accept this but decided not to because of a small problem size. 
const std::vector& groups = - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); ASSERT_EQ(groups.size(), 1); SegmentedGroup* group = groups[0]; EXPECT_EQ(group->schedulerType(), SchedulerType::PointWise); @@ -1275,9 +1253,9 @@ TEST_F(AllocationDomainTest, Issue1290_ReplayCasPFailedDueToDifferentRanks) { out->cacheBefore(); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - FusionExecutor fe; - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + KernelExecutor ke; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_THAT(out_tensor.sizes(), ElementsAre(2)); } @@ -1311,8 +1289,8 @@ TEST_F(AllocationDomainTest, Issue1524) { {permute_out->axis(1), permute_out->axis(0)}, true); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({in_tensor}); } TEST_F(AllocationDomainTest, EmptyAllocationDomainApi) { diff --git a/tests/cpp/test_allocation_order_inference.cpp b/tests/cpp/test_allocation_order_inference.cpp index b936d2252d0..c24d679bfbb 100644 --- a/tests/cpp/test_allocation_order_inference.cpp +++ b/tests/cpp/test_allocation_order_inference.cpp @@ -315,9 +315,9 @@ TEST_F(AllocationOrderInferenceTest, EnableInRuntime) { at::Tensor in_tensor = at::randn({2, 4, 8, 8}, options); at::Tensor in_nhwc = in_tensor.as_strided({2, 4, 8, 8}, {4 * 8 * 8, 1, 4 * 8, 4}); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); - auto cg_outputs = fec.runFusionWithInputs({in_nhwc}); + auto cg_outputs = executor_cache.runFusionWithInputs({in_nhwc}); auto ref_out = in_nhwc.relu(); EXPECT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); diff --git 
a/tests/cpp/test_circular_buffering.cpp b/tests/cpp/test_circular_buffering.cpp index d607579196f..cb0d27eafd2 100644 --- a/tests/cpp/test_circular_buffering.cpp +++ b/tests/cpp/test_circular_buffering.cpp @@ -64,17 +64,17 @@ TEST_P(CircularBufferingTest, SingleDim1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 1, the axis_extent is I0/128. constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -112,17 +112,17 @@ TEST_P(CircularBufferingTest, SingleDim2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 1, the axis_extent is I0/128. constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -167,17 +167,17 @@ TEST_P(CircularBufferingTest, SingleDim3) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 2, the axis_extent is 128/32. 
constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -219,18 +219,18 @@ TEST_P(CircularBufferingTest, SingleDimUnswitch1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 3 is parallelized with TIDx, the axis // extent is 4. constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -271,18 +271,18 @@ TEST_P(CircularBufferingTest, SingleDimUnswitch2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 3 is parallelized with TIDx, the axis // extent is 4. 
constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -325,18 +325,18 @@ TEST_P(CircularBufferingTest, SingleDimUnroll) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({199}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 4 is parallelized with TIDx, the axis // extent is 2. constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -372,18 +372,18 @@ TEST_P(CircularBufferingTest, SingleDimVectorize) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 2 and axis 1 is parallelized with TIDx, the axis // extent is I0/128. 
constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -424,17 +424,17 @@ TEST_P(CircularBufferingTest, MultipleTensors) { auto t0 = at::randn({500}, options); auto t1 = at::randn({500}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); // Given computeAt axis 1, the axis extent is I0/32/4. constexpr int64_t axis_extent = 1; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); } @@ -475,19 +475,19 @@ TEST_P(CircularBufferingTest, NestedTensors) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1001}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 1 for tv2, the axis extent is I0/32/4 = 8. // Given computeAt axis 3 for tv3 and axis 3 is parallelized with TIDx, // the axis extent is 4. 
constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -569,16 +569,16 @@ TEST_P(CircularBufferingTest, SmemBlockGemmCache) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); // The smem cache write in this test case is redundant predicated, @@ -586,7 +586,7 @@ TEST_P(CircularBufferingTest, SmemBlockGemmCache) { // insertion to ensure ordering of circular buffered tensor access. // The check below makes sure that the sync is inserted so that the // test isn't running on a race condition. 
- NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count > 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count > 0); } // Vectorized reset test for circular buffered registers @@ -623,16 +623,16 @@ TEST_P(CircularBufferingTest, Vector) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum({0}); testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -678,14 +678,14 @@ TEST_P(CircularBufferingTest, CpAsync1) { at::Tensor t0 = at::randn({m, n}, options); at::Tensor t1 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0, t1})); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -731,14 +731,14 @@ TEST_P(CircularBufferingTest, CpAsync2) { at::Tensor t0 = at::randn({m, n}, options); at::Tensor t1 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0, t1})); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -794,9 +794,9 @@ 
TEST_P(CircularBufferingTest, NoSync) { }); NVF_ERROR(!sync_inserted, "Un-expected block sync inserted"); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -971,9 +971,9 @@ TEST_F(NVFuserTest, ElectSyncCompatibility) { // (threadIdx.x < 4) predicate. This thread predicate is incompatible with // circular buffering because we generate an ElectSync predicate that uses // a single thread. - FusionExecutor fe; + KernelExecutor ke; try { - fe.compileFusion(fusion.get(), {t0}); + ke.compile(fusion.get(), {t0}); } catch (const std::exception& e) { const char* reference = R"(This thread-parallelized TensorView T2_s_float[ iblockIdx.x15{( ceilDiv(( ceilDiv(( ceilDiv(( ( ( (( (( getMetaData(T0) )).logical_size ))[0] ) * ( (( (( getMetaData(T0) )).logical_size ))[1] ) ) * ( (( (( getMetaData(T0) )).logical_size ))[2] ) ), 256) ), 4) ), 2) )}, iS16{2}, ithreadIdx.x14{4}, iB12{256} ] ca_pos( 2 ) is incorrectly contained within a If-Then-Else with the ElectSync predicate.)"; @@ -1023,10 +1023,10 @@ TEST_P(TmaCircularBufferingTest, SingleDim) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1076,17 +1076,17 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnroll) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); int64_t axis_extent = ceilDiv(ceilDiv(tensor_inner_dim, bulk_inner_dim), unroll_dim); if 
(axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1136,17 +1136,17 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); int64_t axis_extent = ceilDiv(ceilDiv(tensor_inner_dim, bulk_inner_dim), unroll_dim); if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1206,10 +1206,10 @@ TEST_P(TmaCircularBufferingTest, MultiDim) { at::Tensor t0 = at::ones({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1268,10 +1268,10 @@ TEST_P(TmaCircularBufferingTest, Pointwise) { at::Tensor t1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t2 = t0 + t1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare(tensor_outer_dim, tensor_inner_dim, 
cg_outputs.front(), t2); testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); } @@ -1335,10 +1335,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) { at::Tensor t1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t2 = t0 + t1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t2); testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); } @@ -1393,10 +1393,10 @@ TEST_P(TmaCircularBufferingTest, Reduction) { at::Tensor t0 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t1 = sum(t0, {-1}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_outer_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1518,10 +1518,10 @@ TEST_P(TmaCircularBufferingTest, Persistent) { at::Tensor at_tv0 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor at_tv1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); - // Compile with FusionExecutor directly to avoid scheduling - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_tv0}); - std::vector cg_outputs = fe.runFusion({at_tv0}); + // Compile with KernelExecutor directly to avoid scheduling + KernelExecutor ke; + ke.compile(fusion.get(), {at_tv0}); + std::vector cg_outputs = ke.run({at_tv0}); std::tuple at_var_mean = at::var_mean(at_tv0, {-1}, correction, keepdim); @@ -1640,10 +1640,10 @@ TEST_P(TmaCircularBufferingTest, Matmul) { at::Tensor aten_output = (t0.unsqueeze(/*dim=*/-1) * t1.unsqueeze(/*dim=*/0)).sum(/*dim=*/1); - FusionExecutor fe; - 
fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare( tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), aten_output); testValidate( @@ -1754,10 +1754,10 @@ TEST_P(TmaCircularBufferingTest, MatmulWithBroadcastedInput) { at::Tensor t1 = at::randn({1, K, tensor_inner_dim}, options); at::Tensor aten_output = (t0 * t1).sum(/*dim=*/1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare( tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), aten_output); testValidate( diff --git a/tests/cpp/test_combined_inner_outer_reduction.cpp b/tests/cpp/test_combined_inner_outer_reduction.cpp index 95eaadd4ad7..f0a90168cc4 100644 --- a/tests/cpp/test_combined_inner_outer_reduction.cpp +++ b/tests/cpp/test_combined_inner_outer_reduction.cpp @@ -104,10 +104,10 @@ TEST_P(CombinedSchedulerTest, LayerNormBackward) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto aten_gradients = at::native_layer_norm_backward( aten_grad_out, @@ -120,7 +120,7 @@ TEST_P(CombinedSchedulerTest, LayerNormBackward) { {true, true, true}); testValidate( - fec.fusion(), + executor_cache.fusion(), {cg_outputs[0], cg_outputs[1], cg_outputs[2]}, aten_inputs, {std::get<0>(aten_gradients), @@ -261,7 +261,7 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = 
std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, @@ -269,7 +269,7 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto aten_gradients = at::native_layer_norm_backward( aten_grad_out.to(at::kDouble), @@ -287,7 +287,8 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { if (!link_inner_outer) { aten_out_linked = aten_out_linked.mul(0.5); } - bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + bool is_segmented = + executor_cache.getMostRecentKernelRuntime()->isSegmented(); NVF_CHECK(is_segmented, "Fusion is not segmented"); testValidate( @@ -443,7 +444,7 @@ TEST_F(CombinedSchedulerTest, SharedProducer) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, @@ -451,9 +452,9 @@ TEST_F(CombinedSchedulerTest, SharedProducer) { aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); switch (case_id) { case 0: case 1: @@ -634,9 +635,9 @@ TEST_F(CombinedSchedulerTest, CombinedReduction) { at::Tensor qv_cg_output = at::empty({dim1}, options); auto qv_aten_output = tv_input.to(at::kFloat).sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); - fe.runFusion( + KernelExecutor ke; + ke.compile(&fusion, {tv_input}, launch_constraints, compile_params); + ke.run( 
{tv_input}, {tv_cg_output, qv_cg_output}, launch_constraints, @@ -811,9 +812,9 @@ TEST_F(CombinedSchedulerTest, CombinedReductionMultiPerBlock) { at::Tensor qv_cg_output = at::empty({dim1}, options); at::Tensor tv_input2 = at::ones({dim0, dim1}, options); auto qv_aten_output = tv_input2.to(at::kFloat).sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); - fe.runFusion( + KernelExecutor ke; + ke.compile(&fusion, {tv_input}, launch_constraints, compile_params); + ke.run( {tv_input}, {tv_cg_output, qv_cg_output}, launch_constraints, @@ -850,10 +851,11 @@ TEST_F(CombinedSchedulerTest, InnerOuterMismatch) { at::Tensor t0 = at::randn({x, y, z}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + bool is_segmented = + executor_cache.getMostRecentKernelRuntime()->isSegmented(); if (outer_reduction_axis.size() == 2) { NVF_ERROR(!is_segmented, "Fusion should NOT be segmented!"); } else { @@ -980,8 +982,8 @@ TEST_F(CombinedSchedulerTest, SharedMemoryPersistentVectFactor) { heuristic_params->as()->smem_persistent_buffers = std::vector{tv1}; scheduler->schedule(&fusion, heuristic_params.get()); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); for (auto tv : fusion.allTvs()) { if (tv->getMemoryType() == MemoryType::Shared) { @@ -990,8 +992,8 @@ TEST_F(CombinedSchedulerTest, SharedMemoryPersistentVectFactor) { } } } - auto cg_outputs = fe.runFusion( - aten_inputs, heuristic_params->as()->lparams); + auto cg_outputs = + ke.run(aten_inputs, heuristic_params->as()->lparams); testValidate(&fusion_copy, cg_outputs, aten_inputs, __LINE__, __FILE__); } diff --git 
a/tests/cpp/test_dynamic_transform.cpp b/tests/cpp/test_dynamic_transform.cpp index ca29bf0825b..8eb468999b7 100644 --- a/tests/cpp/test_dynamic_transform.cpp +++ b/tests/cpp/test_dynamic_transform.cpp @@ -209,10 +209,10 @@ TEST_F(NVFuserTest, DynamicTransform3_CUDA) { at::Tensor t1 = at::randn(shape_after, options); std::vector inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__); } // Test multiple patterns of reshape @@ -777,13 +777,13 @@ void reductionDynamicViewAddFusion( : add(x_reshape, bias); fusion.addOutput(y); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); - size_t num_concretizations = fusion_executor_cache.countConcretizations(); + size_t num_concretizations = executor_cache.countConcretizations(); // Check that concretizations and runtimes are cache misses only when they // should be auto checkCache = [&](bool expect_miss) { - auto current = fusion_executor_cache.countConcretizations(); + auto current = executor_cache.countConcretizations(); ASSERT_EQ(current, num_concretizations + (size_t)expect_miss); num_concretizations = current; }; @@ -830,7 +830,7 @@ void reductionDynamicViewAddFusion( aten_inputs.emplace_back(output_shape[i]); } - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); checkCache(expect_miss); auto at_tv1 = (reshape_before_reduction) ? 
(at_x + at_bias) @@ -902,22 +902,22 @@ void reductionDynamicPadAddFusion( auto y = sum(x_pad, {kReductionAxis}); fusion.addOutput(y); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); // Check that concretizations and runtimes are cache misses only when they // should be - size_t num_concretizations = fusion_executor_cache.getKernelRuntimes().size(); -#define CHECK_CACHE(expect_miss, ...) \ - auto current = fusion_executor_cache.getKernelRuntimes().size(); \ - auto expected = num_concretizations + (size_t)expect_miss; \ - NVF_CHECK( \ - current == expected, \ - "Expected cache size ", \ - expected, \ - " but found ", \ - current, \ - ". ", \ - __VA_ARGS__); \ + size_t num_concretizations = executor_cache.getKernelRuntimes().size(); +#define CHECK_CACHE(expect_miss, ...) \ + auto current = executor_cache.getKernelRuntimes().size(); \ + auto expected = num_concretizations + (size_t)expect_miss; \ + NVF_CHECK( \ + current == expected, \ + "Expected cache size ", \ + expected, \ + " but found ", \ + current, \ + ". ", \ + __VA_ARGS__); \ num_concretizations = current; for (auto& inv : invocations) { @@ -943,7 +943,7 @@ void reductionDynamicPadAddFusion( aten_inputs.emplace_back(pad_widths[i]); } - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); CHECK_CACHE( expect_miss, "Input shape=", input_shape, " pad_widths=", pad_widths); @@ -1011,11 +1011,11 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { // concretized to Iteration, it does not wind up overwriting the Broadcast // logical. 
- FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); std::vector aten_inputs = {at0}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1037,13 +1037,13 @@ TEST_F(NVFuserTest, FusionDynamicEmptyCat1_CUDA) { fusion.addOutput(tv3); // Check correctness - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); at::Tensor at1 = at::randn({0}, options); at::Tensor at2 = at::randn({3}, options); std::vector aten_inputs = {at0, at1, at2}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1063,16 +1063,16 @@ TEST_F(NVFuserTest, FusionDynamicEmptyCat2_CUDA) { fusion.addOutput(tv2); // Check correctness - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); at::Tensor at1 = at::randn({0}, options); std::vector aten_inputs = {at0, at1}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); // Check that fusion consists only of tv2 = set(tv0) - auto fkr = fusion_executor_cache.getMostRecentKernelRuntime(); + auto fkr = 
executor_cache.getMostRecentKernelRuntime(); auto seg_fusion = fkr->fusionSegments(); auto output_def = seg_fusion->outputs()[0]->definition(); EXPECT_TRUE(output_def->isA()); @@ -1098,15 +1098,15 @@ TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) { fusion->addOutput(vm.mean); fusion->addOutput(vm.var); - FusionExecutorCache fusion_executor_cache(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({256, 128, 28, 28}, options); std::vector aten_inputs = {at0, 32}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate( - fusion_executor_cache.fusion(), outputs, aten_inputs, __LINE__, __FILE__); + executor_cache.fusion(), outputs, aten_inputs, __LINE__, __FILE__); } TEST_F(NVFuserTest, Issue249_CUDA) { @@ -1126,15 +1126,14 @@ TEST_F(NVFuserTest, Issue249_CUDA) { auto tv3 = add(tv2, tv2); fusion.addOutput(tv3); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({2, 3, 4, 5}, options); - auto outputs = fusion_executor_cache.runFusionWithInputs({at_x}); + auto outputs = executor_cache.runFusionWithInputs({at_x}); - testValidate( - fusion_executor_cache.fusion(), outputs, {at_x}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {at_x}, __LINE__, __FILE__); } // This is just like the test above, but uses an input scalar with value -1 @@ -1158,7 +1157,7 @@ TEST_F(NVFuserTest, Issue249InputNegative1_CUDA) { auto tv3 = add(tv2, tv2); fusion.addOutput(tv3); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 
0); at::Tensor at_x = at::randn({2, 3, 4, 5}, options); @@ -1166,18 +1165,13 @@ TEST_F(NVFuserTest, Issue249InputNegative1_CUDA) { // Dynamic reshape sizes that are not constant at definition must be explicit: // no -1 allowed EXPECT_THROW( - fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, -1}), - std::exception); + executor_cache.runFusionWithInputs({at_x, 2, 4, -1}), std::exception); // Passing explicit sizes works fine - auto outputs = fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, 15}); + auto outputs = executor_cache.runFusionWithInputs({at_x, 2, 4, 15}); testValidate( - fusion_executor_cache.fusion(), - outputs, - {at_x, 2, 4, 15}, - __LINE__, - __FILE__); + executor_cache.fusion(), outputs, {at_x, 2, 4, 15}, __LINE__, __FILE__); } // Test that OptOutMutator mutates expressions in a predictable way @@ -1215,10 +1209,10 @@ TEST_F(NVFuserTest, OptOutMutatorMutatedOutput) { inlineMost(); - FusionExecutor fe; - fe.compileFusion(fusion); + KernelExecutor ke; + ke.compile(fusion); - auto outputs = fe.runFusion({t0}); + auto outputs = ke.run({t0}); testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -1252,10 +1246,10 @@ TEST_F(NVFuserTest, OptOutMutatorRedefinedConstant) { inlineMost(); - FusionExecutor fe; - fe.compileFusion(fusion); + KernelExecutor ke; + ke.compile(fusion); - auto outputs = fe.runFusion({3L}); + auto outputs = ke.run({3L}); testValidate(fusion, outputs, {3L}, __LINE__, __FILE__); } @@ -1281,7 +1275,7 @@ TEST_F(NVFuserTest, SymbolicSqueeze) { tv1, std::vector({false, true})); // Squeeze second dimension fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3, 2}, options); @@ -1289,14 +1283,14 @@ TEST_F(NVFuserTest, SymbolicSqueeze) { // An invalid input has a second dimension that cannot be squeezed std::vector invalid_inputs = {t0, 2, 3}; - 
auto outputs = fec.runFusionWithInputs(valid_inputs); + auto outputs = executor_cache.runFusionWithInputs(valid_inputs); testValidate(fusion, outputs, valid_inputs, __LINE__, __FILE__); // An informative error message should be given by // SqueezeOp::checkConcretization EXPECT_THAT( - [&]() { fec.runFusionWithInputs(invalid_inputs); }, + [&]() { executor_cache.runFusionWithInputs(invalid_inputs); }, ::testing::ThrowsMessage(::testing::HasSubstr( " must concretize to IterType::Broadcast but found"))); } @@ -1325,7 +1319,7 @@ TEST_F(NVFuserTest, SymbolicExpand) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3, 2}, options); @@ -1333,13 +1327,14 @@ TEST_F(NVFuserTest, SymbolicExpand) { // An invalid input has a second dimension that cannot be expanded std::vector invalid_inputs = {t0, 2, 3, 2, 5}; - auto outputs = fec.runFusionWithInputs(valid_inputs); + auto outputs = executor_cache.runFusionWithInputs(valid_inputs); - testValidate(fec.fusion(), outputs, valid_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), outputs, valid_inputs, __LINE__, __FILE__); // An informative error message should be given during concretization EXPECT_THAT( - [&]() { fec.runFusionWithInputs(invalid_inputs); }, + [&]() { executor_cache.runFusionWithInputs(invalid_inputs); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Mismatch in sizes when concretizing expand."))); } @@ -1380,13 +1375,13 @@ TEST_F(NVFuserTest, ConcretizeConstantExtents) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4096, 12288}, options); std::vector inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto 
outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } @@ -1417,13 +1412,13 @@ TEST_F(NVFuserTest, DynamicSqueezeTrivialReduction) { auto tv2 = sum(tv1, {0, 2, 3, 4}); fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 9}, options); std::vector inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } @@ -1455,13 +1450,13 @@ TEST_F(NVFuserTest, DynamicSqueezeTrivialWelford) { fusion->addOutput(res.mean); fusion->addOutput(res.var); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 9}, options); std::vector inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } diff --git a/tests/cpp/test_external_src.cpp b/tests/cpp/test_external_src.cpp index f0623ade609..21d487f17b4 100644 --- a/tests/cpp/test_external_src.cpp +++ b/tests/cpp/test_external_src.cpp @@ -28,7 +28,7 @@ class ExternalSrcExample : public NVFuserTest {}; TEST_F(ExternalSrcExample, Reduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - FusionExecutor fe; + KernelExecutor ke; // By default, this env var should not be defined. 
To test using an // external source file, set it to the path to the external source @@ -44,7 +44,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { buffer << cuda_src.rdbuf(); std::string cuda_src_str = buffer.str(); - fe.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); + ke.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); // The following is a sample launch pattern of the compiled // kernel. It must be adapted for each particular source file. @@ -82,7 +82,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { clearL2Cache(); std::cout << "Launching the kernel" << std::endl; float elapsed_time_ms = - fe.runRtc(lp, {t0, t7, t14, t15, t16, t17}, PrimDataType::Int32); + ke.runRtc(lp, {t0, t7, t14, t15, t16, t17}, PrimDataType::Int32); std::cout << "kernel run in " << elapsed_time_ms << " ms, achieved " << (read_write_bytes / elapsed_time_ms / 1000.0 / 1000.0) << " GB/s" << std::endl; @@ -99,7 +99,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { TEST_F(ExternalSrcExample, Matmul_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - FusionExecutor fe; + KernelExecutor ke; // By default, this env var should not be defined. To test using an // external source file, set it to the path to the external source @@ -115,7 +115,7 @@ TEST_F(ExternalSrcExample, Matmul_CUDA) { buffer << cuda_src.rdbuf(); std::string cuda_src_str = buffer.str(); - fe.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); + ke.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); int M = 2048, N = 3456, K = 2048; MmaLayout layout = MmaLayout::TN; @@ -129,7 +129,7 @@ TEST_F(ExternalSrcExample, Matmul_CUDA) { auto output = at::zeros_like(at_output); clearL2Cache(); std::cout << "Launching the kernel" << std::endl; - float elapsed_time_ms = fe.runRtc( + float elapsed_time_ms = ke.runRtc( lp, {inputs.first, inputs.second, output}, PrimDataType::Int32); std::cout << "kernel run in " << elapsed_time_ms << " ms." 
<< std::endl; diff --git a/tests/cpp/test_gpu1.cpp b/tests/cpp/test_gpu1.cpp index f5ee88d0936..4ed46cddc46 100644 --- a/tests/cpp/test_gpu1.cpp +++ b/tests/cpp/test_gpu1.cpp @@ -207,9 +207,9 @@ TEST_F(NVFuserTest, FusionClear_CUDA) { at::Tensor input1 = at::randn({16, 8, 8}, options); at::Tensor input2 = at::randn_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -813,9 +813,9 @@ TEST_F(NVFuserTest, FusionOuterSplit_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({}); const auto& output = outputs.at(0); at::Tensor output_ref = at::ones_like(output, options); @@ -855,9 +855,9 @@ TEST_F(NVFuserTest, FusionCodeGen_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({}); const auto& output = outputs.at(0); at::Tensor output_ref = at::ones_like(output, options); @@ -899,9 +899,9 @@ TEST_F(NVFuserTest, FusionCodeGen2_CUDA) { at::Tensor input1 = at::randn({16, 8, 8}, options); at::Tensor input2 = at::randn_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -955,9 +955,9 @@ TEST_F(NVFuserTest, FusionSimplePWise_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = 
at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -1013,9 +1013,9 @@ TEST_F(NVFuserTest, FusionSimplePWiseDtypeComplex_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + static_cast>(scalar1); at::Tensor output_ref = input1 + tv2_ref; @@ -1063,9 +1063,9 @@ TEST_F(NVFuserTest, FusionExecKernel_CUDA) { at::Tensor input1 = at::ones({1, 128}, options); at::Tensor input2 = at::ones_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor check = at::full({1, 128}, 4, options); ; @@ -1145,9 +1145,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1199,9 +1199,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + 
ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -1253,9 +1253,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { at::Tensor cg_output = at::empty_like(t0, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -1317,9 +1317,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1353,9 +1353,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1388,9 +1388,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1449,9 +1449,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { std::vector aten_inputs = {t0, t2, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto 
cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1505,9 +1505,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { std::vector aten_inputs = {t0, t2, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1574,9 +1574,9 @@ TEST_F(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1644,9 +1644,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1719,9 +1719,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -1800,9 +1800,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - 
fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1864,9 +1864,9 @@ TEST_F(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1972,9 +1972,9 @@ TEST_F(NVFuserTest, FusionScalarInputs_CUDA) { at::Scalar(fl2), at::Scalar(fl3)}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -2024,9 +2024,9 @@ TEST_F(NVFuserTest, FusionLoopUnroll_CUDA) { at::Tensor input0 = at::randn({129, 13, 3}, options); at::Tensor input1 = at::randn({129, 13, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}); - auto outputs = fe.runFusion({input0, input1}); + KernelExecutor ke; + ke.compile(&fusion, {input0, input1}); + auto outputs = ke.run({input0, input1}); NVF_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); } @@ -2173,9 +2173,9 @@ void test_op( std::vector output_vect = {cg_output}; cudaDeviceSynchronize(); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs_ivalues); - fe.runFusion(aten_inputs_ivalues, output_vect); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs_ivalues); + ke.run(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); at::Tensor aten_output = af(aten_inputs); @@ -2710,17 +2710,17 @@ TEST_F(NVFuserTest, FusionFp8CastOps_CUDA) { // const at::ArrayRef 
input_ivalues(inputs); std::vector inputs = {input1}; - FusionExecutor fe; + KernelExecutor ke; if (!deviceMajorMinorCheck(9)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, inputs); }, + [&]() { ke.compile(&fusion, inputs); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: Fusion contains Float8_xxx values which was introduced in Hopper (9.0)"))); GTEST_SKIP() << "skipping tests on pre-HOPPER GPUs"; } else { - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); at::Tensor ref_output = input1.to(at_fp8_type).to(at_src_type); @@ -2790,9 +2790,9 @@ TEST_F(NVFuserTest, FusionReduction1_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -2862,9 +2862,9 @@ TEST_F(NVFuserTest, FusionReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -2913,9 +2913,9 @@ TEST_F(NVFuserTest, FusionReduction3_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); auto aten_output = 
aten_input.to(at::kDouble).sum({1}); @@ -2979,9 +2979,9 @@ TEST_F(NVFuserTest, FusionReduction4_CUDA) { at::Tensor t1 = at::randn({numel_x, numel_y}, options); at::Tensor t4 = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t4}); - auto cg_outputs = fe.runFusion({t0, t1, t4}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t4}); + auto cg_outputs = ke.run({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); @@ -3033,9 +3033,9 @@ TEST_F(NVFuserTest, FusionReduction5_CUDA) { at::Tensor cg_output = at::empty({bidy, tidx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -3098,9 +3098,9 @@ TEST_F(NVFuserTest, FusionReduction6_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1, 2}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -3130,9 +3130,9 @@ TEST_F(NVFuserTest, FusionMultiGridReduction_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -3154,9 +3154,9 @@ TEST_F(NVFuserTest, FusionMultiGridReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 
0); at::Tensor input = at::randn({4, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_output = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_output = ke.run({input}); testValidate(&fusion, cg_output, {input}, __LINE__, __FILE__); } @@ -3207,9 +3207,9 @@ TEST_F(NVFuserTest, FusionReductionTFT_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -3271,9 +3271,9 @@ TEST_F(NVFuserTest, FusionReductionOuterSplit_CUDA) { at::Tensor t1 = at::randn({numel_x, numel_y}, options); at::Tensor t4 = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t4}); - auto cg_outputs = fe.runFusion({t0, t1, t4}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t4}); + auto cg_outputs = ke.run({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); @@ -3310,7 +3310,7 @@ TEST_F(NVFuserTest, FusionBranches_CUDA) { at::Tensor t1 = at::randn({x, y}, options); at::Tensor t2 = at::randn({x, y}, options); - FusionExecutor fe; + KernelExecutor ke; tv6->merge(0); tv6->split(0, 128); tv6->split(0, 4); @@ -3331,8 +3331,8 @@ TEST_F(NVFuserTest, FusionBranches_CUDA) { std::vector aten_inputs = {t0, t1, t2}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3377,9 +3377,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast1_CUDA) { std::vector aten_inputs = {t0, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + 
KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3429,9 +3429,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast2_CUDA) { std::vector aten_inputs = {t0, t1, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3471,9 +3471,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast3_CUDA) { std::vector aten_inputs = {t0, t2}; at::Tensor cg_output = at::empty({x, y, z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3516,9 +3516,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast4_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3556,9 +3556,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3608,9 +3608,9 @@ TEST_F(NVFuserTest, FusionComplexBCast1_CUDA) { std::vector aten_inputs = {t0, t3, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + 
auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3652,9 +3652,9 @@ TEST_F(NVFuserTest, FusionComplexBCast2_CUDA) { at::Tensor t0 = at::randn({y, z}, options); at::Tensor t4 = at::randn({x, y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t4}); - auto cg_outputs = fe.runFusion({t0, t4}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t4}); + auto cg_outputs = ke.run({t0, t4}); testValidate(&fusion, {cg_outputs}, {t0, t4}, __LINE__, __FILE__); } @@ -3726,18 +3726,18 @@ TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) { at::Tensor t0 = at::randn({M, K}, options); at::Tensor t1 = at::randn({K, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + ke.run({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in. 
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); + // ASSERT_ANY_THROW(ke.run({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble)); @@ -3791,9 +3791,9 @@ TEST_F(NVFuserTest, FusionSoftmax1D_CUDA) { at::Tensor cg_output = at::empty({dimx}, options); at::Tensor t3_output = at::empty_like(cg_output, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - fe.runFusion({t0}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + ke.run({t0}, {cg_output}); auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); @@ -3860,9 +3860,9 @@ TEST_F(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { at::Tensor input = at::randn({dimx}, options); at::Tensor t3_output = at::empty({dimx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -3920,9 +3920,9 @@ TEST_F(NVFuserTest, FusionSoftmax3D_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -3995,9 +3995,9 @@ TEST_F(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { at::Tensor input = at::randn({dimx, dimy, dimz}, options); at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = 
ke.run({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -4081,9 +4081,9 @@ TEST_F(NVFuserTest, FusionGridReduction1_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4141,9 +4141,9 @@ TEST_F(NVFuserTest, FusionGridReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4203,9 +4203,9 @@ TEST_F(NVFuserTest, FusionGridReduction3dim1_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -4262,9 +4262,9 @@ TEST_F(NVFuserTest, FusionGridReduction3dim0_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -4328,9 +4328,9 @@ TEST_F(NVFuserTest, FusionGridReduction4_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = 
at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -4385,9 +4385,9 @@ TEST_F(NVFuserTest, FusionGridReduction5_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -4450,9 +4450,9 @@ TEST_F(NVFuserTest, FusionGridReduction6_CUDA) { at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1, 2}); @@ -4482,9 +4482,9 @@ TEST_F(NVFuserTest, FusionGridReduction7_CUDA) { at::Tensor input = at::randn({numel_x}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -4508,9 +4508,9 @@ TEST_F(NVFuserTest, FusionGridReduction8_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); 
+ auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -4545,9 +4545,9 @@ TEST_F(NVFuserTest, FusionGridReduction9_CUDA) { std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_output = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_output = ke.run(aten_inputs); testValidate(&fusion, cg_output, {t0, t2}, __LINE__, __FILE__); } @@ -4586,9 +4586,9 @@ TEST_F(NVFuserTest, FusionGridReduction10_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_output = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_output = ke.run({t0}); testValidate(&fusion, cg_output, {t0}, __LINE__, __FILE__); } @@ -4616,9 +4616,9 @@ TEST_F(NVFuserTest, FusionNonRedAxisBind_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({16, bid_x * tid_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -4666,9 +4666,9 @@ TEST_F(NVFuserTest, FusionSplitBCast_CUDA) { at::Tensor t1 = at::randn({32, 32, 128}, options); at::Tensor cg_output = at::empty({32, 32, 128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - fe.runFusion({t0, t1}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + ke.run({t0, t1}, {cg_output}); } TEST_F(NVFuserTest, FusionBCastInnerDim_CUDA) { @@ -4747,9 +4747,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, 
options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4778,9 +4778,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -4808,9 +4808,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({dimx, dimy}, options); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4831,9 +4831,9 @@ TEST_F(NVFuserTest, FusionZeroDimComputeAt_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4866,9 +4866,9 @@ TEST_F(NVFuserTest, FusionZeroDimBroadcast_CUDA) { std::vector aten_inputs = {t0, t1}; at::Tensor cg_output = at::empty({}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, 
aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -4901,9 +4901,9 @@ TEST_F(NVFuserTest, FusionZeroDimReduction_CUDA) { at::Tensor cg_output = at::empty({}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); @@ -4953,9 +4953,9 @@ TEST_F(NVFuserTest, FusionBCastAfterReduce_CUDA) { auto aten_output = t3.add(t4); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t4}); - auto cg_outputs = fe.runFusion({t0, t4}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t4}); + auto cg_outputs = ke.run({t0, t4}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -4977,9 +4977,9 @@ TEST_F(NVFuserTest, FusionOutputBroadcast_CUDA) { at::Tensor aten_input = at::randn({2, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5000,9 +5000,9 @@ TEST_F(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5076,9 +5076,9 @@ TEST_F(NVFuserTest, FusionSumTo_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = 
fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); NVF_CHECK( cg_outputs[0].dim() == static_cast(sum_to_shape.size()), @@ -5118,9 +5118,9 @@ TEST_F(NVFuserTest, FusionSumToNoop_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); NVF_CHECK( cg_outputs[0].dim() == static_cast(sum_to_shape.size()), @@ -5265,9 +5265,9 @@ TEST_F(NVFuserTest, FusionSymbolicReduction_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}, lparams); - auto cg_outputs = fe.runFusion({aten_input}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}, lparams); + auto cg_outputs = ke.run({aten_input}, lparams); testValidate( &fusion, @@ -5307,11 +5307,9 @@ TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::Reduction, {aten_input}); - FusionExecutor fusion_executor; - fusion_executor.compileFusion( - &fusion, {aten_input}, heuristic_params->lparams); - fusion_executor.runFusion( - {aten_input}, {cg_output}, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}, heuristic_params->lparams); + ke.run({aten_input}, {cg_output}, heuristic_params->lparams); testValidate( &fusion, @@ -5538,9 +5536,9 @@ TEST_F(NVFuserTest, FusionCacheBefore_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); 
testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5574,9 +5572,9 @@ TEST_F(NVFuserTest, FusionCacheAfter_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5616,9 +5614,9 @@ TEST_F(NVFuserTest, FusionCacheFork_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5663,9 +5661,9 @@ TEST_F(NVFuserTest, FusionCacheIndirect_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5719,9 +5717,9 @@ TEST_F(NVFuserTest, FusionCacheBcast_CUDA) { at::Tensor t1 = at::randn({N}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5756,9 +5754,9 @@ TEST_F(NVFuserTest, FusionCacheMultiConsumer_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor 
aten_input = at::randn({N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5808,13 +5806,13 @@ TEST_F(NVFuserTest, FusionSmem_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { @@ -5856,13 +5854,13 @@ TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { at::Tensor aten_input = at::randn({M, K, N}, options); at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1}); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { @@ -5926,14 +5924,14 @@ TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { std::vector aten_inputs = {t0, t1}; at::Tensor aten_output = at::matmul(t0.to(at::kDouble), t1.to(at::kDouble)); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - 
NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { @@ -6015,14 +6013,14 @@ TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { @@ -6087,9 +6085,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { at::Tensor aten_input = at::randn({dimx, dimy}, options); auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input, 128}); - auto cg_outputs = fe.runFusion({aten_input, 128}); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, {aten_input, 128}); + auto cg_outputs = ke.run({aten_input, 128}); testValidate( &fusion, @@ -6265,10 +6263,10 @@ TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -6321,10 +6319,10 @@ TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormBackward_CUDA) { auto var = at::mul(sum, 1.0 / NORM_SIZE); auto 
aten_rstd = at::pow(at::add(var, kEps), -0.5); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_rstd, aten_weight}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto in_mul_rstd = at::mul(aten_input, aten_rstd); auto grad_out_mul = at::mul(aten_grad_out, in_mul_rstd); @@ -6383,9 +6381,9 @@ TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { // tv11 and tv17 should not be predicated. See issue #496 ASSERT_FALSE(PredicatedChecker::isPredicated( - 11, cg_results.fusion_executor->kernel())); + 11, cg_results.kernel_executor->kernel())); ASSERT_FALSE(PredicatedChecker::isPredicated( - 17, cg_results.fusion_executor->kernel())); + 17, cg_results.kernel_executor->kernel())); } TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormalization_CUDA) { @@ -6842,10 +6840,9 @@ TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalShared_CUDA) { at::Tensor aten_dynamic_out = aten_output.narrow(1, static_size, dimy - static_size); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_static_in, aten_dynamic_in}); - fe.runFusion( - {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, {aten_static_in, aten_dynamic_in}); + ke.run({aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); testValidate( &fusion, @@ -7031,10 +7028,10 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { std::vector aten_inputs = { aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); + ke.run(aten_inputs, {cg_static_out, cg_dynamic_out}); auto at_mu = at::mean(aten_input.to(at::kDouble), 
-1).unsqueeze(1); auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1); @@ -7155,9 +7152,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { std::vector aten_inputs = { aten_input, kGamma, kBeta, kEps, dimy, TIDX}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -7201,9 +7198,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}, lparams); - auto cg_outputs = fe.runFusion({aten_input}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}, lparams); + auto cg_outputs = ke.run({aten_input}, lparams); testValidate( &fusion, @@ -7214,7 +7211,7 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { __FILE__, "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { @@ -7264,9 +7261,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); - auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); + auto cg_outputs = ke.run({aten_input, runtime_threadIdx_dim}, lparams); testValidate( &fusion, @@ -7278,7 +7275,7 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + 
NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { @@ -7328,14 +7325,14 @@ TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { LaunchParams lparams(-1, -1, -1, BSX, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, __LINE__, __FILE__, "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 1); } TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { @@ -7453,15 +7450,15 @@ TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - FusionExecutor fe; + KernelExecutor ke; // Generate CUDA and compile with nvRTC - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 1); } } // namespace nvfuser diff --git a/tests/cpp/test_gpu2.cpp b/tests/cpp/test_gpu2.cpp index 5dae411ee12..2426d801176 100644 --- a/tests/cpp/test_gpu2.cpp +++ b/tests/cpp/test_gpu2.cpp @@ -94,9 +94,9 @@ TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, lparams); - auto cg_outputs = fe.runFusion({input}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {input}, lparams); + auto cg_outputs = ke.run({input}, lparams); auto aten_output = 
input.to(at::kDouble).sum({1}); testValidate( @@ -141,9 +141,9 @@ TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t2, t3}); - auto cg_outputs = fe.runFusion({t0, t1, t2, t3}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t2, t3}); + auto cg_outputs = ke.run({t0, t1, t2, t3}); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -199,9 +199,9 @@ TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) { tv1->computeAt(tv2_rf, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = (input + 0).to(at::kDouble).sum(1); @@ -276,9 +276,9 @@ TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { auto t3 = t1 + 3; auto t4 = t3 + 4; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -310,9 +310,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -347,9 +347,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); 
testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -399,9 +399,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -443,9 +443,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) { at::empty_like(t0, options), at::empty_like(t0, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, cg_outputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -476,9 +476,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); auto t1 = aten_input + 1; auto t2 = t1 + 2; @@ -518,9 +518,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -558,9 +558,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, 
{cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -619,9 +619,9 @@ TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty({numel_x}, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); @@ -739,10 +739,10 @@ TEST_F(NVFuserTest, FusionReduceSingle_CUDA) { at::Tensor aten_input = at::randn({100, 1}, options); // Grab only tensor views, though there shouldn't be any other type - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -871,9 +871,9 @@ TEST_F(NVFuserTest, FusionTrivialReduction_CUDA) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 20, 1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1281,9 +1281,9 @@ TEST_F(NVFuserTest, FusionIssue459_CUDA) { std::vector aten_inputs = {t0, t1}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1311,9 +1311,9 @@ TEST_F(NVFuserTest, 
FusionSmemIndexingSimple_CUDA) { auto aten_input = at::randn({12, 34}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1422,9 +1422,9 @@ TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) { // A, B, m_tile_dim, split_k, intra_cta_tile std::vector aten_inputs = {t0, t1, 3, 4, 5}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -1457,9 +1457,9 @@ TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -1494,9 +1494,9 @@ TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1600,9 +1600,9 @@ TEST_F(NVFuserTest, FusionIssue367_CUDA) { at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + 
nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -1626,9 +1626,9 @@ TEST_F(NVFuserTest, FusionIssue468_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 100}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1678,9 +1678,9 @@ TEST_F(NVFuserTest, FusionIssue363_CUDA) { std::vector aten_inputs = {t0, t1}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1704,9 +1704,9 @@ TEST_F(NVFuserTest, FusionIssue484_CUDA) { at::Tensor aten_input = at::randn({M, M}, options); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1730,9 +1730,9 @@ TEST_F(NVFuserTest, FusionIssue329_CUDA) { std::vector t0_shape{17, 19}; auto aten_input = at::randn(t0_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1771,9 +1771,9 @@ TEST_F(NVFuserTest, FusionIssue382_CUDA) { std::vector aten_inputs = 
{t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1800,9 +1800,9 @@ TEST_F(NVFuserTest, FusionIssue507_CUDA) { std::vector t0_shape{17, 19}; auto aten_input = at::randn(t0_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1838,9 +1838,9 @@ TEST_F(NVFuserTest, FusionIssue532_CUDA) { at::Tensor t0 = at::randn({M}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1867,9 +1867,9 @@ TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) { at::Tensor t0 = at::randn({M}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1945,17 +1945,17 @@ TEST_F(NVFuserTest, FusionIssue549_CUDA) { // Lets specify a few bounds in launch params to make sure it works LaunchParams lparams(1, -1, -1, 32, 4, 4); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, lparams); - fe.runFusion({t0, t1}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}, lparams); + ke.run({t0, t1}, lparams); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization 
validation in. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); + // ASSERT_ANY_THROW(ke.run({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble)); @@ -1964,7 +1964,7 @@ TEST_F(NVFuserTest, FusionIssue549_CUDA) { } TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) { - FusionExecutor fe; + KernelExecutor ke; std::string kernel = R"( __global__ void kernel1(Tensor T0, Tensor T1) { if(threadIdx.x==0){ @@ -1974,7 +1974,7 @@ __global__ void kernel1(Tensor T0, Tensor T1) { } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 256, // gdimx 1, // gdimy @@ -1989,14 +1989,14 @@ __global__ void kernel1(Tensor T0, Tensor T1) { const std::vector tensor_dims = {8}; auto in0 = at::randn(tensor_dims, options); auto out0 = at::empty_like(in0); - fe.runRtc(lp, {in0, out0}, PrimDataType::Int); + ke.runRtc(lp, {in0, out0}, PrimDataType::Int); auto out_ref = in0 * 2; NVF_CHECK(out_ref.allclose(out0)); } TEST_F(NVFuserTest, FusionSerialWelford_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 128, y = 64, z = 64; std::string kernel = R"( @@ -2030,7 +2030,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2046,14 +2046,14 @@ __global__ void kernel1( auto in0 = at::randn(tensor_dims, options); auto out_var = at::empty({x}, options); auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_var, out_avg}, PrimDataType::Int); + ke.runRtc(lp, {in0, out_var, out_avg}, PrimDataType::Int); NVF_CHECK(in0.var({1, 2}, false).allclose(out_var));
NVF_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST_F(NVFuserTest, FusionBlockWelford_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 7, y = 8, z = 9; std::string kernel = R"( @@ -2102,7 +2102,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2129,7 +2129,7 @@ __global__ void kernel1( // run kernel auto out_var = at::zeros({x}, options); auto out_avg = at::zeros({x}, options); - fe.runRtc( + ke.runRtc( lp, {in0, out_avg, out_var, init_avg, init_var, init_N}, PrimDataType::Int); @@ -2142,7 +2142,7 @@ __global__ void kernel1( } TEST_F(NVFuserTest, FusionBlockWelfordNoInit_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 7, y = 8, z = 9; // need support IValue for integer input as initial count @@ -2183,7 +2183,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2199,14 +2199,14 @@ __global__ void kernel1( auto in0 = at::randn(tensor_dims, options); auto out_var = at::empty({x}, options); auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_avg, out_var}, PrimDataType::Int); + ke.runRtc(lp, {in0, out_avg, out_var}, PrimDataType::Int); NVF_CHECK(in0.var({1, 2}, false).allclose(out_var)); NVF_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST_F(NVFuserTest, FusionGridWelfordNoInit_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 128, y = 64, z = 128; std::string kernel = R"( @@ -2258,7 +2258,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( x, // gdimx y, // gdimy @@ -2282,7 +2282,7 @@ __global__ void kernel1( auto work_buf_var = 
at::empty({x * y * z}, options); auto work_buf_N = at::empty({x * y * z}, options_int); auto sync_flag = at::zeros({1}, options_int); - fe.runRtc( + ke.runRtc( lp, {in0, out_avg, @@ -2325,15 +2325,15 @@ TEST_F(NVFuserTest, FusionWelfordOp_CUDA) { auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2370,15 +2370,15 @@ TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) { at::Tensor t_avg = at::empty({M}, options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2415,15 +2415,15 @@ TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) { at::Tensor t_var = at::empty({M}, options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2459,15 +2459,15 @@ TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) { at::Tensor t_var = at::empty({M}, 
options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2588,17 +2588,17 @@ TEST_P(WelfordReduction, Test) { auto lparams = reduction_params->lparams; auto cparams = reduction_params->cparams; - FusionExecutor fe; + KernelExecutor ke; // Needs to pass compile para to use the correct index type, otherwise the // lowering pass will use int64 as the index tpye, since this test saves // `tv_N` as index type, it may cause vectorization size validation error. For // example, the heuristics set index type to int32 and the max vectorization - // factor is 4, if compile para is not passed to compileFusion, the lowering + // factor is 4, if compile para is not passed to compile, the lowering // pass uses int64 as index type, so the max vectorization factor is 16 bytes // sizeof(int64) = 2, which is wrong since the actual index type is int32 // and the max vectorization factor is 4. 
- fe.compileFusion(&fusion, {aten_input}, lparams, cparams); - auto outputs = fe.runFusion({aten_input}, lparams); + ke.compile(&fusion, {aten_input}, lparams, cparams); + auto outputs = ke.run({aten_input}, lparams); // by default Welford outputs sum of square diff so need to divide to // get var @@ -2613,7 +2613,7 @@ TEST_P(WelfordReduction, Test) { at_n = at_n.sum({axis}); testValidate( - fe.kernel(), + ke.kernel(), outputs, {aten_input}, {at_avg, at_var, at_n}, @@ -2755,12 +2755,12 @@ TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { // Lets specify a few bounds in launch params to make sure it works LaunchParams lparams(1, -1, -1, 32, 4, 4); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, lparams); - fe.runFusion({t0, t1}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}, lparams); + ke.run({t0, t1}, lparams); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble)); @@ -2820,9 +2820,9 @@ TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_input_t = at::transpose(input, 1, 2); auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false); @@ -2894,9 +2894,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { at::Tensor aten_input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); at::Tensor aten_input_t = aten_input.t(); @@ -2963,9 +2963,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto input_t = input.t(); auto t1 = input_t.mul({-1.0}); @@ -3029,9 +3029,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); auto t1_t = t1.permute({3, 0, 1, 2}); @@ -3107,9 +3107,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); auto t1_t = t1.permute({3, 0, 1, 2}); @@ -3155,9 +3155,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t2 = t0.t().add(2.0); auto aten_output = t1.t().mul(t2); @@ -3197,9 +3197,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t2 = t0.t().add(2.0); auto aten_output = t1.t().mul(t2); @@ -3348,9 +3348,9 @@ 
TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) { at::Tensor aten_input = at::empty({2, 6, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); at::Tensor aten_output = aten_input.sin(); @@ -3423,9 +3423,9 @@ TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -3503,9 +3503,9 @@ TEST_F(NVFuserTest, FusionGridPersistence_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -3536,9 +3536,9 @@ TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -3570,9 +3570,9 @@ TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + 
KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) .unsqueeze(-1) @@ -3610,9 +3610,9 @@ TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) .unsqueeze(0) @@ -3648,9 +3648,9 @@ TEST_F(NVFuserTest, FusionIssue633_CUDA) { at::Tensor t1 = at::randn({dx, dy, 1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3681,9 +3681,9 @@ TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { at::Tensor t1 = at::randn(shape, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3730,9 +3730,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3786,9 +3786,9 @@ TEST_F(NVFuserTest, 
FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3847,9 +3847,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3908,11 +3908,11 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; + KernelExecutor ke; // TODO: throw assertion - cannot merge non-contiguous vectorization axes // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { @@ -3964,9 +3964,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( @@ -4006,10 +4006,10 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize); } - FusionExecutor fe; + KernelExecutor ke; // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - 
ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { @@ -4056,9 +4056,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { at::randn({bx, by}, options).index({"...", at::indexing::Slice(3)}); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4110,12 +4110,12 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { at::randn({bx, by}, options).index({"...", at::indexing::Slice(3)}); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); // Failure because the input + output tensors do not have the same stride // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); } TEST_F(NVFuserTest, FusionVectorization1_CUDA) { @@ -4157,9 +4157,9 @@ TEST_F(NVFuserTest, FusionVectorization1_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4197,10 +4197,10 @@ TEST_F(NVFuserTest, FusionVectorization2_CUDA) { tv->axis(-2)->parallelize(ParallelType::Vectorize); } - FusionExecutor fe; + KernelExecutor ke; // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } // TODO: Re-enable once vectorization validation is 
fixed @@ -4244,20 +4244,20 @@ TEST_F(NVFuserTest, FusionVectorization3_CUDA) { at::Tensor t1 = at::randn({bx, by}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); aten_inputs[0] = t0.index({"...", at::indexing::Slice(1)}); aten_inputs[1] = t1.index({"...", at::indexing::Slice(1)}); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); t0 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)}); t1 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)}); aten_inputs = {t0, t1}; - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4309,9 +4309,9 @@ TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( @@ -4372,9 +4372,9 @@ TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) { at::Tensor t2 = at::randn({z, x, y}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4406,9 +4406,9 @@ TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) { at::Tensor t0 = at::randn({x}, options); std::vector aten_inputs = {t0}; - 
FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4428,9 +4428,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) { tv2->axis(-1)->parallelize(ParallelType::TIDy); // Invalid as tv1 and tv2 do have the same ParallelType - FusionExecutor fe; + KernelExecutor ke; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { @@ -4450,8 +4450,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { // tv1 and tv2 do have the same ParallelType, but tv1 is on shared // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { @@ -4473,8 +4473,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { tv1->setMemoryType(MemoryType::Global); // tv1 and tv2 have the same shape and ParallelType - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { @@ -4496,8 +4496,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { tv1->setMemoryType(MemoryType::Global); // tv1 and tv2 do not have the same shape but global memory comm is supported. 
- FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { @@ -4520,8 +4520,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { // tv1 and tv2 do not have the same shape, but tv1 is on shared // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } // See issue #995 @@ -4648,9 +4648,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize8_CUDA) { at::Tensor input0 = at::arange(64, options).view({32, 2}); at::Tensor input1 = at::arange(32, options) * 0.01; - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}); - auto outputs = fe.runFusion({input0, input1}); + KernelExecutor ke; + ke.compile(&fusion, {input0, input1}); + auto outputs = ke.run({input0, input1}); testValidate(&fusion, outputs, {input0, input1}, __LINE__, __FILE__); } @@ -4737,9 +4737,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize10_CUDA) { at::Tensor t1 = at::randn({s0, s1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4783,9 +4783,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize11_CUDA) { at::Tensor t1 = at::randn({s0, s1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4897,9 +4897,9 @@ TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { at::Tensor t0 = at::randn({M, N, K}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - 
auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4926,9 +4926,9 @@ TEST_F(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { at::Tensor t0 = at::randn({M, N, K}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); at::Tensor aten_avg = t0.mean({1, 2}); at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K; testValidate( @@ -4965,9 +4965,9 @@ TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -5062,9 +5062,9 @@ TEST_F(NVFuserTest, FusionIssue757_CUDA) { at::Tensor t3 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -5100,9 +5100,9 @@ TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { at::Tensor t3 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -5364,10 +5364,10 @@ TEST_F(NVFuserTest, FusionBNBackwardRepro_CUDA) { at::Tensor input6 = 
at::randn_like(input0); at::Tensor input7 = at::randn_like(input0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = { input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); } // TODO: We only changed inputs, merge this with the test above. @@ -5432,10 +5432,10 @@ TEST_F(NVFuserTest, FusionBNBackwardRepro2_CUDA) { at::Tensor input6 = at::randn_like(input0); at::Tensor input7 = at::randn_like(input0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = { input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); } TEST_F(NVFuserTest, FusionBNRepro_CUDA) { @@ -5494,10 +5494,10 @@ TEST_F(NVFuserTest, FusionBNRepro_CUDA) { auto input4_ref = input4.clone(); auto input5_ref = input5.clone(); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { input1, input2, input3, input4, input5}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto at_results = at::native_batch_norm( input1_ref, @@ -5563,9 +5563,9 @@ TEST_F(NVFuserTest, FusionBNRepro2_CUDA) { at::Tensor weight; at::Tensor bias; - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = {input1}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5894,8 +5894,8 @@ TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) { at::Tensor t0 = 
at::randn({16, 16}, options); at::Tensor t1 = at::randn({16, 16}, options); - FusionExecutorCache fusion_executor_cache(std::move(fusion)); - fusion_executor_cache.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({t0, t1}); } TEST_F(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) { @@ -6004,9 +6004,9 @@ TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); @@ -6053,9 +6053,9 @@ TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6098,9 +6098,9 @@ TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6140,9 +6140,9 @@ TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6185,9 
+6185,9 @@ TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2, 3}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6240,9 +6240,9 @@ TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate( fusion.get(), outputs, @@ -6278,9 +6278,9 @@ TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({16, 31}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6313,9 +6313,9 @@ TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { auto at_output = (input1 + 1).sum({1}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6364,9 +6364,9 @@ TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = 
ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6410,9 +6410,9 @@ TEST_F(NVFuserTest, FusionWarpReducePredication_CUDA) { auto t0 = at::randn(shape1, options); auto t2 = at::randn(shape2, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t2}); - auto cg_outputs = fe.runFusion({t0, t2}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t2}); + auto cg_outputs = ke.run({t0, t2}); auto t1 = t0.sum({0}); auto t4 = (t2 + 1).sum({0}) + 1; @@ -6458,9 +6458,9 @@ TEST_F(NVFuserTest, FusionSegfaultReduction_CUDA) { at::Tensor input0 = at::randn({batch, c, h, w}, options); at::Tensor input1 = at::randn({batch, c, h, w}, options); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = {input0, input1}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -6491,9 +6491,9 @@ TEST_F(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6535,10 +6535,10 @@ TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); + KernelExecutor ke; + ke.compile(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6567,9 +6567,9 @@ TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({256, 512}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compile(fusion, {in0}); + auto outputs = ke.run({in0}); testValidate(fusion, outputs, {in0}, __LINE__, __FILE__); } @@ -6599,9 +6599,9 @@ TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6633,9 +6633,9 @@ TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 3, 3}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compile(fusion, {in0}); + auto outputs = ke.run({in0}); testValidate(fusion, outputs, {in0}, __LINE__, __FILE__); } @@ -6662,9 +6662,9 @@ TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({16, 16}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto cg_outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compile(fusion, {in0}); + auto cg_outputs = ke.run({in0}); testValidate(fusion, cg_outputs, {in0}, __LINE__, __FILE__); } @@ -6696,9 +6696,9 @@ TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, 
{in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6722,9 +6722,9 @@ TEST_F(NVFuserTest, FusionIssue970_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({nelm, nelm}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -6753,9 +6753,9 @@ TEST_F(NVFuserTest, FusionIssue1016_CUDA) { at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -6784,9 +6784,9 @@ TEST_F(NVFuserTest, FusionIssue1021_CUDA) { at::Tensor t0 = at::randn({10}, options); std::vector inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -6819,9 +6819,9 @@ TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { auto at_tv1 = (input1).sum({0}); auto at_tv2 = input1 + 1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__); } @@ -6856,9 +6856,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 
= at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6893,9 +6893,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) { at::Tensor input1 = at::randn({11}, options); at::Tensor input2 = at::randn({11, 13}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(fusion.get(), outputs, {input1, input2}, __LINE__, __FILE__); } @@ -6941,9 +6941,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({13}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6987,9 +6987,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) { at::Tensor input1 = at::randn({13}, options); at::Tensor input2 = at::randn({15, 13}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(&fusion, outputs, {input1, input2}, __LINE__, __FILE__); } @@ -7031,9 +7031,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) { at::Tensor input1 = at::randn({13}, options); at::Tensor input2 = at::randn({13, 15}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, 
input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(&fusion, outputs, {input1, input2}, __LINE__, __FILE__); } @@ -7185,9 +7185,9 @@ TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { at::Tensor t0 = at::randn({nx}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7238,9 +7238,9 @@ TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { at::Tensor t0 = at::randn({17}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7289,9 +7289,9 @@ TEST_F(NVFuserTest, FusionIssue1099_CUDA) { at::Tensor t3 = at::randn({19}, options); std::vector aten_inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7331,9 +7331,9 @@ TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) { at::Tensor t0 = at::randn({nx, ny}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7370,9 +7370,9 @@ TEST_F(NVFuserTest, FusionIssue1189_CUDA) { at::Tensor t0 = at::randn({16, 16, 1}, options); at::Tensor t1 = 
at::randn({16, 16, 1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto outputs = ke.run({t0, t1}); testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__); } @@ -7403,9 +7403,9 @@ TEST_F(NVFuserTest, FusionIssue1052_CUDA) { at::Tensor t1 = at::randn({100}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7509,9 +7509,9 @@ TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) { at::Tensor t4 = at::randn({1024}, options); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7539,9 +7539,9 @@ TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { at::Tensor t2 = at::randn({19}, options); std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7569,9 +7569,9 @@ TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { at::Tensor t2 = at::randn({19}, options); std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0 + 1; auto ref2 = mean(t2, {0}); @@ 
-7610,15 +7610,15 @@ TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { tv5->axis(1)->parallelize(ParallelType::BIDy); tv5->axis(2)->parallelize(ParallelType::BIDz); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 3}, options); at::Tensor t2 = at::randn({5, 6, 7}, options); at::Tensor t4 = at::randn({8, 9, 10}, options); std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7654,15 +7654,15 @@ TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { tv5->axis(1)->parallelize(ParallelType::BIDy); tv5->axis(2)->parallelize(ParallelType::BIDz); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 3}, options); at::Tensor t2 = at::randn({5, 6, 7}, options); at::Tensor t4 = at::randn({8, 9, 10}, options); std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0.mean(at::IntArrayRef{0, 1}); auto ref2 = t2 + 1; @@ -7723,9 +7723,9 @@ TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { at::Tensor t4 = at::randn({19}, options); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0 + 3; auto ref2 = sum(t4 + 4); @@ -7785,9 +7785,9 @@ TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { at::Tensor t1 = at::randn({19}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, 
aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7834,9 +7834,9 @@ TEST_F(NVFuserTest, FusionFloatPow_CUDA) { t0 = abs(t0); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto p4 = at::pow(t0, 4); auto p2 = at::pow(t0, 2); @@ -7903,9 +7903,9 @@ TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { at::Tensor t0 = at::randn({10, 1024}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7926,9 +7926,9 @@ TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) { at::Tensor at_input = at::randn({10}, options); at::Tensor at_output = at::empty_strided({10}, {2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_input}); - auto returned_outputs = fe.runFusion({at_input}, {at_output}); + KernelExecutor ke; + ke.compile(&fusion, {at_input}); + auto returned_outputs = ke.run({at_input}, {at_output}); // Returned outputs should only contain one tensor that is the same // as the output tensor given to runFusion @@ -7974,9 +7974,9 @@ TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) { } // Test result - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref_output = at::_softmax(aten_input, 1, false); testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__); } @@ -8048,9 +8048,9 @@ 
TEST_F(NVFuserTest, FusionIssue1133_CUDA) { at::Tensor t0 = at::randn({99, 101}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref = (t0 + 1).sum({1}) + 1; @@ -8082,9 +8082,9 @@ TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) { at::Tensor t0 = at::randn({99, 101}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref = t0.sum({1}); @@ -8137,9 +8137,9 @@ TEST_F(NVFuserTest, FusionIssue1223_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_t0 = at::ones({11, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0}); - auto cg_outputs = fe.runFusion({at_t0}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0}); + auto cg_outputs = ke.run({at_t0}); auto at_t1 = (at_t0 + 1).sum(); @@ -8181,9 +8181,9 @@ TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) { at_t0 = at::abs(at_t0); at::Tensor at_t3 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t3}); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0, at_t3}); + auto cg_outputs = ke.run({at_t0, at_t3}); auto at_t2 = (at_t0 + 1).min(); auto at_t4 = at_t3 + 1; @@ -8233,9 +8233,9 @@ TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) { at_t0 = at::abs(at_t0); at::Tensor at_t3 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t3}); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0, at_t3}); + auto cg_outputs = ke.run({at_t0, at_t3}); auto at_t2 = std::get<0>(at_t0.min(0)); auto at_t4 = at_t3 + 
1; @@ -8270,9 +8270,9 @@ TEST_F(NVFuserTest, FusionRfactorIndirectRoot_CUDA) { auto at_in = at::randn({6, 6, 6}, options); auto at_out = at_in.sum({1, 2}); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_in}); - auto cg_outputs = fe.runFusion({at_in}); + KernelExecutor ke; + ke.compile(&fusion, {at_in}); + auto cg_outputs = ke.run({at_in}); testValidate(&fusion, cg_outputs, {at_in}, {at_out}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 9862dcb8b07..f7deca99425 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -107,9 +107,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum(); @@ -161,9 +161,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({13, 17}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -210,9 +210,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -260,9 +260,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24, 2}, 
options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -314,9 +314,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -357,9 +357,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); @@ -367,7 +367,7 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is // illegal. The run-time validation of vectorization should throw an error. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible})); + ASSERT_ANY_THROW(ke.run({t0_non_divisible})); } // If a split is validated at run time, it's not necessary to predicate. 
@@ -412,9 +412,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { auto t0 = at::randn({1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -474,16 +474,16 @@ TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({15}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); // This should throw an exception as the extent of t0 is not // divisible by the vector width // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); auto t1 = at::randn({16}, options); - auto cg_outputs = fe.runFusion({t1}); + auto cg_outputs = ke.run({t1}); testValidate(&fusion, cg_outputs, {t1}, __LINE__, __FILE__); } @@ -529,9 +529,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) { auto t2 = at::randn({10, 10}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -572,9 +572,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) { auto t0 = at::randn({10, 11}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1); @@ -617,9 +617,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) { auto t0 = at::randn(input_shape, options); std::vector aten_inputs = 
{t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -790,9 +790,9 @@ TEST_F(NVFuserTest, FusionIssue1430_CUDA) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::randn({V, W, X, Y, Z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1)); + KernelExecutor ke; + ke.compile(&fusion); + auto cg_outputs = ke.run({t0}, LaunchParams(X, V, -1, Y, -1, -1)); auto t0_double = t0.to(at::kDouble); @@ -944,9 +944,9 @@ TEST_F(NVFuserTest, FusionTestGridComm_CUDA) { auto t0 = at::randn({X, Y, Z}, options); auto t1 = at::randn({X, Y, Z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -988,9 +988,9 @@ TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) { auto t0 = at::randn({X}, options); auto t1 = at::randn({W, X}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1021,9 +1021,9 @@ TEST_F(NVFuserTest, FusionLargeSmem_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({(int)(12288 * 4)}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1 + 2; testValidate(&fusion, cg_outputs, {t0}, 
__LINE__, __FILE__); @@ -1057,10 +1057,10 @@ TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({(int)(12288 * 4)}, options); - FusionExecutor fe; + KernelExecutor ke; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0})); } // Try to test alignment when multiple tensors are @@ -1097,10 +1097,10 @@ TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({3, 4, 7, 2, 5}, options); - FusionExecutor fe; + KernelExecutor ke; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -1126,8 +1126,8 @@ TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) { fusion.addOutput(tv1); // Make sure the kernel is compiled. - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } // Repro of #1506 @@ -1157,9 +1157,9 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); } @@ -1192,10 +1192,10 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; + KernelExecutor ke; // This should fail at compile time as we're trying to merge in a // non-contiguous dimension, then split and vectorize it. 
- ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0})); } // Make sure the same fusion as FusionVectorizeContigIndex fails if @@ -1227,14 +1227,14 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // This should fail at the launch time as 14 is not divisible by the // vector word size. The two domains are merged, but they are not // contiguous, so contig indexing is not involved in this case. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); } TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) { @@ -1260,18 +1260,18 @@ TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) { auto t1_misaligned = at::empty({n + 1}, options).index({at::indexing::Slice(1)}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); // Pass misaligned input. This must fail. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0_misaligned})); + ASSERT_ANY_THROW(ke.run({t0_misaligned})); // Pass misaligned output. This must fail too. 
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0}, {t1_misaligned})); + ASSERT_ANY_THROW(ke.run({t0}, {t1_misaligned})); } // Repro of issue #1530 @@ -1300,11 +1300,11 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); } TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { @@ -1331,9 +1331,9 @@ TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { auto t1 = at::randn({3, 4}, options); { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1341,9 +1341,9 @@ TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { // Make sure tv2 indexing also works when it's stored in global memory tv2->setMemoryType(MemoryType::Global); { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1384,12 +1384,12 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) { auto t0 = at::randn(shape1, options); auto t1 = at::randn(shape2, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); // Vectorization of tv2 should be detected as invalid. 
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0, t1})); + ASSERT_ANY_THROW(ke.run({t0, t1})); } TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) { @@ -1433,9 +1433,9 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) { auto t0 = at::randn(shape1, options); auto t1 = at::randn(shape2, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1467,7 +1467,7 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) { // vector word size should be 4. Broadcasting of tv1 should not // matter. for (const auto& vec_info : - cg_results.fusion_executor->kernel()->summary().vectorized_set_info) { + cg_results.kernel_executor->kernel()->summary().vectorized_set_info) { NVF_CHECK( vec_info.word_size == 4, "Invalid vector word size: ", @@ -1512,9 +1512,9 @@ TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) { auto t0 = at::randn({111}, options); auto t1 = at::randn({123, 111}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto t2 = t0.unsqueeze(0); auto t3 = t1 + t2; @@ -1563,9 +1563,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) { auto t0 = at::randn({10, 64}, options); auto t1 = at::randn({10, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1608,9 +1608,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) { auto t0 = at::randn({10, 64}, 
options); auto t1 = at::randn({10, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1651,9 +1651,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) { auto t0 = at::randn({50, 64}, options); auto t1 = at::randn({50, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1756,9 +1756,9 @@ TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) { at::Tensor t1 = at::randn({128, 6}, options); at::Tensor t2 = at::randn({128, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t2}); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__); } @@ -1796,9 +1796,9 @@ TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) { at::Tensor t1 = at::randn({128, 6}, options); at::Tensor t2 = at::randn({128, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t2}); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__); } @@ -1831,20 +1831,20 @@ TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) { at::Tensor t0 = at::randn({m, n}, options); at::Tensor t1 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, {t0, t1}); }, + [&]() { 
ke.compile(&fusion, {t0, t1}); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - fe.compileFusion(&fusion, {t0, t1}); + ke.compile(&fusion, {t0, t1}); } - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1877,19 +1877,19 @@ TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, {t0}); }, + [&]() { ke.compile(&fusion, {t0}); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - fe.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); } - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum({1}); @@ -2006,11 +2006,11 @@ TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({9999}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); - testValidate(fe.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__); } // Test ExactLogicalDomainMap @@ -2211,13 +2211,13 @@ TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) { GpuLower gpulw(&fusion); checker.handle(gpulw.run()->topLevelExprs()); - FusionExecutor fe; - 
fe.compileFusion(&fusion, {}, LaunchParams()); + KernelExecutor ke; + ke.compile(&fusion, {}, LaunchParams()); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::randn({X, Y, Y, Z}, options); - auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1)); + auto cg_outputs = ke.run({t0}, LaunchParams(-1, -1, -1, -1, -1, -1)); // by default Welford outputs sum of square diff so need to divide to get var cg_outputs[1] = cg_outputs[1].div((float)(X * Y * Y)); @@ -2280,9 +2280,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) { at::Tensor t0 = at::randn({32}, options); at::Tensor t1 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2345,9 +2345,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync2_CUDA) { at::Tensor t0 = at::randn({32}, options); at::Tensor t1 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2427,9 +2427,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) { at::Tensor t0 = at::randn({32}, options); at::Tensor t1 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2532,9 +2532,9 @@ TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) { at::Tensor t0 = at::randn({10, 11}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, 
aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -2567,9 +2567,9 @@ TEST_F(NVFuserTest, FusionSqueeze1_CUDA) { at::Tensor t0 = at::randn({10, 11}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -2596,11 +2596,11 @@ TEST_F(NVFuserTest, FusionContigPredicate_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); - testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); + testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); } // Repro of https://github.com/csarofeen/pytorch/issues/1777 @@ -2620,9 +2620,9 @@ TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) { auto aten_output = at::div( at::native::wrapped_scalar_tensor(at::Scalar(2.0), options.device()), t0); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__); } @@ -3242,9 +3242,9 @@ TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) { at::Tensor in1 = at::randn({16}, options); at::Tensor in2 = at::randn({12, 16}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {in1, in2}); - auto cg_outputs = fe.runFusion({in1, in2}); + KernelExecutor ke; + ke.compile(&fusion, {in1, in2}); + auto cg_outputs = 
ke.run({in1, in2}); testValidate(&fusion, cg_outputs, {in1, in2}, __LINE__, __FILE__); } @@ -3516,9 +3516,9 @@ TEST_F(NVFuserTest, FusionVectorComponentReduce_CUDA) { at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__, ""); } @@ -3799,9 +3799,9 @@ TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({5, 5}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0}, __LINE__, __FILE__); @@ -3879,9 +3879,9 @@ TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) { at::Tensor t0 = at::randn({1, 1}, options); at::Tensor t1 = at::randn({10}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; testValidate( @@ -3923,9 +3923,9 @@ TEST_F(NVFuserTest, FusionMappingRelation_CUDA) { at::Tensor t0 = at::randn({1, 1}, options); at::Tensor t1 = at::randn({2, 1, 1}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0, t1}, __LINE__, __FILE__); @@ -3947,9 +3947,9 @@ TEST_F(NVFuserTest, FusionInlineAt_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = 
at::randn({100, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0}, __LINE__, __FILE__); @@ -3981,9 +3981,9 @@ TEST_F(NVFuserTest, FusionReplayTrivialReductionAndBroadcast2_CUDA) { at::Tensor t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4043,19 +4043,19 @@ TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) { GpuLower gpulw(&fusion); pred_checker.handle(gpulw.run()->topLevelExprs()); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, {input1}); }, + [&]() { ke.compile(&fusion, {input1}); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - fe.compileFusion(&fusion, {input1}); + ke.compile(&fusion, {input1}); } - auto cg_outputs = fe.runFusion({input1}); + auto cg_outputs = ke.run({input1}); testValidate(&fusion, cg_outputs, {input1}, __LINE__, __FILE__); } @@ -4078,8 +4078,8 @@ TEST_F(NVFuserTest, FusionExpandedInput_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4096, 1, 4}, options).expand({-1, 7, -1}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); testValidate(fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4112,8 
+4112,8 @@ TEST_F(NVFuserTest, FusionVectorizeRepro1843_CUDA) { at::Tensor t0 = at::empty_strided({4096, 32128}, {32128, 1}, options).random_(); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t1, t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t1, t0}); testValidate(fusion, cg_outputs, {t1, t0}, __LINE__, __FILE__); } @@ -4137,8 +4137,8 @@ TEST_F(NVFuserTest, FusionBroadcastPersistentReduction_CUDA) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({1024, 768}, options); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); testValidate(fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4284,8 +4284,8 @@ TEST_F(NVFuserTest, FusionRepro2094_CUDA) { outputs.push_back(t32); } - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -4428,9 +4428,9 @@ TEST_F(NVFuserTest, FusionSqueezeTransformPropagation_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({5, 1, 1, 1, 1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4482,9 +4482,9 @@ TEST_F(NVFuserTest, FusionSqueezeInlining_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({1, 1024}, options); - FusionExecutor fe; - 
fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4885,9 +4885,9 @@ TEST_F(NVFuserTest, FusionPropagateVectorizePredicate_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); } @@ -4992,16 +4992,16 @@ TEST_F(NVFuserTest, FusionIssue2163ReproInvalidAlias_CUDA) { std::vector aten_inputs({at_input, at_weight}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto cg_output = cg_outputs.at(0); auto ref_x_sub_mean = at_input - at_input.sum({0}).unsqueeze(0); auto ref_y = ref_x_sub_mean * at_weight.unsqueeze(0); testValidate( - fe.kernel(), {cg_output}, aten_inputs, {ref_y}, __LINE__, __FILE__, ""); + ke.kernel(), {cg_output}, aten_inputs, {ref_y}, __LINE__, __FILE__, ""); } // Testing scalar FP types @@ -5080,9 +5080,9 @@ TEST_F(NVFuserTest, FusionFloatingPointType_CUDA) { std::vector inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); testValidate(&fusion, cg_outputs, inputs, __LINE__, __FILE__); } @@ -5146,9 +5146,9 @@ TEST_F(NVFuserTest, FusionIntegerType_CUDA) { std::vector inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto i2 = 
int64_val; auto i3 = int_val; @@ -5209,16 +5209,16 @@ TEST_F(NVFuserTest, FusionVectorizeWelford1_CUDA) { at::Tensor t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref_avg = t0.mean({0}); auto ref_var = t0.var({0}, false) * shape[0]; auto ref_N = at::ones({shape[1]}, options_int) * shape[0]; testValidate( - fe.kernel(), + ke.kernel(), cg_outputs, {t0}, {ref_avg, ref_var, ref_N}, @@ -5282,16 +5282,16 @@ TEST_F(NVFuserTest, FusionVectorizeWelford2_CUDA) { at::Tensor t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref_avg = t0.to(at::kDouble).mean({0}); auto ref_var = t0.to(at::kDouble).var({0}, false) * shape[0]; auto ref_N = at::ones({shape[1]}, options_int) * shape[0]; testValidate( - fe.kernel(), + ke.kernel(), cg_outputs, {t0}, {ref_avg, ref_var, ref_N}, @@ -5320,7 +5320,7 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) { fusion->addOutput(t7); } - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().device(at::kCUDA, 0); at::Tensor t6 = at::tensor({15}, options.dtype(at::kLong)); @@ -5328,7 +5328,7 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) { at::Tensor t20 = at::tensor({12}, options.dtype(at::kLong)).expand({1, 1, 1, 1}); - auto cg_outputs = fec.runFusionWithInputs({t6, t15, t20}); + auto cg_outputs = executor_cache.runFusionWithInputs({t6, t15, t20}); auto sample_total = at::sum(t15, {0, 1, 2, 3}, true); auto sample_mean = at::div(sample_total, t20); @@ -5338,7 +5338,12 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) { auto t7 = at::div(total, t6); testValidate( - fec.fusion(), cg_outputs, {t6, t15, t20}, {t7}, __LINE__, __FILE__); + 
executor_cache.fusion(), + cg_outputs, + {t6, t15, t20}, + {t7}, + __LINE__, + __FILE__); } TEST_F(NVFuserTest, FusionExprSortMatmulLikeSchedule_CUDA) { @@ -5379,11 +5384,11 @@ TEST_F(NVFuserTest, FusionExprSortMatmulLikeSchedule_CUDA) { at::Tensor t0 = at::randn({M1, M2, K1, K2}, options); at::Tensor t1 = at::randn({N1, N2, K1, K2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); - testValidate(fe.kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__); + testValidate(ke.kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionFloatConstantWhere_CUDA) { @@ -5439,19 +5444,19 @@ TEST_F(NVFuserTest, FusionCpAsyncCommitWait_CUDA) { at::Tensor t0 = at::randn({12800, 8, 8, 8}, options); - FusionExecutor fe; + KernelExecutor ke; if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, {t0}); }, + [&]() { ke.compile(&fusion, {t0}); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - fe.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); } - auto cg_outputs = fe.runFusion({t0}); - testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); + auto cg_outputs = ke.run({t0}); + testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); } // Repro of issue #2459 @@ -5514,14 +5519,14 @@ TEST_F(NVFuserTest, FusionClearThreadPredicateByRAWSync_CUDA) { std::vector inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto t3 = t0.sum({1}).sum({0}); auto t6 = t0.sum({1}); - testValidate(fe.kernel(), cg_outputs, inputs, {t3, t6}, __LINE__, __FILE__); + testValidate(ke.kernel(), cg_outputs, inputs, {t3, t6}, 
__LINE__, __FILE__); } namespace { @@ -5636,15 +5641,15 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitShared_CUDA) { std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref_t1 = t0.sum({0}); auto ref_t4 = t1.exp(); testValidate( - fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t4}, __LINE__, __FILE__); + ke.kernel(), cg_outputs, inputs, {ref_t1, ref_t4}, __LINE__, __FILE__); } // Repro of issue #2487 @@ -5690,15 +5695,15 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) { std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref_t1 = t0.sum({0}); auto ref_t3 = t1.exp(); testValidate( - fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__); + ke.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionTypePromotionATenConsistency_CUDA) { @@ -5763,74 +5768,71 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) { .getSmallestIndexTypeOfArguments() == PrimDataType::Int32); { - FusionExecutor fe; + KernelExecutor ke; // Lower the kernel with large inputs and int64 index type. 
CompileParams compile_opts = {.index_type = PrimDataType::Int}; - fe.compileFusion(&fusion, large_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, large_inputs, LaunchParams(), compile_opts); NVF_CHECK( - fe.kernel()->indexType() == PrimDataType::Int, + ke.kernel()->indexType() == PrimDataType::Int, "Unexpected kernel index type: ", - fe.kernel()->indexType()); + ke.kernel()->indexType()); // Since the index type is int64, both small and large inputs // should work fine - fe.runFusion(small_inputs); - fe.runFusion(large_inputs); + ke.run(small_inputs); + ke.run(large_inputs); } { - FusionExecutor fe; + KernelExecutor ke; // Lower the kernel with small inputs and int64 index type. CompileParams compile_opts = {.index_type = PrimDataType::Int}; - fe.compileFusion(&fusion, small_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, small_inputs, LaunchParams(), compile_opts); NVF_CHECK( - fe.kernel()->indexType() == PrimDataType::Int, + ke.kernel()->indexType() == PrimDataType::Int, "Unexpected kernel index type: ", - fe.kernel()->indexType()); + ke.kernel()->indexType()); // Since the index type is int64, both small and large inputs // should work fine - fe.runFusion(small_inputs); - fe.runFusion(large_inputs); + ke.run(small_inputs); + ke.run(large_inputs); } { - FusionExecutor fe; + KernelExecutor ke; LaunchParams launch_params; CompileParams compile_opts = {.index_type = PrimDataType::Int32}; - fe.compileFusion(&fusion, small_inputs, launch_params, compile_opts); + ke.compile(&fusion, small_inputs, launch_params, compile_opts); NVF_CHECK( - fe.kernel()->indexType() == PrimDataType::Int32, + ke.kernel()->indexType() == PrimDataType::Int32, "Unexpected kernel index type: ", - fe.kernel()->indexType()); + ke.kernel()->indexType()); // This should complete successfully as the arguments are small // enough to use the int32 index type - fe.runFusion(small_inputs); + ke.run(small_inputs); // This should fail as the Kernel is already compiled for Int32, 
but // the arguments are too large CompileParams compile_opts_large = {.index_type = PrimDataType::Int}; EXPECT_THAT( - [&]() { - fe.runFusion(large_inputs, launch_params, compile_opts_large); - }, + [&]() { ke.run(large_inputs, launch_params, compile_opts_large); }, testing::ThrowsMessage(testing::HasSubstr( "Kernel index type and compilation index type don't match"))); } { - FusionExecutor fe; + KernelExecutor ke; // Lower the kernel with large inputs and int32 index type. CompileParams compile_opts = {.index_type = PrimDataType::Int32}; // This should fail due to the conflict EXPECT_THAT( [&]() { - fe.compileFusion( - &fusion, large_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, large_inputs, LaunchParams(), compile_opts); }, testing::ThrowsMessage(testing::HasSubstr( "Compilation with int32 is requested but int64 is required for the arguments"))); @@ -6034,13 +6036,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteBroadcastedSoftmaxInput_CUDA) { at::Tensor t1 = at::ones(shape1, options); std::vector inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); // check thread_pred and write_stride - const auto& fe = fec.getMostRecentKernelRuntime()->executors().at(0); - auto kernel = fe.kernel(); - const auto& thread_pred_map = fe.threadPredMap(); + const auto& ke = + executor_cache.getMostRecentKernelRuntime()->executors().at(0); + auto kernel = ke.kernel(); + const auto& thread_pred_map = ke.threadPredMap(); for (const auto expr : kernel->exprs()) { auto tv = ir_utils::getTvOutput(expr); if (tv && tv->name() == 15 && tv->getMemoryType() == MemoryType::Global) { @@ -6054,7 +6057,7 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteBroadcastedSoftmaxInput_CUDA) { } } - testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__); + 
testValidate(executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) { @@ -6089,13 +6092,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) { at::Tensor t1 = at::randn(shape1, options); std::vector inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); // check thread_pred and write_stride - const auto& fe = fec.getMostRecentKernelRuntime()->executors().at(0); - auto kernel = fe.kernel(); - const auto& thread_pred_map = fe.threadPredMap(); + const auto& ke = + executor_cache.getMostRecentKernelRuntime()->executors().at(0); + auto kernel = ke.kernel(); + const auto& thread_pred_map = ke.threadPredMap(); for (const auto expr : kernel->exprs()) { auto tv = ir_utils::getTvOutput(expr); @@ -6110,7 +6114,8 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) { } } - testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__); }; // Test case where [B1,I2,I3] is merged to [B1I2I3] @@ -6189,13 +6194,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteDifferentConcretizedDomains_CUDA) { testing::ThrowsMessage(testing::HasSubstr( "Producer is required to be in Global Memory based on parallelization strategy. 
RAW flags: (blockIdx.x)"))); } else { - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - auto optimized_fusion = fec.getMostRecentKernelRuntime(); + auto optimized_fusion = executor_cache.getMostRecentKernelRuntime(); NVF_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen!"); - testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); } }; runTest(true); @@ -6239,13 +6245,13 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonOutput_CUDA) { at::Tensor t1 = at::randn({32, 64}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), inputs); + auto cg_outputs = ke.run(inputs); // check thread_pred - auto kernel = fe.kernel(); - const auto& thread_pred_map = fe.threadPredMap(); + auto kernel = ke.kernel(); + const auto& thread_pred_map = ke.threadPredMap(); for (const auto expr : kernel->exprs()) { auto tv = ir_utils::getTvOutput(expr); @@ -6303,13 +6309,13 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonNeighbor_CUDA) { at::Tensor t1 = at::randn({8, 7, 10, 12, 9}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), inputs); + auto cg_outputs = ke.run(inputs); // check thread_pred - auto kernel = fe.kernel(); - const auto& thread_pred_map = fe.threadPredMap(); + auto kernel = ke.kernel(); + const auto& thread_pred_map = ke.threadPredMap(); for (const auto expr : kernel->exprs()) { auto tv = ir_utils::getTvOutput(expr); @@ -6759,9 +6765,9 @@ 
TEST_F(ExpandedBroadcastGlobalIntermediateTest, TheTest_CUDA) { at::Tensor t0 = at::randn({2, 1, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); - auto cg_output = fe.runFusion({t0}).at(0); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), {t0}); + auto cg_output = ke.run({t0}).at(0); ASSERT_EQ(cg_output.size(0), 2); ASSERT_EQ(cg_output.size(1), (1L << 60L)); @@ -6808,10 +6814,9 @@ TEST_F(NVFuserTest, FusionTestWarnRegisterSpill_CUDA) { auto compile_opts = heuristic_params->cparams; compile_opts.maxrregcount = 32; compile_opts.enable_ptxas_verbose = true; - FusionExecutor fe; - fe.compileFusion( - &fusion, {aten_input}, heuristic_params->lparams, compile_opts); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}, heuristic_params->lparams, compile_opts); + auto cg_outputs = ke.run({aten_input}); // validate results testValidate( @@ -6926,9 +6931,9 @@ TEST_F(NVFuserTest, IsFinite_CUDA) { std::array data{1.0, INFINITY, NAN}; const auto input = at::from_blob(data.data(), {3}, {1}).to(options); - FusionExecutor fe; - fe.compileFusion(fusion, {input}); - const auto output = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(fusion, {input}); + const auto output = ke.run({input}); testValidate(fusion, output, {input}, __LINE__, __FILE__); } @@ -7026,8 +7031,8 @@ TEST_F(NVFuserTest, FusionOptionsGuard_CUDA) { // capture stdout and check stdout contains register spill warning captureStdout(); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {aten_input}, heuristic_params->lparams, @@ -7070,18 +7075,18 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto a5 = at::zeros({5}, options); auto a6 = at::zeros({6}, 
options); auto a7 = at::zeros({7}, options); - fec.runFusionWithInputs({a5}); + executor_cache.runFusionWithInputs({a5}); - auto numRuntimes = [&fec]() -> size_t { + auto numRuntimes = [&executor_cache]() -> size_t { // this is map, vector> - const auto& runtime_map = fec.getKernelRuntimes(); + const auto& runtime_map = executor_cache.getKernelRuntimes(); return runtime_map .begin() // There should be only one device/concretization pair ->second.size(); @@ -7091,7 +7096,7 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) { DisableOptionsGuard og; DisableOptionsGuard::getCurOptions().unset(DisableOption::KernelReuse); - fec.runFusionWithInputs({a6}); + executor_cache.runFusionWithInputs({a6}); // Since kernel reuse is enabled, we should not generate a new runtime EXPECT_EQ(numRuntimes(), 1); @@ -7101,7 +7106,7 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) { DisableOptionsGuard og; DisableOptionsGuard::getCurOptions().set(DisableOption::KernelReuse); - fec.runFusionWithInputs({a7}); + executor_cache.runFusionWithInputs({a7}); // Disabling reuse means we should get a new runtime EXPECT_EQ(numRuntimes(), 2); @@ -7186,9 +7191,9 @@ TEST_F(NVFuserTest, FusionLayerNormSharedMemoryBuffer_CUDA) { "Shouldn't use shared memory buffer!"); } - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = - fec.runFusionWithInputs({aten_input, aten_weight, aten_bias}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs( + {aten_input, aten_weight, aten_bias}); testValidate( &fusion_copy, @@ -7260,8 +7265,8 @@ TEST_F(NVFuserTest, FusionInstanceNormNHWC_CUDA) { outputs.push_back(t4); } - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -7401,9 +7406,9 
@@ TEST_F(NVFuserTest, AllInputDtypes) { CompileParams opt{.index_type = index_type}; - FusionExecutor fe; - fe.compileFusion(fusion.get(), args, LaunchParams{}, opt); - auto outputs = fe.runFusion(args, LaunchParams{}, opt); + KernelExecutor ke; + ke.compile(fusion.get(), args, LaunchParams{}, opt); + auto outputs = ke.run(args, LaunchParams{}, opt); auto kernel_result = outputs.at(0).item(); auto expect = ee.evaluate(output).as().item(); @@ -7521,9 +7526,9 @@ TEST_F(NVFuserTest, OpaqueTupleAsComplex) { KernelArgumentHolder args; args.push(Opaque(std::array{1.2, 3.4})); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(args); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run(args); EXPECT_EQ( outputs.at(0).item>(), c10::complex(1.2, 3.4)); @@ -7548,9 +7553,9 @@ TEST_F(NVFuserTest, StructConstruct) { fusion.addOutput(tv); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({1.2, 3.4}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({1.2, 3.4}); EXPECT_EQ( outputs.at(0).item>(), c10::complex(1.2, 3.4)); @@ -7586,12 +7591,12 @@ TEST_F(NVFuserTest, VectorizationStrideValidation) { auto t0 = at::randn(shape, options).expand({-1, 5, -1}); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); // This previously triggered a false positive error with the stride // validation - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); } @@ -7615,10 +7620,10 @@ TEST_F(NVFuserTest, ConstLongExpressions) { auto tv0 = full({}, s1, DataType::Int); fusion->addOutput(tv0); - FusionExecutor fe; - fe.compileFusion(fusion); + KernelExecutor ke; + ke.compile(fusion); - auto outputs = fe.runFusion({}); + auto outputs = ke.run({}); testValidate(fusion, outputs, {}, __LINE__, __FILE__); } @@ -7687,10 +7692,10 @@ TEST_F(NVFuserTest, 
PredicateRNGOps) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({2048, size}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); + KernelExecutor ke; + ke.compile(fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); } TEST_F(NVFuserTest, LoweringHook) { @@ -7849,8 +7854,8 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) { NVF_CHECK(kernel_runtime->isSegmented(), "segmentation didn't happen"); const auto num_segments = kernel_runtime->fusionSegments()->groups().size(); NVF_CHECK(num_segments == 3, "Expect 3 segments, got: ", num_segments); - for (const auto& fe : kernel_runtime->executors()) { - for (auto expr : fe.fusion()->exprs()) { + for (const auto& ke : kernel_runtime->executors()) { + for (auto expr : ke.fusion()->exprs()) { if (expr->isA()) { auto slice = expr->as(); NVF_CHECK( @@ -7877,9 +7882,9 @@ TEST_F(NVFuserTest, UnsupportedBFloat) { fusion.addInput(tv0); fusion.addOutput(tv1); - FusionExecutor fe; + KernelExecutor ke; EXPECT_THAT( - [&]() { fe.compileFusion(&fusion); }, + [&]() { ke.compile(&fusion); }, testing::ThrowsMessage( testing::HasSubstr("Reason: Fusion contains BFloat16"))); } @@ -7943,9 +7948,9 @@ TEST_F(NVFuserTest, BlockReduction3D) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum(0).sum(-1); testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); }; @@ -7986,9 +7991,9 @@ TEST_F(NVFuserTest, ReverseMerge) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({11, 12}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, 
{t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(t0.equal(cg_outputs.at(0))); } @@ -8016,9 +8021,9 @@ TEST_F(NVFuserTest, FusionCpAsyncPredicateAvoidIllegalMemoryAccess) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({m, n}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(t0.equal(cg_outputs.at(0))); } @@ -8350,9 +8355,9 @@ TEST_F(NVFuserTest, BroadcastFromNowhereFusion) { // TODO: use larger tensor size at::Tensor t0 = at::randn({4}, options); at::Tensor t1 = at::randn({2, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -8399,8 +8404,8 @@ TEST_F(NVFuserTest, ReplayRFactorMergeBcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::ones(input_shape, options); std::vector aten_inputs = {at_x}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -8434,9 +8439,9 @@ TEST_F(NVFuserTest, MultipleDifferentSizeGridReduction) { const at::Tensor t1 = at::randn({192}, options); const std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); testValidate(&fusion, cg_outputs, inputs, __LINE__, __FILE__); } @@ -8869,11 +8874,123 @@ TEST_F(NVFuserTest, 
CpAsyncDataTypeBool) { // "r"((uint32_t)((!b3))) // ); // If not correctly lowered, would trigger error in compile - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } + +// Intermediate IDs generated by rFactor should also remain +// reductions. See #3327 for more info. +TEST_F(NVFuserTest, RfactorIntermediateIDs) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(3); + fusion.addInput(tv0); + + auto tv1 = sum(tv0, {1, 2}); + fusion.addOutput(tv1); + + tv1->merge(1, 2); + tv1->split(1, 4); + + auto tv2 = tv1->rFactor({-1}); + + EXPECT_TRUE(tv2->axis(-1)->isReduction()); + EXPECT_FALSE(tv2->axis(-2)->isReduction()); + + auto split = dynamic_cast(tv2->axis(-1)->definition()); + ASSERT_NE(split, nullptr); + + auto merge_out = split->in(); + EXPECT_TRUE(merge_out->isReduction()); +} + +// Simple test to make sure replacement with a dependent val is +// detected as an error +TEST_F(NVFuserTest, AvoidReplacingWithDependentVal) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto i0 = IrBuilder::create(DataType::Int); + fusion.addInput(i0); + + auto i1 = mul(i0, IrBuilder::create(1, DataType::Int)); + + auto tv0 = TensorViewBuilder().shape({i1}).build(); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + std::unordered_map replacement_map; + replacement_map.emplace(i0, i1); + + EXPECT_THAT( + [&]() { ir_utils::replaceValue(&fusion, replacement_map); }, + testing::ThrowsMessage(testing::HasSubstr( + "not allowed as it would result in a recursive definition"))); +} + +// Was also a repro of issue #3347 +TEST_F(NVFuserTest, ReplaceSymbolicSizesPreferSimplerExtents) { + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + auto tv0 = makeSymbolicTensor(3); + 
fusion.addInput(tv0); + auto tv1 = makeSymbolicTensor(2); + fusion.addInput(tv1); + auto i0 = IrBuilder::create(DataType::Index); + fusion.addInput(i0); + + auto tv2 = reshape(tv0, {i0}); + auto tv3 = reshape(tv1, {i0}); + auto tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + ExpressionEvaluator expr_eval; + + expr_eval.bind(tv0->axis(0)->extent(), 2L); + expr_eval.bind(tv0->axis(1)->extent(), 4L); + expr_eval.bind(tv0->axis(2)->extent(), 8L); + expr_eval.bind(tv1->axis(0)->extent(), 8L); + expr_eval.bind(tv1->axis(1)->extent(), 8L); + expr_eval.bind(i0, 64L); + + auto initial_info = DynamicTransform::getInitialInfo(&fusion); + auto info = DynamicTransformConcretizationInfo(&initial_info, &expr_eval); + + DynamicTransform::concretizeFusion(&fusion, &info); + + replaceSymbolicSizes(&fusion); + + // All expr output tensors should use the extent of the tv3 since it + // has only one merge, whereas tv2 has two merges + // All expr output tensors should use the same extent. + auto ref_ext = fusion.outputs().at(0)->as()->axis(0)->extent(); + + // ref_ext should look like getMetaData(T1).logical_size[0] * + // getMetaData(T1).logical_size[1] + auto ext_def = dynamic_cast(ref_ext->definition()); + ASSERT_NE(ext_def, nullptr); + ASSERT_EQ(ext_def->getBinaryOpType(), BinaryOpType::Mul); + auto lhs = ext_def->input(0); + auto rhs = ext_def->input(1); + ASSERT_NE(dynamic_cast(lhs->definition()), nullptr); + ASSERT_NE(dynamic_cast(rhs->definition()), nullptr); + + for (auto expr : fusion.exprs()) { + auto tv_output = ir_utils::getTvOutput(expr); + ASSERT_EQ(tv_output->nDims(), 1); + auto ext = tv_output->axis(0)->extent(); + EXPECT_EQ(ref_ext, ext) << "Reference: " << ref_ext->toString() + << ", actual: " << ext->toString(); + } +} + // Test file size should be up to 10K LoC. Create a new file for more tests. 
} // namespace nvfuser diff --git a/tests/cpp/test_gpu_compute_with.cpp b/tests/cpp/test_gpu_compute_with.cpp index 9ef2cfced37..df3b5a9bff1 100644 --- a/tests/cpp/test_gpu_compute_with.cpp +++ b/tests/cpp/test_gpu_compute_with.cpp @@ -164,9 +164,9 @@ TEST_F(NVFuserTest, FusionComputeWith1_CUDA) { at::Tensor t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -219,9 +219,9 @@ TEST_F(NVFuserTest, FusionComputeWith2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({dimx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); @@ -261,9 +261,9 @@ TEST_F(NVFuserTest, FusionComputeWith3_CUDA) { at::Tensor t0 = at::randn({123}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -307,9 +307,9 @@ TEST_F(NVFuserTest, FusionComputeWith4_CUDA) { at::Tensor t0 = at::randn({345, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -346,9 +346,9 @@ TEST_F(NVFuserTest, FusionComputeWith5_CUDA) { at::Tensor t0 = at::randn({345, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto 
cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -449,9 +449,9 @@ TEST_F(NVFuserTest, FusionComputeWith6_CUDA) { const std::vector input_shape{N, H, W, C}; auto t0 = at::randn(input_shape, options_half); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams()); + auto cg_outputs = ke.run({t0}); auto t1 = t0.to(at::kFloat); auto t2 = t1.mean({0, 1, 2}); diff --git a/tests/cpp/test_gpu_fused_reduction.cpp b/tests/cpp/test_gpu_fused_reduction.cpp index c29460a4241..6080862aed5 100644 --- a/tests/cpp/test_gpu_fused_reduction.cpp +++ b/tests/cpp/test_gpu_fused_reduction.cpp @@ -115,9 +115,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0).unsqueeze(0) + t0; @@ -164,9 +164,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0).unsqueeze(0) + t0; @@ -212,9 +212,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx, ny}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0, {1}).unsqueeze(-1) + t0; @@ -257,9 +257,9 @@ TEST_F(NVFuserTest, 
FusionGridAllreduce4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (sum(t0) + 1).unsqueeze(0) + t0; @@ -319,9 +319,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce5_CUDA) { auto t0 = at::randn({iter, nx}, options); auto t5 = at::randn({bdimy, bdimx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t5}); - auto cg_outputs = fe.runFusion({t0, t5}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t5}); + auto cg_outputs = ke.run({t0, t5}); auto ref = (sum(t0, {1}) + 1).unsqueeze(-1) + t0; @@ -371,14 +371,14 @@ TEST_F(NVFuserTest, FusionGridAllreduce6_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionGridAllreduceWelford1_CUDA) { @@ -417,9 +417,9 @@ TEST_F(NVFuserTest, FusionGridAllreduceWelford1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0.mean({0}).unsqueeze(0) + t0) + t0.var({0}, false).unsqueeze(0) * nx; @@ -467,9 +467,9 @@ TEST_F(NVFuserTest, FusionGridAllreduceWelford2_CUDA) { auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx, ny}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (sum(t0, {1}) / ny).unsqueeze(-1) + t0; @@ -586,10 +586,10 @@ TEST_F(NVFuserTest, FusionFusedReductionBatchnorm_CUDA) { GpuLower gpulw(&fusion); validateNoParallelBroadcastExist(gpulw.run()); - FusionExecutor fe; + KernelExecutor ke; LaunchParams launch_params(2, 2, -1, -1, -1, -1); - fe.compileFusion(&fusion, aten_inputs, launch_params); - auto cg_outputs = fe.runFusion(aten_inputs, launch_params); + ke.compile(&fusion, aten_inputs, launch_params); + auto cg_outputs = ke.run(aten_inputs, launch_params); auto t5 = t0.to(at::kFloat); auto t6 = t1.to(at::kFloat); @@ -653,13 +653,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction1_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({1}) * 2; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Grouping reductions with different ops @@ -698,13 +698,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction2_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) + std::get<0>((t0 + 2).max(1)); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Grouped reduction with different types @@ -741,13 +741,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction3_CUDA) { auto t0 = at::randn(shape, 
options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({1}) + t0.to(c10::kDouble).sum({1}).to(c10::kFloat); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Testing validation @@ -829,11 +829,11 @@ TEST_F(NVFuserTest, FusionGroupedReduction6_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); - testValidate(fe.kernel(), outputs, {t0}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionGroupedReduction7_CUDA) { @@ -892,13 +892,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor1_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({0}) * 2; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Rfactoring grouped reductions @@ -937,13 +937,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor2_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({0}) * 2; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Group reductions of tensors that have computeAt positions set @@ -983,13 +983,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionAfterComputeAt_CUDA) 
{ auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) * 2; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) { @@ -1023,14 +1023,14 @@ TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t3 = t0.sum({0}).unsqueeze(-1); auto ref = t0 + t3 + t3; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Grid reductionso of different types @@ -1076,15 +1076,15 @@ TEST_F(NVFuserTest, FusionGroupAllreduce2_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t2 = t0.sum({1}).unsqueeze(-1); auto t6 = t0.to(c10::kDouble).sum({1}).unsqueeze(-1).to(c10::kFloat); auto ref = t0 + t2 + t6; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Grouping 3 grid allreduces @@ -1124,15 +1124,15 @@ TEST_F(NVFuserTest, FusionGroupAllreduce3_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t3 = t0 / t0.sum({0}).unsqueeze(0); auto t6 = t0 / std::get<0>(t0.max(0)).unsqueeze(0); auto t9 = t0 - std::get<0>(t0.min(0)).unsqueeze(0); - 
testValidate(fe.kernel(), outputs, {t0}, {t3, t6, t9}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {t3, t6, t9}, __LINE__, __FILE__); } // Grouping 8 grid allreduces @@ -1177,9 +1177,9 @@ TEST_F(NVFuserTest, FusionGroupAllreduce4_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); at::Tensor ref = t0; for (int i = 0; i < num_reductions; ++i) { @@ -1189,7 +1189,7 @@ TEST_F(NVFuserTest, FusionGroupAllreduce4_CUDA) { ref = add(ref, bc); } - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Variation of FusionGroupAllreduce5_CUDA but with different @@ -1265,9 +1265,9 @@ TEST_F(NVFuserTest, FusionGroupAllreduce5_CUDA) { std::vector indices({at::indexing::Slice(0, 10)}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t3 = t0 / t0.sum({0}).unsqueeze(0).to(at::kComplexDouble); auto t7 = t4 / t4.sum({0}).unsqueeze(0).to(at::kComplexDouble); @@ -1275,7 +1275,7 @@ TEST_F(NVFuserTest, FusionGroupAllreduce5_CUDA) { auto t15 = t12 / t12.sum({0}).unsqueeze(0).to(at::kComplexDouble); auto t19 = t16 / t16.sum({0}).unsqueeze(0).to(at::kComplexDouble); auto ref = t3 + t7 + t11 + t15 + t19; - testValidate(fe.kernel(), outputs, aten_inputs, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, aten_inputs, {ref}, __LINE__, __FILE__); } // Persistent batchnorm backward with grouped allreduce @@ -1428,14 +1428,14 @@ TEST_F(NVFuserTest, FusionPersistentBNBackwardAllreduce_CUDA) { GpuLower gpulw(&fusion); validateNoParallelBroadcastExist(gpulw.run()); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + 
ke.compile(&fusion, aten_inputs); if (bidx * bidy > deviceSMCount()) { GTEST_SKIP() << "Not enough SMs to run this test"; } - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); std::vector at_reduction_axes; std::copy( @@ -1483,7 +1483,7 @@ TEST_F(NVFuserTest, FusionPersistentBNBackwardAllreduce_CUDA) { } testValidate( - fe.kernel(), outputs, aten_inputs, {at_grad_input}, __LINE__, __FILE__); + ke.kernel(), outputs, aten_inputs, {at_grad_input}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) { @@ -1534,14 +1534,14 @@ TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = (t0_double + 1).sum({0}) + (t0_double + 2).sum({0}); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Channels-last batch norm with vectorization. 
Relies on re-entrant @@ -1649,9 +1649,9 @@ TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) { auto t2 = at::randn({shape.back()}, options_float); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t0_double = t0.to(at::kDouble); auto t1_double = t1.to(at::kDouble); @@ -1664,7 +1664,7 @@ TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) { (t1_double - t2_double.unsqueeze(0).unsqueeze(0).unsqueeze(0)); auto t9 = t8.sum(at_reduction_axes); - testValidate(fe.kernel(), outputs, aten_inputs, {t5, t9}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, aten_inputs, {t5, t9}, __LINE__, __FILE__); } // Test the grouped grid allreduce with BN-like outer reductions @@ -1780,9 +1780,9 @@ TEST_F( auto t2 = at::randn({shape.back()}, options_float); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t0_double = t0.to(at::kDouble); auto t1_double = t1.to(at::kDouble); @@ -1801,7 +1801,7 @@ TEST_F( auto t13 = t1_double + t12; testValidate( - fe.kernel(), outputs, aten_inputs, {t11, t13}, __LINE__, __FILE__); + ke.kernel(), outputs, aten_inputs, {t11, t13}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) { @@ -1868,14 +1868,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = 
t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Test grouping of two domains @@ -1946,14 +1946,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Group both expressions and iterations @@ -2030,16 +2030,16 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto t4 = t0_double + 1 + (t0_double + 1).sum({0}).unsqueeze(0); auto t8 = t0_double + 2 + (t0_double + 2).sum({0}).unsqueeze(0); auto ref = t4 + t8; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // ParallelType::Group with computeAt @@ -2122,14 +2122,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + 
ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) { @@ -2183,14 +2183,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.mean({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Test grouping of two domains @@ -2248,14 +2248,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.mean({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Follows the pattern of persistent outer grid welford in batchnorm @@ -2385,8 +2385,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) { params.N, params.H, params.W, params.C}; auto t0 = at::randn(input_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + 
ke.compile(&fusion, {t0}); // Skip the rest of this test size if the required number of SMs // exceeds the available SM count @@ -2397,7 +2397,7 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) { return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto t1 = t0.to(at::kDouble); auto t2 = t1.mean({0, 1, 2}).unsqueeze(0).unsqueeze(0).unsqueeze(0); @@ -2541,9 +2541,9 @@ TEST_F(NVFuserTest, FusionCrossEntropyGatherPattern_CUDA) { at::randint(0, num_classes, {batch_size}, options.dtype(at::kLong)); std::vector inputs = {at_log_probs, at_labels}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref = at::gather(at_log_probs, 1, at_labels.unsqueeze(1)).squeeze(); diff --git a/tests/cpp/test_gpu_indexing_ops.cpp b/tests/cpp/test_gpu_indexing_ops.cpp index 456fe0c2e46..ed5c4a5ce86 100644 --- a/tests/cpp/test_gpu_indexing_ops.cpp +++ b/tests/cpp/test_gpu_indexing_ops.cpp @@ -396,9 +396,9 @@ TEST_F(NVFuserTest, FusionIndexSelect_Sum_CUDA) { std::vector aten_inputs = {input1, input0, input_idx}; auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::Reduction, aten_inputs); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, heuristic_params->lparams); - fe.runFusion(aten_inputs, {cg_output}, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs, heuristic_params->lparams); + ke.run(aten_inputs, {cg_output}, heuristic_params->lparams); auto tv0_ref = at::index_select(input0, 0, input_idx); at::Tensor tv2_ref = tv0_ref * input1; diff --git a/tests/cpp/test_gpu_outer_reduction.cpp b/tests/cpp/test_gpu_outer_reduction.cpp index afaf89b7aeb..7934689844c 100644 --- a/tests/cpp/test_gpu_outer_reduction.cpp +++ b/tests/cpp/test_gpu_outer_reduction.cpp @@ -115,11 +115,11 @@ TEST_F(OuterReductionTest, 
GroupedGridWelfordOuterOpt) { auto t0 = at::randn(input_shape, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); NVF_CHECK( - fe.kernel()->summary().has_outer_grouped_grid_welford == + ke.kernel()->summary().has_outer_grouped_grid_welford == params.should_use_opt, (params.should_use_opt ? "Failed to use the optimized implementation" : "Should not use the optimized implementation"), @@ -132,7 +132,7 @@ TEST_F(OuterReductionTest, GroupedGridWelfordOuterOpt) { ", ", params.bidx); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t1 = t0; auto t2 = params.dtype == DataType::Half ? t1.to(at::kFloat) : t1; @@ -638,8 +638,8 @@ void grid_persistent_reduction_outer_norm_like( const std::vector input_shape{N, HW, HW, C}; auto t0 = at::randn(input_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -648,12 +648,12 @@ void grid_persistent_reduction_outer_norm_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion({t0}); + cg_outputs = ke.run({t0}); } } @@ -737,8 +737,8 @@ void grid_persistent_welford_outer_norm_like( const std::vector input_shape{N, HW, HW, C}; auto t0 = at::randn(input_shape, options_half); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -747,12 +747,12 @@ void grid_persistent_welford_outer_norm_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); if (benchmark_mode) { for 
(int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion({t0}); + cg_outputs = ke.run({t0}); } } @@ -898,8 +898,8 @@ void grid_persistent_batchnorm_manual( std::vector aten_inputs( {at_input_nvfuser, at_weight, at_bias, at_running_mean, at_running_var}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -908,7 +908,7 @@ void grid_persistent_batchnorm_manual( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); cg_outputs.at(2) = cg_outputs.at(2).permute({0, 3, 1, 2}); auto at_output = at::batch_norm( @@ -923,7 +923,7 @@ void grid_persistent_batchnorm_manual( true); testValidate( - fe.kernel(), + ke.kernel(), {cg_outputs.at(2)}, aten_inputs, {at_output}, @@ -934,7 +934,7 @@ void grid_persistent_batchnorm_manual( if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } } @@ -1037,8 +1037,8 @@ void grid_persistent_reduction_outer_norm_bwd_like( auto t1 = at::randn(input_shape, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -1047,12 +1047,12 @@ void grid_persistent_reduction_outer_norm_bwd_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } @@ -1224,8 +1224,8 @@ void grid_persistent_batchnorm_bwd_manual( std::vector cg_outputs; - FusionExecutor fe; - 
fe.compileFusion(fusion_ptr.get(), aten_inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -1234,7 +1234,7 @@ void grid_persistent_batchnorm_bwd_manual( << params.bidx * bidy << ", available: " << deviceSMCount(); } - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); // Permute grad_input output cg_outputs.at(0) = cg_outputs.at(0).permute({0, 3, 1, 2}); @@ -1251,7 +1251,7 @@ void grid_persistent_batchnorm_bwd_manual( {true, true, true}); testValidate( - fe.kernel(), + ke.kernel(), cg_outputs, aten_inputs, {std::get<0>(at_output), std::get<1>(at_output), std::get<2>(at_output)}, @@ -1262,7 +1262,7 @@ void grid_persistent_batchnorm_bwd_manual( if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } } @@ -2181,22 +2181,20 @@ TEST_F(OuterReductionTest, IterGroupedBlockReduction) { rparams->unroll_factor_iter_dom = vect_factor; scheduler->schedule(&fusion, rparams); - FusionExecutor fusion_executor; - fusion_executor.compileFusion( - &fusion, aten_inputs, heuristic_params->lparams); - auto cg_outputs = - fusion_executor.runFusion(aten_inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs, heuristic_params->lparams); + auto cg_outputs = ke.run(aten_inputs, heuristic_params->lparams); // lowering & check iteration grouped reductions NVF_CHECK( - fusion_executor.kernel()->summary().has_iter_grouped_reductions, + ke.kernel()->summary().has_iter_grouped_reductions, "There must be iter domain grouped reductions."); NVF_CHECK( - fusion_executor.kernel()->summary().num_grouped_iterations == vect_factor, + ke.kernel()->summary().num_grouped_iterations == vect_factor, "Expected ", vect_factor, " grouped iterations, found ", - fusion_executor.kernel()->summary().num_grouped_iterations); + 
ke.kernel()->summary().num_grouped_iterations); testValidate( &fusion, @@ -2292,9 +2290,9 @@ void shmooTestsOfIterGroupedBlockOrGridReduction( auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, @@ -2543,15 +2541,15 @@ TEST_F(OuterReductionTest, IterGroupedMultipleReductions) { << "Expect 2 Iteration domain grouped grid reductions, got: " << num_iter_grouped_reductions; - FusionExecutor fe; + KernelExecutor ke; std::vector shape({redu_dim, iter_dim}); auto options = at::TensorOptions().device(at::kCUDA, 0); auto t0 = at::randn(shape, options); auto t1 = at::randn(shape, options); std::vector aten_inputs({t0, t1}); - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, @@ -2595,10 +2593,10 @@ TEST_F(NVFuserTest, SmallOuterBlockReductionIssue2766) { auto t0 = at::randn({shape[0] * shape[1], shape[2]}, options); std::vector inputs({t0}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_gpu_transpose.cpp b/tests/cpp/test_gpu_transpose.cpp index 11c326878fc..13b01bfb8c9 100644 --- a/tests/cpp/test_gpu_transpose.cpp +++ b/tests/cpp/test_gpu_transpose.cpp @@ -547,9 +547,9 @@ TEST_F(TransposeTest, FusionManualScheduleTransposeComplexDAG1) { at::Tensor input1 = 
at::randn({1024, 512, 256}, options); at::Tensor input2 = at::randn({512, 256, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1, input2}); - auto outputs = fe.runFusion({input0, input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input0, input1, input2}); + auto outputs = ke.run({input0, input1, input2}); testValidate(&fusion, outputs, {input0, input1, input2}, __LINE__, __FILE__); } @@ -987,9 +987,9 @@ TEST_F(TransposeTest, FusionTransposeBankConflict9) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({32, 32, 2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({input}); testValidate(&fusion, outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu_view.cpp b/tests/cpp/test_gpu_view.cpp index 725469b2647..c3319a7ef39 100644 --- a/tests/cpp/test_gpu_view.cpp +++ b/tests/cpp/test_gpu_view.cpp @@ -134,9 +134,9 @@ TEST_F(GpuViewTest, FusionViewAsRealOutput) { at::Tensor at_y = at::randn(output_shape, out_options); std::vector aten_inputs = {at_x, at_bias, at_y}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -279,8 +279,8 @@ void reductionViewAddFusion( at::Tensor at_bias = at::randn(bias_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -445,8 +445,8 @@ void 
persistentViewAddFusion( at::Tensor at_bias = at::randn(bias_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -637,9 +637,9 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain) { auto t0 = at::randn({2, 3}, options); auto t1 = at::randn({1, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -668,8 +668,8 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain2) { at::Tensor at_bias = at::randn(output_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -704,8 +704,8 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain3) { at::Tensor at_z = at::randn(other_shape, options); std::vector aten_inputs = {at_x, at_y, at_z}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -850,9 +850,9 @@ TEST_F(GpuViewTest, FusionFlattenAfterUnsqueezeOutput) { x_add_bias->computeAt(x_reshape, 1); 
x_reshape->axis(0)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -914,15 +914,15 @@ TEST_F(GpuViewTest, FusionExpandRepro) { at::Tensor at_y = at::randn(input_shape2, options); std::vector aten_inputs = {at_x, at_y}; - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); LaunchParams l_params; - auto outputs = fe.runFusion(aten_inputs, {}, l_params, {}); + auto outputs = ke.run(aten_inputs, {}, l_params, {}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); // second run to verify cached output allocation - outputs = fe.runFusion(aten_inputs, {}, l_params, {}); + outputs = ke.run(aten_inputs, {}, l_params, {}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1349,9 +1349,9 @@ TEST_F(GpuViewTest, FusionPwiseViewSchedule) { at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t3 = at::randn({x, y, z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t3}); + auto cg_outputs = ke.run({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, __LINE__, __FILE__); } @@ -1415,9 +1415,9 @@ TEST_F(GpuViewTest, FusionSumViewSchedule) { auto t5 = t4.sum({1}); auto t6 = t0 + t3; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t3}); + auto cg_outputs = ke.run({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t5, t6}, __LINE__, __FILE__); } @@ -1944,9 +1944,9 @@ TEST_F(GpuViewTest, FusionReshapeMapping) { at::Tensor t0 = at::randn({w, x, y * z}, options); at::Tensor t3 = at::randn({w, x * y, z}, options); - FusionExecutor 
fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t3}); + auto cg_outputs = ke.run({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, __LINE__, __FILE__); } @@ -2318,9 +2318,9 @@ TEST_F(GpuViewTest, ExpandedBroadcast) { at::Tensor in_tensor = at::randn({4, 5}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - FusionExecutor fe; - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor actual_out_tensor = fe.runFusion({in_tensor})[0]; + KernelExecutor ke; + ke.compile(&fusion, {in_tensor}); + at::Tensor actual_out_tensor = ke.run({in_tensor})[0]; testValidate(&fusion, {actual_out_tensor}, {in_tensor}, __LINE__, __FILE__); } @@ -2697,9 +2697,9 @@ TEST_F(GpuViewTest, FusionMismatchingReshape) { // TODO: use larger tensor size once we are able to successfully parallelize // this fusion. at::Tensor t0 = at::randn({2, 3, 5}).to(options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index a651fb5778d..bee0c2979dc 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -346,7 +346,7 @@ TEST_P(HostIrTest, ThreeFusions) { // [Step 8)] Execute the Host program HostIrExecutorParams params; - // we test two different modes of the HostIrExecutor: using FusionExecutor or + // we test two different modes of the HostIrExecutor: using KernelExecutor or // FusionExecutorCache auto [use_fusion_executor_cache] = GetParam(); params.use_fusion_executor_cache = use_fusion_executor_cache; diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 23c48dfc0b7..1862747ec47 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -1773,9 +1773,9 @@ TEST_F(IndexingTest, 
SmemAllocationDomainForTranspose) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({256, 256}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0}); - auto outputs = fe.runFusion({input0}); + KernelExecutor ke; + ke.compile(&fusion, {input0}); + auto outputs = ke.run({input0}); testValidate(&fusion, outputs, {input0}, __LINE__, __FILE__); } @@ -2151,6 +2151,11 @@ TEST_F(IndexingTest, DoubleBuffering6) { return nullptr; } + // This loop is double buffered. Since the loop originally has + // just a trip count of 2, the double-buffered main loop has a + // trip count of 1. Thus, this loop is always trivial + loop_indices.at(1) = tv->fusion()->zeroVal(); + switch (tv->name()) { case 1: { if (!as_consumer) { @@ -3040,9 +3045,9 @@ TEST_F(PredicateIndexingTest, DoubleBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3139,9 +3144,9 @@ TEST_F(PredicateIndexingTest, CircularBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3306,9 +3311,9 @@ TEST_F(PredicateIndexingTest, UnrolledCircularBuffering) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto 
outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3387,9 +3392,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3476,9 +3481,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering2) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3582,9 +3587,9 @@ TEST_P(PredicateIndexingTest, UnswitchedCircularBuffering3) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3661,9 +3666,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering4) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({16}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -3754,9 +3759,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplit1) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = 
{t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -3845,9 +3850,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithUnswitch) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -3940,9 +3945,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithCircularBuffering) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4051,9 +4056,9 @@ TEST_F( at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4136,9 +4141,9 @@ TEST_P(PredicateIndexingTest, UnswitchPredicateIssueRepro681) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref = t0.to(at::kDouble).sum(); @@ -4296,9 +4301,9 @@ TEST_F(PredicateIndexingTest, 
NonDivisibleSplitWithUnswitchAndBroadcast) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4419,9 +4424,9 @@ TEST_F(PredicateIndexingTest, UnswitchConsolidationDifferentThreading) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4834,9 +4839,9 @@ TEST_F(ContigIndexingTest, ConcretizedBroadcastMerge) { auto t1 = at::randn({5, 6, 7}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5063,9 +5068,9 @@ TEST_F(ContigPredicateIndexingTest, NonDivisibleSplit1) { at::Tensor t0 = at::randn({10, 20}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } diff --git a/tests/cpp/test_indexing_advanced.cpp b/tests/cpp/test_indexing_advanced.cpp index 1787109a272..4b0674ca015 100644 --- a/tests/cpp/test_indexing_advanced.cpp +++ b/tests/cpp/test_indexing_advanced.cpp @@ -72,10 +72,10 @@ 
TEST_P(AdvancedIndexingTest, InlineBroadcast) { at::Tensor t0 = at::randn({123}, options); at::Tensor t1 = at::randn({3, 123}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); - auto outputs = fe.runFusion({t0, t1}); + auto outputs = ke.run({t0, t1}); testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__); } @@ -117,15 +117,15 @@ TEST_P(AdvancedIndexingTest, 1) { tv2->axis(1)->parallelize(ParallelType::Unroll); tv2->axis(2)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -168,15 +168,15 @@ TEST_P(AdvancedIndexingTest, 2) { tv2->axis(1)->parallelize(ParallelType::Unroll); tv2->axis(2)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -229,9 +229,9 @@ TEST_P(AdvancedIndexingTest, 4) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -263,9 +263,9 @@ TEST_P(AdvancedIndexingTest, 5) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - 
fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -345,9 +345,9 @@ TEST_P(AdvancedIndexingTest, 7) { auto at_t0 = at::randn({numel_x}, options); auto at_t1 = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t1}); - auto cg_outputs = fe.runFusion({at_t0, at_t1}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0, at_t1}); + auto cg_outputs = ke.run({at_t0, at_t1}); auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) .to(at::kDouble) @@ -391,9 +391,9 @@ TEST_P(AdvancedIndexingTest, 8) { auto at_t0 = at::randn({numel_x}, options); auto at_t1 = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t1}); - auto cg_outputs = fe.runFusion({at_t0, at_t1}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0, at_t1}); + auto cg_outputs = ke.run({at_t0, at_t1}); auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) .to(at::kDouble) @@ -484,9 +484,9 @@ TEST_P(AdvancedIndexingTest, 10) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -531,15 +531,15 @@ TEST_P(AdvancedIndexingTest, 11) { tv3->axis(-1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({w, x, y, z}, options); at::Tensor t1 = at::randn({x}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, 
aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -574,9 +574,9 @@ TEST_P(AdvancedIndexingTest, 12) { std::vector aten_outputs = {t2, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); @@ -623,9 +623,9 @@ TEST_P(AdvancedIndexingTest, 13) { std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -665,9 +665,9 @@ TEST_P(AdvancedIndexingTest, 14) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -700,9 +700,9 @@ TEST_P(AdvancedIndexingTest, 15) { at::Tensor t3 = at::randn({bx, by, bz}, options); std::vector aten_inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -732,9 +732,9 @@ TEST_P(AdvancedIndexingTest, 16) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, 
cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -767,9 +767,9 @@ TEST_P(AdvancedIndexingTest, 17) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -804,13 +804,13 @@ TEST_P(AdvancedIndexingTest, 18) { at::Tensor t1 = at::randn({5, 3}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref = (t0.unsqueeze(-1) + t1).sum(); - testValidate(fe.kernel(), cg_outputs, inputs, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), cg_outputs, inputs, {ref}, __LINE__, __FILE__); } TEST_P(AdvancedIndexingTest, 19) { @@ -848,9 +848,9 @@ TEST_P(AdvancedIndexingTest, 19) { at::Tensor t1 = at::randn({5, 11}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -913,9 +913,9 @@ TEST_F(AdvancedIndexingIdModelTest, 20) { at::Tensor t2 = at::randn({7, 13}, options); std::vector inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); #endif @@ -978,9 +978,9 @@ TEST_F(AdvancedIndexingIdModelTest, 21) { auto t6 = at::randn({3, 5, 7}, options); std::vector inputs = {t0, t3, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + 
ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); #endif @@ -1022,9 +1022,9 @@ TEST_F(AdvancedIndexingIdModelTest, MultiPromotion1) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1119,9 +1119,9 @@ TEST_F(AdvancedIndexingIdModelTest, IndexSplitMerge) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); diff --git a/tests/cpp/test_inlining.cpp b/tests/cpp/test_inlining.cpp index 320241e6112..569b541a838 100644 --- a/tests/cpp/test_inlining.cpp +++ b/tests/cpp/test_inlining.cpp @@ -48,9 +48,9 @@ TEST_F(InliningTest, InliningMismatchedDims1) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -80,9 +80,9 @@ TEST_F(InliningTest, InliningMismatchedDims2) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -113,9 +113,9 @@ 
TEST_F(InliningTest, InliningMismatchedDims4) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -150,9 +150,9 @@ TEST_F(InliningTest, InliningBroadcast) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_loop_domain_scheduling.cpp b/tests/cpp/test_loop_domain_scheduling.cpp index 52884529727..710be9ce08a 100644 --- a/tests/cpp/test_loop_domain_scheduling.cpp +++ b/tests/cpp/test_loop_domain_scheduling.cpp @@ -86,9 +86,9 @@ TEST_F(LoopDomainSchedulingTest, ReshapeSplitThenMerge) { auto t0 = at::randn({10}, options); std::vector inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -147,9 +147,9 @@ TEST_F(LoopDomainSchedulingTest, Slice) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -306,9 +306,9 @@ TEST_F(LoopDomainSchedulingTest, ManyReshape) { auto t0 = at::randn({12}, options); std::vector aten_inputs({t0}); - 
FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0 * 2; EXPECT_TRUE(ref.equal(cg_outputs[0])); diff --git a/tests/cpp/test_loop_rotation.cpp b/tests/cpp/test_loop_rotation.cpp index 39314552945..db5f3e20848 100644 --- a/tests/cpp/test_loop_rotation.cpp +++ b/tests/cpp/test_loop_rotation.cpp @@ -76,9 +76,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -169,9 +169,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -278,9 +278,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -389,9 +389,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 
0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -526,9 +526,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -662,9 +662,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp index 1c0fca0ac89..12ed11bd554 100644 --- a/tests/cpp/test_matmul.cpp +++ b/tests/cpp/test_matmul.cpp @@ -124,19 +124,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmul) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + 
PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -185,19 +185,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBroadcastBatch) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout) @@ -243,19 +243,19 @@ TEST_P(MatmulTestWithLayout, AmperePrologueFusionBroadcast) { auto inputs = matmulAtInput2D(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -304,19 +304,19 @@ 
TEST_P(MatmulTestWithLayout, AmpereProloguePointwise) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.sin().to(at::kFloat), inputs.second.sin().to(at::kFloat), @@ -365,19 +365,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBFloat16) { auto inputs = matmulAtInput3DTuring(M, N, K, layout, at::kBFloat16); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -428,19 +428,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulPipelineGmem) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - 
ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -512,25 +512,25 @@ TEST_P(MatmulTestWithLayout, AmpereSwizzle) { FusionProfiler::createSegments(1); } - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.01, 0.01)); - int gdimx = fe.lastLaunchParams().gdimx(); - int gdimy = fe.lastLaunchParams().gdimy(); + int gdimx = ke.lastLaunchParams().gdimx(); + int gdimy = ke.lastLaunchParams().gdimy(); int expected_gdim_unswizzled = (dim + 128 - 1) / 128; int expected_gdimx = expected_gdim_unswizzled * swizzle; @@ -640,19 +640,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulRegCircularBuffer) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - 
ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -932,16 +932,14 @@ TEST_F(MatmulTest, MatmulMatmulAmpere) { .matmul(t1.t().to(at::kFloat)) .matmul(t2.t().to(at::kFloat)); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // relaxed check for now, err accumulation is significant. 
NVF_CHECK(cg_outputs[0].allclose(tref, 0.1, 0.1)); } @@ -1312,16 +1310,14 @@ TEST_F(MatmulTest, MatmulSoftmaxMatmulAmpere) { auto t1 = at::randn({N1, K1}, options); auto t2 = at::randn({N2, K2}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto g1 = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); auto sg1 = at::_softmax(g1, -1, false); auto gsg1 = sg1.matmul(t2.t().to(at::kFloat)); @@ -1367,13 +1363,13 @@ TEST_P(MatmulTestWithLayout, TuringMatmul) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, {inputs.first, inputs.second})); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 7, 5, ke.compile(&fusion, {inputs.first, inputs.second})); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -1511,15 +1507,13 @@ TEST_F(MatmulTest, AmpereMatmulTNCpAsync) { auto t0 = at::randn({M, K}, options); auto t1 = at::randn({N, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), 
matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -1679,16 +1673,14 @@ TEST_F(MatmulTest, AmpereStridedBatchedMatmulTN) { auto t0 = at::randn({B0, M, B1, K}, options); auto t1 = at::randn({B0, N, B1, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // ref implementation: auto ref_t0 = t0.permute({0, 2, 1, 3}) .contiguous() @@ -1852,16 +1844,14 @@ TEST_F(MatmulTest, AmpereViewMatmulTN) { auto t0 = at::randn({M, Ko, Ki}, options); auto t1 = at::randn({N, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = at::native::view(t0, {M, K}).to(at::kFloat).matmul(t1.t().to(at::kFloat)); @@ -2040,11 +2030,11 @@ TEST_F(MatmulTest, AmpereMatmulTNSwizzled) { auto t0 = at::randn({M, K}, options); auto t1 = at::randn({N, 
K}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2091,19 +2081,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoad) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2147,19 +2137,19 @@ TEST_P(MatmulTestWithLayout, TuringMatmulLargeLoad) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 7, 5, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, 
inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2219,19 +2209,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck4warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.run({inputs.first, inputs.second}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK( @@ -2300,19 +2290,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck8warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ 
-2371,19 +2361,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck6warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2431,19 +2421,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoadLargeK) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.001, 0.001)); @@ -2489,15 +2479,13 @@ TEST_P(MatmulTestWithLayout, AmpereSplitKLikeStridedBatchedMatmul) { auto t0 = matmulAtInput2D(layout, TensorMatmulPos::A, at::kHalf, M, N, K, B); auto t1 = matmulAtInput2D(layout, TensorMatmulPos::B, at::kHalf, M, N, K, B); - 
FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({t0, t1}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({t0, t1}); auto tref = splitkLikeAtMatmul(t0.to(at::kFloat), t1.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2578,23 +2566,23 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2612,7 +2600,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) { // - !use_smem_epilogue : A + B (this test is skipped in this case) // - use_smem_epilogue && !promote_prologue_smem_reuse : A + B + C // - use_smem_epilogue && promote_prologue_smem_reuse : max(A + B, C) - auto smem_allocs = 
fe.kernel()->summary().dynamic_smem_allocations; + auto smem_allocs = ke.kernel()->summary().dynamic_smem_allocations; NVF_CHECK(smem_allocs.size() == 3); if (mparams.promote_prologue_smem_reuse) { // Check prologue shared memory re-use @@ -2712,29 +2700,29 @@ TEST_F(MatmulTest, AmpereMatmulSmemEpiloguePromotionRequiredA100) { SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(&fusion, &mparams); - // FusionExecutor::compileFusion would fail otherwise. + // KernelExecutor::compile would fail otherwise. SKIP_IF_INSUFFICIENT_SMEM(&mparams, data_types); at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2818,23 +2806,23 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueCast) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), 
inputs.second.to(at::kFloat), layout); tref = tref.to(at::kHalf); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2914,24 +2902,24 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueRelu) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto t2 = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); auto tref = at::relu(t2).to(at::kFloat); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -3003,13 +2991,13 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitK_CUDA) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, {inputs.first, inputs.second})); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + 7, 5, ke.compile(&fusion, {inputs.first, inputs.second})); + 
EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.run({inputs.first, inputs.second}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); @@ -3068,13 +3056,12 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitKBias_CUDA) { at::Tensor aten_bias = at::randn({M}, aten_a.options()); std::vector inputs = {aten_a, aten_b, aten_bias}; - FusionExecutor fe; - NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs)); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.run(inputs); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atBiasEpilogue( atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout), aten_bias); @@ -3131,13 +3118,12 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitK) { std::vector inputs = {aten_a, aten_b}; - FusionExecutor fe; - NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + KernelExecutor ke; + NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto tref = atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout); @@ -3198,13 +3184,12 @@ 
TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitKBias) { std::vector inputs = {aten_a, aten_b, aten_bias}; - FusionExecutor fe; - NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + KernelExecutor ke; + NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto tref = atBiasEpilogue( atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout), aten_bias); @@ -3257,19 +3242,19 @@ TEST_F(MatmulTest, ReproIssue1808) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -3413,16 +3398,15 @@ TEST_P(MatmulTestWithLayout, MisalignedVectorization) { SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(fusion.get(), &mparams); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( - fusion.get(), inputs, LaunchParams(), matmul_cparams)); - 
ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ke.compile(fusion.get(), inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto outputs = ke.run(inputs); EXPECT_TRUE(outputs[0].allclose(tref, 0.001, 0.001)); } @@ -3473,13 +3457,13 @@ TEST_F(MatmulTest, MultipleConsecutiveDims) { at::Tensor B = at::randn({N1, N2, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto tref = at::reshape( at::linear( at::reshape(A.to(at::kFloat), {M1 * M2, K}), @@ -3539,13 +3523,13 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveMDims) { at::Tensor B = at::randn({N, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto 
Apermuted = A.permute({{1, 2}}).reshape({M1 * M2, K}); auto tref = at::linear(Apermuted.to(at::kFloat), B.to(at::kFloat)) .reshape({M1, M2, N}) @@ -3605,13 +3589,13 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveNDims) { at::Tensor B = at::randn({N1, K, N2}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto Bpermuted = B.permute({{1, 2}}).reshape({N1 * N2, K}); auto tref = at::linear(A.to(at::kFloat), Bpermuted.to(at::kFloat)) .reshape({M, N1, N2}); @@ -3663,13 +3647,13 @@ TEST_F(MatmulTest, MultipleMDimsBatch) { at::Tensor B = at::randn({Batch, N, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto tref = at::matmul(A.to(at::kFloat), at::permute(B.to(at::kFloat), {0, 2, 1})); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -3798,10 +3782,10 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) { auto inputs = matmulAtInput3DHopperSS(M, N, K, 
layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); } diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp index 9dd8a927009..1d078b51d96 100644 --- a/tests/cpp/test_matmul_aten_evaluation.cpp +++ b/tests/cpp/test_matmul_aten_evaluation.cpp @@ -164,8 +164,8 @@ TEST_P(MatmulNodeParametrizedTest, MatmulNodeConcrete) { at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda(); at::Tensor out_ref = at::matmul(t0, t1); - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_TRUE(at::allclose(out[0], out_ref)); } @@ -190,8 +190,8 @@ TEST_P(MatmulNodeParametrizedTest, MatmulNodeSymbolic) { at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda(); at::Tensor out_ref = at::matmul(t0, t1); - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_TRUE(at::allclose(out[0], out_ref)); } @@ -227,17 +227,17 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) { } at::Tensor out_ref = at::linear(t0, t1, bias_opt); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector out = {}; if (bias_shape.has_value()) { - out = fec.runFusionWithInputs({t0, t1, bias_opt}); + out = executor_cache.runFusionWithInputs({t0, t1, bias_opt}); } else { - out = fec.runFusionWithInputs({t0, t1}); + out 
= executor_cache.runFusionWithInputs({t0, t1}); } - const std::vector& executors = - fec.getMostRecentKernelRuntime()->executors(); + const std::vector& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); // Verify that fusion compilation was skipped. EXPECT_FALSE(executors.front().hasCompiledKernel()); @@ -277,17 +277,17 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) { } at::Tensor out_ref = at::linear(t0, t1, bias_opt); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector out = {}; if (bias_shape.has_value()) { - out = fec.runFusionWithInputs({t0, t1, bias_opt}); + out = executor_cache.runFusionWithInputs({t0, t1, bias_opt}); } else { - out = fec.runFusionWithInputs({t0, t1}); + out = executor_cache.runFusionWithInputs({t0, t1}); } - const std::vector& executors = - fec.getMostRecentKernelRuntime()->executors(); + const std::vector& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); // Verify that fusion compilation was skipped. 
EXPECT_FALSE(executors.front().hasCompiledKernel()); diff --git a/tests/cpp/test_matmul_sass.cpp b/tests/cpp/test_matmul_sass.cpp index 974300401d2..0d4385eb46b 100644 --- a/tests/cpp/test_matmul_sass.cpp +++ b/tests/cpp/test_matmul_sass.cpp @@ -98,16 +98,16 @@ sass::Container getSASSFor( SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(&fusion, &mparams); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); - return sass::parse(fe.disassembledKernelSASS()); + return sass::parse(ke.disassembledKernelSASS()); } // A fusion with epilogue made of binary op (scalar multiplication) @@ -161,13 +161,13 @@ sass::Container getBinaryOpMulEpilogueSASSFor( auto inputs = matmulAtInput3DTuring(M, N, K, layout); const double alpha = 2.5; - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second, alpha}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second, alpha}); + auto cg_outputs = ke.run({inputs.first, inputs.second, alpha}); auto tref = at::mul( atMatmul( inputs.first.to(at::kFloat), @@ -178,7 +178,7 @@ sass::Container getBinaryOpMulEpilogueSASSFor( NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); - return sass::parse(fe.disassembledKernelSASS()); + return sass::parse(ke.disassembledKernelSASS()); } } // namespace diff --git a/tests/cpp/test_matmul_scheduler.cpp b/tests/cpp/test_matmul_scheduler.cpp index e532f32ef57..f2109c7f1e1 100644 --- a/tests/cpp/test_matmul_scheduler.cpp +++ b/tests/cpp/test_matmul_scheduler.cpp @@ -2811,10 +2811,10 @@ TEST_P(AllocationDomainTest, BasicMatmul) { 
->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2844,10 +2844,10 @@ TEST_P(AllocationDomainTest, BasicMatmulNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2880,10 +2880,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSet) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2918,10 +2918,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSin) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - 
auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2955,10 +2955,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2992,10 +2992,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinSetNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -3029,10 +3029,10 @@ TEST_P(AllocationDomainTest, MatmulWithPrologueSetCastSinTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -3069,10 +3069,310 
@@ TEST_F(MatmulSchedulerTest, OperandOrderIssue2434) { auto y_ref = at::randn({N, K}, options); std::vector inputs{x_ref, y_ref}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); auto tref = at::linear(x_ref.to(at::kFloat), y_ref.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } +TEST_F(MatmulSchedulerTest, HSH_TT) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::TT; + + auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype); // A [M, K, b] + auto tv1 = makeContigConcreteTensor({1, -1, -1}, dtype); // B [b, K, N] + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = fusedMultiplySum(tv0, tv1, {1}); + + // Reorder the accumulator as [M, N, K] + // [M, rK, N] -> [M, N, K] + tv2->reorder({{-2, -1}, {-1, -2}}); + tv2->commitLeafToLogical(); + + auto tv3 = castOp(DataType::Half, tv2); + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. 
+ gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + //! TODO Disabled because hopper multiple matmul scheduler is currently a copy + //! of ampere scheduler. 
+ /* + KernelExecutor ke; + ke.compile( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + auto cg_outputs = ke.run({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); + */ +} + +TEST_F(MatmulSchedulerTest, HSH_TN) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::TN; + + auto tv0 = makeContigConcreteTensor({-1, 1, -1}, dtype); + auto tv1 = makeContigConcreteTensor({1, -1, -1}, dtype); + fusion->addInput(tv0); + fusion->addInput(tv1); + + // [M, b, K] x [b, N, K] -> [M, N, rK] + auto tv2 = fusedMultiplySum(tv0, tv1, {-1}); + + // [M, N] + auto tv3 = castOp(DataType::Half, tv2); + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. 
+ gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + KernelExecutor ke; + ke.compile( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + + auto cg_outputs = ke.run({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); +} + +TEST_F(MatmulSchedulerTest, HSH_NT) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::NT; // [K, M] x [K, N] -> [M, N] + + auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype); + auto tv1 = makeContigConcreteTensor({-1, 1, -1}, dtype); + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = fusedMultiplySum(tv0, tv1, {0}); + + // Reorder the accumulator as [M, N, K] + // [K, M, N] -> [M, N, K] + tv2->reorder({{-3, -1}}); + tv2->commitLeafToLogical(); + + auto tv3 = castOp(DataType::Half, tv2); + + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == 
ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. + gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + KernelExecutor ke; + ke.compile( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + + auto cg_outputs = ke.run({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); +} + +TEST_F(MatmulSchedulerTest, HSH_NN) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::NN; + + auto tv0 = makeContigConcreteTensor({1, -1, -1}, dtype); // A [b, K, M] + 
auto tv1 = makeContigConcreteTensor({-1, -1, 1}, dtype); // B [N, K, 1] + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = fusedMultiplySum(tv0, tv1, {1}); + + // Reorder the accumulator as [M, N, K] + // [N, rK, M] -> [M, N, K] + tv2->reorder({{-1, -3}}); + tv2->commitLeafToLogical(); + + auto tv3 = castOp(DataType::Half, tv2); + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. + gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + // TODO Disabled because hopper multiple matmul scheduler is currently a copy + // of ampere scheduler. 
+ /* + KernelExecutor ke; + ke.compile( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + auto cg_outputs = ke.run({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); + */ +} + } // namespace nvfuser diff --git a/tests/cpp/test_mbarrier.cpp b/tests/cpp/test_mbarrier.cpp index 84c58192271..f7f9611d895 100644 --- a/tests/cpp/test_mbarrier.cpp +++ b/tests/cpp/test_mbarrier.cpp @@ -46,9 +46,9 @@ TEST_F(MBarrierTest, Simple) { tv2->axis(0)->parallelize(ParallelType::TIDy); tv2->axis(1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; - fe.registerPostLoweringHook([](kir::Kernel* kernel) { + ke.registerPostLoweringHook([](kir::Kernel* kernel) { // Replace block sync with mbarrier FusionGuard fg(kernel); @@ -122,7 +122,7 @@ TEST_F(MBarrierTest, Simple) { top_level_exprs.push_back(invalidate); }); - fe.compileFusion(&fusion); + ke.compile(&fusion); // Make sure that the post-lowering hook successfully inserted all mbarrier // operations @@ -131,14 +131,14 @@ TEST_F(MBarrierTest, Simple) { &typeid(kir::MBarrierArrive), &typeid(kir::MBarrierWait), &typeid(kir::MBarrierInvalidate)}; - for (auto expr : fe.kernel()->topLevelExprs()) { + for (auto expr : ke.kernel()->topLevelExprs()) { remaining_mbarrier_exprs.erase(&typeid(*expr)); } EXPECT_TRUE(remaining_mbarrier_exprs.empty()); auto input = at::randn( {32, 32}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); - auto outputs = fe.runFusion({input}); + auto outputs = ke.run({input}); testValidate(&fusion, outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_memory.cpp b/tests/cpp/test_memory.cpp index 5af4ecc4867..085a95640bc 100644 --- a/tests/cpp/test_memory.cpp +++ b/tests/cpp/test_memory.cpp @@ -78,15 +78,15 @@ TEST_P(MemoryTest, LoadCache) { {1024}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); 
at::Tensor expected_output = input + 1.0f; - FusionExecutor fe; + KernelExecutor ke; { DebugDumpOptionsGuard debug_dump_options_guard; DebugDumpOptionsGuard::getCurOptions().set(DebugDumpOption::Ptx); - fe.compileFusion(&fusion, {input}); + ke.compile(&fusion, {input}); } // Verify PTX. - const executor_utils::CompiledKernel& compiled_kernel = fe.compiledKernel(); + const executor_utils::CompiledKernel& compiled_kernel = ke.compiledKernel(); std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end()); std::regex regex(R"(ld\.global\.)" + cache_op_str + R"(\.\S+)"); std::smatch match; @@ -98,7 +98,7 @@ TEST_P(MemoryTest, LoadCache) { std::filesystem::remove(compiled_kernel.ptx_filename); // Verify output tensors. - std::vector actual_ts = fe.runFusion({input}); + std::vector actual_ts = ke.run({input}); testValidate( &fusion, actual_ts, {input}, {expected_output}, __LINE__, __FILE__); } @@ -153,15 +153,15 @@ TEST_F(MemoryTest, RefineCachePolicy) { {1024}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); at::Tensor c = a + b; - FusionExecutor fe; + KernelExecutor ke; { DebugDumpOptionsGuard debug_dump_options_guard; DebugDumpOptionsGuard::getCurOptions().set(DebugDumpOption::Ptx); - fe.compileFusion(&fusion, {a, b}); + ke.compile(&fusion, {a, b}); } // Verify PTX. 
- const executor_utils::CompiledKernel& compiled_kernel = fe.compiledKernel(); + const executor_utils::CompiledKernel& compiled_kernel = ke.compiledKernel(); std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end()); expectMatchCount(ptx, R"(ld\.global\.ca\.v4\.\S+)", 1); expectMatchCount(ptx, R"(ld\.global\.cs\.v4\.\S+)", 1); @@ -170,7 +170,7 @@ TEST_F(MemoryTest, RefineCachePolicy) { debug() << "Removing " << compiled_kernel.ptx_filename << std::endl; std::filesystem::remove(compiled_kernel.ptx_filename); - std::vector actual_outputs = fe.runFusion({a, b}); + std::vector actual_outputs = ke.run({a, b}); testValidate(&fusion, actual_outputs, {a, b}, {c}, __LINE__, __FILE__); } @@ -457,16 +457,16 @@ TEST_P(TMASimpleLdstTest, Load) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), dim); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), dim); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); ASSERT_EQ( - XorFinder::findXor(fe.kernel()), (swizzle != MmaInputSmemSwizzle::None)); - TMADimChecker::getDim(fe.kernel()); + XorFinder::findXor(ke.kernel()), (swizzle != MmaInputSmemSwizzle::None)); + TMADimChecker::getDim(ke.kernel()); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -533,10 +533,10 @@ TEST_P(TMALoadTestWithABroadcastDim, LoadWithBroadcast) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - auto cg_outputs 
= fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -577,15 +577,15 @@ TEST_P(TMASimpleLdstTest, Store) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), dim); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), dim); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); ASSERT_EQ( - XorFinder::findXor(fe.kernel()), (swizzle != MmaInputSmemSwizzle::None)); + XorFinder::findXor(ke.kernel()), (swizzle != MmaInputSmemSwizzle::None)); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -639,13 +639,13 @@ TEST_F(TMAIndexingTest, Load2DTensorWith1DTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -676,13 +676,13 @@ TEST_F(TMAIndexingTest, Load1DTensorWith2DTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024 * 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, 
matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -713,13 +713,13 @@ TEST_F(TMAIndexingTest, NonOneElementStride) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -784,13 +784,13 @@ TEST_F(TMAIndexingTest, Advanced) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({4, 32, 2, 8, 8, 8, 32, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 4); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 4); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -833,13 +833,13 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({4, 32, 2, 8, 8, 8, 32, 8}, options); - FusionExecutor fe; - 
fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 4); - EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, fe.kernel())); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 4); + EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -886,13 +886,13 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32, 4, 2, 8, 8, 8, 2, 8, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 5); - EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, fe.kernel())); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 5); + EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -947,13 +947,13 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation1) { int64_t multiple_of_16B_but_not_more = 4 * 67; auto t0 = at::randn( {prime_number, prime_number, multiple_of_16B_but_not_more}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -994,18 +994,18 @@ TEST_F(TMAIndexingTest, 
DefineBoxByRotation2) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); int64_t multiple_of_8_but_not_more = 8 * 997; auto t0 = at::randn({multiple_of_8_but_not_more}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); // We will be using 2D TMA instead of 1D, because strided box can not be // merged with other bulk axes by rotation. So, this schedule will be // interpreted as viewing then tensor as 2D (M/8, 8) and then applying 2D TMA. // The outer dim of TMA is defined by boxing and striding splits, and the // inner dim is defined as implicit whole. - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); // The tensor shape is not a multiple of 8, so the view should fail. 
@@ -1016,7 +1016,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) { .device(at::kCUDA, 0); int64_t prime_number = 997; auto t0 = at::randn({prime_number}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("must be divisible by 8"))); @@ -1056,8 +1056,8 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); int64_t multiple_of_23 = 23 * 997; auto t0 = at::randn({multiple_of_23, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); // We will be using 3D TMA instead of 2D, because split(23, 8) is indivisible, // we can not consider this schedule as a 2D TMA whose first dimension has box @@ -1065,10 +1065,10 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { // TMA. The dim 0 of TMA is as implicit size-one, and the dim 1 is defined by // a boxing split whose box size is 8, and dim 2 is an implicit whole box with // size N. - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); // The tensor shape is not a multiple of 23, so the view should fail. 
@@ -1079,7 +1079,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { .device(at::kCUDA, 0); int64_t prime_number = 997; auto t0 = at::randn({prime_number, 8}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("must be divisible by 23"))); @@ -1118,14 +1118,14 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain1) { auto t0 = at::randn({128, 1024 * 128}, options) .transpose(0, 1) .view({128, 1024, 128}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); - ASSERT_TRUE(XorFinder::findXor(fe.kernel())); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); + ASSERT_TRUE(XorFinder::findXor(ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1173,13 +1173,13 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain2) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({2, 3, 5, 7, 11, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1221,13 +1221,13 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationLoad) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto 
t0 = at::randn({100000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 4); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1264,13 +1264,13 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationStore) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({100000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 4); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1300,13 +1300,13 @@ TEST_F(TMAMiscTest, DisableIndexHoisting) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1332,13 +1332,13 @@ TEST_F(TMAMiscTest, Repro1977) { auto 
options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1423,9 +1423,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { std::count_if(flattened_exprs.begin(), flattened_exprs.end(), is_wait), 1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } @@ -1475,9 +1475,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { // RAW sync is inserted, the WAR pass has not run yet. We should be able to // remove the RAW sync by adding a cleanup pass. 
- FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } @@ -1542,9 +1542,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { std::count_if(flattened_exprs.begin(), flattened_exprs.end(), is_wait), 2); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } } @@ -1586,12 +1586,12 @@ TEST_F(TMAMiscTest, LoadStrongCorrectness) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::arange(1, 33, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto expect = at::zeros({2, 1, 2, 16}, options); expect.flatten(0, 2).select(0, 0) = at::arange(1, 17, options); @@ -1632,8 +1632,8 @@ TEST_F(TMACompileTimeInvalidTest, BulkNotInTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "ParallelType::Bulk is only supported for cp.async.bulk."))); @@ -1661,8 +1661,8 @@ TEST_F(TMACompileTimeInvalidTest, BulkBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = 
at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "ParallelType::Bulk is only supported for IterType::Iteration."))); @@ -1689,8 +1689,8 @@ TEST_F(TMACompileTimeInvalidTest, InvalidParallelType) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Invalid parallel type for cp.async.bulk: V"))); @@ -1727,13 +1727,13 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0_aligned = at::randn({128 + items_of_16_bytes}, options) .narrow(0, items_of_16_bytes, 128); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0_aligned}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); testValidate( &fusion, cg_outputs, {t0_aligned}, {t0_aligned}, __LINE__, __FILE__); @@ -1741,7 +1741,7 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) { [&]() { auto t0_misaligned = at::randn({128 + items_of_16_bytes / 2}, options) .narrow(0, items_of_16_bytes / 2, 128); - fe.runFusion({t0_misaligned}); + ke.run({t0_misaligned}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "globalAddress, which specifies the starting address of the memory region described, " @@ -1782,13 +1782,13 @@ TEST_F(TMARuntimeInvalidTest, 
MisalignedGlobalStride) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0_aligned = at::randn({128, 128 + items_of_16_bytes}, options).narrow(1, 0, 128); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0_aligned}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); testValidate( &fusion, cg_outputs, {t0_aligned}, {t0_aligned}, __LINE__, __FILE__); @@ -1797,7 +1797,7 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalStride) { auto t0_misaligned = at::randn({128, 128 + items_of_16_bytes / 2}, options) .narrow(1, 0, 128); - fe.runFusion({t0_misaligned}); + ke.run({t0_misaligned}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "globalStrides array, which specifies tensor stride of each of the lower tensorRank - 1 dimensions in bytes, " @@ -1836,8 +1836,8 @@ TEST_F(TMACompileTimeInvalidTest, SizeOfTransfer) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The expected bytes must be a multiple of 16 bytes, but 8 is not."))); @@ -1876,18 +1876,18 @@ TEST_F(TMARuntimeInvalidTest, SizeOfTransfer) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + 
EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0, items_of_16_bytes}); + auto cg_outputs = ke.run({t0, items_of_16_bytes}); testValidate( &fusion, cg_outputs, {t0, items_of_16_bytes}, {t0}, __LINE__, __FILE__); EXPECT_THAT( - [&]() { fe.runFusion({t0, items_of_16_bytes / 2}); }, + [&]() { ke.run({t0, items_of_16_bytes / 2}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The expected bytes must be a multiple of 16 bytes, but "))); } @@ -1929,19 +1929,19 @@ TEST_F(TMARuntimeInvalidTest, InvalidView) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); // (10240,) can be viewed as (10, 1024) auto t0_valid = at::randn({10240}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_valid}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0_valid}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); - auto cg_outputs = fe.runFusion({t0_valid}); + auto cg_outputs = ke.run({t0_valid}); testValidate(&fusion, cg_outputs, {t0_valid}, {t0_valid}, __LINE__, __FILE__); EXPECT_THAT( [&]() { // it is impossible to view (10249,) as (?, 1024) auto t0_inval = at::randn({10249}, options); - fe.runFusion({t0_inval}); + ke.run({t0_inval}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Invalid view in TMA: the extent of"))); @@ -1975,8 +1975,8 @@ TEST_F(TMACompileTimeInvalidTest, InnermostDiscontiguous) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The innermost dimension of the TMA domain must be contiguous"))); @@ -2016,8 +2016,8 @@ TEST_F(TMACompileTimeInvalidTest, MergeDiscontiguous) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + 
KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Can not merge discontiguous dimensions, but"))); @@ -2052,8 +2052,8 @@ TEST_F(TMACompileTimeInvalidTest, InnermostElementStrideNotOne) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "When interleave is CU_TENSOR_MAP_INTERLEAVE_NONE " @@ -2091,8 +2091,8 @@ TEST_F(TMACompileTimeInvalidTest, SwizzleBulkWithNonBulk) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "TMA domain must be a view of the allocation domain of the gmem tensor"))); @@ -2135,8 +2135,8 @@ TEST_F(TMADocTest, Figure13a) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2173,13 +2173,13 @@ TEST_F(TMADocTest, Figure14a) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2214,8 +2214,8 @@ TEST_F(TMADocTest, Figure13b) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, 
{}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2249,13 +2249,13 @@ TEST_F(TMADocTest, Figure14b) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2291,8 +2291,8 @@ TEST_F(TMADocTest, Figure13c) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2327,13 +2327,13 @@ TEST_F(TMADocTest, Figure14c) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2366,8 +2366,8 @@ TEST_F(TMADocTest, Figure13d) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + 
ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2398,13 +2398,13 @@ TEST_F(TMADocTest, Figure14d) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 12}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2441,8 +2441,8 @@ TEST_F(TMADocTest, Figure13e) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2478,13 +2478,13 @@ TEST_F(TMADocTest, Figure14e) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2523,13 +2523,13 @@ TEST_F(TMADocTest, Figure15a) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, 
matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2565,13 +2565,13 @@ TEST_F(TMADocTest, Figure15b) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 12}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 4); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2613,8 +2613,8 @@ TEST_F(TMADocTest, Figure15c) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2660,8 +2660,8 @@ TEST_F(TMADocTest, Figure15d) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2701,8 +2701,8 @@ TEST_F(TMADocTest, Figure15e) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error 
message"))); @@ -2755,9 +2755,9 @@ TEST_P(LdMatrixTest, Regular) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({size1, getK(macro)}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -2881,9 +2881,9 @@ TEST_P(StMatrixSingleTileTest, Regular) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({sizeM, sizeN}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -2942,9 +2942,9 @@ TEST_P(StMatrixTest, Regular) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({sizeM, sizeN}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -3017,9 +3017,9 @@ TEST_P(LdMatrixTest, Transpose) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({getK(macro), size2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_mma.cpp 
b/tests/cpp/test_mma.cpp index 470a1633a9e..5493b4a19d5 100644 --- a/tests/cpp/test_mma.cpp +++ b/tests/cpp/test_mma.cpp @@ -172,10 +172,10 @@ std::vector scheduleCompileAndRun( tv2->setLoopDomain(s.as()); } - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - return fe.runFusion({inputs.first, inputs.second}); + return ke.run({inputs.first, inputs.second}); } TEST_P(MmaTest, SingleTile) { @@ -388,11 +388,11 @@ TEST_P(HopperRS, SingleTile) { auto inputs = matmulAtInput3DHopperRS( getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -484,11 +484,11 @@ TEST_P(HopperRS, SingleTileWithTMALoadStore) { auto inputs = matmulAtInput3DHopperRS( getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -650,10 +650,10 @@ TEST_P(HopperSS, SingleTile) { auto inputs = matmulAtInput3DHopperSS( getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = 
ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -779,10 +779,10 @@ TEST_P(HopperSS, SingleTileTransposed) { auto inputs = matmulAtInput3DHopperSS( getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -958,10 +958,10 @@ TEST_P(HopperSS, MultipleTile) { layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), diff --git a/tests/cpp/test_move_pad.cpp b/tests/cpp/test_move_pad.cpp index 4499ecb5ec5..92ccaeae676 100644 --- a/tests/cpp/test_move_pad.cpp +++ b/tests/cpp/test_move_pad.cpp @@ -41,13 +41,14 @@ TEST_F(MovePadTest, UnaryCat) { at::Tensor t1 = at::randn({2, 10}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + 
executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, BinaryCat) { @@ -71,13 +72,14 @@ TEST_F(MovePadTest, BinaryCat) { at::Tensor t2 = at::randn({2, 10}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, BinaryBroadcastOnNonCatDim) { @@ -105,19 +107,20 @@ TEST_F(MovePadTest, BinaryBroadcastOnNonCatDim) { at::Tensor t2 = at::randn({4, 5}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); // ensure that we propagate the pad across binary operation and the first // segment is no-op - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( HeuristicIs(SchedulerType::NoOp), HeuristicIs(SchedulerType::PointWise))); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, BinaryBroadcastOnCatDim) { @@ -144,13 +147,14 @@ TEST_F(MovePadTest, BinaryBroadcastOnCatDim) { at::Tensor t2 = at::randn({2, 10}, 
options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, PadReplayOnMultipleUsesCase0) { @@ -179,13 +183,14 @@ TEST_F(MovePadTest, PadReplayOnMultipleUsesCase0) { at::Tensor t1 = at::randn({1, 10}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, PadReplayOnMultipleUsesCase1) { @@ -215,10 +220,11 @@ TEST_F(MovePadTest, PadReplayOnMultipleUsesCase1) { at::Tensor t1 = at::randn({4, 10}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + 
executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, CascadePadCase0) { @@ -264,15 +270,16 @@ TEST_F(MovePadTest, CascadePadCase0) { at::Tensor t0 = at::randn({4, 10}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); Fusion* complete_fusion = runtime->fusionSegments()->completeFusion(); std::vector exprs = complete_fusion->exprs(); EXPECT_THAT(exprs, Contains(Property(&Expr::isA, IsTrue())).Times(1)); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, CascadePadCase1) { @@ -302,15 +309,16 @@ TEST_F(MovePadTest, CascadePadCase1) { at::Tensor t0 = at::randn({4, 10}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); Fusion* complete_fusion = runtime->fusionSegments()->completeFusion(); std::vector exprs = complete_fusion->exprs(); EXPECT_THAT(exprs, Contains(Property(&Expr::isA, IsTrue())).Times(2)); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, CascadePadCase2) { @@ -359,10 +367,11 @@ TEST_F(MovePadTest, CascadePadCase2) { 
at::Tensor t0 = at::randn({4, 10}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, NotMergeNegativePad) { @@ -391,10 +400,11 @@ TEST_F(MovePadTest, NotMergeNegativePad) { at::Tensor t0 = at::randn({4, 10}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, BooleanCat) { @@ -418,18 +428,24 @@ TEST_F(MovePadTest, BooleanCat) { at::Tensor t2 = at::randn({2, 10}, options) > 0.5; std::vector aten_inputs = {t0, t1, t2}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); // ExpressionEvaluator is hitting an assert with dynamic value. 
- // https://github.com/NVIDIA/Fuser/issues/2697 testValidate(fec.fusion(), - // out_tensors, aten_inputs, __LINE__, __FILE__); + // https://github.com/NVIDIA/Fuser/issues/2697 + // testValidate(executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, + // __FILE__); at::Tensor ref = at::cat({at::bitwise_and(t0, t1), t2}, 0); testValidate( - fec.fusion(), out_tensors, aten_inputs, {ref}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + aten_inputs, + {ref}, + __LINE__, + __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_move_split_cat.cpp b/tests/cpp/test_move_split_cat.cpp index 247aa96381e..beec3172e2d 100644 --- a/tests/cpp/test_move_split_cat.cpp +++ b/tests/cpp/test_move_split_cat.cpp @@ -39,9 +39,10 @@ TEST_F(MoveSplitCatTest, Cancellable_SplitImmediatelyFollowedByCat) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -60,9 +61,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_DifferentOrder) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 6}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, 
__FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -83,9 +85,10 @@ TEST_F(MoveSplitCatTest, Cancellable_SetWithoutPermute) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -108,9 +111,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_SliceAmountAndPaddingAmountMismatch) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -132,9 +136,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_CatOnlySubsetOfSplitOutputs) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, 
__LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -158,9 +163,10 @@ TEST_F(MoveSplitCatTest, Cancellable_PermuteInBetween) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -193,12 +199,13 @@ TEST_F(MoveSplitCatTest, Cancellable_IncompatibleAllocationOrder) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); // Check the two permutes are merged to one. 
- FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); Fusion* complete_fusion = runtime->fusionSegments()->completeFusion(); EXPECT_THAT(complete_fusion->exprs(), Contains(IsPermute()).Times(1)); @@ -232,9 +239,10 @@ TEST_F(MoveSplitCatTest, Cancellable_MultiplePermutesInBetween) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -258,9 +266,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_WrongAxis) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 2, 4}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -283,9 +292,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_SomeButNotAllArePermuted) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 2, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, 
__LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -311,9 +321,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_PermutedDifferently) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 2}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -338,9 +349,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_UnsupportedOps) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 2, 4}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -364,9 +376,10 @@ TEST_F(MoveSplitCatTest, Cancellable_ReshapeInBetween) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, 
__FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -393,9 +406,10 @@ TEST_F(MoveSplitCatTest, Cancellable_ReshapeAndPermuteInBetween) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({6, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -445,9 +459,10 @@ TEST_F(MoveSplitCatTest, Cancellable_Issue1768) { at::randn({b * h * 3 * s * f}, options) .as_strided({b, h * 3, s, f}, {h * 3 * s * f, f, h * 3 * f, 1}); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); EXPECT_TRUE(out_tensors[2].is_alias_of(in_tensor)); @@ -471,9 +486,10 @@ TEST_F(MoveSplitCatTest, OuterSplit) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 6}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + 
FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -514,11 +530,12 @@ TEST_F(MoveSplitCatTest, MultiplePairs) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 6}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); Fusion* complete_fusion = runtime->fusionSegments()->completeFusion(); std::vector exprs = complete_fusion->exprs(); @@ -564,9 +581,10 @@ TEST_F(MoveSplitCatTest, MultipleCatsOnSameSplit) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 2}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); diff --git a/tests/cpp/test_multidevice_lower_communication.cpp b/tests/cpp/test_multidevice_lower_communication.cpp index 3c454777f0f..47f80f77d9c 100644 --- 
a/tests/cpp/test_multidevice_lower_communication.cpp +++ b/tests/cpp/test_multidevice_lower_communication.cpp @@ -17,9 +17,10 @@ namespace nvfuser { namespace { -void assertIsCompiledToHostIrContainer(const FusionExecutorCache& fec) { - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); - const std::vector& executors = runtime->executors(); +void assertIsCompiledToHostIrContainer( + const FusionExecutorCache& executor_cache) { + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); + const std::vector& executors = runtime->executors(); EXPECT_THAT(executors, testing::SizeIs(1)); for (const auto& executor : executors) { EXPECT_TRUE(executor.fusion()->isA()) @@ -71,9 +72,9 @@ TEST_P(LowerGatherTest, ) { at::randn({in_mesh.size(), kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); if (out_mesh.has(device_id)) { EXPECT_TRUE(at::equal(out_tensor, unsharded_tensor)); @@ -112,9 +113,10 @@ TEST_P(LowerScatterTest, ) { at::Tensor unsharded_tensor = at::randn({out_mesh.size(), kTensorSize}, tensor_options); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({unsharded_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = + executor_cache.runFusionWithInputs({unsharded_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); if (out_mesh.has(device_id)) { EXPECT_TRUE(at::equal(out_tensor, shardTensor(unsharded_tensor, out))); @@ -155,9 +157,9 @@ TEST_P(LowerSendRecvTest, ) { at::randn({in_mesh.size(), kTensorSize}, tensor_options); 
at::Tensor in_tensor = shardTensor(unsharded_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); if (out_mesh.has(device_id)) { EXPECT_TRUE(at::equal(out_tensor, shardTensor(unsharded_tensor, out))); @@ -194,9 +196,9 @@ TEST_F(LowerCollectiveTest, Allgather) { at::randn({num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); EXPECT_TRUE(at::equal(out_tensor, unsharded_tensor)); } @@ -221,10 +223,10 @@ TEST_F(LowerCollectiveTest, Broadcast) { const auto device_id = communicator_->deviceId(); at::Tensor in_tensor = unsharded_tensor.slice(0, device_id, device_id + 1); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; if (num_devices > 1) { - assertIsCompiledToHostIrContainer(fec); + assertIsCompiledToHostIrContainer(executor_cache); } EXPECT_TRUE( @@ -252,9 +254,9 @@ TEST_F(LowerCollectiveTest, Reduce) { const auto device_id = communicator_->deviceId(); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache 
executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); if (device_id == kRoot) { // at::allclose instead of at::equal because addition is involved. @@ -281,9 +283,9 @@ TEST_F(LowerCollectiveTest, Allreduce) { at::randn({num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0))); } @@ -309,10 +311,10 @@ TEST_F(LowerCollectiveTest, Allreduce_Concrete) { at::randn({num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; if (num_devices > 1) { - assertIsCompiledToHostIrContainer(fec); + assertIsCompiledToHostIrContainer(executor_cache); } EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0))); @@ -338,9 +340,9 @@ TEST_F(LowerCollectiveTest, ReduceScatter) { at::randn({num_devices, num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + 
assertIsCompiledToHostIrContainer(executor_cache); at::Tensor unsharded_out_tensor = unsharded_in_tensor.sum(0); EXPECT_TRUE(at::allclose(out_tensor, shardTensor(unsharded_out_tensor, out))); @@ -371,8 +373,8 @@ TEST_F(LowerCollectiveTest, ReduceScatter_Allgather) { at::randn({num_devices, num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0))); } diff --git a/tests/cpp/test_multidevice_matmul.cpp b/tests/cpp/test_multidevice_matmul.cpp index 3032db30b94..24e84f56e5e 100644 --- a/tests/cpp/test_multidevice_matmul.cpp +++ b/tests/cpp/test_multidevice_matmul.cpp @@ -102,12 +102,18 @@ TEST_F(DistributedMatmulTest, MulSum_LayoutTN_NoComms) { std::vector inputs = {shardTensor(in0, a), in1}; auto expected_output = shardTensor(out, c); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + inputs, + {expected_output}, + __LINE__, + __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::Matmul)).Times(1)); @@ -156,13 +162,19 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutTN_NoComms) { std::vector inputs = {shardTensor(in0, a), in1}; auto expected_output = shardTensor(out, c); - FusionExecutorCache 
fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + inputs, + {expected_output}, + __LINE__, + __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1)); @@ -208,13 +220,19 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutTN_Allgather) { std::vector inputs = {shardTensor(in0, a), in1}; auto expected_output = shardTensor(out, c); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + inputs, + {expected_output}, + __LINE__, + __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1)); @@ -258,12 +276,14 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutNT_AllReduce) { in1 = in1.view({Ko, Ki, N}); std::vector inputs = {shardTensor(in0, a), shardTensor(in1, b)}; - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, {out}, __LINE__, __FILE__); + testValidate( + 
executor_cache.fusion(), outputs, inputs, {out}, __LINE__, __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1)); @@ -315,12 +335,18 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutNT_ReduceScatter) { std::vector inputs = {shardTensor(in0, a), shardTensor(in1, b)}; auto expected_output = shardTensor(out, c).view({1, Mi, N}); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + inputs, + {expected_output}, + __LINE__, + __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1)); @@ -354,16 +380,16 @@ TEST_F(DistributedMatmulTest, PresegPreservesSharding) { auto w_tensor = at::randn({mesh.size(), 36, 48}, tensor_options); auto sharded_w_tensor = shardTensor(w_tensor, w); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector inputs({x_tensor, sharded_w_tensor}); - std::vector outputs = fec.runFusionWithInputs(inputs); + std::vector outputs = executor_cache.runFusionWithInputs(inputs); at::Tensor expected_mm_t_tensor = atMatmul(x_tensor, w_tensor.view({mesh.size() * 36, 48}), MmaLayout::TN) .transpose(0, 1) .view({mesh.size(), 36, 12}); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, inputs, {shardTensor(expected_mm_t_tensor, mm_t)}, @@ 
-394,13 +420,13 @@ TEST_F(DistributedMatmulTest, AnnotateWeightOnly) { auto w_tensor = at::randn({mesh.size(), 3, 5}, tensor_options); auto sharded_w_tensor = shardTensor(w_tensor, w); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector inputs({x_tensor, sharded_w_tensor}); - std::vector outputs = fec.runFusionWithInputs(inputs); + std::vector outputs = executor_cache.runFusionWithInputs(inputs); at::Tensor expected_y_tensor = at::matmul(x_tensor, w_tensor); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, inputs, {shardTensor(expected_y_tensor, 0, mesh)}, diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index ece3a36b67a..1e1ff2eab9e 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -62,10 +62,16 @@ TEST_P(MultiDeviceReductionTest, UnshardedInput_ShardedOutput) { auto x1 = shardTensor(x0, tv1); auto x2 = x1 + x1; auto x3 = shardTensor(at::sum(x0 + x0, {sharded_input_dim}), tv3); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, {x1, x2, x3}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), + outputs, + inputs, + {x1, x2, x3}, + __LINE__, + __FILE__); } // Test multidevice fusion with sharded input and replicated intermediates and @@ -98,9 +104,10 @@ TEST_P(MultiDeviceReductionTest, ShardedInput_ReplicatedOutput) { auto x1 = at::randn(unsharded_input_shape, tensor_options); std::vector inputs = {shardTensor(x1, tv0)}; auto x2 = x1 * 2; - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, {x1, x2}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto 
outputs = executor_cache.runFusionWithInputs(inputs); + testValidate( + executor_cache.fusion(), outputs, inputs, {x1, x2}, __LINE__, __FILE__); } INSTANTIATE_TEST_SUITE_P( @@ -137,10 +144,10 @@ TEST_F(MultiDeviceTest, Reduction) { auto unsharded_in_tensor = at::randn({mesh.size(), 4}, tensor_options); auto in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); testValidate( - fec.fusion(), + executor_cache.fusion(), out_tensors, {in_tensor}, {unsharded_in_tensor.sum(0)}, @@ -172,10 +179,10 @@ TEST_F(MultiDeviceTest, Slice) { auto expected_out = aten_x.split(4, 2); std::vector inputs = {{shardTensor(aten_x, x)}}; - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, inputs, {shardTensor(expected_out[0], x), shardTensor(expected_out[1], x)}, @@ -206,8 +213,8 @@ TEST_F(MultiDeviceTest, BackpropMeshes) { at::Tensor unsharded_x_tensor = at::randn({num_devices, 4}, tensor_options); at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor z_tensor = fec.runFusionWithInputs({x_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor z_tensor = executor_cache.runFusionWithInputs({x_tensor})[0]; EXPECT_THAT(z_tensor.sizes(), ElementsAre(1, 4)) << "Due to sharding propagation, z is supposed to " << "be sharded in the same way as x."; @@ -239,11 +246,11 @@ TEST_F(MultiDeviceTest, LayerNorm) { auto aten_outputs = at::native_layer_norm(aten_x, norm_shape, aten_weight, aten_bias, kEps); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = 
fec.runFusionWithInputs({aten_x}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({aten_x}); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, {aten_x}, {std::get<0>(aten_outputs), @@ -278,14 +285,14 @@ TEST_F(MultiDeviceTest, Issue2758) { at::zeros({num_devices, num_devices, 4}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; at::Tensor expected_out_tensor = shardTensor(unsharded_in_tensor.sum(0), reduce_scattered) + in_tensor.size(1); testValidate( - fec.fusion(), + executor_cache.fusion(), {out_tensor}, {in_tensor}, {expected_out_tensor}, @@ -314,20 +321,20 @@ TEST_F(MultiDeviceTest, Transpose) { at::randn({num_devices, 1024, 1024}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; at::Tensor expected_out_tensor = shardTensor(unsharded_in_tensor.transpose(1, 2), out); testValidate( - fec.fusion(), + executor_cache.fusion(), {out_tensor}, {in_tensor}, {expected_out_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose))); @@ -365,11 +372,12 @@ TEST_P(MultiDeviceBroadcastTest, NotExpanded) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache 
executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({1, 8}, options); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_P(MultiDeviceBroadcastTest, Expanded) { @@ -395,11 +403,12 @@ TEST_P(MultiDeviceBroadcastTest, Expanded) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({8}, options).as_strided({3, 8}, {0, 1}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } INSTANTIATE_TEST_SUITE_P(, MultiDeviceBroadcastTest, testing::Bool()); diff --git a/tests/cpp/test_multidevice_transformer.cpp b/tests/cpp/test_multidevice_transformer.cpp index f9d5d96c5da..f4b999e7aec 100644 --- a/tests/cpp/test_multidevice_transformer.cpp +++ b/tests/cpp/test_multidevice_transformer.cpp @@ -698,9 +698,9 @@ TEST_P(DistributedTransformerTest, MLP_Layer) { reference_outs[2], reference_outs[3]}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {0.01, 0.01, 0.02, 0.02}); } @@ -785,9 +785,9 @@ TEST_P(DistributedTransformerTest, 
Sequence_Parallel_MLP_Layer) { shardTensor(reference_outs[2], 0, mesh), shardTensor(reference_outs[3], 0, mesh)}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {0.01, 0.01, 0.02, 0.02}); } @@ -846,9 +846,9 @@ TEST_P(DistributedTransformerTest, MultiheadAttention) { reference_outs[2], reference_outs[3]}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {0.02, 0.02, 0.02, 0.02}); } @@ -920,8 +920,8 @@ TEST_P(DistributedTransformerTest, MLP_Backward) { shardTensor(outs[5], 0, mesh), // linear0 bias grad outs[6]}; // linear0 grad x - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {1e-5, 0.2, 1e-5, 0.01, 0.2, 0.01, 0.02}); } @@ -1021,9 +1021,9 @@ TEST_P(DistributedTransformerTest, MHA_Backward) { .view({1, 3 * E / D}), // linear0 bias grad reference_outs[12]}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto out = fec.runFusionWithInputs(inputs); + auto out = executor_cache.runFusionWithInputs(inputs); validate( expected_outputs, out, {1e-5, 0.02, 1e-5, .01, .02, 0.2, 0.2, 0.2, 0.02}); } @@ -1146,9 +1146,9 @@ TEST_P(DistributedTransformerTest, Forward) { std::vector expected_outputs = { ln0_out_, mha_out_, ln1_out_, mlp_out_, at_out}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache 
executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {1e-4, 0.02, 0.04, 0.04, 0.04}); } @@ -1430,9 +1430,9 @@ TEST_P(DistributedTransformerTest, Backward) { shardTensor(mlp_out_[0], 1, mesh) // mlp linear1 }; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate( expected_outputs, outputs, diff --git a/tests/cpp/test_no_op.cpp b/tests/cpp/test_no_op.cpp index a6e35e9b9ac..0b0e093767a 100644 --- a/tests/cpp/test_no_op.cpp +++ b/tests/cpp/test_no_op.cpp @@ -186,10 +186,11 @@ TEST_F(NoOpTest, View) { TensorView* out = reshape(in, in_shape, out_shape); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -198,7 +199,7 @@ TEST_F(NoOpTest, View) { // Verify the NoOp scheduler was kicked in. 
const std::vector& groups = - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); ASSERT_EQ(groups.size(), 1); SegmentedGroup* group = groups[0]; EXPECT_EQ(group->schedulerType(), SchedulerType::NoOp); @@ -220,12 +221,13 @@ TEST_F(NoOpTest, ExpandedReduction) { out = segment_set(out); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::ones({}).cuda().as_strided({2, 3}, {0, 0}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre(HeuristicIs(SchedulerType::NoOp))); diff --git a/tests/cpp/test_persistent_buffer.cpp b/tests/cpp/test_persistent_buffer.cpp index 529423145dc..3463395b11b 100644 --- a/tests/cpp/test_persistent_buffer.cpp +++ b/tests/cpp/test_persistent_buffer.cpp @@ -343,8 +343,8 @@ TEST_F(PersistentBufferTest, FusionPersistentBufferProjection_CUDA) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor aten_t0 = at::randn({99, 101}, options); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({aten_t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({aten_t0}); testValidate(&fusion, cg_outputs, {aten_t0}, __LINE__, __FILE__); } @@ -611,8 +611,8 @@ TEST_F(PersistentBufferTest, FusionLayerNormFusedOpsRedundantCast_CUDA) { hidden_size * dataTypeSize(dtype), 
"Persistent buffer size is not correct!"); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -679,8 +679,8 @@ TEST_F(PersistentBufferTest, FusionRecomputePersistentBuffer_CUDA) { persistent_buffer_info2.persistent_buffers.size() == 1, "After project to other buffers, should have one persistent buffer!"); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -1172,10 +1172,10 @@ TEST_F(PersistentBufferTest, PostReductionBroadcastCheck) { auto t1 = at::randn({dim0, dim1}, options); auto t2 = at::sum(t0, {1}).unsqueeze(1) + t0; auto t4 = t2 + t1; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); NVF_CHECK( - !fec.getMostRecentKernelRuntime()->isSegmented(), + !executor_cache.getMostRecentKernelRuntime()->isSegmented(), "unexpected segmentation!"); testValidate(fusion, cg_outputs, {t0, t1}, {t4}, __LINE__, __FILE__); @@ -1211,10 +1211,10 @@ TEST_F(PersistentBufferTest, PostReductionBroadcastCheckMultiBcastDims) { auto t1 = at::randn({dim0, dim1, dim2}, options); auto t2 = at::sum(t0, {1, 2}).unsqueeze(-1).unsqueeze(-1) + t0; auto t4 = t2 + t1; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); NVF_CHECK( - 
!fec.getMostRecentKernelRuntime()->isSegmented(), + !executor_cache.getMostRecentKernelRuntime()->isSegmented(), "unexpected segmentation!"); testValidate(fusion, cg_outputs, {t0, t1}, {t4}, __LINE__, __FILE__); @@ -1243,15 +1243,16 @@ TEST_F(PersistentBufferTest, SmemPersistentNotSupportedIn3DReduction) { .device(at::kCUDA, 0); auto t0 = at::randn(input_shape, options); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector aten_inputs = {t0}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); // should be segmented since buffer size is larger than 32K and smem // persistent is not supported yet for 3D reduction. - EXPECT_TRUE(fec.getMostRecentKernelRuntime()->isSegmented()); + EXPECT_TRUE(executor_cache.getMostRecentKernelRuntime()->isSegmented()); - testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); } TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { @@ -1297,10 +1298,10 @@ TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { scheduler->schedule(fusion.get(), heuristic_params.get()); // Run the fusion and validate the results - FusionExecutor fe; - fe.compileFusion(fusion.get(), aten_inputs); + KernelExecutor ke; + ke.compile(fusion.get(), aten_inputs); // Shared memory access should be vectorized. 
- // getBankConflictInfo(fe.kernel()) triggers error "std::get: wrong index for + // getBankConflictInfo(ke.kernel()) triggers error "std::get: wrong index for // variant" when trying to evaluate index with: // `expr_eval.evaluate(ti->index()).as();` for (auto tv : fusion->allTvs()) { @@ -1313,8 +1314,8 @@ TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { } } } - auto cg_outputs = fe.runFusion( - aten_inputs, heuristic_params->as()->lparams); + auto cg_outputs = + ke.run(aten_inputs, heuristic_params->as()->lparams); auto t1 = t0 / t0.sum({1, 2, 3}, true); testValidate(fusion.get(), cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_pointwise.cpp b/tests/cpp/test_pointwise.cpp index 552cb18f3a8..c4684adbb6e 100644 --- a/tests/cpp/test_pointwise.cpp +++ b/tests/cpp/test_pointwise.cpp @@ -23,8 +23,8 @@ using PointwiseTest = NVFuserTest; namespace { -int64_t getVecSizeForPointwise(const FusionExecutorCache& fec) { - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); +int64_t getVecSizeForPointwise(const FusionExecutorCache& executor_cache) { + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); NVF_CHECK(!runtime->isSegmented()); const PointwiseParams* params = runtime->schedulerHeuristics() ->heuristicsList() @@ -62,7 +62,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity2D) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector> size_and_vec{{17, 1}, {18, 2}, {32, 4}}; @@ -71,9 +71,9 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity2D) { auto vec = pair.second; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1000000, size}, options).narrow(1, 0, 16); - auto cg_outputs = fec.runFusionWithInputs({input0}); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), 
vec); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -90,7 +90,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity3D) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector> size_and_vec{{17, 1}, {10, 2}, {16, 4}}; @@ -99,9 +99,9 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity3D) { auto vec = pair.second; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1000000, size, 3}, options).narrow(1, 0, 8); - auto cg_outputs = fec.runFusionWithInputs({input0}); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -120,7 +120,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity5D) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -134,9 +134,9 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity5D) { at::Tensor input0 = at::randn({4, size1, 12345, size2, 3}, options) .narrow(1, 0, 8) .narrow(3, 0, 4); - auto cg_outputs = fec.runFusionWithInputs({input0}); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -158,7 +158,7 @@ TEST_F(PointwiseTest, VectorizeStrideMisalignedBase) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -195,8 +195,8 @@ TEST_F(PointwiseTest, VectorizeStrideMisalignedBase) { at::Tensor flat = at::randn({alloc_size}, options); at::Tensor input0 = flat.as_strided(shape, stride, /*storage_offset=*/align); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } } @@ -214,7 +214,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguitySelfOverlapping) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -241,8 +241,8 @@ TEST_F(PointwiseTest, VectorizeStrideContiguitySelfOverlapping) { stride1, (int64_t)stride2 * 12345, (int64_t)stride2, 3, 1}; at::Tensor input0 = at::empty_strided(shape, stride, options); input0.random_(); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } } @@ -262,13 +262,13 @@ TEST_F(PointwiseTest, VectorizeAllocationDomain) { tv1->setAllocationDomain({tv1->axis(0), tv1->axis(2), tv1->axis(1)}, true); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::empty_strided({1024, 128, 25}, {128 * 25, 1, 128}, options); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), 4); + auto cg_outputs = 
executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), 4); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -407,7 +407,7 @@ TEST_F(PointwiseTest, Issue1567VectorizationFactorAnalysisCase2) { auto tv3 = transpose(tv2, 0, 1); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1024, 1, 2}, options); @@ -444,7 +444,7 @@ TEST_F(PointwiseTest, VIssue1567ectorizationFactorAnalysisCase3) { auto tv3 = transpose(tv2, 0, 1); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1, 1024, 2}, options); @@ -549,9 +549,9 @@ TEST_F(PointwiseTest, ShardedPointwise) { unsharded_pparams->flip_grid_binding); pwise_scheduler->schedule(&sharded_fusion, sharded_params.get()); - FusionExecutor fe; - fe.compileFusion(&sharded_fusion, sharded_inputs, sharded_params->lparams); - auto cg_outputs = fe.runFusion(sharded_inputs, sharded_params->lparams); + KernelExecutor ke; + ke.compile(&sharded_fusion, sharded_inputs, sharded_params->lparams); + auto cg_outputs = ke.run(sharded_inputs, sharded_params->lparams); testValidate( &sharded_fusion, cg_outputs, sharded_inputs, __LINE__, __FILE__); } @@ -659,11 +659,12 @@ TEST_F(PointwiseTest, VectorizeWithExpandedBroadcast) { auto in_tensor = at::randn({kTensorSize}, options).as_strided({2, kTensorSize}, {0, 1}); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + 
testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - EXPECT_GT(getVecSizeForPointwise(fec), 1); + EXPECT_GT(getVecSizeForPointwise(executor_cache), 1); } using VectUnrollFactors = std::tuple; @@ -705,10 +706,10 @@ TEST_P(PointwiseParamsTest, UnrollOnTopOfVectorize) { // Schedule, compile, run, validate scheduler_instance->schedule(fusion.get(), pparams); - FusionExecutor fe; - fe.compileFusion(fusion.get(), runtime_inputs, pparams->lparams); - auto cg_outputs = fe.runFusion(runtime_inputs, pparams->lparams); - const auto& lparams = fe.lastLaunchParams(); + KernelExecutor ke; + ke.compile(fusion.get(), runtime_inputs, pparams->lparams); + auto cg_outputs = ke.run(runtime_inputs, pparams->lparams); + const auto& lparams = ke.lastLaunchParams(); ASSERT_EQ(lparams.gdimy(), dim0 / unroll_outer); ASSERT_EQ( lparams.gdimx(), dim1 / vect_factor / lparams.bdimx() / unroll_inner); diff --git a/tests/cpp/test_predicate_elimination.cpp b/tests/cpp/test_predicate_elimination.cpp index 8b941f7e0e6..bfb12a12b8a 100644 --- a/tests/cpp/test_predicate_elimination.cpp +++ b/tests/cpp/test_predicate_elimination.cpp @@ -77,9 +77,9 @@ TEST_F(PredicateEliminationTest, 2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) + 1; @@ -127,9 +127,9 @@ TEST_F(PredicateEliminationTest, 3) { for (auto size : {1, 2, 999, 1001, 1234, 10000}) { auto t0 = at::randn({size}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0) + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); @@ -180,9 +180,9 @@ 
TEST_F(PredicateEliminationTest, 4) { for (auto s1 : sizes) { auto t0 = at::randn({s0, s1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto t1 = t0.sum({1}); auto t3 = t1.sum({0}) + 1; @@ -228,9 +228,9 @@ TEST_F(PredicateEliminationTest, 5) { for (auto s0 : sizes) { auto t0 = at::randn({s0}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.mean({0}); @@ -277,9 +277,9 @@ TEST_F(PredicateEliminationTest, 6) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -313,9 +313,9 @@ TEST_F(PredicateEliminationTest, 7) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({123}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -382,12 +382,12 @@ TEST_F(PredicateEliminationTest, 8) { at::Tensor aten_t3 = at::randn(full_size, options); // tv0 - 3 at::Tensor aten_t4 = at::randn({channel_size}, options); // tv4 - 4 - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = - fec.runFusionWithInputs({aten_t0, aten_t1, aten_t2, aten_t3, aten_t4}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs( + {aten_t0, aten_t1, aten_t2, aten_t3, aten_t4}); const auto& 
compiled_executors = - fec.getMostRecentKernelRuntime()->executors(); + executor_cache.getMostRecentKernelRuntime()->executors(); NVF_CHECK(compiled_executors.size() == 1, "Unexpected scheduling"); NVF_CHECK( !PredicatedChecker::isPredicated(tv6, compiled_executors.at(0).kernel()), @@ -431,9 +431,9 @@ TEST_F(PredicateEliminationTest, 9) { // with TIDx in this tensor EXPECT_TRUE(PredicatedChecker::isPredicated(tv1, gpulw)); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -470,16 +470,16 @@ TEST_F(PredicateEliminationTest, ExtentEqualToMaxParallelTypeExtent) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({10 * 32}, options); - FusionExecutor fe; - fe.registerLoweringHook([&](GpuLower* lower) { + KernelExecutor ke; + ke.registerLoweringHook([&](GpuLower* lower) { lower->passes().insert( lower->passes().begin(), {"validate_smem_predicate_elimination", validate_smem_predicate_elimination}); }); - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_preseg_passes.cpp b/tests/cpp/test_preseg_passes.cpp index ca554ca0532..1a34d693c53 100644 --- a/tests/cpp/test_preseg_passes.cpp +++ b/tests/cpp/test_preseg_passes.cpp @@ -635,11 +635,12 @@ TEST_F(PresegTest, ReplaceOutput) { TensorView* y = add(x, x); fusion->replaceOutput(x, y); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({10}, at::device(at::kCUDA)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = 
executor_cache.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(PresegTest, ExtentSubstitution) { diff --git a/tests/cpp/test_replay.cpp b/tests/cpp/test_replay.cpp index 1bb5c460bef..76f1271907a 100644 --- a/tests/cpp/test_replay.cpp +++ b/tests/cpp/test_replay.cpp @@ -46,8 +46,9 @@ TEST_F(ReplayTest, HorizontallyMergeReshapeAndPermute) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); auto out_tensor = out_tensors[0]; @@ -85,8 +86,9 @@ TEST_F(ReplayTest, HorizontallyMergeReshapeAndNeg) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); auto out_tensor = out_tensors[0]; diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index cc7fc96d8cd..ed02e67ee5f 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -63,9 +63,9 @@ TEST_P(ResizeTest, Pad1) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -99,9 +99,9 
@@ TEST_P(ResizeTest, Pad2) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -152,9 +152,9 @@ TEST_P(ResizeTest, Pad3) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -186,9 +186,9 @@ TEST_P(ResizeTest, Pad4) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -241,9 +241,9 @@ TEST_P(ResizeTest, Pad5) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -292,9 +292,9 @@ TEST_P(ResizeTest, Pad6) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -343,9 +343,9 @@ TEST_P(ResizeTest, Pad7) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, 
aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -388,9 +388,9 @@ TEST_F(ResizeTest, Pad8) { auto t0 = at::randn(999, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {0, 1}) + at::pad(t0, {1, 0}); @@ -613,9 +613,9 @@ TEST_F(ResizeTest, Cat1) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -645,9 +645,9 @@ TEST_F(ResizeTest, Cat2) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -686,9 +686,9 @@ TEST_F(ResizeTest, Cat3) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 1); @@ -730,9 +730,9 @@ TEST_F(ResizeTest, Cat4) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = 
ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 1); @@ -779,9 +779,9 @@ TEST_F(ResizeTest, Cat5) { auto t2 = at::randn(shape2, options); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -823,9 +823,9 @@ TEST_F(ResizeTest, Cat6) { auto t2 = at::randn(shape2, options); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1, t2}, 0); @@ -879,9 +879,9 @@ TEST_F(ResizeTest, Cat7) { std::vector aten_inputs_ivalue( {aten_inputs.begin(), aten_inputs.end()}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs_ivalue); - auto cg_outputs = fe.runFusion(aten_inputs_ivalue); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs_ivalue); + auto cg_outputs = ke.run(aten_inputs_ivalue); auto ref = at::cat(aten_inputs, concat_dim); @@ -1013,9 +1013,9 @@ TEST_F(ResizeTest, Slice1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -1044,9 +1044,9 @@ TEST_F(ResizeTest, Slice2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, 
__FILE__); } @@ -1142,9 +1142,9 @@ TEST_F(ResizeTest, Slice4) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = (t0 + 1).to(at::kDouble).sum({1}); @@ -1197,9 +1197,9 @@ TEST_F(ResizeTest, Slice5) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t1 = t0.index( {at::indexing::Slice(0, at::indexing::None), @@ -1249,9 +1249,9 @@ TEST_F(ResizeTest, SliceConstantShmoo) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1294,13 +1294,13 @@ TEST_F(ResizeTest, SliceInputShmoo) { !fusion.hasDynamicTransform(), "Expected to have no dynamic transform"); } - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto t0 = at::randn(shape, options); for (auto [start, stop] : slice_cases) { std::vector aten_inputs({t0, start, stop}); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1328,14 +1328,15 @@ TEST_F(ResizeTest, SliceInputShmooFusionExecutorCache) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto t0 = at::randn(shape, 
options); for (auto [start, stop] : slice_cases) { std::vector aten_inputs({t0, start, stop}); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); } } @@ -1755,9 +1756,9 @@ TEST_P(ResizeTest, PadWithValue) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}, "constant", 2); @@ -1830,9 +1831,9 @@ TEST_P(ResizeTest, PadHalfWithDoubleValue) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}, "constant", 2.5); @@ -2186,7 +2187,7 @@ TEST_F(ResizeTest, FusionSizeZeroSliceSplitSchedule) { FusionExecutorCache executor_cache(std::move(fusion)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionExecutor fe; + KernelExecutor ke; auto ref0 = t0.index({at::indexing::Slice(0, 2)}); auto ref1 = t0.index({at::indexing::Slice(2, 4)}); @@ -2228,15 +2229,15 @@ TEST_F(ResizeTest, FusionSizeZeroSliceSplit) { tv1->merge(0, 1); // size 0*5 = 0 tv1->split(0, 4); // sizes (0, 4) - FusionExecutor fe; - fe.compileFusion(fusion.get()); + KernelExecutor ke; + ke.compile(fusion.get()); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref0 = 
t0.index({at::indexing::Slice(2, 2), at::indexing::Slice(0, 5)}); @@ -2267,7 +2268,7 @@ TEST_F(ResizeTest, FusionSqueezeSymbolic) { // tv1 is of shape {0, 5} fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); @@ -2275,14 +2276,14 @@ TEST_F(ResizeTest, FusionSqueezeSymbolic) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0, 20}); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto ref0 = t0.flatten(); NVF_CHECK(ref0.equal(cg_outputs[0])); EXPECT_THAT( - [&]() { fec.runFusionWithInputs({t0, 10}); }, + [&]() { executor_cache.runFusionWithInputs({t0, 10}); }, ThrowsMessage( HasSubstr("must concretize to IterType::Broadcast but found"))); } @@ -2680,9 +2681,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2733,9 +2734,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref_t1 = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2784,9 +2785,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual3) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = 
fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2823,9 +2824,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) { auto t0_unaligned = at::randn(shape, options); auto t0_aligned = t0_unaligned.index({at::indexing::Slice(3, -1)}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}); - auto cg_outputs = fe.runFusion({t0_aligned}); + KernelExecutor ke; + ke.compile(&fusion, {t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); auto ref_aligned = t0_aligned.index({at::indexing::Slice(1, -3)}); @@ -2867,9 +2868,9 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index( {at::indexing::Slice(slice_offset, shape[0] - slice_offset), @@ -2917,11 +2918,11 @@ TEST_F(ResizeTest, Slice3DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); EXPECT_THAT( - [&]() { fe.runFusion(aten_inputs); }, + [&]() { ke.run(aten_inputs); }, ThrowsMessage( HasSubstr("with word size 2 not possible due to invalid stride"))); } @@ -2960,11 +2961,11 @@ TEST_F(ResizeTest, Slice3DVectorizeManual2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); EXPECT_THAT( - [&]() { fe.runFusion(aten_inputs); }, + [&]() { ke.run(aten_inputs); }, ThrowsMessage( HasSubstr("with word size 4 not possible due to invalid stride"))); } @@ -3041,9 +3042,9 @@ TEST_F(ResizeTest, 
SliceAndReshapeRepro540Manual) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); for (const auto i : c10::irange(3)) { auto slice_out_ref = t0.index( @@ -3086,23 +3087,23 @@ TEST_P(ResizeTest, ReshapeToPad) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({4, 3}, options); std::vector aten_inputs = {at_x, 1, 1, 3, 4}; auto at_y = at::pad(at_x.reshape({3, 4}), {0, 1, 0, 1}); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // Assert that we segmented into two segments auto seg_fusion = - fusion_executor_cache.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); EXPECT_TRUE(seg_fusion->isSegmented()); EXPECT_EQ(seg_fusion->groups().size(), 2); testValidate( - fusion_executor_cache.fusion(), + executor_cache.fusion(), outputs, aten_inputs, {at_y}, @@ -3131,23 +3132,23 @@ TEST_F(ResizeTest, ReshapeToSlice) { auto tv2 = slice(tv1, {{fusion.zeroVal(), s0}, {fusion.zeroVal(), s1}}); fusion.addOutput(tv2); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({4, 3}, options); std::vector aten_inputs = {at_x, 3, 2, 3, 4}; auto at_y = at::slice(at::slice(at_x.reshape({3, 4}), 0, 0, 3), 1, 0, 2); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto 
outputs = executor_cache.runFusionWithInputs(aten_inputs); // Assert that we segmented into two segments auto seg_fusion = - fusion_executor_cache.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); EXPECT_TRUE(seg_fusion->isSegmented()); EXPECT_EQ(seg_fusion->groups().size(), 2); testValidate( - fusion_executor_cache.fusion(), + executor_cache.fusion(), outputs, aten_inputs, {at_y}, @@ -3179,9 +3180,9 @@ TEST_F(ResizeTest, CatOfBroadcast) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -3216,9 +3217,9 @@ TEST_F(ResizeTest, CatOfExpandedBroadcast) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({at::expand_copy(t0, shape0e), t1}, 0); @@ -3302,9 +3303,9 @@ TEST_P(ResizeTest, PadOfBroadcast) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3338,9 +3339,9 @@ TEST_P(ResizeTest, PadOfExpandedBroadcast) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, 
cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3374,7 +3375,7 @@ TEST_F(ResizeTest, DynamicReshapeIssue1393) { auto tv4 = expand(tv3, {s0, s1, s3}); fusion->addOutput(tv4); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3}, options).as_strided({3, 4}, {1, 0}); @@ -3382,7 +3383,7 @@ TEST_F(ResizeTest, DynamicReshapeIssue1393) { auto ref = t0.add(t1).as_strided({3, 4, 5}, {4, 1, 0}); std::vector aten_inputs({t0, t1}); - auto outputs = fec.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__); } @@ -3424,13 +3425,18 @@ TEST_F(ResizeTest, SqueezeSlicedExpand) { auto t0 = at::randn(shape0, options); std::vector aten_inputs({t0}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto ref = at::squeeze(at::slice(t0, 1, 2, 3), 1); testValidate( - fec.fusion(), cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__); + executor_cache.fusion(), + cg_outputs, + aten_inputs, + {ref}, + __LINE__, + __FILE__); } // Vectorization through resize is not supported yet. 
Make sure @@ -3602,14 +3608,18 @@ TEST_F(ResizeTest, Issue2552) { TensorView* z = add(x, y); fusion->addOutput(z); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor x_tensor = at::randn({1, 3}, options); at::Tensor y_tensor = at::randn({1, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({x_tensor, y_tensor}); + executor_cache.runFusionWithInputs({x_tensor, y_tensor}); testValidate( - fec.fusion(), out_tensors, {x_tensor, y_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {x_tensor, y_tensor}, + __LINE__, + __FILE__); } TEST_F(ResizeTest, Chunk_NegativeSize) { @@ -3623,11 +3633,11 @@ TEST_F(ResizeTest, Chunk_NegativeSize) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); EXPECT_THAT( [&]() { auto in_tensor = at::randn({13}).cuda(); - fec.runFusionWithInputs({in_tensor}); + executor_cache.runFusionWithInputs({in_tensor}); }, ThrowsMessage(HasSubstr("Invalid resized domain extent"))); } @@ -3643,10 +3653,11 @@ TEST_F(ResizeTest, Chunk_SizeZero) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto in_tensor = at::randn({15}).cuda(); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors.back().numel(), 0); } @@ -3662,10 +3673,11 @@ TEST_F(ResizeTest, Chunk_Uneven) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto in_tensor = at::randn({16}).cuda(); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); 
- testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors.back().numel(), 1); } @@ -3715,9 +3727,9 @@ TEST_F(ResizeTest, SliceScheduledLikeProducer) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -3763,9 +3775,9 @@ TEST_F(ResizeTest, PadScheduledLikeConsumer) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0 + 1, {1, 1}) + 1; @@ -3815,9 +3827,9 @@ TEST_F(ResizeTest, SliceThenPadLeftHalf) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad( t0.index({at::indexing::Slice(0, shape[0] / 2)}), {0, shape[0] / 2}); @@ -3870,9 +3882,9 @@ TEST_F(ResizeTest, SliceThenPadRightHalf) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad( t0.index({at::indexing::Slice(shape[0] / 2, shape[0])}), @@ -3934,9 +3946,9 @@ TEST_F(ResizeTest, SliceThenConcat) { auto t0 = 
at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); EXPECT_TRUE(t0.equal(cg_outputs[0])); } @@ -4028,9 +4040,9 @@ TEST_F(ResizeTest, SliceSliceConcatConcat) { auto t0 = at::randn({i0}, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::concat( {at::slice(t0, 0, 0, rope_size / 2) + 1, @@ -4041,4 +4053,74 @@ TEST_F(ResizeTest, SliceSliceConcatConcat) { NVF_CHECK(ref.equal(cg_outputs[0])); } +// manual scheduling that should have vectorized load on padded inputs. +TEST_F(ResizeTest, VectorizePadLowering) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + const std::vector shape({1024L * 1024L}); + + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + auto tv1 = pad(tv0, {IrBuilder::create(4L), IrBuilder::create(4L)}); + fusion.addOutput(tv1); + + tv1->split(0, 4); + tv1->split(0, 128); + + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::TIDx); + tv1->axis(2)->parallelize(ParallelType::Vectorize); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); + + auto ref = at::pad(t0, {4, 4}); + ASSERT_TRUE(ref.equal(cg_outputs[0])); +} + +// manual scheduling that should have vectorized load. 
+TEST_F(ResizeTest, VectorizeWhereLowering) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + const std::vector shape({1024L * 1024L}); + + // Note: nvfuser currently only supports vectorization with a single + // TensorView input. + auto s0 = IrBuilder::create(DataType::Bool); + fusion.addInput(s0); + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + auto tv1 = where(s0, IrBuilder::create(2.0), tv0); + fusion.addOutput(tv1); + + tv1->split(0, 4); + tv1->split(0, 128); + + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::TIDx); + tv1->axis(2)->parallelize(ParallelType::Vectorize); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({at::Scalar(false), t0}); + + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); + + // Note: we cannot use at::where, because aten only support tensor as + // predicate. 
+ ASSERT_TRUE(t0.equal(cg_outputs[0])); +} + } // namespace nvfuser diff --git a/tests/cpp/test_rng.cpp b/tests/cpp/test_rng.cpp index fb05848a86a..c8f7c545ae6 100644 --- a/tests/cpp/test_rng.cpp +++ b/tests/cpp/test_rng.cpp @@ -80,18 +80,23 @@ TEST_F(RNGTest, ValidateWithCURand) { fusion->addOutput(tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size}); + auto cg_outputs = executor_cache.runFusionWithInputs({size}); at::manual_seed(0); auto ref0 = generate_uniform(size, at::kFloat); auto ref1 = generate_uniform(size, at::kDouble); testValidate( - fec.fusion(), cg_outputs, {size}, {ref0, ref1}, __LINE__, __FILE__); + executor_cache.fusion(), + cg_outputs, + {size}, + {ref0, ref1}, + __LINE__, + __FILE__); } } @@ -116,11 +121,11 @@ TEST_F(RNGTest, ManualScheduleValidateWithCURand) { auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({size}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); + KernelExecutor ke; + ke.compile(fusion, {t0}); at::manual_seed(0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -154,11 +159,11 @@ TEST_F(RNGTest, ManualScheduleValidateWithCURand2) { /*maybe_symbolic=*/false); fusion->addOutput(tv0); - FusionExecutor fe; - fe.compileFusion(fusion, {10, 10, 10, 10}); + KernelExecutor ke; + ke.compile(fusion, {10, 10, 10, 10}); at::manual_seed(0); - auto cg_outputs = fe.runFusion({10, 10, 10, 10}); + auto cg_outputs = ke.run({10, 10, 10, 10}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -182,13 +187,13 @@ TEST_F(RNGTest, BroadcastingRNG) { auto tv4 = add(tv0, tv3); fusion->addOutput(tv4); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache 
executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({5, 1}, options); at::Tensor t1 = at::zeros({5, 5}, options); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); auto out = cg_outputs[0]; NVF_CHECK((out.select(1, 0) == out.select(1, 1)).all().item()) NVF_CHECK((out.select(1, 0) == out.select(1, 2)).all().item()) @@ -212,20 +217,21 @@ TEST_F(RNGTest, BroadcastingRNG2) { auto tv3 = add(tv1, tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({1}, options); at::Tensor t1 = at::zeros({size}, options); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); auto out = cg_outputs[0]; at::manual_seed(0); auto ref = generate_uniform(1, dtype).expand_as(t1); - testValidate(fec.fusion(), {out}, {t0, t1}, {ref}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out}, {t0, t1}, {ref}, __LINE__, __FILE__); } } } @@ -287,9 +293,9 @@ TEST_F(RNGTest, BroadcastingRNGSmemNonSquareTile) { SchedulerEntry::makeSchedulerInstance(SchedulerType::Transpose) ->schedule(fusion, &tparams); - FusionExecutor fe; - fe.compileFusion(fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; NVF_CHECK((out.select(1, 0) == out.select(1, 1)).all().item()); @@ -314,18 +320,18 @@ TEST_F(RNGTest, Uniform) { fusion->addOutput(tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { 
at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size, -1.0, 1.0}); + auto cg_outputs = executor_cache.runFusionWithInputs({size, -1.0, 1.0}); at::manual_seed(0); auto ref0 = generate_uniform(size, at::kFloat) * 2 - 1; auto ref1 = generate_uniform(size, at::kDouble) * 2 - 1; testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, {size, -1.0, 1.0}, {ref0, ref1}, @@ -354,11 +360,11 @@ TEST_F(RNGTest, Normal) { fusion->addOutput(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size, 1.0, 0.5}); + auto cg_outputs = executor_cache.runFusionWithInputs({size, 1.0, 0.5}); at::manual_seed(0); auto ref0 = generate_normal(size, at::kFloat) * 0.5f + 1.0f; @@ -367,7 +373,7 @@ TEST_F(RNGTest, Normal) { auto ref3 = generate_normal(size, at::kDouble); testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, {size, 1.0, 0.5}, {ref0, ref1, ref2, ref3}, @@ -389,13 +395,13 @@ TEST_F(RNGTest, RandLikeReduction) { auto tv3 = add(tv1, tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({2, 3}, options); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({t0}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -403,7 +409,7 @@ TEST_F(RNGTest, RandLikeReduction) { auto t2 = generate_uniform(3, dtype).expand_as(t1); auto t3 = t1.add(t2); - testValidate(fec.fusion(), {out}, {t0}, {t3}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), {out}, {t0}, {t3}, __LINE__, __FILE__); } //! 
This is the same as the Uniform test, but we compare against @@ -447,7 +453,7 @@ TEST_F(RNGTest, FunctionalUniform) { fusion->addOutput(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); @@ -465,7 +471,7 @@ TEST_F(RNGTest, FunctionalUniform) { std::vector aten_inputs({size, -1.0, 1.0, 0, 0}); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); std::vector aten_outputs; if (do_stochastic) { @@ -475,7 +481,7 @@ TEST_F(RNGTest, FunctionalUniform) { } testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, aten_inputs, aten_outputs, @@ -514,7 +520,7 @@ TEST_F(RNGTest, DifferentOffsets) { fusion->addOutput(tv0); } - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::unique_ptr fusion_ptr2 = std::make_unique(); { @@ -533,7 +539,7 @@ TEST_F(RNGTest, DifferentOffsets) { for (int64_t size : {1, 4}) { at::manual_seed(0); EXPECT_TRUE(get_current_offset() == 0); - auto r1 = fec.runFusionWithInputs({size}).at(0); + auto r1 = executor_cache.runFusionWithInputs({size}).at(0); EXPECT_TRUE(get_current_offset() == 4); auto r23 = fec2.runFusionWithInputs({size}); auto r2 = r23.at(0); diff --git a/tests/cpp/test_scalar_hoisting.cpp b/tests/cpp/test_scalar_hoisting.cpp index 6aa08c52b53..d0295aa20f3 100644 --- a/tests/cpp/test_scalar_hoisting.cpp +++ b/tests/cpp/test_scalar_hoisting.cpp @@ -213,9 +213,9 @@ TEST_F(ScalarHoistTest, IndexHoist1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({15, 17}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); 
testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -257,9 +257,9 @@ TEST_F(ScalarHoistTest, IndexHoist2) { auto t0 = at::randn({16}, options); auto t1 = at::randn({16}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -290,9 +290,9 @@ TEST_F(ScalarHoistTest, IndexHoist3) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::arange(10000, options).view({100, 100}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T2) { @@ -369,9 +369,9 @@ TEST_F(ScalarHoistTest, ARange) { int64_t start = 0, end = 100, step = 1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {start, end, step}); - auto cg_outputs = fe.runFusion({start, end, step}); + KernelExecutor ke; + ke.compile(fusion.get(), {start, end, step}); + auto cg_outputs = ke.run({start, end, step}); const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(int64_t i0, int64_t i1, int64_t i2, Tensor T0, Tensor T1) { diff --git a/tests/cpp/test_scatter_gather.cpp b/tests/cpp/test_scatter_gather.cpp index 67237e0b5e4..5e1bfdd2eb1 100644 --- a/tests/cpp/test_scatter_gather.cpp +++ b/tests/cpp/test_scatter_gather.cpp @@ -132,7 +132,7 @@ TEST_F(ScatterGatherTest, TorchGatherAllRankAllSelectedDim) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); for (const auto is_take_along : {false, true}) { - for (int rank = 1; rank <= 5; ++rank) { + for (int rank = 1; rank <= 3; ++rank) { for (int dim = 0; dim < rank; 
++dim) { // this test uses a random input shape, clear the allocator to avoid // OOM. @@ -586,10 +586,10 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorPointwise1) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -621,11 +621,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorPointwise2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -655,11 +655,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction1) { auto t1 = at::randint(0, shape[0], {2}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction, SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); @@ -695,11 +695,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector 
aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise, SchedulerType::Reduction}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); @@ -734,11 +734,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction3) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -776,11 +776,11 @@ TEST_F(ScatterGatherTest, DISABLED_TakeAlongAxisIntermediateTensorReduction4) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -814,11 +814,12 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization1) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = 
{t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto t0_d = t0.to(at::kDouble); auto ref = at::take_along_dim( @@ -857,11 +858,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise, SchedulerType::InnerPersistent}); auto t5 = at::take_along_dim(t0.to(at::kDouble) + 1, t1.unsqueeze(-1), 1) @@ -902,11 +903,12 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization3) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto t3 = at::take_along_dim(t0.to(at::kDouble) + 1, t1, 1); auto ref = t3 / t3.sum({1}).unsqueeze(-1); @@ -943,13 +945,13 @@ TEST_F( auto t1 = at::randint(0, shape[1], {shape[0], 1}, options_i); std::vector aten_inputs = {t0, t1}; - 
FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // The reduction patterns of the normalization and the final // reduction are different, so they are segmented out validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent, SchedulerType::Reduction}); auto t0_d = t0.to(at::kDouble); @@ -995,11 +997,12 @@ TEST_F( auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto t0_d = t0.to(at::kDouble); auto t6 = at::take_along_dim( @@ -1045,11 +1048,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose1) { auto t1 = at::randint(0, shape[0], {shape[1], shape[2]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Transpose}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Transpose}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1088,11 +1091,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose2) { auto t1 = at::randint(0, shape[0], {10, shape[2], shape[1]}, options_i); std::vector aten_inputs = {t0, 
t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1133,13 +1136,13 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose3) { auto t1 = at::randint(0, shape_before[2], shape_after, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // Transpose scheduler should work for this case but not currently // supported validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1188,11 +1191,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisCrossEntropyLoss) { auto t1 = at::randint(371, {128}, options).to(at::ScalarType::Long); std::vector inputs({t0, t1}); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); - auto kernel_runtime = fec.getMostRecentKernelRuntime(); + auto kernel_runtime = executor_cache.getMostRecentKernelRuntime(); validateSegmentation( kernel_runtime, @@ -1290,10 +1293,10 @@ TEST_F(ScatterGatherTest, GatherIterGoupedReduction) { " grouped iterations, found ", gpulw.kernel()->summary().num_grouped_iterations); - FusionExecutor fe; + KernelExecutor ke; auto lparams = 
rparams->lparams; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); auto t_gather = at::gather(input, dim, input_idx); testValidate( diff --git a/tests/cpp/test_sdpa_node.cpp b/tests/cpp/test_sdpa_node.cpp index 772945f0909..b63986d5a0b 100644 --- a/tests/cpp/test_sdpa_node.cpp +++ b/tests/cpp/test_sdpa_node.cpp @@ -252,8 +252,8 @@ TEST_F(SDPATest, NonCausalAttnConcrete) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -299,8 +299,8 @@ TEST_F(SDPATest, NonCausalAttnSymbolic) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -345,8 +345,8 @@ TEST_F(SDPATest, CausalAttn) { /*return_debug_mask=*/false, /*scale=*/1e-3); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -496,8 +496,8 @@ TEST_F(SDPATest, NonCausalAttnConcreteBwd) { std::vector sdpa_bwd_inputs = { grad_out, q, k, v, output, log_sumexp, philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = 
at::_scaled_dot_product_flash_attention_backward( @@ -518,7 +518,7 @@ TEST_F(SDPATest, NonCausalAttnConcreteBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query, ref_grad_key, ref_grad_value}, @@ -605,8 +605,8 @@ TEST_F(SDPATest, NonCausalAttnSymbolicBwd) { std::vector sdpa_bwd_inputs = { grad_out, q, k, v, output, log_sumexp, philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = at::_scaled_dot_product_flash_attention_backward( @@ -627,7 +627,7 @@ TEST_F(SDPATest, NonCausalAttnSymbolicBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query, ref_grad_key, ref_grad_value}, @@ -683,8 +683,8 @@ TEST_F(SDPATest, AttnProgram) { scale); auto expected_out = (std::get<0>(aten_outputs).to(at::kFloat)) * 2.0; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({q, k, v}); EXPECT_TRUE(at::allclose(out[0], expected_out)); } @@ -744,8 +744,8 @@ TEST_F(SDPATest, AttnFwdBwd) { at::Tensor v = at::randn(v_shape, options).set_requires_grad(true); at::Tensor grad_out = at::randn(attn_shape, options); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v, grad_out}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v, grad_out}); auto attn = at::scaled_dot_product_attention( q, @@ -761,7 +761,7 @@ TEST_F(SDPATest, AttnFwdBwd) { attn.backward(grad_out); testValidate( - fec.fusion(), + executor_cache.fusion(), nvf_out, {q, k, v, grad_out}, {attn, q.grad(), 
k.grad(), v.grad()}, @@ -824,9 +824,9 @@ TEST_F(SDPATest, Sharded_SdpaFwd) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = - fec.runFusionWithInputs({q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs( + {q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -928,8 +928,8 @@ TEST_F(SDPATest, Sharded_SdpaBwd) { philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = at::_scaled_dot_product_flash_attention_backward( @@ -950,7 +950,7 @@ TEST_F(SDPATest, Sharded_SdpaBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query.unsqueeze(0), @@ -1016,9 +1016,9 @@ TEST_F(SDPATest, ComputeAt) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = - fec.runFusionWithInputs({q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs( + {q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); validateSdpaFwdOutputs(nvf_out, aten_out); } diff --git a/tests/cpp/test_segmentation.cpp b/tests/cpp/test_segmentation.cpp index c3a69b09cbb..b893c5c29ad 100644 --- a/tests/cpp/test_segmentation.cpp +++ b/tests/cpp/test_segmentation.cpp @@ -45,10 +45,10 @@ TEST_F(SegmentationTest, Issue1284_Repro1) { at::Tensor at_in_1 = at::randn(input_shape_1, options); std::vector aten_inputs = {at_in_0, at_in_1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache 
executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); testValidate(&fusion, outputs, {at_in_0, at_in_1}, __LINE__, __FILE__); @@ -84,10 +84,10 @@ TEST_F(SegmentationTest, Issue1284_Repro2) { std::vector aten_inputs = {at_in_0, at_in_1, at_in_2}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); testValidate( @@ -147,12 +147,14 @@ TEST_F(SegmentationTest, SegmentHintOnNonTerminatingOutput) { fusion->addOutput(add_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); // Segment 1: in -> add_out (defined by segment_set) // Segment 2: add_out -> mul_out EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); @@ -195,18 +197,19 @@ TEST_F(SegmentationTest, EnforceSegmentationByCachingBeforeAndAfter) { } } - FusionExecutorCache fec(std::move(fusion)); + 
FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); testValidate( - fec.fusion(), + executor_cache.fusion(), out_tensors, {in_tensor}, {in_tensor / in_tensor.sum({0})}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); } @@ -225,10 +228,12 @@ TEST_F(SegmentationTest, SetAllocationDomainOnSegmentBoundary) { add_out->setAllocationDomain( {add_out->axis(0), add_out->axis(1), add_out->axis(2)}, false); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, InputForwardingUntilBinary) { @@ -254,16 +259,20 @@ TEST_F(SegmentationTest, InputForwardingUntilBinary) { z = segment_set(z); fusion->addOutput(z); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({in_tensor, in_tensor}); + executor_cache.runFusionWithInputs({in_tensor, in_tensor}); testValidate( - fec.fusion(), out_tensors, {in_tensor, in_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {in_tensor, in_tensor}, + __LINE__, + __FILE__); - FusionKernelRuntime* runtime = 
fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); } @@ -285,14 +294,18 @@ TEST_F(SegmentationTest, InputForwardingUntilOutput) { fusion->addOutput(out0); fusion->addOutput(out1); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({in_tensor, in_tensor}); + executor_cache.runFusionWithInputs({in_tensor, in_tensor}); testValidate( - fec.fusion(), out_tensors, {in_tensor, in_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {in_tensor, in_tensor}, + __LINE__, + __FILE__); } TEST_F(SegmentationTest, ForwardedExprsAreNotMergeable) { @@ -308,9 +321,10 @@ TEST_F(SegmentationTest, ForwardedExprsAreNotMergeable) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in_tensor = at::randn({10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, ForwardedExprsAreReplicated) { @@ -328,9 +342,10 @@ TEST_F(SegmentationTest, ForwardedExprsAreReplicated) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in_tensor = at::randn({10, 20}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + 
auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, ForceFp16Simple) { @@ -356,18 +371,18 @@ TEST_F(SegmentationTest, ForceFp16Simple) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{15, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); // Check the segmented edge is fp16 SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); for (SegmentedEdge* edge : segmented_fusion->edges()) { auto* edge_tv = edge->val->as(); EXPECT_EQ(edge_tv->getDataType(), DataType::Half); @@ -406,18 +421,18 @@ TEST_F(SegmentationTest, ForceBf16Simple) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{15, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); // Check the segmented edge is bf16 SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); for (SegmentedEdge* edge : segmented_fusion->edges()) { auto* edge_tv = edge->val->as(); EXPECT_EQ(edge_tv->getDataType(), DataType::BFloat16); @@ -452,17 +467,17 @@ TEST_F(SegmentationTest, ForceFp16NotAllCast) { fusion->addOutput(tv7); fusion->addOutput(tv8); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache 
executor_cache(std::move(fusion)); std::vector shape{16, 16, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); Fusion* complete_fusion = segmented_fusion->completeFusion(); // Check that the edge that wasn't fp16 is the producer of the @@ -513,17 +528,17 @@ TEST_F(SegmentationTest, ForceBf16NotAllCast) { fusion->addOutput(tv7); fusion->addOutput(tv8); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{16, 16, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); Fusion* complete_fusion = segmented_fusion->completeFusion(); // Check that the edge that wasn't fp16 is the producer of the @@ -558,14 +573,14 @@ TEST_F(SegmentationTest, SliceSegmentCasts) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto in0 = at::randn({5}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); ASSERT_EQ(segmented_fusion->edges().size(), 1); @@ -579,7 +594,7 @@ 
TEST_F(SegmentationTest, SliceSegmentCasts) { // There should be no cast before the slice EXPECT_TRUE(slice_edge->val->uses().at(0)->isA()); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); } TEST_F(SegmentationTest, codeGenSupportedMergeIssue1970) { @@ -596,13 +611,13 @@ TEST_F(SegmentationTest, codeGenSupportedMergeIssue1970) { auto* tv3 = segment_set(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 4, 3}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); } // Test that Reduction axes are removed in segmentation edges @@ -622,15 +637,16 @@ TEST_F(SegmentationTest, EraseReductionsInSegmentationEdges) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 32, 17}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); - const FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* runtime = + executor_cache.getMostRecentKernelRuntime(); ASSERT_TRUE(runtime != nullptr); SegmentedFusion* segmented_fusion = runtime->fusionSegments(); @@ -662,18 +678,18 @@ TEST_F(SegmentationTest, AliasedOutputOnSegmentation) { auto* tv2 = relu(seg_out); fusion->addOutput(tv2); - 
FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 3, 4}, options); auto in0_ref = in0.clone(); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); auto in0_neg = in0_ref.neg(); EXPECT_TRUE(in0_neg.allclose(in0)); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, {in0.clone()}, {in0_neg.relu()}, @@ -693,14 +709,15 @@ TEST_F(SegmentationTest, MultipleSegmentSetsInOneSegment) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({10}, options); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT(runtime->fusionSegments()->groups(), SizeIs(2)); } @@ -723,10 +740,12 @@ TEST_F(SegmentationTest, ForwardInputsToSegmenterSetIssue2658) { fusion->addOutput(permute_out); fusion->addOutput(compute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, 
__FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_serial_gridreduce.cpp b/tests/cpp/test_serial_gridreduce.cpp index 15d035a75d6..13ee5d77df9 100644 --- a/tests/cpp/test_serial_gridreduce.cpp +++ b/tests/cpp/test_serial_gridreduce.cpp @@ -116,15 +116,15 @@ TEST_F(SerialGridReductionTest, Scheduling) { inlineMost(); - FusionExecutor fe; + KernelExecutor ke; if (serial) { tv3->definition()->as()->requestSerialGridReduction(); } - fe.compileFusion(fusion); + ke.compile(fusion); auto input = at::randn( {H, W}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); - auto outputs = fe.runFusion({input}); + auto outputs = ke.run({input}); if (serial) { // Verify that zeroed semaphore memory was reused instead of diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp index c813bde5225..6738d99857c 100644 --- a/tests/cpp/test_sharding.cpp +++ b/tests/cpp/test_sharding.cpp @@ -155,9 +155,9 @@ TEST_P(ShardingTest, ComputeIndex) { // Dimension 2 has size 1 because that dimension is DIDx parallelized. 
auto a_tensor = at::randn({4, 2, 1, 5}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {a_tensor}); - auto outputs = fe.runFusion({a_tensor}); + KernelExecutor ke; + ke.compile(fusion.get(), {a_tensor}); + auto outputs = ke.run({a_tensor}); testValidate(fusion.get(), outputs, {a_tensor}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_smem_reuse.cpp b/tests/cpp/test_smem_reuse.cpp index 295fd3c2345..5a258ab6917 100644 --- a/tests/cpp/test_smem_reuse.cpp +++ b/tests/cpp/test_smem_reuse.cpp @@ -556,9 +556,9 @@ TEST_F(SmemReuseTest, SmemReuseWithDifferentVectorizationFactor) { } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n_element}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -616,9 +616,9 @@ TEST_F(SmemReuseTest, RegisterReuseWithDifferentVectorizationFactor) { // run the fusion auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n_element}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); }; @@ -677,9 +677,9 @@ TEST_F(SmemReuseTest, ExpandInterferes) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({y}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); }; diff --git a/tests/cpp/test_swizzle.cpp b/tests/cpp/test_swizzle.cpp index f2cb00546eb..d4e910a2522 100644 --- 
a/tests/cpp/test_swizzle.cpp +++ b/tests/cpp/test_swizzle.cpp @@ -54,12 +54,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle0) { auto str = ir_utils::toString(exprs); NVF_CHECK(str.find("where") != std::string::npos); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -93,12 +93,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle1) { // Inlining a producer into a swizzled consumer is ok tv1->computeAt(tv2, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -150,12 +150,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle2) { } } - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -279,12 +279,12 @@ TEST_F(LegacySwizzleTest, LoopSwizzle0) { tv0->computeAt(tv2, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -314,12 +314,12 @@ TEST_F(LegacySwizzleTest, LoopSwizzle1) { 
tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(1)->parallelize(ParallelType::BIDy); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({45, 77}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -349,8 +349,8 @@ TEST_F(LegacySwizzleTest, LoopSwizzleCheck0) { tv0->computeAt(tv2, -1); - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + KernelExecutor ke; + ASSERT_ANY_THROW(ke.compile(&fusion)); } // Test assertion in unsupported pattern: half-inlined loop swizzle. @@ -381,8 +381,8 @@ TEST_F(LegacySwizzleTest, LoopSwizzleCheck1) { // Make tv2 swizzled and partially-inlined (unsupported). tv0->computeAt(tv3, -2); - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + KernelExecutor ke; + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(LegacySwizzleTest, SwizzleVectorize) { @@ -528,8 +528,8 @@ at::Tensor getSwizzledTensor( fusion.addOutput(swizzle.first); fusion.addOutput(swizzle.second); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({size_x, size_y}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs({size_x, size_y}); return input.index_put({outputs[0], outputs[1]}, input); } @@ -615,9 +615,9 @@ TEST_F(LegacySwizzleTest, SwizzleIndexing170) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t = at::randn({64, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({t}); testValidate(&fusion, outputs, {t}, __LINE__, __FILE__); } @@ -678,9 +678,9 @@ TEST_F(LegacySwizzleTest, SwizzleInProducerProjection) { auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({32, 64}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto outputs = ke.run({t}); auto expect = at::empty_like(t); for (auto i : c10::irange(t.size(0) / 8)) { @@ -735,10 +735,10 @@ TEST_F(SwizzleTest, Transpose1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({10240, 10240}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion, {t}); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + std::vector outputs = ke.run({t}); EXPECT_TRUE(at::equal(t.t(), outputs[0])); } diff --git a/tests/cpp/test_tensor_factories.cpp b/tests/cpp/test_tensor_factories.cpp index 8c3b2462ca7..2eabde38b3b 100644 --- a/tests/cpp/test_tensor_factories.cpp +++ b/tests/cpp/test_tensor_factories.cpp @@ -352,9 +352,9 @@ TEST_F(TensorFactoryTest, TensorConstruct) { auto output = tensor(std::vector>{{i00, i01}, {i10, i11}}); fusion->addOutput(output); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({00, 01, 10, 11}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto cg_outputs = ke.run({00, 01, 10, 11}); testValidate(fusion.get(), cg_outputs, {00, 01, 10, 11}, __LINE__, __FILE__); } @@ -403,9 +403,9 @@ TEST_F(TensorFactoryTest, MetadataAsTensor) { auto input0 = at::randn({2, 3, 4, 5}, options); auto input1 = at::randn({6, 7, 8, 9}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({input0, input1}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto cg_outputs = ke.run({input0, input1}); testValidate(fusion.get(), cg_outputs, {input0, input1}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_translate_mma.cpp 
b/tests/cpp/test_translate_mma.cpp index 420bdf1a760..14e9a9d222f 100644 --- a/tests/cpp/test_translate_mma.cpp +++ b/tests/cpp/test_translate_mma.cpp @@ -229,11 +229,11 @@ TEST_P(CombineMulSumAsMmaTestWithLayout, AmpereMulSumToMatmul_Schedule) { auto inputs = matmulAtInput2D(M, N, K, layout); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); diff --git a/tests/cpp/test_tutorial.cpp b/tests/cpp/test_tutorial.cpp index 17e93767a1f..943b2fc6504 100644 --- a/tests/cpp/test_tutorial.cpp +++ b/tests/cpp/test_tutorial.cpp @@ -82,12 +82,12 @@ TEST_F(Tutorial, Memcpy) { std::vector aten_inputs = {t0}; // Next, lower the fusion to Kernel, generate CUDA kernel source and then - // compile it with nvrtc. All of them are done by FusionExecutor - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + // compile it with nvrtc. All of them are done by KernelExecutor + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); - // FusionExecutor now has a compiled kernel, which can be executed as: - std::vector outputs = fe.runFusion(aten_inputs); + // KernelExecutor now has a compiled kernel, which can be executed as: + std::vector outputs = ke.run(aten_inputs); // Note that this run is done using just one thread, which will be // corrected below. @@ -158,15 +158,15 @@ TEST_F(Tutorial, Memcpy) { } // Since the fusion is modified, we need to recompile it. 
- FusionExecutor fe2; - fe2.compileFusion(&fusion, aten_inputs); + KernelExecutor ke2; + ke2.compile(&fusion, aten_inputs); // This time, the kernel is launched with multiple threads and // thread blocks. Note that the launch configurations, i.e., the // thread block and grid shapes, are autoatically inferred from the // given inputs. To see how many threads are used, run this test // with NVFUSER_DUMP=launch_param - outputs = fe2.runFusion(aten_inputs); + outputs = ke2.run(aten_inputs); ASSERT_TRUE(outputs[0].equal(t0)); } @@ -205,9 +205,9 @@ TEST_F(Tutorial, Reduction) { at::Tensor ref = t0.sum({1}); { - FusionExecutor fe; - fe.compileFusion(&fusion); - std::vector outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -221,9 +221,9 @@ TEST_F(Tutorial, Reduction) { } { - FusionExecutor fe; - fe.compileFusion(&fusion); - std::vector outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -239,19 +239,19 @@ TEST_F(Tutorial, Reduction) { } { - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); // Running this fusion, however, should fail as it would require // thread blocks of shape 1024x10, i.e., the same shape as the // input tensor, which is too large in CUDA. // // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); // Try again with a smaller input. 
This should launch a kernel // with thread blocks of shape 32x10 at::Tensor t1 = at::randn({10, 32}, options); - std::vector outputs = fe.runFusion({t1}); + std::vector outputs = ke.run({t1}); testValidate( &fusion, outputs, aten_inputs, {t1.sum({1})}, __LINE__, __FILE__); } @@ -266,13 +266,13 @@ TEST_F(Tutorial, Reduction) { } { - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); // The original input should not fail in this case. The kernel // will be launched with 10 thread blocks, each of which has 1024 // threads. Try running this test with NVFUSER_DUMP=launch_param // to see the launch configuration of each kernel lauch - std::vector outputs = fe.runFusion(aten_inputs); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } } @@ -380,13 +380,13 @@ TEST_F(Tutorial, ReductionRFactor) { std::vector aten_inputs = {t0}; at::Tensor ref = t0.sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion_copy); + KernelExecutor ke; + ke.compile(&fusion_copy); // Since the size of the input is 10000, which is split by a // factor of 1024, the first per-thread reduction is done for // ceilDiv(10000, 1024) = 10 elements. 
- std::vector outputs = fe.runFusion(aten_inputs); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion_copy, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } @@ -439,10 +439,10 @@ TEST_F(Tutorial, ReductionRFactor) { std::vector aten_inputs = {t0}; at::Tensor ref = t0.sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion_copy); + KernelExecutor ke; + ke.compile(&fusion_copy); - std::vector outputs = fe.runFusion(aten_inputs); + std::vector outputs = ke.run(aten_inputs); testValidate(&fusion_copy, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } } @@ -786,9 +786,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -870,9 +870,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -953,9 +953,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1033,9 +1033,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1138,9 +1138,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } @@ -1244,9 +1244,9 @@ TEST_F(Tutorial, BasicTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector shape(3, 300); auto t = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t, outputs[0])); } } @@ -1343,10 +1343,10 @@ TEST_F(Tutorial, VectorizeStorePointwiseTMA) { at::Tensor at_tv0 = at::randn({dim0, dim1}, options); at::Tensor at_tv1 = at::randn({dim0, dim1}, options); - // Compile with FusionExecutor directly to avoid scheduling - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); - auto outputs = fe.runFusion({at_tv0, at_tv1}); + // Compile with KernelExecutor directly to avoid scheduling + KernelExecutor ke; + ke.compile(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); + auto outputs = ke.run({at_tv0, at_tv1}); auto at_output = at_tv0 + at_tv1; testValidate( @@ -1447,10 +1447,10 @@ TEST_F(Tutorial, PointwiseBroadcastTMA) { at::Tensor at_tv0 = at::randn({dim1, dim2, 
dim3}, options); at::Tensor at_tv1 = at::randn({dim0, dim1, dim2, dim3}, options); - // Compile with FusionExecutor directly to avoid scheduling - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); - auto outputs = fe.runFusion({at_tv0, at_tv1}); + // Compile with KernelExecutor directly to avoid scheduling + KernelExecutor ke; + ke.compile(fusion.get(), {at_tv0, at_tv1}, {}, index32bit); + auto outputs = ke.run({at_tv0, at_tv1}); auto at_output = at_tv0 + at_tv1; testValidate( @@ -1551,10 +1551,10 @@ TEST_F(Tutorial, TMABankConflictFreeTranspose) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({10000, 10000}, options); - FusionExecutor fe; + KernelExecutor ke; CompileParams index32bit{DataType::Int32, 255, false}; - fe.compileFusion(&fusion, {t}, {}, index32bit); - std::vector outputs = fe.runFusion({t}); + ke.compile(&fusion, {t}, {}, index32bit); + std::vector outputs = ke.run({t}); ASSERT_TRUE(at::equal(t.t(), outputs[0])); } diff --git a/tests/cpp/test_unary.cpp b/tests/cpp/test_unary.cpp index 9455230683b..76dfd789b39 100644 --- a/tests/cpp/test_unary.cpp +++ b/tests/cpp/test_unary.cpp @@ -57,13 +57,18 @@ TEST_P(UnaryTest, Neg) { in_tensor = at::randn(shape, options); } - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); // Calculate the reference output explicitly. Type promotion happens when // building the fusion, e.g., inside `neg`. Relying ExpresionEvaluator to // verify the result would hide type promotion errors. 
testValidate( - fec.fusion(), out_tensors, {in_tensor}, {-in_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {in_tensor}, + {-in_tensor}, + __LINE__, + __FILE__); } namespace { diff --git a/tests/cpp/test_utils.cpp b/tests/cpp/test_utils.cpp index 5215e58f19d..e04fd39ca43 100644 --- a/tests/cpp/test_utils.cpp +++ b/tests/cpp/test_utils.cpp @@ -1115,16 +1115,16 @@ TEST_F(NVFuserTest, FusionSASSDumpError) { at::Tensor t0 = at::randn({8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.disassembledKernelSASS(); }, + [&]() { ke.disassembledKernelSASS(); }, ::testing::ThrowsMessage( ::testing::HasSubstr("I am fake"))); - auto cg_outputs = fe.runFusion({t0}); - testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); + auto cg_outputs = ke.run({t0}); + testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); } TEST_F(NVFuserTest, ProveLinearAndGetStride) { diff --git a/tests/cpp/utils.cpp b/tests/cpp/utils.cpp index ca29c5fe1bf..64b1cbe55f7 100644 --- a/tests/cpp/utils.cpp +++ b/tests/cpp/utils.cpp @@ -24,15 +24,13 @@ CGResultsPackage scheduleAndRun( bool validate_scheduler) { auto heuristic_params = SchedulerEntry::scheduleWith( fusion, scheduler_type, runtime_inputs, validate_scheduler); - auto fusion_executor = std::make_unique(); - fusion_executor->compileFusion( - fusion, runtime_inputs, heuristic_params->lparams); - auto cg_outputs = - fusion_executor->runFusion(runtime_inputs, heuristic_params->lparams); + auto ke = std::make_unique(); + ke->compile(fusion, runtime_inputs, heuristic_params->lparams); + auto cg_outputs = ke->run(runtime_inputs, heuristic_params->lparams); CGResultsPackage results = { .outputs = cg_outputs, .heuristic_params = std::move(heuristic_params), - .fusion_executor = std::move(fusion_executor)}; + .kernel_executor = std::move(ke)}; return results; } diff --git a/tests/cpp/utils.h b/tests/cpp/utils.h index 
648b85dbe55..d2964c8b731 100644 --- a/tests/cpp/utils.h +++ b/tests/cpp/utils.h @@ -40,12 +40,12 @@ namespace nvfuser { struct CGResultsPackage { std::vector outputs; std::unique_ptr heuristic_params; - std::unique_ptr fusion_executor; + std::unique_ptr kernel_executor; }; // Grabs heuristics and schedules with the provided scheduler type, compiles and // runs with Fuion executor, returns a struct containing the outputs, -// heuristic_params, and FusionExecutor. These structures are for convenience in +// heuristic_params, and KernelExecutor. These structures are for convenience in // testing. If validate_scheduler is set to false the scheduler check will still // be run but it will be ignored. Otherwise canScheduler returning false will // throw. diff --git a/tests/python/utils.py b/tests/python/utils.py index 2a7fadc4a14..9ff3f1b8d78 100644 --- a/tests/python/utils.py +++ b/tests/python/utils.py @@ -437,8 +437,10 @@ def exec_nvfuser( self.assertTrue( check_captured_python_definition(out, fd, inputs_captured, device) ) - - self.assertEqual(fc.num_fusions() - before_fusions, int(new_fusion_expected)) + if not disable_serde: + self.assertEqual( + fc.num_fusions() - before_fusions, int(new_fusion_expected) + ) if is_clonable: self.assertTrue(check_cpp_translation(out, fd, inputs_cloned)) diff --git a/tools/examples/repro.cpp b/tools/examples/repro.cpp index a1e123dd3aa..53058ace4ab 100644 --- a/tools/examples/repro.cpp +++ b/tools/examples/repro.cpp @@ -103,7 +103,7 @@ TEST_F(NVFuserTest, FusionGeneratedTest_CUDA) { outputs.push_back(t32); } - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + KernelExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); }