diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 36a0f9fc966..dff02b6c216 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -724,7 +724,7 @@ LaunchParams KernelExecutor::computeLaunchParams( // This check is only done once a kernel has been compiled, since // maybe_available_dynamic_smem_ needs to be evaluated on // a compiled kernel. - if (hasCompiledKernel()) { + if (isCompiled()) { validateDynamicSmemSize(dynamic_smem_size); } @@ -784,7 +784,7 @@ std::vector KernelExecutor::getIntermediateBufferInfo( } void KernelExecutor::setUsedTVs() { - auto used_vals = fusion()->usedMathVals(); + auto used_vals = kernel()->usedMathVals(); auto used_tvs = ir_utils::filterByType(used_vals); used_tvs_.clear(); used_tvs_.insert(used_tvs_.begin(), used_tvs.begin(), used_tvs.end()); @@ -1128,8 +1128,7 @@ void KernelExecutor::recompileKernel( int64_t KernelExecutor::getAvailableDynamicSmemSize() { NVF_ERROR( - hasCompiledKernel(), - "Cannot get dynamic smem size unless kernel is compiled"); + isCompiled(), "Cannot get dynamic smem size unless kernel is compiled"); if (!available_dynamic_smem_size_.has_value()) { int size = 0; NVFUSER_CUDA_SAFE_CALL(cuFuncGetAttribute( @@ -1143,8 +1142,7 @@ int64_t KernelExecutor::getAvailableDynamicSmemSize() { int64_t KernelExecutor::getStaticSmemSize() { NVF_ERROR( - hasCompiledKernel(), - "Cannot get static smem size unless kernel is compiled"); + isCompiled(), "Cannot get static smem size unless kernel is compiled"); if (!static_smem_size_.has_value()) { int size = 0; // Is this really a costly operation worth caching? @@ -1160,7 +1158,7 @@ int64_t KernelExecutor::getStaticSmemSize() { void KernelExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { // If specified, check that dynamic smem size matches what the scheduler // expects - int64_t expected_dynamic_smem_size = fusion()->expectedDynamicSmemBytes(); + int64_t expected_dynamic_smem_size = kernel()->expectedDynamicSmemBytes(); if (expected_dynamic_smem_size >= 0) { NVF_ERROR( dynamic_smem_size == expected_dynamic_smem_size, @@ -1185,8 +1183,7 @@ void KernelExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { int64_t KernelExecutor::ensureAvailableDynamicSmemSize( int64_t dynamic_smem_size) { NVF_ERROR( - hasCompiledKernel(), - "Cannot set dynamic smem size unless kernel is compiled"); + isCompiled(), "Cannot set dynamic smem size unless kernel is compiled"); if (dynamic_smem_size > getAvailableDynamicSmemSize()) { validateDynamicSmemSize(dynamic_smem_size); NVFUSER_CUDA_SAFE_CALL(cuFuncSetAttribute( @@ -1224,7 +1221,7 @@ std::vector KernelExecutor::run( NVF_ERROR(isCompiled()); NVF_ERROR( - outputs.empty() || (outputs.size() == fusion()->outputs().size()), + outputs.empty() || (outputs.size() == kernel()->outputs().size()), __func__, " provided number of outputs does not match fusion output"); @@ -1279,12 +1276,12 @@ std::vector KernelExecutor::run( at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; // Bind fusion inputs - auto expr_eval = executor_utils::bindInputs(args, fusion()); + auto expr_eval = executor_utils::bindInputs(args, kernel()); // only allocate outputs when not given if (outputs.empty()) { outputs = allocateOutputs( - fusion(), executor_entry->outputs, options_.device, expr_eval); + kernel(), executor_entry->outputs, options_.device, expr_eval); } args.push(outputs); @@ -1563,12 +1560,6 @@ flatbuffers::Offset KernelExecutor::serialize( executor_entry_lookup_values_fb.push_back(serialize(builder, value)); } - // When compilation is skipped, avoid serializing cubin because it doesn't - // exist. The remaining fields are also not necessary in this case. - if (!hasCompiledKernel()) { - return serde::CreateKernelExecutorDirect(builder); - } - return serde::CreateKernelExecutorDirect( builder, device_smem_limit_, @@ -1776,7 +1767,7 @@ void KernelExecutor::deserialize( compiled_kernel_ = executor_utils::getCompiledKernel( buffer->compiled_kernel(), compile_params); - NVF_ERROR(hasCompiledKernel(), "Failed to deserialize KernelExecutor"); + NVF_ERROR(isCompiled(), "Failed to deserialize KernelExecutor"); } KernelExecutor::ExecutorEntry KernelExecutor::deserialize( diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 45fee75b6b1..ce7039a1b91 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -144,24 +144,13 @@ class KernelExecutor : public ExecutorAbstract { post_lowering_hooks_.push_back(std::move(hook)); } - // Function to query whether compilation was attempted for a `KernelExecutor` + // Returns whether this `KernelExecutor` has a compiled kernel to execute. bool isCompiled() const override { - int num_compiled_artifacts = (fusion_ != nullptr) + (lowered_ != nullptr); - NVF_ERROR(num_compiled_artifacts <= 1); - return num_compiled_artifacts == 1; - }; - - // function to query whether a `KernelExecutor` has a compiled kernel to - // execute - bool hasCompiledKernel() const { if (compiled_kernel_ != nullptr) { NVF_ERROR(compiled_kernel_->function != nullptr); - NVF_ERROR( - fusion_ == nullptr, - "fusion_ should only be initialized when using expression evaluator."); } return validKernelId() && lowered_ && compiled_kernel_ != nullptr; - }; + } void evictCache(size_t cache_id) { executor_entry_lookup_.erase(cache_id); @@ -200,17 +189,6 @@ class KernelExecutor : public ExecutorAbstract { return lowered_->kernel(); } - Fusion* fusion() const { - NVF_ERROR(isCompiled()); - if (fusion_ != nullptr) { - return fusion_.get(); - } - if (lowered_ != nullptr) { - return lowered_->kernel()->as(); - } - NVF_THROW("unreachable because of the isCompiled check"); - } - const ThreadPredicateMap& threadPredMap() const { return lowered_->threadPredMap(); } @@ -503,9 +481,6 @@ class KernelExecutor : public ExecutorAbstract { std::unique_ptr lowered_; - // Initialized for non-compiled fusions - std::unique_ptr fusion_; - // Track the block size this kernel was compiled with. If the block size // increases, recompile to adjust maxregister count. int64_t block_size_high_water_mark_ = 1; diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index c7eba2b1fed..0d6632616ef 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -8068,7 +8068,7 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) { continue; } const auto* ke = exec->as(); - for (auto expr : ke->fusion()->exprs()) { + for (auto expr : ke->kernel()->exprs()) { if (expr->isA()) { auto slice = expr->as(); EXPECT_EQ(slice->in()->getMemoryType(), MemoryType::Global); diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index 970c4066043..6f485da09d0 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -4155,8 +4155,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputs) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4245,8 +4244,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape1) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4331,8 +4329,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape2) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4441,8 +4438,7 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs1) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4648,7 +4644,7 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get())->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4788,8 +4784,7 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs5) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4978,8 +4973,7 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -5122,8 +5116,7 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -5310,8 +5303,7 @@ TEST_P(ResizeSchedulerTest, PropagatePadToInputs) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -5412,8 +5404,7 @@ TEST_P(ResizeSchedulerTest, PropagateCatToInputs) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get()) - ->fusion(); + runtime->executors().at(0)->as()->kernel(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } diff --git a/tests/cpp/test_rope.cpp b/tests/cpp/test_rope.cpp index 46ca54cc53b..a8a601f4316 100644 --- a/tests/cpp/test_rope.cpp +++ b/tests/cpp/test_rope.cpp @@ -924,7 +924,7 @@ TEST_F(RopeTest, EndingRepeat) { runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = - dynamic_cast(runtime->executors().at(0).get())->fusion(); + runtime->executors().at(0)->as()->kernel(); // Check the loop domain of the reference. It should look like: //