Remove KernelExecutor::fusion_ #3725

Merged 3 commits on Jan 18, 2025
29 changes: 10 additions & 19 deletions csrc/runtime/executor.cpp
@@ -724,7 +724,7 @@ LaunchParams KernelExecutor::computeLaunchParams(
// This check is only done once a kernel has been compiled, since
// maybe_available_dynamic_smem_ needs to be evaluated on
// a compiled kernel.
if (hasCompiledKernel()) {
if (isCompiled()) {
validateDynamicSmemSize(dynamic_smem_size);
}

@@ -784,7 +784,7 @@ std::vector<GlobalBufferInfo> KernelExecutor::getIntermediateBufferInfo(
}

void KernelExecutor::setUsedTVs() {
auto used_vals = fusion()->usedMathVals();
auto used_vals = kernel()->usedMathVals();
auto used_tvs = ir_utils::filterByType<TensorView>(used_vals);
used_tvs_.clear();
used_tvs_.insert(used_tvs_.begin(), used_tvs.begin(), used_tvs.end());
@@ -1128,8 +1128,7 @@ void KernelExecutor::recompileKernel(

int64_t KernelExecutor::getAvailableDynamicSmemSize() {
NVF_ERROR(
hasCompiledKernel(),
"Cannot get dynamic smem size unless kernel is compiled");
isCompiled(), "Cannot get dynamic smem size unless kernel is compiled");
if (!available_dynamic_smem_size_.has_value()) {
int size = 0;
NVFUSER_CUDA_SAFE_CALL(cuFuncGetAttribute(
@@ -1143,8 +1142,7 @@ int64_t KernelExecutor::getAvailableDynamicSmemSize() {

int64_t KernelExecutor::getStaticSmemSize() {
NVF_ERROR(
hasCompiledKernel(),
"Cannot get static smem size unless kernel is compiled");
isCompiled(), "Cannot get static smem size unless kernel is compiled");
if (!static_smem_size_.has_value()) {
int size = 0;
// Is this really a costly operation worth caching?
@@ -1160,7 +1158,7 @@ int64_t KernelExecutor::getStaticSmemSize() {
void KernelExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) {
// If specified, check that dynamic smem size matches what the scheduler
// expects
int64_t expected_dynamic_smem_size = fusion()->expectedDynamicSmemBytes();
int64_t expected_dynamic_smem_size = kernel()->expectedDynamicSmemBytes();
if (expected_dynamic_smem_size >= 0) {
NVF_ERROR(
dynamic_smem_size == expected_dynamic_smem_size,
@@ -1185,8 +1183,7 @@ void KernelExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) {
int64_t KernelExecutor::ensureAvailableDynamicSmemSize(
int64_t dynamic_smem_size) {
NVF_ERROR(
hasCompiledKernel(),
"Cannot set dynamic smem size unless kernel is compiled");
isCompiled(), "Cannot set dynamic smem size unless kernel is compiled");
if (dynamic_smem_size > getAvailableDynamicSmemSize()) {
validateDynamicSmemSize(dynamic_smem_size);
NVFUSER_CUDA_SAFE_CALL(cuFuncSetAttribute(
@@ -1224,7 +1221,7 @@ std::vector<at::Tensor> KernelExecutor::run(

NVF_ERROR(isCompiled());
NVF_ERROR(
outputs.empty() || (outputs.size() == fusion()->outputs().size()),
outputs.empty() || (outputs.size() == kernel()->outputs().size()),
__func__,
" provided number of outputs does not match fusion output");

@@ -1279,12 +1276,12 @@ std::vector<at::Tensor> KernelExecutor::run(
at::AutoDispatchBelowADInplaceOrView non_variable_type_mode;

// Bind fusion inputs
auto expr_eval = executor_utils::bindInputs(args, fusion());
auto expr_eval = executor_utils::bindInputs(args, kernel());

// only allocate outputs when not given
if (outputs.empty()) {
outputs = allocateOutputs(
fusion(), executor_entry->outputs, options_.device, expr_eval);
kernel(), executor_entry->outputs, options_.device, expr_eval);
}
args.push(outputs);

@@ -1563,12 +1560,6 @@ flatbuffers::Offset<serde::KernelExecutor> KernelExecutor::serialize(
executor_entry_lookup_values_fb.push_back(serialize(builder, value));
}

// When compilation is skipped, avoid serializing cubin because it doesn't
// exist. The remaining fields are also not necessary in this case.
if (!hasCompiledKernel()) {
return serde::CreateKernelExecutorDirect(builder);
}

return serde::CreateKernelExecutorDirect(
builder,
device_smem_limit_,
@@ -1776,7 +1767,7 @@ void KernelExecutor::deserialize(
compiled_kernel_ = executor_utils::getCompiledKernel(
buffer->compiled_kernel(), compile_params);

NVF_ERROR(hasCompiledKernel(), "Failed to deserialize KernelExecutor");
NVF_ERROR(isCompiled(), "Failed to deserialize KernelExecutor");
}

KernelExecutor::ExecutorEntry KernelExecutor::deserialize(
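Every change in executor.cpp follows the same substitution: the compile-state query goes through isCompiled() rather than the old hasCompiledKernel(), and Fusion-level queries go through kernel() rather than the removed fusion() accessor, which itself only returned lowered_->kernel()->as<Fusion>(). A minimal sketch of the resulting call pattern; the helper function below is illustrative only, while the member calls it makes are taken from this diff:

// Hypothetical helper, not part of the PR: shows Fusion-level queries made
// through the lowered kernel once compilation has succeeded.
void inspectExecutor(KernelExecutor& ke) {
  // Single compile-state query; the separate hasCompiledKernel() is gone.
  if (!ke.isCompiled()) {
    return;
  }
  // kir::Kernel is usable as a Fusion (the removed fusion() accessor returned
  // lowered_->kernel()->as<Fusion>()), so the same queries still work.
  auto used_vals = ke.kernel()->usedMathVals();
  auto num_outputs = ke.kernel()->outputs().size();
  (void)used_vals;
  (void)num_outputs;
}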
29 changes: 2 additions & 27 deletions csrc/runtime/executor.h
@@ -144,24 +144,13 @@ class KernelExecutor : public ExecutorAbstract {
post_lowering_hooks_.push_back(std::move(hook));
}

// Function to query whether compilation was attempted for a `KernelExecutor`
// Returns whether this `KernelExecutor` has a compiled kernel to execute.
bool isCompiled() const override {
int num_compiled_artifacts = (fusion_ != nullptr) + (lowered_ != nullptr);
NVF_ERROR(num_compiled_artifacts <= 1);
return num_compiled_artifacts == 1;
};

// function to query whether a `KernelExecutor` has a compiled kernel to
// execute
bool hasCompiledKernel() const {
if (compiled_kernel_ != nullptr) {
NVF_ERROR(compiled_kernel_->function != nullptr);
NVF_ERROR(
fusion_ == nullptr,
"fusion_ should only be initialized when using expression evaluator.");
}
return validKernelId() && lowered_ && compiled_kernel_ != nullptr;
};
}

void evictCache(size_t cache_id) {
executor_entry_lookup_.erase(cache_id);
@@ -200,17 +189,6 @@
return lowered_->kernel();
}

Fusion* fusion() const {
NVF_ERROR(isCompiled());
if (fusion_ != nullptr) {
return fusion_.get();
}
if (lowered_ != nullptr) {
return lowered_->kernel()->as<Fusion>();
}
NVF_THROW("unreachable because of the isCompiled check");
}

const ThreadPredicateMap& threadPredMap() const {
return lowered_->threadPredMap();
}
@@ -503,9 +481,6 @@ class KernelExecutor : public ExecutorAbstract {

std::unique_ptr<GpuLower> lowered_;

// Initialized for non-compiled fusions
std::unique_ptr<Fusion> fusion_;

// Track the block size this kernel was compiled with. If the block size
// increases, recompile to adjust maxregister count.
int64_t block_size_high_water_mark_ = 1;
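The test updates below all migrate in the same way: with fusion() removed, the scheduled fusion is read from the executor's kernel(), and the raw dynamic_cast is replaced by as<KernelExecutor>(). A hedged before/after sketch, where runtime stands for the runtime object already used in these tests:

// Before this PR: fetch the scheduled Fusion through the removed accessor.
//   Fusion* scheduled_fusion =
//       dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
// After this PR: query the lowered kernel, which now serves as the scheduled Fusion.
Fusion* scheduled_fusion =
    runtime->executors().at(0)->as<KernelExecutor>()->kernel();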
2 changes: 1 addition & 1 deletion tests/cpp/test_gpu3.cpp
@@ -8068,7 +8068,7 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) {
continue;
}
const auto* ke = exec->as<KernelExecutor>();
for (auto expr : ke->fusion()->exprs()) {
for (auto expr : ke->kernel()->exprs()) {
if (expr->isA<SliceOp>()) {
auto slice = expr->as<SliceOp>();
EXPECT_EQ(slice->in()->getMemoryType(), MemoryType::Global);
29 changes: 10 additions & 19 deletions tests/cpp/test_resize.cpp
@@ -4155,8 +4155,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputs) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4245,8 +4244,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape1) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4331,8 +4329,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape2) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4441,8 +4438,7 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs1) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4648,7 +4644,7 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4788,8 +4784,7 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs5) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4978,8 +4973,7 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5122,8 +5116,7 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5310,8 +5303,7 @@ TEST_P(ResizeSchedulerTest, PropagatePadToInputs) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5412,8 +5404,7 @@ TEST_P(ResizeSchedulerTest, PropagateCatToInputs) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
2 changes: 1 addition & 1 deletion tests/cpp/test_rope.cpp
@@ -924,7 +924,7 @@ TEST_F(RopeTest, EndingRepeat) {
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
runtime->executors().at(0)->as<KernelExecutor>()->kernel();

// Check the loop domain of the reference. It should look like:
//